diff --git a/iccv/iccv2013.json b/iccv/iccv2013.json index 7cfb271..b9ce8e6 100644 --- a/iccv/iccv2013.json +++ b/iccv/iccv2013.json @@ -21,14 +21,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2", - "aff_unique_norm": "University of Michigan;Microsoft;Stanford University", + "aff_unique_norm": "University of Michigan;Microsoft Research;Stanford University", "aff_unique_dep": ";Microsoft Research;", - "aff_unique_url": "https://www.umich.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-cambridge;https://www.stanford.edu", + "aff_unique_url": "https://www.umich.edu;https://www.microsoft.com/en-us/research/group/cambridge;https://www.stanford.edu", "aff_unique_abbr": "UM;MSR Cambridge;Stanford", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Cambridge;Stanford", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Kim_2013_ICCV,\n \n author = {\n Kim,\n Byung-Soo and Kohli,\n Pushmeet and Savarese,\n Silvio\n},\n title = {\n 3D Scene Understanding by Voxel-CRF\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5909335660", @@ -59,7 +60,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2013_ICCV,\n \n author = {\n Lin,\n Yen-Liang and Huang,\n Cheng-Yu and Wang,\n Hao-Jeng and Hsu,\n Winston\n},\n title = {\n 3D Sub-query Expansion for Improving Sketch-Based Multi-view Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "afff4d0deb", @@ -84,13 +86,14 @@ "status": "Poster", "aff_unique_index": "0;1", 
"aff_unique_norm": "Google;Carnegie Mellon University", - "aff_unique_dep": "Google;", + "aff_unique_dep": ";", "aff_unique_url": "https://www.google.com;https://www.cmu.edu", "aff_unique_abbr": "Google;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Satkin_2013_ICCV,\n \n author = {\n Satkin,\n Scott and Hebert,\n Martial\n},\n title = {\n 3DNN: Viewpoint Invariant 3D Geometry Matching for Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d5e240f929", @@ -116,12 +119,13 @@ "aff_unique_index": "0;0;0+1;0", "aff_unique_norm": "University of Electronic Science and Technology of China;Shanghai Institutes for Biological Sciences", "aff_unique_dep": ";Chinese Academy of Sciences", - "aff_unique_url": "http://www.uestc.edu.cn;", - "aff_unique_abbr": "UESTC;", + "aff_unique_url": "http://www.uestc.edu.cn;http://www.sibs.ac.cn", + "aff_unique_abbr": "UESTC;SIBS", "aff_campus_unique_index": "0;0;0+1;0", "aff_campus_unique": "Chengdu;Shanghai", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2013_ICCV,\n \n author = {\n Gao,\n Shaobing and Yang,\n Kaifu and Li,\n Chaoyi and Li,\n Yongjie\n},\n title = {\n A Color Constancy Model with Double-Opponency Mechanisms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "10b32c7a15", @@ -143,7 +147,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Elhamifar_2013_ICCV,\n \n author = {\n Elhamifar,\n Ehsan and Sapiro,\n Guillermo and Yang,\n Allen and 
Sasrty,\n S. Shankar\n},\n title = {\n A Convex Optimization Framework for Active Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "469c850dab", @@ -167,14 +172,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;Chinese Academy of Sciences", "aff_unique_dep": "Department of Information Engineering;Shenzhen Institutes of Advanced Technology", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.cas.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0+1;0;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2013_ICCV,\n \n author = {\n Luo,\n Ping and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n A Deep Sum-Product Architecture for Robust Facial Attributes Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d38a6c88e2", @@ -205,7 +211,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Dong_2013_ICCV,\n \n author = {\n Dong,\n Jian and Chen,\n Qiang and Xia,\n Wei and Huang,\n Zhongyang and Yan,\n Shuicheng\n},\n title = {\n A Deformable Mixture Parsing Model with Parselets\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "61d058286c", @@ -236,7 +243,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Thomas_2013_ICCV,\n \n author = {\n Thomas,\n Diego and Sugimoto,\n Akihiro\n},\n title = {\n A Flexible Scene Representation for 3D Reconstruction Using an RGB-D Camera\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "326dbfab2f", @@ -267,7 +275,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0+0", "aff_campus_unique": "Canberra", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Jayasumana_2013_ICCV,\n \n author = {\n Jayasumana,\n Sadeep and Salzmann,\n Mathieu and Li,\n Hongdong and Harandi,\n Mehrtash\n},\n title = {\n A Framework for Shape Analysis via Hilbert Space Embedding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "28fee86781", @@ -298,7 +307,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Germany;Chile" + "aff_country_unique": "Germany;Chile", + "bibtex": "@InProceedings{Sipiran_2013_ICCV,\n \n author = {\n Sipiran,\n Ivan and Bustos,\n Benjamin\n},\n title = {\n A Fully Hierarchical Approach for Finding Correspondences in Non-rigid Shapes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3351a25986", @@ -307,7 +317,7 @@ "author": "Jim Braux-Zin; Romain Dupont; Adrien Bartoli", "abstract": "Dense motion field estimation (typically optical flow, stereo disparity and surface registration) is a key computer vision problem. 
Many solutions have been proposed to compute small or large displacements, narrow or wide baseline stereo disparity, but a unified methodology is still lacking. We here introduce a general framework that robustly combines direct and feature-based matching. The feature-based cost is built around a novel robust distance function that handles keypoints and \"weak\" features such as segments. It allows us to use putative feature matches which may contain mismatches to guide dense motion estimation out of local minima. Our framework uses a robust direct data term (AD-Census). It is implemented with a powerful second order Total Generalized Variation regularization with external and self-occlusion reasoning. Our framework achieves state of the art performance in several cases (standard optical flow benchmarks, wide-baseline stereo and non-rigid surface registration). Our framework has a modular design that customizes to specific application needs.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Braux-Zin_A_General_Dense_2013_ICCV_paper.pdf", - "aff": "CEA, LIST, France+ISIT, Universit \u00b4e d\u2019Auvergne/CNRS, France; CEA, LIST, France; ISIT, Universit \u00b4e d\u2019Auvergne/CNRS, France", + "aff": "CEA, LIST, France+ISIT, Universit ´e d’Auvergne/CNRS, France; CEA, LIST, France; ISIT, Universit ´e d’Auvergne/CNRS, France", "project": "", "github": "", "supp": "", @@ -322,14 +332,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;1", - "aff_unique_norm": "CEA;Universit\u00e9 d\u2019Auvergne", + "aff_unique_norm": "CEA;Université d’Auvergne", "aff_unique_dep": "LIST;ISIT", "aff_unique_url": "https://www.cea.fr;", - "aff_unique_abbr": "CEA;", + "aff_unique_abbr": ";", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Braux-Zin_2013_ICCV,\n \n author = {\n Braux-Zin,\n Jim and Dupont,\n Romain and 
Bartoli,\n Adrien\n},\n title = {\n A General Dense Image Matching Framework Combining Direct and Feature-Based Costs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6b82c8c9cf", @@ -360,7 +371,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Lin_2013_ICCV,\n \n author = {\n Lin,\n Guosheng and Shen,\n Chunhua and Suter,\n David and van den Hengel,\n Anton\n},\n title = {\n A General Two-Step Approach to Learning-Based Hashing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f2bfec1fce", @@ -369,7 +381,7 @@ "author": "Wangmeng Zuo; Deyu Meng; Lei Zhang; Xiangchu Feng; David Zhang", "abstract": "In many sparse coding based image restoration and image classification problems, using non-convex p -norm minimization (0 top po1) can often obtain better results than the convex 1 -norm minimization. A number of algorithms, e.g., iteratively reweighted least squares (IRLS), iteratively thresholding method (ITMp ), and look-up table (LUT), have been proposed for non-convex p -norm sparse coding, while some analytic solutions have been suggested for some specific values of p. In this paper, by extending the popular soft-thresholding operator, we propose a generalized iterated shrinkage algorithm (GISA) for p -norm non-convex sparse coding. Unlike the analytic solutions, the proposed GISA algorithm is easy to implement, and can be adopted for solving non-convex sparse coding problems with arbitrary p values. Compared with LUT, GISA is more general and does not need to compute and store the look-up tables. 
Compared with IRLS and ITMp , GISA is theoretically more solid and can achieve more accurate solutions. Experiments on image restoration and sparse coding based face recognition are conducted to validate the performance of GISA.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zuo_A_Generalized_Iterated_2013_ICCV_paper.pdf", - "aff": "Harbin Institute of Technology+Hong Kong Polytechnic University; Xi\u2019an Jiaotong University; Hong Kong Polytechnic University; Xidian University; Hong Kong Polytechnic University", + "aff": "Harbin Institute of Technology+Hong Kong Polytechnic University; Xi’an Jiaotong University; Hong Kong Polytechnic University; Xidian University; Hong Kong Polytechnic University", "project": "", "github": "", "supp": "", @@ -384,14 +396,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;2;1;3;1", - "aff_unique_norm": "Harbin Institute of Technology;Hong Kong Polytechnic University;Xi'an Jiao Tong University;Xidian University", + "aff_unique_norm": "Harbin Institute of Technology;Hong Kong Polytechnic University;Xi'an Jiaotong University;Xidian University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.hit.edu.cn/;https://www.polyu.edu.hk;https://www.xjtu.edu.cn;http://www.xidian.edu.cn/", "aff_unique_abbr": "HIT;PolyU;XJTU;Xidian", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "Harbin;Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zuo_2013_ICCV,\n \n author = {\n Zuo,\n Wangmeng and Meng,\n Deyu and Zhang,\n Lei and Feng,\n Xiangchu and Zhang,\n David\n},\n title = {\n A Generalized Iterated Shrinkage Algorithm for Non-convex Sparse Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "53e196dffb", @@ -422,7 +435,8 @@ "aff_campus_unique_index": "0;0", 
"aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Yi-Lei and Hsu,\n Chiou-Ting\n},\n title = {\n A Generalized Low-Rank Appearance Model for Spatio-temporally Correlated Rain Streaks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8d6c83f049", @@ -444,7 +458,8 @@ "email": ";;;;", "author_num": 5, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Zeng_2013_ICCV,\n \n author = {\n Zeng,\n Yun and Wang,\n Chaohui and Gu,\n Xianfeng and Samaras,\n Dimitris and Paragios,\n Nikos\n},\n title = {\n A Generic Deformation Model for Dense Non-rigid Surface Registration: A Higher-Order MRF-Based Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5349676e21", @@ -475,7 +490,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Jiang_2013_ICCV,\n \n author = {\n Jiang,\n Nianjuan and Cui,\n Zhaopeng and Tan,\n Ping\n},\n title = {\n A Global Linear Method for Camera Pose Registration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c045463b07", @@ -484,7 +500,7 @@ "author": "Martin Kiechle; Simon Hawe; Martin Kleinsteuber", "abstract": "High-resolution depth maps can be inferred from lowresolution depth measurements and an additional highresolution intensity image of the same scene. 
To that end, we introduce a bimodal co-sparse analysis model, which is able to capture the interdependency of registered intensity and depth information. This model is based on the assumption that the co-supports of corresponding bimodal image structures are aligned when computed by a suitable pair of analysis operators. No analytic form of such operators exist and we propose a method for learning them from a set of registered training signals. This learning process is done offline and returns a bimodal analysis operator that is universally applicable to natural scenes. We use this to exploit the bimodal co-sparse analysis model as a prior for solving inverse problems, which leads to an efficient algorithm for depth map super-resolution.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Kiechle_A_Joint_Intensity_2013_ICCV_paper.pdf", - "aff": "Department of Electrical Engineering and Information Technology, Technische Universit \u00a8at M \u00a8unchen, Munich, Germany; Department of Electrical Engineering and Information Technology, Technische Universit \u00a8at M \u00a8unchen, Munich, Germany; Department of Electrical Engineering and Information Technology, Technische Universit \u00a8at M \u00a8unchen, Munich, Germany", + "aff": "Department of Electrical Engineering and Information Technology, Technische Universit ¨at M ¨unchen, Munich, Germany; Department of Electrical Engineering and Information Technology, Technische Universit ¨at M ¨unchen, Munich, Germany; Department of Electrical Engineering and Information Technology, Technische Universit ¨at M ¨unchen, Munich, Germany", "project": "http://www.gol.ei.tum.de", "github": "", "supp": "", @@ -499,14 +515,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen", + "aff_unique_norm": "Technische Universität München", "aff_unique_dep": "Department of Electrical Engineering and Information Technology", "aff_unique_url": 
"https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Kiechle_2013_ICCV,\n \n author = {\n Kiechle,\n Martin and Hawe,\n Simon and Kleinsteuber,\n Martin\n},\n title = {\n A Joint Intensity and Depth Co-sparse Analysis Model for Depth Map Super-resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "731d83c8c9", @@ -528,7 +545,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Choi_2013_ICCV,\n \n author = {\n Choi,\n Inchang and Kim,\n Sunyeong and Brown,\n Michael S. and Tai,\n Yu-Wing\n},\n title = {\n A Learning-Based Approach to Reduce JPEG Artifacts in Image Matting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4bb462783f", @@ -552,14 +570,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Adobe;U.S. Army Research Laboratory", - "aff_unique_dep": "Beckman Institute;Adobe Systems Incorporated;", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Adobe Systems Incorporated;U.S. 
Army Research Laboratory", + "aff_unique_dep": "Beckman Institute;;", "aff_unique_url": "https://www.illinois.edu;https://www.adobe.com;https://www.arl.army.mil", "aff_unique_abbr": "UIUC;Adobe;ARL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Zhaowen and Yang,\n Jianchao and Nasrabadi,\n Nasser and Huang,\n Thomas\n},\n title = {\n A Max-Margin Perspective on Sparse Representation-Based Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0ca2269962", @@ -590,7 +609,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2013_ICCV,\n \n author = {\n Ma,\n Chang and Dong,\n Zhongqian and Jiang,\n Tingting and Wang,\n Yizhou and Gao,\n Wen\n},\n title = {\n A Method of Perceptual-Based Shape Decomposition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "93d3ec4dac", @@ -612,7 +632,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Shariat_2013_ICCV,\n \n author = {\n Shariat,\n Shahriar and Pavlovic,\n Vladimir\n},\n title = {\n A New Adaptive Segmental Matching Measure for Human Activity Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b668237e1a", @@ -634,7 +655,8 @@ "email": ";;;;", "author_num": 5, "track": "main", - "status": "Poster" + "status": 
"Poster", + "bibtex": "@InProceedings{Kong_2013_ICCV,\n \n author = {\n Kong,\n Xiangfei and Li,\n Kuan and Yang,\n Qingxiong and Wenyin,\n Liu and Yang,\n Ming-Hsuan\n},\n title = {\n A New Image Quality Metric for Image Auto-denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bb560ee1d4", @@ -658,14 +680,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Microsoft", - "aff_unique_dep": ";Microsoft Research", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Microsoft Research", + "aff_unique_dep": ";", "aff_unique_url": "https://www.mpituebingen.mpg.de;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MPI-IS;MSR", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Tuebingen;Cambridge", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Lehrmann_2013_ICCV,\n \n author = {\n Lehrmann,\n Andreas M. and Gehler,\n Peter V. 
and Nowozin,\n Sebastian\n},\n title = {\n A Non-parametric Bayesian Network Prior of Human Pose\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5ddccf093a", @@ -689,14 +712,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Dalian University of Technology;Heilongjiang University;Hong Kong Polytechnic University", + "aff_unique_norm": "Dalian University of Technology;Heilongjiang University;The Hong Kong Polytechnic University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.dlut.edu.cn/;http://www.hljnu.edu.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "DUT;HGHU;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Peihua and Wang,\n Qilong and Zhang,\n Lei\n},\n title = {\n A Novel Earth Mover's Distance Methodology for Image Matching with Gaussian Mixture Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a4fe9d2463", @@ -720,14 +744,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Microsoft", - "aff_unique_dep": "Microsoft Corporation", + "aff_unique_norm": "Microsoft Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cao_2013_ICCV,\n \n author = {\n Cao,\n Xudong and Wipf,\n David and Wen,\n Fang and Duan,\n Genquan and Sun,\n Jian\n},\n title = {\n A Practical 
Transfer Learning Algorithm for Face Verification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cac8b7b389", @@ -749,7 +774,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Bartoli_2013_ICCV,\n \n author = {\n Bartoli,\n Adrien and Pizarro,\n Daniel and Collins,\n Toby\n},\n title = {\n A Robust Analytical Solution to Isometric Shape-from-Template with Focal Length Calibration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7dfce805c8", @@ -780,7 +806,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Newark", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ye_2013_ICCV,\n \n author = {\n Ye,\n Jinwei and Ji,\n Yu and Yu,\n Jingyi\n},\n title = {\n A Rotational Stereo Model Based on XSlit Imaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "49f998de88", @@ -811,7 +838,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Liu_2013_ICCV,\n \n author = {\n Liu,\n Lingqiao and Wang,\n Lei\n},\n title = {\n A Scalable Unsupervised Feature Merging Approach to Efficient Dimensionality Reduction of High-Dimensional Visual Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "947da1f039", @@ -842,7 +870,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", 
"aff_country_unique_index": "0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Qifeng and Koltun,\n Vladlen\n},\n title = {\n A Simple Model for Intrinsic Image Decomposition with Depth Cues\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "abfc1394a9", @@ -864,7 +893,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Xiaoyang and Ji,\n Qiang\n},\n title = {\n A Unified Probabilistic Approach Modeling Relationships between Attributes and Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "059e598035", @@ -895,7 +925,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Sophia Antipolis;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;Australia" + "aff_country_unique": "France;Australia", + "bibtex": "@InProceedings{Meilland_2013_ICCV,\n \n author = {\n Meilland,\n Maxime and Drummond,\n Tom and Comport,\n Andrew I.\n},\n title = {\n A Unified Rolling Shutter and Motion Blur Model for 3D Visual Registration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f0cd1773d2", @@ -926,7 +957,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Galasso_2013_ICCV,\n \n author = {\n Galasso,\n Fabio and Nagaraja,\n Naveen Shankar and Cardenas,\n Tatiana Jimenez and Brox,\n Thomas and Schiele,\n Bernt\n},\n title = {\n A Unified Video 
Segmentation Benchmark: Annotation,\n Metrics and Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6670075940", @@ -948,7 +980,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Sun_2013_ICCV,\n \n author = {\n Sun,\n Chen and Nevatia,\n Ram\n},\n title = {\n ACTIVE: Activity Concept Transitions in Video Event Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "16fcea01be", @@ -972,14 +1005,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2013_ICCV,\n \n author = {\n Lu,\n Cewu and Shi,\n Jianping and Jia,\n Jiaya\n},\n title = {\n Abnormal Event Detection at 150 FPS in MATLAB\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c358773795", @@ -1003,14 +1037,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Weizmann Institute of Science", + "aff_unique_norm": "The Weizmann Institute of Science", "aff_unique_dep": "Dept. 
of Computer Science and Applied Math", "aff_unique_url": "https://www.weizmann.ac.il", "aff_unique_abbr": "Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Efrat_2013_ICCV,\n \n author = {\n Efrat,\n Netalee and Glasner,\n Daniel and Apartsin,\n Alexander and Nadler,\n Boaz and Levin,\n Anat\n},\n title = {\n Accurate Blur Models vs. Image Priors in Single Image Super-resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "11053cc29d", @@ -1034,14 +1069,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0;1;0", - "aff_unique_norm": "Texas A&M University;Microsoft", + "aff_unique_norm": "Texas A&M University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.tamu.edu;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "TAMU;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Yen-Lin and Wu,\n Hsiang-Tao and Shi,\n Fuhao and Tong,\n Xin and Chai,\n Jinxiang\n},\n title = {\n Accurate and Robust 3D Facial Capture Using a Single RGBD Camera\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0e8f84027d", @@ -1072,7 +1108,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;T\u00fcrkiye" + "aff_country_unique": "United States;Turkey", + "bibtex": "@InProceedings{Ma_2013_ICCV,\n \n author = {\n Ma,\n Shugao and 
Zhang,\n Jianming and Ikizler-Cinbis,\n Nazli and Sclaroff,\n Stan\n},\n title = {\n Action Recognition and Localization by Hierarchical Space-Time Segments\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d4c8caafe7", @@ -1096,14 +1133,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;2;0+1;0+1;3", - "aff_unique_norm": "Shanghai Jiao Tong University;Shanghai Key Lab of Digital Media Processing and Transmission;Microsoft;University of California, San Diego", + "aff_unique_norm": "Shanghai Jiao Tong University;Shanghai Key Lab of Digital Media Processing and Transmission;Microsoft Research;University of California, San Diego", "aff_unique_dep": "Institute of Image Communication and Network Engineering;Digital Media Processing and Transmission;Research;Department of Cognitive Science", "aff_unique_url": "https://www.sjtu.edu.cn;;https://www.microsoft.com/en-us/research/group/asia;https://ucsd.edu", "aff_unique_abbr": "SJTU;;MSR Asia;UCSD", "aff_campus_unique_index": "0;2;0;0;3", "aff_campus_unique": "Shanghai;;Asia;San Diego", "aff_country_unique_index": "0+0;0;0+0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2013_ICCV,\n \n author = {\n Zhu,\n Jun and Wang,\n Baoyuan and Yang,\n Xiaokang and Zhang,\n Wenjun and Tu,\n Zhuowen\n},\n title = {\n Action Recognition with Actons\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8248644abd", @@ -1134,7 +1172,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Heng and Schmid,\n Cordelia\n},\n title = {\n Action 
Recognition with Improved Trajectories\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4790865b1e", @@ -1143,7 +1182,7 @@ "author": "Dan Oneata; Jakob Verbeek; Cordelia Schmid", "abstract": "Action recognition in uncontrolled video is an important and challenging computer vision problem. Recent progress in this area is due to new local features and models that capture spatio-temporal structure between local features, or human-object interactions. Instead of working towards more complex models, we focus on the low-level features and their encoding. We evaluate the use of Fisher vectors as an alternative to bag-of-word histograms to aggregate a small set of state-of-the-art low-level descriptors, in combination with linear classifiers. We present a large and varied set of evaluations, considering (i) classification of short actions in five datasets, (ii) localization of such actions in feature-length movies, and (iii) large-scale recognition of complex events. We find that for basic action recognition and localization MBH features alone are enough for stateof-the-art performance. For complex events we find that SIFT and MFCC features provide complementary cues. 
On all three problems we obtain state-of-the-art results, while using fewer features and less complex models.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Oneata_Action_and_Event_2013_ICCV_paper.pdf", - "aff": "LEAR, INRIA Grenoble \u2013 Rh\u00f4ne-Alpes, France; Laboratoire Jean Kuntzmann; LEAR, INRIA Grenoble \u2013 Rh\u00f4ne-Alpes, France", + "aff": "LEAR, INRIA Grenoble – Rhône-Alpes, France; Laboratoire Jean Kuntzmann; LEAR, INRIA Grenoble – Rhône-Alpes, France", "project": "", "github": "", "supp": "", @@ -1158,14 +1197,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "INRIA Grenoble \u2013 Rh\u00f4ne-Alpes;Laboratoire Jean Kuntzmann", + "aff_unique_norm": "INRIA Grenoble – Rhône-Alpes;Laboratoire Jean Kuntzmann", "aff_unique_dep": "LEAR;", "aff_unique_url": "https://www.inria.fr/centre/grenoble;https://www.ljk.ij.cnrs.fr", "aff_unique_abbr": "INRIA;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Grenoble;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Oneata_2013_ICCV,\n \n author = {\n Oneata,\n Dan and Verbeek,\n Jakob and Schmid,\n Cordelia\n},\n title = {\n Action and Event Recognition with Fisher Vectors on a Compact Feature Set\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3f5c6d00ba", @@ -1196,7 +1236,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bandla_2013_ICCV,\n \n author = {\n Bandla,\n Sunil and Grauman,\n Kristen\n},\n title = {\n Active Learning of an Action Detector from Untrimmed Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n 
December\n},\n year = {\n 2013\n} \n}" }, { "id": "c510b93ddf", @@ -1205,7 +1246,7 @@ "author": "Gemma Roig; Xavier Boix; Roderick De Nijs; Sebastian Ramos; Koljia Kuhnlenz; Luc Van Gool", "abstract": "Most MAP inference algorithms for CRFs optimize an energy function knowing all the potentials. In this paper, we focus on CRFs where the computational cost of instantiating the potentials is orders of magnitude higher than MAP inference. This is often the case in semantic image segmentation, where most potentials are instantiated by slow classifiers fed with costly features. We introduce Active MAP inference 1) to on-the-fly select a subset of potentials to be instantiated in the energy function, leaving the rest of the parameters of the potentials unknown, and 2) to estimate the MAP labeling from such incomplete energy function. Results for semantic segmentation benchmarks, namely PASCAL VOC 2010 [5] and MSRC-21 [19], show that Active MAP inference achieves similar levels of accuracy but with major efficiency gains.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Roig_Active_MAP_Inference_2013_ICCV_paper.pdf", - "aff": "ETH Z\u00fcrich, Switzerland; ETH Z\u00fcrich, Switzerland; TU Munchen, Germany; CVC Barcelona, Spain; TU Munchen, Germany; ETH Z\u00fcrich, Switzerland+KU Leuven, Belgium", + "aff": "ETH Zürich, Switzerland; ETH Zürich, Switzerland; TU Munchen, Germany; CVC Barcelona, Spain; TU Munchen, Germany; ETH Zürich, Switzerland+KU Leuven, Belgium", "project": "", "github": "", "supp": "", @@ -1220,14 +1261,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1;2;1;0+3", - "aff_unique_norm": "ETH Zurich;Technical University of Munich;CVC Barcelona;KU Leuven", + "aff_unique_norm": "ETH Zürich;Technical University of Munich;CVC Barcelona;KU Leuven", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ethz.ch;https://www.tum.de;https://www.cvc.uab.cat;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;TUM;CVC;KU Leuven", 
"aff_campus_unique_index": "1;", "aff_campus_unique": ";Barcelona", "aff_country_unique_index": "0;0;1;2;1;0+3", - "aff_country_unique": "Switzerland;Germany;Spain;Belgium" + "aff_country_unique": "Switzerland;Germany;Spain;Belgium", + "bibtex": "@InProceedings{Roig_2013_ICCV,\n \n author = {\n Roig,\n Gemma and Boix,\n Xavier and De Nijs,\n Roderick and Ramos,\n Sebastian and Kuhnlenz,\n Koljia and Van Gool,\n Luc\n},\n title = {\n Active MAP Inference in CRFs for Efficient Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e1e19668b9", @@ -1251,14 +1293,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Stevens Institute of Technology;Microsoft", + "aff_unique_norm": "Stevens Institute of Technology;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.stevens.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "SIT;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Long_2013_ICCV,\n \n author = {\n Long,\n Chengjiang and Hua,\n Gang and Kapoor,\n Ashish\n},\n title = {\n Active Visual Recognition with Expertise Estimation in Crowdsourcing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "fa56a32c78", @@ -1289,7 +1332,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Bangalore", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Jain_2013_ICCV,\n \n author = {\n Jain,\n Vidit and Farfade,\n Sachin Sudhakar\n},\n title = {\n Adapting Classification Cascades to New Domains\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "59b124225a", @@ -1320,7 +1364,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Giza", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;Egypt" + "aff_country_unique": "United States;Egypt", + "bibtex": "@InProceedings{Chi_2013_ICCV,\n \n author = {\n Chi,\n Yu-Tseh and Ali,\n Mohsen and Rushdi,\n Muhammad and Ho,\n Jeffrey\n},\n title = {\n Affine-Constrained Group Sparse Coding and Its Application to Image-Based Classifications\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f0ee87b497", @@ -1342,7 +1387,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Antonio_2013_ICCV,\n \n author = {\n Antonio,\n M. Jose and De Raedt,\n Luc and Tuytelaars,\n Tinne\n},\n title = {\n Allocentric Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3aebbccc9e", @@ -1373,7 +1419,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "Austria;United States" + "aff_country_unique": "Austria;United States", + "bibtex": "@InProceedings{Schulter_2013_ICCV,\n \n author = {\n Schulter,\n Samuel and Leistner,\n Christian and Wohlhart,\n Paul and Roth,\n Peter M. 
and Bischof,\n Horst\n},\n title = {\n Alternating Regression Forests for Object Detection and Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6dffe98bbf", @@ -1404,7 +1451,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Vancouver", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Guo_2013_ICCV,\n \n author = {\n Guo,\n Zhenyu and Wang,\n Z. Jane\n},\n title = {\n An Adaptive Descriptor Design for Object Recognition in the Wild\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ae158f260e", @@ -1435,7 +1483,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toulouse", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Calvet_2013_ICCV,\n \n author = {\n Calvet,\n Lilian and Gurdjos,\n Pierre\n},\n title = {\n An Enhanced Structure-from-Motion Paradigm Based on the Absolute Dual Quadric and Images of Circular Points\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "591d433f01", @@ -1457,7 +1506,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Borji_2013_ICCV,\n \n author = {\n Borji,\n Ali and Tavakoli,\n Hamed R. and Sihite,\n Dicky N. 
and Itti,\n Laurent\n},\n title = {\n Analysis of Scores,\n Datasets,\n and Models in Visual Saliency Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6f3787c738", @@ -1479,7 +1529,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Timofte_2013_ICCV,\n \n author = {\n Timofte,\n Radu and De Smet,\n Vincent and Van Gool,\n Luc\n},\n title = {\n Anchored Neighborhood Regression for Fast Example-Based Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9d1d7382df", @@ -1510,7 +1561,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kovashka_2013_ICCV,\n \n author = {\n Kovashka,\n Adriana and Grauman,\n Kristen\n},\n title = {\n Attribute Adaptation for Personalized Image Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b930e8bb90", @@ -1541,7 +1593,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Turakhia_2013_ICCV,\n \n author = {\n Turakhia,\n Naman and Parikh,\n Devi\n},\n title = {\n Attribute Dominance: What Pops Out?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d9ba1048cd", @@ -1572,7 +1625,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", 
"aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kovashka_2013_ICCV,\n \n author = {\n Kovashka,\n Adriana and Grauman,\n Kristen\n},\n title = {\n Attribute Pivots for Guiding Relevance Feedback in Image Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f01b490cf7", @@ -1603,7 +1657,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "Graduate Center;;New York", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Greece" + "aff_country_unique": "United States;Greece", + "bibtex": "@InProceedings{Liu_2013_ICCV,\n \n author = {\n Liu,\n Juan and Psarakis,\n Emmanouil and Stamos,\n Ioannis\n},\n title = {\n Automatic Kronecker Product Model Based Detection of Repeated Patterns in 2D Urban Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6dc4053599", @@ -1634,7 +1689,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Kiel", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Zeisl_2013_ICCV,\n \n author = {\n Zeisl,\n Bernhard and Koser,\n Kevin and Pollefeys,\n Marc\n},\n title = {\n Automatic Registration of RGB-D Scans via Salient Directions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3906bb98c9", @@ -1665,7 +1721,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Tombari_2013_ICCV,\n \n author = {\n Tombari,\n Federico and 
Franchi,\n Alessandro and Di Stefano,\n Luigi\n},\n title = {\n BOLD Features to Detect Texture-less Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b3022a8775", @@ -1696,7 +1753,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": ";;", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Brau_2013_ICCV,\n \n author = {\n Brau,\n Ernesto and Guan,\n Jinyan and Simek,\n Kyle and Del Pero,\n Luca and Dawson,\n Colin Reimer and Barnard,\n Kobus\n},\n title = {\n Bayesian 3D Tracking from Monocular Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "998f07479f", @@ -1727,7 +1785,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Shi_2013_ICCV,\n \n author = {\n Shi,\n Zhiyuan and Hospedales,\n Timothy M. 
and Xiang,\n Tao\n},\n title = {\n Bayesian Joint Topic Modelling for Weakly Supervised Object Localisation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "53cf28f9dd", @@ -1758,7 +1817,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Naiyan and Yeung,\n Dit-Yan\n},\n title = {\n Bayesian Robust Matrix Factorization for Image and Video Processing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9e9ba41d57", @@ -1789,7 +1849,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Portugal" + "aff_country_unique": "Portugal", + "bibtex": "@InProceedings{Henriques_2013_ICCV,\n \n author = {\n Henriques,\n Joao F. 
and Carreira,\n Joao and Caseiro,\n Rui and Batista,\n Jorge\n},\n title = {\n Beyond Hard Negative Mining: Efficient Detector Learning via Block-Circulant Decomposition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e7952b92ec", @@ -1820,7 +1881,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2013_ICCV,\n \n author = {\n Liu,\n Jiongxin and Belhumeur,\n Peter N.\n},\n title = {\n Bird Part Localization Using Exemplar-Based Models with Enforced Pose and Subcategory Consistency\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4e958f9a76", @@ -1851,7 +1913,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "BC", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Nosrati_2013_ICCV,\n \n author = {\n Nosrati,\n Masoud S. and Andrews,\n Shawn and Hamarneh,\n Ghassan\n},\n title = {\n Bounded Labeling Function for Global Segmentation of Multi-part Objects with Geometric Constraints\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8c4ed46b61", @@ -1882,7 +1945,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Schwing_2013_ICCV,\n \n author = {\n Schwing,\n Alexander G. 
and Fidler,\n Sanja and Pollefeys,\n Marc and Urtasun,\n Raquel\n},\n title = {\n Box in the Box: Joint 3D Layout and Object Reasoning from Single Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3aa32eedc7", @@ -1913,7 +1977,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tokola_2013_ICCV,\n \n author = {\n Tokola,\n Ryan and Choi,\n Wongun and Savarese,\n Silvio\n},\n title = {\n Breaking the Chain: Liberation from the Temporal Markov Assumption for Tracking Human Poses\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7f9f90d0d9", @@ -1944,7 +2009,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shrivastava_2013_ICCV,\n \n author = {\n Shrivastava,\n Abhinav and Gupta,\n Abhinav\n},\n title = {\n Building Part-Based Object Detectors via 3D Geometry\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9e75ad0494", @@ -1975,7 +2041,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Alnajar_2013_ICCV,\n \n author = {\n Alnajar,\n Fares and Gevers,\n Theo and Valenti,\n Roberto and Ghebreab,\n Sennay\n},\n title = {\n Calibration-Free Gaze Estimation Using Human Gaze Patterns\n},\n booktitle = {\n Proceedings of the IEEE International Conference on 
Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ae12e00f7f", @@ -2006,7 +2073,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kuo_2013_ICCV,\n \n author = {\n Kuo,\n Thomas and Sunderrajan,\n Santhoshkumar and Manjunath,\n B.S.\n},\n title = {\n Camera Alignment Using Trajectory Intersections in Unsynchronized Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6ee2a8f7d6", @@ -2037,7 +2105,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Ziheng and Li,\n Yongqiang and Wang,\n Shangfei and Ji,\n Qiang\n},\n title = {\n Capturing Global Semantic Relationships for Facial Action Unit Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "150eb89863", @@ -2068,7 +2137,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2013_ICCV,\n \n author = {\n Zhao,\n Xiaowei and Shan,\n Shiguang and Chai,\n Xiujuan and Chen,\n Xilin\n},\n title = {\n Cascaded Shape Space Pruning for Robust Facial Landmark Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6ba83421b8", @@ -2099,7 +2169,8 @@ "aff_campus_unique_index": "0;1", 
"aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jia_2013_ICCV,\n \n author = {\n Jia,\n Yangqing and Han,\n Mei\n},\n title = {\n Category-Independent Object-Level Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bbfdb54313", @@ -2130,7 +2201,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2013_ICCV,\n \n author = {\n Lin,\n Dahua and Xiao,\n Jianxiong\n},\n title = {\n Characterizing Layouts of Outdoor Scenes Using Spatial Topic Processes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7798d65e8d", @@ -2154,14 +2226,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "University of California, San Diego;Yahoo!", - "aff_unique_dep": "Department of Electrical and Computer Engineering;Yahoo! Labs", + "aff_unique_norm": "University of California, San Diego;Yahoo! Labs", + "aff_unique_dep": "Department of Electrical and Computer Engineering;", "aff_unique_url": "https://www.ucsd.edu;https://labs.yahoo.com", "aff_unique_abbr": "UCSD;Yahoo! 
Labs", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "San Diego;Bangalore", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Dixit_2013_ICCV,\n \n author = {\n Dixit,\n Mandar and Rasiwasia,\n Nikhil and Vasconcelos,\n Nuno\n},\n title = {\n Class-Specific Simplex-Latent Dirichlet Allocation for Image Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6617e0912b", @@ -2185,14 +2258,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0", - "aff_unique_norm": "Weizmann Institute of Science", + "aff_unique_norm": "The Weizmann Institute of Science", "aff_unique_dep": "Dept. of Computer Science and Applied Math", "aff_unique_url": "https://www.weizmann.ac.il", "aff_unique_abbr": "Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Faktor_2013_ICCV,\n \n author = {\n Faktor,\n Alon and Irani,\n Michal\n},\n title = {\n Co-segmentation by Composition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "42be699790", @@ -2216,14 +2290,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2013_ICCV,\n \n author = {\n Shi,\n Jianping 
and Liao,\n Renjie and Jia,\n Jiaya\n},\n title = {\n CoDeL: A Human Co-detection and Labeling Framework\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4ce5a1f677", @@ -2254,7 +2329,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jain_2013_ICCV,\n \n author = {\n Jain,\n Aastha and Chatterjee,\n Shuanak and Vidal,\n Rene\n},\n title = {\n Coarse-to-Fine Semantic Video Segmentation Using Supervoxel Trees\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "04d5c1500e", @@ -2285,7 +2361,8 @@ "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Amsterdam", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Zhenyang and Gavves,\n Efstratios and van de Sande,\n Koen E.A. and Snoek,\n Cees G.M. 
and Smeulders,\n Arnold W.M.\n},\n title = {\n Codemaps - Segment,\n Classify and Search Objects Locally\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "80b21ed604", @@ -2316,7 +2393,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Narayana_2013_ICCV,\n \n author = {\n Narayana,\n Manjunath and Hanson,\n Allen and Learned-Miller,\n Erik\n},\n title = {\n Coherent Motion Segmentation in Moving Camera Videos Using Optical Flow Orientations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c3672b77b0", @@ -2347,7 +2425,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pan_2013_ICCV,\n \n author = {\n Pan,\n Jiyan and Kanade,\n Takeo\n},\n title = {\n Coherent Object Detection with 3D Geometric Context from a Single Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "da52089ab9", @@ -2371,14 +2450,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1;2", - "aff_unique_norm": "Stevens Institute of Technology;Meta;Northwestern University", - "aff_unique_dep": ";Facebook, Inc.;", + "aff_unique_norm": "Stevens Institute of Technology;Facebook, Inc.;Northwestern University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.stevens.edu;https://www.facebook.com;https://www.northwestern.edu", "aff_unique_abbr": "SIT;FB;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hua_2013_ICCV,\n \n author = {\n Hua,\n Gang and Long,\n Chengjiang and Yang,\n Ming and Gao,\n Yan\n},\n title = {\n Collaborative Active Learning of a Kernel Machine Ensemble for Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3c30adba62", @@ -2409,7 +2489,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tang_2013_ICCV,\n \n author = {\n Tang,\n Kevin and Yao,\n Bangpeng and Fei-Fei,\n Li and Koller,\n Daphne\n},\n title = {\n Combining the Right Features for Complex Event Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "866396bd4b", @@ -2431,7 +2512,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Achar_2013_ICCV,\n \n author = {\n Achar,\n Supreeth and Nuske,\n Stephen T. and Narasimhan,\n Srinivasa G.\n},\n title = {\n Compensating for Motion during Direct-Global Separation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e295a8edaa", @@ -2440,7 +2522,7 @@ "author": "Zhongming Jin; Yao Hu; Yue Lin; Debing Zhang; Shiding Lin; Deng Cai; Xuelong Li", "abstract": "Recently, hashing techniques have been widely applied to solve the approximate nearest neighbors search problem in many vision applications. Generally, these hashing approaches generate 2 c buckets, where c is the length of the hash code. 
A good hashing method should satisfy the following two requirements: 1) mapping the nearby data points into the same bucket or nearby (measured by the Hamming distance) buckets. 2) all the data points are evenly distributed among all the buckets. In this paper, we propose a novel algorithm named Complementary Projection Hashing (CPH) to find the optimal hashing functions which explicitly considers the above two requirements. Specifically, CPH aims at sequentially finding a series of hyperplanes (hashing functions) which cross the sparse region of the data. At the same time, the data points are evenly distributed in the hypercubes generated by these hyperplanes. The experiments comparing with the state-of-the-art hashing methods demonstrate the effectiveness of the proposed method.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Jin_Complementary_Projection_Hashing_2013_ICCV_paper.pdf", - "aff": "State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; Baidu, Inc., Beijing, P. R. China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; Center for OPTical IMagery Analysis and Learning (OPTIMAL), State Key Laboratory of Transient Optics and Photonics, Xi\u2019an Institute of Optics and Precision Mechanics, Chinese Academy of Sciences, Xi\u2019an 710119, Shaanxi, P. R. China", + "aff": "State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. 
China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; Baidu, Inc., Beijing, P. R. China; State Key Lab of CAD&CG, College of Computer Science, Zhejiang University, Hangzhou, P. R. China; Center for OPTical IMagery Analysis and Learning (OPTIMAL), State Key Laboratory of Transient Optics and Photonics, Xi’an Institute of Optics and Precision Mechanics, Chinese Academy of Sciences, Xi’an 710119, Shaanxi, P. R. China", "project": "", "github": "", "supp": "", @@ -2455,14 +2537,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;0;1;0;2", - "aff_unique_norm": "Zhejiang University;Baidu;Chinese Academy of Sciences", - "aff_unique_dep": "College of Computer Science;Baidu, Inc.;Xi'an Institute of Optics and Precision Mechanics", + "aff_unique_norm": "Zhejiang University;Baidu, Inc.;Chinese Academy of Sciences", + "aff_unique_dep": "College of Computer Science;;Xi'an Institute of Optics and Precision Mechanics", "aff_unique_url": "http://www.zju.edu.cn;https://www.baidu.com;http://www.opt.ac.cn", "aff_unique_abbr": "ZJU;Baidu;CAS", "aff_campus_unique_index": "0;0;0;0;1;0;2", "aff_campus_unique": "Hangzhou;Beijing;Xi'an", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jin_2013_ICCV,\n \n author = {\n Jin,\n Zhongming and Hu,\n Yao and Lin,\n Yue and Zhang,\n Debing and Lin,\n Shiding and Cai,\n Deng and Li,\n Xuelong\n},\n title = {\n Complementary Projection Hashing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bbe9a98fa1", @@ -2486,14 +2569,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1+2;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen Key Lab of Computer Vision and Pattern Recognition;Huawei", + "aff_unique_norm": "The Chinese University of Hong 
Kong;Shenzhen Key Lab of Computer Vision and Pattern Recognition;Huawei Technologies Co. Ltd.", "aff_unique_dep": "Department of Information Engineering;Computer Vision and Pattern Recognition;Media Lab", "aff_unique_url": "https://www.cuhk.edu.hk;;https://www.huawei.com", "aff_unique_abbr": "CUHK;;Huawei", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Linjie and Liu,\n Jianzhuang and Tang,\n Xiaoou\n},\n title = {\n Complex 3D General Object Reconstruction from Line Drawings\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e2943f5626", @@ -2524,7 +2608,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Vahdat_2013_ICCV,\n \n author = {\n Vahdat,\n Arash and Cannons,\n Kevin and Mori,\n Greg and Oh,\n Sangmin and Kim,\n Ilseo\n},\n title = {\n Compositional Models for Video Event Detection: A Multiple Kernel Learning Latent Variable Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "60b045d37a", @@ -2533,7 +2618,7 @@ "author": "Ping Wei; Nanning Zheng; Yibiao Zhao; Song-Chun Zhu", "abstract": "Action recognition has often been posed as a classification problem, which assumes that a video sequence only have one action class label and different actions are independent. However, a single human body can perform multiple concurrent actions at the same time, and different actions interact with each other. 
This paper proposes a concurrent action detection model where the action detection is formulated as a structural prediction problem. In this model, an interval in a video sequence can be described by multiple action labels. An detected action interval is determined both by the unary local detector and the relations with other actions. We use a wavelet feature to represent the action sequence, and design a composite temporal logic descriptor to describe the action relations. The model parameters are trained by structural SVM learning. Given a long video sequence, a sequential decision window search algorithm is designed to detect the actions. Experiments on our new collected concurrent action dataset demonstrate the strength of our method.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Wei_Concurrent_Action_Detection_2013_ICCV_paper.pdf", - "aff": "Xi\u2019an Jiaotong University, China+University of California, Los Angeles, USA; Xi\u2019an Jiaotong University, China; University of California, Los Angeles, USA; University of California, Los Angeles, USA", + "aff": "Xi’an Jiaotong University, China+University of California, Los Angeles, USA; Xi’an Jiaotong University, China; University of California, Los Angeles, USA; University of California, Los Angeles, USA", "project": "", "github": "", "supp": "", @@ -2548,14 +2633,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;1;1", - "aff_unique_norm": "Xi'an Jiao Tong University;University of California, Los Angeles", + "aff_unique_norm": "Xi'an Jiaotong University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "http://en.xjtu.edu.cn/;https://www.ucla.edu", "aff_unique_abbr": "XJTU;UCLA", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+1;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wei_2013_ICCV,\n \n author = {\n 
Wei,\n Ping and Zheng,\n Nanning and Zhao,\n Yibiao and Zhu,\n Song-Chun\n},\n title = {\n Concurrent Action Detection with Structural Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0990f58af8", @@ -2586,7 +2672,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "Germany;Unknown" + "aff_country_unique": "Germany;Unknown", + "bibtex": "@InProceedings{Schiegg_2013_ICCV,\n \n author = {\n Schiegg,\n Martin and Hanslovsky,\n Philipp and Kausler,\n Bernhard X. and Hufnagel,\n Lars and Hamprecht,\n Fred A.\n},\n title = {\n Conservation Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8566871296", @@ -2610,14 +2697,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;1;1;0", - "aff_unique_norm": "University of Chinese Academy of Sciences;Microsoft", + "aff_unique_norm": "University of Chinese Academy of Sciences;Microsoft Research", "aff_unique_dep": "State Key Laboratory of Computer Science;Research", "aff_unique_url": "http://www.ucas.ac.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "UCAS;MSR Asia", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2013_ICCV,\n \n author = {\n Ma,\n Ziyang and He,\n Kaiming and Wei,\n Yichen and Sun,\n Jian and Wu,\n Enhua\n},\n title = {\n Constant Time Weighted Median Filtering for Stereo Matching and Beyond\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "df8589e641", @@ -2626,7 +2714,7 @@ "author": 
"Dapeng Chen; Zejian Yuan; Yang Wu; Geng Zhang; Nanning Zheng", "abstract": "Representation is a fundamental problem in object tracking. Conventional methods track the target by describing its local or global appearance. In this paper we present that, besides the two paradigms, the composition of local region histograms can also provide diverse and important object cues. We use cells to extract local appearance, and construct complex cells to integrate the information from cells. With different spatial arrangements of cells, complex cells can explore various contextual information at multiple scales, which is important to improve the tracking performance. We also develop a novel template-matching algorithm for object tracking, where the template is composed of temporal varying cells and has two layers to capture the target and background appearance respectively. An adaptive weight is associated with each complex cell to cope with occlusion as well as appearance variation. A fusion weight is associated with each complex cell type to preserve the global distinctiveness. 
Our algorithm is evaluated on 25 challenging sequences, and the results not only confirm the contribution of each component in our tracking system, but also outperform other competing trackers.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Chen_Constructing_Adaptive_Complex_2013_ICCV_paper.pdf", - "aff": "Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Academic Center for Computing and Media Studies, Kyoto University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University", + "aff": "Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Academic Center for Computing and Media Studies, Kyoto University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University", "project": "", "github": "", "supp": "", @@ -2641,14 +2729,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Xi'an Jiao Tong University;Kyoto University", + "aff_unique_norm": "Xi'an Jiaotong University;Kyoto University", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;Academic Center for Computing and Media Studies", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.kyoto-u.ac.jp", "aff_unique_abbr": "XJTU;Kyoto U", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Xi'an;Kyoto", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Dapeng and Yuan,\n Zejian and Wu,\n Yang and Zhang,\n Geng and Zheng,\n Nanning\n},\n title = {\n Constructing Adaptive 
Complex Cells for Robust Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b15df050ea", @@ -2672,14 +2761,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1+0;0", - "aff_unique_norm": "Microsoft;Tsinghua University", + "aff_unique_norm": "Microsoft Research;Tsinghua University", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MSR Asia;THU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Asia;", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2013_ICCV,\n \n author = {\n He,\n Kaiming and Chang,\n Huiwen and Sun,\n Jian\n},\n title = {\n Content-Aware Rotation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3f2eee754b", @@ -2710,7 +2800,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Adelaide", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Xi and Li,\n Yao and Shen,\n Chunhua and Dick,\n Anthony and Van Den Hengel,\n Anton\n},\n title = {\n Contextual Hypergraph Modeling for Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8b5022a686", @@ -2732,7 +2823,8 @@ "email": "", "author_num": 1, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Finlayson_2013_ICCV,\n \n author = {\n Finlayson,\n Graham D.\n},\n title = {\n Corrected-Moment Illuminant Estimation\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b08cd068e8", @@ -2763,7 +2855,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Lu_2013_ICCV,\n \n author = {\n Lu,\n Canyi and Feng,\n Jiashi and Lin,\n Zhouchen and Yan,\n Shuicheng\n},\n title = {\n Correlation Adaptive Subspace Segmentation by Trace Lasso\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c805665858", @@ -2787,14 +2880,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0;2;0;3", - "aff_unique_norm": "National University of Singapore;Nanjing University of Science and Technology;Sun Yat-sen University;Peking University", + "aff_unique_norm": "National University of Singapore;Nanjing University of Science and Technology;Sun Yat-Sen University;Peking University", "aff_unique_dep": "Department of Electrical and Computer Engineering;School of Computer Science;School of Software;School of EECS", "aff_unique_url": "https://www.nus.edu.sg;http://www.nust.edu.cn;http://www.sysu.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "NUS;NUST;SYSU;PKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Lu_2013_ICCV,\n \n author = {\n Lu,\n Canyi and Tang,\n Jinhui and Lin,\n Min and Lin,\n Liang and Yan,\n Shuicheng and Lin,\n Zhouchen\n},\n title = {\n Correntropy Induced L2 Graph for Robust Subspace Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { 
"id": "148ea70ac8", @@ -2825,7 +2919,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dai_2013_ICCV,\n \n author = {\n Dai,\n Jifeng and Wu,\n Ying Nian and Zhou,\n Jie and Zhu,\n Song-Chun\n},\n title = {\n Cosegmentation and Cosketch by Unsupervised Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b74fb64921", @@ -2856,7 +2951,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2013_ICCV,\n \n author = {\n Huang,\n De-An and Wang,\n Yu-Chiang Frank\n},\n title = {\n Coupled Dictionary and Feature Space Learning with Applications to Cross-Domain Image Synthesis and Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "75352bc5bb", @@ -2887,7 +2983,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2013_ICCV,\n \n author = {\n Huang,\n Zhiwu and Zhao,\n Xiaowei and Shan,\n Shiguang and Wang,\n Ruiping and Chen,\n Xilin\n},\n title = {\n Coupling Alignments with Recognition for Still-to-Video Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d3b4fc9901", @@ -2911,14 +3008,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;1;1;1;0", - "aff_unique_norm": "Chinese 
University of Hong Kong;Qualcomm Incorporated", + "aff_unique_norm": "The Chinese University of Hong Kong;Qualcomm Incorporated", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.qualcomm.com", "aff_unique_abbr": "CUHK;Qualcomm", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;1;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yan_2013_ICCV,\n \n author = {\n Yan,\n Qiong and Shen,\n Xiaoyong and Xu,\n Li and Zhuo,\n Shaojie and Zhang,\n Xiaopeng and Shen,\n Liang and Jia,\n Jiaya\n},\n title = {\n Cross-Field Joint Image Restoration via Scale Map\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "983883a0d8", @@ -2949,7 +3047,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2013_ICCV,\n \n author = {\n Wu,\n Xinxiao and Wang,\n Han and Liu,\n Cuiwei and Jia,\n Yunde\n},\n title = {\n Cross-View Action Recognition over Heterogeneous Feature Spaces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "85a4aedcb3", @@ -2971,7 +3070,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Kim_2013_ICCV,\n \n author = {\n Kim,\n Kwang In and Tompkin,\n James and Theobalt,\n Christian\n},\n title = {\n Curvature-Aware Regularization on Riemannian Submanifolds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "49bce1e9c5", @@ -2993,7 +3093,8 @@ "email": 
";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Eshet_2013_ICCV,\n \n author = {\n Eshet,\n Yaron and Korman,\n Simon and Ofek,\n Eyal and Avidan,\n Shai\n},\n title = {\n DCSH - Matching Patches in RGBD Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6bed9c5af8", @@ -3024,7 +3125,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fouhey_2013_ICCV,\n \n author = {\n Fouhey,\n David F. and Gupta,\n Abhinav and Hebert,\n Martial\n},\n title = {\n Data-Driven 3D Primitives for Single Image Understanding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9fc75ac4a1", @@ -3048,14 +3150,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Hebrew University;Adobe", + "aff_unique_norm": "Hebrew University;Adobe Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.huji.ac.il;https://research.adobe.com", "aff_unique_abbr": "HUJI;Adobe", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Jerusalem;Seattle", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Hacohen_2013_ICCV,\n \n author = {\n Hacohen,\n Yoav and Shechtman,\n Eli and Lischinski,\n Dani\n},\n title = {\n Deblurring by Example Using Dense Correspondence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e27c9bef43", @@ -3077,7 +3180,8 @@ "email": ";;", "author_num": 3, 
"track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Gandhi_2013_ICCV,\n \n author = {\n Gandhi,\n Ankit and Alahari,\n Karteek and Jawahar,\n C.V.\n},\n title = {\n Decomposing Bag of Words Histograms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "adcdce2293", @@ -3101,14 +3205,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0+0+1;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;Chinese Academy of Sciences", "aff_unique_dep": "Department of Information Engineering;Shenzhen Institutes of Advanced Technology", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.cas.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0+1;0+0+1;0;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0+0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2013_ICCV,\n \n author = {\n Zhu,\n Zhenyao and Luo,\n Ping and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n Deep Learning Identity-Preserving Face Space\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "41a29def6e", @@ -3139,7 +3244,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Grenoble", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Weinzaepfel_2013_ICCV,\n \n author = {\n Weinzaepfel,\n Philippe and Revaud,\n Jerome and Harchaoui,\n Zaid and Schmid,\n Cordelia\n},\n title = {\n DeepFlow: Large Displacement Optical Flow with Deep Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5477f109d0", @@ -3170,7 +3276,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Ning and Farrell,\n Ryan and Iandola,\n Forrest and Darrell,\n Trevor\n},\n title = {\n Deformable Part Descriptors for Fine-Grained Recognition and Attribute Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "93566c0e1e", @@ -3192,7 +3299,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Tao_2013_ICCV,\n \n author = {\n Tao,\n Michael W. and Hadap,\n Sunil and Malik,\n Jitendra and Ramamoorthi,\n Ravi\n},\n title = {\n Depth from Combining Defocus and Correspondence Using Light-Field Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1ee9fd14c5", @@ -3223,7 +3331,8 @@ "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Stanford;Chapel Hill", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Russakovsky_2013_ICCV,\n \n author = {\n Russakovsky,\n Olga and Deng,\n Jia and Huang,\n Zhiheng and Berg,\n Alexander C. 
and Fei-Fei,\n Li\n},\n title = {\n Detecting Avocados to Zucchinis: What Have We Done,\n and Where Are We Going?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bc421c4742", @@ -3245,7 +3354,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Lee_2013_ICCV,\n \n author = {\n Lee,\n Tom Sie Ho and Fidler,\n Sanja and Dickinson,\n Sven\n},\n title = {\n Detecting Curved Symmetric Parts Using a Deformable Disc Model\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "847de9abc4", @@ -3276,7 +3386,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Diaz_2013_ICCV,\n \n author = {\n Diaz,\n Raul and Hallman,\n Sam and Fowlkes,\n Charless C.\n},\n title = {\n Detecting Dynamic Objects with Multi-view Background Subtraction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "861f63c694", @@ -3300,14 +3411,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne", "aff_unique_dep": "CVLab", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Turetken_2013_ICCV,\n \n author = {\n Turetken,\n Engin and Becker,\n Carlos and Glowacki,\n Przemyslaw and 
Benmansour,\n Fethallah and Fua,\n Pascal\n},\n title = {\n Detecting Irregular Curvilinear Structures in Gray Scale and Color Imagery Using Multi-directional Oriented Flux\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0ac323cdf2", @@ -3329,7 +3441,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Lee_2013_ICCV,\n \n author = {\n Lee,\n Kwang Hee and Lee,\n Sang Wook\n},\n title = {\n Deterministic Fitting of Multiple Structures Using Iterative MaxFS with Inlier Scale Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "91a5e7a8be", @@ -3351,7 +3464,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Harandi_2013_ICCV,\n \n author = {\n Harandi,\n Mehrtash and Sanderson,\n Conrad and Shen,\n Chunhua and Lovell,\n Brian C.\n},\n title = {\n Dictionary Learning and Sparse Coding on Grassmann Manifolds: An Extrinsic Solution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "656bf270c3", @@ -3382,7 +3496,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + "bibtex": "@InProceedings{Kneip_2013_ICCV,\n \n author = {\n Kneip,\n Laurent and Lynen,\n Simon\n},\n title = {\n Direct Optimization of Frame-to-Frame Rotation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2d107fe07c", @@ -3391,7 +3506,7 @@ "author": "Ling 
Wang; Hichem Sahbi", "abstract": "One of the trends of action recognition consists in extracting and comparing mid-level features which encode visual and motion aspects of objects into scenes. However, when scenes contain high-level semantic actions with many interacting parts, these mid-level features are not sufficient to capture high level structures as well as high order causal relationships between moving objects resulting into a clear drop in performances. In this paper, we address this issue and we propose an alternative action recognition method based on a novel graph kernel. In the main contributions of this work, we first describe actions in videos using directed acyclic graphs (DAGs), that naturally encode pairwise interactions between moving object parts, and then we compare these DAGs by analyzing the spectrum of their sub-patterns that capture complex higher order interactions. This extraction and comparison process is computationally tractable, resulting from the acyclic property of DAGs, and it also defines a positive semi-definite kernel. 
When plugging the latter into support vector machines, we obtain an action recognition algorithm that overtakes related work, including graph-based methods, on a standard evaluation dataset.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Wang_Directed_Acyclic_Graph_2013_ICCV_paper.pdf", - "aff": "Institut Mines-T\u00e9l\u00e9com + T\u00e9l\u00e9com ParisTech + CNRS LTCI; CNRS LTCI + T\u00e9l\u00e9com ParisTech", + "aff": "Institut Mines-Télécom + Télécom ParisTech + CNRS LTCI; CNRS LTCI + Télécom ParisTech", "project": "", "github": "", "supp": "", @@ -3406,14 +3521,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1+2;2+1", - "aff_unique_norm": "Institut Mines-T\u00e9l\u00e9com;T\u00e9l\u00e9com ParisTech;CNRS", + "aff_unique_norm": "Institut Mines-Télécom;Télécom ParisTech;CNRS", "aff_unique_dep": ";;Laboratoire Traitement du signal et des images", "aff_unique_url": "https://www.imt.fr;https://www.telecom-paristech.fr;https://www.ltci.cnrs.fr", "aff_unique_abbr": "IMT;TP;LTCI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Ling and Sahbi,\n Hichem\n},\n title = {\n Directed Acyclic Graph Kernels for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "21b071322e", @@ -3444,7 +3560,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Aachen", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Weyand_2013_ICCV,\n \n author = {\n Weyand,\n Tobias and Leibe,\n Bastian\n},\n title = {\n Discovering Details and Scene Structure with Hierarchical Iconoid Shift\n},\n booktitle = {\n Proceedings of the IEEE International Conference on 
Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ed69882a10", @@ -3475,7 +3592,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yao_2013_ICCV,\n \n author = {\n Yao,\n Bangpeng and Ma,\n Jiayuan and Fei-Fei,\n Li\n},\n title = {\n Discovering Object Functionality\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "df012fb049", @@ -3506,7 +3624,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Gao_2013_ICCV,\n \n author = {\n Gao,\n Jin and Xing,\n Junliang and Hu,\n Weiming and Maybank,\n Steve\n},\n title = {\n Discriminant Tracking Using Tensor Representation with Semi-supervised Improvement\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "691d7db1a0", @@ -3515,7 +3634,7 @@ "author": "K.C. Amit Kumar; Christophe De Vleeschouwer", "abstract": "Given a set of plausible detections, detected at each time instant independently, we investigate how to associate them across time. This is done by propagating labels on a set of graphs that capture how the spatio-temporal and the appearance cues promote the assignment of identical or distinct labels to a pair of nodes. The graph construction is driven by the locally linear embedding (LLE) of either the spatio-temporal or the appearance features associated to the detections. 
Interestingly, the neighborhood of a node in each appearance graph is defined to include all nodes for which the appearance feature is available (except the ones that coexist at the same time). This allows to connect the nodes that share the same appearance even if they are temporally distant, which gives our framework the uncommon ability to exploit the appearance features that are available only sporadically along the sequence of detections. Once the graphs have been defined, the multi-object tracking is formulated as the problem of finding a label assignment that is consistent with the constraints captured by each of the graphs. This results into a difference of convex program that can be efficiently solved. Experiments are performed on a basketball and several well-known pedestrian datasets in order to validate the effectiveness of the proposed solution.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Kumar_Discriminative_Label_Propagation_2013_ICCV_paper.pdf", - "aff": "ISPGroup, ELEN Department, ICTEAM Institute, Universit \u00b4e catholique de Louvain; ISPGroup, ELEN Department, ICTEAM Institute, Universit \u00b4e catholique de Louvain", + "aff": "ISPGroup, ELEN Department, ICTEAM Institute, Universit ´e catholique de Louvain; ISPGroup, ELEN Department, ICTEAM Institute, Universit ´e catholique de Louvain", "project": "", "github": "", "supp": "", @@ -3530,14 +3649,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0", - "aff_unique_norm": "Universit \u00e9 catholique de Louvain", + "aff_unique_norm": "Universit é catholique de Louvain", "aff_unique_dep": "ELEN Department", "aff_unique_url": "https://www.uclouvain.be", "aff_unique_abbr": "UCLouvain", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Kumar_2013_ICCV,\n \n author = {\n Kumar,\n K.C. 
Amit and De Vleeschouwer,\n Christophe\n},\n title = {\n Discriminative Label Propagation for Multi-object Tracking with Sporadic Appearance Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4e98147b3e", @@ -3561,14 +3681,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "Centro de Investigaci\u00f3n y Estudios Avanzados del IPN;KU Leuven", + "aff_unique_norm": "Centro de Investigación y Estudios Avanzados del IPN;KU Leuven", "aff_unique_dep": "Robotics and Advanced Manufacturing;ESAT-PSI-VISICS", "aff_unique_url": "https://www.cinvestav.mx;https://www.kuleuven.be", "aff_unique_abbr": "CINVESTAV;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Mexico;Belgium" + "aff_country_unique": "Mexico;Belgium", + "bibtex": "@InProceedings{Rios-Cabrera_2013_ICCV,\n \n author = {\n Rios-Cabrera,\n Reyes and Tuytelaars,\n Tinne\n},\n title = {\n Discriminatively Trained Templates for 3D Object Detection: A Real Time Scalable Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "49d5a9f6e7", @@ -3599,7 +3720,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Berkeley;Stanford;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Talwalkar_2013_ICCV,\n \n author = {\n Talwalkar,\n Ameet and Mackey,\n Lester and Mu,\n Yadong and Chang,\n Shih-Fu and Jordan,\n Michael I.\n},\n title = {\n Distributed Low-Rank Subspace Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0855829f39", @@ 
-3621,7 +3743,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Mirrashed_2013_ICCV,\n \n author = {\n Mirrashed,\n Fatemeh and Rastegari,\n Mohammad\n},\n title = {\n Domain Adaptive Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "732a3a2561", @@ -3652,7 +3775,8 @@ "aff_campus_unique_index": "0+1;0;0", "aff_campus_unique": "Hong Kong SAR;Zhuhai", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2013_ICCV,\n \n author = {\n Ma,\n Andy J. and Yuen,\n Pong C. and Li,\n Jiawei\n},\n title = {\n Domain Transfer Support Vector Ranking for Person Re-identification without Target Camera Label Information\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e4d8704e4a", @@ -3683,7 +3807,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Kazmar_2013_ICCV,\n \n author = {\n Kazmar,\n Tomas and Kvon,\n Evgeny Z. 
and Stark,\n Alexander and Lampert,\n Christoph H.\n},\n title = {\n Drosophila Embryo Stage Annotation Using Label Propagation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1f051056c0", @@ -3714,7 +3839,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Stanford;San Diego;", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Bo and Tu,\n Zhuowen and Tsotsos,\n John K.\n},\n title = {\n Dynamic Label Propagation for Semi-supervised Multi-class Multi-label Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8f5cabce03", @@ -3745,7 +3871,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Weixin and Yu,\n Qian and Divakaran,\n Ajay and Vasconcelos,\n Nuno\n},\n title = {\n Dynamic Pooling for Complex Event Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "14ae35eb63", @@ -3776,7 +3903,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ulusoy_2013_ICCV,\n \n author = {\n Ulusoy,\n Ali Osman and Biris,\n Octavian and Mundy,\n Joseph L.\n},\n title = {\n Dynamic Probabilistic Volumetric Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5c1d64a11b", @@ -3807,7 +3935,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2013_ICCV,\n \n author = {\n Kim,\n Tae Hyun and Ahn,\n Byeongjoo and Lee,\n Kyoung Mu\n},\n title = {\n Dynamic Scene Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5227216cf0", @@ -3832,13 +3961,14 @@ "status": "Poster", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Pennsylvania;Google;University of Washington", - "aff_unique_dep": ";Google;", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.upenn.edu;https://www.google.com;https://www.washington.edu", "aff_unique_abbr": "UPenn;Google;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Weiss_2013_ICCV,\n \n author = {\n Weiss,\n David and Sapp,\n Benjamin and Taskar,\n Ben\n},\n title = {\n Dynamic Structured Model Selection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0a8ab9ebcf", @@ -3869,7 +3999,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fragoso_2013_ICCV,\n \n author = {\n Fragoso,\n Victor and Sen,\n Pradeep and Rodriguez,\n Sergio and Turk,\n Matthew\n},\n title = {\n EVSAC: Accelerating Hypotheses Generation by Modeling Matching Scores with Extreme Value Theory\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b2d73579cf", @@ -3900,7 +4031,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;1", - "aff_country_unique": "United Kingdom;Australia" + "aff_country_unique": "United Kingdom;Australia", + "bibtex": "@InProceedings{Kahler_2013_ICCV,\n \n author = {\n Kahler,\n Olaf and Reid,\n Ian\n},\n title = {\n Efficient 3D Scene Labeling Using Fields of Trees\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "42f4371570", @@ -3931,7 +4063,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Xu_2013_ICCV,\n \n author = {\n Xu,\n Chi and Cheng,\n Li\n},\n title = {\n Efficient Hand Pose Estimation from a Single Depth Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ee7b59a3bd", @@ -3955,14 +4088,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "Microsoft;Indian Institute of Science", + "aff_unique_norm": "Microsoft Corporation;Indian Institute of Science", "aff_unique_dep": "Microsoft Research;Department of Electrical Engineering", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.iisc.ac.in", "aff_unique_abbr": "MSR;IISc", "aff_campus_unique_index": "1", "aff_campus_unique": ";Bengaluru", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Jain_2013_ICCV,\n \n author = {\n Jain,\n Suraj and Govindu,\n Venu Madhav\n},\n title = {\n 
Efficient Higher-Order Clustering on the Grassmann Manifold\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8554c21e0c", @@ -3993,7 +4127,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Meng_2013_ICCV,\n \n author = {\n Meng,\n Gaofeng and Wang,\n Ying and Duan,\n Jiangyong and Xiang,\n Shiming and Pan,\n Chunhong\n},\n title = {\n Efficient Image Dehazing with Boundary Constraint and Contextual Regularization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "70e7b82391", @@ -4017,14 +4152,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Adelaide", + "aff_unique_norm": "The University of Adelaide", "aff_unique_dep": "Australian Centre for Visual Technologies", "aff_unique_url": "https://www.adelaide.edu.au", "aff_unique_abbr": "Adelaide", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Adelaide", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Paisitkriangkrai_2013_ICCV,\n \n author = {\n Paisitkriangkrai,\n Sakrapee and Shen,\n Chunhua and Van Den Hengel,\n Anton\n},\n title = {\n Efficient Pedestrian Detection by Directly Optimizing the Partial Area under the ROC Curve\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9f01d2e412", @@ -4034,7 +4170,7 @@ "abstract": "Detecting visually salient regions in images is one of the fundamental problems in computer vision. 
We propose a novel method to decompose an image into large scale perceptually homogeneous elements for efficient salient region detection, using a soft image abstraction representation. By considering both appearance similarity and spatial distribution of image pixels, the proposed representation abstracts out unnecessary image details, allowing the assignment of comparable saliency values across similar regions, and producing perceptually accurate salient region detection. We evaluate our salient region detection approach on the largest publicly available dataset with pixel accurate annotations. The experimental results show that the proposed method outperforms 18 alternate methods, reducing the mean absolute error by 25.2% compared to the previous best result, while being computationally more efficient.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Cheng_Efficient_Salient_Region_2013_ICCV_paper.pdf", "aff": "Vision Group, Oxford Brookes University; Vision Group, Oxford Brookes University; Vision Group, Oxford Brookes University; Vision Group, Oxford Brookes University; Vision Group, Oxford Brookes University; Vision Group, Oxford Brookes University", - "project": "http://mmcheng.net/ef\ufb01salobj/", + "project": "http://mmcheng.net/effisalobj/", "github": "", "supp": "", "arxiv": "", @@ -4055,7 +4191,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Cheng_2013_ICCV,\n \n author = {\n Cheng,\n Ming-Ming and Warrell,\n Jonathan and Lin,\n Wen-Yan and Zheng,\n Shuai and Vineet,\n Vibhav and Crook,\n Nigel\n},\n title = {\n Efficient Salient Region Detection with Soft Image Abstraction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f99de85dd3", @@ -4086,7 +4223,8 @@ 
"aff_campus_unique_index": "0;0", "aff_campus_unique": "Bengaluru", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Chatterjee_2013_ICCV,\n \n author = {\n Chatterjee,\n Avishek and Govindu,\n Venu Madhav\n},\n title = {\n Efficient and Robust Large-Scale Rotation Averaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0a44b66322", @@ -4117,7 +4255,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2013_ICCV,\n \n author = {\n Zhou,\n Qian-Yi and Miller,\n Stephen and Koltun,\n Vladlen\n},\n title = {\n Elastic Fragments for Dense Scene Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a4b2db21bb", @@ -4126,7 +4265,7 @@ "author": "Emanuele Rodola; Andrea Torsello; Tatsuya Harada; Yasuo Kuniyoshi; Daniel Cremers", "abstract": "We consider a parametrized relaxation of the widely adopted quadratic assignment problem (QAP) formulation for minimum distortion correspondence between deformable shapes. 
In order to control the accuracy/sparsity trade-off we introduce a weighting parameter on the combination of two existing relaxations, namely spectral and", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Rodola_Elastic_Net_Constraints_2013_ICCV_paper.pdf", - "aff": "The University of Tokyo + TU Munich; Universit `a Ca\u2019 Foscari V enezia; The University of Tokyo; The University of Tokyo; TU Munich", + "aff": "The University of Tokyo + TU Munich; Universit `a Ca’ Foscari V enezia; The University of Tokyo; The University of Tokyo; TU Munich", "project": "", "github": "", "supp": "", @@ -4141,14 +4280,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;2;0;0;1", - "aff_unique_norm": "University of Tokyo;Technical University of Munich;Universit\u00e0 Ca' Foscari Venezia", + "aff_unique_norm": "University of Tokyo;Technical University of Munich;Università Ca' Foscari Venezia", "aff_unique_dep": ";;", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.tum.de;https://www.unive.it", - "aff_unique_abbr": "UTokyo;TUM;UCV", + "aff_unique_abbr": "UTokyo;TUM;UNIVE", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Venezia", "aff_country_unique_index": "0+1;2;0;0;1", - "aff_country_unique": "Japan;Germany;Italy" + "aff_country_unique": "Japan;Germany;Italy", + "bibtex": "@InProceedings{Rodola_2013_ICCV,\n \n author = {\n Rodola,\n Emanuele and Torsello,\n Andrea and Harada,\n Tatsuya and Kuniyoshi,\n Yasuo and Cremers,\n Daniel\n},\n title = {\n Elastic Net Constraints for Shape Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f13b71a562", @@ -4157,7 +4297,7 @@ "author": "Guoqing Zhou; Qing Wang", "abstract": "Optimization using the L ? norm has been becoming an effective way to solve parameter estimation problems in multiview geometry. 
But the computational cost increases rapidly with the size of measurement data. Although some strategies have been presented to improve the efficiency of L ? optimization, it is still an open issue. In the paper, we propose a novel approach under the framework of enhanced continuous tabu search (ECTS) for generic parameter estimation in multiview geometry. ECTS is an optimization method in the domain of artificial intelligence, which has an interesting ability of covering a wide solution space by promoting the search far away from current solution and consecutively decreasing the possibility of trapping in the local minima. Taking the triangulation as an example, we propose the corresponding ways in the key steps of ECTS, diversification and intensification. We also present theoretical proof to guarantee the global convergence of search with probability one. Experimental results have validated that the ECTS based approach can obtain global optimum efficiently, especially for large scale dimension of parameter. Potentially, the novel ECTS based algorithm can be applied in many applications of multiview geometry.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zhou_Enhanced_Continuous_Tabu_2013_ICCV_paper.pdf", - "aff": "School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an 710072, P. R. China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an 710072, P. R. China", + "aff": "School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an 710072, P. R. China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an 710072, P. R. 
China", "project": "", "github": "", "supp": "", @@ -4179,7 +4319,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2013_ICCV,\n \n author = {\n Zhou,\n Guoqing and Wang,\n Qing\n},\n title = {\n Enhanced Continuous Tabu Search for Parameter Estimation in Multiview Geometry\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "45cde73f5a", @@ -4210,7 +4351,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Dai_2013_ICCV,\n \n author = {\n Dai,\n Dengxin and Van Gool,\n Luc\n},\n title = {\n Ensemble Projection for Semi-supervised Image Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a8721f706d", @@ -4219,7 +4361,7 @@ "author": "Silvia Zuffi; Javier Romero; Cordelia Schmid; Michael J. Black", "abstract": "We address the problem of upper-body human pose estimation in uncontrolled monocular video sequences, without manual initialization. Most current methods focus on isolated video frames and often fail to correctly localize arms and hands. Inferring pose over a video sequence is advantageous because poses of people in adjacent frames exhibit properties of smooth variation due to the nature of human and camera motion. To exploit this, previous methods have used prior knowledge about distinctive actions or generic temporal priors combined with static image likelihoods to track people in motion. 
Here we take a different approach based on a simple observation: Information about how a person moves from frame to frame is present in the optical flow field. We develop an approach for tracking articulated motions that \"links\" articulated shape models of people in adjacent frames through the dense optical flow. Key to this approach is a 2D shape model of the body that we use to compute how the body moves over time. The resulting \"flowing puppets\" provide a way of integrating image evidence across frames to improve pose inference. We apply our method on a challenging dataset of TV video sequences and show state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zuffi_Estimating_Human_Pose_2013_ICCV_paper.pdf", - "aff": "Department of Computer Science, Brown University, Providence, RI 02912, USA + ITC - Consiglio Nazionale delle Ricerche, Milan, Italy; Max Planck Institute for Intelligent Systems, 72076 T\u00a8ubingen, Germany; INRIA - Grenoble, France; Max Planck Institute for Intelligent Systems, 72076 T\u00a8ubingen, Germany", + "aff": "Department of Computer Science, Brown University, Providence, RI 02912, USA + ITC - Consiglio Nazionale delle Ricerche, Milan, Italy; Max Planck Institute for Intelligent Systems, 72076 T¨ubingen, Germany; INRIA - Grenoble, France; Max Planck Institute for Intelligent Systems, 72076 T¨ubingen, Germany", "project": "", "github": "", "supp": "", @@ -4239,9 +4381,10 @@ "aff_unique_url": "https://www.brown.edu;https://www.cnr.it;https://www.mpi-is.mpg.de;https://www.inria.fr", "aff_unique_abbr": "Brown;CNR;MPI-IS;INRIA", "aff_campus_unique_index": "0+1;2;3;2", - "aff_campus_unique": "Providence;Milan;T\u00fcbingen;Grenoble", + "aff_campus_unique": "Providence;Milan;Tübingen;Grenoble", "aff_country_unique_index": "0+1;2;3;2", - "aff_country_unique": "United States;Italy;Germany;France" + "aff_country_unique": "United States;Italy;Germany;France", + "bibtex": "@InProceedings{Zuffi_2013_ICCV,\n 
\n author = {\n Zuffi,\n Silvia and Romero,\n Javier and Schmid,\n Cordelia and Black,\n Michael J.\n},\n title = {\n Estimating Human Pose with Flowing Puppets\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7aa216f645", @@ -4272,14 +4415,15 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;1;2", - "aff_country_unique": "China;Switzerland;United States" + "aff_country_unique": "China;Switzerland;United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Jian and Kan,\n Chen and Schwing,\n Alexander G. and Urtasun,\n Raquel\n},\n title = {\n Estimating the 3D Layout of Indoor Scenes and Its Clutter from Depth Sensors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3f729d590a", "title": "Estimating the Material Properties of Fabric from Video", "site": "http://openaccess.thecvf.com/content_iccv_2013/html/Bouman_Estimating_the_Material_2013_ICCV_paper.html", "author": "Katherine L. Bouman; Bei Xiao; Peter Battaglia; William T. Freeman", - "abstract": "Passively estimating the intrinsic material properties of deformable objects moving in a natural environment is essential for scene understanding. We present a framework to automatically analyze videos of fabrics moving under various unknown wind forces, and recover two key material properties of the fabric: stiffness and area weight. We extend features previously developed to compactly represent static image textures to describe video textures, such as fabric motion. A discriminatively trained regression model is then used to predict the physical properties of fabric from these features. 
The success of our model is demonstrated on a new, publicly available database of fabric videos with corresponding measured ground truth material properties. We show that our predictions are well correlated with ground truth measurements of stiffness and density for the fabrics. Our contributions include: (a) a database that can be used for training and testing algorithms for passively predicting fabric properties from video, (b) an algorithm for predicting the material properties of fabric from a video, and (c) a perceptual study of humans\u00e2\u0080\u0099 ability to estimate the material properties of fabric from videos and images.", + "abstract": "Passively estimating the intrinsic material properties of deformable objects moving in a natural environment is essential for scene understanding. We present a framework to automatically analyze videos of fabrics moving under various unknown wind forces, and recover two key material properties of the fabric: stiffness and area weight. We extend features previously developed to compactly represent static image textures to describe video textures, such as fabric motion. A discriminatively trained regression model is then used to predict the physical properties of fabric from these features. The success of our model is demonstrated on a new, publicly available database of fabric videos with corresponding measured ground truth material properties. We show that our predictions are well correlated with ground truth measurements of stiffness and density for the fabrics. 
Our contributions include: (a) a database that can be used for training and testing algorithms for passively predicting fabric properties from video, (b) an algorithm for predicting the material properties of fabric from a video, and (c) a perceptual study of humans’ ability to estimate the material properties of fabric from videos and images.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Bouman_Estimating_the_Material_2013_ICCV_paper.pdf", "aff": "Massachusetts Institute of Technology; Massachusetts Institute of Technology; Massachusetts Institute of Technology; Massachusetts Institute of Technology", "project": "", @@ -4303,7 +4447,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bouman_2013_ICCV,\n \n author = {\n Bouman,\n Katherine L. and Xiao,\n Bei and Battaglia,\n Peter and Freeman,\n William T.\n},\n title = {\n Estimating the Material Properties of Fabric from Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "80a5acf348", @@ -4334,7 +4479,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Beijing;Troy", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Yifan and Ji,\n Qiang and Lu,\n Hanqing\n},\n title = {\n Event Detection in Complex Scenes Using Interval Temporal Constraints\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "fb89069b97", @@ -4358,14 +4504,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "ETH Zurich;European Space Agency", 
- "aff_unique_dep": "Computer Vision Lab;", + "aff_unique_norm": "ETH Zurich;ESA", + "aff_unique_dep": "Computer Vision Lab;T", "aff_unique_url": "https://www.ethz.ch;https://www.esa.int", "aff_unique_abbr": "ETHZ;ESA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Bossard_2013_ICCV,\n \n author = {\n Bossard,\n Lukas and Guillaumin,\n Matthieu and Van Gool,\n Luc\n},\n title = {\n Event Recognition in Photo Collections with a Stopwatch HMM\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f77ef28ce4", @@ -4374,7 +4521,7 @@ "author": "Dengxin Dai; Hayko Riemenschneider; Gerhard Schmitt; Luc Van Gool", "abstract": "There is an increased interest in the efficient creation of city models, be it virtual or as-built. We present a method for synthesizing complex, photo-realistic facade images, from a single example. After parsing the example image into its semantic components, a tiling for it is generated. Novel tilings can then be created, yielding facade textures with different dimensions or with occluded parts inpainted. A genetic algorithm guides the novel facades as well as inpainted parts to be consistent with the example, both in terms of their overall structure and their detailed textures. 
Promising results for multiple standard datasets in particular for the different building styles they contain demonstrate the potential of the method.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Dai_Example-Based_Facade_Texture_2013_ICCV_paper.pdf", - "aff": "Computer Vision Lab, ETH Z\u00fcrich; Computer Vision Lab, ETH Z\u00fcrich; Chair of Information Architecture, ETH Z\u00fcrich; Computer Vision Lab, ETH Z\u00fcrich", + "aff": "Computer Vision Lab, ETH Zürich; Computer Vision Lab, ETH Zürich; Chair of Information Architecture, ETH Zürich; Computer Vision Lab, ETH Zürich", "project": "", "github": "", "supp": "", @@ -4389,14 +4536,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "Computer Vision Lab", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Z\u00fcrich;", + "aff_campus_unique": "Zürich;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Dai_2013_ICCV,\n \n author = {\n Dai,\n Dengxin and Riemenschneider,\n Hayko and Schmitt,\n Gerhard and Van Gool,\n Luc\n},\n title = {\n Example-Based Facade Texture Synthesis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "452cf6df56", @@ -4427,7 +4575,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Merced", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Jimei and Tsai,\n Yi-Hsuan and Yang,\n Ming-Hsuan\n},\n title = {\n Exemplar Cut\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n 
December\n},\n year = {\n 2013\n} \n}" }, { "id": "d851bc4243", @@ -4458,7 +4607,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2013_ICCV,\n \n author = {\n Zhou,\n Feng and Brandt,\n Jonathan and Lin,\n Zhe\n},\n title = {\n Exemplar-Based Graph Matching for Robust Facial Landmark Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "179a8bcfdf", @@ -4486,10 +4636,11 @@ "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", - "aff_campus_unique_index": "0;0", - "aff_campus_unique": "Singapore", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Yu and Brown,\n Michael S.\n},\n title = {\n Exploiting Reflection Change for Automatic Reflection Removal\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ce1d9e13fa", @@ -4518,7 +4669,8 @@ "aff_unique_url": "https://www.merl.com", "aff_unique_abbr": "MERL", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Agrawal_2013_ICCV,\n \n author = {\n Agrawal,\n Amit\n},\n title = {\n Extrinsic Camera Calibration without a Direct View Using Spherical Mirror\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c622ae2c31", @@ -4542,14 +4694,15 @@ "track": "main", "status": "Poster", 
"aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Information Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2013_ICCV,\n \n author = {\n Lu,\n Chaochao and Zhao,\n Deli and Tang,\n Xiaoou\n},\n title = {\n Face Recognition Using Face Patch Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "72311a3e4c", @@ -4573,14 +4726,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;IBM", - "aff_unique_dep": "Information Engineering Department;IBM T. J. Watson Research Center", + "aff_unique_norm": "The Chinese University of Hong Kong;IBM T. J. 
Watson Research Center", + "aff_unique_dep": "Information Engineering Department;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ibm.com/research/watson", "aff_unique_abbr": "CUHK;IBM Watson", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Hong Kong SAR;Yorktown Heights", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xiong_2013_ICCV,\n \n author = {\n Xiong,\n Yuanjun and Liu,\n Wei and Zhao,\n Deli and Tang,\n Xiaoou\n},\n title = {\n Face Recognition via Archetype Hull Ranking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d48214dedb", @@ -4602,7 +4756,8 @@ "email": ";;;;", "author_num": 5, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Ding_2013_ICCV,\n \n author = {\n Ding,\n Xiaoyu and Chu,\n Wen-Sheng and De La Torre,\n Fernando and Cohn,\n Jeffery F. and Wang,\n Qiao\n},\n title = {\n Facial Action Unit Event Detection by Cascade of Tasks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "caa3bf0861", @@ -4633,7 +4788,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Merced", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Chih-Yuan and Yang,\n Ming-Hsuan\n},\n title = {\n Fast Direct Super-Resolution by Simple Functions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c036ead356", @@ -4642,7 +4798,7 @@ "author": "Kristina Scherbaum; James Petterson; Rogerio S. 
Feris; Volker Blanz; Hans-Peter Seidel", "abstract": "Face detection is an important task in computer vision and often serves as the first step for a variety of applications. State-of-the-art approaches use efficient learning algorithms and train on large amounts of manually labeled imagery. Acquiring appropriate training images, however, is very time-consuming and does not guarantee that the collected training data is representative in terms of data variability. Moreover, available data sets are often acquired under controlled settings, restricting, for example, scene illumination or 3D head pose to a narrow range. This paper takes a look into the automated generation of adaptive training samples from a 3D morphable face model. Using statistical insights, the tailored training data guarantees full data variability and is enriched by arbitrary facial attributes such as age or body weight. Moreover, it can automatically adapt to environmental constraints, such as illumination or viewing angle of recorded video footage from surveillance cameras. We use the tailored imagery to train a new many-core implementation of Viola Jones' AdaBoost object detection framework. The new implementation is not only faster but also enables the use of multiple feature channels such as color features at training time. In our experiments we trained seven view-dependent face detectors and evaluate these on the Face Detection Data Set and Benchmark (FDDB). 
Our experiments show that the use of tailored training imagery outperforms state-of-the-art approaches on this challenging dataset.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Scherbaum_Fast_Face_Detector_2013_ICCV_paper.pdf", - "aff": "Cluster of Excellence MMCI at Saarland University; Commonwealth Bank of Australia; IBM Watson Research Center; Universit\u00e4t Siegen; MPI for Informatics", + "aff": "Cluster of Excellence MMCI at Saarland University; Commonwealth Bank of Australia; IBM Watson Research Center; Universität Siegen; MPI for Informatics", "project": "", "github": "", "supp": "", @@ -4664,7 +4820,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Yorktown Heights", "aff_country_unique_index": "0;1;2;0;0", - "aff_country_unique": "Germany;Australia;United States" + "aff_country_unique": "Germany;Australia;United States", + "bibtex": "@InProceedings{Scherbaum_2013_ICCV,\n \n author = {\n Scherbaum,\n Kristina and Petterson,\n James and Feris,\n Rogerio S. 
and Blanz,\n Volker and Seidel,\n Hans-Peter\n},\n title = {\n Fast Face Detector Training Using Tailored Views\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "061736ca35", @@ -4695,7 +4852,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Barkan_2013_ICCV,\n \n author = {\n Barkan,\n Oren and Weill,\n Jonathan and Wolf,\n Lior and Aronowitz,\n Hagai\n},\n title = {\n Fast High Dimensional Vector Multiplication Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "28aa9e340a", @@ -4719,14 +4877,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0;0;1;1", - "aff_unique_norm": "Peking University;Microsoft", + "aff_unique_norm": "Peking University;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Peking U;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Jing and Wang,\n Jingdong and Zeng,\n Gang and Gan,\n Rui and Li,\n Shipeng and Guo,\n Baining\n},\n title = {\n Fast Neighborhood Graph Search Using Cartesian Concatenation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cab4f489f5", @@ -4757,7 +4916,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Papazoglou_2013_ICCV,\n \n author = {\n Papazoglou,\n Anestis and Ferrari,\n Vittorio\n},\n title = {\n Fast Object Segmentation in Unconstrained Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "97cfa78690", @@ -4788,7 +4948,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Iowa City", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Bao_2013_ICCV,\n \n author = {\n Bao,\n Chenglong and Cai,\n Jian-Feng and Ji,\n Hui\n},\n title = {\n Fast Sparsity-Based Orthogonal Dictionary Learning for Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6b8b18af95", @@ -4819,7 +4980,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Minneapolis;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Xu and Atev,\n Stefan and Wright,\n John and Lerman,\n Gilad\n},\n title = {\n Fast Subspace Search via Grassmannian Based Hashing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "535b6f18d5", @@ -4843,14 +5005,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2;3;1", - "aff_unique_norm": "University of Queensland;Carnegie Mellon University;Nanyang Technological University;University of Trento", + "aff_unique_norm": "The University of Queensland;Carnegie Mellon University;Nanyang Technological University;University 
of Trento", "aff_unique_dep": "ITEE;School of Computer Science;School of Computer Engineering;Department of Information Engineering and Computer Science", "aff_unique_url": "https://www.uq.edu.au;https://www.cmu.edu;https://www.ntu.edu.sg;https://www.unitn.it", "aff_unique_abbr": "UQ;CMU;NTU;UniTN", - "aff_campus_unique_index": "1;2;1", - "aff_campus_unique": ";Pittsburgh;Singapore", + "aff_campus_unique_index": "1;1", + "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;1;2;3;1", - "aff_country_unique": "Australia;United States;Singapore;Italy" + "aff_country_unique": "Australia;United States;Singapore;Italy", + "bibtex": "@InProceedings{Xu_2013_ICCV,\n \n author = {\n Xu,\n Zhongwen and Yang,\n Yi and Tsang,\n Ivor and Sebe,\n Nicu and Hauptmann,\n Alexander G.\n},\n title = {\n Feature Weighting via Optimal Thresholding for Video Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0e1dd01f5a", @@ -4881,7 +5044,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gupta_2013_ICCV,\n \n author = {\n Gupta,\n Mohit and Iso,\n Daisuke and Nayar,\n Shree K.\n},\n title = {\n Fibonacci Exposure Bracketing for High Dynamic Range Imaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "77c09048de", @@ -4912,7 +5076,8 @@ "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Seattle;Ann Arbor;Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2013_ICCV,\n \n author = {\n Sun,\n Min and Huang,\n Wan and Savarese,\n Silvio\n},\n title = {\n Find the Best Path: 
An Efficient and Accurate Classifier for Image Hierarchies\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7090658882", @@ -4921,7 +5086,7 @@ "author": "P. Bojanowski; F. Bach; I. Laptev; J. Ponce; C. Schmid; J. Sivic", "abstract": "We address the problem of learning a joint model of actors and actions in movies using weak supervision provided by scripts. Specifically, we extract actor/action pairs from the script and use them as constraints in a discriminative clustering framework. The corresponding optimization problem is formulated as a quadratic program under linear constraints. People in video are represented by automatically extracted and tracked faces together with corresponding motion features. First, we apply the proposed framework to the task of learning names of characters in the movie and demonstrate significant improvements over previous methods used for this task. Second, we explore the joint actor/action constraint and show its advantage for weakly supervised action learning. 
We validate our method in the challenging setting of localizing and recognizing characters and their actions in feature length movies Casablanca and American Beauty.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Bojanowski_Finding_Actors_and_2013_ICCV_paper.pdf", - "aff": "INRIA; INRIA; INRIA; Ecole Normale Sup\u00e9rieure; INRIA; INRIA", + "aff": "INRIA; INRIA; INRIA; Ecole Normale Supérieure; INRIA; INRIA", "project": "", "github": "", "supp": "", @@ -4936,14 +5101,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "INRIA;Ecole Normale Sup\u00e9rieure", + "aff_unique_norm": "INRIA;Ecole Normale Supérieure", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.ens.fr", "aff_unique_abbr": "INRIA;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Bojanowski_2013_ICCV,\n \n author = {\n Bojanowski,\n P. and Bach,\n F. and Laptev,\n I. and Ponce,\n J. and Schmid,\n C. 
and Sivic,\n J.\n},\n title = {\n Finding Actors and Actions in Movies\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e5e627c322", @@ -4974,7 +5140,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ayazoglu_2013_ICCV,\n \n author = {\n Ayazoglu,\n Mustafa and Yilmaz,\n Burak and Sznaier,\n Mario and Camps,\n Octavia\n},\n title = {\n Finding Causal Interactions in Video Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "63e1bac556", @@ -5005,7 +5172,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Philadelphia", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pang_2013_ICCV,\n \n author = {\n Pang,\n Yu and Ling,\n Haibin\n},\n title = {\n Finding the Best from the Second Bests - Inhibiting Subjective Bias in Evaluation of Visual Tracking Algorithms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cc1d229325", @@ -5036,7 +5204,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Amsterdam", "aff_country_unique_index": "0;1;0+0;0+0;1", - "aff_country_unique": "Netherlands;Belgium" + "aff_country_unique": "Netherlands;Belgium", + "bibtex": "@InProceedings{Gavves_2013_ICCV,\n \n author = {\n Gavves,\n E. and Fernando,\n B. and Snoek,\n C.G.M. and Smeulders,\n A.W.M. 
and Tuytelaars,\n T.\n},\n title = {\n Fine-Grained Categorization by Alignments\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "369c55796e", @@ -5067,7 +5236,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chicago", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kim_2013_ICCV,\n \n author = {\n Kim,\n Taehwan and Shakhnarovich,\n Greg and Livescu,\n Karen\n},\n title = {\n Fingerspelling Recognition with Semi-Markov Conditional Random Fields\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e5256a7bf0", @@ -5098,7 +5268,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2013_ICCV,\n \n author = {\n Xu,\n Chenliang and Whitt,\n Spencer and Corso,\n Jason J.\n},\n title = {\n Flattening Supervoxel Hierarchies by the Uniform Entropy Slice\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "140ac371ac", @@ -5129,7 +5300,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jeon_2013_ICCV,\n \n author = {\n Jeon,\n Hae-Gon and Lee,\n Joon-Young and Han,\n Yudeog and Kim,\n Seon Joo and Kweon,\n In So\n},\n title = {\n Fluttering Pattern Generation Using Modified Legendre Sequence for Coded Exposure Imaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5fc296359f", @@ -5151,7 +5323,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Zheng_2013_ICCV,\n \n author = {\n Zheng,\n Shicheng and Xu,\n Li and Jia,\n Jiaya\n},\n title = {\n Forward Motion Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4bedebe660", @@ -5182,7 +5355,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Philadelphia;Toronto", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Weiyu and Zhu,\n Menglong and Derpanis,\n Konstantinos G.\n},\n title = {\n From Actemes to Action: A Strongly-Supervised Representation for Detailed Action Understanding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4f189723bc", @@ -5213,7 +5387,8 @@ "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Chapel Hill;Stanford;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ordonez_2013_ICCV,\n \n author = {\n Ordonez,\n Vicente and Deng,\n Jia and Choi,\n Yejin and Berg,\n Alexander C. 
and Berg,\n Tamara L.\n},\n title = {\n From Large Scale Image Categorization to Entry-Level Categories\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c45c1cf91f", @@ -5237,14 +5412,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0", - "aff_unique_norm": "Hong Kong Polytechnic University", + "aff_unique_norm": "The Hong Kong Polytechnic University", "aff_unique_dep": "", "aff_unique_url": "https://www.polyu.edu.hk", "aff_unique_abbr": "PolyU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2013_ICCV,\n \n author = {\n Zhu,\n Pengfei and Zhang,\n Lei and Zuo,\n Wangmeng and Zhang,\n David\n},\n title = {\n From Point to Set: Extend the Learning of Distance Metrics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ff2e58b3c9", @@ -5268,14 +5444,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;1", - "aff_unique_norm": "Chinese University of Hong Kong;Queen Mary University of London", + "aff_unique_norm": "The Chinese University of Hong Kong;Queen Mary University of London", "aff_unique_dep": "Dept. 
of Information Engineering;School of EECS", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.qmul.ac.uk", "aff_unique_abbr": "CUHK;QMUL", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Hong Kong SAR;London", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Loy_2013_ICCV,\n \n author = {\n Loy,\n Chen Change and Gong,\n Shaogang and Xiang,\n Tao\n},\n title = {\n From Semi-supervised to Transfer Counting of Crowds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "506eaeb17f", @@ -5297,7 +5474,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Lan_2013_ICCV,\n \n author = {\n Lan,\n Tian and Raptis,\n Michalis and Sigal,\n Leonid and Mori,\n Greg\n},\n title = {\n From Subcategories to Visual Composites: A Multi-level Framework for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "159cead590", @@ -5328,7 +5506,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Karthikeyan_2013_ICCV,\n \n author = {\n Karthikeyan,\n S. 
and Jagadeesh,\n Vignesh and Shenoy,\n Renuka and Ecksteinz,\n Miguel and Manjunath,\n B.S.\n},\n title = {\n From Where and How to What We See\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "84780b80c2", @@ -5359,7 +5538,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Italy;Belgium" + "aff_country_unique": "Italy;Belgium", + "bibtex": "@InProceedings{Tommasi_2013_ICCV,\n \n author = {\n Tommasi,\n Tatiana and Caputo,\n Barbara\n},\n title = {\n Frustratingly Easy NBNN Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9ac5b706d7", @@ -5390,7 +5570,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Madison;Whitewater;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2013_ICCV,\n \n author = {\n Xu,\n Jia and Ithapu,\n Vamsi K. and Mukherjee,\n Lopamudra and Rehg,\n James M. 
and Singh,\n Vikas\n},\n title = {\n GOSUS: Grassmannian Online Subspace Updates with Structured-Sparsity\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "69c5937525", @@ -5421,7 +5602,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Zeng_2013_ICCV,\n \n author = {\n Zeng,\n Wei and Goswami,\n Mayank and Luo,\n Feng and Gu,\n Xianfeng\n},\n title = {\n Geometric Registration Based on Distortion Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "999ff82750", @@ -5430,7 +5612,7 @@ "author": "Pierre Moulon; Pascal Monasse; Renaud Marlet", "abstract": "Multi-view structure from motion (SfM) estimates the position and orientation of pictures in a common 3D coordinate frame. When views are treated incrementally, this external calibration can be subject to drift, contrary to global methods that distribute residual errors evenly. We propose a new global calibration approach based on the fusion of relative motions between image pairs. We improve an existing method for robustly computing global rotations. We present an efficient a contrario trifocal tensor estimation method, from which stable and precise translation directions can be extracted. We also define an efficient translation registration method that recovers accurate camera positions. These components are combined into an original SfM pipeline. Our experiments show that, on most datasets, it outperforms in accuracy other existing incremental and global pipelines. 
It also achieves strikingly good running times: it is about 20 times faster than the other global method we could compare to, and as fast as the best incremental method. More importantly, it features better scalability properties.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Moulon_Global_Fusion_of_2013_ICCV_paper.pdf", - "aff": "Universit\u00e9 Paris-Est, LIGM (UMR CNRS), ENPC, F-77455 Marne-la-Vall\u00e9e+Mikros Image; Universit\u00e9 Paris-Est, LIGM (UMR CNRS), ENPC, F-77455 Marne-la-Vall\u00e9e; Universit\u00e9 Paris-Est, LIGM (UMR CNRS), ENPC, F-77455 Marne-la-Vall\u00e9e", + "aff": "Université Paris-Est, LIGM (UMR CNRS), ENPC, F-77455 Marne-la-Vallée+Mikros Image; Université Paris-Est, LIGM (UMR CNRS), ENPC, F-77455 Marne-la-Vallée; Université Paris-Est, LIGM (UMR CNRS), ENPC, F-77455 Marne-la-Vallée", "project": "", "github": "", "supp": "", @@ -5445,14 +5627,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;0", - "aff_unique_norm": "Universit\u00e9 Paris-Est;Mikros Image", + "aff_unique_norm": "Université Paris-Est;Mikros Image", "aff_unique_dep": "LIGM (UMR CNRS);", "aff_unique_url": "https://www.univ-mlv.fr;", "aff_unique_abbr": "UPE;", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Marne-la-Vall\u00e9e;", + "aff_campus_unique": "Marne-la-Vallée;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France;" + "aff_country_unique": "France;", + "bibtex": "@InProceedings{Moulon_2013_ICCV,\n \n author = {\n Moulon,\n Pierre and Monasse,\n Pascal and Marlet,\n Renaud\n},\n title = {\n Global Fusion of Relative Motions for Robust,\n Accurate and Scalable Structure from Motion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "707ead6e55", @@ -5483,7 +5666,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+1;1;0", - 
"aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Jiaolong and Li,\n Hongdong and Jia,\n Yunde\n},\n title = {\n Go-ICP: Solving 3D Registration Efficiently and Globally Optimally\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ef2dbe0c41", @@ -5505,7 +5689,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Tang_2013_ICCV,\n \n author = {\n Tang,\n Meng and Gorelick,\n Lena and Veksler,\n Olga and Boykov,\n Yuri\n},\n title = {\n GrabCut in One Cut\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0e21b536dd", @@ -5536,7 +5721,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "College Park;;Cambridge", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Daozheng and Batra,\n Dhruv and Freeman,\n William T.\n},\n title = {\n Group Norm for Learning Structured SVMs with Unstructured Latent Variables\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bfdfd053c6", @@ -5560,14 +5746,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Tennessee", + "aff_unique_norm": "The University of Tennessee", "aff_unique_dep": "", "aff_unique_url": "https://www.utk.edu", "aff_unique_abbr": "UT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Knoxville", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": 
"United States", + "bibtex": "@InProceedings{Luo_2013_ICCV,\n \n author = {\n Luo,\n Jiajia and Wang,\n Wei and Qi,\n Hairong\n},\n title = {\n Group Sparsity and Geometry Constrained Dictionary Learning for Action Recognition from Depth Maps\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ad5b4fcc3d", @@ -5598,7 +5785,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Vondrick_2013_ICCV,\n \n author = {\n Vondrick,\n Carl and Khosla,\n Aditya and Malisiewicz,\n Tomasz and Torralba,\n Antonio\n},\n title = {\n HOGgles: Visualizing Object Detection Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ae429cf322", @@ -5620,7 +5808,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Mathias_2013_ICCV,\n \n author = {\n Mathias,\n Markus and Benenson,\n Rodrigo and Timofte,\n Radu and Van Gool,\n Luc\n},\n title = {\n Handling Occlusions with Franken-Classifiers\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "632ac0c3cf", @@ -5651,7 +5840,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Vahdat_2013_ICCV,\n \n author = {\n Vahdat,\n Arash and Mori,\n Greg\n},\n title = {\n Handling Uncertain Tags in Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = 
{\n 2013\n} \n}" }, { "id": "bd4ef1ce42", @@ -5673,7 +5863,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Almazan_2013_ICCV,\n \n author = {\n Almazan,\n Jon and Gordo,\n Albert and Fornes,\n Alicia and Valveny,\n Ernest\n},\n title = {\n Handwritten Word Spotting with Corrected Attributes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "817f56bec8", @@ -5697,14 +5888,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0+1;0;0+1", - "aff_unique_norm": "Istituto Italiano di Tecnologia;Universit\u00e0 di Verona", + "aff_unique_norm": "Istituto Italiano di Tecnologia;Università di Verona", "aff_unique_dep": "Pattern Analysis & Computer Vision;Departimento di Informatica", "aff_unique_url": "https://www.iit.it;https://www.univr.it", "aff_unique_abbr": "IIT;UniVR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0+0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Biagio_2013_ICCV,\n \n author = {\n Biagio,\n Marco San and Crocco,\n Marco and Cristani,\n Marco and Martelli,\n Samuele and Murino,\n Vittorio\n},\n title = {\n Heterogeneous Auto-similarities of Characteristics (HASC): Exploiting Relational Information for Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d89e8063e3", @@ -5735,7 +5927,8 @@ "aff_campus_unique_index": "0;0;1;0+1", "aff_campus_unique": "Arlington;Sydney", "aff_country_unique_index": "0;0;1;0+1", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Cai_2013_ICCV,\n \n author = {\n Cai,\n Xiao and Nie,\n Feiping and Cai,\n Weidong and 
Huang,\n Heng\n},\n title = {\n Heterogeneous Image Features Integration via Multi-modal Semi-supervised Learning Model\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "348f16513e", @@ -5759,14 +5952,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1;2+3;0+2", - "aff_unique_norm": "Chinese Academy of Sciences;Toyota Technological Institute at Chicago;Chinese University of Hong Kong;Huawei", + "aff_unique_norm": "Chinese Academy of Sciences;Toyota Technological Institute at Chicago;The Chinese University of Hong Kong;Huawei Technologies Co. Ltd.", "aff_unique_dep": "Shenzhen Key Lab of Computer Vision and Pattern Recognition;;Dept. of Information Engineering;Media Lab", "aff_unique_url": "http://www.cas.cn;https://www.tti-chicago.org;https://www.cuhk.edu.hk;https://www.huawei.com", "aff_unique_abbr": "CAS;TTI Chicago;CUHK;Huawei", "aff_campus_unique_index": "0;0;1;2;0+2", "aff_campus_unique": "Shenzhen;Chicago;Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0+0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Gong_2013_ICCV,\n \n author = {\n Gong,\n Dihong and Li,\n Zhifeng and Lin,\n Dahua and Liu,\n Jianzhuang and Tang,\n Xiaoou\n},\n title = {\n Hidden Factor Analysis for Age Invariant Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "de360c88b8", @@ -5797,7 +5991,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tian_2013_ICCV,\n \n author = {\n Tian,\n Yuandong and Narasimhan,\n Srinivasa G.\n},\n title = {\n Hierarchical Data-Driven Descent for Efficient Optimal 
Deformation Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2c0d97ea47", @@ -5819,7 +6014,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Lobel_2013_ICCV,\n \n author = {\n Lobel,\n Hans and Vidal,\n Rene and Soto,\n Alvaro\n},\n title = {\n Hierarchical Joint Max-Margin Learning of Mid and Top Level Representations for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "62559c6c4f", @@ -5850,7 +6046,8 @@ "aff_campus_unique_index": "1;2;3;1", "aff_campus_unique": ";Beijing;San Antonio;Hefei", "aff_country_unique_index": "0+0+0;1;0;2;0+0+0", - "aff_country_unique": "China;United States;Singapore" + "aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Xie_2013_ICCV,\n \n author = {\n Xie,\n Lingxi and Tian,\n Qi and Hong,\n Richang and Yan,\n Shuicheng and Zhang,\n Bo\n},\n title = {\n Hierarchical Part Matching for Fine-Grained Visual Categorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "11e090d407", @@ -5881,7 +6078,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Han_2013_ICCV,\n \n author = {\n Han,\n Yudeog and Lee,\n Joon-Young and Kweon,\n In So\n},\n title = {\n High Quality Shape from a Single RGB-D Image under Uncalibrated Natural Illumination\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1f6c4e46dd", 
@@ -5905,14 +6103,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0", - "aff_unique_norm": "Hebrew University", + "aff_unique_norm": "The Hebrew University", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "http://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Arora_2013_ICCV,\n \n author = {\n Arora,\n Chetan and Globerson,\n Amir\n},\n title = {\n Higher Order Matching for Consistent Multiple Target Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "22afb6578d", @@ -5943,7 +6142,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chicago", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2013_ICCV,\n \n author = {\n Lin,\n Dahua and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n Holistic Scene Understanding for 3D Object Detection with RGBD Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cb2b6b2324", @@ -5965,7 +6165,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Berg_2013_ICCV,\n \n author = {\n Berg,\n Thomas and Belhumeur,\n Peter N.\n},\n title = {\n How Do You Tell a Blackbird from a Crow?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "43d50f8975", @@ -5989,14 +6190,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "Carnegie 
Mellon University;University of Queensland;National University of Singapore", + "aff_unique_norm": "Carnegie Mellon University;The University of Queensland;National University of Singapore", "aff_unique_dep": "School of Computer Science;ITEE;Electrical and Computer Engineering", "aff_unique_url": "https://www.cmu.edu;https://www.uq.edu.au;https://www.nus.edu.sg", "aff_unique_abbr": "CMU;UQ;NUS", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "United States;Australia;Singapore" + "aff_country_unique": "United States;Australia;Singapore", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Yi and Ma,\n Zhigang and Xu,\n Zhongwen and Yan,\n Shuicheng and Hauptmann,\n Alexander G.\n},\n title = {\n How Related Exemplars Help Complex Event Detection in Web Videos?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6f99fd4aae", @@ -6027,7 +6229,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Joo_2013_ICCV,\n \n author = {\n Joo,\n Jungseock and Wang,\n Shuo and Zhu,\n Song-Chun\n},\n title = {\n Human Attribute Recognition by Rich Appearance Dictionary\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "08e1e4a2bd", @@ -6051,14 +6254,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Sun Yat-sen University;University of California, Los Angeles", + "aff_unique_norm": "Sun Yat-Sen University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": 
"http://www.sysu.edu.cn;https://www.ucla.edu", "aff_unique_abbr": "SYSU;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2013_ICCV,\n \n author = {\n Xu,\n Yuanlu and Lin,\n Liang and Zheng,\n Wei-Shi and Liu,\n Xiaobai\n},\n title = {\n Human Re-identification by Matching Compositional Template with Cluster Sampling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7e20d26571", @@ -6082,14 +6286,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;Chinese Academy of Sciences", "aff_unique_dep": "Department of Information Engineering;Shenzhen Institutes of Advanced Technology", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.cas.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0;0+1;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2013_ICCV,\n \n author = {\n Sun,\n Yi and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n Hybrid Deep Learning for Face Verification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8b7d7172d0", @@ -6111,7 +6316,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Prinet_2013_ICCV,\n \n author = {\n Prinet,\n Veronique and Lischinski,\n Dani and Werman,\n Michael\n},\n title = {\n Illuminant Chromaticity from Image 
Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "02c50dd2e2", @@ -6142,7 +6348,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Fan and Huang,\n Qixing and Guibas,\n Leonidas J.\n},\n title = {\n Image Co-segmentation via Consistent Functional Maps\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9bdcd34357", @@ -6173,7 +6380,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Ferstl_2013_ICCV,\n \n author = {\n Ferstl,\n David and Reinbacher,\n Christian and Ranftl,\n Rene and Ruether,\n Matthias and Bischof,\n Horst\n},\n title = {\n Image Guided Depth Upsampling Using Anisotropic Total Generalized Variation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3b530d83c0", @@ -6195,7 +6403,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Mishra_2013_ICCV,\n \n author = {\n Mishra,\n Anand and Alahari,\n Karteek and Jawahar,\n C.V.\n},\n title = {\n Image Retrieval Using Textual Cues\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "829e1c0a7f", @@ -6226,7 +6435,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Salt Lake City", "aff_country_unique_index": 
"0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Seyedhosseini_2013_ICCV,\n \n author = {\n Seyedhosseini,\n Mojtaba and Sajjadi,\n Mehdi and Tasdizen,\n Tolga\n},\n title = {\n Image Segmentation with Cascaded Hierarchical Models and Logistic Disjunctive Normal Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "341393bf5c", @@ -6250,14 +6460,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Advanced Digital Sciences Center;Nanyang Technological University;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Advanced Digital Sciences Center;Nanyang Technological University;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";School of Electrical and Electronic Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": ";https://www.ntu.edu.sg;https://illinois.edu", "aff_unique_abbr": ";NTU;UIUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Singapore;Urbana-Champaign", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Lu_2013_ICCV,\n \n author = {\n Lu,\n Jiwen and Wang,\n Gang and Moulin,\n Pierre\n},\n title = {\n Image Set Classification Using Holistic Multiple Order Statistics Features and Localized Multi-kernel Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7006e25711", @@ -6288,7 +6499,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Parikh_2013_ICCV,\n \n author 
= {\n Parikh,\n Devi and Grauman,\n Kristen\n},\n title = {\n Implied Feedback: Learning Nuances of User Behavior in Image Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6d2aac918b", @@ -6319,7 +6531,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Wollongong;Canberra", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Chao and Wang,\n Lei and Liu,\n Lingqiao\n},\n title = {\n Improving Graph Matching via Density Maximization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d923f24142", @@ -6341,7 +6554,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Peng_2013_ICCV,\n \n author = {\n Peng,\n Kuan-Chuan and Chen,\n Tsuhan\n},\n title = {\n Incorporating Cloud Distribution in Sky Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3465bfa9fd", @@ -6372,7 +6586,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Los Angeles;Corvallis", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xie_2013_ICCV,\n \n author = {\n Xie,\n Dan and Todorovic,\n Sinisa and Zhu,\n Song-Chun\n},\n title = {\n Inferring \"Dark Matter\" and \"Dark Energy\" from Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "908cba3775", @@ -6403,7 +6618,8 @@ "aff_campus_unique_index": "0;0;0;1;0", 
"aff_campus_unique": "Seoul;London", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "South Korea;United Kingdom" + "aff_country_unique": "South Korea;United Kingdom", + "bibtex": "@InProceedings{Yi_2013_ICCV,\n \n author = {\n Yi,\n Kwang Moo and Jeong,\n Hawook and Heo,\n Byeongho and Chang,\n Hyung Jin and Choi,\n Jin Young\n},\n title = {\n Initialization-Insensitive Visual Tracking through Voting with Salient Local Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "abff1a1245", @@ -6434,7 +6650,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Sridhar_2013_ICCV,\n \n author = {\n Sridhar,\n Srinath and Oulasvirta,\n Antti and Theobalt,\n Christian\n},\n title = {\n Interactive Markerless Articulated Hand Motion Tracking Using RGB and Depth Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "533646d528", @@ -6463,7 +6680,8 @@ "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kemelmacher-Shlizerman_2013_ICCV,\n \n author = {\n Kemelmacher-Shlizerman,\n Ira\n},\n title = {\n Internet Based Morphable Model\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0973fe07a2", @@ -6487,14 +6705,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": 
"Department of Electronic Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ouyang_2013_ICCV,\n \n author = {\n Ouyang,\n Wanli and Wang,\n Xiaogang\n},\n title = {\n Joint Deep Learning for Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dbdae7d9e7", @@ -6516,7 +6735,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Xia_2013_ICCV,\n \n author = {\n Xia,\n Yan and He,\n Kaiming and Wen,\n Fang and Sun,\n Jian\n},\n title = {\n Joint Inverted Indexing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4af34ab686", @@ -6547,7 +6767,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Graz", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Kostinger_2013_ICCV,\n \n author = {\n Kostinger,\n Martin and Wohlhart,\n Paul and Roth,\n Peter M. 
and Bischof,\n Horst\n},\n title = {\n Joint Learning of Discriminative Prototypes and Large Margin Nearest Neighbor Classifiers\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "662877a6ad", @@ -6578,14 +6799,15 @@ "aff_campus_unique_index": "0+1;0;0;0;0", "aff_campus_unique": "Mountain View;Cambridge", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shih_2013_ICCV,\n \n author = {\n Shih,\n Yichang and Kwatra,\n Vivek and Chinen,\n Troy and Fang,\n Hui and Ioffe,\n Sergey\n},\n title = {\n Joint Noise Level Estimation from Personal Photo Collections\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "456753f769", "title": "Joint Optimization for Consistent Multiple Graph Matching", "site": "http://openaccess.thecvf.com/content_iccv_2013/html/Yan_Joint_Optimization_for_2013_ICCV_paper.html", "author": "Junchi Yan; Yu Tian; Hongyuan Zha; Xiaokang Yang; Ya Zhang; Stephen M. Chu", - "abstract": "The problem of graph matching in general is NP-hard and approaches have been proposed for its suboptimal solution, most focusing on finding the one-to-one node mapping between two graphs. A more general and challenging problem arises when one aims to find consistent mappings across a number of graphs more than two. Conventional graph pair matching methods often result in mapping inconsistency since the mapping between two graphs can either be determined by pair mapping or by an additional anchor graph. To address this issue, a novel formulation is derived which is maximized via alternating optimization. 
Our method enjoys several advantages: 1) the mappings are jointly optimized rather than sequentially performed by applying pair matching, allowing the global affinity information across graphs can be propagated and explored; 2) the number of concerned variables to optimize is in linear with the number of graphs, being superior to local pair matching resulting in O(n 2 ) variables; 3) the mapping consistency constraints are analytically satisfied during optimization; and 4) off-the-shelf graph pair matching solvers can be reused under the proposed framework in an \u00e2\u0080\u0098out-of-thebox\u00e2\u0080\u0099 fashion. Competitive results on both the synthesized data and the real data are reported, by varying the level of deformation, outliers and edge densities.", + "abstract": "The problem of graph matching in general is NP-hard and approaches have been proposed for its suboptimal solution, most focusing on finding the one-to-one node mapping between two graphs. A more general and challenging problem arises when one aims to find consistent mappings across a number of graphs more than two. Conventional graph pair matching methods often result in mapping inconsistency since the mapping between two graphs can either be determined by pair mapping or by an additional anchor graph. To address this issue, a novel formulation is derived which is maximized via alternating optimization. Our method enjoys several advantages: 1) the mappings are jointly optimized rather than sequentially performed by applying pair matching, allowing the global affinity information across graphs can be propagated and explored; 2) the number of concerned variables to optimize is in linear with the number of graphs, being superior to local pair matching resulting in O(n 2 ) variables; 3) the mapping consistency constraints are analytically satisfied during optimization; and 4) off-the-shelf graph pair matching solvers can be reused under the proposed framework in an ‘out-of-thebox’ fashion. 
Competitive results on both the synthesized data and the real data are reported, by varying the level of deformation, outliers and edge densities.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Yan_Joint_Optimization_for_2013_ICCV_paper.pdf", "aff": "Shanghai Jiao Tong University+IBM Research - China; Shanghai Jiao Tong University; Georgia Institute of Technology; Shanghai Jiao Tong University; Shanghai Jiao Tong University; IBM T.J. Waston Research Center", "project": "", @@ -6601,15 +6823,16 @@ "author_num": 6, "track": "main", "status": "Poster", - "aff_unique_index": "0+1;0;2;0;0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;IBM;Georgia Institute of Technology", - "aff_unique_dep": ";Research;", - "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ibm.com/research;https://www.gatech.edu", - "aff_unique_abbr": "SJTU;IBM;Georgia Tech", + "aff_unique_index": "0+1;0;2;0;0;3", + "aff_unique_norm": "Shanghai Jiao Tong University;IBM Research;Georgia Institute of Technology;IBM", + "aff_unique_dep": ";Research;;T.J. 
Watson Research Center", + "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ibm.com/research;https://www.gatech.edu;https://www.ibm.com/research/watson", + "aff_unique_abbr": "SJTU;IBM;Georgia Tech;IBM Watson", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Yorktown Heights", "aff_country_unique_index": "0+0;0;1;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yan_2013_ICCV,\n \n author = {\n Yan,\n Junchi and Tian,\n Yu and Zha,\n Hongyuan and Yang,\n Xiaokang and Zhang,\n Ya and Chu,\n Stephen M.\n},\n title = {\n Joint Optimization for Consistent Multiple Graph Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7299bd3f23", @@ -6633,14 +6856,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;1;1;1", - "aff_unique_norm": "Samsung;POSTECH", + "aff_unique_norm": "Samsung Electronics;POSTECH", "aff_unique_dep": "DMC R&D Center;Department of Computer Science and Engineering", "aff_unique_url": "https://www.samsung.com;https://www.postech.ac.kr", "aff_unique_abbr": "Samsung;POSTECH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lim_2013_ICCV,\n \n author = {\n Lim,\n Taegyu and Hong,\n Seunghoon and Han,\n Bohyung and Han,\n Joon Hee\n},\n title = {\n Joint Segmentation and Pose Tracking of Human in Natural Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "04f9eddb5c", @@ -6666,12 +6890,13 @@ "aff_unique_index": "0;1+0;2", "aff_unique_norm": "Portland State University;Fuzhou University;Adobe", "aff_unique_dep": ";;Adobe Research", - "aff_unique_url": 
"https://www.pdx.edu;https://www.fznu.edu.cn;https://research.adobe.com", + "aff_unique_url": "https://www.pdx.edu;https://www.fzu.edu.cn;https://research.adobe.com", "aff_unique_abbr": "PSU;FZU;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Liu_2013_ICCV,\n \n author = {\n Liu,\n Feng and Niu,\n Yuzhen and Jin,\n Hailin\n},\n title = {\n Joint Subspace Stabilization for Stereoscopic Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "fa5f7b0daf", @@ -6702,7 +6927,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Feng_2013_ICCV,\n \n author = {\n Feng,\n Zheyun and Jin,\n Rong and Jain,\n Anil\n},\n title = {\n Large-Scale Image Annotation by Efficient and Robust Kernel Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9637288791", @@ -6724,7 +6950,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Steinbrucker_2013_ICCV,\n \n author = {\n Steinbrucker,\n Frank and Kerl,\n Christian and Cremers,\n Daniel\n},\n title = {\n Large-Scale Multi-resolution Surface Reconstruction from RGB-D Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "17f93ed13d", @@ -6748,14 +6975,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Columbia 
University;IBM", - "aff_unique_dep": "Dept. of Electrical Engineering;IBM T. J. Watson Research Center", + "aff_unique_norm": "Columbia University;IBM T. J. Watson Research Center", + "aff_unique_dep": "Dept. of Electrical Engineering;", "aff_unique_url": "https://www.columbia.edu;https://www.ibm.com/research/watson", "aff_unique_abbr": "Columbia;IBM Watson", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "New York;Yorktown Heights", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ye_2013_ICCV,\n \n author = {\n Ye,\n Guangnan and Liu,\n Dong and Wang,\n Jun and Chang,\n Shih-Fu\n},\n title = {\n Large-Scale Video Hashing via Structure Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f36d6d99c4", @@ -6786,7 +7014,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;1", - "aff_country_unique": "United Kingdom;Australia" + "aff_country_unique": "United Kingdom;Australia", + "bibtex": "@InProceedings{Segal_2013_ICCV,\n \n author = {\n Segal,\n Aleksandr V. 
and Reid,\n Ian\n},\n title = {\n Latent Data Association: Bayesian Model Selection for Multi-target Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b8590fea70", @@ -6817,7 +7046,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mahasseni_2013_ICCV,\n \n author = {\n Mahasseni,\n Behrooz and Todorovic,\n Sinisa\n},\n title = {\n Latent Multitask Learning for View-Invariant Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "903497be30", @@ -6848,7 +7078,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "College Park;Baltimore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Patel_2013_ICCV,\n \n author = {\n Patel,\n Vishal M. 
and Van Nguyen,\n Hien and Vidal,\n Rene\n},\n title = {\n Latent Space Sparse Subspace Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ef595edb0f", @@ -6879,7 +7110,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jia_2013_ICCV,\n \n author = {\n Jia,\n Yangqing and Darrell,\n Trevor\n},\n title = {\n Latent Task Adaptation with Large-Scale Hierarchies\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "271eac55f8", @@ -6888,7 +7120,7 @@ "author": "Honghui Zhang; Jingdong Wang; Ping Tan; Jinglu Wang; Long Quan", "abstract": "We propose an adaptive subgradient descent method to efficiently learn the parameters of CRF models for image parsing. To balance the learning efficiency and performance of the learned CRF models, the parameter learning is iteratively carried out by solving a convex optimization problem in each iteration, which integrates a proximal term to preserve the previously learned information and the large margin preference to distinguish bad labeling and the ground truth labeling. A solution of subgradient descent updating form is derived for the convex optimization problem, with an adaptively determined updating step-size. Besides, to deal with partially labeled training data, we propose a new objective constraint modeling both the labeled and unlabeled parts in the partially labeled training data for the parameter learning of CRF models. The superior learning efficiency of the proposed method is verified by the experiment results on two public datasets. 
We also demonstrate the powerfulness of our method for handling partially labeled training data.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zhang_Learning_CRFs_for_2013_ICCV_paper.pdf", - "aff": "The Hong Kong University of Science and Technology\u2217; Microsoft Research\u2020; National University of Singapore\u2021; The Hong Kong University of Science and Technology\u2217; The Hong Kong University of Science and Technology\u2217", + "aff": "The Hong Kong University of Science and Technology∗; Microsoft Research†; National University of Singapore‡; The Hong Kong University of Science and Technology∗; The Hong Kong University of Science and Technology∗", "project": "", "github": "", "supp": "", @@ -6903,14 +7135,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2;0;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft;National University of Singapore", - "aff_unique_dep": ";Microsoft Research;", + "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft Research;National University of Singapore", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.ust.hk;https://www.microsoft.com/en-us/research;https://www.nus.edu.sg", "aff_unique_abbr": "HKUST;MSR;NUS", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;2;0;0", - "aff_country_unique": "China;United States;Singapore" + "aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Honghui and Wang,\n Jingdong and Tan,\n Ping and Wang,\n Jinglu and Quan,\n Long\n},\n title = {\n Learning CRFs for Image Parsing with Adaptive Subgradient Descent\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "85e6d664a3", @@ -6932,7 +7165,8 @@ "email": ";;;;", "author_num": 5, "track": "main", - "status": 
"Poster" + "status": "Poster", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Kaiye and He,\n Ran and Wang,\n Wei and Wang,\n Liang and Tan,\n Tieniu\n},\n title = {\n Learning Coupled Feature Spaces for Cross-Modal Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0ad4a96f5a", @@ -6941,7 +7175,7 @@ "author": "Jian Sun; Jean Ponce", "abstract": "In this paper, we address the problem of learning discriminative part detectors from image sets with category labels. We propose a novel latent SVM model regularized by group sparsity to learn these part detectors. Starting from a large set of initial parts, the group sparsity regularizer forces the model to jointly select and optimize a set of discriminative part detectors in a max-margin framework. We propose a stochastic version of a proximal algorithm to solve the corresponding optimization problem. 
We apply the proposed method to image classification and cosegmentation, and quantitative experiments with standard benchmarks show that it matches or improves upon the state of the art.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Sun_Learning_Discriminative_Part_2013_ICCV_paper.pdf", - "aff": "Xi'an Jiaotong University, INRIA; Ecole Normale Sup\u00e9rieure, WILLOW project-team, D\u00b4epartement d\u2019Informatique de l\u2019Ecole Normale Sup\u00e9rieure, ENS/INRIA/CNRS UMR 8548", + "aff": "Xi'an Jiaotong University, INRIA; Ecole Normale Supérieure, WILLOW project-team, D´epartement d’Informatique de l’Ecole Normale Supérieure, ENS/INRIA/CNRS UMR 8548", "project": "", "github": "", "supp": "", @@ -6956,14 +7190,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Ecole Normale Sup\u00e9rieure", - "aff_unique_dep": ";D\u00b4epartement d\u2019Informatique", + "aff_unique_norm": "Xi'an Jiaotong University;Ecole Normale Supérieure", + "aff_unique_dep": ";D´epartement d’Informatique", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.ens.fr", "aff_unique_abbr": "XJTU;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Sun_2013_ICCV,\n \n author = {\n Sun,\n Jian and Ponce,\n Jean\n},\n title = {\n Learning Discriminative Part Detectors for Image Classification and Cosegmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bf3d6581a5", @@ -6972,7 +7207,7 @@ "author": "Quanshi Zhang; Xuan Song; Xiaowei Shao; Huijing Zhao; Ryosuke Shibasaki", "abstract": "Although graph matching is a fundamental problem in pattern recognition, and has drawn broad interest from many fields, the problem of learning graph matching has 
not received much attention. In this paper, we redefine the learning of graph matching as a model learning problem. In addition to conventional training of matching parameters, our approach modifies the graph structure and attributes to generate a graphical model. In this way, the model learning is oriented toward both matching and recognition performance, and can proceed in an unsupervised gnfashion. Experiments demonstrate that our approach outperforms conventional methods for learning graph matching.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zhang_Learning_Graph_Matching_2013_ICCV_paper.pdf", - "aff": "Center for Spatial Information Science, University of Tokyo\u2020; Center for Spatial Information Science, University of Tokyo\u2020; Center for Spatial Information Science, University of Tokyo\u2020; Key Laboratory of Machine Perception (MoE), Peking University\u2021; Center for Spatial Information Science, University of Tokyo\u2020", + "aff": "Center for Spatial Information Science, University of Tokyo†; Center for Spatial Information Science, University of Tokyo†; Center for Spatial Information Science, University of Tokyo†; Key Laboratory of Machine Perception (MoE), Peking University‡; Center for Spatial Information Science, University of Tokyo†", "project": "", "github": "", "supp": "", @@ -6994,7 +7229,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "Japan;China" + "aff_country_unique": "Japan;China", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Quanshi and Song,\n Xuan and Shao,\n Xiaowei and Zhao,\n Huijing and Shibasaki,\n Ryosuke\n},\n title = {\n Learning Graph Matching: Oriented to Category Modeling from Cluttered Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "76b52aa0f8", @@ -7003,7 +7239,7 @@ 
"author": "Minsu Cho; Karteek Alahari; Jean Ponce", "abstract": "Many tasks in computer vision are formulated as graph matching problems. Despite the NP-hard nature of the problem, fast and accurate approximations have led to significant progress in a wide range of applications. Learning graph models from observed data, however, still remains a challenging issue. This paper presents an effective scheme to parameterize a graph model, and learn its structural attributes for visual object matching. For this, we propose a graph representation with histogram-based attributes, and optimize them to increase the matching accuracy. Experimental evaluations on synthetic and real image datasets demonstrate the effectiveness of our approach, and show significant improvement in matching accuracy over graphs with pre-defined structures.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Cho_Learning_Graphs_to_2013_ICCV_paper.pdf", - "aff": "Inria; Inria; \u00c9cole Normale Sup\u00e9rieure", + "aff": "Inria; Inria; École Normale Supérieure", "project": "", "github": "", "supp": "", @@ -7018,14 +7254,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1", - "aff_unique_norm": "INRIA;\u00c9cole Normale Sup\u00e9rieure", + "aff_unique_norm": "Inria;École Normale Supérieure", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.ens.fr", "aff_unique_abbr": "Inria;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Cho_2013_ICCV,\n \n author = {\n Cho,\n Minsu and Alahari,\n Karteek and Ponce,\n Jean\n},\n title = {\n Learning Graphs to Match\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5dc009fe15", @@ -7049,14 +7286,15 @@ "track": "main", "status": "Poster", 
"aff_unique_index": "0;0;1;2", - "aff_unique_norm": "IBM;Georgia Institute of Technology;Fudan University", + "aff_unique_norm": "IBM T. J. Watson Research Center;Georgia Institute of Technology;Fudan University", "aff_unique_dep": "Business Analytics and Mathematical Sciences;School of Industrial and Systems Engineering;School of Computer Science", "aff_unique_url": "https://www.ibm.com/research/watson;https://www.gatech.edu;https://www.fudan.edu.cn", "aff_unique_abbr": "IBM Watson;Georgia Tech;Fudan", "aff_campus_unique_index": "1", "aff_campus_unique": ";Atlanta", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Jun and Liu,\n Wei and Sun,\n Andy X. and Jiang,\n Yu-Gang\n},\n title = {\n Learning Hash Codes with Listwise Supervision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5904a81c21", @@ -7087,7 +7325,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Jiang and Wu,\n Ying\n},\n title = {\n Learning Maximum Margin Temporal Warping for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e211fde1e0", @@ -7118,7 +7357,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2013_ICCV,\n \n author = {\n Wu,\n Tianfu and Zhu,\n Song-Chun\n},\n title = {\n Learning Near-Optimal Cost-Sensitive Decision Policy 
for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a87cbc7475", @@ -7140,7 +7380,8 @@ "email": ";;;;;", "author_num": 6, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Tang_2013_ICCV,\n \n author = {\n Tang,\n Siyu and Andriluka,\n Mykhaylo and Milan,\n Anton and Schindler,\n Konrad and Roth,\n Stefan and Schiele,\n Bernt\n},\n title = {\n Learning People Detectors for Tracking in Crowded Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "43ffeb968c", @@ -7171,7 +7412,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0;0;0+1", - "aff_country_unique": "United Kingdom;Netherlands" + "aff_country_unique": "United Kingdom;Netherlands", + "bibtex": "@InProceedings{Zafeiriou_2013_ICCV,\n \n author = {\n Zafeiriou,\n Lazaros and Nicolaou,\n Mihalis A. and Zafeiriou,\n Stefanos and Nikitidis,\n Symeon and Pantic,\n Maja\n},\n title = {\n Learning Slow Features for Behaviour Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3b95f95159", @@ -7180,7 +7422,7 @@ "author": "Jingjing Zheng; Zhuolin Jiang", "abstract": "We present an approach to jointly learn a set of viewspecific dictionaries and a common dictionary for crossview action recognition. The set of view-specific dictionaries is learned for specific views while the common dictionary is shared across different views. Our approach represents videos in each view using both the corresponding view-specific dictionary and the common dictionary. 
More importantly, it encourages the set of videos taken from different views of the same action to have similar sparse representations. In this way, we can align view-specific features in the sparse feature spaces spanned by the viewspecific dictionary set and transfer the view-shared features in the sparse feature space spanned by the common dictionary. Meanwhile, the incoherence between the common dictionary and the view-specific dictionary set enables us to exploit the discrimination information encoded in viewspecific features and view-shared features separately. In addition, the learned common dictionary not only has the capability to represent actions from unseen views, but also makes our approach effective in a semi-supervised setting where no correspondence videos exist and only a few labels exist in the target view. Extensive experiments using the multi-view IXMAS dataset demonstrate that our approach outperforms many recent approaches for cross-view action recognition.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zheng_Learning_View-Invariant_Sparse_2013_ICCV_paper.pdf", - "aff": "University of Maryland, College Park, MD, USA; Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "University of Maryland, College Park, MD, USA; Noah’s Ark Lab, Huawei Technologies", "project": "", "github": "", "supp": "", @@ -7195,14 +7437,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "University of Maryland;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_norm": "University of Maryland;Huawei Technologies", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www/umd.edu;https://www.huawei.com", "aff_unique_abbr": "UMD;Huawei", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zheng_2013_ICCV,\n \n author = 
{\n Zheng,\n Jingjing and Jiang,\n Zhuolin\n},\n title = {\n Learning View-Invariant Sparse Representations for Cross-View Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7b18d3a516", @@ -7233,7 +7476,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Liang-Chieh and Papandreou,\n George and Yuille,\n Alan L.\n},\n title = {\n Learning a Dictionary of Shape Epitomes with Applications to Image Labeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5485a10c6b", @@ -7257,14 +7501,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Microsoft;Virginia Tech", + "aff_unique_norm": "Microsoft Corporation;Virginia Tech", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.vt.edu", "aff_unique_abbr": "MSR;VT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zitnick_2013_ICCV,\n \n author = {\n Zitnick,\n C. L. 
and Parikh,\n Devi and Vanderwende,\n Lucy\n},\n title = {\n Learning the Visual Interpretation of Sentences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "89e4795b22", @@ -7286,7 +7531,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Yin and Fathi,\n Alireza and Rehg,\n James M.\n},\n title = {\n Learning to Predict Gaze in Egocentric Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bd7e876878", @@ -7317,7 +7563,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Austria;United Kingdom" + "aff_country_unique": "Austria;United Kingdom", + "bibtex": "@InProceedings{Sharmanska_2013_ICCV,\n \n author = {\n Sharmanska,\n Viktoriia and Quadrianto,\n Novi and Lampert,\n Christoph H.\n},\n title = {\n Learning to Rank Using Privileged Information\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5c1a73ba69", @@ -7348,7 +7595,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;1;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Zhou_2013_ICCV,\n \n author = {\n Zhou,\n Qiang and Wang,\n Gang and Jia,\n Kui and Zhao,\n Qi\n},\n title = {\n Learning to Share Latent Tasks for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0181dd717f", @@ -7379,7 +7627,8 @@ 
"aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ramalingam_2013_ICCV,\n \n author = {\n Ramalingam,\n Srikumar and Brand,\n Matthew\n},\n title = {\n Lifting 3D Manhattan Lines from a Single Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9dbb351b34", @@ -7388,7 +7637,7 @@ "author": "Hamdi Dibeklioglu; Albert Ali Salah; Theo Gevers", "abstract": "Kinship verification from facial appearance is a difficult problem. This paper explores the possibility of employing facial expression dynamics in this problem. By using features that describe facial dynamics and spatio-temporal appearance over smile expressions, we show that it is possible to improve the state of the art in this problem, and verify that it is indeed possible to recognize kinship by resemblance of facial expressions. The proposed method is tested on different kin relationships. 
On the average, 72.89% verification accuracy is achieved on spontaneous smiles.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Dibeklioglu_Like_Father_Like_2013_ICCV_paper.pdf", - "aff": "Intelligent Systems Lab Amsterdam, University of Amsterdam, Amsterdam, The Netherlands + Pattern Recognition & Bioinformatics Group, Delft University of Technology, Delft, The Netherlands; Department of Computer Engineering, Bo \u02d8gazic\u00b8i University, Istanbul, Turkey; Intelligent Systems Lab Amsterdam, University of Amsterdam, Amsterdam, The Netherlands", + "aff": "Intelligent Systems Lab Amsterdam, University of Amsterdam, Amsterdam, The Netherlands + Pattern Recognition & Bioinformatics Group, Delft University of Technology, Delft, The Netherlands; Department of Computer Engineering, Bo ˘gazic¸i University, Istanbul, Turkey; Intelligent Systems Lab Amsterdam, University of Amsterdam, Amsterdam, The Netherlands", "project": "", "github": "", "supp": "", @@ -7410,7 +7659,8 @@ "aff_campus_unique_index": "0+1;2;0", "aff_campus_unique": "Amsterdam;Delft;Istanbul", "aff_country_unique_index": "0+0;1;0", - "aff_country_unique": "Netherlands;T\u00fcrkiye" + "aff_country_unique": "Netherlands;Turkey", + "bibtex": "@InProceedings{Dibeklioglu_2013_ICCV,\n \n author = {\n Dibeklioglu,\n Hamdi and Salah,\n Albert Ali and Gevers,\n Theo\n},\n title = {\n Like Father,\n Like Son: Facial Expression Dynamics for Kinship Verification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3a08db68e3", @@ -7441,7 +7691,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yu_2013_ICCV,\n \n author = {\n Yu,\n Zhan and Guo,\n Xinqing and Lin,\n Haibing and Lumsdaine,\n Andrew and Yu,\n Jingyi\n},\n title = {\n 
Line Assisted Light Field Triangulation and Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5c3973765b", @@ -7472,7 +7723,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Su_2013_ICCV,\n \n author = {\n Su,\n Bing and Ding,\n Xiaoqing\n},\n title = {\n Linear Sequence Discriminant Analysis: A Model-Based Dimensionality Reduction Method for Vector Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2483ba1df6", @@ -7494,7 +7746,8 @@ "email": ";;;;;", "author_num": 6, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Tanskanen_2013_ICCV,\n \n author = {\n Tanskanen,\n Petri and Kolev,\n Kalin and Meier,\n Lorenz and Camposeco,\n Federico and Saurer,\n Olivier and Pollefeys,\n Marc\n},\n title = {\n Live Metric 3D Reconstruction on Mobile Phones\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1fe1fa3f5b", @@ -7525,7 +7778,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Bradley_2013_ICCV,\n \n author = {\n Bradley,\n Derek and Beeler,\n Thabo\n},\n title = {\n Local Signal Equalization for Correspondence Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "60b1ee1434", @@ -7556,7 +7810,8 @@ "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;1+0", - "aff_country_unique": "Romania;Sweden" + "aff_country_unique": "Romania;Sweden", + "bibtex": "@InProceedings{Leordeanu_2013_ICCV,\n \n author = {\n Leordeanu,\n Marius and Zanfir,\n Andrei and Sminchisescu,\n Cristian\n},\n title = {\n Locally Affine Sparse-to-Dense Matching for Motion and Occlusion Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "23daf7b555", @@ -7580,14 +7835,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2+3;3", - "aff_unique_norm": "Dalian University of Technology;Heilongjiang University;Harbin Institute of Technology;Hong Kong Polytechnic University", + "aff_unique_norm": "Dalian University of Technology;Heilongjiang University;Harbin Institute of Technology;The Hong Kong Polytechnic University", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.dlut.edu.cn/;http://www.hljnu.edu.cn;http://www.hit.edu.cn/;https://www.polyu.edu.hk", "aff_unique_abbr": "DUT;HGHU;HIT;PolyU", "aff_campus_unique_index": "1+2;2", "aff_campus_unique": ";Harbin;Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Peihua and Wang,\n Qilong and Zuo,\n Wangmeng and Zhang,\n Lei\n},\n title = {\n Log-Euclidean Kernels for Sparse Representation and Dictionary Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e1623b18fa", @@ -7611,14 +7867,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;2+1;3;4+1;1", - "aff_unique_norm": "University of Illinois at Singapore;University of Illinois Urbana-Champaign;King Abdullah University of Science and Technology;National University of 
Singapore;Chinese Academy of Sciences", + "aff_unique_norm": "University of Illinois at Singapore;University of Illinois at Urbana-Champaign;King Abdullah University of Science and Technology;National University of Singapore;Chinese Academy of Sciences", "aff_unique_dep": "Advanced Digital Sciences Center;;;;Institute of Automation", "aff_unique_url": "https://www.illinois.edu;https://illinois.edu;https://www.kast.kau.edu.sa;https://www.nus.edu.sg;http://www.ia.cas.cn", "aff_unique_abbr": "UIUC-Singapore;UIUC;KAUST;NUS;CAS", "aff_campus_unique_index": "0+1;1;1;1", "aff_campus_unique": "Singapore;Urbana;", "aff_country_unique_index": "0+1;2+1;0;3+1;1", - "aff_country_unique": "Singapore;United States;Saudi Arabia;China" + "aff_country_unique": "Singapore;United States;Saudi Arabia;China", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Tianzhu and Ghanem,\n Bernard and Liu,\n Si and Xu,\n Changsheng and Ahuja,\n Narendra\n},\n title = {\n Low-Rank Sparse Coding for Image Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "351dae1b2a", @@ -7649,7 +7906,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Atlanta", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2013_ICCV,\n \n author = {\n Xu,\n Hongteng and Zha,\n Hongyuan\n},\n title = {\n Manifold Based Face Synthesis from Sparse Samples\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "38c47edea1", @@ -7673,14 +7931,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "Advanced Digital Sciences Center;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Advanced Digital Sciences Center;University 
of Illinois at Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": ";https://illinois.edu", "aff_unique_abbr": ";UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana", "aff_country_unique_index": "0;1", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Ni_2013_ICCV,\n \n author = {\n Ni,\n Bingbing and Moulin,\n Pierre\n},\n title = {\n Manipulation Pattern Discovery: A Nonparametric Bayesian Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1a00d09536", @@ -7704,14 +7963,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;1", - "aff_unique_norm": "Samsung;Korea Advanced Institute of Science and Technology", + "aff_unique_norm": "Samsung Advanced Institute of Technology;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Advanced Media Lab.;Dept. 
of EE", "aff_unique_url": "https://www.sait.samsung.com;https://www.kaist.ac.kr", "aff_unique_abbr": "SAIT;KAIST", "aff_campus_unique_index": "0+1;0;1", "aff_campus_unique": "Gyeonggi;Daejeon", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Hwang_2013_ICCV,\n \n author = {\n Hwang,\n Wonjun and Roh,\n Kyungshik and Kim,\n Junmo\n},\n title = {\n Markov Network-Based Unified Classifier for Face Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "efa5426c56", @@ -7742,7 +8002,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yacoob_2013_ICCV,\n \n author = {\n Yacoob,\n Yaser\n},\n title = {\n Matching Dry to Wet Materials\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a205ac44d0", @@ -7771,7 +8032,8 @@ "aff_unique_url": "https://www.sri.com", "aff_unique_abbr": "SRI", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ali_2013_ICCV,\n \n author = {\n Ali,\n Saad\n},\n title = {\n Measuring Flow Complexity in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2559d6bfe4", @@ -7802,7 +8064,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Lee_2013_ICCV,\n \n author = {\n Lee,\n Choon-Meng and Cheong,\n 
Loong-Fah\n},\n title = {\n Minimal Basis Facility Location for Subspace Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dbbff6a7fe", @@ -7826,14 +8089,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen Institute of Advanced Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Shenzhen Institutes of Advanced Technology", "aff_unique_dep": "Department of Information Engineering;key lab of Computer Vision & Pattern Recognition", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.ac.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0+1;1;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Limin and Qiao,\n Yu and Tang,\n Xiaoou\n},\n title = {\n Mining Motion Atoms and Phrases for Complex Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "463dcb41c4", @@ -7864,7 +8128,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Fernando_2013_ICCV,\n \n author = {\n Fernando,\n Basura and Tuytelaars,\n Tinne\n},\n title = {\n Mining Multiple Queries for Image Retrieval: On-the-Fly Learning of an Object-Specific Mid-level Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "13652581c6", @@ -7895,7 +8160,8 @@ "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Cheng and Kitani,\n Kris M.\n},\n title = {\n Model Recommendation with Virtual Probes for Egocentric Hand Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b284ec87f6", @@ -7904,7 +8170,7 @@ "author": "Ping Wei; Yibiao Zhao; Nanning Zheng; Song-Chun Zhu", "abstract": "Recognizing the events and objects in the video sequence are two challenging tasks due to the complex temporal structures and the large appearance variations. In this paper, we propose a 4D human-object interaction model, where the two tasks jointly boost each other. Our human-object interaction is defined in 4D space: i) the cooccurrence and geometric constraints of human pose and object in 3D space; ii) the sub-events transition and objects coherence in 1D temporal dimension. We represent the structure of events, sub-events and objects in a hierarchical graph. For an input RGB-depth video, we design a dynamic programming beam search algorithm to: i) segment the video, ii) recognize the events, and iii) detect the objects simultaneously. For evaluation, we built a large-scale multiview 3D event dataset which contains 3815 video sequences and 383,036 RGBD frames captured by the Kinect cameras. 
The experiment results on this dataset show the effectiveness of our method.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Wei_Modeling_4D_Human-Object_2013_ICCV_paper.pdf", - "aff": "Xi\u2019an Jiaotong University, China+University of California, Los Angeles, USA; University of California, Los Angeles, USA; Xi\u2019an Jiaotong University, China; University of California, Los Angeles, USA", + "aff": "Xi’an Jiaotong University, China+University of California, Los Angeles, USA; University of California, Los Angeles, USA; Xi’an Jiaotong University, China; University of California, Los Angeles, USA", "project": "", "github": "", "supp": "", @@ -7919,14 +8185,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;1;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;University of California, Los Angeles", + "aff_unique_norm": "Xi'an Jiaotong University;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "http://en.xjtu.edu.cn/;https://www.ucla.edu", "aff_unique_abbr": "XJTU;UCLA", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wei_2013_ICCV,\n \n author = {\n Wei,\n Ping and Zhao,\n Yibiao and Zheng,\n Nanning and Zhu,\n Song-Chun\n},\n title = {\n Modeling 4D Human-Object Interactions for Event and Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bd6b2197ea", @@ -7957,7 +8224,8 @@ "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Beijing;Los Angeles", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Bo and Hu,\n Wenze and Wu,\n 
Tianfu and Zhu,\n Song-Chun\n},\n title = {\n Modeling Occlusion by Discriminative AND-OR Structures\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0d0698cb8e", @@ -7988,7 +8256,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Yanchao and Sundaramoorthi,\n Ganesh\n},\n title = {\n Modeling Self-Occlusions in Dynamic Shape and Appearance Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6352fa72e7", @@ -8010,7 +8279,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Cho_2013_ICCV,\n \n author = {\n Cho,\n Donghyeon and Lee,\n Minhaeng and Kim,\n Sunyeong and Tai,\n Yu-Wing\n},\n title = {\n Modeling the Calibration Pipeline of the Lytro Camera for High Quality Light-Field Image Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ca1c6b9904", @@ -8041,7 +8311,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khosla_2013_ICCV,\n \n author = {\n Khosla,\n Aditya and Bainbridge,\n Wilma A. 
and Torralba,\n Antonio and Oliva,\n Aude\n},\n title = {\n Modifying the Memorability of Face Photographs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "abfe87b520", @@ -8072,7 +8343,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Radwan_2013_ICCV,\n \n author = {\n Radwan,\n Ibrahim and Dhall,\n Abhinav and Goecke,\n Roland\n},\n title = {\n Monocular Image 3D Human Pose Estimation under Self-Occlusion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8428c72525", @@ -8103,7 +8375,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Amer_2013_ICCV,\n \n author = {\n Amer,\n Mohamed R. 
and Todorovic,\n Sinisa and Fern,\n Alan and Zhu,\n Song-Chun\n},\n title = {\n Monte Carlo Tree Search for Scheduling Activity Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "879470a4ae", @@ -8134,7 +8407,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Stanford;Hong Kong SAR", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Dingzeyu and Chen,\n Qifeng and Tang,\n Chi-Keung\n},\n title = {\n Motion-Aware KNN Laplacian for Video Matting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4775bc25e2", @@ -8165,7 +8439,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chiang_2013_ICCV,\n \n author = {\n Chiang,\n Chen-Kuo and Su,\n Te-Feng and Yen,\n Chih and Lai,\n Shang-Hong\n},\n title = {\n Multi-attributed Dictionary Learning for Sparse Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b001744718", @@ -8196,7 +8471,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Singapore;Australia" + "aff_country_unique": "Singapore;Australia", + "bibtex": "@InProceedings{Galoogahi_2013_ICCV,\n \n author = {\n Galoogahi,\n Hamed Kiani and Sim,\n Terence and Lucey,\n Simon\n},\n title = {\n Multi-channel Correlation Filters\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = 
{\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e3ce1683d7", @@ -8227,7 +8503,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Binghamton", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hu_2013_ICCV,\n \n author = {\n Hu,\n Kaoning and Yin,\n Lijun\n},\n title = {\n Multi-scale Topological Features for Hand Posture Representation and Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1813902650", @@ -8249,7 +8526,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Zeng_2013_ICCV,\n \n author = {\n Zeng,\n Xingyu and Ouyang,\n Wanli and Wang,\n Xiaogang\n},\n title = {\n Multi-stage Contextual Deep Learning for Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e8b5fee136", @@ -8280,7 +8558,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Kim_2013_ICCV,\n \n author = {\n Kim,\n Jae-Hak and Dai,\n Yuchao and Li,\n Hongdong and Du,\n Xin and Kim,\n Jonghyuk\n},\n title = {\n Multi-view 3D Reconstruction from Uncalibrated Radially-Symmetric Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f8a4cfc093", @@ -8311,7 +8590,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Weinmann_2013_ICCV,\n \n 
author = {\n Weinmann,\n Michael and Osep,\n Aljosa and Ruiters,\n Roland and Klein,\n Reinhard\n},\n title = {\n Multi-view Normal Field Integration for 3D Reconstruction of Mirroring Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3e853ea2eb", @@ -8320,7 +8600,7 @@ "author": "Abdelaziz Djelouah; Jean-Sebastien Franco; Edmond Boyer; Francois Le Clerc; Patrick Perez", "abstract": "In this paper, we address the problem of object segmentation in multiple views or videos when two or more viewpoints of the same scene are available. We propose a new approach that propagates segmentation coherence information in both space and time, hence allowing evidences in one image to be shared over the complete set. To this aim the segmentation is cast as a single efficient labeling problem over space and time with graph cuts. In contrast to most existing multi-view segmentation methods that rely on some form of dense reconstruction, ours only requires a sparse 3D sampling to propagate information between viewpoints. The approach is thoroughly evaluated on standard multiview datasets, as well as on videos. With static views, results compete with state of the art methods but they are achieved with significantly fewer viewpoints. 
With multiple videos, we report results that demonstrate the benefit of segmentation propagation through temporal cues.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Djelouah_Multi-view_Object_Segmentation_2013_ICCV_paper.pdf", - "aff": "LJK - INRIA Rh\u00f4ne-Alpes, France + Technicolor, Cesson Sevigne, France; LJK - INRIA Rh\u00f4ne-Alpes, France; LJK - INRIA Rh\u00f4ne-Alpes, France; Technicolor, Cesson Sevigne, France; Technicolor, Cesson Sevigne, France", + "aff": "LJK - INRIA Rhône-Alpes, France + Technicolor, Cesson Sevigne, France; LJK - INRIA Rhône-Alpes, France; LJK - INRIA Rhône-Alpes, France; Technicolor, Cesson Sevigne, France; Technicolor, Cesson Sevigne, France", "project": "", "github": "", "supp": "", @@ -8335,14 +8615,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;0;1;1", - "aff_unique_norm": "INRIA Rh\u00f4ne-Alpes;Technicolor", + "aff_unique_norm": "INRIA Rhône-Alpes;Technicolor", "aff_unique_dep": "LJK;", "aff_unique_url": "https://www.inria.fr/en/centre/rhone-alpes;https://www.technicolor.com", "aff_unique_abbr": "INRIA;", "aff_campus_unique_index": "0+1;0;0;1;1", - "aff_campus_unique": "Rh\u00f4ne-Alpes;Cesson Sevigne", + "aff_campus_unique": "Rhône-Alpes;Cesson Sevigne", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Djelouah_2013_ICCV,\n \n author = {\n Djelouah,\n Abdelaziz and Franco,\n Jean-Sebastien and Boyer,\n Edmond and Le Clerc,\n Francois and Perez,\n Patrick\n},\n title = {\n Multi-view Object Segmentation in Space and Time\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0e7554656f", @@ -8373,7 +8654,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Merced;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United 
States;Japan", + "bibtex": "@InProceedings{Wu_2013_ICCV,\n \n author = {\n Wu,\n Yi and Ijiri,\n Yoshihisa and Yang,\n Ming-Hsuan\n},\n title = {\n Multiple Non-rigid Surface Detection and Registration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a1692f4fb6", @@ -8396,15 +8678,16 @@ "author_num": 5, "track": "main", "status": "Poster", - "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Microsoft", - "aff_unique_dep": ";Microsoft Research", - "aff_unique_url": "https://www.kaist.ac.kr;https://www.microsoft.com/en-us/research", - "aff_unique_abbr": "KAIST;MSR", + "aff_unique_index": "0;1;2;0;0", + "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Microsoft Research;Microsoft Research Asia", + "aff_unique_dep": ";Microsoft Research;", + "aff_unique_url": "https://www.kaist.ac.kr;https://www.microsoft.com/en-us/research;https://www.microsoft.com/en-us/research/group/asia", + "aff_unique_abbr": "KAIST;MSR;MSRA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Redmond", "aff_country_unique_index": "0;1;2;0;0", - "aff_country_unique": "South Korea;United States;China" + "aff_country_unique": "South Korea;United States;China", + "bibtex": "@InProceedings{Park_2013_ICCV,\n \n author = {\n Park,\n Jaesik and Sinha,\n Sudipta N. 
and Matsushita,\n Yasuyuki and Tai,\n Yu-Wing and Kweon,\n In So\n},\n title = {\n Multiview Photometric Stereo Using Planar Mesh Parameterization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a03e1776fc", @@ -8426,7 +8709,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Xinlei and Shrivastava,\n Abhinav and Gupta,\n Abhinav\n},\n title = {\n NEIL: Extracting Visual Knowledge from Web Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "88675990ac", @@ -8457,7 +8741,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Matzen_2013_ICCV,\n \n author = {\n Matzen,\n Kevin and Snavely,\n Noah\n},\n title = {\n NYC3DCars: A Dataset of 3D Vehicles in Geographic Context\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6fb0c05d1f", @@ -8488,7 +8773,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Inoue_2013_ICCV,\n \n author = {\n Inoue,\n Nakamasa and Shinoda,\n Koichi\n},\n title = {\n Neighbor-to-Neighbor Search for Fast Coding of Feature Vectors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4136d972fe", @@ -8519,7 +8805,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Byrne_2013_ICCV,\n \n author = {\n Byrne,\n Jeffrey and Shi,\n Jianbo\n},\n title = {\n Nested Shape Descriptors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e8702f9938", @@ -8550,7 +8837,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wilson_2013_ICCV,\n \n author = {\n Wilson,\n Kyle and Snavely,\n Noah\n},\n title = {\n Network Principles for SfM: Disambiguating Repeated Structures with Local Context\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c59fff14f2", @@ -8581,7 +8869,8 @@ "aff_campus_unique_index": "0;0;1;0+1", "aff_campus_unique": "Arlington;Sydney", "aff_country_unique_index": "0;0;1;0+1", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Cai_2013_ICCV,\n \n author = {\n Cai,\n Xiao and Nie,\n Feiping and Cai,\n Weidong and Huang,\n Heng\n},\n title = {\n New Graph Structured Sparsity Model for Multi-label Image Annotations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7c3493f1ca", @@ -8603,7 +8892,8 @@ "email": ";;;;", "author_num": 5, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Yan_2013_ICCV,\n \n author = {\n Yan,\n Yan and Ricci,\n Elisa and Subramanian,\n Ramanathan and Lanz,\n Oswald and Sebe,\n Nicu\n},\n title = {\n No Matter Where You Are: Flexible Graph-Guided Multi-task 
Learning for Multi-view Head Pose Classification under Target Motion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "063693bc61", @@ -8625,7 +8915,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Gupta_2013_ICCV,\n \n author = {\n Das Gupta,\n Mithun and Kumar,\n Sanjeev\n},\n title = {\n Non-convex P-Norm Projection for Robust Sparsity\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5b70d52b8a", @@ -8656,7 +8947,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Michaeli_2013_ICCV,\n \n author = {\n Michaeli,\n Tomer and Irani,\n Michal\n},\n title = {\n Nonparametric Blind Super-resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5842d74375", @@ -8678,14 +8970,15 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Panda_2013_ICCV,\n \n author = {\n Panda,\n Jayaguru and Brown,\n Michael S. 
and Jawahar,\n C.V.\n},\n title = {\n Offline Mobile Instance Retrieval with a Small Memory Footprint\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "40384f03e1", "title": "On One-Shot Similarity Kernels: Explicit Feature Maps and Properties", "site": "http://openaccess.thecvf.com/content_iccv_2013/html/Zafeiriou_On_One-Shot_Similarity_2013_ICCV_paper.html", "author": "Stefanos Zafeiriou; Irene Kotsia", - "abstract": "Kernels have been a common tool of machine learning and computer vision applications for modeling nonlinearities and/or the design of robust 1 similarity measures between objects. Arguably, the class of positive semidefinite (psd) kernels, widely known as Mercer\u00e2\u0080\u0099s Kernels,constitutes one of the most well-studied cases. For every psd kernel there exists an associated feature map to an arbitrary dimensional Hilbert space H, the so-called feature space. The main reason behind psd kernels\u00e2\u0080\u0099 popularity is the fact that classification/regression techniques (such as Support Vector Machines (SVMs)) and component analysis algorithms (such as Kernel Principal Component Analysis (KPCA)) can be devised in H, without an explicit definition of the feature map, only by using the kernel (the so-called kernel trick). Recently, due to the development of very efficient solutions for large scale linear SVMs and for incremental linear component analysis, the research towards finding feature map approximations for classes of kernels has attracted significant interest. In this paper, we attempt the derivation of explicit feature maps of a recently proposed class of kernels, the so-called one-shot similarity kernels. We show that for this class of kernels either there exists an explicit representation in feature space or the kernel can be expressed in such a form that allows for exact incremental learning. 
We theoretically explore the properties of these kernels and show how these kernels can be used for the development of robust visual tracking, recognition and deformable fitting algorithms.", + "abstract": "Kernels have been a common tool of machine learning and computer vision applications for modeling nonlinearities and/or the design of robust 1 similarity measures between objects. Arguably, the class of positive semidefinite (psd) kernels, widely known as Mercer’s Kernels,constitutes one of the most well-studied cases. For every psd kernel there exists an associated feature map to an arbitrary dimensional Hilbert space H, the so-called feature space. The main reason behind psd kernels’ popularity is the fact that classification/regression techniques (such as Support Vector Machines (SVMs)) and component analysis algorithms (such as Kernel Principal Component Analysis (KPCA)) can be devised in H, without an explicit definition of the feature map, only by using the kernel (the so-called kernel trick). Recently, due to the development of very efficient solutions for large scale linear SVMs and for incremental linear component analysis, the research towards finding feature map approximations for classes of kernels has attracted significant interest. In this paper, we attempt the derivation of explicit feature maps of a recently proposed class of kernels, the so-called one-shot similarity kernels. We show that for this class of kernels either there exists an explicit representation in feature space or the kernel can be expressed in such a form that allows for exact incremental learning. 
We theoretically explore the properties of these kernels and show how these kernels can be used for the development of robust visual tracking, recognition and deformable fitting algorithms.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zafeiriou_On_One-Shot_Similarity_2013_ICCV_paper.pdf", "aff": "Department of Computing, Imperial College London; Electronics Laboratory, Department of Physics, University of Patras, Greece+School of Science and Technology, Middlesex University, London", "project": "", @@ -8709,7 +9002,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1+0", - "aff_country_unique": "United Kingdom;Greece" + "aff_country_unique": "United Kingdom;Greece", + "bibtex": "@InProceedings{Zafeiriou_2013_ICCV,\n \n author = {\n Zafeiriou,\n Stefanos and Kotsia,\n Irene\n},\n title = {\n On One-Shot Similarity Kernels: Explicit Feature Maps and Properties\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4d2d0d19dd", @@ -8740,7 +9034,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Caen", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Chakik_2013_ICCV,\n \n author = {\n El Chakik,\n Abdallah and Elmoataz,\n Abderrahim and Sadi,\n Ahcene\n},\n title = {\n On the Mean Curvature Flow on Graphs with Applications in Image and Manifold Processing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "13eb7909fb", @@ -8771,7 +9066,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Elqursh_2013_ICCV,\n \n author = {\n Elqursh,\n 
Ali and Elgammal,\n Ahmed\n},\n title = {\n Online Motion Segmentation Using Dynamic Label Propagation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "17408d5d12", @@ -8795,14 +9091,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft", + "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.ust.hk;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "HKUST;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Naiyan and Wang,\n Jingdong and Yeung,\n Dit-Yan\n},\n title = {\n Online Robust Non-negative Dictionary Learning for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e33f3bd3e0", @@ -8824,7 +9121,8 @@ "email": ";;;;", "author_num": 5, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Bergh_2013_ICCV,\n \n author = {\n Van Den Bergh,\n Michael and Roig,\n Gemma and Boix,\n Xavier and Manen,\n Santiago and Van Gool,\n Luc\n},\n title = {\n Online Video SEEDS for Temporal Window Objectness\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6f71c2a082", @@ -8855,7 +9153,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + 
"aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2013_ICCV,\n \n author = {\n Kim,\n Tae Hyun and Lee,\n Hee Seok and Lee,\n Kyoung Mu\n},\n title = {\n Optical Flow via Locally Adaptive Fusion of Complementary Data Costs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a0fc91370d", @@ -8879,14 +9178,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1+2;3;0+1", - "aff_unique_norm": "INRIA;CEREA2;Universit\u00e9 de Versailles Saint-Quentin-en-Yvelines;Lithicon", + "aff_unique_norm": "INRIA;CEREA2;Université de Versailles Saint-Quentin-en-Yvelines;Lithicon", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.inria.fr;;https://www.uvsq.fr;", "aff_unique_abbr": "INRIA;;UVSQ;Lithicon", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;2;0", - "aff_country_unique": "France;;Norway" + "aff_country_unique": "France;;Norway", + "bibtex": "@InProceedings{Huot_2013_ICCV,\n \n author = {\n Huot,\n Etienne and Papari,\n Giuseppe and Herlin,\n Isabelle\n},\n title = {\n Optimal Orthogonal Basis and Image Assimilation: Motion Modeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "53e6f5262a", @@ -8913,11 +9213,12 @@ "aff_unique_norm": "University of Lincoln;Imperial College London;University of Twente", "aff_unique_dep": "School of Computer Science;Department of Computing;", "aff_unique_url": "https://www.lincoln.ac.uk;https://www.imperial.ac.uk;https://www.utwente.nl", - "aff_unique_abbr": "UoL;Imperial;UT", + "aff_unique_abbr": ";Imperial;UT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";London", "aff_country_unique_index": "0+0;0+1", - "aff_country_unique": "United Kingdom;Netherlands" + "aff_country_unique": "United Kingdom;Netherlands", + "bibtex": 
"@InProceedings{Tzimiropoulos_2013_ICCV,\n \n author = {\n Tzimiropoulos,\n Georgios and Pantic,\n Maja\n},\n title = {\n Optimization Problems for Fast AAM Fitting in-the-Wild\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9fb75b3e91", @@ -8939,7 +9240,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Hong_2013_ICCV,\n \n author = {\n Hong,\n Seunghoon and Kwak,\n Suha and Han,\n Bohyung\n},\n title = {\n Orderless Tracking through Model-Averaged Posterior Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7e68fd6c6a", @@ -8948,7 +9250,7 @@ "author": "Philipp Heise; Sebastian Klose; Brian Jensen; Alois Knoll", "abstract": "Most stereo correspondence algorithms match support windows at integer-valued disparities and assume a constant disparity value within the support window. The recently proposed PatchMatch stereo algorithm [7] overcomes this limitation of previous algorithms by directly estimating planes. This work presents a method that integrates the PatchMatch stereo algorithm into a variational smoothing formulation using quadratic relaxation. The resulting algorithm allows the explicit regularization of the disparity and normal gradients using the estimated plane parameters. 
Evaluation of our method in the Middlebury benchmark shows that our method outperforms the traditional integer-valued disparity strategy as well as the original algorithm and its variants in sub-pixel accurate disparity estimation.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Heise_PM-Huber_PatchMatch_with_2013_ICCV_paper.pdf", - "aff": "Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany; Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany; Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany; Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany", + "aff": "Department of Informatics, Technische Universität München, Germany; Department of Informatics, Technische Universität München, Germany; Department of Informatics, Technische Universität München, Germany; Department of Informatics, Technische Universität München, Germany", "project": "", "github": "", "supp": "", @@ -8963,14 +9265,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen", + "aff_unique_norm": "Technische Universität München", "aff_unique_dep": "Department of Informatics", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Heise_2013_ICCV,\n \n author = {\n Heise,\n Philipp and Klose,\n Sebastian and Jensen,\n Brian and Knoll,\n Alois\n},\n title = {\n PM-Huber: PatchMatch with Huber Regularization for Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "be02e274de", @@ -8994,14 +9297,15 @@ "track": "main", "status": "Poster", 
"aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Tsinghua University;Chinese University of Hong Kong;Queen Mary University of London", + "aff_unique_norm": "Tsinghua University;The Chinese University of Hong Kong;Queen Mary University of London", "aff_unique_dep": "Dept. of Electronic Engineering;Dept. of Information Engineering;School of EECS", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.cuhk.edu.hk;https://www.qmul.ac.uk", "aff_unique_abbr": "THU;CUHK;QMUL", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;London", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Liu_2013_ICCV,\n \n author = {\n Liu,\n Chunxiao and Loy,\n Chen Change and Gong,\n Shaogang and Wang,\n Guijin\n},\n title = {\n POP: Person Re-identification Post-rank Optimisation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1f643c73d0", @@ -9032,7 +9336,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yamaguchi_2013_ICCV,\n \n author = {\n Yamaguchi,\n Kota and Kiapour,\n M. 
Hadi and Berg,\n Tamara L.\n},\n title = {\n Paper Doll Parsing: Retrieving Similar Styles to Parse Clothing Items\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b3642ce65f", @@ -9056,14 +9361,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Florida State University;Ohio State University;University of Nottingham", - "aff_unique_dep": ";;", + "aff_unique_norm": "The Florida State University;The Ohio State University;The University of Nottingham", + "aff_unique_dep": "Department of Physics;;", "aff_unique_url": "https://www.fsu.edu;https://www.osu.edu;https://www.nottingham.ac.uk", "aff_unique_abbr": "FSU;OSU;Nottingham", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Xie_2013_ICCV,\n \n author = {\n Xie,\n Qian and Kurtek,\n Sebastian and Le,\n Huiling and Srivastava,\n Anuj\n},\n title = {\n Parallel Transport of Deformations in Shape Space of Elastic Surfaces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6ea81b3863", @@ -9094,7 +9400,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lim_2013_ICCV,\n \n author = {\n Lim,\n Joseph J. 
and Pirsiavash,\n Hamed and Torralba,\n Antonio\n},\n title = {\n Parsing IKEA Objects: Fine Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "233601aead", @@ -9125,7 +9432,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2", - "aff_country_unique": "Sweden;Canada;Austria" + "aff_country_unique": "Sweden;Canada;Austria", + "bibtex": "@InProceedings{Olsson_2013_ICCV,\n \n author = {\n Olsson,\n Carl and Ulen,\n Johannes and Boykov,\n Yuri and Kolmogorov,\n Vladimir\n},\n title = {\n Partial Enumeration and Curvature Regularization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e438ae06ea", @@ -9156,7 +9464,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "South Korea;Switzerland" + "aff_country_unique": "South Korea;Switzerland", + "bibtex": "@InProceedings{Oh_2013_ICCV,\n \n author = {\n Oh,\n Tae-Hyun and Kim,\n Hyeongwoo and Tai,\n Yu-Wing and Bazin,\n Jean-Charles and Kweon,\n In So\n},\n title = {\n Partial Sum Minimization of Singular Values in RPCA for Low-Level Vision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5c6a80ee06", @@ -9180,14 +9489,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;Chinese Academy of Sciences", "aff_unique_dep": "Department of Information Engineering;Shenzhen Institutes of Advanced Technology", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.cas.cn", "aff_unique_abbr": 
"CUHK;SIAT", "aff_campus_unique_index": "0+1;0;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2013_ICCV,\n \n author = {\n Luo,\n Ping and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n Pedestrian Parsing via Deep Decompositional Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "af25167431", @@ -9196,7 +9506,7 @@ "author": "Wufeng Xue; Xuanqin Mou; Lei Zhang; Xiangchu Feng", "abstract": "How to measure the perceptual quality of natural images is an important problem in low level vision. It is known that the Mean Squared Error (MSE) is not an effective index to describe the perceptual fidelity of images. Numerous perceptual fidelity indices have been developed, while the representatives include the Structural SIMilarity (SSIM) index and its variants. However, most of those perceptual measures are nonlinear, and they cannot be easily adopted as an objective function to minimize in various low level vision tasks. Can MSE be perceptual fidelity aware after some minor adaptation? In this paper we propose a simple framework to enhance the perceptual fidelity awareness of MSE by introducing an l 2 -norm structural error term to it. Such a Structural MSE (SMSE) can lead to very competitive image quality assessment (IQA) results. More surprisingly, we show that by using certain structure extractors, SMSE can be further turned into a Gaussian smoothed MSE (i.e., the Euclidean distance between the original and distorted images after Gaussian smooth filtering), which is much simpler to calculate but achieves rather better IQA performance than SSIM. 
The socalled Perceptual-fidelity Aware MSE (PAMSE) can have great potentials in applications such as perceptual image coding and perceptual image restoration.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Xue_Perceptual_Fidelity_Aware_2013_ICCV_paper.pdf", - "aff": "Institute of Image Processing and Pattern Recognition, Xi\u2019an Jiaotong University, CHINA+Department of Computing, The Hong Kong Polytechnic University, Hong Kong, CHINA; Institute of Image Processing and Pattern Recognition, Xi\u2019an Jiaotong University, CHINA; Department of Computing, The Hong Kong Polytechnic University, Hong Kong, CHINA; School of Science, XiDian University, CHINA", + "aff": "Institute of Image Processing and Pattern Recognition, Xi’an Jiaotong University, CHINA+Department of Computing, The Hong Kong Polytechnic University, Hong Kong, CHINA; Institute of Image Processing and Pattern Recognition, Xi’an Jiaotong University, CHINA; Department of Computing, The Hong Kong Polytechnic University, Hong Kong, CHINA; School of Science, XiDian University, CHINA", "project": "", "github": "", "supp": "", @@ -9211,14 +9521,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;1;2", - "aff_unique_norm": "Xi'an Jiao Tong University;Hong Kong Polytechnic University;Xidian University", + "aff_unique_norm": "Xi'an Jiaotong University;The Hong Kong Polytechnic University;XiDian University", "aff_unique_dep": "Institute of Image Processing and Pattern Recognition;Department of Computing;School of Science", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.polyu.edu.hk;http://www.xidian.edu.cn/", "aff_unique_abbr": "XJTU;PolyU;XDU", "aff_campus_unique_index": "0+1;0;1", "aff_campus_unique": "Xi'an;Hong Kong;", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xue_2013_ICCV,\n \n author = {\n Xue,\n Wufeng and Mou,\n Xuanqin and Zhang,\n Lei and Feng,\n Xiangchu\n},\n title = 
{\n Perceptual Fidelity Aware Mean Squared Error\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "6dc2067e7b", @@ -9242,14 +9553,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Electronic Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2013_ICCV,\n \n author = {\n Zhao,\n Rui and Ouyang,\n Wanli and Wang,\n Xiaogang\n},\n title = {\n Person Re-identification by Salience Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4d133ba9f4", @@ -9280,7 +9592,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Suzhou", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Zhuwen and Guo,\n Jiaming and Cheong,\n Loong-Fah and Zhou,\n Steven Zhiying\n},\n title = {\n Perspective Motion Segmentation via Collaborative Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "006fad6817", @@ -9302,7 +9615,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Bissacco_2013_ICCV,\n \n author = {\n Bissacco,\n Alessandro and Cummins,\n Mark and Netzer,\n Yuval and Neven,\n Hartmut\n},\n title = {\n PhotoOCR: Reading 
Text in Uncontrolled Conditions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8d58c1859c", @@ -9333,7 +9647,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+0", - "aff_country_unique": "Romania;Sweden" + "aff_country_unique": "Romania;Sweden", + "bibtex": "@InProceedings{Marinoiu_2013_ICCV,\n \n author = {\n Marinoiu,\n Elisabeta and Papava,\n Dragos and Sminchisescu,\n Cristian\n},\n title = {\n Pictorial Human Spaces: How Well Do Humans Perceive a 3D Articulated Pose?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f2807085da", @@ -9355,7 +9670,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Vogel_2013_ICCV,\n \n author = {\n Vogel,\n Christoph and Schindler,\n Konrad and Roth,\n Stefan\n},\n title = {\n Piecewise Rigid Scene Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "97911485d6", @@ -9364,7 +9680,7 @@ "author": "Stefan Duffner; Christophe Garcia", "abstract": "In this paper, we present a novel algorithm for fast tracking of generic objects in videos. The algorithm uses two components: a detector that makes use of the generalised Hough transform with pixel-based descriptors, and a probabilistic segmentation method based on global models for foreground and background. These components are used for tracking in a combined way, and they adapt each other in a co-training manner. Through effective model adaptation and segmentation, the algorithm is able to track objects that undergo rigid and non-rigid deformations and considerable shape and appearance variations. 
The proposed tracking method has been thoroughly evaluated on challenging standard videos, and outperforms state-of-theart tracking methods designed for the same task. Finally, the proposed models allow for an extremely efficient implementation, and thus tracking is very fast.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Duffner_PixelTrack_A_Fast_2013_ICCV_paper.pdf", - "aff": "Universit \u00b4e de Lyon, CNRS; INSA-Lyon, LIRIS, UMR5205, F-69621, France", + "aff": "Université de Lyon, CNRS; INSA-Lyon, LIRIS, UMR5205, F-69621, France", "project": "", "github": "", "supp": "", @@ -9379,14 +9695,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "Universit\u00e9 de Lyon;INSA Lyon", + "aff_unique_norm": "Université de Lyon;INSA Lyon", "aff_unique_dep": ";LIRIS", "aff_unique_url": "https://www.universite-lyon.fr;https://www.insa-lyon.fr", "aff_unique_abbr": "UDL;INSA-Lyon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Duffner_2013_ICCV,\n \n author = {\n Duffner,\n Stefan and Garcia,\n Christophe\n},\n title = {\n PixelTrack: A Fast Adaptive Algorithm for Tracking Non-rigid Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dc0b95253e", @@ -9417,7 +9734,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Freiburg", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Ummenhofer_2013_ICCV,\n \n author = {\n Ummenhofer,\n Benjamin and Brox,\n Thomas\n},\n title = {\n Point-Based 3D Reconstruction of Thin Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { 
"id": "5c920b5782", @@ -9439,7 +9757,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Alahari_2013_ICCV,\n \n author = {\n Alahari,\n Karteek and Seguin,\n Guillaume and Sivic,\n Josef and Laptev,\n Ivan\n},\n title = {\n Pose Estimation and Segmentation of People in 3D Movies\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dc3eb8f9c5", @@ -9470,7 +9789,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lund", "aff_country_unique_index": "0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Kuang_2013_ICCV,\n \n author = {\n Kuang,\n Yubin and Astrom,\n Kalle\n},\n title = {\n Pose Estimation with Unknown Focal Length Using Points,\n Directions and Lines\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "da7336d71e", @@ -9501,7 +9821,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Gdansk;", "aff_country_unique_index": "0;1", - "aff_country_unique": "Poland;France" + "aff_country_unique": "Poland;France", + "bibtex": "@InProceedings{Wesierski_2013_ICCV,\n \n author = {\n Wesierski,\n Daniel and Horain,\n Patrick\n},\n title = {\n Pose-Configurable Generic Tracking of Elongated Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f4e4821299", @@ -9532,7 +9853,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Arlington;Charlotte", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yu_2013_ICCV,\n \n author = {\n Yu,\n Xiang and Huang,\n Junzhou and Zhang,\n Shaoting 
and Yan,\n Wang and Metaxas,\n Dimitris N.\n},\n title = {\n Pose-Free Facial Landmark Fitting via Optimized Part Mixtures and Cascaded Deformable Shape Model\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "db7ad17b2c", @@ -9563,7 +9885,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Gridchyn_2013_ICCV,\n \n author = {\n Gridchyn,\n Igor and Kolmogorov,\n Vladimir\n},\n title = {\n Potts Model,\n Parametric Maxflow and K-Submodular Functions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1bd7d9c432", @@ -9594,7 +9917,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Park_2013_ICCV,\n \n author = {\n Park,\n Hyun Soo and Jain,\n Eakta and Sheikh,\n Yaser\n},\n title = {\n Predicting Primary Gaze Behavior Using Social Saliency Fields\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b36344dedd", @@ -9625,7 +9949,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jain_2013_ICCV,\n \n author = {\n Jain,\n Suyog Dutt and Grauman,\n Kristen\n},\n title = {\n Predicting Sufficient Annotation Strength for Interactive Foreground Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n 
year = {\n 2013\n} \n}" }, { "id": "4aeea1ed9c", @@ -9656,7 +9981,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Unknown" + "aff_country_unique": "Unknown", + "bibtex": "@InProceedings{Serrano_2013_ICCV,\n \n author = {\n Serrano,\n Jose A. Rodriguez and Larlus,\n Diane\n},\n title = {\n Predicting an Object Location Using a Global Image Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4cd68c8588", @@ -9687,7 +10013,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Manen_2013_ICCV,\n \n author = {\n Manen,\n Santiago and Guillaumin,\n Matthieu and Van Gool,\n Luc\n},\n title = {\n Prime Object Proposals with Randomized Prim's Algorithm\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a88039117b", @@ -9711,14 +10038,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1;1;1", - "aff_unique_norm": "Stevens Institute of Technology;Adobe", - "aff_unique_dep": ";Adobe Systems Incorporated", + "aff_unique_norm": "Stevens Institute of Technology;Adobe Systems Incorporated", + "aff_unique_dep": ";", "aff_unique_url": "https://www.stevens.edu;https://www.adobe.com", "aff_unique_abbr": "SIT;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Haoxiang and Hua,\n Gang and Lin,\n Zhe and Brandt,\n Jonathan and Yang,\n Jianchao\n},\n title = {\n Probabilistic Elastic Part 
Model for Unsupervised Face Detector Adaptation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "674709215c", @@ -9749,7 +10077,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Pasadena;Berkeley", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Maire_2013_ICCV,\n \n author = {\n Maire,\n Michael and Yu,\n Stella X.\n},\n title = {\n Progressive Multigrid Eigensolvers for Multiscale Spectral Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b4bfbe8113", @@ -9780,7 +10109,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Nieuwenhuis_2013_ICCV,\n \n author = {\n Nieuwenhuis,\n Claudia and Strekalovskiy,\n Evgeny and Cremers,\n Daniel\n},\n title = {\n Proportion Priors for Image Sequence Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8ce42e461d", @@ -9811,7 +10141,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Swears_2013_ICCV,\n \n author = {\n Swears,\n Eran and Hoogs,\n Anthony and Boyer,\n Kim\n},\n title = {\n Pyramid Coding for Functional Scene Element Recognition in Video Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": 
"a95316dd3f", @@ -9842,7 +10173,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Law_2013_ICCV,\n \n author = {\n Law,\n Marc T. and Thome,\n Nicolas and Cord,\n Matthieu\n},\n title = {\n Quadruplet-Wise Image Similarity Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d081f71132", @@ -9871,7 +10203,8 @@ "aff_unique_url": "https://www.ntua.gr", "aff_unique_abbr": "NTUA", "aff_country_unique_index": "0", - "aff_country_unique": "Greece" + "aff_country_unique": "Greece", + "bibtex": "@InProceedings{Avrithis_2013_ICCV,\n \n author = {\n Avrithis,\n Yannis\n},\n title = {\n Quantize and Conquer: A Dimensionality-Recursive Solution to Clustering,\n Vector Quantization,\n and Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "313bdd8800", @@ -9902,7 +10235,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Tokyo;Rennes", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Japan;France" + "aff_country_unique": "Japan;France", + "bibtex": "@InProceedings{Zhu_2013_ICCV,\n \n author = {\n Zhu,\n Cai-Zhi and Jegou,\n Herve and Satoh,\n Shin Ichi\n},\n title = {\n Query-Adaptive Asymmetrical Dissimilarities for Visual Object Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4cc97f01ed", @@ -9924,7 +10258,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Yizhe and Shao,\n Ming and Wong,\n Edward K. 
and Fu,\n Yun\n},\n title = {\n Random Faces Guided Sparse Many-to-One Encoder for Pose-Invariant Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e884573f42", @@ -9933,7 +10268,7 @@ "author": "Javier Marin; David Vazquez; Antonio M. Lopez; Jaume Amores; Bastian Leibe", "abstract": "Pedestrian detection is one of the most challenging tasks in computer vision, and has received a lot of attention in the last years. Recently, some authors have shown the advantages of using combinations of part/patch-based detectors in order to cope with the large variability of poses and the existence of partial occlusions. In this paper, we propose a pedestrian detection method that efficiently combines multiple local experts by means of a Random Forest ensemble. The proposed method works with rich block-based representations such as HOG and LBP, in such a way that the same features are reused by the multiple local experts, so that no extra computational cost is needed with respect to a holistic method. Furthermore, we demonstrate how to integrate the proposed approach with a cascaded architecture in order to achieve not only high accuracy but also an acceptable efficiency. In particular, the resulting detector operates at five frames per second using a laptop machine. We tested the proposed method with well-known challenging datasets such as Caltech, ETH, Daimler, and INRIA. 
The method proposed in this work consistently ranks among the top performers in all the datasets, being either the best method or having a small difference with the best one.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Marin_Random_Forests_of_2013_ICCV_paper.pdf", - "aff": "Computer Vision Center, Universitat Aut\u00f2noma de Barcelona; Computer Vision Center, Universitat Aut\u00f2noma de Barcelona; Computer Vision Center, Universitat Aut\u00f2noma de Barcelona; Computer Vision Center, Universitat Aut\u00f2noma de Barcelona; UMIC Research Centre, RWTH Aachen University", + "aff": "Computer Vision Center, Universitat Autònoma de Barcelona; Computer Vision Center, Universitat Autònoma de Barcelona; Computer Vision Center, Universitat Autònoma de Barcelona; Computer Vision Center, Universitat Autònoma de Barcelona; UMIC Research Centre, RWTH Aachen University", "project": "", "github": "", "supp": "", @@ -9948,14 +10283,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0;0;1", - "aff_unique_norm": "Universitat Aut\u00f2noma de Barcelona;RWTH Aachen University", + "aff_unique_norm": "Universitat Autònoma de Barcelona;RWTH Aachen University", "aff_unique_dep": "Computer Vision Center;UMIC Research Centre", "aff_unique_url": "https://www.uab.cat;https://www.rwth-aachen.de", "aff_unique_abbr": "UAB;RWTH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Aachen", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "Spain;Germany" + "aff_country_unique": "Spain;Germany", + "bibtex": "@InProceedings{Marin_2013_ICCV,\n \n author = {\n Marin,\n Javier and Vazquez,\n David and Lopez,\n Antonio M. 
and Amores,\n Jaume and Leibe,\n Bastian\n},\n title = {\n Random Forests of Local Experts for Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "653d1c6069", @@ -9980,13 +10316,14 @@ "status": "Poster", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", - "aff_unique_dep": "Google", + "aff_unique_dep": "", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aiger_2013_ICCV,\n \n author = {\n Aiger,\n Dror and Kokiopoulou,\n Efi and Rivlin,\n Ehud\n},\n title = {\n Random Grids: Fast Approximate Nearest Neighbors and Range Searching for Image Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e449abb0a6", @@ -10017,7 +10354,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bai_2013_ICCV,\n \n author = {\n Bai,\n Qinxun and Wu,\n Zheng and Sclaroff,\n Stan and Betke,\n Margrit and Monnier,\n Camille\n},\n title = {\n Randomized Ensemble Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cf0221b8e8", @@ -10048,7 +10386,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Cheng_2013_ICCV,\n \n author = {\n Cheng,\n Xin and Sridharan,\n Sridha 
and Saragih,\n Jason and Lucey,\n Simon\n},\n title = {\n Rank Minimization across Appearance and Shape for AAM Ensemble Fitting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2cac6683e8", @@ -10079,7 +10418,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Tang_2013_ICCV,\n \n author = {\n Tang,\n Danhang and Yu,\n Tsz-Ho and Kim,\n Tae-Kyun\n},\n title = {\n Real-Time Articulated Hand Pose Estimation Using Semi-supervised Transductive Regression Forests\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f5406adad3", @@ -10110,7 +10450,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Helten_2013_ICCV,\n \n author = {\n Helten,\n Thomas and Muller,\n Meinard and Seidel,\n Hans-Peter and Theobalt,\n Christian\n},\n title = {\n Real-Time Body Tracking with One Depth Camera and Inertial Sensors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "639ec855b0", @@ -10141,7 +10482,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Kukelova_2013_ICCV,\n \n author = {\n Kukelova,\n Zuzana and Bujnak,\n Martin and Pajdla,\n Tomas\n},\n title = {\n Real-Time Solution to the Absolute Pose Problem with Unknown Radial Distortion and Focal Length\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "572ad638dd", @@ -10163,7 +10505,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Jacquet_2013_ICCV,\n \n author = {\n Jacquet,\n Bastien and Hane,\n Christian and Koser,\n Kevin and Pollefeys,\n Marc\n},\n title = {\n Real-World Normal Map Capture for Nearly Flat Reflective Surfaces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "af51c28e4b", @@ -10189,12 +10532,13 @@ "aff_unique_index": "0;0+1;0;2;2", "aff_unique_norm": "Sun Yat-sen University;Guangdong Province Key Laboratory of Computational Science;Queen Mary University of London", "aff_unique_dep": "School of Mathematics and Computational Science;Computational Science;School of Electronic Engineering and Computer Science", - "aff_unique_url": "http://www.sysu.edu.cn;;https://www.qmul.ac.uk", + "aff_unique_url": "http://www.sysu.edu.cn/;;https://www.qmul.ac.uk", "aff_unique_abbr": "SYSU;;QMUL", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Guangzhou;London", "aff_country_unique_index": "0;0+0;0;1;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Hu_2013_ICCV,\n \n author = {\n Hu,\n Jian-Fang and Zheng,\n Wei-Shi and Lai,\n Jianhuang and Gong,\n Shaogang and Xiang,\n Tao\n},\n title = {\n Recognising Human-Object Interaction via Exemplar Based Modelling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c623c923e1", @@ -10222,10 +10566,11 @@ "aff_unique_dep": "School of Computing;Faculty of Computer Science and Information Technology", "aff_unique_url": 
"https://www.nus.edu.sg;https://www.um.edu.my", "aff_unique_abbr": "NUS;UM", - "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Singapore;", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "Singapore;Malaysia" + "aff_country_unique": "Singapore;Malaysia", + "bibtex": "@InProceedings{Phan_2013_ICCV,\n \n author = {\n Phan,\n Trung Quy and Shivakumara,\n Palaiahnakote and Tian,\n Shangxuan and Tan,\n Chew Lim\n},\n title = {\n Recognizing Text with Perspective Distortion in Natural Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dfb56d0e27", @@ -10247,14 +10592,15 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Chang_2013_ICCV,\n \n author = {\n Chang,\n Che-Han and Hu,\n Min-Chun and Cheng,\n Wen-Huang and Chuang,\n Yung-Yu\n},\n title = {\n Rectangling Stereographic Projection for Wide-Angle Image Visualization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "15f416241e", "title": "Recursive Estimation of the Stein Center of SPD Matrices and Its Applications", "site": "http://openaccess.thecvf.com/content_iccv_2013/html/Salehian_Recursive_Estimation_of_2013_ICCV_paper.html", "author": "Hesamoddin Salehian; Guang Cheng; Baba C. Vemuri; Jeffrey Ho", - "abstract": "Symmetric positive-definite (SPD) matrices are ubiquitous in Computer Vision, Machine Learning and Medical Image Analysis. Finding the center/average of a population of such matrices is a common theme in many algorithms such as clustering, segmentation, principal geodesic analysis, etc. 
The center of a population of such matrices can be defined using a variety of distance/divergence measures as the minimizer of the sum of squared distances/divergences from the unknown center to the members of the population. It is well known that the computation of the Karcher mean for the space of SPD matrices which is a negativelycurved Riemannian manifold is computationally expensive. Recently, the LogDet divergence-based center was shown to be a computationally attractive alternative. However,the LogDet-based mean of more than two matrices can not be computed in closed form, which makes it computationally less attractive for large populations. In this paper we present a novel recursive estimator for center based on the Stein distance \u00e2\u0080\u0093 which is the square root of the LogDet divergence \u00e2\u0080\u0093 that is significantly faster than the batch mode computation of this center. The key theoretical contribution is a closed-form solution for the weighted Stein center of two SPD matrices, which is used in the recursive computation of the Stein center for a population of SPD matrices. Additionally, we show experimental evidence of the convergence of our recursive Stein center estimator to the batch mode Stein center. We present applications of our recursive estimator to K-means clustering and image indexing depicting significant time gains over corresponding algorithms that use the batch mode computations. For the latter application, we develop novel hashing functions using the Stein distance and apply it to publicly available data sets, and experimental results have shown favorable comparisons to other competing methods.", + "abstract": "Symmetric positive-definite (SPD) matrices are ubiquitous in Computer Vision, Machine Learning and Medical Image Analysis. Finding the center/average of a population of such matrices is a common theme in many algorithms such as clustering, segmentation, principal geodesic analysis, etc. 
The center of a population of such matrices can be defined using a variety of distance/divergence measures as the minimizer of the sum of squared distances/divergences from the unknown center to the members of the population. It is well known that the computation of the Karcher mean for the space of SPD matrices which is a negativelycurved Riemannian manifold is computationally expensive. Recently, the LogDet divergence-based center was shown to be a computationally attractive alternative. However,the LogDet-based mean of more than two matrices can not be computed in closed form, which makes it computationally less attractive for large populations. In this paper we present a novel recursive estimator for center based on the Stein distance – which is the square root of the LogDet divergence – that is significantly faster than the batch mode computation of this center. The key theoretical contribution is a closed-form solution for the weighted Stein center of two SPD matrices, which is used in the recursive computation of the Stein center for a population of SPD matrices. Additionally, we show experimental evidence of the convergence of our recursive Stein center estimator to the batch mode Stein center. We present applications of our recursive estimator to K-means clustering and image indexing depicting significant time gains over corresponding algorithms that use the batch mode computations. 
For the latter application, we develop novel hashing functions using the Stein distance and apply it to publicly available data sets, and experimental results have shown favorable comparisons to other competing methods.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Salehian_Recursive_Estimation_of_2013_ICCV_paper.pdf", "aff": "Department of CISE, University of Florida; Department of CISE, University of Florida; Department of CISE, University of Florida; Department of CISE, University of Florida", "project": "", @@ -10278,7 +10624,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Salehian_2013_ICCV,\n \n author = {\n Salehian,\n Hesamoddin and Cheng,\n Guang and Vemuri,\n Baba C. and Ho,\n Jeffrey\n},\n title = {\n Recursive Estimation of the Stein Center of SPD Matrices and Its Applications\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2606ece2c1", @@ -10309,7 +10656,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Kiel", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Jordt-Sedlazeck_2013_ICCV,\n \n author = {\n Jordt-Sedlazeck,\n Anne and Koch,\n Reinhard\n},\n title = {\n Refractive Structure-from-Motion on Underwater Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7f6256d09b", @@ -10340,7 +10688,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Xiaoyu and 
Yang,\n Ming and Zhu,\n Shenghuo and Lin,\n Yuanqing\n},\n title = {\n Regionlets for Generic Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4cdeb99b3c", @@ -10365,13 +10714,14 @@ "status": "Poster", "aff_unique_index": "0;1;0", "aff_unique_norm": "IBM;ID Analytics", - "aff_unique_dep": "IBM;", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ibm.com/research/watson;", "aff_unique_abbr": "IBM;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "T. J. Watson;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fan_2013_ICCV,\n \n author = {\n Fan,\n Quanfu and Gabbur,\n Prasad and Pankanti,\n Sharath\n},\n title = {\n Relative Attributes for Large-Scale Abandoned Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "266b1c4fc4", @@ -10402,7 +10752,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Eigen_2013_ICCV,\n \n author = {\n Eigen,\n David and Krishnan,\n Dilip and Fergus,\n Rob\n},\n title = {\n Restoring an Image Taken through a Window Covered with Dirt or Rain\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f6a4dfda60", @@ -10433,7 +10784,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Aodha_2013_ICCV,\n \n author = {\n Mac Aodha,\n Oisin and 
Brostow,\n Gabriel J.\n},\n title = {\n Revisiting Example Dependent Cost-Sensitive Learning with Decision Trees\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "81937055df", @@ -10464,7 +10816,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "Japan;Sweden" + "aff_country_unique": "Japan;Sweden", + "bibtex": "@InProceedings{Zheng_2013_ICCV,\n \n author = {\n Zheng,\n Yinqiang and Kuang,\n Yubin and Sugimoto,\n Shigeki and Astrom,\n Kalle and Okutomi,\n Masatoshi\n},\n title = {\n Revisiting the PnP Problem: A Fast,\n General and Optimal Solution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "be40bc3695", @@ -10495,7 +10848,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2013_ICCV,\n \n author = {\n Chen,\n Zhuoyuan and Wu,\n Ying\n},\n title = {\n Robust Dictionary Learning by Error Source Decomposition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "892f78f61b", @@ -10519,14 +10873,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1", - "aff_unique_norm": "California Institute of Technology;Microsoft", - "aff_unique_dep": ";Microsoft Research", + "aff_unique_norm": "California Institute of Technology;Microsoft Research", + "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Caltech;MSR", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Pasadena;Redmond", 
"aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Burgos-Artizzu_2013_ICCV,\n \n author = {\n Burgos-Artizzu,\n Xavier P. and Perona,\n Pietro and Dollar,\n Piotr\n},\n title = {\n Robust Face Landmark Estimation under Occlusion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dbdf6969ec", @@ -10557,7 +10912,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Weng_2013_ICCV,\n \n author = {\n Weng,\n Renliang and Lu,\n Jiwen and Hu,\n Junlin and Yang,\n Gao and Tan,\n Yap-Peng\n},\n title = {\n Robust Feature Set Matching for Partial Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4b4d39f4ff", @@ -10566,7 +10922,7 @@ "author": "Deyu Meng; Fernando De La Torre", "abstract": "Many problems in computer vision can be posed as recovering a low-dimensional subspace from highdimensional visual data. Factorization approaches to lowrank subspace estimation minimize a loss function between an observed measurement matrix and a bilinear factorization. Most popular loss functions include the L 2 and L 1 losses. L 2 is optimal for Gaussian noise, while L 1 is for Laplacian distributed noise. However, real data is often corrupted by an unknown noise distribution, which is unlikely to be purely Gaussian or Laplacian. To address this problem, this paper proposes a low-rank matrix factorization problem with a Mixture of Gaussians (MoG) noise model. The MoG model is a universal approximator for any continuous distribution, and hence is able to model a wider range of noise distributions. 
The parameters of the MoG model can be estimated with a maximum likelihood method, while the subspace is computed with standard approaches. We illustrate the benefits of our approach in extensive synthetic and real-world experiments including structure from motion, face modeling and background subtraction.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Meng_Robust_Matrix_Factorization_2013_ICCV_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Carnegie Mellon University", + "aff": "Xi’an Jiaotong University; Carnegie Mellon University", "project": "", "github": "", "supp": "", @@ -10581,14 +10937,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Carnegie Mellon University", + "aff_unique_norm": "Xi'an Jiaotong University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.cmu.edu", "aff_unique_abbr": "XJTU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Meng_2013_ICCV,\n \n author = {\n Meng,\n Deyu and De La Torre,\n Fernando\n},\n title = {\n Robust Matrix Factorization with Unknown Noise\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "638945f37a", @@ -10619,7 +10976,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United Kingdom;Singapore" + "aff_country_unique": "United Kingdom;Singapore", + "bibtex": "@InProceedings{Lin_2013_ICCV,\n \n author = {\n Lin,\n Wen-Yan and Cheng,\n Ming-Ming and Zheng,\n Shuai and Lu,\n Jiangbo and Crook,\n Nigel\n},\n title = {\n Robust Non-parametric Data Fitting for Correspondence Modeling\n},\n booktitle = {\n Proceedings of 
the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "89abb0f2f4", @@ -10650,7 +11008,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xing_2013_ICCV,\n \n author = {\n Xing,\n Junliang and Gao,\n Jin and Li,\n Bing and Hu,\n Weiming and Yan,\n Shuicheng\n},\n title = {\n Robust Object Tracking with Online Multi-lifespan Dictionary Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b3456d3228", @@ -10681,7 +11040,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Yingya and Sun,\n Zhenan and He,\n Ran and Tan,\n Tieniu\n},\n title = {\n Robust Subspace Clustering via Half-Quadratic Minimization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2b451f177c", @@ -10712,7 +11072,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2013_ICCV,\n \n author = {\n Shi,\n Feng and Zhou,\n Zhong and Xiao,\n Jiangjian and Wu,\n Wei\n},\n title = {\n Robust Trajectory Clustering for Motion Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "330a96c2c7", @@ -10743,7 +11104,8 @@ "aff_campus_unique_index": "0;0", 
"aff_campus_unique": "Arlington", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Miao and Ding,\n Chris\n},\n title = {\n Robust Tucker Tensor Decomposition for Effective Image Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5a62c24d38", @@ -10752,7 +11114,7 @@ "author": "Olivier Saurer; Kevin Koser; Jean-Yves Bouguet; Marc Pollefeys", "abstract": "A huge fraction of cameras used nowadays is based on CMOS sensors with a rolling shutter that exposes the image line by line. For dynamic scenes/cameras this introduces undesired effects like stretch, shear and wobble. It has been shown earlier that rotational shake induced rolling shutter effects in hand-held cell phone capture can be compensated based on an estimate of the camera rotation. In contrast, we analyse the case of significant camera motion, e.g. where a bypassing streetlevel capture vehicle uses a rolling shutter camera in a 3D reconstruction framework. The introduced error is depth dependent and cannot be compensated based on camera motion/rotation alone, invalidating also rectification for stereo camera systems. On top, significant lens distortion as often present in wide angle cameras intertwines with rolling shutter effects as it changes the time at which a certain 3D point is seen. We show that naive 3D reconstructions (assuming global shutter) will deliver biased geometry already for very mild assumptions on vehicle speed and resolution. 
We then develop rolling shutter dense multiview stereo algorithms that solve for time of exposure and depth at the same time, even in the presence of lens distortion and perform an evaluation on ground truth laser scan models as well as on real street-level data.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Saurer_Rolling_Shutter_Stereo_2013_ICCV_paper.pdf", - "aff": "ETH Z \u00a8urich, Switzerland; GEOMAR Kiel, Germany + ETH Z \u00a8urich, Switzerland; Google, Inc., Mountain View, CA; ETH Z \u00a8urich, Switzerland", + "aff": "ETH Z ¨urich, Switzerland; GEOMAR Kiel, Germany + ETH Z ¨urich, Switzerland; Google, Inc., Mountain View, CA; ETH Z ¨urich, Switzerland", "project": "", "github": "", "supp": "", @@ -10767,14 +11129,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1+0;2;0", - "aff_unique_norm": "ETH Zurich;GEOMAR Helmholtz Centre for Ocean Research Kiel;Google", - "aff_unique_dep": ";;Google, Inc.", + "aff_unique_norm": "ETH Zürich;GEOMAR Helmholtz Centre for Ocean Research Kiel;Google, Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.geomar.de;https://www.google.com", "aff_unique_abbr": "ETHZ;GEOMAR;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Kiel;Mountain View", "aff_country_unique_index": "0;1+0;2;0", - "aff_country_unique": "Switzerland;Germany;United States" + "aff_country_unique": "Switzerland;Germany;United States", + "bibtex": "@InProceedings{Saurer_2013_ICCV,\n \n author = {\n Saurer,\n Olivier and Koser,\n Kevin and Bouguet,\n Jean-Yves and Pollefeys,\n Marc\n},\n title = {\n Rolling Shutter Stereo\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "eef4530b42", @@ -10805,7 +11168,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+0;0;1;0", - "aff_country_unique": "China;Australia" + 
"aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Liu_2013_ICCV,\n \n author = {\n Liu,\n Qiegen and Liu,\n Jianbo and Dong,\n Pei and Liang,\n Dong\n},\n title = {\n SGTD: Structure Gradient and Texture Decorrelating Regularization for Image Decomposition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f0346089b3", @@ -10836,7 +11200,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Gilinsky_2013_ICCV,\n \n author = {\n Gilinsky,\n Alexandra and Manor,\n Lihi Zelnik\n},\n title = {\n SIFTpack: A Compact Representation for Efficient SIFT Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bf5978f562", @@ -10867,7 +11232,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "United Kingdom;Australia" + "aff_country_unique": "United Kingdom;Australia", + "bibtex": "@InProceedings{Ren_2013_ICCV,\n \n author = {\n Ren,\n Carl Yuheng and Prisacariu,\n Victor and Murray,\n David and Reid,\n Ian\n},\n title = {\n STAR3D: Simultaneous Tracking and Reconstruction of 3D Objects Using RGB-D Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "67af9f91bf", @@ -10898,7 +11264,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiao_2013_ICCV,\n \n author = {\n Xiao,\n Jianxiong and Owens,\n Andrew and Torralba,\n Antonio\n},\n title = 
{\n SUN3D: A Database of Big Spaces Reconstructed Using SfM and Object Labels\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "23bb869c1a", @@ -10922,14 +11289,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;0;2;1;3", - "aff_unique_norm": "Tianjin University;Chinese Academy of Sciences;National University of Singapore;Sun Yat-sen University", + "aff_unique_norm": "Tianjin University;Chinese Academy of Sciences;National University of Singapore;Sun Yat-Sen University", "aff_unique_dep": "School of Computer Science and Technology;Institute of Information Engineering;Department of Electrical & Computer Engineering;School of Software", "aff_unique_url": "http://www.tju.edu.cn;http://www.cas.cn;https://www.nus.edu.sg;http://www.sysu.edu.cn", "aff_unique_abbr": "Tianjin University;CAS;NUS;SYSU", "aff_campus_unique_index": "0+1;0;1;3", "aff_campus_unique": "Tianjin;Beijing;;Guangzhou", "aff_country_unique_index": "0+0;0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Cao_2013_ICCV,\n \n author = {\n Cao,\n Xiaochun and Zhang,\n Hua and Liu,\n Si and Guo,\n Xiaojie and Lin,\n Liang\n},\n title = {\n SYM-FISH: A Symmetry-Aware Flip Invariant Sketch Histogram Shape Descriptor\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b221401ad8", @@ -10960,7 +11328,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Shtrom_2013_ICCV,\n \n author = {\n Shtrom,\n Elizabeth and Leifman,\n George and Tal,\n Ayellet\n},\n title = {\n Saliency Detection in Large Point Sets\n},\n booktitle = {\n Proceedings of the IEEE International 
Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cb3d5b900a", @@ -10991,7 +11360,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Jiang_2013_ICCV,\n \n author = {\n Jiang,\n Bowen and Zhang,\n Lihe and Lu,\n Huchuan and Yang,\n Chuan and Yang,\n Ming-Hsuan\n},\n title = {\n Saliency Detection via Absorbing Markov Chain\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9015879087", @@ -11022,7 +11392,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0;1;2", - "aff_country_unique": "China;Japan;United States" + "aff_country_unique": "China;Japan;United States", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Xiaohui and Lu,\n Huchuan and Zhang,\n Lihe and Ruan,\n Xiang and Yang,\n Ming-Hsuan\n},\n title = {\n Saliency Detection via Dense and Sparse Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7f32907869", @@ -11053,7 +11424,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Jianming and Sclaroff,\n Stan\n},\n title = {\n Saliency Detection: A Boolean Map Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cf94da749e", @@ -11084,7 +11456,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Riche_2013_ICCV,\n \n author = {\n Riche,\n Nicolas and Duvinage,\n Matthieu and Mancas,\n Matei and Gosselin,\n Bernard and Dutoit,\n Thierry\n},\n title = {\n Saliency and Human Fixations: State-of-the-Art and Study of Comparison Metrics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "1903b7555d", @@ -11115,7 +11488,8 @@ "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Jinan;Philadelphia;Newark", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Jiang_2013_ICCV,\n \n author = {\n Jiang,\n Peng and Ling,\n Haibin and Yu,\n Jingyi and Peng,\n Jingliang\n},\n title = {\n Salient Region Detection by UFO: Uniqueness,\n Focusness and Objectness\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "077f58df95", @@ -11139,14 +11513,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1", - "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", + "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MIT;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Isola_2013_ICCV,\n \n author = {\n Isola,\n Phillip and Liu,\n Ce\n},\n title = {\n Scene Collaging: Analysis and Synthesis of Natural Images with Semantic Layers\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ca3e7a6f7d", @@ -11177,7 +11552,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Neumann_2013_ICCV,\n \n author = {\n Neumann,\n Lukas and Matas,\n Jiri\n},\n title = {\n Scene Text Localization and Recognition with Oriented Stroke Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "22e15e07d6", @@ -11186,7 +11562,7 @@ "author": "Ramazan Gokberk Cinbis; Jakob Verbeek; Cordelia Schmid", "abstract": "We present an object detection system based on the Fisher vector (FV) image representation computed over SIFT and color descriptors. For computational and storage efficiency, we use a recent segmentation-based method to generate class-independent object detection hypotheses, in combination with data compression techniques. Our main contribution is a method to produce tentative object segmentation masks to suppress background clutter in the features. Re-weighting the local image features based on these masks is shown to improve object detection significantly. We also exploit contextual features in the form of a full-image FV descriptor, and an inter-category rescoring mechanism. 
Our experiments on the PASCAL VOC 2007 and 2010 datasets show that our detector improves over the current state-of-the-art detection results.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Cinbis_Segmentation_Driven_Object_2013_ICCV_paper.pdf", - "aff": "LEAR, INRIA Grenoble - Rh\u00f4ne-Alpes, France; Laboratoire Jean Kuntzmann; LEAR, INRIA Grenoble - Rh\u00f4ne-Alpes, France", + "aff": "LEAR, INRIA Grenoble - Rhône-Alpes, France; Laboratoire Jean Kuntzmann; LEAR, INRIA Grenoble - Rhône-Alpes, France", "project": "", "github": "", "supp": "", @@ -11201,14 +11577,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "INRIA Grenoble - Rh\u00f4ne-Alpes;Laboratoire Jean Kuntzmann", + "aff_unique_norm": "INRIA Grenoble - Rhône-Alpes;Laboratoire Jean Kuntzmann", "aff_unique_dep": "LEAR;", "aff_unique_url": "https://www.inria.fr/centre/grenoble;https://www.ljk.ij.cnrs.fr", "aff_unique_abbr": "INRIA;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Grenoble;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Cinbis_2013_ICCV,\n \n author = {\n Cinbis,\n Ramazan Gokberk and Verbeek,\n Jakob and Schmid,\n Cordelia\n},\n title = {\n Segmentation Driven Object Detection with Fisher Vectors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9da76395fa", @@ -11239,7 +11616,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Xia_2013_ICCV,\n \n author = {\n Xia,\n Wei and Domokos,\n Csaba and Dong,\n Jian and Cheong,\n Loong-Fah and Yan,\n Shuicheng\n},\n title = {\n Semantic Segmentation without Annotating Segments\n},\n booktitle = {\n Proceedings of the IEEE 
International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "eabcd6a92f", @@ -11270,7 +11648,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Shankar_2013_ICCV,\n \n author = {\n Shankar,\n Sukrit and Lasenby,\n Joan and Cipolla,\n Roberto\n},\n title = {\n Semantic Transform: Weakly Supervised Semantic Inference for Relating Visual Attributes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d49e15d3f3", @@ -11301,7 +11680,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Antonio;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Shiliang and Yang,\n Ming and Wang,\n Xiaoyu and Lin,\n Yuanqing and Tian,\n Qi\n},\n title = {\n Semantic-Aware Co-indexing for Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b31db3b3d4", @@ -11325,14 +11705,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2;1;3;4", - "aff_unique_norm": "Institute for Infocomm Research;Nanyang Technological University;University of Chinese Academy of Sciences;University of Technology Sydney;Microsoft", - "aff_unique_dep": ";;;;Microsoft Research Asia", + "aff_unique_norm": "Institute for Infocomm Research;Nanyang Technological University;University of Chinese Academy of Sciences;University of Technology, Sydney;Microsoft Research Asia", + "aff_unique_dep": ";;;;", "aff_unique_url": 
"https://www.i2r.a-star.edu.sg;https://www.ntu.edu.sg;http://www.ucas.ac.cn;https://www.uts.edu.au;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "I2R;NTU;UCAS;UTS;MSRA", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Beijing;Sydney", "aff_country_unique_index": "0;0;1;0;2;1", - "aff_country_unique": "Singapore;China;Australia" + "aff_country_unique": "Singapore;China;Australia", + "bibtex": "@InProceedings{Liu_2013_ICCV,\n \n author = {\n Liu,\n Huiying and Xu,\n Dong and Huang,\n Qingming and Li,\n Wen and Xu,\n Min and Lin,\n Stephen\n},\n title = {\n Semantically-Based Human Scanpath Estimation with HMMs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "761b80ac1f", @@ -11354,7 +11735,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Engel_2013_ICCV,\n \n author = {\n Engel,\n Jakob and Sturm,\n Jurgen and Cremers,\n Daniel\n},\n title = {\n Semi-dense Visual Odometry for a Monocular Camera\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "13299b88c3", @@ -11385,7 +11767,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Zhengxiang and Liu,\n Rujie\n},\n title = {\n Semi-supervised Learning for Large Scale Image Cosegmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bf22f29fab", @@ -11416,7 +11799,8 @@ "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Arlington;Sydney", "aff_country_unique_index": 
"0;0;1;0", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Hua and Nie,\n Feiping and Cai,\n Weidong and Huang,\n Heng\n},\n title = {\n Semi-supervised Robust Dictionary Learning via Efficient l-Norms Minimization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "fc8af2e21a", @@ -11447,7 +11831,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Fu_2013_ICCV,\n \n author = {\n Fu,\n Ying and Lam,\n Antony and Sato,\n Imari and Okabe,\n Takahiro and Sato,\n Yoichi\n},\n title = {\n Separating Reflective and Fluorescent Components Using High Frequency Illumination in the Spectral Domain\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0e0fb4900e", @@ -11456,7 +11841,7 @@ "author": "Evgeny Levinkov; Mario Fritz", "abstract": "Semantic road labeling is a key component of systems that aim at assisted or even autonomous driving. Considering that such systems continuously operate in the realworld, unforeseen conditions not represented in any conceivable training procedure are likely to occur on a regular basis. In order to equip systems with the ability to cope with such situations, we would like to enable adaptation to such new situations and conditions at runtime. Existing adaptive methods for image labeling either require labeled data from the new condition or even operate globally on a complete test set. None of this is a desirable mode of operation for a system as described above where new images arrive sequentially and conditions may vary. 
We study the effect of changing test conditions on scene labeling methods based on a new diverse street scene dataset. We propose a novel approach that can operate in such conditions and is based on a sequential Bayesian model update in order to robustly integrate the arriving images into the adapting procedure.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Levinkov_Sequential_Bayesian_Model_2013_ICCV_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarbr \u00a8ucken, Germany; Max Planck Institute for Informatics, Saarbr \u00a8ucken, Germany", + "aff": "Max Planck Institute for Informatics, Saarbr ¨ucken, Germany; Max Planck Institute for Informatics, Saarbr ¨ucken, Germany", "project": "", "github": "", "supp": "", @@ -11476,9 +11861,10 @@ "aff_unique_url": "https://mpi-inf.mpg.de", "aff_unique_abbr": "MPII", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "Saarbr\u00fccken", + "aff_campus_unique": "Saarbrücken", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Levinkov_2013_ICCV,\n \n author = {\n Levinkov,\n Evgeny and Fritz,\n Mario\n},\n title = {\n Sequential Bayesian Model Update under Structured Scene Prior for Semantic Road Scenes Labeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8615578333", @@ -11509,7 +11895,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Owens_2013_ICCV,\n \n author = {\n Owens,\n Andrew and Xiao,\n Jianxiong and Torralba,\n Antonio and Freeman,\n William\n},\n title = {\n Shape Anchors for Data-Driven Multi-view Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "77db5eb61f", @@ -11531,7 +11918,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Pedersen_2013_ICCV,\n \n author = {\n Pedersen,\n Kim Steenstrup and Stensbo-Smidt,\n Kristoffer and Zirm,\n Andrew and Igel,\n Christian\n},\n title = {\n Shape Index Descriptors Applied to Texture-Based Galaxy Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3e914bb376", @@ -11562,7 +11950,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;1", - "aff_country_unique": "Sweden;United States" + "aff_country_unique": "Sweden;United States", + "bibtex": "@InProceedings{Strandmark_2013_ICCV,\n \n author = {\n Strandmark,\n Petter and Ulen,\n Johannes and Kahl,\n Fredrik and Grady,\n Leo\n},\n title = {\n Shortest Paths with Curvature and Torsion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5724f85c66", @@ -11584,7 +11973,8 @@ "email": "", "author_num": 1, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Kokkinos_2013_ICCV,\n \n author = {\n Kokkinos,\n Iasonas\n},\n title = {\n Shufflets: Shared Mid-level Parts for Fast Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "15963057c8", @@ -11615,7 +12005,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Heng and Patras,\n 
Ioannis\n},\n title = {\n Sieving Regression Forest Votes for Facial Feature Detection in the Wild\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "72b2a8e5c2", @@ -11646,7 +12037,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Cao_2013_ICCV,\n \n author = {\n Cao,\n Qiong and Ying,\n Yiming and Li,\n Peng\n},\n title = {\n Similarity Metric Learning for Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "984397407b", @@ -11677,7 +12069,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Albany", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wu_2013_ICCV,\n \n author = {\n Wu,\n Baoyuan and Lyu,\n Siwei and Hu,\n Bao-Gang and Ji,\n Qiang\n},\n title = {\n Simultaneous Clustering and Tracklet Linking for Multi-face Tracking in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "49d26d204f", @@ -11708,7 +12101,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Wang_2013_ICCV,\n \n author = {\n Wang,\n Ruixuan and Trucco,\n Emanuele\n},\n title = {\n Single-Patch Low-Rank Prior for Non-pointwise Impulse Noise Removal\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { 
"id": "4206418cf9", @@ -11739,7 +12133,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Muller_2013_ICCV,\n \n author = {\n Muller,\n Oliver and Yang,\n Michael Ying and Rosenhahn,\n Bodo\n},\n title = {\n Slice Sampling Particle Belief Propagation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f5e45bf160", @@ -11763,14 +12158,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;1;0;2;1", - "aff_unique_norm": "CEA-LIST;Carnegie Mellon University;Mines-ParisTech", + "aff_unique_norm": "CEA-List;Carnegie Mellon University;Mines-ParisTech", "aff_unique_dep": ";;", "aff_unique_url": "https://www-list.cea.fr;https://www.cmu.edu;https://www.mines-paristech.fr", "aff_unique_abbr": "CEA-List;CMU;Mines-ParisTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;1", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Ballas_2013_ICCV,\n \n author = {\n Ballas,\n Nicolas and Yang,\n Yi and Lan,\n Zhen-Zhong and Delezoide,\n Bertrand and Preteux,\n Francoise and Hauptmann,\n Alexander\n},\n title = {\n Space-Time Robust Representation for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "bece1cb63a", @@ -11794,14 +12190,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Tel Aviv University;Interdisciplinary Center", + "aff_unique_norm": "Tel Aviv University;The Interdisciplinary Center", "aff_unique_dep": ";", "aff_unique_url": "https://www.tau.ac.il;https://www.idc.ac.il", "aff_unique_abbr": "TAU;IDC", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{(Basha)_2013_ICCV,\n \n author = {\n (Basha),\n Tali Dekel and Moses,\n Yael and Avidan,\n Shai\n},\n title = {\n Space-Time Tradeoffs in Photo Sequencing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "109f022f3b", @@ -11825,14 +12222,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;1", - "aff_unique_norm": "ETH Zurich;Hong Kong Polytechnic University", + "aff_unique_norm": "ETH Zurich;The Hong Kong Polytechnic University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.polyu.edu.hk", "aff_unique_abbr": "ETHZ;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Switzerland;China" + "aff_country_unique": "Switzerland;China", + "bibtex": "@InProceedings{Yang_2013_ICCV,\n \n author = {\n Yang,\n Meng and Van Gool,\n Luc and Zhang,\n Lei\n},\n title = {\n Sparse Variation Dictionary Learning for Face Recognition with a Single Training Sample per Person\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2c2f7061b4", @@ -11863,7 +12261,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sadovnik_2013_ICCV,\n \n author = {\n Sadovnik,\n Amir and Gallagher,\n Andrew and Parikh,\n Devi and Chen,\n Tsuhan\n},\n title = {\n Spoken Attributes: Mixing Binary and Relative Attributes to Say the Right Thing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on 
Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "75e2078242", @@ -11885,7 +12284,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Douze_2013_ICCV,\n \n author = {\n Douze,\n Matthijs and Revaud,\n Jerome and Schmid,\n Cordelia and Jegou,\n Herve\n},\n title = {\n Stable Hyper-pooling and Query Expansion for Event Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "891047ad4d", @@ -11916,7 +12316,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Berkeley;Portland", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chang_2013_ICCV,\n \n author = {\n Chang,\n Hang and Zhou,\n Yin and Spellman,\n Paul and Parvin,\n Bahram\n},\n title = {\n Stacked Predictive Sparse Coding for Classification of Distinct Regions in Tumor Histopathology\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3ea0cfb902", @@ -11941,13 +12342,14 @@ "status": "Poster", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", - "aff_unique_dep": "Google", + "aff_unique_dep": "", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Klingner_2013_ICCV,\n \n author = {\n Klingner,\n Bryan and Martin,\n David and Roseborough,\n James\n},\n title = {\n Street View Motion-from-Structure-from-Motion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n 
month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "df4b989cad", @@ -11969,7 +12371,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Pishchulin_2013_ICCV,\n \n author = {\n Pishchulin,\n Leonid and Andriluka,\n Mykhaylo and Gehler,\n Peter and Schiele,\n Bernt\n},\n title = {\n Strong Appearance and Expressive Spatial Models for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "da8d1e465c", @@ -11993,14 +12396,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Corporation", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dollar_2013_ICCV,\n \n author = {\n Dollar,\n Piotr and Zitnick,\n C. 
L.\n},\n title = {\n Structured Forests for Fast Edge Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4182cab8eb", @@ -12021,7 +12425,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Fix_2013_ICCV,\n \n author = {\n Fix,\n Alexander and Joachims,\n Thorsten and Park,\n Sung Min and Zabih,\n Ramin\n},\n title = {\n Structured Learning of Sum-of-Submodular Higher Order Energy Functions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ab0d5fe335", @@ -12052,7 +12457,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gupta_2013_ICCV,\n \n author = {\n Gupta,\n Mohit and Yin,\n Qi and Nayar,\n Shree K.\n},\n title = {\n Structured Light in Sunlight\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "0c4e9eb315", @@ -12083,7 +12489,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2013_ICCV,\n \n author = {\n Lee,\n Yong Jae and Efros,\n Alexei A. 
and Hebert,\n Martial\n},\n title = {\n Style-Aware Mid-level Representation for Discovering Visual Connections in Space and Time\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "aad2190bfa", @@ -12092,7 +12499,7 @@ "author": "Nicolas Martin; Vincent Couture; Sebastien Roy", "abstract": "We present a scanning method that recovers dense subpixel camera-projector correspondence without requiring any photometric calibration nor preliminary knowledge of their relative geometry. Subpixel accuracy is achieved by considering several zero-crossings defined by the difference between pairs of unstructured patterns. We use gray-level band-pass white noise patterns that increase robustness to indirect lighting and scene discontinuities. Simulated and experimental results show that our method recovers scene geometry with high subpixel precision, and that it can handle many challenges of active reconstruction systems. 
We compare our results to state of the art methods such as micro phase shifting and modulated phase shifting.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Martin_Subpixel_Scanning_Invariant_2013_ICCV_paper.pdf", - "aff": "Universit \u00b4e de Montr \u00b4eal; Universit \u00b4e de Montr \u00b4eal; Universit \u00b4e de Montr \u00b4eal", + "aff": "Universit ´e de Montr ´eal; Universit ´e de Montr ´eal; Universit ´e de Montr ´eal", "project": "", "github": "", "supp": "", @@ -12107,20 +12514,21 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al", + "aff_unique_norm": "Université de Montréal", "aff_unique_dep": "", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UdeM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Martin_2013_ICCV,\n \n author = {\n Martin,\n Nicolas and Couture,\n Vincent and Roy,\n Sebastien\n},\n title = {\n Subpixel Scanning Invariant to Indirect Lighting Using Quadratic Code Length\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d2f18303a9", "title": "Super-resolution via Transform-Invariant Group-Sparse Regularization", "site": "http://openaccess.thecvf.com/content_iccv_2013/html/Fernandez-Granda_Super-resolution_via_Transform-Invariant_2013_ICCV_paper.html", - "author": "Carlos Fernandez-Granda; Emmanuel J. Cand\u00c3\u00a8s", + "author": "Carlos Fernandez-Granda; Emmanuel J. Candès", "abstract": "We present a framework to super-resolve planar regions found in urban scenes and other man-made environments by taking into account their 3D geometry. 
Such regions have highly structured straight edges, but this prior is challenging to exploit due to deformations induced by the projection onto the imaging plane. Our method factors out such deformations by using recently developed tools based on convex optimization to learn a transform that maps the image to a domain where its gradient has a simple group-sparse structure. This allows to obtain a novel convex regularizer that enforces global consistency constraints between the edges of the image. Computational experiments with real images show that this data-driven approach to the design of regularizers promoting transform-invariant group sparsity is very effective at high super-resolution factors. We view our approach as complementary to most recent superresolution methods, which tend to focus on hallucinating high-frequency textures.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Fernandez-Granda_Super-resolution_via_Transform-Invariant_2013_ICCV_paper.pdf", "aff": "Stanford University; Stanford University", @@ -12145,7 +12553,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fernandez-Granda_2013_ICCV,\n \n author = {\n Fernandez-Granda,\n Carlos and Candès,\n Emmanuel J.\n},\n title = {\n Super-resolution via Transform-Invariant Group-Sparse Regularization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3fd7a4a6a5", @@ -12174,7 +12583,8 @@ "aff_unique_url": "https://www.nokia.com", "aff_unique_abbr": "Nokia", "aff_country_unique_index": "0", - "aff_country_unique": "Finland" + "aff_country_unique": "Finland", + "bibtex": "@InProceedings{Fan_2013_ICCV,\n \n author = {\n Fan,\n Lixin\n},\n title = {\n Supervised Binary Hash Code Learning with Jensen Shannon 
Divergence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c5b6a26769", @@ -12196,7 +12606,8 @@ "email": ";", "author_num": 2, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Guo_2013_ICCV,\n \n author = {\n Guo,\n Ruiqi and Hoiem,\n Derek\n},\n title = {\n Support Surface Prediction in Indoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4301773e53", @@ -12227,7 +12638,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United Kingdom;Russian Federation" + "aff_country_unique": "United Kingdom;Russia", + "bibtex": "@InProceedings{Chai_2013_ICCV,\n \n author = {\n Chai,\n Yuning and Lempitsky,\n Victor and Zisserman,\n Andrew\n},\n title = {\n Symbiotic Segmentation and Part Localization for Fine-Grained Categorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "96d841dac2", @@ -12249,7 +12661,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Steinberg_2013_ICCV,\n \n author = {\n Steinberg,\n Daniel M. 
and Pizarro,\n Oscar and Williams,\n Stefan B.\n},\n title = {\n Synergistic Clustering of Image and Segment Descriptors for Unsupervised Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d109fffe43", @@ -12280,7 +12693,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tsai_2013_ICCV,\n \n author = {\n Tsai,\n Pei-Hen and Chuang,\n Yung-Yu\n},\n title = {\n Target-Driven Moire Pattern Synthesis by Phase Modulation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "897de377ad", @@ -12302,7 +12716,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Reso_2013_ICCV,\n \n author = {\n Reso,\n Matthias and Jachalsky,\n Jorn and Rosenhahn,\n Bodo and Ostermann,\n Jorn\n},\n title = {\n Temporally Consistent Superpixels\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "45419d6272", @@ -12326,14 +12741,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;2;2;2", - "aff_unique_norm": "Shenzhen Institute of Advanced Technology;Chinese University of Hong Kong;Adobe", + "aff_unique_norm": "Shenzhen Institutes of Advanced Technology;The Chinese University of Hong Kong;Adobe", "aff_unique_dep": "Shenzhen Key Lab of Comp. Vis and Pat. 
Rec.;Department of Information Engineering;Adobe Research", "aff_unique_url": "http://www.siat.ac.cn;https://www.cuhk.edu.hk;https://research.adobe.com", "aff_unique_abbr": "SIAT;CUHK;Adobe", "aff_campus_unique_index": "0+1", "aff_campus_unique": "Shenzhen;Hong Kong SAR;", "aff_country_unique_index": "0+0;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2013_ICCV,\n \n author = {\n Huang,\n Weilin and Lin,\n Zhe and Yang,\n Jianchao and Wang,\n Jue\n},\n title = {\n Text Localization in Natural Images Using Stroke Feature Transform and Text Covariance Descriptors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b382218aaa", @@ -12364,7 +12780,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;1;0;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Gygli_2013_ICCV,\n \n author = {\n Gygli,\n Michael and Grabner,\n Helmut and Riemenschneider,\n Hayko and Nater,\n Fabian and Van Gool,\n Luc\n},\n title = {\n The Interestingness of Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d2a7c1173f", @@ -12395,7 +12812,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+0", - "aff_country_unique": "Romania;Sweden" + "aff_country_unique": "Romania;Sweden", + "bibtex": "@InProceedings{Zanfir_2013_ICCV,\n \n author = {\n Zanfir,\n Mihai and Leordeanu,\n Marius and Sminchisescu,\n Cristian\n},\n title = {\n The Moving Pose: An Efficient 3D Kinematics Descriptor for Low-Latency Action Recognition and Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer 
Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c7c9915176", @@ -12426,7 +12844,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dicle_2013_ICCV,\n \n author = {\n Dicle,\n Caglayan and Camps,\n Octavia I. and Sznaier,\n Mario\n},\n title = {\n The Way They Move: Tracking Multiple Targets with Similar Appearance\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "2fb9954a1e", @@ -12457,7 +12876,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Rennes;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;Greece" + "aff_country_unique": "France;Greece", + "bibtex": "@InProceedings{Tolias_2013_ICCV,\n \n author = {\n Tolias,\n Giorgos and Avrithis,\n Yannis and Jegou,\n Herve\n},\n title = {\n To Aggregate or Not to aggregate: Selective Match Kernels for Image Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "ed2800b020", @@ -12488,7 +12908,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chang_2013_ICCV,\n \n author = {\n Chang,\n Jason and Fisher,\n III,\n John W.\n},\n title = {\n Topology-Constrained Layered Tracking with Latent Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e27f65e5fc", @@ -12510,7 +12931,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": 
"@InProceedings{Lellmann_2013_ICCV,\n \n author = {\n Lellmann,\n Jan and Strekalovskiy,\n Evgeny and Koetter,\n Sabrina and Cremers,\n Daniel\n},\n title = {\n Total Variation Regularization for Functions with Values in a Manifold\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a52a4368a4", @@ -12541,7 +12963,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Yuqian and Mu,\n Cun and Kuo,\n Han-Wen and Wright,\n John\n},\n title = {\n Toward Guaranteed Illumination Models for Non-convex Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4fbf916c4b", @@ -12572,7 +12995,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tambe_2013_ICCV,\n \n author = {\n Tambe,\n Salil and Veeraraghavan,\n Ashok and Agrawal,\n Amit\n},\n title = {\n Towards Motion Aware Light Field Video for Dynamic Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dc4bdfb2bd", @@ -12593,7 +13017,8 @@ "email": ";;;;", "author_num": 5, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Jhuang_2013_ICCV,\n \n author = {\n Jhuang,\n Hueihan and Gall,\n Juergen and Zuffi,\n Silvia and Schmid,\n Cordelia and Black,\n Michael J.\n},\n title = {\n Towards Understanding Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International 
Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "aa8c903598", @@ -12624,7 +13049,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Song_2013_ICCV,\n \n author = {\n Song,\n Shuran and Xiao,\n Jianxiong\n},\n title = {\n Tracking Revisited Using RGBD Camera: Unified Benchmark and Baselines\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f965de6767", @@ -12648,14 +13074,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;1;1;0", - "aff_unique_norm": "University of Technology Sydney;Toyota Research Institute", + "aff_unique_norm": "University of Technology, Sydney;Toyota Research Institute", "aff_unique_dep": "Faculty of Engineering and Information Technology;", "aff_unique_url": "https://www.uts.edu.au;https://www.tri.global", "aff_unique_abbr": "UTS;TRI", "aff_campus_unique_index": "0+1;1;1;0", "aff_campus_unique": "Sydney;Ann Arbor", "aff_country_unique_index": "0+1;1;1;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Hong_2013_ICCV,\n \n author = {\n Hong,\n Zhibin and Mei,\n Xue and Prokhorov,\n Danil and Tao,\n Dacheng\n},\n title = {\n Tracking via Robust Multi-task Multi-view Joint Sparse Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "225d14fae9", @@ -12686,7 +13113,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Girshick_2013_ICCV,\n \n 
author = {\n Girshick,\n Ross and Malik,\n Jitendra\n},\n title = {\n Training Deformable Part Models with Decorrelated Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "665b5a6962", @@ -12717,7 +13145,8 @@ "aff_campus_unique_index": "0+0;0;0;0;1", "aff_campus_unique": "Beijing;Chicago", "aff_country_unique_index": "0+0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Long_2013_ICCV,\n \n author = {\n Long,\n Mingsheng and Wang,\n Jianmin and Ding,\n Guiguang and Sun,\n Jiaguang and Yu,\n Philip S.\n},\n title = {\n Transfer Feature Learning with Joint Distribution Adaptation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "7ecaf693a0", @@ -12739,7 +13168,8 @@ "email": ";;;;;", "author_num": 6, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Rohrbach_2013_ICCV,\n \n author = {\n Rohrbach,\n Marcus and Qiu,\n Wei and Titov,\n Ivan and Thater,\n Stefan and Pinkal,\n Manfred and Schiele,\n Bernt\n},\n title = {\n Translating Video Content to Natural Language Descriptions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c11e58c2b7", @@ -12761,7 +13191,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Stuhmer_2013_ICCV,\n \n author = {\n Stuhmer,\n Jan and Schroder,\n Peter and Cremers,\n Daniel\n},\n title = {\n Tree Shape Priors with Connectivity Constraints Using Convex Relaxation on General Graphs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n 
December\n},\n year = {\n 2013\n} \n}" }, { "id": "fca7e81362", @@ -12792,7 +13223,8 @@ "aff_campus_unique_index": "0;0;1;1", "aff_campus_unique": "Philadelphia;Suita", "aff_country_unique_index": "0;0;1;1", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Lombardi_2013_ICCV,\n \n author = {\n Lombardi,\n Stephen and Nishino,\n Ko and Makihara,\n Yasushi and Yagi,\n Yasushi\n},\n title = {\n Two-Point Gait: Decoupling Gait from Body Shape\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b3c5f785b1", @@ -12814,7 +13246,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Fang_2013_ICCV,\n \n author = {\n Fang,\n Chen and Xu,\n Ye and Rockmore,\n Daniel N.\n},\n title = {\n Unbiased Metric Learning: On the Utilization of Multiple Datasets and Web Images for Softening Bias\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "c342db4fa9", @@ -12823,7 +13256,7 @@ "author": "Sarah Parisot; William Wells III; Stephane Chemouny; Hugues Duffau; Nikos Paragios", "abstract": "Graph-based methods have become popular in recent years and have successfully addressed tasks like segmentation and deformable registration. Their main strength is optimality of the obtained solution while their main limitation is the lack of precision due to the grid-like representations and the discrete nature of the quantized search space. In this paper we introduce a novel approach for combined segmentation/registration of brain tumors that adapts graph and sampling resolution according to the image content. 
To this end we estimate the segmentation and registration marginals towards adaptive graph resolution and intelligent definition of the search space. This information is considered in a hierarchical framework where uncertainties are propagated in a natural manner. State of the art results in the joint segmentation/registration of brain images with low-grade gliomas demonstrate the potential of our approach.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Parisot_Uncertainty-Driven_Efficiently-Sampled_Sparse_2013_ICCV_paper.pdf", - "aff": "Center for Visual Computing, Ecole Centrale Paris, France+Equipe GALEN, INRIA Saclay-Ile-de-France, France; Surgical Planning Laboratory, Brigham and Women\u2019s hospital, Harvard Medical School, USA; Intrasense SAS, Montpellier, France; D\u00b4epartement de Neurochirurgie, Hopital Gui de Chauliac, CHU Montpellier, France; Center for Visual Computing, Ecole Centrale Paris, France+Equipe GALEN, INRIA Saclay-Ile-de-France, France", + "aff": "Center for Visual Computing, Ecole Centrale Paris, France+Equipe GALEN, INRIA Saclay-Ile-de-France, France; Surgical Planning Laboratory, Brigham and Women’s hospital, Harvard Medical School, USA; Intrasense SAS, Montpellier, France; Département de Neurochirurgie, Hopital Gui de Chauliac, CHU Montpellier, France; Center for Visual Computing, Ecole Centrale Paris, France+Equipe GALEN, INRIA Saclay-Ile-de-France, France", "project": "", "github": "", "supp": "", @@ -12839,13 +13272,14 @@ "status": "Poster", "aff_unique_index": "0+1;2;3;4;0+1", "aff_unique_norm": "Ecole Centrale Paris;INRIA;Harvard Medical School;Intrasense SAS;CHU Montpellier", - "aff_unique_dep": "Center for Visual Computing;Equipe GALEN;Surgical Planning Laboratory;;D\u00b4epartement de Neurochirurgie", + "aff_unique_dep": "Center for Visual Computing;Equipe GALEN;Surgical Planning Laboratory;;Département de Neurochirurgie", "aff_unique_url": 
"https://www.ecp.fr;https://www.inria.fr;https://hms.harvard.edu;;https://www.chumontpellier.fr", "aff_unique_abbr": ";INRIA;HMS;;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Saclay-Ile-de-France", "aff_country_unique_index": "0+0;1;0;0;0+0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Parisot_2013_ICCV,\n \n author = {\n Parisot,\n Sarah and Wells,\n III,\n William and Chemouny,\n Stephane and Duffau,\n Hugues and Paragios,\n Nikos\n},\n title = {\n Uncertainty-Driven Efficiently-Sampled Sparse Graphical Models for Concurrent Tumor Segmentation and Atlas Registration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e77aa59889", @@ -12854,7 +13288,7 @@ "author": "Hongyi Zhang; Andreas Geiger; Raquel Urtasun", "abstract": "In this paper, we are interested in understanding the semantics of outdoor scenes in the context of autonomous driving. Towards this goal, we propose a generative model of 3D urban scenes which is able to reason not only about the geometry and objects present in the scene, but also about the high-level semantics in the form of traffic patterns. We found that a small number of patterns is sufficient to model the vast majority of traffic scenes and show how these patterns can be learned. 
As evidenced by our experiments, this high-level reasoning significantly improves the overall scene estimation as well as the vehicle-to-lane association when compared to state-of-the-art approaches [10].", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Zhang_Understanding_High-Level_Semantics_2013_ICCV_paper.pdf", - "aff": "Peking University; MPI T\u00fcbingen; TTI Chicago", + "aff": "Peking University; MPI Tübingen; TTI Chicago", "project": "", "github": "", "supp": "", @@ -12874,9 +13308,10 @@ "aff_unique_url": "http://www.pku.edu.cn;https://www.cbs.mpg.de;https://www.tti-chicago.org", "aff_unique_abbr": "Peking U;MPI CBS;TTI", "aff_campus_unique_index": "1;2", - "aff_campus_unique": ";T\u00fcbingen;Chicago", + "aff_campus_unique": ";Tübingen;Chicago", "aff_country_unique_index": "0;1;2", - "aff_country_unique": "China;Germany;United States" + "aff_country_unique": "China;Germany;United States", + "bibtex": "@InProceedings{Zhang_2013_ICCV,\n \n author = {\n Zhang,\n Hongyi and Geiger,\n Andreas and Urtasun,\n Raquel\n},\n title = {\n Understanding High-Level Semantics by Modeling Traffic Patterns\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "59a52d075a", @@ -12885,7 +13320,7 @@ "author": "Ricardo Cabral; Fernando De La Torre; Joao P. Costeira; Alexandre Bernardino", "abstract": "Low rank models have been widely used for the representation of shape, appearance or motion in computer vision problems. Traditional approaches to fit low rank models make use of an explicit bilinear factorization. These approaches benefit from fast numerical methods for optimization and easy kernelization. However, they suffer from serious local minima problems depending on the loss function and the amount/type of missing data. 
Recently, these lowrank models have alternatively been formulated as convex problems using the nuclear norm regularizer; unlike factorization methods, their numerical solvers are slow and it is unclear how to kernelize them or to impose a rank a priori. This paper proposes a unified approach to bilinear factorization and nuclear norm regularization, that inherits the benefits of both. We analyze the conditions under which these approaches are equivalent. Moreover, based on this analysis, we propose a new optimization algorithm and a \"rank continuation\" strategy that outperform state-of-theart approaches for Robust PCA, Structure from Motion and Photometric Stereo with outliers and missing data.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Cabral_Unifying_Nuclear_Norm_2013_ICCV_paper.pdf", - "aff": "ISR - Instituto Superior T\u00e9cnico, Lisboa, Portugal+Carnegie Mellon University, Pittsburgh, PA, USA; Carnegie Mellon University, Pittsburgh, PA, USA; ISR - Instituto Superior T\u00e9cnico, Lisboa, Portugal; ISR - Instituto Superior T\u00e9cnico, Lisboa, Portugal", + "aff": "ISR - Instituto Superior Técnico, Lisboa, Portugal+Carnegie Mellon University, Pittsburgh, PA, USA; Carnegie Mellon University, Pittsburgh, PA, USA; ISR - Instituto Superior Técnico, Lisboa, Portugal; ISR - Instituto Superior Técnico, Lisboa, Portugal", "project": "", "github": "", "supp": "", @@ -12900,14 +13335,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0+1;1;0;0", - "aff_unique_norm": "Instituto Superior T\u00e9cnico;Carnegie Mellon University", + "aff_unique_norm": "Instituto Superior Técnico;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www IST.pt;https://www.cmu.edu", "aff_unique_abbr": "ISR;CMU", "aff_campus_unique_index": "0+1;1;0;0", "aff_campus_unique": "Lisboa;Pittsburgh", "aff_country_unique_index": "0+1;1;0;0", - "aff_country_unique": "Portugal;United States" + "aff_country_unique": "Portugal;United States", + 
"bibtex": "@InProceedings{Cabral_2013_ICCV,\n \n author = {\n Cabral,\n Ricardo and De La Torre,\n Fernando and Costeira,\n Joao P. and Bernardino,\n Alexandre\n},\n title = {\n Unifying Nuclear Norm and Bilinear Factorization Approaches for Low-Rank Matrix Decomposition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "5f714d4102", @@ -12938,7 +13374,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Canberra", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Baktashmotlagh_2013_ICCV,\n \n author = {\n Baktashmotlagh,\n Mahsa and Harandi,\n Mehrtash T. and Lovell,\n Brian C. and Salzmann,\n Mathieu\n},\n title = {\n Unsupervised Domain Adaptation by Domain Invariant Projection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "a5d2fd0b86", @@ -12947,7 +13384,7 @@ "author": "R. Melo; M. Antunes; J.P. Barreto; G. Falcao; N. Goncalves", "abstract": "Estimating the amount and center of distortion from lines in the scene has been addressed in the literature by the socalled \"plumb-line\" approach. In this paper we propose a new geometric method to estimate not only the distortion parameters but the entire camera calibration (up to an \"angular\" scale factor) using a minimum of 3 lines. We propose a new framework for the unsupervised simultaneous detection of natural image of lines and camera parameters estimation, enabling a robust calibration from a single image. 
Comparative experiments with existing automatic approaches for the distortion estimation and with ground truth data are presented.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Melo_Unsupervised_Intrinsic_Calibration_2013_ICCV_paper.pdf", - "aff": "Institute for Systems and Robotics, University of Coimbra; Institute for Systems and Robotics, University of Coimbra; Institute for Systems and Robotics, University of Coimbra; Instituto de Telecomunica\u00e7\u00f5es, University of Coimbra; Institute for Systems and Robotics, University of Coimbra", + "aff": "Institute for Systems and Robotics, University of Coimbra; Institute for Systems and Robotics, University of Coimbra; Institute for Systems and Robotics, University of Coimbra; Instituto de Telecomunicações, University of Coimbra; Institute for Systems and Robotics, University of Coimbra", "project": "", "github": "", "supp": "", @@ -12969,7 +13406,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Portugal" + "aff_country_unique": "Portugal", + "bibtex": "@InProceedings{Melo_2013_ICCV,\n \n author = {\n Melo,\n R. and Antunes,\n M. and Barreto,\n J.P. and Falcao,\n G. 
and Goncalves,\n N.\n},\n title = {\n Unsupervised Intrinsic Calibration from a Single Frame Using a \"Plumb-Line\" Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "4a5ee6f2bd", @@ -13000,7 +13438,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Pei_2013_ICCV,\n \n author = {\n Pei,\n Yuru and Kim,\n Tae-Kyun and Zha,\n Hongbin\n},\n title = {\n Unsupervised Random Forest Manifold Alignment for Lipreading\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "852e2aad40", @@ -13022,7 +13461,8 @@ "email": ";;;", "author_num": 4, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Fernando_2013_ICCV,\n \n author = {\n Fernando,\n Basura and Habrard,\n Amaury and Sebban,\n Marc and Tuytelaars,\n Tinne\n},\n title = {\n Unsupervised Visual Domain Adaptation Using Subspace Alignment\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "d9ec70c2c3", @@ -13053,7 +13493,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Suzhou", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Guo_2013_ICCV,\n \n author = {\n Guo,\n Jiaming and Li,\n Zhuwen and Cheong,\n Loong-Fah and Zhou,\n Steven Zhiying\n},\n title = {\n Video Co-segmentation for Meaningful Action Extraction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" 
}, { "id": "68c8d1451c", @@ -13084,7 +13525,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ramanathan_2013_ICCV,\n \n author = {\n Ramanathan,\n Vignesh and Liang,\n Percy and Fei-Fei,\n Li\n},\n title = {\n Video Event Understanding Using Natural Language Descriptions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "e7b747e508", @@ -13115,7 +13557,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ricco_2013_ICCV,\n \n author = {\n Ricco,\n Susanna and Tomasi,\n Carlo\n},\n title = {\n Video Motion for Every Visible Point\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "9c184849e7", @@ -13146,7 +13589,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Atlanta", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2013_ICCV,\n \n author = {\n Li,\n Fuxin and Kim,\n Taeyoung and Humayun,\n Ahmad and Tsai,\n David and Rehg,\n James M.\n},\n title = {\n Video Segmentation by Tracking Many Figure-Ground Segments\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "dc9f562bfe", @@ -13170,14 +13614,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Queen Mary, University of London;Chinese University of Hong Kong", + "aff_unique_norm": 
"Queen Mary, University of London;The Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.qmul.ac.uk;https://www.cuhk.edu.hk", "aff_unique_abbr": "QMUL;CUHK", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "London;Hong Kong SAR", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Zhu_2013_ICCV,\n \n author = {\n Zhu,\n Xiatian and Loy,\n Chen Change and Gong,\n Shaogang\n},\n title = {\n Video Synopsis by Heterogeneous Multi-source Correlation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "3e5bcbd5e8", @@ -13206,7 +13651,8 @@ "aff_unique_url": "https://www.openu.ac.il", "aff_unique_abbr": "OUI", "aff_country_unique_index": "0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Hassner_2013_ICCV,\n \n author = {\n Hassner,\n Tal\n},\n title = {\n Viewing Real-World Faces in 3D\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "697e82a5f8", @@ -13215,7 +13661,7 @@ "author": "Cheng Deng; Rongrong Ji; Wei Liu; Dacheng Tao; Xinbo Gao", "abstract": "Visual reranking has been widely deployed to refine the quality of conventional content-based image retrieval engines. The current trend lies in employing a crowd of retrieved results stemming from multiple feature modalities to boost the overall performance of visual reranking. However, a major challenge pertaining to current reranking methods is how to take full advantage of the complementary property of distinct feature modalities. 
Given a query image and one feature modality, a regular visual reranking framework treats the top-ranked images as pseudo positive instances which are inevitably noisy, difficult to reveal this complementary property, and thus lead to inferior ranking performance. This paper proposes a novel image reranking approach by introducing a Co-Regularized Multi-Graph Learning (Co-RMGL) framework, in which the intra-graph and inter-graph constraints are simultaneously imposed to encode affinities in a single graph and consistency across different graphs. Moreover, weakly supervised learning driven by image attributes is performed to denoise the pseudolabeled instances, thereby highlighting the unique strength of individual feature modality. Meanwhile, such learning can yield a few anchors in graphs that vitally enable the alignment and fusion of multiple graphs. As a result, an edge weight matrix learned from the fused graph automatically gives the ordering to the initially retrieved results. We evaluate our approach on four benchmark image retrieval datasets, demonstrating a significant performance gain over the state-of-the-arts.", "pdf": "http://openaccess.thecvf.com/content_iccv_2013/papers/Deng_Visual_Reranking_through_2013_ICCV_paper.pdf", - "aff": "Xidian University, Xi\u2019an, China; Xiamen University, Xiamen, China; IBM Watson Research Center, Armonk, NY, USA; University of Technology, Sydney, Australia; Xidian University, Xi\u2019an, China", + "aff": "Xidian University, Xi’an, China; Xiamen University, Xiamen, China; IBM Watson Research Center, Armonk, NY, USA; University of Technology, Sydney, Australia; Xidian University, Xi’an, China", "project": "", "github": "", "supp": "", @@ -13230,14 +13676,15 @@ "track": "main", "status": "Poster", "aff_unique_index": "0;1;2;3;0", - "aff_unique_norm": "Xidian University;Xiamen University;IBM;University of Technology Sydney", - "aff_unique_dep": ";;IBM Watson Research Center;", + "aff_unique_norm": "Xidian 
University;Xiamen University;IBM Watson Research Center;University of Technology Sydney", + "aff_unique_dep": ";;;", "aff_unique_url": "http://www.xidian.edu.cn;https://www.xmu.edu.cn;https://www.ibm.com/watson;https://www.uts.edu.au", "aff_unique_abbr": "Xidian;XMU;IBM Watson;UTS", "aff_campus_unique_index": "0;1;2;3;0", "aff_campus_unique": "Xi'an;Xiamen;Armonk;Sydney", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "China;United States;Australia" + "aff_country_unique": "China;United States;Australia", + "bibtex": "@InProceedings{Deng_2013_ICCV,\n \n author = {\n Deng,\n Cheng and Ji,\n Rongrong and Liu,\n Wei and Tao,\n Dacheng and Gao,\n Xinbo\n},\n title = {\n Visual Reranking through Weakly Supervised Multi-graph Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "53dbb14f06", @@ -13268,7 +13715,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "1;1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Qiu_2013_ICCV,\n \n author = {\n Qiu,\n Shi and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n Visual Semantic Complex Network for Web Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "361a4148dc", @@ -13295,11 +13743,12 @@ "aff_unique_norm": "University of California, Berkeley;Lawrence Berkeley National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.lbl.gov", - "aff_unique_abbr": "UC Berkeley;LBNL", + "aff_unique_abbr": "UC Berkeley;LBL", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Barron_2013_ICCV,\n \n author = {\n Barron,\n Jonathan T. and Biggin,\n Mark D. and Arbelaez,\n Pablo and Knowles,\n David W. and Keranen,\n Soile V.E. and Malik,\n Jitendra\n},\n title = {\n Volumetric Semantic Segmentation Using Pyramid Context Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "f108e7cb2d", @@ -13330,7 +13779,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Straehle_2013_ICCV,\n \n author = {\n Straehle,\n Christoph and Koethe,\n Ullrich and Hamprecht,\n Fred A.\n},\n title = {\n Weakly Supervised Learning of Image Partitioning Using Decision Trees with Structured Split Criteria\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b994b4147f", @@ -13361,7 +13811,8 @@ "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "MA", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shao_2013_ICCV,\n \n author = {\n Shao,\n Ming and Li,\n Liangyue and Fu,\n Yun\n},\n title = {\n What Do You Do? 
Occupation Recognition in a Photo via Social Context\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "b504e877e6", @@ -13383,7 +13834,8 @@ "email": ";;", "author_num": 3, "track": "main", - "status": "Poster" + "status": "Poster", + "bibtex": "@InProceedings{Iwamura_2013_ICCV,\n \n author = {\n Iwamura,\n Masakazu and Sato,\n Tomokazu and Kise,\n Koichi\n},\n title = {\n What is the Most Efficient Way to Select Nearest Neighbor Candidates for Fast Approximate Nearest Neighbor Search?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "8f350151da", @@ -13414,7 +13866,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "New Brunswick", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Elhoseiny_2013_ICCV,\n \n author = {\n Elhoseiny,\n Mohamed and Saleh,\n Babak and Elgammal,\n Ahmed\n},\n title = {\n Write a Classifier: Zero-Shot Learning Using Purely Textual Descriptions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" }, { "id": "cda6fe19d7", @@ -13445,6 +13898,7 @@ "aff_campus_unique_index": "0;1;1;1;1;0;2", "aff_campus_unique": "Berkeley;Austin;Lowell", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Guadarrama_2013_ICCV,\n \n author = {\n Guadarrama,\n Sergio and Krishnamoorthy,\n Niveda and Malkarnenkar,\n Girish and Venugopalan,\n Subhashini and Mooney,\n Raymond and Darrell,\n Trevor and Saenko,\n Kate\n},\n title = {\n YouTube2Text: Recognizing and Describing Arbitrary Activities Using Semantic Hierarchies and Zero-Shot 
Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2013\n} \n}" } ] \ No newline at end of file diff --git a/iccv/iccv2015.json b/iccv/iccv2015.json index edc5f12..c8dfb52 100644 --- a/iccv/iccv2015.json +++ b/iccv/iccv2015.json @@ -31,7 +31,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Kang and Yu,\n Wuyi and Manhein,\n Mary and Waggenspack,\n Warren and Li,\n Xin\n},\n title = {\n 3D Fragment Reassembly Using Integrated Template Guidance and Fracture-Region Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ca90ddff68", @@ -65,7 +66,8 @@ "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Philadelphia;Hangzhou", "aff_country_unique_index": "0+1;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Peiyi and Ling,\n Haibin and Li,\n Xi and Liao,\n Chunyuan\n},\n title = {\n 3D Hand Pose Estimation Using Randomized Decision Forest With Segmentation Index Points\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "680fcfa26e", @@ -78,7 +80,7 @@ "author": "Dimitrios Tzionas; Juergen Gall", "abstract": "Recent advances have enabled 3d object reconstruction approaches using a single off-the-shelf RGB-D camera. Although these approaches are successful for a wide range of object classes, they rely on stable and distinctive geometric or texture features. 
Many objects like mechanical parts, toys, household or decorative articles, however, are textureless and characterized by minimalistic shapes that are simple and symmetric. Existing in-hand scanning systems and 3d reconstruction techniques fail for such symmetric objects in the absence of highly distinctive features. In this work, we show that extracting 3d hand motion for in-hand scanning effectively facilitates the reconstruction of even featureless and highly symmetric objects and we present an approach that fuses the rich additional information of hands into a 3d reconstruction pipeline, significantly contributing to the state-of-the-art of in-hand scanning.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Tzionas_3D_Object_Reconstruction_ICCV_2015_paper.pdf", - "aff": "University of Bonn, Bonn, Germany + MPI for Intelligent Systems, T\u00fcbingen, Germany; University of Bonn, Bonn, Germany", + "aff": "University of Bonn, Bonn, Germany + MPI for Intelligent Systems, Tübingen, Germany; University of Bonn, Bonn, Germany", "project": "http://skanect.occipital.com/", "github": "", "supp": "", @@ -97,9 +99,10 @@ "aff_unique_url": "https://www.uni-bonn.de;https://www.mpituebingen.mpg.de", "aff_unique_abbr": "UBonn;MPI-IS", "aff_campus_unique_index": "0+1;0", - "aff_campus_unique": "Bonn;T\u00fcbingen", + "aff_campus_unique": "Bonn;Tübingen", "aff_country_unique_index": "0+0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Tzionas_2015_ICCV,\n \n author = {\n Tzionas,\n Dimitrios and Gall,\n Juergen\n},\n title = {\n 3D Object Reconstruction From Hand-Object Interactions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "80d68768f9", @@ -108,7 +111,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Matea \u00d0onli\u0107, Tomislav Petkovi\u0107, Tomislav Pribani\u0107", 
+ "author_site": "Matea Ðonlić, Tomislav Petković, Tomislav Pribanić", "author": "Matea Donlic; Tomislav Petkovic; Tomislav Pribanic", "abstract": "A novel structured light method for color 3D surface profilometry is proposed. The proposed method does not require color calibration of a camera-projector pair and may be used for reconstruction of both dynamic and static scenes. The method uses a structured light pattern that is a combination of a De Bruijn color sequence and of a sinusoidal fringe. For dynamic scenes a Hessian ridge detector and a Gaussian mixture model are combined to extract stripe centers and to identify color. Stripes are then uniquely identified using dynamic programming based on the Smith-Waterman algorithm and a De Bruijn window property. For static scenes phase-shifting and De Bruijn window property are combined to obtain a high accuracy reconstruction. We have tested the proposed method on multiple objects with challenging surfaces and different albedos that demonstrate usability and robustness of the method.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Donlic_3D_Surface_Profilometry_ICCV_2015_paper.pdf", @@ -128,12 +131,13 @@ "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Zagreb", "aff_unique_dep": "Faculty of Electrical Engineering and Computing", - "aff_unique_url": "https://www.feezagreb.unizg.hr", + "aff_unique_url": "https://www.feezagreb.unizg.hr/", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Croatia" + "aff_country_unique": "Croatia", + "bibtex": "@InProceedings{Donlic_2015_ICCV,\n \n author = {\n Donlic,\n Matea and Petkovic,\n Tomislav and Pribanic,\n Tomislav\n},\n title = {\n 3D Surface Profilometry Using Phase Shifting of De Bruijn Pattern\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { 
"id": "c8ac619d1a", @@ -161,13 +165,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Martin-Brualla_3D_Time-Lapse_Reconstruction_ICCV_2015_paper.html", "aff_unique_index": "0;1;0+1", "aff_unique_norm": "University of Washington;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.washington.edu;https://www.google.com", "aff_unique_abbr": "UW;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Martin-Brualla_2015_ICCV,\n \n author = {\n Martin-Brualla,\n Ricardo and Gallup,\n David and Seitz,\n Steven M.\n},\n title = {\n 3D Time-Lapse Reconstruction From Internet Photos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c4ab758772", @@ -192,7 +197,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Su_3D-Assisted_Feature_Synthesis_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Su_3D-Assisted_Feature_Synthesis_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Su_2015_ICCV,\n \n author = {\n Su,\n Hao and Wang,\n Fan and Yi,\n Eric and Guibas,\n Leonidas J.\n},\n title = {\n 3D-Assisted Feature Synthesis for Novel Views of an Object\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "60887d3072", @@ -217,7 +223,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Choi_A_Collaborative_Filtering_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Choi_A_Collaborative_Filtering_ICCV_2015_paper.html", + "bibtex": 
"@InProceedings{Choi_2015_ICCV,\n \n author = {\n Choi,\n Chiho and Sinha,\n Ayan and Choi,\n Joon Hee and Jang,\n Sujin and Ramani,\n Karthik\n},\n title = {\n A Collaborative Filtering Approach to Real-Time Hand Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "962dfe1b47", @@ -251,7 +258,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Beigpour_2015_ICCV,\n \n author = {\n Beigpour,\n Shida and Kolb,\n Andreas and Kunz,\n Sven\n},\n title = {\n A Comprehensive Multi-Illuminant Dataset for Benchmarking of the Intrinsic Image Algorithms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7f1888d76d", @@ -285,7 +293,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Jia and Xia,\n Changqun and Song,\n Yafei and Fang,\n Shu and Chen,\n Xiaowu\n},\n title = {\n A Data-Driven Metric for Comprehensive Evaluation of Saliency Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9f4aff673e", @@ -298,7 +307,7 @@ "author": "Zhuoyuan Chen; Xun Sun; Liang Wang; Yinan Yu; Chang Huang", "abstract": "This paper presents a data-driven matching cost for stereo matching. A novel deep visual correspondence embedding model is trained via Convolutional Neural Network on a large set of stereo images with ground truth disparities. 
This deep embedding model leverages appearance data to learn visual similarity relationships between corresponding image patches, and explicitly maps intensity values into an embedding feature space to measure pixel dissimilarities. Experimental results on KITTI and Middlebury data sets demonstrate the effectiveness of our model. First, we prove that the new measure of pixel dissimilarity outperforms traditional matching costs. Furthermore, when integrated with a global stereo framework, our method ranks top 3 among all two-frame algorithms on the KITTI benchmark. Finally, cross-validation results show that our model is able to make correct predictions for unseen data which are outside of its labeled training set.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Chen_A_Deep_Visual_ICCV_2015_paper.pdf", - "aff": "Baidu Research \u2013 Institute of Deep Learning; Baidu Research \u2013 Institute of Deep Learning; Baidu Research \u2013 Institute of Deep Learning; Horizon Robotics; Horizon Robotics", + "aff": "Baidu Research – Institute of Deep Learning; Baidu Research – Institute of Deep Learning; Baidu Research – Institute of Deep Learning; Horizon Robotics; Horizon Robotics", "project": "", "github": "", "supp": "", @@ -312,14 +321,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_A_Deep_Visual_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;1", - "aff_unique_norm": "Baidu;Horizon Robotics", + "aff_unique_norm": "Baidu Research;Horizon Robotics", "aff_unique_dep": "Institute of Deep Learning;", "aff_unique_url": "https://research.baidu.com;https://www.horizon-robotics.com/", "aff_unique_abbr": "Baidu;Horizon Robotics", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Zhuoyuan and Sun,\n Xun and Wang,\n Liang and Yu,\n Yinan and 
Huang,\n Chang\n},\n title = {\n A Deep Visual Correspondence Embedding Model for Stereo Matching Costs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "27c64c8ae0", @@ -353,7 +363,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "Belgium;Switzerland" + "aff_country_unique": "Belgium;Switzerland", + "bibtex": "@InProceedings{Georgoulis_2015_ICCV,\n \n author = {\n Georgoulis,\n Stamatios and Vanweddingen,\n Vincent and Proesmans,\n Marc and Van Gool,\n Luc\n},\n title = {\n A Gaussian Process Latent Variable Model for BRDF Inference\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3647a0d516", @@ -366,7 +377,7 @@ "author": "Timo Bolkart; Stefanie Wuhrer", "abstract": "Multilinear face models are widely used to model the space of human faces with expressions. For databases of 3D human faces of different identities performing multiple expressions, these statistical shape models decouple identity and expression variations. To compute a high-quality multilinear face model, the quality of the registration of the database of 3D face scans used for training is essential. Meanwhile, a multilinear face model can be used as an effective prior to register 3D face scans, which are typically noisy and incomplete. Inspired by the minimum description length approach, we propose the first method to jointly optimize a multilinear model and the registration of the 3D scans used for training. Given an initial registration, our approach fully automatically improves the registration by optimizing an objective function that measures the compactness of the multilinear model, resulting in a sparse model. 
We choose a continuous representation for each face shape that allows to use a quasi-Newton method in parameter space for optimization. We show that our approach is computationally significantly more efficient and leads to correspondences of higher quality than existing methods based on linear statistical models. This allows us to evaluate our approach on large standard 3D face databases and in the presence of noisy initializations.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Bolkart_A_Groupwise_Multilinear_ICCV_2015_paper.pdf", - "aff": "Saarland University, Germany; Inria Rh\u00f4ne-Alpes, France", + "aff": "Saarland University, Germany; Inria Rhône-Alpes, France", "project": "", "github": "", "supp": "", @@ -380,14 +391,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bolkart_A_Groupwise_Multilinear_ICCV_2015_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Saarland University;INRIA", + "aff_unique_norm": "Saarland University;Inria", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-saarland.de;https://www.inria.fr", "aff_unique_abbr": "UdS;Inria", "aff_campus_unique_index": "1", - "aff_campus_unique": ";Rh\u00f4ne-Alpes", + "aff_campus_unique": ";Rhône-Alpes", "aff_country_unique_index": "0;1", - "aff_country_unique": "Germany;France" + "aff_country_unique": "Germany;France", + "bibtex": "@InProceedings{Bolkart_2015_ICCV,\n \n author = {\n Bolkart,\n Timo and Wuhrer,\n Stefanie\n},\n title = {\n A Groupwise Multilinear Correspondence Optimization for 3D Faces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "27f2ed0d69", @@ -421,7 +433,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Kyoto;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Nishimura_2015_ICCV,\n \n author = 
{\n Nishimura,\n Mai and Nobuhara,\n Shohei and Matsuyama,\n Takashi and Shimizu,\n Shinya and Fujii,\n Kensaku\n},\n title = {\n A Linear Generalized Camera Calibration From Three Intersecting Reference Planes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0cc3e03870", @@ -434,7 +447,7 @@ "author": "Junchi Yan; Hongteng Xu; Hongyuan Zha; Xiaokang Yang; Huanxi Liu; Stephen Chu", "abstract": "Graph matching has a wide spectrum of real-world applications and in general is known NP-hard. In many vision tasks, one realistic problem arises for finding the global node mappings across a batch of corrupted weighted graphs. This paper is an attempt to connect graph matching, especially multi-graph matching to the matrix decomposition model and its relevant on-the-shelf convex optimization algorithms. Our method aims to extract the common inliers and their synchronized permutations from disordered weighted graphs in the presence of deformation and outliers. Under the proposed framework, several variants can be derived in the hope of accommodating to other types of noises. 
Experimental results on both synthetic data and real images empirically show that the proposed paradigm exhibits several interesting behaviors and in many cases performs competitively with the state-of-the-arts.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Yan_A_Matrix_Decomposition_ICCV_2015_paper.pdf", - "aff": "Shanghai Jiao Tong University+IBM Research \u2013 China; College of Computing+School of Electronic Engineering, Georgia Institute of Technology; Software Engineering Institute, East China Normal University+College of Computing, Georgia Institute of Technology; Shanghai Jiao Tong University; Shanghai Jiao Tong University; IBM Research \u2013 China", + "aff": "Shanghai Jiao Tong University+IBM Research – China; College of Computing+School of Electronic Engineering, Georgia Institute of Technology; Software Engineering Institute, East China Normal University+College of Computing, Georgia Institute of Technology; Shanghai Jiao Tong University; Shanghai Jiao Tong University; IBM Research – China", "project": "", "github": "", "supp": "", @@ -448,14 +461,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yan_A_Matrix_Decomposition_ICCV_2015_paper.html", "aff_unique_index": "0+1;2+3;4+3;0;0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;IBM;College of Computing;Georgia Institute of Technology;East China Normal University", + "aff_unique_norm": "Shanghai Jiao Tong University;IBM Research;College of Computing;Georgia Institute of Technology;East China Normal University", "aff_unique_dep": ";China;;School of Electronic Engineering;Software Engineering Institute", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ibm.com/research/global/china;;https://www.gatech.edu;http://www.ecnu.edu.cn", "aff_unique_abbr": "SJTU;IBM;;Georgia Tech;ECNU", "aff_campus_unique_index": ";1;1", "aff_campus_unique": ";Atlanta", "aff_country_unique_index": "0+0;2;0+2;0;0;0", - "aff_country_unique": "China;;United States" + 
"aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Yan_2015_ICCV,\n \n author = {\n Yan,\n Junchi and Xu,\n Hongteng and Zha,\n Hongyuan and Yang,\n Xiaokang and Liu,\n Huanxi and Chu,\n Stephen\n},\n title = {\n A Matrix Decomposition Perspective to Multiple Graph Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "dee01b7d79", @@ -489,7 +503,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Meir_2015_ICCV,\n \n author = {\n Meir,\n Omer and Galun,\n Meirav and Yagev,\n Stav and Basri,\n Ronen and Yavneh,\n Irad\n},\n title = {\n A Multiscale Variable-Grouping Framework for MRF Energy Minimization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3c32b46339", @@ -523,7 +538,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Cyprus;Greece" + "aff_country_unique": "Cyprus;Greece", + "bibtex": "@InProceedings{Chatzis_2015_ICCV,\n \n author = {\n Chatzis,\n Sotirios P. and Kosmopoulos,\n Dimitrios\n},\n title = {\n A Nonparametric Bayesian Approach Toward Stacked Convolutional Independent Component Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a13c6ba702", @@ -536,7 +552,7 @@ "author": "Alberto Crivellaro; Mahdi Rad; Yannick Verdie; Kwang Moo Yi; Pascal Fua; Vincent Lepetit", "abstract": "We present a method that estimates in real-time and under challenging conditions the 3D pose of a known object. 
Our method relies only on grayscale images since depth cameras fail on metallic objects; it can handle poorly textured objects, and cluttered, changing environments; the pose it predicts degrades gracefully in presence of large occlusions. As a result, by contrast with the state-of-the-art, our method is suitable for practical Augmented Reality applications even in industrial environments. To be robust to occlusions, we first learn to detect some parts of the target object. Our key idea is to then predict the 3D pose of each part in the form of the 2D projections of a few control points. The advantages of this representation is three-fold: We can predict the 3D pose of the object even when only one part is visible; when several parts are visible, we can combine them easily to compute a better pose of the object; the 3D pose we obtain is usually very accurate, even when only few parts are visible.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Crivellaro_A_Novel_Representation_ICCV_2015_paper.pdf", - "aff": "Computer Vision Laboratory, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Institute for Computer Graphics and Vision, Graz University of Technology, Austria; Computer Vision Laboratory, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Computer Vision Laboratory, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Computer Vision Laboratory, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Institute for Computer Graphics and Vision, Graz University of Technology, Austria", + "aff": "Computer Vision Laboratory, École Polytechnique Fédérale de Lausanne (EPFL), Switzerland; Institute for Computer Graphics and Vision, Graz University of Technology, Austria; Computer Vision Laboratory, École Polytechnique Fédérale de Lausanne (EPFL), Switzerland; Computer Vision Laboratory, École Polytechnique Fédérale de Lausanne (EPFL), Switzerland; 
Computer Vision Laboratory, École Polytechnique Fédérale de Lausanne (EPFL), Switzerland; Institute for Computer Graphics and Vision, Graz University of Technology, Austria", "project": "", "github": "", "supp": "", @@ -550,14 +566,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Crivellaro_A_Novel_Representation_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;0;0;1", - "aff_unique_norm": "EPFL;Graz University of Technology", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;Graz University of Technology", "aff_unique_dep": "Computer Vision Laboratory;Institute for Computer Graphics and Vision", "aff_unique_url": "https://www.epfl.ch;https://www.tugraz.at", "aff_unique_abbr": "EPFL;TU Graz", "aff_campus_unique_index": "0;1;0;0;0;1", "aff_campus_unique": "Lausanne;Graz", "aff_country_unique_index": "0;1;0;0;0;1", - "aff_country_unique": "Switzerland;Austria" + "aff_country_unique": "Switzerland;Austria", + "bibtex": "@InProceedings{Crivellaro_2015_ICCV,\n \n author = {\n Crivellaro,\n Alberto and Rad,\n Mahdi and Verdie,\n Yannick and Yi,\n Kwang Moo and Fua,\n Pascal and Lepetit,\n Vincent\n},\n title = {\n A Novel Representation of Parts for Accurate 3D Object Detection and Tracking in Monocular Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5eb37bdf31", @@ -570,7 +587,7 @@ "author": "Qian Zhao; Deyu Meng; Xu Kong; Qi Xie; Wenfei Cao; Yao Wang; Zongben Xu", "abstract": "In this paper, we propose a new sparsity regularizer for measuring the low-rank structure underneath a tensor. The proposed sparsity measure has a natural physical meaning which is intrinsically the size of the fundamental Kronecker basis to express the tensor. By embedding the sparsity measure into the tensor completion and tensor robust PCA frameworks, we formulate new models to enhance their capability in tensor recovery. 
Through introducing relaxation forms of the proposed sparsity measure, we also adopt the alternating direction method of multipliers (ADMM) for solving the proposed models. Experiments implemented on synthetic and multispectral image data sets substantiate the effectiveness of the proposed methods.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhao_A_Novel_Sparsity_ICCV_2015_paper.pdf", - "aff": "School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi\u2019an Jiaotong University; School of Mathematical Sciences, Liaocheng University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi\u2019an Jiaotong University", + "aff": "School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi’an Jiaotong University; School of Mathematical Sciences, Liaocheng University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi’an Jiaotong University", "project": "", "github": "", "supp": "", @@ -584,14 +601,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhao_A_Novel_Sparsity_ICCV_2015_paper.html", "aff_unique_index": 
"0;0+0;1;0;0;0;0+0", - "aff_unique_norm": "Xi'an Jiao Tong University;Liaocheng University", + "aff_unique_norm": "Xi'an Jiaotong University;Liaocheng University", "aff_unique_dep": "School of Mathematics and Statistics;School of Mathematical Sciences", "aff_unique_url": "http://en.xjtu.edu.cn/;http://www.lctu.edu.cn/", "aff_unique_abbr": "XJTU;", "aff_campus_unique_index": "0;0+0;0;0;0;0+0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2015_ICCV,\n \n author = {\n Zhao,\n Qian and Meng,\n Deyu and Kong,\n Xu and Xie,\n Qi and Cao,\n Wenfei and Wang,\n Yao and Xu,\n Zongben\n},\n title = {\n A Novel Sparsity Measure for Tensor Recovery\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "936c5d703d", @@ -625,7 +643,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0+0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hwang_2015_ICCV,\n \n author = {\n Hwang,\n Seong Jae and Collins,\n Maxwell D. and Ravi,\n Sathya N. and Ithapu,\n Vamsi K. and Adluru,\n Nagesh and Johnson,\n Sterling C. 
and Singh,\n Vikas\n},\n title = {\n A Projection Free Method for Generalized Eigenvalue Problem With a Nonsmooth Regularizer\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ee7d027bae", @@ -650,7 +669,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_A_Randomized_Ensemble_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_A_Randomized_Ensemble_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n Hyojin and Thiagarajan,\n Jayaraman Jayaraman J. and Bremer,\n Peer-Timo\n},\n title = {\n A Randomized Ensemble Approach to Industrial CT Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d65ca72d61", @@ -663,7 +683,7 @@ "author": "Dingwen Zhang; Deyu Meng; Chao Li; Lu Jiang; Qian Zhao; Junwei Han", "abstract": "As an interesting and emerging topic, co-saliency detection aims at simultaneously extracting common salient objects in a group of images. Traditional co-saliency detection approaches rely heavily on human knowledge for designing hand-crafted metrics to explore the intrinsic patterns underlying co-salient objects. Such strategies, however, always suffer from poor generalization capability to flexibly adapt various scenarios in real applications, especially due to their lack of insightful understanding of the biological mechanisms of human visual co-attention. To alleviate this problem, we propose a novel framework for this task, by naturally reformulating it as a multiple-instance learning (MIL) problem and further integrating it into a self-paced learning (SPL) regime. 
The proposed framework on one hand is capable of fitting insightful metric measurements and discovering common patterns under co-salient regions in a self-learning way by MIL, and on the other hand tends to promise the learning reliability and stability by simulating the human learning process through SPL. Experiments on benchmark datasets have demonstrated the effectiveness of the proposed framework as compared with the state-of-the-arts.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhang_A_Self-Paced_Multiple-Instance_ICCV_2015_paper.pdf", - "aff": "School of Automation, Northwestern Polytechnical University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Automation, Northwestern Polytechnical University; School of Computer Science, Carnegie Mellon University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Automation, Northwestern Polytechnical University", + "aff": "School of Automation, Northwestern Polytechnical University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Automation, Northwestern Polytechnical University; School of Computer Science, Carnegie Mellon University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Automation, Northwestern Polytechnical University", "project": "", "github": "", "supp": "", @@ -677,14 +697,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_A_Self-Paced_Multiple-Instance_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;2;1;0", - "aff_unique_norm": "Northwestern Polytechnical University;Xi'an Jiao Tong University;Carnegie Mellon University", + "aff_unique_norm": "Northwestern Polytechnical University;Xi'an Jiaotong University;Carnegie Mellon University", "aff_unique_dep": "School of Automation;School of Mathematics and Statistics;School of Computer Science", "aff_unique_url": "https://www.nwpu.edu.cn;http://en.xjtu.edu.cn/;https://www.cmu.edu", 
"aff_unique_abbr": "NWPU;XJTU;CMU", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Xi'an;Pittsburgh", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Dingwen and Meng,\n Deyu and Li,\n Chao and Jiang,\n Lu and Zhao,\n Qian and Han,\n Junwei\n},\n title = {\n A Self-Paced Multiple-Instance Learning Framework for Co-Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "55023e7875", @@ -718,7 +739,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Kan and Ma,\n Bingpeng and Zhang,\n Wei and Huang,\n Rui\n},\n title = {\n A Spatio-Temporal Appearance Representation for Viceo-Based Pedestrian Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6cb819d8be", @@ -743,7 +765,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Siyahjani_A_Supervised_Low-Rank_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Siyahjani_A_Supervised_Low-Rank_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Siyahjani_2015_ICCV,\n \n author = {\n Siyahjani,\n Farzad and Almohsen,\n Ranya and Sabri,\n Sinan and Doretto,\n Gianfranco\n},\n title = {\n A Supervised Low-Rank Method for Learning Invariant Subspaces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6a0ad6409f", @@ 
-777,7 +800,8 @@ "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liang_2015_ICCV,\n \n author = {\n Liang,\n Kongming and Chang,\n Hong and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n A Unified Multiplicative Framework for Attribute Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "99a685d8bc", @@ -802,7 +826,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tan_A_Versatile_Learning-Based_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tan_A_Versatile_Learning-Based_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Tan_2015_ICCV,\n \n author = {\n Tan,\n David Joseph and Tombari,\n Federico and Ilic,\n Slobodan and Navab,\n Nassir\n},\n title = {\n A Versatile Learning-Based 3D Temporal Tracker: Scalable,\n Robust,\n Online\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7731ccf43e", @@ -829,14 +854,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Rhodin_A_Versatile_Scene_ICCV_2015_paper.html", "aff_unique_index": "0;0+1;0+1;0;0", - "aff_unique_norm": "Max Planck Institute for Informatics;Intel", + "aff_unique_norm": "Max Planck Institute for Informatics;Intel Corporation", "aff_unique_dep": "Informatik;Visual Computing Institute", "aff_unique_url": "https://www.mpi-inf.mpg.de;https://www.intel.com", "aff_unique_abbr": "MPII;Intel", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0+1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United 
States", + "bibtex": "@InProceedings{Rhodin_2015_ICCV,\n \n author = {\n Rhodin,\n Helge and Robertini,\n Nadia and Richardt,\n Christian and Seidel,\n Hans-Peter and Theobalt,\n Christian\n},\n title = {\n A Versatile Scene Model With Differentiable Visibility Applied to Generative Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "886fda8b7d", @@ -849,7 +875,7 @@ "author": "Brais Cancela; Marcos Ortega; Manuel G. Penedo", "abstract": "This paper presents a new wavefront propagation method for dealing with the classic Eikonal equation. While classic Dijkstra-like graph-based techniques achieve the solution in O(M log M), they do not approximate the unique physically relevant solution very well. Fast Marching Methods (FMM) were created to efficiently solve the continuous problem. The proposed approximation tries to maintain the complexity, in order to make the algorithm useful in a wide range of contexts. The key idea behind our method is the creation of 'mini wave-fronts', which are combined to propagate the solution. 
Experimental results show the improvement in the accuracy with respect to the state of the art, while the average computational speed is maintained in O(M log M), similar to the FMM techniques.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Cancela_A_Wavefront_Marching_ICCV_2015_paper.pdf", - "aff": "V ARPA Group, Universidade da Coru \u02dcna; V ARPA Group, Universidade da Coru \u02dcna; V ARPA Group, Universidade da Coru \u02dcna", + "aff": "V ARPA Group, Universidade da Coru ˜na; V ARPA Group, Universidade da Coru ˜na; V ARPA Group, Universidade da Coru ˜na", "project": "", "github": "", "supp": "", @@ -863,14 +889,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cancela_A_Wavefront_Marching_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Universidade da Coru\u00f1a", + "aff_unique_norm": "Universidade da Coruña", "aff_unique_dep": "V ARPA Group", "aff_unique_url": "https://www.udc.es", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Cancela_2015_ICCV,\n \n author = {\n Cancela,\n Brais and Ortega,\n Marcos and Penedo,\n Manuel G.\n},\n title = {\n A Wavefront Marching Method for Solving the Eikonal Equation on Cartesian Grids\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0897b27f99", @@ -904,7 +931,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ha_2015_ICCV,\n \n author = {\n Ha,\n Hyowon and Bok,\n Yunsu and Joo,\n Kyungdon and Jung,\n Jiyoung and Kweon,\n In So\n},\n title = {\n Accurate Camera Calibration Robust to Defocus Using a Smartphone\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "53f371e1f3", @@ -938,7 +966,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Buffalo;Ann Arbor", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Wei and Corso,\n Jason J.\n},\n title = {\n Action Detection by Implicit Intentional Motion Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "da04c11754", @@ -970,9 +999,10 @@ "aff_unique_url": "https://www.ucf.edu", "aff_unique_abbr": "UCF", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "UCF", + "aff_campus_unique": "Orlando", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Soomro_2015_ICCV,\n \n author = {\n Soomro,\n Khurram and Idrees,\n Haroon and Shah,\n Mubarak\n},\n title = {\n Action Localization in Videos Through Context Walk\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4e99651267", @@ -997,7 +1027,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lan_Action_Recognition_by_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lan_Action_Recognition_by_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Lan_2015_ICCV,\n \n author = {\n Lan,\n Tian and Zhu,\n Yuke and Zamir,\n Amir Roshan and Savarese,\n Silvio\n},\n title = {\n Action Recognition by Hierarchical Mid-Level Action Elements\n},\n booktitle = {\n Proceedings of the IEEE International Conference on 
Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "29b5441057", @@ -1031,7 +1062,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Luo_2015_ICCV,\n \n author = {\n Luo,\n Ye and Cheong,\n Loong-Fah and Tran,\n An\n},\n title = {\n Actionness-Assisted Recognition of Actions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c9fc747622", @@ -1058,14 +1090,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Gkioxari_Actions_and_Attributes_ICCV_2015_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "University of California, Berkeley;Microsoft", + "aff_unique_norm": "University of California, Berkeley;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.berkeley.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UC Berkeley;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gkioxari_2015_ICCV,\n \n author = {\n Gkioxari,\n Georgia and Girshick,\n Ross and Malik,\n Jitendra\n},\n title = {\n Actions and Attributes From Wholes and Parts\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7ed38a04ca", @@ -1089,7 +1122,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Caicedo_Active_Object_Localization_ICCV_2015_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Caicedo_Active_Object_Localization_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Caicedo_2015_ICCV,\n \n author = {\n Caicedo,\n Juan C. and Lazebnik,\n Svetlana\n},\n title = {\n Active Object Localization With Deep Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cdef574066", @@ -1123,7 +1157,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Kawasaki_2015_ICCV,\n \n author = {\n Kawasaki,\n Hiroshi and Ono,\n Satoshi and Horita,\n Yuki and Shiba,\n Yuki and Furukawa,\n Ryo and Hiura,\n Shinsaku\n},\n title = {\n Active One-Shot Scan for Wide Depth Range Using a Light Field Projector Based on Coded Aperture\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "85519dcdaa", @@ -1157,7 +1192,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0+1;0;2;0;1", - "aff_country_unique": "Netherlands;Belgium;United States" + "aff_country_unique": "Netherlands;Belgium;United States", + "bibtex": "@InProceedings{Gavves_2015_ICCV,\n \n author = {\n Gavves,\n Efstratios and Mensink,\n Thomas and Tommasi,\n Tatiana and Snoek,\n Cees G. M. 
and Tuytelaars,\n Tinne\n},\n title = {\n Active Transfer Learning With Zero-Shot Priors: Reusing Past Datasets for Future Tasks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d6d00fd082", @@ -1191,7 +1227,8 @@ "aff_campus_unique_index": "0+0;0+0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2015_ICCV,\n \n author = {\n Xu,\n Zhen and Qing,\n Laiyun and Miao,\n Jun\n},\n title = {\n Activity Auto-Completion: Predicting Human Activities From Partial Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1f2f701dd1", @@ -1225,7 +1262,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Wu_2015_ICCV,\n \n author = {\n Wu,\n Xiaomeng and Kashino,\n Kunio\n},\n title = {\n Adaptive Dither Voting for Robust Spatial Verification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4a19a73837", @@ -1259,7 +1297,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Singapore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Dang_2015_ICCV,\n \n author = {\n Dang,\n Kang and Yang,\n Jiong and Yuan,\n Junsong\n},\n title = {\n Adaptive Exponential Smoothing for Online Filtering of Pixel Prediction Maps\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1b24a943ec", 
@@ -1293,7 +1332,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cakir_2015_ICCV,\n \n author = {\n Cakir,\n Fatih and Sclaroff,\n Stan\n},\n title = {\n Adaptive Hashing for Fast Similarity Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e06496f6a1", @@ -1318,7 +1358,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fu_Adaptive_Spatial-Spectral_Dictionary_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fu_Adaptive_Spatial-Spectral_Dictionary_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Fu_2015_ICCV,\n \n author = {\n Fu,\n Ying and Lam,\n Antony and Sato,\n Imari and Sato,\n Yoichi\n},\n title = {\n Adaptive Spatial-Spectral Dictionary Learning for Hyperspectral Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "36110f463b", @@ -1352,7 +1393,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Xiaobo and Guo,\n Xiaojie and Li,\n Stan Z.\n},\n title = {\n Adaptively Unified Semi-Supervised Dictionary Learning With Active Points\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c2a54ffe25", @@ -1386,7 +1428,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;1", - "aff_country_unique": "China;Singapore" + 
"aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Zhenzhen and Yuan,\n Xiao-Tong and Liu,\n Qingshan and Yan,\n Shuicheng\n},\n title = {\n Additive Nearest Neighbor Feature Maps\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "60e64c6326", @@ -1420,7 +1463,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Babenko_2015_ICCV,\n \n author = {\n Babenko,\n Artem and Lempitsky,\n Victor\n},\n title = {\n Aggregating Local Deep Features for Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6bb1598ffc", @@ -1454,7 +1498,8 @@ "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Haifa;Pasadena", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Levis_2015_ICCV,\n \n author = {\n Levis,\n Aviad and Schechner,\n Yoav Y. 
and Aides,\n Amit and Davis,\n Anthony B.\n},\n title = {\n Airborne Three-Dimensional Cloud Tomography\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "463911c145", @@ -1488,7 +1533,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Zhu_2015_ICCV,\n \n author = {\n Zhu,\n Yukun and Kiros,\n Ryan and Zemel,\n Rich and Salakhutdinov,\n Ruslan and Urtasun,\n Raquel and Torralba,\n Antonio and Fidler,\n Sanja\n},\n title = {\n Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "65d8721a50", @@ -1522,7 +1568,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Irie_2015_ICCV,\n \n author = {\n Irie,\n Go and Arai,\n Hiroyuki and Taniguchi,\n Yukinobu\n},\n title = {\n Alternating Co-Quantization for Cross-Modal Hashing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3bf92f9d1c", @@ -1556,7 +1603,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kar_2015_ICCV,\n \n author = {\n Kar,\n Abhishek and Tulsiani,\n Shubham and Carreira,\n Joao and Malik,\n Jitendra\n},\n title = {\n Amodal Completion and Size Constancy in Natural Scenes\n},\n booktitle 
= {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4344237d65", @@ -1583,14 +1631,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhao_An_Accurate_Iris_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Hong Kong Polytechnic University", + "aff_unique_norm": "The Hong Kong Polytechnic University", "aff_unique_dep": "Department of Computing", "aff_unique_url": "https://www.polyu.edu.hk", "aff_unique_abbr": "PolyU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2015_ICCV,\n \n author = {\n Zhao,\n Zijing and Ajay,\n Kumar\n},\n title = {\n An Accurate Iris Segmentation Framework Under Relaxed Imaging Constraints Using Total Variation Model\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7d3bfb2100", @@ -1624,7 +1673,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Campbell_2015_ICCV,\n \n author = {\n Campbell,\n Dylan and Petersson,\n Lars\n},\n title = {\n An Adaptive Data Representation for Robust Point-Set Registration and Merging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "350fc82218", @@ -1658,7 +1708,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Colorado Springs;Graz", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "United States;Austria" + "aff_country_unique": "United States;Austria", + "bibtex": "@InProceedings{Ventura_2015_ICCV,\n \n 
author = {\n Ventura,\n Jonathan and Arth,\n Clemens and Lepetit,\n Vincent\n},\n title = {\n An Efficient Minimal Solution for Multi-Camera Motion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "638d583b51", @@ -1683,7 +1734,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_An_Efficient_Statistical_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_An_Efficient_Statistical_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Guangyong and Zhu,\n Fengyuan and Heng,\n Pheng Ann\n},\n title = {\n An Efficient Statistical Method for Image Noise Level Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b9388af977", @@ -1708,7 +1760,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cheng_An_Exploration_of_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cheng_An_Exploration_of_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Cheng_2015_ICCV,\n \n author = {\n Cheng,\n Yu and Yu,\n Felix X. and Feris,\n Rogerio S. 
and Kumar,\n Sanjiv and Choudhary,\n Alok and Chang,\n Shi-Fu\n},\n title = {\n An Exploration of Parameter Redundancy in Deep Networks With Circulant Projections\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "230724b808", @@ -1742,7 +1795,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Nguyen_2015_ICCV,\n \n author = {\n Nguyen,\n Duc Thanh and Tran,\n Minh-Khoi and Yeung,\n Sai-Kit\n},\n title = {\n An MRF-Poselets Model for Detecting Highly Articulated Humans\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2011d34a3f", @@ -1776,7 +1830,8 @@ "aff_campus_unique_index": "0;1;1;0;1", "aff_campus_unique": "Whitewater;Madison", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mukherjee_2015_ICCV,\n \n author = {\n Mukherjee,\n Lopamudra and Ravi,\n Sathya N. and Ithapu,\n Vamsi K. 
and Holmes,\n Tyler and Singh,\n Vikas\n},\n title = {\n An NMF Perspective on Binary Hashing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "30e790a4fb", @@ -1801,7 +1856,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Parashar_As-Rigid-As-Possible_Volumetric_Shape-From-Template_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Parashar_As-Rigid-As-Possible_Volumetric_Shape-From-Template_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Parashar_2015_ICCV,\n \n author = {\n Parashar,\n Shaifali and Pizarro,\n Daniel and Bartoli,\n Adrien and Collins,\n Toby\n},\n title = {\n As-Rigid-As-Possible Volumetric Shape-From-Template\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "26729b6c2a", @@ -1826,7 +1882,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Malinowski_Ask_Your_Neurons_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Malinowski_Ask_Your_Neurons_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Malinowski_2015_ICCV,\n \n author = {\n Malinowski,\n Mateusz and Rohrbach,\n Marcus and Fritz,\n Mario\n},\n title = {\n Ask Your Neurons: A Neural-Based Approach to Answering Questions About Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4cbfad9723", @@ -1860,7 +1917,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": 
"@InProceedings{Yoo_2015_ICCV,\n \n author = {\n Yoo,\n Donggeun and Park,\n Sunggyun and Lee,\n Joon-Young and Paek,\n Anthony S. and Kweon,\n In So\n},\n title = {\n AttentionNet: Aggregating Weak Directions for Accurate Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7b7f8864f1", @@ -1894,7 +1952,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Bangalore", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Prabhu_2015_ICCV,\n \n author = {\n Prabhu,\n Nikita and Babu,\n R. Venkatesh\n},\n title = {\n Attribute-Graph: A Graph Based Approach to Image Ranking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d87779fd41", @@ -1928,7 +1987,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Park_2015_ICCV,\n \n author = {\n Park,\n Seyoung and Zhu,\n Song-Chun\n},\n title = {\n Attributed Grammars for Joint Estimation of Human Attributes,\n Part and Pose\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7c6e8eb68a", @@ -1955,14 +2015,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xu_Augmenting_Strong_Supervision_ICCV_2015_paper.html", "aff_unique_index": "0+1;1;0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;University of Technology Sydney", + "aff_unique_norm": "Shanghai Jiao Tong University;University of Technology, Sydney", "aff_unique_dep": "Cooperative Medianet Innovation Center and Shanghai Key Laboratory 
of Multimedia Processing and Transmissions;Faculty of Engineering and Information Technology", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.uts.edu.au", "aff_unique_abbr": "SJTU;UTS", "aff_campus_unique_index": "0+1;1;0;1", "aff_campus_unique": "Shanghai;Ultimo", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Xu_2015_ICCV,\n \n author = {\n Xu,\n Zhe and Huang,\n Shaoli and Zhang,\n Ya and Tao,\n Dacheng\n},\n title = {\n Augmenting Strong Supervision Using Web Data for Fine-Grained Categorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "218492b613", @@ -1989,14 +2050,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Joo_Automated_Facial_Trait_ICCV_2015_paper.html", "aff_unique_index": "0+1;0;0", - "aff_unique_norm": "University of California, Los Angeles;Meta", - "aff_unique_dep": "Departments of Computer Science and Statistics;Facebook, Inc.", + "aff_unique_norm": "University of California, Los Angeles;Facebook, Inc.", + "aff_unique_dep": "Departments of Computer Science and Statistics;", "aff_unique_url": "https://www.ucla.edu;https://www.facebook.com", "aff_unique_abbr": "UCLA;FB", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Joo_2015_ICCV,\n \n author = {\n Joo,\n Jungseock and Steen,\n Francis F. 
and Zhu,\n Song-Chun\n},\n title = {\n Automated Facial Trait Judgment and Election Outcome Prediction: Social Dimensions of Face\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b335d14e04", @@ -2030,7 +2092,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Sun_2015_ICCV,\n \n author = {\n Sun,\n Chen and Gan,\n Chuang and Nevatia,\n Ram\n},\n title = {\n Automatic Concept Discovery From Parallel Text and Visual Corpora\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4e1b30f831", @@ -2057,14 +2120,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Huang_Automatic_Thumbnail_Generation_ICCV_2015_paper.html", "aff_unique_index": "0+0;0+0;0+0;1", - "aff_unique_norm": "Tsinghua University;Microsoft", + "aff_unique_norm": "Tsinghua University;Microsoft Corporation", "aff_unique_dep": "School of Software;Microsoft Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "THU;MSR", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2015_ICCV,\n \n author = {\n Huang,\n Jingwei and Chen,\n Huarong and Wang,\n Bin and Lin,\n Stephen\n},\n title = {\n Automatic Thumbnail Generation Based on Visual Representativeness and Foreground Recognizability\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} 
\n}" }, { "id": "4512d10cb6", @@ -2088,7 +2152,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liu_Bayesian_Model_Adaptation_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liu_Bayesian_Model_Adaptation_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Bo and Vasconcelos,\n Nuno\n},\n title = {\n Bayesian Model Adaptation for Crowd Counts\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6a5abd4c8a", @@ -2112,7 +2177,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Natola_Bayesian_Non-Parametric_Inference_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Natola_Bayesian_Non-Parametric_Inference_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Natola_2015_ICCV,\n \n author = {\n Natola,\n Fabrizio and Ntouskos,\n Valsamis and Sanzari,\n Marta and Pirri,\n Fiora\n},\n title = {\n Bayesian Non-Parametric Inference for Manifold Based MoCap Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e9f0adb78f", @@ -2146,7 +2212,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Lei and Zhang,\n Jianjia and Zhou,\n Luping and Tang,\n Chang and Li,\n Wanqing\n},\n title = {\n Beyond Covariance: Feature Representation With Nonlinear Kernel Matrices\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 
2015\n} \n}" }, { "id": "3b69100970", @@ -2171,7 +2238,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Harandi_Beyond_Gauss_Image-Set_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Harandi_Beyond_Gauss_Image-Set_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Harandi_2015_ICCV,\n \n author = {\n Harandi,\n Mehrtash and Salzmann,\n Mathieu and Baktashmotlagh,\n Mahsa\n},\n title = {\n Beyond Gauss: Image-Set Matching on the Riemannian Manifold of PDFs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2e2d945d0f", @@ -2205,7 +2273,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "1+1;1+1;1+1+1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Fu_2015_ICCV,\n \n author = {\n Fu,\n Lianrui and Zhang,\n Junge and Huang,\n Kaiqi\n},\n title = {\n Beyond Tree Structure Models: A New Occlusion Aware Graphical Model for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f6591994ed", @@ -2229,7 +2298,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cheng_Beyond_White_Ground_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cheng_Beyond_White_Ground_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Cheng_2015_ICCV,\n \n author = {\n Cheng,\n Dongliang and Price,\n Brian and Cohen,\n Scott and Brown,\n Michael S.\n},\n title = {\n Beyond White: Ground Truth Colors for Color Constancy Correction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = 
{\n 2015\n} \n}" }, { "id": "ea6cb526bb", @@ -2263,7 +2333,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Kan_2015_ICCV,\n \n author = {\n Kan,\n Meina and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n Bi-Shifting Auto-Encoder for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e1c18e3b26", @@ -2297,7 +2368,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2015_ICCV,\n \n author = {\n Lin,\n Tsung-Yu and RoyChowdhury,\n Aruni and Maji,\n Subhransu\n},\n title = {\n Bilinear CNN Models for Fine-Grained Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c2aa7bc4a5", @@ -2331,7 +2403,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "College Park;Sunnyvale", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Ching-Hui and Zhou,\n Hui and Ahonen,\n Timo\n},\n title = {\n Blur-Aware Disparity Estimation From Defocus Stereo Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4dd6706252", @@ -2358,14 +2431,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_BodyPrint_Pose_Invariant_ICCV_2015_paper.html", "aff_unique_index": "0+1;1;1;0;1", - "aff_unique_norm": 
"University of Illinois Urbana-Champaign;Siemens Healthcare", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Siemens Healthcare", "aff_unique_dep": "Beckman Institute;Medical Imaging Technologies", "aff_unique_url": "https://www.illinois.edu;https://www.siemens-healthineers.com", "aff_unique_abbr": "UIUC;Siemens", "aff_campus_unique_index": "0+1;1;1;0;1", "aff_campus_unique": "Urbana-Champaign;Princeton", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Jiangping and Ma,\n Kai and Singh,\n Vivek Kumar and Huang,\n Thomas and Chen,\n Terrence\n},\n title = {\n BodyPrint: Pose Invariant 3D Shape Matching of Human Bodies\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ef583113a1", @@ -2378,7 +2452,7 @@ "author": "Jordi Pont-Tuset; Luc Van Gool", "abstract": "Computer vision in general, and object proposals in particular, are nowadays strongly influenced by the databases on which researchers evaluate the performance of their algorithms. This paper studies the transition from the Pascal Visual Object Challenge dataset, which has been the benchmark of reference for the last years, to the updated, bigger, and more challenging Microsoft Common Objects in Context. We first review and deeply analyze the new challenges, and opportunities, that this database presents. We then survey the current state of the art in object proposals and evaluate it focusing on how it generalizes to the new dataset. In sight of these results, we propose various lines of research to take advantage of the new benchmark and improve the techniques. 
We explore one of these lines, which leads to an improvement over the state of the art of +5.2%.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Pont-Tuset_Boosting_Object_Proposals_ICCV_2015_paper.pdf", - "aff": "Computer Vision Lab, ETH Z\u00fcrich, Switzerland; Computer Vision Lab, ETH Z\u00fcrich, Switzerland", + "aff": "Computer Vision Lab, ETH Zürich, Switzerland; Computer Vision Lab, ETH Zürich, Switzerland", "project": "", "github": "", "supp": "", @@ -2392,14 +2466,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Pont-Tuset_Boosting_Object_Proposals_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "Computer Vision Lab", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0", + "aff_campus_unique": "Zürich", "aff_country_unique_index": "0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Pont-Tuset_2015_ICCV,\n \n author = {\n Pont-Tuset,\n Jordi and Van Gool,\n Luc\n},\n title = {\n Boosting Object Proposals: From Pascal to COCO\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "13eaac0fdf", @@ -2426,14 +2501,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liu_Box_Aggregation_for_ICCV_2015_paper.html", "aff_unique_index": "0;1+2;0", - "aff_unique_norm": "Chinese University of Hong Kong;Stanford University;Shanghai Jiao Tong University", + "aff_unique_norm": "The Chinese University of Hong Kong;Stanford University;Shanghai Jiao Tong University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.stanford.edu;https://www.sjtu.edu.cn", "aff_unique_abbr": "CUHK;Stanford;SJTU", 
"aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Hong Kong SAR;Stanford;", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Shu and Lu,\n Cewu and Jia,\n Jiaya\n},\n title = {\n Box Aggregation for Proposal Decimation: Last Mile of Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9a0ea00c5e", @@ -2460,14 +2536,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dai_BoxSup_Exploiting_Bounding_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Corporation", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dai_2015_ICCV,\n \n author = {\n Dai,\n Jifeng and He,\n Kaiming and Sun,\n Jian\n},\n title = {\n BoxSup: Exploiting Bounding Boxes to Supervise Convolutional Networks for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a0ff115679", @@ -2492,7 +2569,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Matzen_BubbLeNet_Foveated_Imaging_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Matzen_BubbLeNet_Foveated_Imaging_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Matzen_2015_ICCV,\n \n author = {\n Matzen,\n Kevin and Snavely,\n Noah\n},\n title = {\n BubbLeNet: Foveated 
Imaging for Visual Discovery\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5820963c1c", @@ -2526,7 +2604,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";St. Louis", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Murdock_2015_ICCV,\n \n author = {\n Murdock,\n Calvin and Jacobs,\n Nathan and Pless,\n Robert\n},\n title = {\n Building Dynamic Cloud Maps From the Ground Up\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a1228b7685", @@ -2551,7 +2630,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Pham_COUNT_Forest_CO-Voting_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Pham_COUNT_Forest_CO-Voting_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Pham_2015_ICCV,\n \n author = {\n Pham,\n Viet-Quoc and Kozakaya,\n Tatsuo and Yamaguchi,\n Osamu and Okada,\n Ryuzo\n},\n title = {\n COUNT Forest: CO-Voting Uncertain Number of Targets Using Random Forest for Crowd Density Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ec55a509dd", @@ -2585,7 +2665,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Zendel_2015_ICCV,\n \n author = {\n Zendel,\n Oliver and Murschitz,\n Markus and Humenberger,\n Martin and Herzner,\n Wolfgang\n},\n title = {\n CV-HAZOP: Introducing Test Data Validation for Computer Vision\n},\n booktitle = {\n Proceedings of the IEEE 
International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6cba74c4e7", @@ -2619,7 +2700,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Zeisl_2015_ICCV,\n \n author = {\n Zeisl,\n Bernhard and Sattler,\n Torsten and Pollefeys,\n Marc\n},\n title = {\n Camera Pose Voting for Large-Scale Image-Based Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3eae6e5c4f", @@ -2653,7 +2735,8 @@ "aff_campus_unique_index": "0;0;0;0;", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jain_2015_ICCV,\n \n author = {\n Jain,\n Ashesh and Koppula,\n Hema S. and Raghavan,\n Bharad and Soh,\n Shane and Saxena,\n Ashutosh\n},\n title = {\n Car That Knows Before You Do: Anticipating Maneuvers via Learning Temporal Driving Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "92ee557609", @@ -2662,7 +2745,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "David Novotny, Ji\u0159\u00ed Matas", + "author_site": "David Novotny, Jiří Matas", "author": "David Novotny; Jiri Matas", "abstract": "A novel efficient method for extraction of object proposals is introduced. Its \"objectness\" function exploits deep spatial pyramid features, a novel fast-to-compute HoG-based edge statistic and the EdgeBoxes score. The efficiency is achieved by the use of spatial bins in a novel combination with sparsity-inducing group normalized SVM. 
State-of-the-art recall performance is achieved on Pascal VOC07, significantly outperforming methods with comparable speed. Interestingly, when only 100 proposals per image are considered the method attains 78 % recall on VOC07. The method improves mAP of the RCNN class-specific detector, increasing it by 10 points when only 50 proposals are used in each image. The system trained on twenty classes performs well on the two hundred class ILSVRC2013 set confirming generalization capability.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Novotny_Cascaded_Sparse_Spatial_ICCV_2015_paper.pdf", @@ -2687,7 +2770,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Oxford;Prague", "aff_country_unique_index": "0;1", - "aff_country_unique": "United Kingdom;Czech Republic" + "aff_country_unique": "United Kingdom;Czech Republic", + "bibtex": "@InProceedings{Novotny_2015_ICCV,\n \n author = {\n Novotny,\n David and Matas,\n Jiri\n},\n title = {\n Cascaded Sparse Spatial Bins for Efficient and Effective Generic Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c56eb51642", @@ -2721,7 +2805,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Albany", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Wenbo and Wen,\n Longyin and Chuah,\n Mooi Choo and Lyu,\n Siwei\n},\n title = {\n Category-Blind Human Action Recognition: A Practical Recognition System\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "31b1ec0045", @@ -2746,7 +2831,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Anwar_Class-Specific_Image_Deblurring_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Anwar_Class-Specific_Image_Deblurring_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Anwar_2015_ICCV,\n \n author = {\n Anwar,\n Saeed and Huynh,\n Cong Phuoc and Porikli,\n Fatih\n},\n title = {\n Class-Specific Image Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a5a09a92af", @@ -2780,7 +2866,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Shamai_2015_ICCV,\n \n author = {\n Shamai,\n Gil and Aflalo,\n Yonathan and Zibulevsky,\n Michael and Kimmel,\n Ron\n},\n title = {\n Classical Scaling Revisited\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4e883eb36f", @@ -2805,7 +2892,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tasse_Cluster-Based_Point_Set_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tasse_Cluster-Based_Point_Set_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Tasse_2015_ICCV,\n \n author = {\n Tasse,\n Flora Ponjou and Kosinka,\n Jiri and Dodgson,\n Neil\n},\n title = {\n Cluster-Based Point Set Saliency\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4fa5ea67e2", @@ -2839,7 +2927,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "United States;China" + 
"aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Lin_2015_ICCV,\n \n author = {\n Lin,\n Yuewei and Abdelfatah,\n Kareem and Zhou,\n Youjie and Fan,\n Xiaochuan and Yu,\n Hongkai and Qian,\n Hui and Wang,\n Song\n},\n title = {\n Co-Interest Person Detection From Multiple Wearable Camera Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7e20985a23", @@ -2873,7 +2962,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Ushiku_2015_ICCV,\n \n author = {\n Ushiku,\n Yoshitaka and Yamaguchi,\n Masataka and Mukuta,\n Yusuke and Harada,\n Tatsuya\n},\n title = {\n Common Subspace for Model and Similarity: Phrase Learning for Caption Generation From Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9c8c351572", @@ -2907,7 +2997,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jeon_2015_ICCV,\n \n author = {\n Jeon,\n Hae-Gon and Lee,\n Joon-Young and Han,\n Yudeog and Kim,\n Seon Joo and Kweon,\n In So\n},\n title = {\n Complementary Sets of Shutter Sequences for Motion Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1d06d0b637", @@ -2932,7 +3023,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ntouskos_Component-Wise_Modeling_of_ICCV_2015_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Ntouskos_Component-Wise_Modeling_of_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ntouskos_2015_ICCV,\n \n author = {\n Ntouskos,\n Valsamis and Sanzari,\n Marta and Cafaro,\n Bruno and Nardi,\n Federico and Natola,\n Fabrizio and Pirri,\n Fiora and Ruiz,\n Manuel\n},\n title = {\n Component-Wise Modeling of Articulated Objects\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c45195f881", @@ -2941,7 +3033,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Mete Ozay, Umit Rusen Aktas, Jeremy L. Wyatt, Ale\u0161 Leonardis", + "author_site": "Mete Ozay, Umit Rusen Aktas, Jeremy L. Wyatt, Aleš Leonardis", "author": "Mete Ozay; Umit Rusen Aktas; Jeremy L. Wyatt; Ales Leonardis", "abstract": "We address the problem of statistical learning of shape models which are invariant to translation, rotation and scale in compositional hierarchies when data spaces of measurements and shape spaces are not topological manifolds. In practice, this problem is observed while modeling shapes having multiple disconnected components, e.g. partially occluded shapes in cluttered scenes. We resolve the aforementioned problem by first reformulating the relationship between data and shape spaces considering the interaction between Receptive Fields (RFs) and Shape Manifolds (SMs) in a compositional hierarchical shape vocabulary. Then, we suggest a method to model the topological structure of the SMs for statistical learning of the geometric transformations of the shapes that are defined by group actions on the SMs. For this purpose, we design a disjoint union topology using an indexing mechanism for the formation of shape models on SMs in the vocabulary, recursively. 
We represent the topological relationship between shape components using graphs, which are aggregated to construct a hierarchical graph structure for the shape vocabulary. To this end, we introduce a framework to implement the indexing mechanisms for the employment of the vocabulary for structural shape classification. The proposed approach is used to construct invariant shape representations. Results on benchmark shape classification outperform state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Ozay_Compositional_Hierarchical_Representation_ICCV_2015_paper.pdf", @@ -2957,7 +3049,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ozay_Compositional_Hierarchical_Representation_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ozay_Compositional_Hierarchical_Representation_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ozay_2015_ICCV,\n \n author = {\n Ozay,\n Mete and Aktas,\n Umit Rusen and Wyatt,\n Jeremy L. 
and Leonardis,\n Ales\n},\n title = {\n Compositional Hierarchical Representation of Shape Manifolds for Classification of Non-Manifold Shapes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c7bd5c31b8", @@ -2984,14 +3077,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dong_Compression_Artifacts_Reduction_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Information Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2015_ICCV,\n \n author = {\n Dong,\n Chao and Deng,\n Yubin and Loy,\n Chen Change and Tang,\n Xiaoou\n},\n title = {\n Compression Artifacts Reduction by a Deep Convolutional Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "eb2a52c6e3", @@ -3025,7 +3119,8 @@ "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "London;Singapore;", "aff_country_unique_index": "0;0;0;1;1;0", - "aff_country_unique": "United Kingdom;Singapore" + "aff_country_unique": "United Kingdom;Singapore", + "bibtex": "@InProceedings{Xiong_2015_ICCV,\n \n author = {\n Xiong,\n Chao and Zhao,\n Xiaowei and Tang,\n Danhang and Jayashree,\n Karlekar and Yan,\n Shuicheng and Kim,\n Tae-Kyun\n},\n title = {\n Conditional Convolutional Neural Network for Modality-Aware Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year 
= {\n 2015\n} \n}" }, { "id": "569e107583", @@ -3049,7 +3144,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Huang_Conditional_High-Order_Boltzmann_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Huang_Conditional_High-Order_Boltzmann_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Huang_2015_ICCV,\n \n author = {\n Huang,\n Yan and Wang,\n Wei and Wang,\n Liang\n},\n title = {\n Conditional High-Order Boltzmann Machine: A Supervised Learning Model for Relation Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e6db6c4606", @@ -3083,7 +3179,8 @@ "aff_campus_unique_index": "0;0;0;0+1;0", "aff_campus_unique": "Oxford;Stanford;", "aff_country_unique_index": "0;0;0;0+1;2;2;2;0", - "aff_country_unique": "United Kingdom;United States;China" + "aff_country_unique": "United Kingdom;United States;China", + "bibtex": "@InProceedings{Zheng_2015_ICCV,\n \n author = {\n Zheng,\n Shuai and Jayasumana,\n Sadeep and Romera-Paredes,\n Bernardino and Vineet,\n Vibhav and Su,\n Zhizhong and Du,\n Dalong and Huang,\n Chang and Torr,\n Philip H. S.\n},\n title = {\n Conditional Random Fields as Recurrent Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8a07600677", @@ -3092,7 +3189,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Gernot Riegler, Samuel Schulter, Matthias R\u00fcther, Horst Bischof", + "author_site": "Gernot Riegler, Samuel Schulter, Matthias Rüther, Horst Bischof", "author": "Gernot Riegler; Samuel Schulter; Matthias Ruther; Horst Bischof", "abstract": "Single image super-resolution is an important task in the field of computer vision and finds many practical applications. 
Current state-of-the-art methods typically rely on machine learning algorithms to infer a mapping from low- to high-resolution images. These methods use a single fixed blur kernel during training and, consequently, assume the exact same kernel underlying the image formation process for all test images. However, this setting is not realistic for practical applications, because the blur is typically different for each test image. In this paper, we loosen this restrictive constraint and propose conditioned regression models (including convolutional neural networks and random forests) that can effectively exploit the additional kernel information during both, training and inference. This allows for training a single model, while previous methods need to be re-trained for every blur kernel individually to achieve good results, which we demonstrate in our evaluations. We also empirically show that the proposed conditioned regression models (i) can effectively handle scenarios where the blur kernel is different for each image and (ii) outperform related approaches trained for only a single kernel.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Riegler_Conditioned_Regression_Models_ICCV_2015_paper.pdf", @@ -3117,7 +3214,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Graz", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Riegler_2015_ICCV,\n \n author = {\n Riegler,\n Gernot and Schulter,\n Samuel and Ruther,\n Matthias and Bischof,\n Horst\n},\n title = {\n Conditioned Regression Models for Non-Blind Single Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6107000882", @@ -3142,7 +3240,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Zeng_Confidence_Preserving_Machine_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zeng_Confidence_Preserving_Machine_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Zeng_2015_ICCV,\n \n author = {\n Zeng,\n Jiabei and Chu,\n Wen-Sheng and De la Torre,\n Fernando and Cohn,\n Jeffrey F. and Xiong,\n Zhang\n},\n title = {\n Confidence Preserving Machine for Facial Action Unit Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "795c3533fb", @@ -3176,7 +3275,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Jianwei and Chen,\n Xiaowu and Zou,\n Dongqing and Gao,\n Bo and Teng,\n Wei\n},\n title = {\n Conformal and Low-Rank Sparse Representation for Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cd6f29c1ef", @@ -3185,7 +3285,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Deepak Pathak, Philipp Kr\u00e4henb\u00fchl, Trevor Darrell", + "author_site": "Deepak Pathak, Philipp Krähenbühl, Trevor Darrell", "author": "Deepak Pathak; Philipp Krahenbuhl; Trevor Darrell", "abstract": "We present an approach to learn a dense pixel-wise labeling from image-level tags. Each image-level tag imposes constraints on the output labeling of a Convolutional Neural Network (CNN) classifier. We propose Constrained CNN (CCNN), a method which uses a novel loss function to optimize for any set of linear constraints on the output space (i.e. predicted label distribution) of a CNN. 
Our loss formulation is easy to optimize and can be incorporated directly into standard stochastic gradient descent optimization. The key idea is to phrase the training objective as a biconvex optimization for linear models, which we then relax to nonlinear deep networks. Extensive experiments demonstrate the generality of our new learning framework. The constrained loss yields state-of-the-art results on weakly supervised semantic image segmentation. We further demonstrate that adding slightly more supervision can greatly improve the performance of the learning algorithm.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Pathak_Constrained_Convolutional_Neural_ICCV_2015_paper.pdf", @@ -3210,7 +3310,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pathak_2015_ICCV,\n \n author = {\n Pathak,\n Deepak and Krahenbuhl,\n Philipp and Darrell,\n Trevor\n},\n title = {\n Constrained Convolutional Neural Networks for Weakly Supervised Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cce32f6e79", @@ -3244,7 +3345,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Riverside", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hasan_2015_ICCV,\n \n author = {\n Hasan,\n Mahmudul and Roy-Chowdhury,\n Amit K.\n},\n title = {\n Context Aware Active Learning of Activity Recognition Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "75f303cc8b", @@ -3257,7 +3359,7 @@ "author": "Tuan-Hung Vu; Anton Osokin; Ivan Laptev", "abstract": "Person detection is 
a key problem for many computer vision tasks. While face detection has reached maturity, detecting people under full variation of camera view-points, human poses, lighting conditions and occlusions is still a difficult challenge. In this work we focus on detecting human heads in natural scenes. Starting from the recent R-CNN object detector, we extend it in two ways. First, we leverage person-scene relations and propose a global CNN model trained to predict positions and scales of heads directly from the full image. Second, we explicitly model pairwise relations among the objects via energy-based model where the potentials are computed with a CNN framework. Our full combined model complements R-CNN with contextual cues derived from the scene. To train and test our model, we introduce a large dataset with 369,846 human heads annotated in 224,740 movie frames. We evaluate our method and demonstrate improvements of person head detection compared to several recent baselines on three datasets. 
We also show improvements of the detection speed provided by our model.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Vu_Context-Aware_CNNs_for_ICCV_2015_paper.pdf", - "aff": "WILLOW project-team, D\u00b4epartment d\u2019Informatique de l\u2019Ecole Normale Sup\u00b4erieure, ENS/INRIA/CNRS UMR 8548, Paris, France; SIERRA project-team, D\u00b4epartment d\u2019Informatique de l\u2019Ecole Normale Sup\u00b4erieure, ENS/INRIA/CNRS UMR 8548, Paris, France; WILLOW project-team, D\u00b4epartment d\u2019Informatique de l\u2019Ecole Normale Sup\u00b4erieure, ENS/INRIA/CNRS UMR 8548, Paris, France", + "aff": "WILLOW project-team, D´epartment d’Informatique de l’Ecole Normale Sup´erieure, ENS/INRIA/CNRS UMR 8548, Paris, France; SIERRA project-team, D´epartment d’Informatique de l’Ecole Normale Sup´erieure, ENS/INRIA/CNRS UMR 8548, Paris, France; WILLOW project-team, D´epartment d’Informatique de l’Ecole Normale Sup´erieure, ENS/INRIA/CNRS UMR 8548, Paris, France", "project": "", "github": "", "supp": "", @@ -3278,7 +3380,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Vu_2015_ICCV,\n \n author = {\n Vu,\n Tuan-Hung and Osokin,\n Anton and Laptev,\n Ivan\n},\n title = {\n Context-Aware CNNs for Person Head Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9ec1fd2fbb", @@ -3303,7 +3406,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Context-Guided_Diffusion_for_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Context-Guided_Diffusion_for_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n Kwang In and Tompkin,\n James and Pfister,\n 
Hanspeter and Theobalt,\n Christian\n},\n title = {\n Context-Guided Diffusion for Label Propagation on Graphs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "62ef2bcc4b", @@ -3330,14 +3434,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Gkioxari_Contextual_Action_Recognition_ICCV_2015_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "University of California, Berkeley;Microsoft", + "aff_unique_norm": "University of California, Berkeley;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.berkeley.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UC Berkeley;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gkioxari_2015_ICCV,\n \n author = {\n Gkioxari,\n Georgia and Girshick,\n Ross and Malik,\n Jitendra\n},\n title = {\n Contextual Action Recognition With R*CNN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5bfe422faa", @@ -3346,11 +3451,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Michele Fenzi, Laura Leal-Taix\u00e9, J\u00f6rn Ostermann, Tinne Tuytelaars", + "author_site": "Michele Fenzi, Laura Leal-Taixé, Jörn Ostermann, Tinne Tuytelaars", "author": "Michele Fenzi; Laura Leal-Taixe; Jorn Ostermann; Tinne Tuytelaars", "abstract": "In this paper, we treat the problem of continuous pose estimation for object categories as a regression problem on the basis of only 2D training information. 
While regression is a natural framework for continuous problems, regression methods so far achieved inferior results with respect to 3D-based and 2D-based classification-and-refinement approaches. This may be attributed to their weakness to high intra-class variability as well as to noisy matching procedures and lack of geometrical constraints. We propose to apply regression to Fisher-encoded vectors computed from large cells by learning an array of Fisher regressors. Fisher encoding makes our algorithm flexible to variations in class appearance, while the array structure permits to indirectly introduce spatial context information in the approach. We formulate our problem as a MAP inference problem, where the likelihood function is composed of a generative term based on the prediction error generated by the ensemble of Fisher regressors as well as a discriminative term based on SVM classifiers. We test our algorithm on three publicly available datasets that envisage several difficulties, such as high intra-class variability, truncations, occlusions, and motion blur, obtaining state-of-the-art results.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Fenzi_Continuous_Pose_Estimation_ICCV_2015_paper.pdf", - "aff": "Institut f\u00fcr Informationsverarbeitung (TNT), Leibniz Universit\u00e4t Hannover; Institute of Geodesy and Photogrammetry, ETH Zurich; Institut f\u00fcr Informationsverarbeitung (TNT), Leibniz Universit\u00e4t Hannover; KU Leuven, ESAT - PSI, iMinds", + "aff": "Institut für Informationsverarbeitung (TNT), Leibniz Universität Hannover; Institute of Geodesy and Photogrammetry, ETH Zurich; Institut für Informationsverarbeitung (TNT), Leibniz Universität Hannover; KU Leuven, ESAT - PSI, iMinds", "project": "", "github": "", "supp": "", @@ -3364,14 +3469,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fenzi_Continuous_Pose_Estimation_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;2", - "aff_unique_norm": 
"Leibniz Universit\u00e4t Hannover;ETH Zurich;KU Leuven", - "aff_unique_dep": "Institut f\u00fcr Informationsverarbeitung (TNT);Institute of Geodesy and Photogrammetry;ESAT - PSI", + "aff_unique_norm": "Leibniz Universität Hannover;ETH Zurich;KU Leuven", + "aff_unique_dep": "Institut für Informationsverarbeitung (TNT);Institute of Geodesy and Photogrammetry;ESAT - PSI", "aff_unique_url": "https://www.uni-hannover.de;https://www.ethz.ch;https://www.kuleuven.be", "aff_unique_abbr": "LUH;ETHZ;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2", - "aff_country_unique": "Germany;Switzerland;Belgium" + "aff_country_unique": "Germany;Switzerland;Belgium", + "bibtex": "@InProceedings{Fenzi_2015_ICCV,\n \n author = {\n Fenzi,\n Michele and Leal-Taixe,\n Laura and Ostermann,\n Jorn and Tuytelaars,\n Tinne\n},\n title = {\n Continuous Pose Estimation With a Spatial Ensemble of Fisher Regressors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "395e74db9a", @@ -3396,7 +3502,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lu_Contour_Box_Rejecting_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lu_Contour_Box_Rejecting_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Lu_2015_ICCV,\n \n author = {\n Lu,\n Cewu and Liu,\n Shu and Jia,\n Jiaya and Tang,\n Chi-Keung\n},\n title = {\n Contour Box: Rejecting Object Proposals Without Explicit Closed Contours\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6a37d9da4d", @@ -3405,7 +3512,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Francisco Barranco, Ching L. 
Teo, Cornelia Ferm\u00fcller, Yiannis Aloimonos", + "author_site": "Francisco Barranco, Ching L. Teo, Cornelia Fermüller, Yiannis Aloimonos", "author": "Francisco Barranco; Ching L. Teo; Cornelia Fermuller; Yiannis Aloimonos", "abstract": "The bio-inspired, asynchronous event-based dynamic vision sensor records temporal changes in the luminance of the scene at high temporal resolution. Since events are only triggered at significant luminance changes, most events occur at the boundary of objects and their parts. The detection of these contours is an essential step for further interpretation of the scene. This paper presents an approach to learn the location of contours and their border ownership using Structured Random Forests on event-based features that encode motion, timing, texture, and spatial orientations. The classifier integrates elegantly information over time by utilizing the classification results previously computed. Finally, the contour detection and boundary assignment are demonstrated in a layer-segmentation of the scene. Experimental results demonstrate good performance in boundary detection and segmentation.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Barranco_Contour_Detection_and_ICCV_2015_paper.pdf", @@ -3426,11 +3533,12 @@ "aff_unique_norm": "University of Maryland;University of Granada", "aff_unique_dep": "Computer Vision Lab;CITIC", "aff_unique_url": "https://www.umd.edu;https://www.ugr.es", - "aff_unique_abbr": "UMD;", - "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "College Park;", + "aff_unique_abbr": ";", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0", - "aff_country_unique": "United States;Spain" + "aff_country_unique": "United States;Spain", + "bibtex": "@InProceedings{Barranco_2015_ICCV,\n \n author = {\n Barranco,\n Francisco and Teo,\n Ching L. 
and Fermuller,\n Cornelia and Aloimonos,\n Yiannis\n},\n title = {\n Contour Detection and Characterization for Asynchronous Event Sensors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bc70b8c790", @@ -3455,7 +3563,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Di_Contour_Flow_Middle-Level_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Di_Contour_Flow_Middle-Level_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Di_2015_ICCV,\n \n author = {\n Di,\n Huijun and Shi,\n Qingxuan and Lv,\n Feng and Qin,\n Ming and Lu,\n Yao\n},\n title = {\n Contour Flow: Middle-Level Motion Estimation by Combining Motion Segmentation and Contour Alignment\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a71bf885d3", @@ -3468,7 +3577,7 @@ "author": "Yuanqi Su; Yuehu Liu; Bonan Cuan; Nanning Zheng", "abstract": "For its simplicity and effectiveness, star model is popular in shape matching. However, it suffers from the loose geometric connections among parts. In the paper, we present a novel algorithm that reconsiders these connections and reduces the global matching to a set of interrelated local matching. For the purpose, we divide the shape template into overlapped parts and model the matching through a part-based layered structure that uses the latent variable to constrain parts' deformation. As for inference, each part is used for localizing candidates by the partial matching. Thanks to the contour fragments, the partial matching can be solved via modified dynamic programming. The overlapped regions among parts of the template are then explored to make the candidates of parts meet at their shared points. 
The process is fulfilled via a refined procedure based on iterative dynamic programming. Results on ETHZ shape and Inria Horse datasets demonstrate the benefits of the proposed algorithm.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Su_Contour_Guided_Hierarchical_ICCV_2015_paper.pdf", - "aff": "Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi Province, China, 710049; Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi Province, China, 710049; Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi Province, China, 710049; Xi\u2019an Jiaotong University, Xi\u2019an, Shaanxi Province, China, 710049", + "aff": "Xi’an Jiaotong University, Xi’an, Shaanxi Province, China, 710049; Xi’an Jiaotong University, Xi’an, Shaanxi Province, China, 710049; Xi’an Jiaotong University, Xi’an, Shaanxi Province, China, 710049; Xi’an Jiaotong University, Xi’an, Shaanxi Province, China, 710049", "project": "", "github": "", "supp": "", @@ -3482,14 +3591,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Su_Contour_Guided_Hierarchical_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Xi'an Jiao Tong University", + "aff_unique_norm": "Xi'an Jiaotong University", "aff_unique_dep": "", "aff_unique_url": "https://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Su_2015_ICCV,\n \n author = {\n Su,\n Yuanqi and Liu,\n Yuehu and Cuan,\n Bonan and Zheng,\n Nanning\n},\n title = {\n Contour Guided Hierarchical Model for Shape Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5e2eff1a0f", @@ -3514,7 +3624,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/An_Contractive_Rectifier_Networks_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/An_Contractive_Rectifier_Networks_ICCV_2015_paper.html", + "bibtex": "@InProceedings{An_2015_ICCV,\n \n author = {\n An,\n Senjian and Hayat,\n Munawar and Khan,\n Salman H. and Bennamoun,\n Mohammed and Boussaid,\n Farid and Sohel,\n Ferdous\n},\n title = {\n Contractive Rectifier Networks for Nonlinear Maximum Margin Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "fb68f7c39c", @@ -3548,7 +3659,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Diamond_2015_ICCV,\n \n author = {\n Diamond,\n Steven and Boyd,\n Stephen\n},\n title = {\n Convex Optimization With Abstract Linear Operators\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cba37f2d28", @@ -3573,7 +3685,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yang_Convolutional_Channel_Features_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yang_Convolutional_Channel_Features_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Yang_2015_ICCV,\n \n author = {\n Yang,\n Bin and Yan,\n Junjie and Lei,\n Zhen and Li,\n Stan Z.\n},\n title = {\n Convolutional Channel Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "25e11ddfbb", @@ -3582,6 +3695,7 @@ "status": "Poster", "track": "main", "pid": "", + "author_site": 
"Jonathan T. Barron", "author": "Jonathan T. Barron", "abstract": "Color constancy is the problem of inferring the color of the light that illuminated a scene, usually so that the illumination color can be removed. Because this problem is underconstrained, it is often solved by modeling the statistical regularities of the colors of natural objects and illumination. In contrast, in this paper we reformulate the problem of color constancy as a 2D spatial localization task in a log-chrominance space, thereby allowing us to apply techniques from object detection and structured prediction to the color constancy problem. By directly learning how to discriminate between correctly white-balanced images and poorly white-balanced images, our model is able to improve performance on standard benchmarks by nearly 40%.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Barron_Convolutional_Color_Constancy_ICCV_2015_paper.pdf", @@ -3597,7 +3711,8 @@ "aff_domain": "google.com", "email": "google.com", "author_num": 1, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Barron_Convolutional_Color_Constancy_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Barron_Convolutional_Color_Constancy_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Barron_2015_ICCV,\n \n author = {\n Barron,\n Jonathan T.\n},\n title = {\n Convolutional Color Constancy\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6b8dab419a", @@ -3622,7 +3737,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Gu_Convolutional_Sparse_Coding_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Gu_Convolutional_Sparse_Coding_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Gu_2015_ICCV,\n \n author = {\n Gu,\n Shuhang and Zuo,\n Wangmeng and Xie,\n 
Qi and Meng,\n Deyu and Feng,\n Xiangchu and Zhang,\n Lei\n},\n title = {\n Convolutional Sparse Coding for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2edbfbb8ba", @@ -3648,15 +3764,16 @@ "email": "nus.edu.sg;nus.edu.sg;us.ibm.com;au1.ibm.com", "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Huang_Cross-Domain_Image_Retrieval_ICCV_2015_paper.html", - "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "National University of Singapore;IBM", - "aff_unique_dep": ";Research Center", - "aff_unique_url": "https://www.nus.edu.sg;https://www.ibm.com/research/watson", - "aff_unique_abbr": "NUS;IBM", + "aff_unique_index": "0;1;2;0", + "aff_unique_norm": "National University of Singapore;IBM;IBM Research", + "aff_unique_dep": ";Research Center;", + "aff_unique_url": "https://www.nus.edu.sg;https://www.ibm.com/research/watson;https://www.ibm.com/research", + "aff_unique_abbr": "NUS;IBM;IBM", "aff_campus_unique_index": "1", "aff_campus_unique": ";T.J. Watson", "aff_country_unique_index": "0;1;2;0", - "aff_country_unique": "Singapore;United States;Australia" + "aff_country_unique": "Singapore;United States;Australia", + "bibtex": "@InProceedings{Huang_2015_ICCV,\n \n author = {\n Huang,\n Junshi and Feris,\n Rogerio S. 
and Chen,\n Qiang and Yan,\n Shuicheng\n},\n title = {\n Cross-Domain Image Retrieval With a Dual Attribute-Aware Ranking Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "87b975018e", @@ -3683,14 +3800,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Namin_Cutting_Edge_Soft_ICCV_2015_paper.html", "aff_unique_index": "0+1;1+2;2;0+1", - "aff_unique_norm": "Australian National University;National Information and Communications Technology Australia;EPFL", + "aff_unique_norm": "Australian National University;National Information and Communications Technology Australia;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";;CVLab", "aff_unique_url": "https://www.anu.edu.au;https://www.nicta.com.au;https://cvlab.epfl.ch", "aff_unique_abbr": "ANU;NICTA;EPFL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+1;1;0+0", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + "bibtex": "@InProceedings{Namin_2015_ICCV,\n \n author = {\n Namin,\n Sarah Taghavi and Najafi,\n Mohammad and Salzmann,\n Mathieu and Petersson,\n Lars\n},\n title = {\n Cutting Edge: Soft Correspondences in Multimodal Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8e58f1be17", @@ -3724,7 +3842,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2015_ICCV,\n \n author = {\n Cheng,\n Zezhou and Yang,\n Qingxiong and Sheng,\n Bin\n},\n title = {\n Deep Colorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n 
month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "798d04cb72", @@ -3749,7 +3868,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yang_Deep_Fried_Convnets_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yang_Deep_Fried_Convnets_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Yang_2015_ICCV,\n \n author = {\n Yang,\n Zichao and Moczulski,\n Marcin and Denil,\n Misha and de Freitas,\n Nando and Smola,\n Alex and Song,\n Le and Wang,\n Ziyu\n},\n title = {\n Deep Fried Convnets\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d4f32ca648", @@ -3776,14 +3896,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liu_Deep_Learning_Face_ICCV_2015_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen Institute of Advanced Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Shenzhen Institutes of Advanced Technology", "aff_unique_dep": "Department of Information Engineering;Shenzhen Key Lab of Comp. Vis. & Pat. 
Rec.", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.ac.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0+1;0+1;0+1;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Ziwei and Luo,\n Ping and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n Deep Learning Face Attributes in the Wild\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0ac611db7f", @@ -3810,14 +3931,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tian_Deep_Learning_Strong_ICCV_2015_paper.html", "aff_unique_index": "0;0+0;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen Institute of Advanced Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Shenzhen Institutes of Advanced Technology", "aff_unique_dep": "Department of Information Engineering;Shenzhen Key Lab of Comp. Vis. & Pat. 
Rec.", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.ac.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0;0+0;0+1;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tian_2015_ICCV,\n \n author = {\n Tian,\n Yonglong and Luo,\n Ping and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n Deep Learning Strong Parts for Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "77e5bb4b50", @@ -3826,7 +3948,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Xin Lu, Zhe Lin, Xiaohui Shen, Radom\u00edr M\u011bch, James Z. Wang", + "author_site": "Xin Lu, Zhe Lin, Xiaohui Shen, Radomír Měch, James Z. Wang", "author": "Xin Lu; Zhe Lin; Xiaohui Shen; Radomir Mech; James Z. Wang", "abstract": "This paper investigates problems of image style, aesthetics, and quality estimation, which require fine-grained details from high-resolution images, utilizing deep neural network training approach. Existing deep convolutional neural networks mostly extracted one patch such as a down-sized crop from each image as a training example. However, one patch may not always well represent the entire image, which may cause ambiguity during training. We propose a deep multi-patch aggregation network training approach, which allows us to train models using multiple patches generated from one image. We achieve this by constructing multiple, shared columns in the neural network and feeding multiple patches to each of the columns. More importantly, we propose two novel network layers (statistics and sorting) to support aggregation of those patches. 
The proposed deep multi-patch aggregation network integrates shared feature learning and aggregation function learning into a unified framework. We demonstrate the effectiveness of the deep multi-patch aggregation network on the three problems, i.e., image style recognition, aesthetic quality categorization, and image quality estimation. Our models trained using the proposed networks significantly outperformed the state of the art in all three applications.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Lu_Deep_Multi-Patch_Aggregation_ICCV_2015_paper.pdf", @@ -3844,14 +3966,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lu_Deep_Multi-Patch_Aggregation_ICCV_2015_paper.html", "aff_unique_index": "0+1;1;1;1;0+1", - "aff_unique_norm": "Pennsylvania State University;Adobe", - "aff_unique_dep": ";Adobe Research", + "aff_unique_norm": "The Pennsylvania State University;Adobe Research", + "aff_unique_dep": ";", "aff_unique_url": "https://www.psu.edu;https://research.adobe.com", "aff_unique_abbr": "PSU;Adobe", "aff_campus_unique_index": "0+1;1;1;1;0+1", "aff_campus_unique": "University Park;San Jose", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lu_2015_ICCV,\n \n author = {\n Lu,\n Xin and Lin,\n Zhe and Shen,\n Xiaohui and Mech,\n Radomir and Wang,\n James Z.\n},\n title = {\n Deep Multi-Patch Aggregation Network for Image Style,\n Aesthetics,\n and Quality Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d40810f8f3", @@ -3878,14 +4001,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Deep_Networks_for_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Snapchat", + "aff_unique_norm": 
"University of Illinois at Urbana-Champaign;Snapchat", "aff_unique_dep": "Beckman Institute;", "aff_unique_url": "https://www.illinois.edu;https://www.snapchat.com", "aff_unique_abbr": "UIUC;Snapchat", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Urbana-Champaign;Venice", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Zhaowen and Liu,\n Ding and Yang,\n Jianchao and Han,\n Wei and Huang,\n Thomas\n},\n title = {\n Deep Networks for Image Super-Resolution With Sparse Prior\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d437652886", @@ -3894,7 +4018,7 @@ "status": "Oral", "track": "main", "pid": "", - "author_site": "Peter Kontschieder, Madalina Fiterau, Antonio Criminisi, Samuel Rota Bul\u00f2", + "author_site": "Peter Kontschieder, Madalina Fiterau, Antonio Criminisi, Samuel Rota Bulò", "author": "Peter Kontschieder; Madalina Fiterau; Antonio Criminisi; Samuel Rota Bulo", "abstract": "We present Deep Neural Decision Forests - a novel approach that unifies classification trees with the representation learning functionality known from deep convolutional networks, by training them in an end-to-end manner. To combine these two worlds, we introduce a stochastic and differentiable decision tree model, which steers the representation learning usually conducted in the initial layers of a (deep) convolutional network. Our model differs from conventional deep networks because a decision forest provides the final predictions and it differs from conventional decision forests since we propose a principled, joint and global optimization of split and leaf node parameters. 
We show experimental results on benchmark machine learning datasets like MNIST and ImageNet and find on-par or superior results when compared to state-of-the-art deep models. Most remarkably, we obtain Top5-Errors of only 7.84%/6.38% on ImageNet validation data when integrating our forests in a single-crop, single/seven model GoogLeNet architecture, respectively. Thus, even without any form of training data set augmentation we are improving on the 6.67% error obtained by the best GoogLeNet architecture (7 models, 144 crops).", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kontschieder_Deep_Neural_Decision_ICCV_2015_paper.pdf", @@ -3912,14 +4036,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kontschieder_Deep_Neural_Decision_ICCV_2015_paper.html", "aff_unique_index": "0;1+0;0;2", - "aff_unique_norm": "Microsoft;Carnegie Mellon University;Fondazione Bruno Kessler", - "aff_unique_dep": "Microsoft Research;;", + "aff_unique_norm": "Microsoft Research;Carnegie Mellon University;Fondazione Bruno Kessler", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.cmu.edu;https://www.fbk.eu", "aff_unique_abbr": "MSR;CMU;FBK", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;1", - "aff_country_unique": "United States;Italy" + "aff_country_unique": "United States;Italy", + "bibtex": "@InProceedings{Kontschieder_2015_ICCV,\n \n author = {\n Kontschieder,\n Peter and Fiterau,\n Madalina and Criminisi,\n Antonio and Bulo,\n Samuel Rota\n},\n title = {\n Deep Neural Decision Forests\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "223a9cbeac", @@ -3953,7 +4078,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": 
"United States", + "bibtex": "@InProceedings{Kuo_2015_ICCV,\n \n author = {\n Kuo,\n Weicheng and Hariharan,\n Bharath and Malik,\n Jitendra\n},\n title = {\n DeepBox: Learning Objectness With Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b445817a8d", @@ -3978,7 +4104,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_DeepDriving_Learning_Affordance_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_DeepDriving_Learning_Affordance_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Chenyi and Seff,\n Ari and Kornhauser,\n Alain and Xiao,\n Jianxiong\n},\n title = {\n DeepDriving: Learning Affordance for Direct Perception in Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8a2539abb7", @@ -4005,14 +4132,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ghodrati_DeepProposal_Hunting_Objects_ICCV_2015_paper.html", "aff_unique_index": "0;0;1+2;0;0+3", - "aff_unique_norm": "KU Leuven;INRIA;Inria Grenoble Rhone-Alpes;ETH Zurich", + "aff_unique_norm": "KU Leuven;Inria;Inria Grenoble Rhone-Alpes;ETH Zurich", "aff_unique_dep": "ESAT-PSI;;LEAR project, LJK;Computer Vision Laboratory", "aff_unique_url": "https://www.kuleuven.be;https://www.inria.fr;https://www.inria.fr/grenoble;https://www.ethz.ch", "aff_unique_abbr": "KU Leuven;Inria;Inria;ETHZ", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Grenoble", "aff_country_unique_index": "0;0;1+1;0;0+2", - "aff_country_unique": "Belgium;France;Switzerland" + "aff_country_unique": "Belgium;France;Switzerland", + "bibtex": "@InProceedings{Ghodrati_2015_ICCV,\n \n author 
= {\n Ghodrati,\n Amir and Diba,\n Ali and Pedersoli,\n Marco and Tuytelaars,\n Tinne and Van Gool,\n Luc\n},\n title = {\n DeepProposal: Hunting Objects by Cascading Deep Convolutional Layers\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e084b6a927", @@ -4039,14 +4167,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xu_Deformable_3D_Fusion_ICCV_2015_paper.html", "aff_unique_index": "0+1;1+2;0;0", - "aff_unique_norm": "Beijing Institute of Technology;NICTA;EPFL", + "aff_unique_norm": "Beijing Institute of Technology;NICTA;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";;CVLab", "aff_unique_url": "http://www.bit.edu.cn/;;https://cvlab.epfl.ch", "aff_unique_abbr": "BIT;;EPFL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Canberra", "aff_country_unique_index": "0+1;1+2;0;0", - "aff_country_unique": "China;Australia;Switzerland" + "aff_country_unique": "China;Australia;Switzerland", + "bibtex": "@InProceedings{Xu_2015_ICCV,\n \n author = {\n Xu,\n Weipeng and Salzmann,\n Mathieu and Wang,\n Yongtian and Liu,\n Yue\n},\n title = {\n Deformable 3D Fusion: From Partial Dynamic 3D Observations to Complete 4D Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8d7a38d850", @@ -4071,7 +4200,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html", + "bibtex": "@InProceedings{He_2015_ICCV,\n \n author = {\n He,\n Kaiming and Zhang,\n Xiangyu and Ren,\n Shaoqing and Sun,\n Jian\n},\n title = {\n Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet 
Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e540a7a275", @@ -4080,11 +4210,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Christian Kerl, J\u00f6rg St\u00fcckler, Daniel Cremers", + "author_site": "Christian Kerl, Jörg Stückler, Daniel Cremers", "author": "Christian Kerl; Jorg Stuckler; Daniel Cremers", "abstract": "We propose a dense continuous-time tracking and mapping method for RGB-D cameras. We parametrize the camera trajectory using continuous B-splines and optimize the trajectory through dense, direct image alignment. Our method also directly models rolling shutter in both RGB and depth images within the optimization, which improves tracking and reconstruction quality for low-cost CMOS sensors. Using a continuous trajectory representation has a number of advantages over a discrete-time representation (e.g. camera poses at the frame interval). With splines, less variables need to be optimized than with a discrete representation, since the trajectory can be represented with fewer control points than frames. Splines also naturally include smoothness constraints on derivatives of the trajectory estimate. Finally, the continuous trajectory representation allows to compensate for rolling shutter effects, since a pose estimate is available at any exposure time of an image. 
Our approach demonstrates superior quality in tracking and reconstruction compared to approaches with discrete-time or global shutter assumptions.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kerl_Dense_Continuous-Time_Tracking_ICCV_2015_paper.pdf", - "aff": "Technische Universit\u00e4t M\u00fcnchen; Technische Universit\u00e4t M\u00fcnchen; Technische Universit\u00e4t M\u00fcnchen", + "aff": "Technische Universität München; Technische Universität München; Technische Universität München", "project": "", "github": "", "supp": "", @@ -4098,14 +4228,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kerl_Dense_Continuous-Time_Tracking_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen", + "aff_unique_norm": "Technische Universität München", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Kerl_2015_ICCV,\n \n author = {\n Kerl,\n Christian and Stuckler,\n Jorg and Cremers,\n Daniel\n},\n title = {\n Dense Continuous-Time Tracking and Mapping With Rolling Shutter RGB-D Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "613e092def", @@ -4132,14 +4263,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ngo_Dense_Image_Registration_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;0;1;0", - "aff_unique_norm": "EPFL;KAIST", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;KAIST", "aff_unique_dep": "Computer Vision Laboratory;School of Electrical Engineering", "aff_unique_url": "https://cvl.epfl.ch;https://www.kaist.ac.kr", "aff_unique_abbr": "EPFL;KAIST", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0", - "aff_country_unique": "Switzerland;South Korea" + "aff_country_unique": "Switzerland;South Korea", + "bibtex": "@InProceedings{Ngo_2015_ICCV,\n \n author = {\n Ngo,\n Dat Tien and Park,\n Sanghyuk and Jorstad,\n Anne and Crivellaro,\n Alberto and Yoo,\n Chang D. and Fua,\n Pascal\n},\n title = {\n Dense Image Registration and Deformable Surface Reconstruction in Presence of Occlusions and Minimal Texture\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ba0aa153c3", @@ -4173,7 +4305,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Walker_2015_ICCV,\n \n author = {\n Walker,\n Jacob and Gupta,\n Abhinav and Hebert,\n Martial\n},\n title = {\n Dense Optical Flow Prediction From a Static Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0242dd265a", @@ -4198,7 +4331,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bristow_Dense_Semantic_Correspondence_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bristow_Dense_Semantic_Correspondence_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Bristow_2015_ICCV,\n \n author = {\n Bristow,\n Hilton and Valmadre,\n Jack and Lucey,\n Simon\n},\n title = {\n Dense Semantic Correspondence Where Every Pixel is a Classifier\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6c747cc184", @@ -4232,7 +4366,8 @@ 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Williem_2015_ICCV,\n \n author = {\n Williem,\n W. and Raskar,\n Ramesh and Park,\n In Kyu\n},\n title = {\n Depth Map Estimation and Colorization of Anaglyph Images Using Local Color Prior and Reverse Intensity Distribution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b95088f6f3", @@ -4256,7 +4391,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lin_Depth_Recovery_From_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lin_Depth_Recovery_From_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Lin_2015_ICCV,\n \n author = {\n Lin,\n Haiting and Chen,\n Can and Kang,\n Sing Bing and Yu,\n Jingyi\n},\n title = {\n Depth Recovery From Light Field Using Focal Stack Symmetry\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "10ab50bdc5", @@ -4290,7 +4426,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Tokyo;Houston", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "Japan;United States" + "aff_country_unique": "Japan;United States", + "bibtex": "@InProceedings{Tadano_2015_ICCV,\n \n author = {\n Tadano,\n Ryuichi and Pediredla,\n Adithya Kumar and Veeraraghavan,\n Ashok\n},\n title = {\n Depth Selective Camera: A Direct,\n On-Chip,\n Programmable Technique for Depth Selectivity in Photography\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5f652d0c15", @@ -4299,7 
+4436,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "James S. Supan\u010di\u010d III, Gr\u00e9gory Rogez, Yi Yang, Jamie Shotton, Deva Ramanan", + "author_site": "James S. Supančič III, Grégory Rogez, Yi Yang, Jamie Shotton, Deva Ramanan", "author": "James S. Supancic III; Gregory Rogez; Yi Yang; Jamie Shotton; Deva Ramanan", "abstract": "Hand pose estimation has matured rapidly in recent years. The introduction of commodity depth sensors and a multitude of practical applications have spurred new advances. We provide an extensive analysis of the state-of-the-art, focusing on hand pose estimation from a single depth frame. To do so, we have implemented a considerable number of systems, and will release all software and evaluation code. We summarize important conclusions here: (1) Pose estimation appears roughly solved for scenes with isolated hands. However, methods still struggle to analyze cluttered scenes where hands may be interacting with nearby objects and surfaces. To spur further progress we introduce a challenging new dataset with diverse, cluttered scenes. (2) Many methods evaluate themselves with disparate criteria, making comparisons difficult. We define a consistent evaluation criteria, rigorously motivated by human experiments. (3) We introduce a simple nearest-neighbor baseline that outperforms most existing systems. This implies that most systems do not generalize beyond their training sets. This also reinforces the under-appreciated point that training data is as important as the model itself. 
We conclude with directions for future progress.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Supancic_Depth-Based_Hand_Pose_ICCV_2015_paper.pdf", @@ -4317,14 +4454,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Supancic_Depth-Based_Hand_Pose_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;3;4", - "aff_unique_norm": "University of California, Irvine;INRIA;Baidu;Microsoft;Carnegie Mellon University", - "aff_unique_dep": ";;Baidu, Inc.;Microsoft Corporation;", + "aff_unique_norm": "University of California, Irvine;Inria;Baidu, Inc.;Microsoft Corporation;Carnegie Mellon University", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.uci.edu;https://www.inria.fr;https://www.baidu.com;https://www.microsoft.com;https://www.cmu.edu", "aff_unique_abbr": "UCI;Inria;Baidu;Microsoft;CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Irvine;", "aff_country_unique_index": "0;1;2;0;0", - "aff_country_unique": "United States;France;China" + "aff_country_unique": "United States;France;China", + "bibtex": "@InProceedings{III_2015_ICCV,\n \n author = {\n Supancic,\n III,\n James S. and Rogez,\n Gregory and Yang,\n Yi and Shotton,\n Jamie and Ramanan,\n Deva\n},\n title = {\n Depth-Based Hand Pose Estimation: Data,\n Methods,\n and Challenges\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "90ebeedb39", @@ -4337,7 +4475,7 @@ "author": "Li Yao; Atousa Torabi; Kyunghyun Cho; Nicolas Ballas; Christopher Pal; Hugo Larochelle; Aaron Courville", "abstract": "Recent progress in using recurrent neural networks (RNNs) for image description has motivated the exploration of their application for video description. However, while images are static, working with videos requires modeling their dynamic temporal structure and then properly integrating that information into a natural language description model. 
In this context, we propose an approach that successfully takes into account both the local and global temporal structure of videos to produce descriptions. First, our approach incorporates a spatial temporal 3-D convolutional neural network (3-D CNN) representation of the short temporal dynamics. The 3-D CNN representation is trained on video action recognition tasks, so as to produce a representation that is tuned to human motion and behavior. Second we propose a temporal attention mechanism that allows to go beyond local temporal modeling and learns to automatically select the most relevant temporal segments given the text-generating RNN. Our approach exceeds the current state-of-art for both BLEU and METEOR metrics on the Youtube2Text dataset. We also present results on a new, larger and more challenging dataset of paired video and natural language descriptions.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Yao_Describing_Videos_by_ICCV_2015_paper.pdf", - "aff": "Universit \u00b4e de Montr \u00b4eal; Universit \u00b4e de Montr \u00b4eal; Universit \u00b4e de Montr \u00b4eal; Universit \u00b4e de Montr \u00b4eal; \u00b4Ecole Polytechnique de Montr \u00b4eal; Universit \u00b4e de Sherbrooke; Universit \u00b4e de Montr \u00b4eal", + "aff": "Universit ´e de Montr ´eal; Universit ´e de Montr ´eal; Universit ´e de Montr ´eal; Universit ´e de Montr ´eal; ´Ecole Polytechnique de Montr ´eal; Universit ´e de Sherbrooke; Universit ´e de Montr ´eal", "project": "", "github": "", "supp": "", @@ -4351,14 +4489,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yao_Describing_Videos_by_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0;1;2;0", - "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Ecole Polytechnique de Montr\u00e9al;Universit\u00e9 de Sherbrooke", + "aff_unique_norm": "Université de Montréal;Ecole Polytechnique de Montréal;Université de Sherbrooke", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.umontreal.ca;https://www.polymtl.ca;https://www.usherbrooke.ca", - "aff_unique_abbr": "UdeM;Polytechnique Montr\u00e9al;UdeS", + "aff_unique_abbr": "UdeM;Polytechnique Montréal;UdeS", "aff_campus_unique_index": "1", - "aff_campus_unique": ";Montr\u00e9al", + "aff_campus_unique": ";Montréal", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Yao_2015_ICCV,\n \n author = {\n Yao,\n Li and Torabi,\n Atousa and Cho,\n Kyunghyun and Ballas,\n Nicolas and Pal,\n Christopher and Larochelle,\n Hugo and Courville,\n Aaron\n},\n title = {\n Describing Videos by Exploiting Temporal Structure\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4c1c8b2779", @@ -4383,7 +4522,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bogo_Detailed_Full-Body_Reconstructions_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bogo_Detailed_Full-Body_Reconstructions_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Bogo_2015_ICCV,\n \n author = {\n Bogo,\n Federica and Black,\n Michael J. and Loper,\n Matthew and Romero,\n Javier\n},\n title = {\n Detailed Full-Body Reconstructions of Moving People From Monocular RGB-D Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "de19760976", @@ -4392,7 +4532,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Ching L. Teo, Cornelia Ferm\u00fcller, Yiannis Aloimonos", + "author_site": "Ching L. Teo, Cornelia Fermüller, Yiannis Aloimonos", "author": "Ching L. 
Teo; Cornelia Fermuller; Yiannis Aloimonos", "abstract": "Symmetry, as one of the key components of Gestalt theory, provides an important mid-level cue that serves as input to higher visual processes such as segmentation. In this work, we propose a complete approach that links the detection of curved reflection symmetries to produce symmetry-constrained segments of structures/regions in real images with clutter. For curved reflection symmetry detection, we leverage on patch-based symmetric features to train a Structured Random Forest classifier that detects multiscaled curved symmetries in 2D images. Next, using these curved symmetries, we modulate a novel symmetry-constrained foreground-background segmentation by their symmetry scores so that we enforce global symmetrical consistency in the final segmentation. This is achieved by imposing a pairwise symmetry prior that encourages symmetric pixels to have the same labels over a MRF-based representation of the input image edges, and the final segmentation is obtained via graph-cuts. Experimental results over four publicly available datasets containing annotated symmetric structures: 1) SYMMAX-300, 2) BSD-Parts, 3) Weizmann Horse and 4) NY-roads demonstrate the approach's applicability to different environments with state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Teo_Detection_and_Segmentation_ICCV_2015_paper.pdf", @@ -4417,7 +4557,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Teo_2015_ICCV,\n \n author = {\n Teo,\n Ching L. 
and Fermuller,\n Cornelia and Aloimonos,\n Yiannis\n},\n title = {\n Detection and Segmentation of 2D Curved Reflection Symmetric Structures\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "635f0d09ce", @@ -4451,7 +4592,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Veeriah_2015_ICCV,\n \n author = {\n Veeriah,\n Vivek and Zhuang,\n Naifan and Qi,\n Guo-Jun\n},\n title = {\n Differential Recurrent Neural Networks for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4e4ff928e4", @@ -4485,7 +4627,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Berkeley;Chicago", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Narihira_2015_ICCV,\n \n author = {\n Narihira,\n Takuya and Maire,\n Michael and Yu,\n Stella X.\n},\n title = {\n Direct Intrinsics: Learning Albedo-Shading Decomposition by Convolutional Regression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "65f0df4fb5", @@ -4510,7 +4653,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Direct_Dense_and_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Direct_Dense_and_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Yu_2015_ICCV,\n \n author = {\n Yu,\n Rui and Russell,\n Chris and Campbell,\n Neill D. F. 
and Agapito,\n Lourdes\n},\n title = {\n Direct,\n Dense,\n and Deformable: Template-Based Non-Rigid 3D Reconstruction From RGB Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "36c69272fa", @@ -4544,7 +4688,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiao_2015_ICCV,\n \n author = {\n Xiao,\n Fanyi and Lee,\n Yong Jae\n},\n title = {\n Discovering the Spatial Extent of Relative Attributes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "29e7aee282", @@ -4578,7 +4723,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Adamczewski_2015_ICCV,\n \n author = {\n Adamczewski,\n Kamil and Suh,\n Yumin and Lee,\n Kyoung Mu\n},\n title = {\n Discrete Tabu Search for Graph Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8d8399b817", @@ -4603,7 +4749,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Simo-Serra_Discriminative_Learning_of_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Simo-Serra_Discriminative_Learning_of_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Simo-Serra_2015_ICCV,\n \n author = {\n Simo-Serra,\n Edgar and Trulls,\n Eduard and Ferraz,\n Luis and Kokkinos,\n Iasonas and Fua,\n Pascal and Moreno-Noguer,\n Francesc\n},\n title = {\n Discriminative Learning of 
Deep Convolutional Feature Point Descriptors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ece7b8ae55", @@ -4637,7 +4784,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sui_2015_ICCV,\n \n author = {\n Sui,\n Yao and Tang,\n Yafei and Zhang,\n Li\n},\n title = {\n Discriminative Low-Rank Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a7a2fa7767", @@ -4671,7 +4819,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Bangalore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Sanyal_2015_ICCV,\n \n author = {\n Sanyal,\n Soubhik and Mudunuri,\n Sivaram Prasad and Biswas,\n Soma\n},\n title = {\n Discriminative Pose-Free Descriptors for Face and Object Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3679b73050", @@ -4705,7 +4854,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "New Zealand" + "aff_country_unique": "New Zealand", + "bibtex": "@InProceedings{Ghifary_2015_ICCV,\n \n author = {\n Ghifary,\n Muhammad and Kleijn,\n W. 
Bastiaan and Zhang,\n Mengjie and Balduzzi,\n David\n},\n title = {\n Domain Generalization for Object Recognition With Multi-Task Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b4becdc53b", @@ -4732,14 +4882,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Li_Dual-Feature_Warping-Based_Motion_ICCV_2015_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft", + "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.ust.hk;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "HKUST;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Shiwei and Yuan,\n Lu and Sun,\n Jian and Quan,\n Long\n},\n title = {\n Dual-Feature Warping-Based Motion Model Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1f107c3a1e", @@ -4773,7 +4924,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Quan_2015_ICCV,\n \n author = {\n Quan,\n Yuhui and Huang,\n Yan and Ji,\n Hui\n},\n title = {\n Dynamic Texture Recognition via Orthogonal Tensor Dictionary Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "db3f74d689", @@ 
-4807,7 +4959,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Ashburn;Lugano", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Parag_2015_ICCV,\n \n author = {\n Parag,\n Toufiq and Ciresan,\n Dan C. and Giusti,\n Alessandro\n},\n title = {\n Efficient Classifier Training to Minimize False Merges in Electron Microscopy Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d108c2e87f", @@ -4816,11 +4969,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Margret Keuper, Evgeny Levinkov, Nicolas Bonneel, Guillaume Lavou\u00e9, Thomas Brox, Bj\u00f6rn Andres", + "author_site": "Margret Keuper, Evgeny Levinkov, Nicolas Bonneel, Guillaume Lavoué, Thomas Brox, Björn Andres", "author": "Margret Keuper; Evgeny Levinkov; Nicolas Bonneel; Guillaume Lavoue; Thomas Brox; Bjorn Andres", "abstract": "Formulations of the Image Decomposition Problem as a Multicut Problem (MP) w.r.t. a superpixel graph have received considerable attention. In contrast, instances of the MP w.r.t. a pixel grid graph have received little attention, firstly, because the MP is NP-hard and instances w.r.t. a pixel grid graph are hard to solve in practice, and, secondly, due to the lack of long-range terms in the objective function of the MP. We propose a generalization of the MP with long-range terms (LMP). We design and implement two efficient algorithms (primal feasible heuristics) for the MP and LMP which allow us to study instances of both problems w.r.t. the pixel grid graphs of the images in the BSDS-500 benchmark. The decompositions we obtain do not differ significantly from the state of the art, suggesting that the LMP is a competitive formulation of the Image Decomposition Problem. 
To demonstrate the generality of the LMP, we apply it also to the Mesh Decomposition Problem posed by the Princeton benchmark, obtaining state-of-the-art decompositions.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Keuper_Efficient_Decomposition_of_ICCV_2015_paper.pdf", - "aff": "Department of Computer Science, University of Freiburg; Combinatorial Image Analysis, MPI for Informatics, Saarbr \u00a8ucken; Laboratoire d\u2019Informatique en Image et Syst `emes d\u2019Information, CNRS Lyon; Laboratoire d\u2019Informatique en Image et Syst `emes d\u2019Information, CNRS Lyon; Department of Computer Science, University of Freiburg; Combinatorial Image Analysis, MPI for Informatics, Saarbr \u00a8ucken", + "aff": "Department of Computer Science, University of Freiburg; Combinatorial Image Analysis, MPI for Informatics, Saarbr ¨ucken; Laboratoire d’Informatique en Image et Syst `emes d’Information, CNRS Lyon; Laboratoire d’Informatique en Image et Syst `emes d’Information, CNRS Lyon; Department of Computer Science, University of Freiburg; Combinatorial Image Analysis, MPI for Informatics, Saarbr ¨ucken", "project": "", "github": "", "supp": "", @@ -4835,13 +4988,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Keuper_Efficient_Decomposition_of_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;2;0;1", "aff_unique_norm": "University of Freiburg;Max Planck Institute for Informatics;CNRS", - "aff_unique_dep": "Department of Computer Science;Combinatorial Image Analysis;Laboratoire d\u2019Informatique en Image et Syst\u00e8mes d\u2019Information", + "aff_unique_dep": "Department of Computer Science;Combinatorial Image Analysis;Laboratoire d’Informatique en Image et Systèmes d’Information", "aff_unique_url": "https://www.uni-freiburg.de;https://mpi-inf.mpg.de;https://www.cnrs.fr", "aff_unique_abbr": ";MPII;CNRS", "aff_campus_unique_index": "1;2;2;1", - "aff_campus_unique": ";Saarbr\u00fccken;Lyon", + "aff_campus_unique": ";Saarbrücken;Lyon", 
"aff_country_unique_index": "0;0;1;1;0;0", - "aff_country_unique": "Germany;France" + "aff_country_unique": "Germany;France", + "bibtex": "@InProceedings{Keuper_2015_ICCV,\n \n author = {\n Keuper,\n Margret and Levinkov,\n Evgeny and Bonneel,\n Nicolas and Lavoue,\n Guillaume and Brox,\n Thomas and Andres,\n Bjorn\n},\n title = {\n Efficient Decomposition of Image and Mesh Graphs by Lifted Multicuts\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2bd112c00d", @@ -4866,7 +5020,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liao_Efficient_PSD_Constrained_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liao_Efficient_PSD_Constrained_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Liao_2015_ICCV,\n \n author = {\n Liao,\n Shengcai and Li,\n Stan Z.\n},\n title = {\n Efficient PSD Constrained Asymmetric Metric Learning for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "728fcbb9ff", @@ -4893,14 +5048,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kukelova_Efficient_Solution_to_ICCV_2015_paper.html", "aff_unique_index": "0;1;2+1;0;1", - "aff_unique_norm": "Microsoft;Czech Technical University;Capturing Reality", - "aff_unique_dep": "Microsoft Research;;", + "aff_unique_norm": "Microsoft Research;Czech Technical University;Capturing Reality", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ctu.cz;", "aff_unique_abbr": "MSR;CTU;CR", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;1;1+1;0;1", - "aff_country_unique": "United Kingdom;Czech Republic" + "aff_country_unique": "United 
Kingdom;Czech Republic", + "bibtex": "@InProceedings{Kukelova_2015_ICCV,\n \n author = {\n Kukelova,\n Zuzana and Heller,\n Jan and Bujnak,\n Martin and Fitzgibbon,\n Andrew and Pajdla,\n Tomas\n},\n title = {\n Efficient Solution to the Epipolar Geometry for Radially Distorted Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e765d45ce5", @@ -4934,7 +5090,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Yu_2015_ICCV,\n \n author = {\n Yu,\n Chen-Ping and Le,\n Hieu and Zelinsky,\n Gregory and Samaras,\n Dimitris\n},\n title = {\n Efficient Video Segmentation Using Parametric Graph Partitioning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e1cad623dc", @@ -4943,7 +5100,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Gell\u00e9rt M\u00e1ttyus, Shenlong Wang, Sanja Fidler, Raquel Urtasun", + "author_site": "Gellért Máttyus, Shenlong Wang, Sanja Fidler, Raquel Urtasun", "author": "Gellert Mattyus; Shenlong Wang; Sanja Fidler; Raquel Urtasun", "abstract": "In recent years, contextual models that exploit maps have been shown to be very effective for many recognition and localization tasks. In this paper we propose to exploit aerial images in order to enhance freely available world maps. Towards this goal, we make use of OpenStreetMap and formulate the problem as the one of inference in a Markov random field parameterized in terms of the location of the road-segment centerlines as well as their width. This parameterization enables very efficient inference and returns only topologically correct roads. 
In particular, we can segment all OSM roads in the whole world in a single day using a small cluster of 10 computers. Importantly, our approach generalizes very well; it can be trained using only 1.5 km2 aerial imagery and produce very accurate results in any location across the globe. We demonstrate the effectiveness of our approach outperforming the state-of-the-art in two new benchmarks that we collect. We then show how our enhanced maps are beneficial for semantic segmentation of ground images.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Mattyus_Enhancing_Road_Maps_ICCV_2015_paper.pdf", @@ -4968,7 +5125,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Germany;Canada" + "aff_country_unique": "Germany;Canada", + "bibtex": "@InProceedings{Mattyus_2015_ICCV,\n \n author = {\n Mattyus,\n Gellert and Wang,\n Shenlong and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n Enhancing Road Maps by Parsing Aerial Images Around the World\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e5c09f0015", @@ -4981,7 +5139,7 @@ "author": "Mohamed Souiai; Martin R. Oswald; Youngwook Kee; Junmo Kim; Marc Pollefeys; Daniel Cremers", "abstract": "Despite their enormous success in solving hard combinatorial problems, convex relaxation approaches often suffer from the fact that the computed solutions are far from binary and that subsequent heuristic binarization may substantially degrade the quality of computed solutions. In this paper, we propose a novel relaxation technique which incorporates the entropy of the objective variable as a measure of relaxation tightness. 
We show both theoretically and experimentally that augmenting the objective function with an entropy term gives rise to more binary solutions and consequently solutions with a substantially tighter optimality gap. We use difference of convex function (DC) programming as an efficient and provably convergent solver for the arising convex-concave minimization problem. We evaluate this approach on three prominent non-convex computer vision challenges: multi-label inpainting, image segmentation and spatio-temporal multi-view reconstruction. These experiments show that our approach consistently yields better solutions with respect to the original integral optimization problem", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Souiai_Entropy_Minimization_for_ICCV_2015_paper.pdf", - "aff": "TU Munich, Germany; ETH Z\u00fcrich, Switzerland; KAIST, South Korea; KAIST, South Korea; ETH Z\u00fcrich, Switzerland; TU Munich, Germany", + "aff": "TU Munich, Germany; ETH Zürich, Switzerland; KAIST, South Korea; KAIST, South Korea; ETH Zürich, Switzerland; TU Munich, Germany", "project": "", "github": "", "supp": "", @@ -4995,14 +5153,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Souiai_Entropy_Minimization_for_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;2;1;0", - "aff_unique_norm": "Technical University of Munich;ETH Zurich;Korea Advanced Institute of Science and Technology", + "aff_unique_norm": "Technical University of Munich;ETH Zürich;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;https://www.ethz.ch;https://www.kaist.ac.kr", "aff_unique_abbr": "TUM;ETHZ;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;1;0", - "aff_country_unique": "Germany;Switzerland;South Korea" + "aff_country_unique": "Germany;Switzerland;South Korea", + "bibtex": "@InProceedings{Souiai_2015_ICCV,\n \n author = {\n Souiai,\n Mohamed 
and Oswald,\n Martin R. and Kee,\n Youngwook and Kim,\n Junmo and Pollefeys,\n Marc and Cremers,\n Daniel\n},\n title = {\n Entropy Minimization for Convex Relaxation Approaches\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b427a40e6c", @@ -5015,7 +5174,7 @@ "author": "Diane Bouchacourt; Sebastian Nowozin; M. Pawan Kumar", "abstract": "Recently several generalizations of the popular latent structural SVM framework have been proposed in the literature. Broadly speaking, the generalizations can be divided into two categories: (i) those that predict the output variables while either marginalizing the latent variables or estimating their most likely values; and (ii) those that predict the output variables by minimizing an entropy-based uncertainty measure over the latent space. In order to aid their application in computer vision, we study these generalizations with the aim of identifying their strengths and weaknesses. To this end, we propose a novel prediction criterion that includes as special cases all previous prediction criteria that have been used in the literature. Specifically, our framework's prediction criterion minimizes the Aczel and Daroczy entropy of the output. This in turn allows us to design a learning objective that provides a unified framework (UF) for latent structured prediction. We develop a single optimization algorithm and empirically show that it is as effective as the more complex approaches that have been previously employed for latent structured prediction. 
Using this algorithm, we provide empirical evidence that lends support to prediction via the minimization of the latent space uncertainty.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Bouchacourt_Entropy-Based_Latent_Structured_ICCV_2015_paper.pdf", - "aff": "CentraleSup\u00e9lec and INRIA Saclay; Microsoft Research; CentraleSup\u00e9lec and INRIA Saclay", + "aff": "CentraleSupélec and INRIA Saclay; Microsoft Research; CentraleSupélec and INRIA Saclay", "project": "", "github": "", "supp": "", @@ -5029,14 +5188,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bouchacourt_Entropy-Based_Latent_Structured_ICCV_2015_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "CentraleSup\u00e9lec;Microsoft", + "aff_unique_norm": "CentraleSupélec;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.centralesupelec.fr;https://www.microsoft.com/en-us/research", - "aff_unique_abbr": "CentraleSup\u00e9lec;MSR", + "aff_unique_abbr": "CentraleSupélec;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Bouchacourt_2015_ICCV,\n \n author = {\n Bouchacourt,\n Diane and Nowozin,\n Sebastian and Kumar,\n M. 
Pawan\n},\n title = {\n Entropy-Based Latent Structured Output Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "23aff8ee41", @@ -5061,7 +5221,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dessein_Example-Based_Modeling_of_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dessein_Example-Based_Modeling_of_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Dessein_2015_ICCV,\n \n author = {\n Dessein,\n Arnaud and Smith,\n William A. P. and Wilson,\n Richard C. and Hancock,\n Edwin R.\n},\n title = {\n Example-Based Modeling of Facial Texture From Deficient Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "db56a338be", @@ -5095,7 +5256,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Hadfield_2015_ICCV,\n \n author = {\n Hadfield,\n Simon and Bowden,\n Richard\n},\n title = {\n Exploiting High Level Scene Cues in Stereo Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3a2c976923", @@ -5104,11 +5266,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Chen Zhou, Fatma G\u00fcney, Yizhou Wang, Andreas Geiger", + "author_site": "Chen Zhou, Fatma Güney, Yizhou Wang, Andreas Geiger", "author": "Chen Zhou; Fatma Guney; Yizhou Wang; Andreas Geiger", "abstract": "Despite recent progress, reconstructing outdoor scenes in 3D from movable platforms remains a highly difficult endeavour. 
Challenges include low frame rates, occlusions, large distortions and difficult lighting conditions. In this paper, we leverage the fact that the larger the reconstructed area, the more likely objects of similar type and shape will occur in the scene. This is particularly true for outdoor scenes where buildings and vehicles often suffer from missing texture or reflections, but share similarity in 3D shape. We take advantage of this shape similarity by localizing objects using detectors and jointly reconstructing them while learning a volumetric model of their shape. This allows us to reduce noise while completing missing surfaces as objects of similar shape benefit from all observations for the respective category. We evaluate our approach with respect to LIDAR ground truth on a novel challenging suburban dataset and show its advantages over the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhou_Exploiting_Object_Similarity_ICCV_2015_paper.pdf", - "aff": "Nat\u2019l Engineering Laboratory for Video Technology + Cooperative Medianet Innovation Center, Peking University, China; MPI for Intelligent Systems T\u00a8ubingen; Nat\u2019l Engineering Laboratory for Video Technology + Cooperative Medianet Innovation Center, Peking University, China; MPI for Intelligent Systems T\u00a8ubingen", + "aff": "Nat’l Engineering Laboratory for Video Technology + Cooperative Medianet Innovation Center, Peking University, China; MPI for Intelligent Systems T¨ubingen; Nat’l Engineering Laboratory for Video Technology + Cooperative Medianet Innovation Center, Peking University, China; MPI for Intelligent Systems T¨ubingen", "project": "", "github": "", "supp": "", @@ -5124,12 +5286,13 @@ "aff_unique_index": "0+1;2;0+1;2", "aff_unique_norm": "National Engineering Laboratory for Video Technology;Peking University;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";Cooperative Medianet Innovation Center;", - "aff_unique_url": 
";http://www.pku.edu.cn;https://www.mpituebingen.mpg.de", - "aff_unique_abbr": ";PKU;MPI-IS", + "aff_unique_url": ";http://www.pku.edu.cn;https://www.mpi-is.mpg.de", + "aff_unique_abbr": ";Peking U;MPI-IS", "aff_campus_unique_index": ";1;;1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0+0;1;0+0;1", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Zhou_2015_ICCV,\n \n author = {\n Zhou,\n Chen and Guney,\n Fatma and Wang,\n Yizhou and Geiger,\n Andreas\n},\n title = {\n Exploiting Object Similarity in 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "15dae868f1", @@ -5163,7 +5326,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Guildford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Lebeda_2015_ICCV,\n \n author = {\n Lebeda,\n Karel and Hadfield,\n Simon and Bowden,\n Richard\n},\n title = {\n Exploring Causal Relationships in Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4d74e927e8", @@ -5197,7 +5361,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yokoya_2015_ICCV,\n \n author = {\n Yokoya,\n Ryunosuke and Nayar,\n Shree K.\n},\n title = {\n Extended Depth of Field Catadioptric Imaging Using Focal Sweep\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9cd80aa219", @@ -5224,14 +5389,15 @@ 
"author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_External_Patch_Prior_ICCV_2015_paper.html", "aff_unique_index": "0;1;2+3", - "aff_unique_norm": "Fuzhou University;Hong Kong Polytechnic University;Zhejiang University;State Key Laboratory of CAD & CG", + "aff_unique_norm": "Fuzhou University;The Hong Kong Polytechnic University;Zhejiang University;State Key Laboratory of CAD & CG", "aff_unique_dep": "College of Mathematics and Computer Science;Dept. of Computing;College of Information Science & Electronic Engineering;", "aff_unique_url": "https://www.fzu.edu.cn;https://www.polyu.edu.hk;http://www.zju.edu.cn;", "aff_unique_abbr": "FZU;PolyU;ZJU;", "aff_campus_unique_index": "0;1;", "aff_campus_unique": "Fuzhou;Hong Kong;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Fei and Zhang,\n Lei and Yu,\n Huimin\n},\n title = {\n External Patch Prior Guided Internal Clustering for Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b865618b2d", @@ -5258,14 +5424,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Meng_Extraction_of_Virtual_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Institute of Automation Chinese Academy of Sciences;Xi'an Jiao Tong University", + "aff_unique_norm": "Institute of Automation Chinese Academy of Sciences;Xi'an Jiaotong University", "aff_unique_dep": "National Laboratory of Pattern Recognition;Institute of Artificial Intelligence and Robotics", "aff_unique_url": "http://www.ia.cas.cn;http://www.xjtu.edu.cn", "aff_unique_abbr": "IAS;XJTU", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Beijing;Xi'an", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Meng_2015_ICCV,\n \n author = {\n Meng,\n Gaofeng and Huang,\n Zuming and Song,\n Yonghong and Xiang,\n Shiming and Pan,\n Chunhong\n},\n title = {\n Extraction of Virtual Baselines From Distorted Document Images Using Curvilinear Projection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f9d87e4047", @@ -5274,7 +5441,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Michal Bu\u0161ta, Luk\u00e1\u0161 Neumann, Ji\u0159\u00ed Matas", + "author_site": "Michal Bušta, Lukáš Neumann, Jiří Matas", "author": "Michal Busta; Lukas Neumann; Jiri Matas", "abstract": "We propose a novel easy-to-implement stroke detector based on an efficient pixel intensity comparison to surrounding pixels. Stroke-specific keypoints are efficiently detected and text fragments are subsequently extracted by local thresholding guided by keypoint properties. Classification based on effectively calculated features then eliminates non-text regions. The stroke-specific keypoints produce 2 times less region segmentations and still detect 25% more characters than the commonly exploited MSER detector and the process is 4 times faster. After a novel efficient classification step, the number of regions is reduced to 7 times less than the standard method and is still almost 3 times faster. All stages of the proposed pipeline are scale- and rotation-invariant and support a wide variety of scripts (Latin, Hebrew, Chinese, etc.) and fonts. 
When the proposed detector is plugged into a scene text localization and recognition pipeline, a state-of-the-art text localization accuracy is maintained whilst the processing time is significantly reduced.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Busta_FASText_Efficient_Unconstrained_ICCV_2015_paper.pdf", @@ -5299,7 +5466,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Busta_2015_ICCV,\n \n author = {\n Busta,\n Michal and Neumann,\n Lukas and Matas,\n Jiri\n},\n title = {\n FASText: Efficient Unconstrained Scene Text Detector\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4d5dbed720", @@ -5333,7 +5501,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Snape_2015_ICCV,\n \n author = {\n Snape,\n Patrick and Roussos,\n Anastasios and Panagakis,\n Yannis and Zafeiriou,\n Stefanos\n},\n title = {\n Face Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f208511e77", @@ -5358,7 +5527,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Malleson_FaceDirector_Continuous_Control_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Malleson_FaceDirector_Continuous_Control_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Malleson_2015_ICCV,\n \n author = {\n Malleson,\n Charles and Bazin,\n Jean-Charles and Wang,\n Oliver and Bradley,\n Derek and Beeler,\n Thabo and 
Hilton,\n Adrian and Sorkine-Hornung,\n Alexander\n},\n title = {\n FaceDirector: Continuous Control of Facial Performance in Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ef1aa42efa", @@ -5392,7 +5562,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1+1;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Xu and Yu,\n Felix X. and Guo,\n Ruiqi and Kumar,\n Sanjiv and Wang,\n Shengjin and Chang,\n Shi-Fu\n},\n title = {\n Fast Orthogonal Projection Based on Kronecker Product\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "edb6043a25", @@ -5401,6 +5572,7 @@ "status": "Oral", "track": "main", "pid": "", + "author_site": "Ross Girshick", "author": "Ross Girshick", "abstract": "This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate. 
Fast R-CNN is implemented in Python and C++ (using Caffe) and is available under the open-source MIT License at https://github.com/rbgirshick/fast-rcnn.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Girshick_Fast_R-CNN_ICCV_2015_paper.pdf", @@ -5418,12 +5590,13 @@ "author_num": 1, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Girshick_Fast_R-CNN_ICCV_2015_paper.html", "aff_unique_index": "0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Corporation", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Girshick_2015_ICCV,\n \n author = {\n Girshick,\n Ross\n},\n title = {\n Fast R-CNN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f1e7a2557c", @@ -5457,7 +5630,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Lee_2015_ICCV,\n \n author = {\n Lee,\n Donghoon and Yang,\n Ming-Hsuan and Oh,\n Songhwai\n},\n title = {\n Fast and Accurate Head Pose Estimation via Random Projection Forests\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4a3619a230", @@ -5481,7 +5655,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Nguyen_Fast_and_Effective_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Nguyen_Fast_and_Effective_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Nguyen_2015_ICCV,\n \n 
author = {\n Nguyen,\n Rang M. H. and Brown,\n Michael S.\n},\n title = {\n Fast and Effective L0 Gradient Minimization by Region Fusion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0b28c30f0f", @@ -5506,7 +5681,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Fill_and_Transfer_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Fill_and_Transfer_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Yu_2015_ICCV,\n \n author = {\n Yu,\n Lap-Fai and Duncan,\n Noah and Yeung,\n Sai-Kit\n},\n title = {\n Fill and Transfer: A Simple Physics-Based Approach for Containability Reasoning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e6e4ae6be6", @@ -5531,7 +5707,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Feng_Fine-Grained_Change_Detection_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Feng_Fine-Grained_Change_Detection_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Feng_2015_ICCV,\n \n author = {\n Feng,\n Wei and Tian,\n Fei-Peng and Zhang,\n Qian and Zhang,\n Nan and Wan,\n Liang and Sun,\n Jizhou\n},\n title = {\n Fine-Grained Change Detection of Misaligned Scenes With Varied Illuminations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "26a486ed2a", @@ -5544,7 +5721,7 @@ "author": "Bryan A. Plummer; Liwei Wang; Chris M. Cervantes; Juan C. Caicedo; Julia Hockenmaier; Svetlana Lazebnik", "abstract": "The Flickr30k dataset has become a standard benchmark for sentence-based image description. 
This paper presents Flickr30k Entities, which augments the 158k captions from Flickr30k with 244k coreference chains linking mentions of the same entities in images, as well as 276k manually annotated bounding boxes corresponding to each entity. Such annotation is essential for continued progress in automatic image description and grounded language understanding. We present experiments demonstrating the usefulness of our annotations for text-to-image reference resolution, or the task of localizing textual entity mentions in an image, and for bidirectional image-sentence retrieval. These experiments confirm that we can further improve the accuracy of state-of-the-art retrieval methods by training with explicit region-to-phrase correspondence, but at the same time, they show that accurately inferring this correspondence given an image and a caption remains really challenging.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Plummer_Flickr30k_Entities_Collecting_ICCV_2015_paper.pdf", - "aff": "Univ. of Illinois at Urbana-Champaign; Univ. of Illinois at Urbana-Champaign; Univ. of Illinois at Urbana-Champaign; Fundaci \u00b4on Univ. Konrad Lorenz; Univ. of Illinois at Urbana-Champaign; Univ. of Illinois at Urbana-Champaign", + "aff": "Univ. of Illinois at Urbana-Champaign; Univ. of Illinois at Urbana-Champaign; Univ. of Illinois at Urbana-Champaign; Fundaci ´on Univ. Konrad Lorenz; Univ. of Illinois at Urbana-Champaign; Univ. 
of Illinois at Urbana-Champaign", "project": "", "github": "", "supp": "", @@ -5558,14 +5735,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Plummer_Flickr30k_Entities_Collecting_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Fundaci\u00f3n Universitaria Konrad Lorenz", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Fundación Universitaria Konrad Lorenz", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.fundacionkl.org.co", "aff_unique_abbr": "UIUC;FUKL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "United States;Colombia" + "aff_country_unique": "United States;Colombia", + "bibtex": "@InProceedings{Plummer_2015_ICCV,\n \n author = {\n Plummer,\n Bryan A. and Wang,\n Liwei and Cervantes,\n Chris M. and Caicedo,\n Juan C. and Hockenmaier,\n Julia and Lazebnik,\n Svetlana\n},\n title = {\n Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d4d6f6a654", @@ -5599,7 +5777,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Bailer_2015_ICCV,\n \n author = {\n Bailer,\n Christian and Taetz,\n Bertram and Stricker,\n Didier\n},\n title = {\n Flow Fields: Dense Correspondence Fields for Highly Accurate Large Displacement Optical Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a02158d33c", @@ -5608,7 +5787,7 @@ "status": "Poster", 
"track": "main", "pid": "", - "author_site": "Alexey Dosovitskiy, Philipp Fischer, Eddy Ilg, Philip H\u00e4usser, Caner Haz\u0131rba\u015f, Vladimir Golkov, Patrick van der Smagt, Daniel Cremers, Thomas Brox", + "author_site": "Alexey Dosovitskiy, Philipp Fischer, Eddy Ilg, Philip Häusser, Caner Hazırbaş, Vladimir Golkov, Patrick van der Smagt, Daniel Cremers, Thomas Brox", "author": "Alexey Dosovitskiy; Philipp Fischer; Eddy Ilg; Philip Hausser; Caner Hazirbas; Vladimir Golkov; Patrick van der Smagt; Daniel Cremers; Thomas Brox", "abstract": "Convolutional neural networks (CNNs) have recently been very successful in a variety of computer vision tasks, especially on those linked to recognition. Optical flow estimation has not been among the tasks CNNs succeeded at. In this paper we construct CNNs which are capable of solving the optical flow estimation problem as a supervised learning task. We propose and compare two architectures: a generic architecture and another one including a layer that correlates feature vectors at different image locations. Since existing ground truth data sets are not sufficiently large to train a CNN, we generate a large synthetic Flying Chairs dataset. 
We show that networks trained on this unrealistic data still generalize very well to existing datasets such as Sintel and KITTI, achieving competitive accuracy at frame rates of 5 to 10 fps.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Dosovitskiy_FlowNet_Learning_Optical_ICCV_2015_paper.pdf", @@ -5633,7 +5812,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Dosovitskiy_2015_ICCV,\n \n author = {\n Dosovitskiy,\n Alexey and Fischer,\n Philipp and Ilg,\n Eddy and Hausser,\n Philip and Hazirbas,\n Caner and Golkov,\n Vladimir and van der Smagt,\n Patrick and Cremers,\n Daniel and Brox,\n Thomas\n},\n title = {\n FlowNet: Learning Optical Flow With Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d8f4cf4a74", @@ -5667,7 +5847,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Pfister_2015_ICCV,\n \n author = {\n Pfister,\n Tomas and Charles,\n James and Zisserman,\n Andrew\n},\n title = {\n Flowing ConvNets for Human Pose Estimation in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0083283afd", @@ -5680,7 +5861,7 @@ "author": "Philip Lenz; Andreas Geiger; Raquel Urtasun", "abstract": "One of the most popular approaches to multi-target tracking is tracking-by-detection. 
Current min-cost flow algorithms which solve the data association problem optimally have three main drawbacks: they are computationally expensive, they assume that the whole video is given as a batch, and they scale badly in memory and computation with the length of the video sequence. In this paper, we address each of these issues, resulting in a computationally and memory-bounded solution. First, we introduce a dynamic version of the successive shortest-path algorithm which solves the data association problem optimally while reusing computation, resulting in faster inference than standard solvers. Second, we address the optimal solution to the data association problem when dealing with an incoming stream of data (i.e., online setting). Finally, we present our main contribution which is an approximate online solution with bounded memory and computation which is capable of handling videos of arbitrary length while performing tracking in real time. We demonstrate the effectiveness of our algorithms on the KITTI and PETS2009 benchmarks and show state-of-the-art performance, while being significantly faster than existing solvers.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Lenz_FollowMe_Efficient_Online_ICCV_2015_paper.pdf", - "aff": "Karlsruhe Institute of Technology; MPI T\u00fcbingen; University of Toronto", + "aff": "Karlsruhe Institute of Technology; MPI Tübingen; University of Toronto", "project": "", "github": "", "supp": "", @@ -5699,9 +5880,10 @@ "aff_unique_url": "https://www.kit.edu;https://www.cbs.mpg.de;https://www.utoronto.ca", "aff_unique_abbr": "KIT;MPI CBS;U of T", "aff_campus_unique_index": "1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Germany;Canada" + "aff_country_unique": "Germany;Canada", + "bibtex": "@InProceedings{Lenz_2015_ICCV,\n \n author = {\n Lenz,\n Philip and Geiger,\n Andreas and Urtasun,\n Raquel\n},\n title = {\n 
FollowMe: Efficient Online Min-Cost Flow Tracking With Bounded Memory and Computation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a9b55bf609", @@ -5735,7 +5917,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Qian_2015_ICCV,\n \n author = {\n Qian,\n Yiming and Gong,\n Minglun and Yang,\n Yee-Hong\n},\n title = {\n Frequency-Based Environment Matting by Compressive Sensing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "736b3e4cf5", @@ -5769,7 +5952,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Ruiz_2015_ICCV,\n \n author = {\n Ruiz,\n Adria and Van de Weijer,\n Joost and Binefa,\n Xavier\n},\n title = {\n From Emotions to Action Units With Hidden and Semi-Hidden-Task Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e9b1844b30", @@ -5796,14 +5980,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yang_From_Facial_Parts_ICCV_2015_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen Institute of Advanced Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Shenzhen Institutes of Advanced Technology", "aff_unique_dep": "Department of Information Engineering;Shenzhen Key Lab of Comp. Vis. & Pat. 
Rec.", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.ac.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0+1;0+1;0+1;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2015_ICCV,\n \n author = {\n Yang,\n Shuo and Luo,\n Ping and Loy,\n Chen-Change and Tang,\n Xiaoou\n},\n title = {\n From Facial Parts Responses to Face Detection: A Deep Learning Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b3c8436c35", @@ -5837,7 +6022,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "BEIJING", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dai_2015_ICCV,\n \n author = {\n Dai,\n Longquan and Yuan,\n Mengke and Zhang,\n Feihu and Zhang,\n Xiaopeng\n},\n title = {\n Fully Connected Guided Image Filtering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b915a3d428", @@ -5862,7 +6048,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Perazzi_Fully_Connected_Object_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Perazzi_Fully_Connected_Object_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Perazzi_2015_ICCV,\n \n author = {\n Perazzi,\n Federico and Wang,\n Oliver and Gross,\n Markus and Sorkine-Hornung,\n Alexander\n},\n title = {\n Fully Connected Object Proposals for Video Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f6588e3284", @@ 
-5892,11 +6079,12 @@ "aff_unique_norm": "University of Surrey", "aff_unique_dep": "CVSSP", "aff_unique_url": "https://www.surrey.ac.uk", - "aff_unique_abbr": "", + "aff_unique_abbr": "Surrey", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Guildford", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Mustafa_2015_ICCV,\n \n author = {\n Mustafa,\n Armin and Kim,\n Hansung and Guillemaut,\n Jean-Yves and Hilton,\n Adrian\n},\n title = {\n General Dynamic Scene Reconstruction From Multiple View Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f57d9b43ad", @@ -5930,7 +6118,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Soran_2015_ICCV,\n \n author = {\n Soran,\n Bilge and Farhadi,\n Ali and Shapiro,\n Linda\n},\n title = {\n Generating Notifications for Missing Actions: Don't Forget to Turn the Lights Off!\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "110447cf19", @@ -5964,7 +6153,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Jiang_2015_ICCV,\n \n author = {\n Jiang,\n Peng and Vasconcelos,\n Nuno and Peng,\n Jingliang\n},\n title = {\n Generic Promotion of Diffusion-Based Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "315d8111f8", @@ 
-5998,7 +6188,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Huang_2015_ICCV,\n \n author = {\n Huang,\n Jiaji and Qiu,\n Qiang and Calderbank,\n Robert and Sapiro,\n Guillermo\n},\n title = {\n Geometry-Aware Deep Transform\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "33b07a6fc9", @@ -6032,7 +6223,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Cui_2015_ICCV,\n \n author = {\n Cui,\n Zhaopeng and Tan,\n Ping\n},\n title = {\n Global Structure-From-Motion by Similarity Averaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8823f0f312", @@ -6066,7 +6258,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Freiburg", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Ummenhofer_2015_ICCV,\n \n author = {\n Ummenhofer,\n Benjamin and Brox,\n Thomas\n},\n title = {\n Global,\n Dense Multiscale Reconstruction for a Billion Points\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "502fa7645b", @@ -6091,7 +6284,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Brown_Globally_Optimal_2D-3D_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Brown_Globally_Optimal_2D-3D_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Brown_2015_ICCV,\n 
\n author = {\n Brown,\n Mark and Windridge,\n David and Guillemaut,\n Jean-Yves\n},\n title = {\n Globally Optimal 2D-3D Registration From Points or Lines Without Correspondences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c0528b53f7", @@ -6125,7 +6319,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Ziming and Chen,\n Yuting and Saligrama,\n Venkatesh\n},\n title = {\n Group Membership Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3709041920", @@ -6134,7 +6329,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "\u00c1lvaro Parra Bustos, Tat-Jun Chin", + "author_site": "Álvaro Parra Bustos, Tat-Jun Chin", "author": "Alvaro Parra Bustos; Tat-Jun Chin", "abstract": "Rotation search has become a core routine for solving many computer vision problems. The aim is to rotationally align two input point sets with correspondences. Recently, there is significant interest in developing globally optimal rotation search algorithms. A notable weakness of global algorithms, however, is their relatively high computational cost, especially on large problem sizes and data with a high proportion of outliers. In this paper, we propose a novel outlier removal technique for rotation search. Our method guarantees that any correspondence it discards as an outlier does not exist in the inlier set of the globally optimal rotation for the original data. Based on simple geometric operations, our algorithm is deterministic and fast. 
Used as a preprocessor to prune a large portion of the outliers from the input data, our method enables substantial speed-up of rotation search algorithms without compromising global optimality. We demonstrate the efficacy of our method in various synthetic and real data experiments.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Bustos_Guaranteed_Outlier_Removal_ICCV_2015_paper.pdf", @@ -6152,14 +6347,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bustos_Guaranteed_Outlier_Removal_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Adelaide", + "aff_unique_norm": "The University of Adelaide", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.adelaide.edu.au", "aff_unique_abbr": "Adelaide", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Adelaide", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Bustos_2015_ICCV,\n \n author = {\n Bustos,\n Alvaro Parra and Chin,\n Tat-Jun\n},\n title = {\n Guaranteed Outlier Removal for Rotation Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8dfdeeea11", @@ -6186,14 +6382,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Jia_Guiding_the_Long-Short_ICCV_2015_paper.html", "aff_unique_index": "0;1;2+0;0", - "aff_unique_norm": "KU Leuven;University of Amsterdam;Australian National University", + "aff_unique_norm": "KU Leuven;University of Amsterdam;The Australian National University", "aff_unique_dep": "ESAT-PSI;QUV A Lab;ACRV", "aff_unique_url": "https://www.kuleuven.be;https://www.uva.nl;https://www.anu.edu.au", "aff_unique_abbr": "KU Leuven;UvA;ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2+0;0", - "aff_country_unique": 
"Belgium;Netherlands;Australia" + "aff_country_unique": "Belgium;Netherlands;Australia", + "bibtex": "@InProceedings{Jia_2015_ICCV,\n \n author = {\n Jia,\n Xu and Gavves,\n Efstratios and Fernando,\n Basura and Tuytelaars,\n Tinne\n},\n title = {\n Guiding the Long-Short Term Memory Model for Image Caption Generation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f1e6252e69", @@ -6227,7 +6424,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Zou_2015_ICCV,\n \n author = {\n Zou,\n Wenbin and Komodakis,\n Nikos\n},\n title = {\n HARF: Hierarchy-Associated Rich Features for Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ba527a2886", @@ -6254,14 +6452,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yan_HD-CNN_Hierarchical_Deep_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;2;2;2;3+0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Carnegie Mellon University;eBay;University of Hong Kong", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Carnegie Mellon University;eBay;The University of Hong Kong", "aff_unique_dep": ";;eBay Research Lab;", "aff_unique_url": "https://illinois.edu;https://www.cmu.edu;https://www.ebayinc.com;https://www.hku.hk", "aff_unique_abbr": "UIUC;CMU;eBay;HKU", "aff_campus_unique_index": "0;2+0", "aff_campus_unique": "Urbana-Champaign;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;1+0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Yan_2015_ICCV,\n \n author = {\n Yan,\n Zhicheng 
and Zhang,\n Hao and Piramuthu,\n Robinson and Jagadeesh,\n Vignesh and DeCoste,\n Dennis and Di,\n Wei and Yu,\n Yizhou\n},\n title = {\n HD-CNN: Hierarchical Deep Convolutional Neural Networks for Large Scale Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2161c495f7", @@ -6295,7 +6494,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chao_2015_ICCV,\n \n author = {\n Chao,\n Yu-Wei and Wang,\n Zhan and He,\n Yugeng and Wang,\n Jiaxuan and Deng,\n Jia\n},\n title = {\n HICO: A Benchmark for Recognizing Human-Object Interactions in Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "43db8988dc", @@ -6320,7 +6520,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wu_Harvesting_Discriminative_Meta_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wu_Harvesting_Discriminative_Meta_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wu_2015_ICCV,\n \n author = {\n Wu,\n Ruobing and Wang,\n Baoyuan and Wang,\n Wenping and Yu,\n Yizhou\n},\n title = {\n Harvesting Discriminative Meta Objects With Deep CNN Features for Scene Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c5e26feb98", @@ -6347,14 +6548,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ma_Hierarchical_Convolutional_Features_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;2", - "aff_unique_norm": "Shanghai Jiao 
Tong University;University of Illinois Urbana-Champaign;University of California, Merced", + "aff_unique_norm": "Shanghai Jiao Tong University;University of Illinois at Urbana-Champaign;University of California, Merced", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www illinois.edu;https://www.ucmerced.edu", "aff_unique_abbr": "SJTU;UIUC;UCM", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Urbana-Champaign;Merced", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ma_2015_ICCV,\n \n author = {\n Ma,\n Chao and Huang,\n Jia-Bin and Yang,\n Xiaokang and Yang,\n Ming-Hsuan\n},\n title = {\n Hierarchical Convolutional Features for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c166516a5c", @@ -6381,14 +6583,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Pham_Hierarchical_Higher-Order_Regression_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "University of Adelaide;Australian National University", + "aff_unique_norm": "The University of Adelaide;The Australian National University", "aff_unique_dep": "School of Computer Science;Research School of Computer Science", "aff_unique_url": "https://www.adelaide.edu.au;https://www.anu.edu.au", "aff_unique_abbr": "Adelaide;ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Pham_2015_ICCV,\n \n author = {\n Pham,\n Trung T. 
and Reid,\n Ian and Latif,\n Yasir and Gould,\n Stephen\n},\n title = {\n Hierarchical Higher-Order Regression Forest Fields: An Application to 3D Indoor Scene Labelling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ac2c57eb53", @@ -6422,7 +6625,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Im_2015_ICCV,\n \n author = {\n Im,\n Sunghoon and Ha,\n Hyowon and Choe,\n Gyeongmin and Jeon,\n Hae-Gon and Joo,\n Kyungdon and Kweon,\n In So\n},\n title = {\n High Quality Structure From Small Motion for Rolling Shutter Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2cf3421fd7", @@ -6456,7 +6660,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bertasius_2015_ICCV,\n \n author = {\n Bertasius,\n Gedas and Shi,\n Jianbo and Torresani,\n Lorenzo\n},\n title = {\n High-for-Low and Low-for-High: Efficient Boundary Detection From Deep Object Features and its Applications to High-Level Vision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f2c0a449f2", @@ -6490,7 +6695,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Jingbo and Wang,\n Jinglu and Fang,\n Tian and Tai,\n Chiew-Lan and Quan,\n Long\n},\n 
title = {\n Higher-Order CRF Structural Segmentation of 3D Reconstructed Surfaces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f983f18f29", @@ -6524,7 +6730,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Jian and Djolonga,\n Josip and Krause,\n Andreas\n},\n title = {\n Higher-Order Inference for Multi-Class Log-Supermodular Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "14dd9bbec1", @@ -6533,7 +6740,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Oren Freifeld, S\u00f8ren Hauberg, Kayhan Batmanghelich, John W. Fisher III", + "author_site": "Oren Freifeld, Søren Hauberg, Kayhan Batmanghelich, John W. Fisher III", "author": "Oren Freifeld; Soren Hauberg; Kayhan Batmanghelich; John W. Fisher III", "abstract": "We propose novel finite-dimensional spaces of R - R transformations, n [?] 1, 2, 3, derived from (continuously-defined) parametric stationary velocity fields. Particularly, we obtain these transformations, which are diffeomorphisms, by fast and highly-accurate integration of continuous piecewise-affine velocity fields; we also provide an exact solution for n = 1. The simple-yet-highly-expressive proposed representation handles optional constraints (e.g., volume preservation) easily and supports convenient modeling choices and rapid likelihood evaluations (facilitating tractable inference over latent transformations). 
Its applications include, but are not limited to: unconstrained optimization over monotonic functions; modeling cumulative distribution functions or histograms; time warping; image registration; landmark-based warping; real-time diffeomorphic image editing. Our code is available at https://github.com/freifeld/cpabDiffeo", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Freifeld_Highly-Expressive_Spaces_of_ICCV_2015_paper.pdf", @@ -6558,7 +6765,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United States;Denmark" + "aff_country_unique": "United States;Denmark", + "bibtex": "@InProceedings{Freifeld_2015_ICCV,\n \n author = {\n Freifeld,\n Oren and Hauberg,\n Soren and Batmanghelich,\n Kayhan and Fisher,\n III,\n John W.\n},\n title = {\n Highly-Expressive Spaces of Well-Behaved Transformations: Keeping It Simple\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9fabdd5679", @@ -6582,7 +6790,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xie_Holistically-Nested_Edge_Detection_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xie_Holistically-Nested_Edge_Detection_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Xie_2015_ICCV,\n \n author = {\n Xie,\n Saining and Tu,\n Zhuowen\n},\n title = {\n Holistically-Nested Edge Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "dc2427e74f", @@ -6607,7 +6816,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Glasner_Hot_or_Not_ICCV_2015_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Glasner_Hot_or_Not_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Glasner_2015_ICCV,\n \n author = {\n Glasner,\n Daniel and Fua,\n Pascal and Zickler,\n Todd and Zelnik-Manor,\n Lihi\n},\n title = {\n Hot or Not: Exploring Correlations Between Appearance and Temperature\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4c227cc475", @@ -6635,13 +6845,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Sun_Human_Action_Recognition_ICCV_2015_paper.html", "aff_unique_index": "0+1;2;0;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Lenovo Corporate Research;University of Macau", - "aff_unique_dep": "Department of Electronic and Computer Engineering;;Faculty of Science and Technology", + "aff_unique_dep": "Department of Electronic and Computer Engineering;Corporate Research;Faculty of Science and Technology", "aff_unique_url": "https://www.ust.hk;https://www.lenovo.com;https://www.um.edu.mo", "aff_unique_abbr": "HKUST;LCR;UM", "aff_campus_unique_index": "0+0;1;0;0", "aff_campus_unique": "Hong Kong SAR;Macau SAR", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2015_ICCV,\n \n author = {\n Sun,\n Lin and Jia,\n Kui and Yeung,\n Dit-Yan and Shi,\n Bertram E.\n},\n title = {\n Human Action Recognition Using Factorized Spatio-Temporal Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a8c88ee275", @@ -6675,7 +6886,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;2;2;0;0;0;1", - "aff_country_unique": "China;Singapore;United States" + "aff_country_unique": "China;Singapore;United States", + "bibtex": 
"@InProceedings{Liang_2015_ICCV,\n \n author = {\n Liang,\n Xiaodan and Xu,\n Chunyan and Shen,\n Xiaohui and Yang,\n Jianchao and Liu,\n Si and Tang,\n Jinhui and Lin,\n Liang and Yan,\n Shuicheng\n},\n title = {\n Human Parsing With Contextualized Convolutional Neural Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0b5130726f", @@ -6699,7 +6911,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_Human_Pose_Estimation_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_Human_Pose_Estimation_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Dong and Shah,\n Mubarak\n},\n title = {\n Human Pose Estimation in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "80515844d3", @@ -6708,11 +6921,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Torsten Sattler, Michal Havlena, Filip Radenovi\u0107, Konrad Schindler, Marc Pollefeys", + "author_site": "Torsten Sattler, Michal Havlena, Filip Radenović, Konrad Schindler, Marc Pollefeys", "author": "Torsten Sattler; Michal Havlena; Filip Radenovic; Konrad Schindler; Marc Pollefeys", "abstract": "Structure-based localization is the task of finding the absolute pose of a given query image w.r.t. a pre-computed 3D model. While this is almost trivial at small scale, special care must be taken as the size of the 3D model grows, because straight-forward descriptor matching becomes ineffective due to the large memory footprint of the model, as well as the strictness of the ratio test in 3D. 
Recently, several authors have tried to overcome these problems, either by a smart compression of the 3D model or by clever sampling strategies for geometric verification. Here we explore an orthogonal strategy, which uses all the 3D points and standard sampling, but performs feature matching implicitly, by quantization into a fine vocabulary. We show that although this matching is ambiguous and gives rise to 3D hyperpoints when matching each 2D query feature in isolation, a simple voting strategy, which enforces the fact that the selected 3D points shall be co-visible, can reliably find a locally unique 2D-3D point assignment. Experiments on two large-scale datasets demonstrate that our method achieves state-of-the-art performance, while the memory footprint is greatly reduced, since only visual word labels but no 3D point descriptors need to be stored.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Sattler_Hyperpoints_and_Fine_ICCV_2015_paper.pdf", - "aff": "Department of Computer Science, ETH Z\u00fcrich, Switzerland; Institute of Geodesy and Photogrammetry, ETH Z\u00fcrich, Switzerland; CMP, Faculty of Electrical Engineering, Czech Technical University in Prague; Institute of Geodesy and Photogrammetry, ETH Z\u00fcrich, Switzerland; Department of Computer Science, ETH Z\u00fcrich, Switzerland", + "aff": "Department of Computer Science, ETH Zürich, Switzerland; Institute of Geodesy and Photogrammetry, ETH Zürich, Switzerland; CMP, Faculty of Electrical Engineering, Czech Technical University in Prague; Institute of Geodesy and Photogrammetry, ETH Zürich, Switzerland; Department of Computer Science, ETH Zürich, Switzerland", "project": "", "github": "", "supp": "", @@ -6726,14 +6939,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Sattler_Hyperpoints_and_Fine_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "ETH Zurich;Czech Technical University in Prague", + "aff_unique_norm": "ETH 
Zürich;Czech Technical University in Prague", "aff_unique_dep": "Department of Computer Science;Faculty of Electrical Engineering", "aff_unique_url": "https://www.ethz.ch;https://www.cvut.cz", "aff_unique_abbr": "ETHZ;CTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Switzerland;Czech Republic" + "aff_country_unique": "Switzerland;Czech Republic", + "bibtex": "@InProceedings{Sattler_2015_ICCV,\n \n author = {\n Sattler,\n Torsten and Havlena,\n Michal and Radenovic,\n Filip and Schindler,\n Konrad and Pollefeys,\n Marc\n},\n title = {\n Hyperpoints and Fine Vocabularies for Large-Scale Location Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6d1ad0b806", @@ -6746,7 +6960,7 @@ "author": "Lei Zhang; Wei Wei; Yanning Zhang; Fei Li; Chunhua Shen; Qinfeng Shi", "abstract": "To reconstruct hyperspectral image (HSI) accurately from a few noisy compressive measurements, we present a novel manifold-structured sparsity prior based hyperspectral compressive sensing (HCS) method in this study. A matrix based hierarchical prior is first proposed to represent the spectral structured sparsity and spatial unknown manifold structure of HSI simultaneously. Then, a latent variable Bayes model is introduced to learn the sparsity prior and estimate the noise jointly from measurements. The learned prior can fully represent the inherent 3D structure of HSI and regulate its shape based on the estimated noise level. Thus, with this learned prior, the proposed method improves the reconstruction accuracy significantly and shows strong robustness to unknown noise in HCS. 
Experiments on four real hyperspectral datasets show that the proposed method outperforms several state-of-the-art methods on the reconstruction accuracy of HSI.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhang_Hyperspectral_Compressive_Sensing_ICCV_2015_paper.pdf", - "aff": "School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, 710072, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, 710072, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, 710072, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, 710072, China; School of Computer Science, The University of Adelaide, Australia; School of Computer Science, The University of Adelaide, Australia", + "aff": "School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, 710072, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, 710072, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, 710072, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, 710072, China; School of Computer Science, The University of Adelaide, Australia; School of Computer Science, The University of Adelaide, Australia", "project": "", "github": "", "supp": "", @@ -6760,14 +6974,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_Hyperspectral_Compressive_Sensing_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0;1;1", - "aff_unique_norm": "Northwestern Polytechnical University;University of Adelaide", + "aff_unique_norm": "Northwestern Polytechnical University;The University of Adelaide", "aff_unique_dep": "School of Computer Science and Engineering;School of Computer Science", "aff_unique_url": 
"http://www.nwpu.edu.cn;https://www.adelaide.edu.au", - "aff_unique_abbr": "NPU;Adelaide", + "aff_unique_abbr": "NWPU;Adelaide", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;0;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Lei and Wei,\n Wei and Zhang,\n Yanning and Li,\n Fei and Shen,\n Chunhua and Shi,\n Qinfeng\n},\n title = {\n Hyperspectral Compressive Sensing Using Manifold-Structured Sparsity Prior\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "606dfce57f", @@ -6780,7 +6995,7 @@ "author": "Charis Lanaras; Emmanuel Baltsavias; Konrad Schindler", "abstract": "Hyperspectral cameras capture images with many narrow spectral channels, which densely sample the electromagnetic spectrum. The detailed spectral resolution is useful for many image analysis problems, but it comes at the cost of much lower spatial resolution. Hyperspectral super-resolution addresses this problem, by fusing a low-resolution hyperspectral image and a conventional high-resolution image into a product of both high spatial and high spectral resolution. In this paper, we propose a method which performs hyperspectral super-resolution by jointly unmixing the two input images into the pure reflectance spectra of the observed materials and the associated mixing coefficients. The formulation leads to a coupled matrix factorisation problem, with a number of useful constraints imposed by elementary physical properties of spectral mixing. 
In experiments with two benchmark datasets we show that the proposed approach delivers improved hyperspectral super-resolution.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Lanaras_Hyperspectral_Super-Resolution_by_ICCV_2015_paper.pdf", - "aff": "Institute of Geodesy and Photogrammetry, ETH Z\u00fcrich, Switzerland; Institute of Geodesy and Photogrammetry, ETH Z\u00fcrich, Switzerland; Institute of Geodesy and Photogrammetry, ETH Z\u00fcrich, Switzerland", + "aff": "Institute of Geodesy and Photogrammetry, ETH Zürich, Switzerland; Institute of Geodesy and Photogrammetry, ETH Zürich, Switzerland; Institute of Geodesy and Photogrammetry, ETH Zürich, Switzerland", "project": "", "github": "", "supp": "", @@ -6794,14 +7009,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lanaras_Hyperspectral_Super-Resolution_by_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "Institute of Geodesy and Photogrammetry", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Lanaras_2015_ICCV,\n \n author = {\n Lanaras,\n Charis and Baltsavias,\n Emmanuel and Schindler,\n Konrad\n},\n title = {\n Hyperspectral Super-Resolution by Coupled Spectral Unmixing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "637cf03aa1", @@ -6828,14 +7044,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/liu_Illumination_Robust_Color_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Xi'an Jiao Tong University", + "aff_unique_norm": "Xi'an Jiaotong University", "aff_unique_dep": "Institute 
of Artificial Intelligence and Robotics", "aff_unique_url": "http://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{liu_2015_ICCV,\n \n author = {\n liu,\n Yuanliu and Yuan,\n Zejian and Chen,\n Badong and Xue,\n Jianru and Zheng,\n Nanning\n},\n title = {\n Illumination Robust Color Naming via Label Propagation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e78d2edad9", @@ -6863,13 +7080,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Meyers_Im2Calories_Towards_an_ICCV_2015_paper.html", "aff_unique_index": "0;1;1;1;1;1;1;1;1;1", "aff_unique_norm": "University of Maryland;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www/umd.edu;https://www.google.com", "aff_unique_abbr": "UMD;Google", "aff_campus_unique_index": "1;1;1;1;1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Meyers_2015_ICCV,\n \n author = {\n Meyers,\n Austin and Johnston,\n Nick and Rathod,\n Vivek and Korattikara,\n Anoop and Gorban,\n Alex and Silberman,\n Nathan and Guadarrama,\n Sergio and Papandreou,\n George and Huang,\n Jonathan and Murphy,\n Kevin P.\n},\n title = {\n Im2Calories: Towards an Automated Mobile Vision Food Diary\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0974dd9218", @@ -6894,7 +7112,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Karacan_Image_Matting_With_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Karacan_Image_Matting_With_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Karacan_2015_ICCV,\n \n author = {\n Karacan,\n Levent and Erdem,\n Aykut and Erdem,\n Erkut\n},\n title = {\n Image Matting With KL-Divergence Based Sparse Sampling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2a79e44cfb", @@ -6928,7 +7147,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ispra", "aff_country_unique_index": "0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Rodriguez_2015_ICCV,\n \n author = {\n Rodriguez,\n Antonio L. and Sequeira,\n Vitor\n},\n title = {\n Improving Ferns Ensembles by Sparsifying and Quantising Posterior Probabilities\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "aae26c3233", @@ -6955,14 +7175,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tang_Improving_Image_Classification_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;1;1", - "aff_unique_norm": "Stanford University;Meta", + "aff_unique_norm": "Stanford University;Facebook", "aff_unique_dep": "Computer Science Department;Facebook AI Research", "aff_unique_url": "https://www.stanford.edu;https://research.facebook.com", "aff_unique_abbr": "Stanford;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tang_2015_ICCV,\n \n author = {\n Tang,\n Kevin and Paluri,\n Manohar and Fei-Fei,\n Li and Fergus,\n Rob and Bourdev,\n Lubomir\n},\n title = 
{\n Improving Image Classification With Location Context\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8bc29aceed", @@ -6996,7 +7217,8 @@ "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Albany;;Beijing", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Mei_2015_ICCV,\n \n author = {\n Mei,\n Xing and Qi,\n Honggang and Hu,\n Bao-Gang and Lyu,\n Siwei\n},\n title = {\n Improving Image Restoration With Soft-Rounding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cf083123eb", @@ -7023,14 +7245,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kirillov_Inferring_M-Best_Diverse_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Technische Universit\u00e4t Dresden;Skolkovo Institute of Science and Technology", + "aff_unique_norm": "Technische Universität Dresden;Skolkovo Institute of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tu-dresden.de;https://www.skoltech.ru", "aff_unique_abbr": "TUD;Skoltech", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Dresden;Moscow", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "Germany;Russian Federation" + "aff_country_unique": "Germany;Russia", + "bibtex": "@InProceedings{Kirillov_2015_ICCV,\n \n author = {\n Kirillov,\n Alexander and Savchynskyy,\n Bogdan and Schlesinger,\n Dmitrij and Vetrov,\n Dmitry and Rother,\n Carsten\n},\n title = {\n Inferring M-Best Diverse Labelings in a Single One\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": 
"d1de512d13", @@ -7064,7 +7287,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Verona", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Roffo_2015_ICCV,\n \n author = {\n Roffo,\n Giorgio and Melzi,\n Simone and Cristani,\n Marco\n},\n title = {\n Infinite Feature Selection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "fc9b2b22b7", @@ -7098,7 +7322,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Hsin-I and Chen,\n Yi-Ling and Lee,\n Wei-Tse and Wang,\n Fan and Chen,\n Bing-Yu\n},\n title = {\n Integrating Dashcam Views Through Inter-Video Mapping\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5662d61c3c", @@ -7132,7 +7357,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0+1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zuo_2015_ICCV,\n \n author = {\n Zuo,\n Xinxin and Du,\n Chao and Wang,\n Sen and Zheng,\n Jiangbin and Yang,\n Ruigang\n},\n title = {\n Interactive Visual Hull Refinement for Specular and Transparent Object Surface Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4f91ec091d", @@ -7145,7 +7371,7 @@ "author": "Hyunwoo J. Kim; Nagesh Adluru; Monami Banerjee; Baba C. 
Vemuri; Vikas Singh", "abstract": "Probability density functions (PDFs) are fundamental \"objects\" in mathematics with numerous applications in computer vision, machine learning and medical imaging. The feasibility of basic operations such as computing the distance between two PDFs and estimating a mean of a set of PDFs is a direct function of the representation we choose to work with. In this paper, we study the Gaussian mixture model (GMM) representation of the PDFs motivated by its numerous attractive features. (1) GMMs are arguably more interpretable than, say, square root parameterizations (2) the model complexity can be explicitly controlled by the number of components and (3) they are already widely used in many applications. The main contributions of this paper are numerical algorithms to enable basic operations on such objects that strictly respect their underlying geometry. For instance, when operating with a set of k component GMMs, a first order expectation is that the result of simple operations like interpolation and averaging should provide an object that is also a k component GMM. The literature provides very little guidance on enforcing such requirements systematically. It turns out that these tasks are important internal modules for analysis and processing of a field of ensemble average propagators (EAPs), common in diffusion weighted magnetic resonance imaging. We provide proof of principle experiments showing how the proposed algorithms for interpolation can facilitate statistical analysis of such data, essential to many neuroimaging studies. 
Separately, we also derive interesting connections of our algorithm with functional spaces of Gaussians, that may be of independent interest.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kim_Interpolation_on_the_ICCV_2015_paper.pdf", - "aff": "University of Wisconsin\u2013Madison; University of Wisconsin\u2013Madison; University of Florida; University of Florida; University of Wisconsin\u2013Madison", + "aff": "University of Wisconsin–Madison; University of Wisconsin–Madison; University of Florida; University of Florida; University of Wisconsin–Madison", "project": "http://pages.cs.wisc.edu/~hwkim/projects/k-gmm", "github": "", "supp": "", @@ -7159,14 +7385,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Interpolation_on_the_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;1;0", - "aff_unique_norm": "University of Wisconsin\u2013Madison;University of Florida", + "aff_unique_norm": "University of Wisconsin–Madison;University of Florida", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.ufl.edu", - "aff_unique_abbr": "UW\u2013Madison;UF", + "aff_unique_abbr": "UW–Madison;UF", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n Hyunwoo J. and Adluru,\n Nagesh and Banerjee,\n Monami and Vemuri,\n Baba C. 
and Singh,\n Vikas\n},\n title = {\n Interpolation on the Manifold of K Component GMMs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7b0adc6f19", @@ -7200,7 +7427,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Laffont_2015_ICCV,\n \n author = {\n Laffont,\n Pierre-Yves and Bazin,\n Jean-Charles\n},\n title = {\n Intrinsic Decomposition of Image Sequences From Local Temporal Variations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c722446f72", @@ -7225,7 +7453,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kong_Intrinsic_Depth_Improving_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kong_Intrinsic_Depth_Improving_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Kong_2015_ICCV,\n \n author = {\n Kong,\n Naejin and Black,\n Michael J.\n},\n title = {\n Intrinsic Depth: Improving Depth Transfer With Intrinsic Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "900837a5c8", @@ -7259,7 +7488,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Hachama_2015_ICCV,\n \n author = {\n Hachama,\n Mohammed and Ghanem,\n Bernard and Wonka,\n Peter\n},\n title = {\n Intrinsic Scene Decomposition From RGB-D images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month 
= {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "521012f8c5", @@ -7286,14 +7516,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Konyushkova_Introducing_Geometry_in_ICCV_2015_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "EPFL;University of Bern", + "aff_unique_norm": "Ecole Polytechnique Fédérale de Lausanne;University of Bern", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.unibe.ch", "aff_unique_abbr": "EPFL;UniBE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Konyushkova_2015_ICCV,\n \n author = {\n Konyushkova,\n Ksenia and Sznitman,\n Raphael and Fua,\n Pascal\n},\n title = {\n Introducing Geometry in Active Learning for Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "40c2951616", @@ -7327,7 +7558,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Runze and Li,\n Shiwei and Fang,\n Tian and Zhu,\n Siyu and Quan,\n Long\n},\n title = {\n Joint Camera Clustering and Surface Segmentation for Large-Scale Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e60cc8f452", @@ -7361,7 +7593,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jung_2015_ICCV,\n \n author = {\n Jung,\n Heechul and Lee,\n Sihaeng and 
Yim,\n Junho and Park,\n Sunjeong and Kim,\n Junmo\n},\n title = {\n Joint Fine-Tuning in Deep Neural Networks for Facial Expression Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d62b8433c4", @@ -7395,7 +7628,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Peng and Shen,\n Xiaohui and Lin,\n Zhe and Cohen,\n Scott and Price,\n Brian and Yuille,\n Alan L.\n},\n title = {\n Joint Object and Part Segmentation Using Deep Learned Potentials\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "15db3d8840", @@ -7429,7 +7663,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "Russian Federation;Canada" + "aff_country_unique": "Russia;Canada", + "bibtex": "@InProceedings{Lobacheva_2015_ICCV,\n \n author = {\n Lobacheva,\n Ekaterina and Veksler,\n Olga and Boykov,\n Yuri\n},\n title = {\n Joint Optimization of Segmentation and Color Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e499365b00", @@ -7456,14 +7691,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Rezatofighi_Joint_Probabilistic_Data_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0;0;0", - "aff_unique_norm": "University of Adelaide;Northwestern Polytechnical University", + "aff_unique_norm": "The University of Adelaide;Northwestern Polytechnical University", "aff_unique_dep": "School of Computer Science;School of Computer 
Science and Technology", "aff_unique_url": "https://www.adelaide.edu.au;http://www.nwpu.edu.cn", "aff_unique_abbr": "Adelaide;NPU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xian", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Rezatofighi_2015_ICCV,\n \n author = {\n Rezatofighi,\n Seyed Hamid and Milan,\n Anton and Zhang,\n Zhen and Shi,\n Qinfeng and Dick,\n Anthony and Reid,\n Ian\n},\n title = {\n Joint Probabilistic Data Association Revisited\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bf9eb47e92", @@ -7487,7 +7723,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Just_Noticeable_Differences_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Just_Noticeable_Differences_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Yu_2015_ICCV,\n \n author = {\n Yu,\n Aron and Grauman,\n Kristen\n},\n title = {\n Just Noticeable Differences in Visual Attributes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5b08b1d2e2", @@ -7496,7 +7733,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Albert Gordo, Jon Almaz\u00e1n, Naila Murray, Florent Perronin", + "author_site": "Albert Gordo, Jon Almazán, Naila Murray, Florent Perronin", "author": "Albert Gordo; Jon Almazan; Naila Murray; Florent Perronin", "abstract": "The goal of this work is to bring semantics into the tasks of text recognition and retrieval in natural images. 
Although text recognition and retrieval have received a lot of attention in recent years, previous works have focused on recognizing or retrieving exactly the same word used as a query, without taking the In this paper, we ask the following question: can we predict semantic concepts directly from a word image, without explicitly trying to transcribe the word image or its characters at any point? For this goal we propose a convolutional neural network (CNN) with a weighted ranking loss objective that ensures that the concepts relevant to the query image are ranked ahead of those that are not relevant. This can also be interpreted as learning a Euclidean space where word images and concepts are jointly embedded. This model is learned in an end-to-end manner, from image pixels to semantic concepts, using a dataset of synthetically generated word images and concepts mined from a lexical database (WordNet). Our results show that, despite the complexity of the task, word images and concepts can indeed be associated with a high degree of accuracy.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Gordo_LEWIS_Latent_Embeddings_ICCV_2015_paper.pdf", @@ -7514,14 +7751,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Gordo_LEWIS_Latent_Embeddings_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Xerox Research Centre Europe;Meta", + "aff_unique_norm": "Xerox Research Centre Europe;Facebook", "aff_unique_dep": ";Facebook AI Research", - "aff_unique_url": "https://www.xerox.com/research-centre-europe.html;https://research.facebook.com", + "aff_unique_url": "https://www.xerox.com/en-us/innovation/research-centers/europe;https://research.facebook.com", "aff_unique_abbr": "XRCE;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Unknown;United States" + "aff_country_unique": "Unknown;United States", + "bibtex": 
"@InProceedings{Gordo_2015_ICCV,\n \n author = {\n Gordo,\n Albert and Almazan,\n Jon and Murray,\n Naila and Perronin,\n Florent\n},\n title = {\n LEWIS: Latent Embeddings for Word Images and their Semantics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "79610c9e2b", @@ -7555,7 +7793,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0", - "aff_country_unique": "Romania;Sweden" + "aff_country_unique": "Romania;Sweden", + "bibtex": "@InProceedings{Zanfir_2015_ICCV,\n \n author = {\n Zanfir,\n Andrei and Sminchisescu,\n Cristian\n},\n title = {\n Large Displacement 3D Scene Flow With Occlusion Reasoning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b8f8559371", @@ -7579,7 +7818,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Krull_Learning_Analysis-by-Synthesis_for_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Krull_Learning_Analysis-by-Synthesis_for_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Krull_2015_ICCV,\n \n author = {\n Krull,\n Alexander and Brachmann,\n Eric and Michel,\n Frank and Yang,\n Michael Ying and Gumhold,\n Stefan and Rother,\n Carsten\n},\n title = {\n Learning Analysis-by-Synthesis for 6D Pose Estimation in RGB-D Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "84530196e1", @@ -7606,14 +7846,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Shen_Learning_Binary_Codes_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;0;0+3", - "aff_unique_norm": "University of Electronic Science and Technology of 
China;Xidian University;University of North Carolina at Charlotte;University of Queensland", + "aff_unique_norm": "University of Electronic Science and Technology of China;Xidian University;University of North Carolina at Charlotte;The University of Queensland", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uestc.edu.cn;http://www.xidian.edu.cn/;https://www.uncc.edu;https://www.uq.edu.au", "aff_unique_abbr": "UESTC;Xidian;UNCC;UQ", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Charlotte", "aff_country_unique_index": "0;0;1;0;0+2", - "aff_country_unique": "China;United States;Australia" + "aff_country_unique": "China;United States;Australia", + "bibtex": "@InProceedings{Shen_2015_ICCV,\n \n author = {\n Shen,\n Fumin and Liu,\n Wei and Zhang,\n Shaoting and Yang,\n Yang and Shen,\n Heng Tao\n},\n title = {\n Learning Binary Codes for Maximum Inner Product Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a7e8f11ef3", @@ -7640,14 +7881,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Vedantam_Learning_Common_Sense_ICCV_2015_paper.html", "aff_unique_index": "0;0;1+0;2;0", - "aff_unique_norm": "Virginia Tech;Carnegie Mellon University;Microsoft", + "aff_unique_norm": "Virginia Tech;Carnegie Mellon University;Microsoft Corporation", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.vt.edu;https://www.cmu.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "VT;CMU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Vedantam_2015_ICCV,\n \n author = {\n Vedantam,\n Ramakrishna and Lin,\n Xiao and Batra,\n Tanmay and Zitnick,\n C. 
Lawrence and Parikh,\n Devi\n},\n title = {\n Learning Common Sense Through Visual Abstraction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "edd74aa9b5", @@ -7671,7 +7913,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cai_Learning_Complexity-Aware_Cascades_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cai_Learning_Complexity-Aware_Cascades_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Cai_2015_ICCV,\n \n author = {\n Cai,\n Zhaowei and Saberian,\n Mohammad and Vasconcelos,\n Nuno\n},\n title = {\n Learning Complexity-Aware Cascades for Deep Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a87ff79601", @@ -7696,7 +7939,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wilber_Learning_Concept_Embeddings_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wilber_Learning_Concept_Embeddings_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wilber_2015_ICCV,\n \n author = {\n Wilber,\n Michael and Kwak,\n Iljung S. and Kriegman,\n David and Belongie,\n Serge\n},\n title = {\n Learning Concept Embeddings With Combined Human-Machine Expertise\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "46f5159a81", @@ -7705,7 +7949,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Tinghui Zhou, Philipp Kr\u00e4henb\u00fchl, Alexei A. Efros", + "author_site": "Tinghui Zhou, Philipp Krähenbühl, Alexei A. Efros", "author": "Tinghui Zhou; Philipp Krahenbuhl; Alexei A. 
Efros", "abstract": "We propose a data-driven approach for intrinsic image decomposition, which is the process of inferring the confounding factors of reflectance and shading in an image. We pose this as a two-stage learning problem. First, we train a model to predict relative reflectance ordering be- tween image patches ('brighter', 'darker', 'same') from large-scale human annotations, producing a data-driven reflectance prior. Second, we show how to naturally integrate this learned prior into existing energy minimization frame- works for intrinsic image decomposition. We compare our method to the state-of-the-art approach of Bell et al. [7] on both decomposition and image relighting tasks, demonstrating the benefits of the simple relative reflectance prior, especially for scenes under challenging lighting conditions.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhou_Learning_Data-Driven_Reflectance_ICCV_2015_paper.pdf", @@ -7721,7 +7965,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhou_Learning_Data-Driven_Reflectance_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhou_Learning_Data-Driven_Reflectance_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Zhou_2015_ICCV,\n \n author = {\n Zhou,\n Tinghui and Krahenbuhl,\n Philipp and Efros,\n Alexei A.\n},\n title = {\n Learning Data-Driven Reflectance Priors for Intrinsic Image Decomposition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "af0647fbd6", @@ -7755,7 +8000,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Noh_2015_ICCV,\n \n author = {\n Noh,\n Hyeonwoo and Hong,\n Seunghoon and Han,\n Bohyung\n},\n title = {\n 
Learning Deconvolution Network for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5b1887e269", @@ -7789,7 +8035,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Lowell", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Peng_2015_ICCV,\n \n author = {\n Peng,\n Xingchao and Sun,\n Baochen and Ali,\n Karim and Saenko,\n Kate\n},\n title = {\n Learning Deep Object Detectors From 3D Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "60e0fa5694", @@ -7816,14 +8063,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ouyang_Learning_Deep_Representation_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Electronic Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ouyang_2015_ICCV,\n \n author = {\n Ouyang,\n Wanli and Li,\n Hongyang and Zeng,\n Xingyu and Wang,\n Xiaogang\n},\n title = {\n Learning Deep Representation With Large-Scale Attributes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cbe5808c76", @@ -7850,14 +8098,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xia_Learning_Discriminative_Reconstructions_ICCV_2015_paper.html", 
"aff_unique_index": "0;1;1;1;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "USTC;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xia_2015_ICCV,\n \n author = {\n Xia,\n Yan and Cao,\n Xudong and Wen,\n Fang and Hua,\n Gang and Sun,\n Jian\n},\n title = {\n Learning Discriminative Reconstructions for Unsupervised Outlier Removal\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1572398560", @@ -7891,7 +8140,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Hajimirsadeghi_2015_ICCV,\n \n author = {\n Hajimirsadeghi,\n Hossein and Mori,\n Greg\n},\n title = {\n Learning Ensembles of Potential Functions for Structured Prediction With Latent Variables\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a6499d4b75", @@ -7925,7 +8175,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jayaraman_2015_ICCV,\n \n author = {\n Jayaraman,\n Dinesh and Grauman,\n Kristen\n},\n title = {\n Learning Image Representations Tied to Ego-Motion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on 
Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "88a5bf16f8", @@ -7956,10 +8207,11 @@ "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", - "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Singapore", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Geng_2015_ICCV,\n \n author = {\n Geng,\n Xue and Zhang,\n Hanwang and Bian,\n Jingwen and Chua,\n Tat-Seng\n},\n title = {\n Learning Image and User Features for Recommendation in Social Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "16820122c6", @@ -7986,14 +8238,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Mallya_Learning_Informative_Edge_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "Dept. 
of Computer Science", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mallya_2015_ICCV,\n \n author = {\n Mallya,\n Arun and Lazebnik,\n Svetlana\n},\n title = {\n Learning Informative Edge Maps for Indoor Scene Layout Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "36b5a4c7dd", @@ -8020,14 +8273,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Deshpande_Learning_Large-Scale_Automatic_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Deshpande_2015_ICCV,\n \n author = {\n Deshpande,\n Aditya and Rock,\n Jason and Forsyth,\n David\n},\n title = {\n Learning Large-Scale Automatic Image Colorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "88281fcd60", @@ -8061,7 +8315,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;1;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Mao_2015_ICCV,\n \n author = {\n Mao,\n Junhua and Wei,\n Xu and Yang,\n Yi and Wang,\n Jiang and Huang,\n 
Zhiheng and Yuille,\n Alan L.\n},\n title = {\n Learning Like a Child: Fast Novel Visual Concept Learning From Sentence Descriptions of Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8571b512dd", @@ -8086,7 +8341,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Moeller_Learning_Nonlinear_Spectral_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Moeller_Learning_Nonlinear_Spectral_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Moeller_2015_ICCV,\n \n author = {\n Moeller,\n Michael and Diebold,\n Julia and Gilboa,\n Guy and Cremers,\n Daniel\n},\n title = {\n Learning Nonlinear Spectral Filters for Color Image Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "13b965c72f", @@ -8114,13 +8370,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zoran_Learning_Ordinal_Relationships_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;1+0", "aff_unique_norm": "Massachusetts Institute of Technology;Google", - "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;Google", + "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;", "aff_unique_url": "https://www.csail.mit.edu;https://www.google.com", "aff_unique_abbr": "MIT;Google", "aff_campus_unique_index": "0;0;1;1+0", "aff_campus_unique": "Cambridge;Mountain View", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zoran_2015_ICCV,\n \n author = {\n Zoran,\n Daniel and Isola,\n Phillip and Krishnan,\n Dilip and Freeman,\n William T.\n},\n title = {\n Learning Ordinal Relationships for Mid-Level Vision\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d1eacdbe21", @@ -8133,7 +8390,7 @@ "author": "Yongbo Li; Weisheng Dong; Guangming Shi; Xuemei Xie", "abstract": "Existing approaches toward Image super-resolution (SR) is often either data-driven (e.g., based on internet-scale matching and web image retrieval) or model-based (e.g., formulated as an Maximizing a Posterior estimation problem). The former is conceptually simple yet heuristic; while the latter is constrained by the fundamental limit of frequency aliasing. In this paper, we propose to develop a hybrid approach toward SR by combining those two lines of ideas. More specifically, the parameters underlying sparse distributions of desirable HR image patches are learned from a pair of LR image and retrieved HR images. Our hybrid approach can be interpreted as the first attempt of reconciling the difference between parametric and nonparametric models for low-level vision tasks. 
Experimental results show that the proposed hybrid SR method performs much better than existing state-of-the-art methods in terms of both subjective and objective image qualities.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Li_Learning_Parametric_Distributions_ICCV_2015_paper.pdf", - "aff": "School of Electronic Engineering, Xidian University, Xi\u2019an, China, 710071; School of Electronic Engineering, Xidian University, Xi\u2019an, China, 710071; School of Electronic Engineering, Xidian University, Xi\u2019an, China, 710071; School of Electronic Engineering, Xidian University, Xi\u2019an, China, 710071", + "aff": "School of Electronic Engineering, Xidian University, Xi’an, China, 710071; School of Electronic Engineering, Xidian University, Xi’an, China, 710071; School of Electronic Engineering, Xidian University, Xi’an, China, 710071; School of Electronic Engineering, Xidian University, Xi’an, China, 710071", "project": "", "github": "", "supp": "", @@ -8154,7 +8411,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Yongbo and Dong,\n Weisheng and Shi,\n Guangming and Xie,\n Xuemei\n},\n title = {\n Learning Parametric Distributions for Image Super-Resolution: Where Patch Matching Meets Sparse Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0092e71448", @@ -8181,14 +8439,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yao_Learning_Query_and_ICCV_2015_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Microsoft;City University of Hong Kong", - "aff_unique_dep": "Microsoft Research;", + "aff_unique_norm": "Microsoft Research;City University of Hong Kong", + "aff_unique_dep": ";", 
"aff_unique_url": "https://www.microsoft.com/en-us/research/group/microsoft-research-asia;https://www.cityu.edu.hk", "aff_unique_abbr": "MSR;CityU", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Beijing;Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2015_ICCV,\n \n author = {\n Yao,\n Ting and Mei,\n Tao and Ngo,\n Chong-Wah\n},\n title = {\n Learning Query and Image Similarities With Ranking Canonical Correlation Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9815e76e88", @@ -8215,14 +8474,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Li_Learning_Semi-Supervised_Representation_ICCV_2015_paper.html", "aff_unique_index": "0;1+2;0;0", - "aff_unique_norm": "Beijing University of Posts and Telecommunications;Peking University;Shanghai Jiao Tong University", + "aff_unique_norm": "Beijing University of Posts and Telecommunications;Peking University;Shanghai Jiaotong University", "aff_unique_dep": "School of Info. & Commu. 
Engineering;School of EECS;Cooperative Medianet Innovation Center", "aff_unique_url": "http://www.bupt.edu.cn/;http://www.pku.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "BUPT;PKU;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Chun-Guang and Lin,\n Zhouchen and Zhang,\n Honggang and Guo,\n Jun\n},\n title = {\n Learning Semi-Supervised Representation Towards a Unified Optimization Framework for Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f3f9472d55", @@ -8235,7 +8495,7 @@ "author": "Antonio Agudo; Francesc Moreno-Noguer", "abstract": "In this paper, we address the problem of simultaneously recovering the 3D shape and pose of a deformable and potentially elastic object from 2D motion. This is a highly ambiguous problem typically tackled by using low-rank shape and trajectory constraints. We show that formulating the problem in terms of a low-rank force space that induces the deformation, allows for a better physical interpretation of the resulting priors and a more accurate representation of the actual object's behavior. However, this comes at the price of, besides force and pose, having to estimate the elastic model of the object. For this, we use an Expectation Maximization strategy, where each of these parameters are successively learned within partial M-steps, while robustly dealing with missing observations. 
We thoroughly validate the approach on both mocap and real sequences, showing more accurate 3D reconstructions than state-of-the-art, and additionally providing an estimate of the full elastic model with no a priori information.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Agudo_Learning_Shape_Motion_ICCV_2015_paper.pdf", - "aff": "Instituto de Investigaci \u00b4on en Ingenier \u00b4\u0131a de Arag \u00b4on (I3A), Universidad de Zaragoza, Spain; Institut de Rob `otica i Inform `atica Industrial (CSIC-UPC), Barcelona, Spain", + "aff": "Instituto de Investigación en Ingeniería de Aragón (I3A), Universidad de Zaragoza, Spain; Institut de Robòtica i Informàtica Industrial (CSIC-UPC), Barcelona, Spain", "project": "", "github": "", "supp": "", @@ -8249,14 +8509,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Agudo_Learning_Shape_Motion_ICCV_2015_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Universidad de Zaragoza;Institut de Rob\u00f2tica i Inform\u00e0tica Industrial", - "aff_unique_dep": "Instituto de Investigaci \u00b4on en Ingenier \u00b4\u0131a de Arag \u00b4on (I3A);Robotica i Informatica Industrial", + "aff_unique_norm": "Universidad de Zaragoza;Institut de Robòtica i Informàtica Industrial", + "aff_unique_dep": "Instituto de Investigación en Ingeniería de Aragón (I3A);Robotica i Informatica Industrial", + "aff_unique_url": "https://www.unizar.es;http://www.iri.upc.edu/", "aff_unique_abbr": "UniZar;IRI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Barcelona", "aff_country_unique_index": "0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Agudo_2015_ICCV,\n \n author = {\n Agudo,\n Antonio and Moreno-Noguer,\n Francesc\n},\n title = {\n Learning Shape,\n Motion and Elastic Models in Force Space\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n 
year = {\n 2015\n} \n}" }, { "id": "2e6c692dcc", @@ -8283,14 +8544,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_Learning_Social_Relation_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Information Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Zhanpeng and Luo,\n Ping and Loy,\n Chen-Change and Tang,\n Xiaoou\n},\n title = {\n Learning Social Relation Traits From Face Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b9a0ae6d41", @@ -8299,7 +8561,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Martin Danelljan, Gustav H\u00e4ger, Fahad Shahbaz Khan, Michael Felsberg", + "author_site": "Martin Danelljan, Gustav Häger, Fahad Shahbaz Khan, Michael Felsberg", "author": "Martin Danelljan; Gustav Hager; Fahad Shahbaz Khan; Michael Felsberg", "abstract": "Robust and accurate visual tracking is one of the most challenging computer vision problems. Due to the inherent lack of training data, a robust approach for constructing a target appearance model is crucial. Recently, discriminatively learned correlation filters (DCF) have been successfully applied to address this problem for tracking. These methods utilize a periodic assumption of the training samples to efficiently learn a classifier on all patches in the target neighborhood. 
However, the periodic assumption also introduces unwanted boundary effects, which severely degrade the quality of the tracking model. We propose Spatially Regularized Discriminative Correlation Filters (SRDCF) for tracking. A spatial regularization component is introduced in the learning to penalize correlation filter coefficients depending on their spatial location. Our SRDCF formulation allows the correlation filters to be learned on a significantly larger set of negative training samples, without corrupting the positive samples. We further propose an optimization strategy, based on the iterative Gauss-Seidel method, for efficient online learning of our SRDCF. Experiments are performed on four benchmark datasets: OTB-2013, ALOV++, OTB-2015, and VOT2014. Our approach achieves state-of-the-art results on all four datasets. On OTB-2013 and OTB-2015, we obtain an absolute gain of 8.0% and 8.2% respectively, in mean overlap precision, compared to the best existing trackers.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Danelljan_Learning_Spatially_Regularized_ICCV_2015_paper.pdf", @@ -8324,7 +8586,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Danelljan_2015_ICCV,\n \n author = {\n Danelljan,\n Martin and Hager,\n Gustav and Khan,\n Fahad Shahbaz and Felsberg,\n Michael\n},\n title = {\n Learning Spatially Regularized Correlation Filters for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2094bb8b8c", @@ -8351,14 +8614,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tran_Learning_Spatiotemporal_Features_ICCV_2015_paper.html", "aff_unique_index": "0+1;0;0;1;0", - "aff_unique_norm": "Meta;Dartmouth College", + "aff_unique_norm": 
"Facebook;Dartmouth College", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.dartmouth.edu", "aff_unique_abbr": "FAIR;Dartmouth", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tran_2015_ICCV,\n \n author = {\n Tran,\n Du and Bourdev,\n Lubomir and Fergus,\n Rob and Torresani,\n Lorenzo and Paluri,\n Manohar\n},\n title = {\n Learning Spatiotemporal Features With 3D Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2fe06cd81f", @@ -8392,7 +8656,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Stanford;Burnaby", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Ramanathan_2015_ICCV,\n \n author = {\n Ramanathan,\n Vignesh and Tang,\n Kevin and Mori,\n Greg and Fei-Fei,\n Li\n},\n title = {\n Learning Temporal Embeddings for Complex Video Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3a137d64aa", @@ -8426,7 +8691,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Feng_2015_ICCV,\n \n author = {\n Feng,\n Jiashi and Darrell,\n Trevor\n},\n title = {\n Learning The Structure of Deep Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "dac3dde389", @@ -8451,7 +8717,8 @@ 
"aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Veit_Learning_Visual_Clothing_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Veit_Learning_Visual_Clothing_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Veit_2015_ICCV,\n \n author = {\n Veit,\n Andreas and Kovacs,\n Balazs and Bell,\n Sean and McAuley,\n Julian and Bala,\n Kavita and Belongie,\n Serge\n},\n title = {\n Learning Visual Clothing Style With Heterogeneous Dyadic Co-Occurrences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "08b2bd548a", @@ -8464,7 +8731,7 @@ "author": "Marco Pedersoli; Tinne Tuytelaars", "abstract": "A common issue in deformable object detection is finding a good way to position the parts. This issue is even more outspoken when considering detection and pose estimation for 3D objects, where parts should be placed in a three-dimensional space. Some methods extract the 3D shape of the object from 3D CAD models. This limits their applicability to categories for which such models are available. Others represent the object with a predefined and simple shape (e.g. a cuboid). This extends the applicability of the model, but in many cases the pre-defined shape is too simple to properly represent the object in 3D. In this paper we propose a new method for the detection and pose estimation of 3D objects, that does not use any 3D CAD model or other 3D information. Starting from a simple and general 3D shape, we learn in a weakly supervised manner the 3D part locations that best fit the training data. As this method builds on a iterative estimation of the part locations, we introduce several speedups to make the method fast enough for practical experiments. We evaluate our model for the detection and pose estimation of faces and cars. 
Our method obtains results comparable with the state of the art, it is faster than most of the other approaches and does not need any additional 3D information.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Pedersoli_Learning_Where_to_ICCV_2015_paper.pdf", - "aff": "Inria\u2217\u2020; PSI-iMinds KU Leuven, Belgium", + "aff": "Inria∗†; PSI-iMinds KU Leuven, Belgium", "project": "", "github": "", "supp": "", @@ -8478,14 +8745,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Pedersoli_Learning_Where_to_ICCV_2015_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "INRIA;KU Leuven", + "aff_unique_norm": "Inria;KU Leuven", "aff_unique_dep": ";PSI-iMinds", "aff_unique_url": "https://www.inria.fr;https://www.kuleuven.be", "aff_unique_abbr": "Inria;KU Leuven", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "France;Belgium" + "aff_country_unique": "France;Belgium", + "bibtex": "@InProceedings{Pedersoli_2015_ICCV,\n \n author = {\n Pedersoli,\n Marco and Tuytelaars,\n Tinne\n},\n title = {\n Learning Where to Position Parts in 3D\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8833a7479a", @@ -8509,7 +8777,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Salti_Learning_a_Descriptor-Specific_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Salti_Learning_a_Descriptor-Specific_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Salti_2015_ICCV,\n \n author = {\n Salti,\n Samuele and Tombari,\n Federico and Spezialetti,\n Riccardo and Di Stefano,\n Luigi\n},\n title = {\n Learning a Descriptor-Specific 3D Keypoint Detector\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = 
{\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cd5e0fd3f1", @@ -8518,7 +8787,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Jun-Yan Zhu, Philipp Kr\u00e4henb\u00fchl, Eli Shechtman, Alexei A. Efros", + "author_site": "Jun-Yan Zhu, Philipp Krähenbühl, Eli Shechtman, Alexei A. Efros", "author": "Jun-Yan Zhu; Philipp Krahenbuhl; Eli Shechtman; Alexei A. Efros", "abstract": "What makes an image appear realistic? In this work, we are answering this question from a data-driven perspective by learning the perception of visual realism directly from large amounts of data. In particular, we train a Convolutional Neural Network (CNN) model that distinguishes natural photographs from automatically generated composite images. The model learns to predict visual realism of a scene in terms of color, lighting and texture compatibility, without any human annotations pertaining to it. Our model outperforms previous works that rely on hand-crafted heuristics, for the task of classifying realistic vs. unrealistic photos. Furthermore, we apply our learned model to compute optimal parameters of a compositing method, to maximize the visual realism score predicted by our CNN model. 
We demonstrate its advantage against existing methods via a human perception study.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhu_Learning_a_Discriminative_ICCV_2015_paper.pdf", @@ -8533,7 +8802,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhu_Learning_a_Discriminative_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhu_Learning_a_Discriminative_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Zhu_2015_ICCV,\n \n author = {\n Zhu,\n Jun-Yan and Krahenbuhl,\n Philipp and Shechtman,\n Eli and Efros,\n Alexei A.\n},\n title = {\n Learning a Discriminative Model for the Perception of Realism in Composite Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "021ae4055b", @@ -8567,7 +8837,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Gu_2015_ICCV,\n \n author = {\n Gu,\n Lin and Cheng,\n Li\n},\n title = {\n Learning to Boost Filamentary Structure Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "dc86af92e1", @@ -8601,7 +8872,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Lee_2015_ICCV,\n \n author = {\n Lee,\n Tom and Fidler,\n Sanja and Dickinson,\n Sven\n},\n title = {\n Learning to Combine Mid-Level Cues for Object Proposal Generation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": 
"e7278663f0", @@ -8635,7 +8907,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Solera_2015_ICCV,\n \n author = {\n Solera,\n Francesco and Calderara,\n Simone and Cucchiara,\n Rita\n},\n title = {\n Learning to Divide and Conquer for Online Multi-Target Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5dcc7711a5", @@ -8669,7 +8942,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2015_ICCV,\n \n author = {\n Xu,\n Mai and Ren,\n Yun and Wang,\n Zulin\n},\n title = {\n Learning to Predict Saliency on Face Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3ae6de0315", @@ -8693,7 +8967,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fernando_Learning_to_Rank_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fernando_Learning_to_Rank_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Fernando_2015_ICCV,\n \n author = {\n Fernando,\n Basura and Gavves,\n Efstratios and Muselet,\n Damien and Tuytelaars,\n Tinne\n},\n title = {\n Learning to Rank Based on Subsequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4dcb8cff2b", @@ -8727,7 +9002,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": 
"United States", + "bibtex": "@InProceedings{Agrawal_2015_ICCV,\n \n author = {\n Agrawal,\n Pulkit and Carreira,\n Joao and Malik,\n Jitendra\n},\n title = {\n Learning to See by Moving\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5fe52b65c8", @@ -8740,7 +9016,7 @@ "author": "Philippe Weinzaepfel; Zaid Harchaoui; Cordelia Schmid", "abstract": "We propose an effective approach for spatio-temporal action localization in realistic videos. The approach first detects proposals at the frame-level and scores them with a combination of static and motion CNN features. It then tracks high-scoring proposals throughout the video using a tracking-by-detection approach. Our tracker relies simultaneously on instance-level and class-level detectors. The tracks are scored using a spatio-temporal motion histogram, a descriptor at the track level, in combination with the CNN features. Finally, we perform temporal localization of the action using a sliding-window approach at the track level. 
We present experimental results for spatio-temporal localization on the UCF-Sports, J-HMDB and UCF-101 action localization datasets, where our approach outperforms the state of the art with a margin of 15%, 7% and 12% respectively in mAP.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Weinzaepfel_Learning_to_Track_ICCV_2015_paper.pdf", - "aff": "Inria\u2217; Inria+NYU; Inria\u2217", + "aff": "Inria∗; Inria+NYU; Inria∗", "project": "", "github": "", "supp": "", @@ -8754,14 +9030,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Weinzaepfel_Learning_to_Track_ICCV_2015_paper.html", "aff_unique_index": "0;0+1;0", - "aff_unique_norm": "INRIA;New York University", + "aff_unique_norm": "Inria;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.nyu.edu", "aff_unique_abbr": "Inria;NYU", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Weinzaepfel_2015_ICCV,\n \n author = {\n Weinzaepfel,\n Philippe and Harchaoui,\n Zaid and Schmid,\n Cordelia\n},\n title = {\n Learning to Track for Spatio-Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5940ce768e", @@ -8795,7 +9072,8 @@ "aff_campus_unique_index": "0+1;0;0", "aff_campus_unique": "Stanford;Ann Arbor", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiang_2015_ICCV,\n \n author = {\n Xiang,\n Yu and Alahi,\n Alexandre and Savarese,\n Silvio\n},\n title = {\n Learning to Track: Online Multi-Object Tracking by Decision Making\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4179be7195", @@ -8829,7 +9107,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Almaev_2015_ICCV,\n \n author = {\n Almaev,\n Timur and Martinez,\n Brais and Valstar,\n Michel\n},\n title = {\n Learning to Transfer: Transferring Latent Task Structures and Its Application to Person-Specific Facial Action Unit Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6dcd25659e", @@ -8863,7 +9142,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Vicente_2015_ICCV,\n \n author = {\n Vicente,\n Tomas F. Yago and Hoai,\n Minh and Samaras,\n Dimitris\n},\n title = {\n Leave-One-Out Kernel Optimization for Shadow Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "05973e608f", @@ -8897,7 +9177,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bambach_2015_ICCV,\n \n author = {\n Bambach,\n Sven and Lee,\n Stefan and Crandall,\n David J. 
and Yu,\n Chen\n},\n title = {\n Lending A Hand: Detecting Hands and Recognizing Activities in Complex Egocentric Interactions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5631e450b5", @@ -8931,7 +9212,8 @@ "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Jie and Kan,\n Meina and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n Leveraging Datasets With Varying Annotations for Face Alignment via Deep Regression Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4dc7ead274", @@ -8965,7 +9247,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Ma_2015_ICCV,\n \n author = {\n Ma,\n Bo and Hu,\n Hongwei and Shen,\n Jianbing and Zhang,\n Yuping and Porikli,\n Fatih\n},\n title = {\n Linearization to Nonlinear Learning for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f5a7e84022", @@ -8999,7 +9282,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Perth", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Sui_2015_ICCV,\n \n author = {\n Sui,\n Chao and Bennamoun,\n Mohammed and Togneri,\n Roberto\n},\n title = {\n Listening With Your Eyes: Towards a Practical Visual Speech Recognition System Using Deep Boltzmann 
Machines\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "210ef286f4", @@ -9033,7 +9317,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tel Aviv", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Levy_2015_ICCV,\n \n author = {\n Levy,\n Ofir and Wolf,\n Lior\n},\n title = {\n Live Repetition Counting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "03add9c840", @@ -9046,7 +9331,7 @@ "author": "Mattis Paulin; Matthijs Douze; Zaid Harchaoui; Julien Mairal; Florent Perronin; Cordelia Schmid", "abstract": "Patch-level descriptors underlie several important computer vision tasks, such as stereo-matching or content-based image retrieval. We introduce a deep convolutional architecture that yields patch-level descriptors, as an alternative to the popular SIFT descriptor for image retrieval. The proposed family of descriptors, called Patch-CKN, adapt the recently introduced Convolutional Kernel Network (CKN), an unsupervised framework to learn convolutional architectures. We present a comparison framework to benchmark current deep convolutional approaches along with Patch-CKN for both patch and image retrieval, including our novel ``RomePatches'' dataset. 
Patch-CKN descriptors yield competitive results compared to supervised CNN alternatives on patch and image retrieval.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Paulin_Local_Convolutional_Features_ICCV_2015_paper.pdf", - "aff": "Inria\u2217; Inria\u2217; Inria\u2217+NYU; Inria\u2217; Facebook AI Research\u2020; Inria\u2217", + "aff": "Inria∗; Inria∗; Inria∗+NYU; Inria∗; Facebook AI Research†; Inria∗", "project": "", "github": "", "supp": "", @@ -9060,14 +9345,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Paulin_Local_Convolutional_Features_ICCV_2015_paper.html", "aff_unique_index": "0;0;0+1;0;2;0", - "aff_unique_norm": "INRIA;New York University;Meta", + "aff_unique_norm": "Inria;New York University;Facebook", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.inria.fr;https://www.nyu.edu;https://research.facebook.com", "aff_unique_abbr": "Inria;NYU;FAIR", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0+1;0;1;0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Paulin_2015_ICCV,\n \n author = {\n Paulin,\n Mattis and Douze,\n Matthijs and Harchaoui,\n Zaid and Mairal,\n Julien and Perronin,\n Florent and Schmid,\n Cordelia\n},\n title = {\n Local Convolutional Features With Unsupervised Training for Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d6542ac942", @@ -9092,7 +9378,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ma_Local_Subspace_Collaborative_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ma_Local_Subspace_Collaborative_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ma_2015_ICCV,\n \n author = {\n Ma,\n 
Lin and Zhang,\n Xiaoqin and Hu,\n Weiming and Xing,\n Junliang and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Local Subspace Collaborative Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6e7dc84f15", @@ -9126,7 +9413,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;0", - "aff_country_unique": "United States;Italy" + "aff_country_unique": "United States;Italy", + "bibtex": "@InProceedings{Lu_2015_ICCV,\n \n author = {\n Lu,\n Guoyu and Yan,\n Yan and Ren,\n Li and Song,\n Jingkuan and Sebe,\n Nicu and Kambhamettu,\n Chandra\n},\n title = {\n Localize Me Anywhere,\n Anytime: A Multi-Task Point-Retrieval Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bbc1507458", @@ -9151,7 +9439,8 @@ "aff_domain": ";;;;;;;;;;;", "email": ";;;;;;;;;;;", "author_num": 12, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cao_Look_and_Think_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cao_Look_and_Think_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Cao_2015_ICCV,\n \n author = {\n Cao,\n Chunshui and Liu,\n Xianming and Yang,\n Yi and Yu,\n Yinan and Wang,\n Jiang and Wang,\n Zilei and Huang,\n Yongzhen and Wang,\n Liang and Huang,\n Chang and Xu,\n Wei and Ramanan,\n Deva and Huang,\n Thomas S.\n},\n title = {\n Look and Think Twice: Capturing Top-Down Visual Attention With Feedback Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "fdcb5d1716", @@ -9185,7 +9474,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Shenlong and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n Lost Shopping! Monocular Localization in Large Indoor Spaces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ccf889964a", @@ -9219,7 +9509,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Johnson_2015_ICCV,\n \n author = {\n Johnson,\n Justin and Ballan,\n Lamberto and Fei-Fei,\n Li\n},\n title = {\n Love Thy Neighbors: Image Annotation by Exploiting Image Metadata\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "994ed1654e", @@ -9228,7 +9519,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Ond\u0159ej Chum", + "author_site": "Ondřej Chum", "author": "Ondrej Chum", "abstract": "Approximating non-linear kernels by finite-dimensional feature maps is a popular approach for speeding up training and evaluation of support vector machines or to encode information into efficient match kernels. We propose a novel method of data independent construction of low dimensional feature maps. The problem is cast as a linear program which jointly considers competing objectives: the quality of the approximation and the dimensionality of the feature map. 
For both shift-invariant and homogeneous kernels the proposed method achieves a better approximations at the same dimensionality or comparable approximations at lower dimensionality of the feature map compared with state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Chum_Low_Dimensional_Explicit_ICCV_2015_paper.pdf", @@ -9251,7 +9542,8 @@ "aff_unique_url": "https://www.fel.cvut.cz", "aff_unique_abbr": "CTU", "aff_country_unique_index": "0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Chum_2015_ICCV,\n \n author = {\n Chum,\n Ondrej\n},\n title = {\n Low Dimensional Explicit Feature Maps\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "df2690517d", @@ -9276,7 +9568,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cao_Low-Rank_Matrix_Factorization_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cao_Low-Rank_Matrix_Factorization_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Cao_2015_ICCV,\n \n author = {\n Cao,\n Xiangyong and Chen,\n Yang and Zhao,\n Qian and Meng,\n Deyu and Wang,\n Yao and Wang,\n Dong and Xu,\n Zongben\n},\n title = {\n Low-Rank Matrix Factorization Under General Mixture Noise Distributions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7b8f43c005", @@ -9310,7 +9603,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dong_2015_ICCV,\n \n author = {\n Dong,\n Weisheng and Li,\n Guangyu and Shi,\n Guangming and Li,\n Xin and Ma,\n 
Yi\n},\n title = {\n Low-Rank Tensor Approximation With Laplacian Scale Mixture Modeling for Multiframe Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8686c00e03", @@ -9344,7 +9638,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Changqing and Fu,\n Huazhu and Liu,\n Si and Liu,\n Guangcan and Cao,\n Xiaochun\n},\n title = {\n Low-Rank Tensor Constrained Multiview Subspace Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4c4c11c168", @@ -9357,7 +9652,7 @@ "author": "Thibaut Durand; Nicolas Thome; Matthieu Cord", "abstract": "In this work, we propose a novel Weakly Supervised Learning (WSL) framework dedicated to learn discriminative part detectors from images annotated with a global label. Our WSL method encompasses three main contributions. Firstly, we introduce a new structured output latent variable model, Minimum mAximum lateNt sTRucturAl SVM (MANTRA), which prediction relies on a pair of latent variables: h^+ (resp. h^-) provides positive (resp. negative) evidence for a given output y. Secondly, we instantiate MANTRA for two different visual recognition tasks: multi-class classification and ranking. For ranking, we propose efficient solutions to exactly solve the inference and the loss-augmented problems. 
Finally, extensive experiments highlight the relevance of the proposed method: MANTRA outperforms state-of-the art results on five different datasets.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Durand_MANTRA_Minimum_Maximum_ICCV_2015_paper.pdf", - "aff": "Sorbonne Universit \u00b4es, UPMC Univ Paris 06, CNRS, LIP6 UMR 7606, 4 place Jussieu, 75005 Paris; Sorbonne Universit \u00b4es, UPMC Univ Paris 06, CNRS, LIP6 UMR 7606, 4 place Jussieu, 75005 Paris; Sorbonne Universit \u00b4es, UPMC Univ Paris 06, CNRS, LIP6 UMR 7606, 4 place Jussieu, 75005 Paris", + "aff": "Sorbonne Universit ´es, UPMC Univ Paris 06, CNRS, LIP6 UMR 7606, 4 place Jussieu, 75005 Paris; Sorbonne Universit ´es, UPMC Univ Paris 06, CNRS, LIP6 UMR 7606, 4 place Jussieu, 75005 Paris; Sorbonne Universit ´es, UPMC Univ Paris 06, CNRS, LIP6 UMR 7606, 4 place Jussieu, 75005 Paris", "project": "", "github": "", "supp": "", @@ -9371,14 +9666,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Durand_MANTRA_Minimum_Maximum_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Sorbonne Universit\u00e9s", + "aff_unique_norm": "Sorbonne Universités", "aff_unique_dep": "", "aff_unique_url": "https://www.sorbonne-universite.fr", "aff_unique_abbr": "Sorbonne", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Durand_2015_ICCV,\n \n author = {\n Durand,\n Thibaut and Thome,\n Nicolas and Cord,\n Matthieu\n},\n title = {\n MANTRA: Minimum Maximum Latent Structural SVM for Image Classification and Ranking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7cea0f1e91", @@ -9387,7 +9683,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Eric T. 
Psota, J\u0119drzej Kowalczuk, Mateusz Mittek, Lance C. P\u00e9rez", + "author_site": "Eric T. Psota, Jędrzej Kowalczuk, Mateusz Mittek, Lance C. Pérez", "author": "Eric T. Psota; Jedrzej Kowalczuk; Mateusz Mittek; Lance C. Perez", "abstract": "A new method is introduced for stereo matching that operates on minimum spanning trees (MSTs) generated from the images. Disparity maps are represented as a collection of hidden states on MSTs, and each MST is modeled as a hidden Markov tree. An efficient recursive message-passing scheme designed to operate on hidden Markov trees, known as the upward-downward algorithm, is used to compute the maximum a posteriori (MAP) disparity estimate at each pixel. The messages processed by the upward-downward algorithm involve two types of probabilities: the probability of a pixel having a particular disparity given a set of per-pixel matching costs, and the probability of a disparity transition between a pair of connected pixels given their similarity. The distributions of these probabilities are modeled from a collection of images with ground truth disparities. Performance evaluation using the Middlebury stereo benchmark version 3 demonstrates that the proposed method ranks second and third in terms of overall accuracy when evaluated on the training and test image sets, respectively.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Psota_MAP_Disparity_Estimation_ICCV_2015_paper.pdf", @@ -9412,7 +9708,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Lincoln", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Psota_2015_ICCV,\n \n author = {\n Psota,\n Eric T. 
and Kowalczuk,\n Jedrzej and Mittek,\n Mateusz and Perez,\n Lance C.\n},\n title = {\n MAP Disparity Estimation Using Hidden Markov Trees\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cae18e1487", @@ -9446,7 +9743,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Albany", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Saudi Arabia;United States" + "aff_country_unique": "Saudi Arabia;United States", + "bibtex": "@InProceedings{Wu_2015_ICCV,\n \n author = {\n Wu,\n Baoyuan and Lyu,\n Siwei and Ghanem,\n Bernard\n},\n title = {\n ML-MG: Multi-Label Learning With Missing Labels Using a Mixed Graph\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "704391eabc", @@ -9470,7 +9768,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_MMSS_Multi-Modal_Sharable_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_MMSS_Multi-Modal_Sharable_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Anran and Cai,\n Jianfei and Lu,\n Jiwen and Cham,\n Tat-Jen\n},\n title = {\n MMSS: Multi-Modal Sharable and Specific Feature Learning for RGB-D Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ed43ce130b", @@ -9495,7 +9794,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Galliani_Massively_Parallel_Multiview_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Galliani_Massively_Parallel_Multiview_ICCV_2015_paper.html", + "bibtex": 
"@InProceedings{Galliani_2015_ICCV,\n \n author = {\n Galliani,\n Silvano and Lasinger,\n Katrin and Schindler,\n Konrad\n},\n title = {\n Massively Parallel Multiview Stereopsis by Surface Normal Diffusion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "69ad14c6fa", @@ -9529,7 +9829,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;2+1", - "aff_country_unique": "Romania;Germany;Sweden" + "aff_country_unique": "Romania;Germany;Sweden", + "bibtex": "@InProceedings{Ionescu_2015_ICCV,\n \n author = {\n Ionescu,\n Catalin and Vantzos,\n Orestis and Sminchisescu,\n Cristian\n},\n title = {\n Matrix Backpropagation for Deep Networks With Structured Layers\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "766b04ed26", @@ -9563,7 +9864,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Sijin and Zhang,\n Weichen and Chan,\n Antoni B.\n},\n title = {\n Maximum-Margin Structured Learning With Deep Networks for 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e3f9ea0a8a", @@ -9597,7 +9899,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Cohen_2015_ICCV,\n \n author = {\n Cohen,\n Andrea and Sattler,\n Torsten and Pollefeys,\n Marc\n},\n title = {\n Merging the Unmatchable: Stitching Visually Disconnected SfM 
Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5c767d07aa", @@ -9624,14 +9927,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_MeshStereo_A_Global_ICCV_2015_paper.html", "aff_unique_index": "0;1+2;3;0;1+2;0", - "aff_unique_norm": "Microsoft;Sun Yat-sen University;SYSU-CMU Shunde International Joint Research Institute;Chinese Academy of Sciences", + "aff_unique_norm": "Microsoft Corporation;Sun Yat-Sen University;SYSU-CMU Shunde International Joint Research Institute;Chinese Academy of Sciences", "aff_unique_dep": "Microsoft Research;;;Institute of Automation", "aff_unique_url": "https://www.microsoft.com/en-us/research;http://www.sysu.edu.cn/;;http://www.ia.cas.cn", "aff_unique_abbr": "MSR;SYSU;;CAS", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shunde", "aff_country_unique_index": "0;1+1;1;0;1+1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Chi and Li,\n Zhiwei and Cheng,\n Yanhua and Cai,\n Rui and Chao,\n Hongyang and Rui,\n Yong\n},\n title = {\n MeshStereo: A Global Stereo Model With Mesh Alignment Regularization for View Interpolation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8878026bea", @@ -9665,7 +9969,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2015_ICCV,\n \n author = {\n Zheng,\n Enliang and Wang,\n Ke and Dunn,\n Enrique and Frahm,\n Jan-Michael\n},\n title = {\n Minimal Solvers for 3D Geometry From Satellite Imagery\n},\n booktitle = {\n Proceedings 
of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "128c2ef0d4", @@ -9699,7 +10004,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Atlanta", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ciptadi_2015_ICCV,\n \n author = {\n Ciptadi,\n Arridhana and Rehg,\n James M.\n},\n title = {\n Minimizing Human Effort in Interactive Tracking by Incremental Learning of Model Parameters\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e9d0221301", @@ -9708,7 +10014,7 @@ "status": "Oral", "track": "main", "pid": "", - "author_site": "Jianming Zhang, Stan Sclaroff, Zhe Lin, Xiaohui Shen, Brian Price, Radom\u00edr M\u011bch", + "author_site": "Jianming Zhang, Stan Sclaroff, Zhe Lin, Xiaohui Shen, Brian Price, Radomír Měch", "author": "Jianming Zhang; Stan Sclaroff; Zhe Lin; Xiaohui Shen; Brian Price; Radomir Mech", "abstract": "We propose a highly efficient, yet powerful, salient object detection method based on the Minimum Barrier Distance (MBD) Transform. The MBD transform is robust to pixel-value fluctuation, and thus can be effectively applied on raw pixels without region abstraction. We present an approximate MBD transform algorithm with 100X speedup over the exact algorithm. An error bound analysis is also provided. Powered by this fast MBD transform algorithm, the proposed salient object detection method runs at 80 FPS, and significantly outperforms previous methods with similar speed on four large benchmark datasets, and achieves comparable or better performance than state-of-the-art methods. Furthermore, a technique based on color whitening is proposed to extend our method to leverage the appearance-based backgroundness cue. 
This extended version further improves the performance, while still being one order of magnitude faster than all the other leading methods.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhang_Minimum_Barrier_Salient_ICCV_2015_paper.pdf", @@ -9724,7 +10030,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_Minimum_Barrier_Salient_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_Minimum_Barrier_Salient_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Jianming and Sclaroff,\n Stan and Lin,\n Zhe and Shen,\n Xiaohui and Price,\n Brian and Mech,\n Radomir\n},\n title = {\n Minimum Barrier Salient Object Detection at 80 FPS\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6ba6c74349", @@ -9758,7 +10065,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Quanshi and Wu,\n Ying Nian and Zhu,\n Song-Chun\n},\n title = {\n Mining And-Or Graphs for Graph Matching and Object Discovery\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ffb7f66998", @@ -9783,7 +10091,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Mode-Seeking_on_Hypergraphs_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Mode-Seeking_on_Hypergraphs_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Hanzi and Xiao,\n Guobao and Yan,\n Yan 
and Suter,\n David\n},\n title = {\n Mode-Seeking on Hypergraphs for Robust Geometric Model Fitting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1ccf44290f", @@ -9792,7 +10101,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Jan St\u00fchmer, Sebastian Nowozin, Andrew Fitzgibbon, Richard Szeliski, Travis Perry, Sunil Acharya, Daniel Cremers, Jamie Shotton", + "author_site": "Jan Stühmer, Sebastian Nowozin, Andrew Fitzgibbon, Richard Szeliski, Travis Perry, Sunil Acharya, Daniel Cremers, Jamie Shotton", "author": "Jan Stuhmer; Sebastian Nowozin; Andrew Fitzgibbon; Richard Szeliski; Travis Perry; Sunil Acharya; Daniel Cremers; Jamie Shotton", "abstract": "Consumer depth cameras have dramatically improved our ability to track rigid, articulated, and deformable 3D objects in real-time. However, depth cameras have a limited temporal resolution (frame-rate) that restricts the accuracy and robustness of tracking, especially for fast or unpredictable motion. In this paper, we show how to perform model-based object tracking which allows to reconstruct the object's depth at an order of magnitude higher frame-rate through simple modifications to an off-the-shelf depth camera. We focus on phase-based time-of-flight (ToF) sensing, which reconstructs each low frame-rate depth image from a set of short exposure 'raw' infrared captures. These raw captures are taken in quick succession near the beginning of each depth frame, and differ in the modulation of their active illumination. We make two contributions. First, we detail how to perform model-based tracking against these raw captures. 
Second, we show that by reprogramming the camera to space the raw captures uniformly in time, we obtain a 10x higher frame-rate, and thereby improve the ability to track fast-moving objects.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Stuhmer_Model-Based_Tracking_at_ICCV_2015_paper.pdf", @@ -9808,7 +10117,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Stuhmer_Model-Based_Tracking_at_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Stuhmer_Model-Based_Tracking_at_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Stuhmer_2015_ICCV,\n \n author = {\n Stuhmer,\n Jan and Nowozin,\n Sebastian and Fitzgibbon,\n Andrew and Szeliski,\n Richard and Perry,\n Travis and Acharya,\n Sunil and Cremers,\n Daniel and Shotton,\n Jamie\n},\n title = {\n Model-Based Tracking at 300Hz Using Raw Time-of-Flight Observations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3f5ea1950e", @@ -9842,7 +10152,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Ziyu and Schwing,\n Alexander G. 
and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n Monocular Object Instance Segmentation and Depth Ordering With CNNs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "860682238d", @@ -9867,7 +10178,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Keuper_Motion_Trajectory_Segmentation_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Keuper_Motion_Trajectory_Segmentation_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Keuper_2015_ICCV,\n \n author = {\n Keuper,\n Margret and Andres,\n Bjoern and Brox,\n Thomas\n},\n title = {\n Motion Trajectory Segmentation via Minimum Cost Multicuts\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ab17763341", @@ -9894,14 +10206,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Long_Multi-Class_Multi-Annotator_Active_ICCV_2015_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Stevens Institute of Technology;Microsoft", + "aff_unique_norm": "Stevens Institute of Technology;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.stevens.edu;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "SIT;MSR Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Long_2015_ICCV,\n \n author = {\n Long,\n Chengjiang and Hua,\n Gang\n},\n title = {\n Multi-Class Multi-Annotator Active Learning With Robust Gaussian Process for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ff7c3ed18e", @@ -9935,7 +10248,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0+1;0+1;0+1", - "aff_country_unique": "United Kingdom;Netherlands" + "aff_country_unique": "United Kingdom;Netherlands", + "bibtex": "@InProceedings{Eleftheriadis_2015_ICCV,\n \n author = {\n Eleftheriadis,\n Stefanos and Rudovic,\n Ognjen and Pantic,\n Maja\n},\n title = {\n Multi-Conditional Latent Variable Model for Joint Facial Action Unit Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bdbb5e4825", @@ -9960,7 +10274,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yi_Multi-Cue_Structure_Preserving_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yi_Multi-Cue_Structure_Preserving_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Yi_2015_ICCV,\n \n author = {\n Yi,\n Saehoon and Pavlovic,\n Vladimir\n},\n title = {\n Multi-Cue Structure Preserving MRF for Unconstrained Video Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2972f2aff4", @@ -9994,7 +10309,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Philadelphia", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2015_ICCV,\n \n author = {\n Zhou,\n Xiaowei and Zhu,\n Menglong and Daniilidis,\n Kostas\n},\n title = {\n Multi-Image Matching via Fast Alternating Minimization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": 
"375984fd4a", @@ -10028,7 +10344,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2015_ICCV,\n \n author = {\n Tang,\n Ming and Feng,\n Jiayi\n},\n title = {\n Multi-Kernel Correlation Filter for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "94f85a8d92", @@ -10053,7 +10370,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ranjan_Multi-Label_Cross-Modal_Retrieval_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ranjan_Multi-Label_Cross-Modal_Retrieval_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ranjan_2015_ICCV,\n \n author = {\n Ranjan,\n Viresh and Rasiwasia,\n Nikhil and Jawahar,\n C. V.\n},\n title = {\n Multi-Label Cross-Modal Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3318f0fec0", @@ -10087,7 +10405,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Xiang and Zheng,\n Wei-Shi and Wang,\n Xiaojuan and Xiang,\n Tao and Gong,\n Shaogang\n},\n title = {\n Multi-Scale Learning for Low-Resolution Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "44d81fc28c", @@ -10121,7 +10440,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - 
"aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2015_ICCV,\n \n author = {\n Yang,\n Songfan and Ramanan,\n Deva\n},\n title = {\n Multi-Scale Recognition With DAG-CNNs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "58cf72043e", @@ -10146,7 +10466,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Su_Multi-Task_Learning_With_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Su_Multi-Task_Learning_With_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Su_2015_ICCV,\n \n author = {\n Su,\n Chi and Yang,\n Fan and Zhang,\n Shiliang and Tian,\n Qi and Davis,\n Larry S. and Gao,\n Wen\n},\n title = {\n Multi-Task Learning With Low Rank Attribute Embedding for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7de4fbd59e", @@ -10173,14 +10494,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chu_Multi-Task_Recurrent_Neural_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Electronic Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chu_2015_ICCV,\n \n author = {\n Chu,\n Xiao and Ouyang,\n Wanli and Yang,\n Wei and Wang,\n Xiaogang\n},\n title = {\n Multi-Task Recurrent Neural Network for Immediacy Prediction\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9979b598d3", @@ -10193,7 +10515,7 @@ "author": "Xianglong Liu; Lei Huang; Cheng Deng; Jiwen Lu; Bo Lang", "abstract": "Recent years have witnessed the success of hashing techniques in fast nearest neighbor search. In practice many applications (e.g., visual search, object detection, image matching, etc.) have enjoyed the benefits of complementary hash tables and information fusion over multiple views. However, most of prior research mainly focused on compact hash code cleaning, and rare work studies how to build multiple complementary hash tables, much less to adaptively integrate information stemming from multiple views. In this paper we first present a novel multi-view complementary hash table method that learns complementarity hash tables from the data with multiple views. For single multi-view table, using exemplar based feature fusion, we approximate the inherent data similarities with a low-rank matrix, and learn discriminative hash functions in an efficient way. To build complementary tables and meanwhile maintain scalable training and fast out-of-sample extension, an exemplar reweighting scheme is introduced to update the induced low-rank similarity in the sequential table construction framework, which indeed brings mutual benefits between tables by placing greater importance on exemplars shared by mis-separated neighbors. 
Extensive experiments on three large-scale image datasets demonstrate that the proposed method significantly outperforms various naive solutions and state-of-the-art multi-table methods.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Liu_Multi-View_Complementary_Hash_ICCV_2015_paper.pdf", - "aff": "State Key Lab of Software Development Environment, Beihang University, Beijing, China; State Key Lab of Software Development Environment, Beihang University, Beijing, China; Xidian University, Xi\u2019an, China; Department of Automation, Tsinghua University, Beijing, China; State Key Lab of Software Development Environment, Beihang University, Beijing, China", + "aff": "State Key Lab of Software Development Environment, Beihang University, Beijing, China; State Key Lab of Software Development Environment, Beihang University, Beijing, China; Xidian University, Xi’an, China; Department of Automation, Tsinghua University, Beijing, China; State Key Lab of Software Development Environment, Beihang University, Beijing, China", "project": "", "github": "", "supp": "", @@ -10214,7 +10536,8 @@ "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Beijing;Xi'an", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Xianglong and Huang,\n Lei and Deng,\n Cheng and Lu,\n Jiwen and Lang,\n Bo\n},\n title = {\n Multi-View Complementary Hash Tables for Nearest Neighbor Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e17675c5b1", @@ -10248,7 +10571,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Su_2015_ICCV,\n \n author = {\n Su,\n Hang and Maji,\n 
Subhransu and Kalogerakis,\n Evangelos and Learned-Miller,\n Erik\n},\n title = {\n Multi-View Convolutional Neural Networks for 3D Shape Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6ad19683bb", @@ -10275,14 +10599,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Niu_Multi-View_Domain_Generalization_ICCV_2015_paper.html", "aff_unique_index": "0;1;0+2", - "aff_unique_norm": "Nanyang Technological University;ETH Zurich;University of Sydney", + "aff_unique_norm": "Nanyang Technological University;ETH Zurich;The University of Sydney", "aff_unique_dep": "Interdisciplinary Graduate School;Computer Vision Laboratory;School of Electrical and Information Engineering", "aff_unique_url": "https://www.ntu.edu.sg;https://www.ethz.ch;https://www.sydney.edu.au", "aff_unique_abbr": "NTU;ETHZ;USYD", - "aff_campus_unique_index": "1+2", - "aff_campus_unique": ";Singapore;Sydney", + "aff_campus_unique_index": "1;2", + "aff_campus_unique": ";Zurich;Sydney", "aff_country_unique_index": "0;1;0+2", - "aff_country_unique": "Singapore;Switzerland;Australia" + "aff_country_unique": "Singapore;Switzerland;Australia", + "bibtex": "@InProceedings{Niu_2015_ICCV,\n \n author = {\n Niu,\n Li and Li,\n Wen and Xu,\n Dong\n},\n title = {\n Multi-View Domain Generalization for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bde84c6856", @@ -10295,7 +10620,7 @@ "author": "Hongchang Gao; Feiping Nie; Xuelong Li; Heng Huang", "abstract": "For many computer vision applications, the data sets distribute on certain low-dimensional subspaces. Subspace clustering is to find such underlying subspaces and cluster the data points correctly. In this paper, we propose a novel multi-view subspace clustering method. 
The proposed method performs clustering on the subspace representation of each view simultaneously. Meanwhile, we propose to use a common cluster structure to guarantee the consistence among different views. In addition, an efficient algorithm is proposed to solve the problem. Experiments on four benchmark data sets have been performed to validate our proposed method. The promising results demonstrate the effectiveness of our method.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Gao_Multi-View_Subspace_Clustering_ICCV_2015_paper.pdf", - "aff": "Computer Science and Engineering, University of Texas at Arlington, Arlington, TX, 76019, USA; Computer Science and Engineering, University of Texas at Arlington, Arlington, TX, 76019, USA; Center for OPTIMAL, XIOPM, Chinese Academy of Sciences, Xi\u2019an, 710119, China; Computer Science and Engineering, University of Texas at Arlington, Arlington, TX, 76019, USA", + "aff": "Computer Science and Engineering, University of Texas at Arlington, Arlington, TX, 76019, USA; Computer Science and Engineering, University of Texas at Arlington, Arlington, TX, 76019, USA; Center for OPTIMAL, XIOPM, Chinese Academy of Sciences, Xi’an, 710119, China; Computer Science and Engineering, University of Texas at Arlington, Arlington, TX, 76019, USA", "project": "", "github": "", "supp": "", @@ -10316,7 +10641,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Arlington;Xi'an", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Gao_2015_ICCV,\n \n author = {\n Gao,\n Hongchang and Nie,\n Feiping and Li,\n Xuelong and Huang,\n Heng\n},\n title = {\n Multi-View Subspace Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "44e1958812", @@ -10329,7 +10655,7 @@ "author": "Lin Ma; 
Zhengdong Lu; Lifeng Shang; Hang Li", "abstract": "In this paper, we propose multimodal convolutional neural networks (m-CNNs) for matching image and sentence. Our m-CNN provides an end-to-end framework with convolutional architectures to exploit image representation, word composition, and the matching relations between the two modalities. More specifically, it consists of one image CNN encoding the image content and one matching CNN modeling the joint representation of image and sentence. The matching CNN composes different semantic fragments from words and learns the inter-modal relations between image and the composed fragments at different levels, thus fully exploit the matching relations between image and sentence. Experimental results demonstrate that the proposed m-CNNs can effectively capture the information necessary for image and sentence matching. More specifically, our proposed m-CNNs significantly outperform the state-of-the-art approaches for bidirectional image and sentence retrieval on the Flickr8K and Flickr30K datasets.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Ma_Multimodal_Convolutional_Neural_ICCV_2015_paper.pdf", - "aff": "Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies", "project": "", "github": "", "supp": "", @@ -10343,14 +10669,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ma_Multimodal_Convolutional_Neural_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Huawei", - "aff_unique_dep": "Noah\u2019s Ark Lab", + "aff_unique_norm": "Huawei Technologies", + "aff_unique_dep": "Noah’s Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2015_ICCV,\n \n author = {\n Ma,\n Lin and Lu,\n Zhengdong and Shang,\n Lifeng and Li,\n Hang\n},\n title = {\n Multimodal Convolutional Neural Networks for Matching Image and Sentence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "afe0ad7edc", @@ -10375,7 +10702,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ma_Multiple_Feature_Fusion_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ma_Multiple_Feature_Fusion_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ma_2015_ICCV,\n \n author = {\n Ma,\n Lin and Lu,\n Jiwen and Feng,\n Jianjiang and Zhou,\n Jie\n},\n title = {\n Multiple Feature Fusion via Weighted Entropy for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bcfc692eec", @@ -10409,7 +10737,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Dequan and Shen,\n Zhiqiang and Shao,\n Jie and Zhang,\n Wei and Xue,\n Xiangyang and Zhang,\n Zheng\n},\n title = {\n Multiple Granularity Descriptors for Fine-Grained Categorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c96ed1da95", @@ -10433,7 +10762,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Multiple_Hypothesis_Tracking_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Multiple_Hypothesis_Tracking_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n Chanho and Li,\n Fuxin and Ciptadi,\n Arridhana and Rehg,\n James M.\n},\n title = {\n Multiple Hypothesis Tracking Revisited\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7c333a4550", @@ -10458,7 +10788,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Hasegawa_Multiple-Hypothesis_Affine_Region_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Hasegawa_Multiple-Hypothesis_Affine_Region_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Hasegawa_2015_ICCV,\n \n author = {\n Hasegawa,\n Takahiro and Ambai,\n Mitsuru and Ishikawa,\n Kohta and Koutaki,\n Gou and Yamauchi,\n Yuji and Yamashita,\n Takayoshi and Fujiyoshi,\n Hironobu\n},\n title = {\n Multiple-Hypothesis Affine Region Estimation With Anisotropic LoG Filters\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b4214e2745", @@ -10467,11 +10798,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "David Varas, M\u00f3nica Alfaro, Ferran Marques", + "author_site": "David Varas, Mónica Alfaro, Ferran Marques", "author": "David Varas; Monica Alfaro; Ferran Marques", "abstract": "This paper presents a co-clustering technique that, given a collection of images and their hierarchies, clusters nodes from these hierarchies to obtain a coherent multiresolution representation of the image collection. 
We formalize the co-clustering as Quadratic Semi-Assignment Problem and solve it with a linear programming relaxation approach that makes effective use of information from hierarchies. Initially, we address the problem of generating an optimal, coherent partition per image and, afterwards, we extend this method to a multiresolution framework. Finally, we particularize this framework to an iterative multiresolution video segmentation algorithm in sequences with small variations. We evaluate the algorithm on the Video Occlusion/Object Boundary Detection Dataset, showing that it produces state-of-the-art results in these scenarios.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Varas_Multiresolution_Hierarchy_Co-Clustering_ICCV_2015_paper.pdf", - "aff": "Universitat Polit\u00e8cnica de Catalunya Barcelona Tech, Spain; Universitat Polit\u00e8cnica de Catalunya Barcelona Tech, Spain; Universitat Polit\u00e8cnica de Catalunya Barcelona Tech, Spain", + "aff": "Universitat Politècnica de Catalunya Barcelona Tech, Spain; Universitat Politècnica de Catalunya Barcelona Tech, Spain; Universitat Politècnica de Catalunya Barcelona Tech, Spain", "project": "", "github": "", "supp": "", @@ -10485,14 +10816,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Varas_Multiresolution_Hierarchy_Co-Clustering_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Universitat Polit\u00e8cnica de Catalunya Barcelona Tech", + "aff_unique_norm": "Universitat Politècnica de Catalunya Barcelona Tech", "aff_unique_dep": "", "aff_unique_url": "https://www.upc.edu", "aff_unique_abbr": "UPC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Barcelona", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Varas_2015_ICCV,\n \n author = {\n Varas,\n David and Alfaro,\n Monica and Marques,\n Ferran\n},\n title = {\n Multiresolution Hierarchy 
Co-Clustering for Semantic Segmentation in Sequences With Small Variations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4717fea141", @@ -10517,7 +10849,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Shen_Mutual-Structure_for_Joint_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Shen_Mutual-Structure_for_Joint_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Shen_2015_ICCV,\n \n author = {\n Shen,\n Xiaoyong and Zhou,\n Chao and Xu,\n Li and Jia,\n Jiaya\n},\n title = {\n Mutual-Structure for Joint Filtering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a2b9f92a8d", @@ -10526,7 +10859,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Jordi Salvador, Eduardo P\u00e9rez-Pellitero", + "author_site": "Jordi Salvador, Eduardo Pérez-Pellitero", "author": "Jordi Salvador; Eduardo Perez-Pellitero", "abstract": "This paper presents a fast, high-performance method for super resolution with external learning. The first contribution leading to the excellent performance is a bimodal tree for clustering, which successfully exploits the antipodal invariance of the coarse-to-high-res mapping of natural image patches and provides scalability to finer partitions of the underlying coarse patch space. During training an ensemble of such bimodal trees is computed, providing different linearizations of the mapping. The second and main contribution is a fast inference algorithm, which selects the most suitable mapping function within the tree ensemble for each patch by adopting a Local Naive Bayes formulation. 
The experimental validation shows promising scalability properties that reflect the suitability of the proposed model, which may also be generalized to other tasks. The resulting method is beyond one order of magnitude faster and performs objectively and subjectively better than the current state of the art.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Salvador_Naive_Bayes_Super-Resolution_ICCV_2015_paper.pdf", @@ -10551,7 +10884,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hannover", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Salvador_2015_ICCV,\n \n author = {\n Salvador,\n Jordi and Perez-Pellitero,\n Eduardo\n},\n title = {\n Naive Bayes Super-Resolution Forest\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2064717cac", @@ -10560,6 +10894,7 @@ "status": "Poster", "track": "main", "pid": "", + "author_site": "Wongun Choi", "author": "Wongun Choi", "abstract": "In this paper, we tackle two key aspects of multiple target tracking problem: 1) designing an accurate affinity measure to associate detections and 2) implementing an efficient and accurate (near) online multiple target tracking algorithm. As for the first contribution, we introduce a novel Aggregated Local Flow Descriptor (ALFD) that encodes the relative motion pattern between a pair of temporally distant detections using long term interest point trajectories (IPTs). Leveraging on the IPTs, the ALFD provides a robust affinity measure for estimating the likelihood of matching detections regardless of the application scenarios. As for another contribution, we present a Near-Online Multi-target Tracking (NOMT) algorithm. 
The tracking problem is formulated as a data-association between targets and detections in a temporal window, that is performed repeatedly at every frame. While being efficient, NOMT achieves robustness via integrating multiple cues including ALFD metric, target dynamics, appearance similarity, and long term trajectory regularization into the model. Our ablative analysis verifies the superiority of the ALFD metric over the other conventional affinity metrics. We run a comprehensive experimental evaluation on two challenging tracking datasets, KITTI and MOT datasets. The NOMT method combined with ALFD metric achieves the best accuracy in both datasets with significant margins (about 10% higher MOTA) over the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Choi_Near-Online_Multi-Target_Tracking_ICCV_2015_paper.pdf", @@ -10584,7 +10919,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Cupertino", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Choi_2015_ICCV,\n \n author = {\n Choi,\n Wongun\n},\n title = {\n Near-Online Multi-Target Tracking With Aggregated Local Flow Descriptor\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d0dd27189e", @@ -10609,7 +10945,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Simon_Neural_Activation_Constellations_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Simon_Neural_Activation_Constellations_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Simon_2015_ICCV,\n \n author = {\n Simon,\n Marcel and Rodner,\n Erik\n},\n title = {\n Neural Activation Constellations: Unsupervised Part Model Discovery With Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE 
International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c836767a0f", @@ -10643,7 +10980,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Yu and Tan,\n Robby T. and Brown,\n Michael S.\n},\n title = {\n Nighttime Haze Removal With Glow and Multiple Light Colors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d1055c7284", @@ -10656,7 +10994,7 @@ "author": "Federico Camposeco; Torsten Sattler; Marc Pollefeys", "abstract": "We propose a novel two-step method for estimating the intrinsic and extrinsic calibration of any radially symmetric camera, including non-central systems. The first step consists of estimating the camera pose, given a Structure from Motion (SfM) model, up to the translation along the optical axis. As a second step, we obtain the calibration by finding the translation of the camera center using an ordering constraint. The method makes use of the 1D radial camera model, which allows us to effectively handle any radially symmetric camera, including non-central ones. 
Using this ordering constraint, we show that the we are able to calibrate several different (central and non-central) Wide Field of View (WFOV) cameras, including fisheye, hyper-catadioptric and spherical catadioptric cameras, as well as pinhole cameras, using a single image or jointly solving for several views.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Camposeco_Non-Parametric_Structure-Based_Calibration_ICCV_2015_paper.pdf", - "aff": "Department of Computer Science, ETH Z\u00fcrich, Switzerland; Department of Computer Science, ETH Z\u00fcrich, Switzerland; Department of Computer Science, ETH Z\u00fcrich, Switzerland", + "aff": "Department of Computer Science, ETH Zürich, Switzerland; Department of Computer Science, ETH Zürich, Switzerland; Department of Computer Science, ETH Zürich, Switzerland", "project": "", "github": "", "supp": "", @@ -10670,14 +11008,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Camposeco_Non-Parametric_Structure-Based_Calibration_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Camposeco_2015_ICCV,\n \n author = {\n Camposeco,\n Federico and Sattler,\n Torsten and Pollefeys,\n Marc\n},\n title = {\n Non-Parametric Structure-Based Calibration of Radially Symmetric Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "72b84836a1", @@ -10702,7 +11041,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Ren_Object_Detection_Using_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ren_Object_Detection_Using_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ren_2015_ICCV,\n \n author = {\n Ren,\n Haoyu and Li,\n Ze-Nian\n},\n title = {\n Object Detection Using Generalization and Efficiency Balanced Co-Occurrence Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c02361b5be", @@ -10736,7 +11076,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Gidaris_2015_ICCV,\n \n author = {\n Gidaris,\n Spyros and Komodakis,\n Nikos\n},\n title = {\n Object Detection via a Multi-Region and Semantic Segmentation-Aware CNN Model\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3b8488506e", @@ -10761,7 +11102,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Jain_Objects2action_Classifying_and_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Jain_Objects2action_Classifying_and_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Jain_2015_ICCV,\n \n author = {\n Jain,\n Mihir and van Gemert,\n Jan C. and Mensink,\n Thomas and Snoek,\n Cees G. 
M.\n},\n title = {\n Objects2action: Classifying and Localizing Actions Without Any Video Example\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2550ccf09e", @@ -10795,7 +11137,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Berkeley;San Diego", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Ting-Chun and Efros,\n Alexei A. and Ramamoorthi,\n Ravi\n},\n title = {\n Occlusion-Aware Depth Estimation Using Light-Field Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5aefcb113f", @@ -10820,7 +11163,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Johannsen_On_Linear_Structure_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Johannsen_On_Linear_Structure_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Johannsen_2015_ICCV,\n \n author = {\n Johannsen,\n Ole and Sulc,\n Antonin and Goldluecke,\n Bastian\n},\n title = {\n On Linear Structure From Motion for Light Field Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ff71a268e5", @@ -10833,7 +11177,7 @@ "author": "Won Hwa Kim; Sathya N. Ravi; Sterling C. Johnson; Ozioma C. Okonkwo; Vikas Singh", "abstract": "A variety of studies in neuroscience/neuroimaging seek to perform statistical inference on the acquired brain image scans for diagnosis as well as understanding the pathological manifestation of diseases. 
To do so, an important first step is to register (or co-register) all of the image data into a common coordinate system. This permits meaningful comparison of the intensities at each voxel across groups (e.g., diseased versus healthy) to evaluate the effects of the disease and/or use machine learning algorithms in a subsequent step. But errors in the underlying registration make this problematic, they either decrease the statistical power or make the follow-up inference tasks less effective/accurate. In this paper, we derive a novel algorithm which offers immunity to local errors in the underlying deformation field obtained from registration procedures. By deriving a deformation invariant representation of the image, the downstream analysis can be made more robust as if one had access to a (hypothetical) far superior registration procedure. Our algorithm is based on recent work on Scattering coefficients. Using this as a starting point, we show how results from harmonic analysis (especially, non-Euclidean wavelets) yields strategies for designing deformation and additive noise invariant representations of large 3-D brain image volumes. We present a set of results on synthetic and real brain images where we achieve robust statistical analysis even in the presence of substantial deformation errors; here, standard analysis procedures significantly under-perform and fail to identify the true signal.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kim_On_Statistical_Analysis_ICCV_2015_paper.pdf", - "aff": "Dept. of Computer Sciences, University of Wisconsin, Madison, WI+Wisconsin Alzheimer\u2019s Disease Research Center, University of Wisconsin, Madison, WI; Dept. of Industrial and Systems Engineering, University of Wisconsin, Madison, WI; GRECC, William S. 
Middleton V A Hospital, Madison, WI+Wisconsin Alzheimer\u2019s Disease Research Center, University of Wisconsin, Madison, WI; Wisconsin Alzheimer\u2019s Disease Research Center, University of Wisconsin, Madison, WI; Dept. of Biostatistics & Med. Informatics, University of Wisconsin, Madison, WI+Dept. of Computer Sciences, University of Wisconsin, Madison, WI+Wisconsin Alzheimer\u2019s Disease Research Center, University of Wisconsin, Madison, WI", + "aff": "Dept. of Computer Sciences, University of Wisconsin, Madison, WI+Wisconsin Alzheimer’s Disease Research Center, University of Wisconsin, Madison, WI; Dept. of Industrial and Systems Engineering, University of Wisconsin, Madison, WI; GRECC, William S. Middleton V A Hospital, Madison, WI+Wisconsin Alzheimer’s Disease Research Center, University of Wisconsin, Madison, WI; Wisconsin Alzheimer’s Disease Research Center, University of Wisconsin, Madison, WI; Dept. of Biostatistics & Med. Informatics, University of Wisconsin, Madison, WI+Dept. of Computer Sciences, University of Wisconsin, Madison, WI+Wisconsin Alzheimer’s Disease Research Center, University of Wisconsin, Madison, WI", "project": "", "github": "", "supp": "", @@ -10848,13 +11192,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_On_Statistical_Analysis_ICCV_2015_paper.html", "aff_unique_index": "0+1;0;2+1;1;1+0+1", "aff_unique_norm": "University of Wisconsin-Madison;University of Wisconsin;William S. 
Middleton Memorial Veterans Hospital", - "aff_unique_dep": "Department of Computer Sciences;Wisconsin Alzheimer\u2019s Disease Research Center;GRECC (Geriatric Research, Education, and Clinical Center)", + "aff_unique_dep": "Department of Computer Sciences;Wisconsin Alzheimer’s Disease Research Center;GRECC (Geriatric Research, Education, and Clinical Center)", "aff_unique_url": "https://www.wisc.edu;https://www.wisc.edu;https://www.wisconsin.va.gov/", "aff_unique_abbr": "UW-Madison;UW;", "aff_campus_unique_index": "0+0;0;0+0;0;0+0+0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0+0;0;0+0;0;0+0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n Won Hwa and Ravi,\n Sathya N. and Johnson,\n Sterling C. and Okonkwo,\n Ozioma C. and Singh,\n Vikas\n},\n title = {\n On Statistical Analysis of Neuroimages With Imperfect Registration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ca311a8852", @@ -10881,14 +11226,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kumar_On_the_Equivalence_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kumar_2015_ICCV,\n \n author = {\n Kumar,\n Avinash and Ahuja,\n Narendra\n},\n title = {\n On the Equivalence of Moving Entrance Pupil and Radial Distortion for Camera Calibration\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9d720dc6f4", @@ -10901,7 +11247,7 @@ "author": "Sagi Katz; Ayellet Tal", "abstract": "Is it possible to determine the visible subset of points directly from a given point cloud? Interestingly, in [7] it was shown that this is indeed the case - despite the fact that points cannot occlude each other, this task can be performed without surface reconstruction or normal estimation. The operator is very simple - it first transforms the points to a new domain and then constructs the convex hull in that domain. Points that lie on the convex hull of the transformed set of points are the images of the visible points. This operator found numerous applications in computer vision, including face reconstruction, keypoint detection, finding the best viewpoints, reduction of points, and many more. The current paper addresses a fundamental question: What properties should a transformation function satisfy, in order to be utilized in this operator? We show that three such properties are sufficient: the sign of the function, monotonicity, and a condition regarding the function's parameter. The correctness of an algorithm that satisfies these three properties is proved. Finally, we show an interesting application of the operator - assignment of visibility-confidence score. This feature is missing from previous approaches, where a binary yes/no visibility is determined. 
This score can be utilized in various applications; we illustrate its use in view-dependent curvature estimation.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Katz_On_the_Visibility_ICCV_2015_paper.pdf", - "aff": "Technion\u2013Israel Institute of Technology; Technion\u2013Israel Institute of Technology", + "aff": "Technion–Israel Institute of Technology; Technion–Israel Institute of Technology", "project": "", "github": "", "supp": "", @@ -10915,14 +11261,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Katz_On_the_Visibility_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Technion\u2013Israel Institute of Technology", + "aff_unique_norm": "Technion–Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Katz_2015_ICCV,\n \n author = {\n Katz,\n Sagi and Tal,\n Ayellet\n},\n title = {\n On the Visibility of Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ca780a9452", @@ -10956,7 +11303,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wong_2015_ICCV,\n \n author = {\n Wong,\n Alex and Yuille,\n Alan L.\n},\n title = {\n One Shot Learning via Compositions of Meaningful Patches\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6b16a448f9", @@ -10981,7 +11329,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - 
"oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Hua_Online_Object_Tracking_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Hua_Online_Object_Tracking_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Hua_2015_ICCV,\n \n author = {\n Hua,\n Yang and Alahari,\n Karteek and Schmid,\n Cordelia\n},\n title = {\n Online Object Tracking With Proposal Selection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "38b7689a85", @@ -11005,7 +11354,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tang_Opening_the_Black_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tang_Opening_the_Black_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Tang_2015_ICCV,\n \n author = {\n Tang,\n Danhang and Taylor,\n Jonathan and Kohli,\n Pushmeet and Keskin,\n Cem and Kim,\n Tae-Kyun and Shotton,\n Jamie\n},\n title = {\n Opening the Black Box: Hierarchical Sampling Optimization for Estimating Human Hand Pose\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0298799f15", @@ -11018,7 +11368,7 @@ "author": "Faruk Ahmed; Dany Tarlow; Dhruv Batra", "abstract": "We study the question of how to make loss-aware predictions in image segmentation settings where the evaluation function is the Intersection-over-Union (IoU) measure that is used widely in evaluating image segmentation systems. Currently, there are two dominant approaches: the first approximates the Expected-IoU (EIoU) score as Expected-Intersection-over-Expected-Union (EIoEU); and the second approach is to compute exact EIoU but only over a small set of high-quality candidate solutions. 
We begin by asking which approach we should favor for two typical image segmentation tasks. Studying this question leads to two new methods that draw ideas from both existing approaches. Our new methods use the EIoEU approximation paired with high quality candidate solutions. Experimentally we show that our new approaches lead to improved performance on both image segmentation tasks.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Ahmed_Optimizing_Expected_Intersection-Over-Union_ICCV_2015_paper.pdf", - "aff": "Universit\u00e9 de Montr\u00e9al; Microsoft Research; Virginia Tech", + "aff": "Université de Montréal; Microsoft Research; Virginia Tech", "project": "", "github": "", "supp": "", @@ -11032,14 +11382,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ahmed_Optimizing_Expected_Intersection-Over-Union_ICCV_2015_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Microsoft;Virginia Tech", + "aff_unique_norm": "Université de Montréal;Microsoft Research;Virginia Tech", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://www.umontreal.ca;https://www.microsoft.com/en-us/research;https://www.vt.edu", "aff_unique_abbr": "UdeM;MSR;VT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Ahmed_2015_ICCV,\n \n author = {\n Ahmed,\n Faruk and Tarlow,\n Dany and Batra,\n Dhruv\n},\n title = {\n Optimizing Expected Intersection-Over-Union With Candidate-Constrained CRFs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "829ae618fd", @@ -11048,11 +11399,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Chris Sweeney, Torsten Sattler, Tobias H\u00f6llerer, Matthew 
Turk, Marc Pollefeys", + "author_site": "Chris Sweeney, Torsten Sattler, Tobias Höllerer, Matthew Turk, Marc Pollefeys", "author": "Chris Sweeney; Torsten Sattler; Tobias Hollerer; Matthew Turk; Marc Pollefeys", "abstract": "The viewing graph represents a set of views that are related by pairwise relative geometries. In the context of Structure-from-Motion (SfM), the viewing graph is the input to the incremental or global estimation pipeline. Much effort has been put towards developing robust algorithms to overcome potentially inaccurate relative geometries in the viewing graph during SfM. In this paper, we take a fundamentally different approach to SfM and instead focus on improving the quality of the viewing graph before applying SfM. Our main contribution is a novel optimization that improves the quality of the relative geometries in the viewing graph by enforcing loop consistency constraints with the epipolar point transfer. We show that this optimization greatly improves the accuracy of relative poses in the viewing graph and removes the need for filtering steps or robust algorithms typically used in global SfM methods. In addition, the optimized viewing graph can be used to efficiently calibrate cameras at scale. We combine our viewing graph optimization and focal length calibration into a global SfM pipeline that is more efficient than existing approaches. 
To our knowledge, ours is the first global SfM pipeline capable of handling uncalibrated image sets.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Sweeney_Optimizing_the_Viewing_ICCV_2015_paper.pdf", - "aff": "University of California Santa Barbara; ETH Z \u00a8urich, Switzerland; University of California Santa Barbara; University of California Santa Barbara; ETH Z \u00a8urich, Switzerland", + "aff": "University of California Santa Barbara; ETH Zürich, Switzerland; University of California Santa Barbara; University of California Santa Barbara; ETH Zürich, Switzerland", "project": "", "github": "", "supp": "", @@ -11066,14 +11417,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Sweeney_Optimizing_the_Viewing_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;0;1", - "aff_unique_norm": "University of California, Santa Barbara;ETH Zurich", + "aff_unique_norm": "University of California, Santa Barbara;ETH Zürich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsb.edu;https://www.ethz.ch", "aff_unique_abbr": "UCSB;ETHZ", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Sweeney_2015_ICCV,\n \n author = {\n Sweeney,\n Chris and Sattler,\n Torsten and Hollerer,\n Tobias and Turk,\n Matthew and Pollefeys,\n Marc\n},\n title = {\n Optimizing the Viewing Graph for Structure-From-Motion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e18e5f0562", @@ -11107,7 +11459,8 @@ "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Berkeley;San Diego", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Srinivasan_2015_ICCV,\n \n author = {\n Srinivasan,\n Pratul P. and Tao,\n Michael W. and Ng,\n Ren and Ramamoorthi,\n Ravi\n},\n title = {\n Oriented Light-Field Windows for Scene Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5d1ff3d03b", @@ -11141,7 +11494,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2015_ICCV,\n \n author = {\n He,\n Shengfeng and Lau,\n Rynson W.H.\n},\n title = {\n Oriented Object Proposals\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "428d299df1", @@ -11150,11 +11504,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Guilhem Ch\u00e9ron, Ivan Laptev, Cordelia Schmid", + "author_site": "Guilhem Chéron, Ivan Laptev, Cordelia Schmid", "author": "Guilhem Cheron; Ivan Laptev; Cordelia Schmid", "abstract": "This work targets human action recognition in video. While recent methods typically represent actions by statistics of local video features, here we argue for the importance of a representation derived from human pose. To this end we propose a new Pose-based Convolutional Neural Network descriptor (P-CNN) for action recognition. The descriptor aggregates motion and appearance information along tracks of human body parts. We investigate different schemes of temporal aggregation and experiment with P-CNN features obtained both for automatically estimated and manually annotated human poses. We evaluate our method on the recent and challenging JHMDB and MPII Cooking datasets. 
For both datasets our method shows consistent improvement over the state of the art.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Cheron_P-CNN_Pose-Based_CNN_ICCV_2015_paper.pdf", - "aff": "WILLOW project-team, Departement d\u2019Informatique de l\u2019Ecole Normale Superieure, ENS/Inria/CNRS UMR 8548, Paris, France; WILLOW project-team, Departement d\u2019Informatique de l\u2019Ecole Normale Superieure, ENS/Inria/CNRS UMR 8548, Paris, France; LEAR project-team, Inria Grenoble Rhone-Alpes, Laboratoire Jean Kuntzmann, CNRS, Univ. Grenoble Alpes, France", + "aff": "WILLOW project-team, Departement d’Informatique de l’Ecole Normale Superieure, ENS/Inria/CNRS UMR 8548, Paris, France; WILLOW project-team, Departement d’Informatique de l’Ecole Normale Superieure, ENS/Inria/CNRS UMR 8548, Paris, France; LEAR project-team, Inria Grenoble Rhone-Alpes, Laboratoire Jean Kuntzmann, CNRS, Univ. Grenoble Alpes, France", "project": "", "github": "", "supp": "", @@ -11169,13 +11523,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cheron_P-CNN_Pose-Based_CNN_ICCV_2015_paper.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Ecole Normale Superieure;Inria Grenoble Rhone-Alpes", - "aff_unique_dep": "Departement d\u2019Informatique;Laboratoire Jean Kuntzmann", + "aff_unique_dep": "Departement d’Informatique;Laboratoire Jean Kuntzmann", "aff_unique_url": "https://www.ens.fr;https://www.inria.fr/grenoble", "aff_unique_abbr": "ENS;Inria", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Paris;Grenoble", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Cheron_2015_ICCV,\n \n author = {\n Cheron,\n Guilhem and Laptev,\n Ivan and Schmid,\n Cordelia\n},\n title = {\n P-CNN: Pose-Based CNN Features for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 
2015\n} \n}" }, { "id": "aeac96b340", @@ -11209,7 +11564,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Charlotte", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Peng_2015_ICCV,\n \n author = {\n Peng,\n Xi and Zhang,\n Shaoting and Yang,\n Yu and Metaxas,\n Dimitris N.\n},\n title = {\n PIEFA: Personalized Incremental and Ensemble Face Alignment\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "924a707d47", @@ -11243,7 +11599,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Finlayson_2015_ICCV,\n \n author = {\n Finlayson,\n Graham D. and Hayes,\n Alex E.\n},\n title = {\n POP Image Fusion - Derivative Domain Image Fusion Without Reintegration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d0a27f524e", @@ -11270,14 +11627,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Matsui_PQTable_Fast_Exact_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Tokyo", + "aff_unique_norm": "The University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Matsui_2015_ICCV,\n \n author = {\n Matsui,\n Yusuke and Yamasaki,\n Toshihiko and Aizawa,\n Kiyoharu\n},\n title = {\n PQTable: Fast Exact Asymmetric Distance Neighbor Search for Product Quantization Using Hash 
Tables\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "217eb287c6", @@ -11286,11 +11644,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Arnaud Dapogny, Kevin Bailly, S\u00e9verine Dubuisson", + "author_site": "Arnaud Dapogny, Kevin Bailly, Séverine Dubuisson", "author": "Arnaud Dapogny; Kevin Bailly; Severine Dubuisson", "abstract": "Facial expression can be seen as the dynamic variation of one's appearance over time. Successful recognition thus involves finding representations of high-dimensional spatiotemporal patterns that can be generalized to unseen facial morphologies and variations of the expression dynamics. In this paper, we propose to learn Random Forests from heterogeneous derivative features (e.g. facial fiducial point movements or texture variations) upon pairs of images. Those forests are conditioned on the expression label of the first frame to reduce the variability of the ongoing expression transitions. When testing on a specific frame of a video, pairs are created between this frame and the previous ones. Predictions for each previous frame are used to draw trees from Pairwise Conditional Random Forests (PCRF) whose pairwise outputs are averaged over time to produce robust estimates. 
As such, PCRF appears as a natural extension of Random Forests to learn spatio-temporal patterns, that leads to significant improvements over standard Random Forests as well as state-of-the-art approaches on several facial expression benchmarks.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Dapogny_Pairwise_Conditional_Random_ICCV_2015_paper.pdf", - "aff": "Sorbonne Universit \u00b4es, UPMC Univ Paris 06, CNRS, ISIR UMR 7222, 4 place Jussieu 75005 Paris; Sorbonne Universit \u00b4es, UPMC Univ Paris 06, CNRS, ISIR UMR 7222, 4 place Jussieu 75005 Paris; Sorbonne Universit \u00b4es, UPMC Univ Paris 06, CNRS, ISIR UMR 7222, 4 place Jussieu 75005 Paris", + "aff": "Sorbonne Universit ´es, UPMC Univ Paris 06, CNRS, ISIR UMR 7222, 4 place Jussieu 75005 Paris; Sorbonne Universit ´es, UPMC Univ Paris 06, CNRS, ISIR UMR 7222, 4 place Jussieu 75005 Paris; Sorbonne Universit ´es, UPMC Univ Paris 06, CNRS, ISIR UMR 7222, 4 place Jussieu 75005 Paris", "project": "", "github": "", "supp": "", @@ -11304,14 +11662,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dapogny_Pairwise_Conditional_Random_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Sorbonne Universit\u00e9s", + "aff_unique_norm": "Sorbonne Universités", "aff_unique_dep": "", "aff_unique_url": "https://www.sorbonne-universite.fr", "aff_unique_abbr": "Sorbonne", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Dapogny_2015_ICCV,\n \n author = {\n Dapogny,\n Arnaud and Bailly,\n Kevin and Dubuisson,\n Severine\n},\n title = {\n Pairwise Conditional Random Forests for Facial Expression Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "eca827a228", @@ -11345,7 +11704,8 @@ 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Jiang_2015_ICCV,\n \n author = {\n Jiang,\n Yiyong and Ding,\n Xinghao and Zeng,\n Delu and Huang,\n Yue and Paisley,\n John\n},\n title = {\n Pan-Sharpening With a Hyper-Laplacian Penalty\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a2a8e99abc", @@ -11379,7 +11739,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;1;1;1;0;0;0;2;0", - "aff_country_unique": "United States;China;Japan" + "aff_country_unique": "United States;China;Japan", + "bibtex": "@InProceedings{Joo_2015_ICCV,\n \n author = {\n Joo,\n Hanbyul and Liu,\n Hao and Tan,\n Lei and Gui,\n Lin and Nabbe,\n Bart and Matthews,\n Iain and Kanade,\n Takeo and Nobuhara,\n Shohei and Sheikh,\n Yaser\n},\n title = {\n Panoptic Studio: A Massively Multiview System for Social Motion Capture\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "85029a0f1f", @@ -11392,7 +11753,7 @@ "author": "Puneet K. Dokania; M. Pawan Kumar", "abstract": "We propose a new family of discrete energy minimization problems, which we call parsimonious labeling. Our energy function consists of unary potentials and high-order clique potentials. While the unary potentials are arbitrary, the clique potentials are proportional to the diversity of the set of unique labels assigned to the clique. Intuitively, our energy function encourages the labeling to be parsimonious, that is, use as few labels as possible. 
This in turn allows us to capture useful cues for important computer vision applications such as stereo correspondence and image denoising. Furthermore, we propose an efficient graph-cuts based algorithm for the parsimonious labeling problem that provides strong theoretical guarantees on the quality of the solution. Our algorithm consists of three steps. First, we approximate a given diversity using a mixture of a novel hierarchical Pn Potts model. Second, we use a divide-and-conquer approach for each mixture component, where each subproblem is solved using an efficient alpha-expansion algorithm. This provides us with a small number of putative labelings, one for each mixture component. Third, we choose the best putative labeling in terms of the energy value. Using both synthetic and standard real datasets, we show that our algorithm significantly outperforms other graph-cuts based approaches.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Dokania_Parsimonious_Labeling_ICCV_2015_paper.pdf", - "aff": "CentraleSup\u00e9lec and INRIA Saclay; CentraleSup\u00e9lec and INRIA Saclay", + "aff": "CentraleSupélec and INRIA Saclay; CentraleSupélec and INRIA Saclay", "project": "", "github": "", "supp": "", @@ -11406,14 +11767,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dokania_Parsimonious_Labeling_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "CentraleSup\u00e9lec", + "aff_unique_norm": "CentraleSupélec", "aff_unique_dep": "", "aff_unique_url": "https://www.centralesupelec.fr", - "aff_unique_abbr": "CentraleSup\u00e9lec", + "aff_unique_abbr": "CentraleSupélec", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Dokania_2015_ICCV,\n \n author = {\n Dokania,\n Puneet K. and Kumar,\n M. 
Pawan\n},\n title = {\n Parsimonious Labeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "83affaa839", @@ -11447,7 +11809,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0;1;0;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Zheng_2015_ICCV,\n \n author = {\n Zheng,\n Wei-Shi and Li,\n Xiang and Xiang,\n Tao and Liao,\n Shengcai and Lai,\n Jianhuang and Gong,\n Shaogang\n},\n title = {\n Partial Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7fdaa54b0c", @@ -11460,7 +11823,7 @@ "author": "Jun Xu; Lei Zhang; Wangmeng Zuo; David Zhang; Xiangchu Feng", "abstract": "Patch based image modeling has achieved a great success in low level vision such as image denoising. In particular, the use of image nonlocal self-similarity (NSS) prior, which refers to the fact that a local patch often has many nonlocal similar patches to it across the image, has significantly enhanced the denoising performance. However, in most existing methods only the NSS of input degraded image is exploited, while how to utilize the NSS of clean natural images is still an open problem. In this paper, we propose a patch group (PG) based NSS prior learning scheme to learn explicit NSS models from natural images for high performance denoising. PGs are extracted from training images by putting nonlocal similar patches into groups, and a PG based Gaussian Mixture Model (PG-GMM) learning algorithm is developed to learn the NSS prior. 
We demonstrate that, owe to the learned PG-GMM, a simple weighted sparse coding model, which has a closed-form solution, can be used to perform image denoising effectively, resulting in high PSNR measure, fast speed, and particularly the best visual quality among all competing methods.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Xu_Patch_Group_Based_ICCV_2015_paper.pdf", - "aff": "Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; School of Computer Science and Technology, Harbin Institute of Technology, Harbin, China; Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; School of Mathematics and Statistics, Xidian University, Xi\u2019an, China", + "aff": "Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; School of Computer Science and Technology, Harbin Institute of Technology, Harbin, China; Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; School of Mathematics and Statistics, Xidian University, Xi’an, China", "project": "", "github": "", "supp": "", @@ -11474,14 +11837,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xu_Patch_Group_Based_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0;2", - "aff_unique_norm": "Hong Kong Polytechnic University;Harbin Institute of Technology;Xidian University", + "aff_unique_norm": "The Hong Kong Polytechnic University;Harbin Institute of Technology;Xidian University", "aff_unique_dep": "Dept. 
of Computing;School of Computer Science and Technology;School of Mathematics and Statistics", "aff_unique_url": "https://www.polyu.edu.hk;http://www.hit.edu.cn/;http://www.xidian.edu.cn/", "aff_unique_abbr": "PolyU;HIT;Xidian", "aff_campus_unique_index": "0;0;1;0;2", "aff_campus_unique": "Hong Kong;Harbin;Xi'an", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2015_ICCV,\n \n author = {\n Xu,\n Jun and Zhang,\n Lei and Zuo,\n Wangmeng and Zhang,\n David and Feng,\n Xiangchu\n},\n title = {\n Patch Group Based Nonlocal Self-Similarity Prior Learning for Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "977b22d000", @@ -11506,7 +11870,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liu_PatchMatch-Based_Automatic_Lattice_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liu_PatchMatch-Based_Automatic_Lattice_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Siying and Ng,\n Tian-Tsong and Sunkavalli,\n Kalyan and Do,\n Minh N. 
and Shechtman,\n Eli and Carr,\n Nathan\n},\n title = {\n PatchMatch-Based Automatic Lattice Detection for Near-Regular Textures\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4ff7e595c9", @@ -11533,14 +11898,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yi_Pedestrian_Travel_Time_ICCV_2015_paper.html", "aff_unique_index": "0+1;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;Chinese Academy of Sciences", "aff_unique_dep": "Department of Electronic Engineering;Shenzhen Institutes of Advanced Technology", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.siat.cas.cn", "aff_unique_abbr": "CUHK;SIAT", "aff_campus_unique_index": "0+1;0+1;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yi_2015_ICCV,\n \n author = {\n Yi,\n Shuai and Li,\n Hongsheng and Wang,\n Xiaogang\n},\n title = {\n Pedestrian Travel Time Estimation in Crowded Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8b589cb64f", @@ -11565,7 +11931,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Korman_Peeking_Template_Matching_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Korman_Peeking_Template_Matching_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Korman_2015_ICCV,\n \n author = {\n Korman,\n Simon and Ofek,\n Eyal and Avidan,\n Shai\n},\n title = {\n Peeking Template Matching for Depth Extension\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer 
Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "68e5c5d92a", @@ -11574,7 +11941,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Borislav Antic, Bj\u00f6rn Ommer", + "author_site": "Borislav Antic, Björn Ommer", "author": "Borislav Antic; Bjorn Ommer", "abstract": "Object, action, or scene representations that are corrupted by noise significantly impair the performance of visual recognition. Typically, partial occlusion, clutter, or excessive articulation affects only a subset of all feature dimensions and, most importantly, different dimensions are corrupted in different samples. Nevertheless, the common approach to this problem in feature selection and kernel methods is to down-weight or eliminate entire training samples or the same dimensions of all samples. Thus, valuable signal is lost, resulting in suboptimal classification. Our goal is, therefore, to adjust the contribution of individual feature dimensions when comparing any two samples and computing their similarity. Consequently, per-sample selection of informative dimensions is directly integrated into kernel computation. The interrelated problems of learning the parameters of a kernel classifier and determining the informative components of each sample are then addressed in a joint objective function. The approach can be integrated into the learning stage of any kernel-based visual recognition problem and it does not affect the computational performance in the retrieval phase. 
Experiments on diverse challenges of action recognition in videos and indoor scene classification show the general applicability of the approach and its ability to improve learning of visual representations.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Antic_Per-Sample_Kernel_Adaptation_ICCV_2015_paper.pdf", @@ -11599,7 +11966,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Antic_2015_ICCV,\n \n author = {\n Antic,\n Borislav and Ommer,\n Bjorn\n},\n title = {\n Per-Sample Kernel Adaptation for Visual Recognition and Grouping\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f67f1f558a", @@ -11608,7 +11976,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Jorge Garc\u00eda, Niki Martinel, Christian Micheloni, Alfredo Gardel", + "author_site": "Jorge García, Niki Martinel, Christian Micheloni, Alfredo Gardel", "author": "Jorge Garcia; Niki Martinel; Christian Micheloni; Alfredo Gardel", "abstract": "Person re-identification is an open and challenging problem in computer vision. Existing re-identification approaches focus on optimal methods for features matching (e.g., metric learning approaches) or study the inter-camera transformations of such features. These methods hardly ever pay attention to the problem of visual ambiguities shared between the first ranks. In this paper, we focus on such a problem and introduce an unsupervised ranking optimization approach based on discriminant context information analysis. The proposed approach refines a given initial ranking by removing the visual ambiguities common to first ranks. This is achieved by analyzing their content and context information. 
Extensive experiments on three publicly available benchmark datasets and different baseline methods have been conducted. Results demonstrate a remarkable improvement in the first positions of the ranking. Regardless of the selected dataset, state-of-the-art methods are strongly outperformed by our method.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Garcia_Person_Re-Identification_Ranking_ICCV_2015_paper.pdf", @@ -11633,7 +12001,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "Spain;Italy" + "aff_country_unique": "Spain;Italy", + "bibtex": "@InProceedings{Garcia_2015_ICCV,\n \n author = {\n Garcia,\n Jorge and Martinel,\n Niki and Micheloni,\n Christian and Gardel,\n Alfredo\n},\n title = {\n Person Re-Identification Ranking Optimisation by Discriminant Context Information Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6b6f65c8f2", @@ -11660,14 +12029,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Shen_Person_Re-Identification_With_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;2;3", - "aff_unique_norm": "Shanghai Jiao Tong University;Zhengzhou University;Nanjing University;Microsoft", - "aff_unique_dep": "Dept. of Electronic Engineering;School of Information Engineering;National Key Lab for Novel Software Technology;Microsoft Research", + "aff_unique_norm": "Shanghai Jiao Tong University;Zhengzhou University;Nanjing University;Microsoft Research", + "aff_unique_dep": "Dept. 
of Electronic Engineering;School of Information Engineering;National Key Lab for Novel Software Technology;", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.zzu.edu.cn;http://www.nju.edu.cn;https://www.microsoft.com/en-us/research/group/microsoft-research-asia", "aff_unique_abbr": "SJTU;;Nanjing U;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shen_2015_ICCV,\n \n author = {\n Shen,\n Yang and Lin,\n Weiyao and Yan,\n Junchi and Xu,\n Mingliang and Wu,\n Jianxin and Wang,\n Jingdong\n},\n title = {\n Person Re-Identification With Correspondence Structure Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7cfd5f5541", @@ -11701,7 +12071,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Troy", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Karanam_2015_ICCV,\n \n author = {\n Karanam,\n Srikrishna and Li,\n Yang and Radke,\n Richard J.\n},\n title = {\n Person Re-Identification With Discriminatively Trained Viewpoint Invariant Dictionaries\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c7fc80a709", @@ -11735,7 +12106,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Oh_2015_ICCV,\n \n author = {\n Oh,\n Seong Joon and Benenson,\n Rodrigo and Fritz,\n Mario and Schiele,\n Bernt\n},\n title = {\n Person Recognition in Personal Photo Collections\n},\n booktitle = {\n Proceedings of the IEEE International 
Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "88f7aafe28", @@ -11748,7 +12120,7 @@ "author": "Xiangbo Shu; Jinhui Tang; Hanjiang Lai; Luoqi Liu; Shuicheng Yan", "abstract": "In this paper, we aim to automatically render aging faces in a personalized way. Basically, a set of age-group specific dictionaries are learned, where the dictionary bases corresponding to the same index yet from different dictionaries form a particular aging process pattern cross different age groups, and a linear combination of these patterns expresses a particular personalized aging process. Moreover, two factors are taken into consideration in the dictionary learning process. First, beyond the aging dictionaries, each subject may have extra personalized facial characteristics, e.g. mole, which are invariant in the aging process. Second, it is challenging or even impossible to collect faces of all age groups for a particular subject, yet much easier and more practical to get face pairs from neighboring age groups. Thus a personality-aware coupled reconstruction loss is utilized to learn the dictionaries based on face pairs from neighboring age groups. 
Extensive experiments well demonstrate the advantages of our proposed solution over other state-of-the-arts in term of personalized aging progression, as well as the performance gain for cross-age face verification by synthesizing aging faces.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Shu_Personalized_Age_Progression_ICCV_2015_paper.pdf", - "aff": "School of Computer Science and Engineering, Nanjing University of Science and Technology\u2021; School of Computer Science and Engineering, Nanjing University of Science and Technology\u2021; Department of Electrical and Computer Engineering, National University of Singapore\u00a7; Department of Electrical and Computer Engineering, National University of Singapore\u00a7; Department of Electrical and Computer Engineering, National University of Singapore\u00a7", + "aff": "School of Computer Science and Engineering, Nanjing University of Science and Technology‡; School of Computer Science and Engineering, Nanjing University of Science and Technology‡; Department of Electrical and Computer Engineering, National University of Singapore§; Department of Electrical and Computer Engineering, National University of Singapore§; Department of Electrical and Computer Engineering, National University of Singapore§", "project": "", "github": "", "supp": "", @@ -11765,11 +12137,12 @@ "aff_unique_norm": "Nanjing University of Science and Technology;National University of Singapore", "aff_unique_dep": "School of Computer Science and Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "http://www.nust.edu.cn;https://www.nus.edu.sg", - "aff_unique_abbr": "NJUST;NUS", + "aff_unique_abbr": "NUST;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Shu_2015_ICCV,\n \n author = {\n Shu,\n Xiangbo and Tang,\n Jinhui and Lai,\n 
Hanjiang and Liu,\n Luoqi and Yan,\n Shuicheng\n},\n title = {\n Personalized Age Progression With Aging Dictionary\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ec75fc9098", @@ -11803,7 +12176,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gotardo_2015_ICCV,\n \n author = {\n Gotardo,\n Paulo F. U. and Simon,\n Tomas and Sheikh,\n Yaser and Matthews,\n Iain\n},\n title = {\n Photogeometric Scene Flow for High-Detail Dynamic 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9e425fe294", @@ -11828,7 +12202,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Photometric_Stereo_With_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Photometric_Stereo_With_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Jian and Matsushita,\n Yasuyuki and Shi,\n Boxin and Sankaranarayanan,\n Aswin C.\n},\n title = {\n Photometric Stereo With Small Angular Variations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5380d7fd03", @@ -11853,7 +12228,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Murez_Photometric_Stereo_in_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Murez_Photometric_Stereo_in_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Murez_2015_ICCV,\n \n author = {\n Murez,\n Zak and 
Treibitz,\n Tali and Ramamoorthi,\n Ravi and Kriegman,\n David\n},\n title = {\n Photometric Stereo in a Scattering Medium\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b7ad69a3e6", @@ -11878,7 +12254,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Piecewise_Flat_Embedding_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yu_Piecewise_Flat_Embedding_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Yu_2015_ICCV,\n \n author = {\n Yu,\n Yizhou and Fang,\n Chaowei and Liao,\n Zicheng\n},\n title = {\n Piecewise Flat Embedding for Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cc89e144d7", @@ -11887,7 +12264,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Simon Donn\u00e9, Bart Goossens, Wilfried Philips", + "author_site": "Simon Donné, Bart Goossens, Wilfried Philips", "author": "Simon Donne; Bart Goossens; Wilfried Philips", "abstract": "Multi-camera triangulation of feature points based on a minimisation of the overall L2 reprojection error can get stuck in suboptimal local minima or require slow global optimisation. For this reason, researchers have proposed optimising the L-infinity norm of the L2 single view reprojection errors, which avoids the problem of local minima entirely. In this paper we present a novel method for L-infinity triangulation that minimizes the L-infinity norm of the L-infinity reprojection errors: this apparently small difference leads to a much faster but equally accurate solution which is related to the MLE under the assumption of uniform noise. The proposed method adopts a new optimisation strategy based on solving simple quadratic equations. 
This stands in contrast with the fastest existing methods, which solve a sequence of more complex auxiliary Linear Programming or Second Order Cone Problems. The proposed algorithm performs well: for triangulation, it achieves the same accuracy as existing techniques while executing faster and being straightforward to implement.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Donne_Point_Triangulation_Through_ICCV_2015_paper.pdf", @@ -11912,7 +12289,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Donne_2015_ICCV,\n \n author = {\n Donne,\n Simon and Goossens,\n Bart and Philips,\n Wilfried\n},\n title = {\n Point Triangulation Through Polyhedron Collapse Using the l[?] Norm\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0c8a76fa3e", @@ -11946,7 +12324,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1+0;2+0;0", - "aff_country_unique": "United States;Russian Federation;Singapore" + "aff_country_unique": "United States;Russia;Singapore", + "bibtex": "@InProceedings{Kadambi_2015_ICCV,\n \n author = {\n Kadambi,\n Achuta and Taamazyan,\n Vage and Shi,\n Boxin and Raskar,\n Ramesh\n},\n title = {\n Polarized 3D: High-Quality Depth Sensing With Polarization Cues\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "03d5e3d500", @@ -11955,7 +12334,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Shubham Tulsiani, Jo\u00e3o Carreira, Jitendra Malik", + "author_site": "Shubham Tulsiani, João Carreira, Jitendra Malik", "author": "Shubham Tulsiani; Joao Carreira; Jitendra Malik", "abstract": "We address the 
task of predicting pose for objects of unannotated object categories from a small seed set of annotated object classes. We present a generalized classifier that can reliably induce pose given a single instance of a novel category. In case of availability of a large collection of novel instances, our approach then jointly reasons over all instances to improve the initial estimates. We empirically validate the various components of our algorithm and quantitatively show that our method produces reliable pose estimates. We also show qualitative results on a diverse set of classes and further demonstrate the applicability of our system for learning shape models of novel object classes.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Tulsiani_Pose_Induction_for_ICCV_2015_paper.pdf", @@ -11980,7 +12359,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tulsiani_2015_ICCV,\n \n author = {\n Tulsiani,\n Shubham and Carreira,\n Joao and Malik,\n Jitendra\n},\n title = {\n Pose Induction for Novel Object Categories\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4a521c6968", @@ -12014,7 +12394,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jourabloo_2015_ICCV,\n \n author = {\n Jourabloo,\n Amin and Liu,\n Xiaoming\n},\n title = {\n Pose-Invariant 3D Face Alignment\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c018776bcd", @@ -12027,7 +12408,7 @@ "author": "Alex Kendall; Matthew Grimes; 
Roberto Cipolla", "abstract": "We present a robust and real-time monocular six degree of freedom relocalization system. Our system trains a convolutional neural network to regress the 6-DOF camera pose from a single RGB image in an end-to-end manner with no need of additional engineering or graph optimisation. The algorithm can operate indoors and outdoors in real time, taking 5ms per frame to compute. It obtains approximately 2m and 3 degrees accuracy for large scale outdoor scenes and 0.5m and 5 degrees accuracy indoors. This is achieved using an efficient 23 layer deep convnet, demonstrating that convnets can be used to solve complicated out of image plane regression problems. This was made possible by leveraging transfer learning from large scale classification data. We show that the PoseNet localizes from high level features and is robust to difficult lighting, motion blur and different camera intrinsics where point based SIFT registration fails. Furthermore we show how the pose feature that is produced generalizes to other scenes allowing us to regress pose with only a few dozen training examples.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kendall_PoseNet_A_Convolutional_ICCV_2015_paper.pdf", - "aff": "University of Cambridge; University of Cambridge; King\u2019s College Old Hospital Shop Fac \u00b8ade St Mary\u2019s Church", + "aff": "University of Cambridge; University of Cambridge; King’s College Old Hospital Shop Fac ¸ade St Mary’s Church", "project": "mi.eng.cam.ac.uk/projects/relocalisation/", "github": "", "supp": "", @@ -12048,7 +12429,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Kendall_2015_ICCV,\n \n author = {\n Kendall,\n Alex and Grimes,\n Matthew and Cipolla,\n Roberto\n},\n title = {\n PoseNet: A Convolutional Network for Real-Time 6-DOF Camera 
Relocalization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "74f5ad4f4e", @@ -12082,7 +12464,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Ba_2015_ICCV,\n \n author = {\n Ba,\n Jimmy Lei and Swersky,\n Kevin and Fidler,\n Sanja and salakhutdinov,\n Ruslan\n},\n title = {\n Predicting Deep Zero-Shot Convolutional Neural Networks Using Textual Descriptions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "433dc639b6", @@ -12109,14 +12492,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Eigen_Predicting_Depth_Surface_ICCV_2015_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "New York University;Meta", + "aff_unique_norm": "New York University;Facebook", "aff_unique_dep": "Dept. 
of Computer Science;Facebook AI Research", "aff_unique_url": "https://www.nyu.edu;https://research.facebook.com", "aff_unique_abbr": "NYU;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Eigen_2015_ICCV,\n \n author = {\n Eigen,\n David and Fergus,\n Rob\n},\n title = {\n Predicting Depth,\n Surface Normals and Semantic Labels With a Common Multi-Scale Convolutional Architecture\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e73bb4b6b2", @@ -12140,7 +12524,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Predicting_Good_Features_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Predicting_Good_Features_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n Hyo Jin and Dunn,\n Enrique and Frahm,\n Jan-Michael\n},\n title = {\n Predicting Good Features for Image Geo-Localization Using Per-Bundle VLAD\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8d0f8d205e", @@ -12165,7 +12550,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dey_Predicting_Multiple_Structured_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Dey_Predicting_Multiple_Structured_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Dey_2015_ICCV,\n \n author = {\n Dey,\n Debadeepta and Ramakrishna,\n Varun and Hebert,\n Martial and Bagnell,\n J. 
Andrew\n},\n title = {\n Predicting Multiple Structured Visual Interpretations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "965b9b7cad", @@ -12174,7 +12560,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Julia Kr\u00fcger, Jan Ehrhardt, Heinz Handels", + "author_site": "Julia Krüger, Jan Ehrhardt, Heinz Handels", "author": "Julia Kruger; Jan Ehrhardt; Heinz Handels", "abstract": "Statistical shape and appearance models are often based on the accurate identification of one-to-one correspondences in a training data set. At the same time, the determination of these corresponding landmarks is the most challenging part of such methods. Hufnagel etal developed an alternative method using correspondence probabilities for a statistical shape model. We propose the use of probabilistic correspondences for statistical appearance models by incorporating appearance information into the framework. A point-based representation is employed representing the image by a set of vectors assembling position and appearances. Using probabilistic correspondences between these multi-dimensional feature vectors eliminates the need for extensive preprocessing to find corresponding landmarks and reduces the dependence of the generated model on the landmark positions. Then, a maximum a-posteriori approach is used to derive a single global optimization criterion with respect to model parameters and observation dependent parameters, that directly affects shape and appearance information of the considered structures. Model generation and fitting can be expressed by optimizing the same criterion. The developed framework describes the modeling process in a concise and flexible mathematical way and allows for additional constraints as topological regularity in the modeling process. Furthermore, it eliminates the demand for costly correspondence determination. 
We apply the model for segmentation and landmark identification in hand X-ray images, where segmentation information is modeled as further features in the vectorial image representation. The results demonstrate the feasibility of the model to reconstruct contours and landmarks for unseen test images. Furthermore, we apply the model for tissue classification, where a model is generated for healthy brain tissue using 2D MRI slices. Applying the model to images of stroke patients the probabilistic correspondences are used to classify between healthy and pathological structures. The results demonstrate the ability of the probabilistic model to recognize healthy and pathological tissue automatically.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kruger_Probabilistic_Appearance_Models_ICCV_2015_paper.pdf", @@ -12199,7 +12585,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Kruger_2015_ICCV,\n \n author = {\n Kruger,\n Julia and Ehrhardt,\n Jan and Handels,\n Heinz\n},\n title = {\n Probabilistic Appearance Models for Segmentation and Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d0994d48c2", @@ -12227,13 +12614,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ding_Probabilistic_Label_Relation_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;University of Michigan", - "aff_unique_dep": "Google;", + "aff_unique_dep": ";", "aff_unique_url": "https://www.google.com;https://www.umich.edu", "aff_unique_abbr": "Google;UM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Ding_2015_ICCV,\n \n author = {\n Ding,\n Nan and Deng,\n Jia and Murphy,\n Kevin P. and Neven,\n Hartmut\n},\n title = {\n Probabilistic Label Relation Graphs With Ising Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "606ebe6bb4", @@ -12242,7 +12630,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "\u0130lke Demir, Daniel G. Aliaga, Bedrich Benes", + "author_site": "İlke Demir, Daniel G. Aliaga, Bedrich Benes", "author": "Ilke Demir; Daniel G. Aliaga; Bedrich Benes", "abstract": "Thanks to the recent advances in computational photography and remote sensing, point clouds of buildings are becoming increasingly available, yet their processing poses various challenges. In our work, we tackle the problem of point cloud completion and editing and we approach it via inverse procedural modeling. Contrary to the previous work, our approach operates directly on the point cloud without an intermediate triangulation. Our approach consists of 1) semi-automatic segmentation of the input point cloud with segment comparison and template matching to detect repeating structures, 2) a consensus-based voting schema and a pattern extraction algorithm to discover completed terminal geometry and their patterns of usage, all encoded into a context-free grammar, and 3) an interactive editing tool where the user can create new point clouds by using procedural copy and paste operations, and smart resizing. We demonstrate our approach on editing of building models with up to 1.8M points. 
In our implementation, preprocessing takes up to several minutes and a single editing operation needs from one second to one minute depending on the model size and the operation type.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Demir_Procedural_Editing_of_ICCV_2015_paper.pdf", @@ -12267,7 +12655,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Demir_2015_ICCV,\n \n author = {\n Demir,\n Ilke and Aliaga,\n Daniel G. and Benes,\n Bedrich\n},\n title = {\n Procedural Editing of 3D Building Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1425686ae5", @@ -12301,7 +12690,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Newcastle upon Tyne", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Li and Yu,\n Mengyang and Shao,\n Ling\n},\n title = {\n Projection Bank: From High-Dimensional Data to Medium-Length Binary Codes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "449b687146", @@ -12326,7 +12716,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Sironi_Projection_Onto_the_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Sironi_Projection_Onto_the_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Sironi_2015_ICCV,\n \n author = {\n Sironi,\n Amos and Lepetit,\n Vincent and Fua,\n Pascal\n},\n title = {\n Projection Onto the Manifold of Elongated Structures for Accurate Extraction\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6dd3aa9dad", @@ -12353,14 +12744,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Cheng_Query_Adaptive_Similarity_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;1;0;3;1", - "aff_unique_norm": "Chinese Academy of Sciences Institute of Automation;Microsoft;Sun Yat-sen University;Chinese Academy of Sciences", + "aff_unique_norm": "Chinese Academy of Sciences Institute of Automation;Microsoft Corporation;Sun Yat-Sen University;Chinese Academy of Sciences", "aff_unique_dep": "CRIPAC (Computational Intelligence & Pattern Analysis Group) & NLPR (National Laboratory of Pattern Recognition);Microsoft Research;;Center for Excellence in Brain Science and Intelligence Technology", "aff_unique_url": "http://www.ia.cas.cn;https://www.microsoft.com/en-us/research;http://www.sysu.edu.cn/;http://www.cas.cn/", "aff_unique_abbr": "CASIA;MSR;SYSU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Cheng_2015_ICCV,\n \n author = {\n Cheng,\n Yanhua and Cai,\n Rui and Zhang,\n Chi and Li,\n Zhiwei and Zhao,\n Xin and Huang,\n Kaiqi and Rui,\n Yong\n},\n title = {\n Query Adaptive Similarity Measure for RGB-D Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a58aa5a0fc", @@ -12394,7 +12786,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "South Korea;China" + "aff_country_unique": "South Korea;China", + "bibtex": "@InProceedings{Kwon_2015_ICCV,\n \n author = {\n Kwon,\n Hyeokhyen and Tai,\n Yu-Wing\n},\n title = {\n RGB-Guided 
Hyperspectral Image Upsampling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2d417000ff", @@ -12428,7 +12821,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Alahi_2015_ICCV,\n \n author = {\n Alahi,\n Alexandre and Haque,\n Albert and Fei-Fei,\n Li\n},\n title = {\n RGB-W: When Vision Meets Wireless\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8e4d034a8d", @@ -12452,7 +12846,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xie_RIDE_Reversal_Invariant_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Xie_RIDE_Reversal_Invariant_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Xie_2015_ICCV,\n \n author = {\n Xie,\n Lingxi and Wang,\n Jingdong and Lin,\n Weiyao and Zhang,\n Bo and Tian,\n Qi\n},\n title = {\n RIDE: Reversal Invariant Descriptor Enhancement\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6483bddb9a", @@ -12461,7 +12856,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Roman Jur\u00e1nek, Adam Herout, Mark\u00e9ta Dubsk\u00e1, Pavel Zem\u010d\u00edk", + "author_site": "Roman Juránek, Adam Herout, Markéta Dubská, Pavel Zemčík", "author": "Roman Juranek; Adam Herout; Marketa Dubska; Pavel Zemcik", "abstract": "We present an object detector coupled with pose estimation directly in a single compact and simple model, where the detector shares extracted image features with the pose estimator. 
The output of the classification of each candidate window consists of both object score and likelihood map of poses. This extension introduces negligible overhead during detection so that the detector is still capable of real time operation. We evaluated the proposed approach on the problem of vehicle detection. We used existing datasets with viewpoint/pose annotation (WCVP, 3D objects, KITTI). Besides that, we collected a new traffic surveillance dataset COD20k which fills certain gaps of the existing datasets and we make it public. The experimental results show that the proposed approach is comparable with state-of-the-art approaches in terms of accuracy, but it is considerably faster - easily operating in real time (Matlab with C++ code). The source codes and the collected COD20k dataset are made public along with the paper.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Juranek_Real-Time_Pose_Estimation_ICCV_2015_paper.pdf", @@ -12486,7 +12881,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Brno", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Juranek_2015_ICCV,\n \n author = {\n Juranek,\n Roman and Herout,\n Adam and Dubska,\n Marketa and Zemcik,\n Pavel\n},\n title = {\n Real-Time Pose Estimation Piggybacked on Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "03b1771f3a", @@ -12495,7 +12891,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Juan Jos\u00e9 Tarrio, Sol Pedre", + "author_site": "Juan José Tarrio, Sol Pedre", "author": "Juan Jose Tarrio; Sol Pedre", "abstract": "In this work we present a novel algorithm for realtime visual odometry for a monocular camera. 
The main idea is to develop an approach between classical feature-based visual odometry systems and modern direct dense/semi-dense methods, trying to benefit from the best attributes of both. Similar to feature-based systems, we extract information from the images, instead of working with raw image intensities as direct methods. In particular, the information extracted are the edges present in the image, while the rest of the algorithm is designed to take advantage of the structural information provided when pixels are treated as edges. Edge extraction is an efficient and higly parallelizable operation. The edge depth information extracted is dense enough to allow acceptable surface fitting, similar to modern semi-dense methods. This is a valuable attribute that feature-based odometry lacks. Experimental results show that the proposed method has similar drift than state of the art feature-based and direct methods, and is a simple algorithm that runs at realtime and can be parallelized. Finally, we have also developed an inertial aided version that successfully stabilizes an unmanned air vehicle in complex indoor environments using only a frontal camera, while running the complete solution in the embedded hardware on board the vehicle.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Tarrio_Realtime_Edge-Based_Visual_ICCV_2015_paper.pdf", @@ -12511,7 +12907,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tarrio_Realtime_Edge-Based_Visual_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Tarrio_Realtime_Edge-Based_Visual_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Tarrio_2015_ICCV,\n \n author = {\n Tarrio,\n Juan Jose and Pedre,\n Sol\n},\n title = {\n Realtime Edge-Based Visual Odometry for a Monocular Camera\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = 
{\n 2015\n} \n}" }, { "id": "fb8261057d", @@ -12535,7 +12932,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fragkiadaki_Recurrent_Network_Models_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fragkiadaki_Recurrent_Network_Models_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Fragkiadaki_2015_ICCV,\n \n author = {\n Fragkiadaki,\n Katerina and Levine,\n Sergey and Felsen,\n Panna and Malik,\n Jitendra\n},\n title = {\n Recurrent Network Models for Human Dynamics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7d203768d4", @@ -12569,7 +12967,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chakraborty_2015_ICCV,\n \n author = {\n Chakraborty,\n Rudrasis and Vemuri,\n Baba C.\n},\n title = {\n Recursive Frechet Mean Computation on the Grassmannian and its Applications to Computer Vision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a055dee0df", @@ -12596,14 +12995,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Nair_Reflection_Modeling_for_ICCV_2015_paper.html", "aff_unique_index": "0;1;0+2;2", - "aff_unique_norm": "Heidelberg University;Microsoft;Technical University Dresden", - "aff_unique_dep": "Heidelberg Collaboratory for Image Processing;Microsoft Research;Computer Vision Lab", + "aff_unique_norm": "Heidelberg University;Microsoft Research;Technical University Dresden", + "aff_unique_dep": "Heidelberg Collaboratory for Image Processing;;Computer Vision Lab", "aff_unique_url": 
"https://www.uni-heidelberg.de;https://www.microsoft.com/en-us/research;https://tu-dresden.de", "aff_unique_abbr": "Uni Heidelberg;MSR;TUD", "aff_campus_unique_index": "0;1;0+2;2", "aff_campus_unique": "Heidelberg;Cambridge;Dresden", "aff_country_unique_index": "0;1;0+0;0", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Nair_2015_ICCV,\n \n author = {\n Nair,\n Rahul and Fitzgibbon,\n Andrew and Kondermann,\n Daniel and Rother,\n Carsten\n},\n title = {\n Reflection Modeling for Passive Stereo\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "316aa88058", @@ -12612,7 +13012,7 @@ "status": "Oral", "track": "main", "pid": "", - "author_site": "Tobias Pl\u00f6tz, Stefan Roth", + "author_site": "Tobias Plötz, Stefan Roth", "author": "Tobias Plotz; Stefan Roth", "abstract": "Many existing approaches for image-to-geometry registration assume that either a textured 3D model or a good initial guess of the 3D pose is available to bootstrap the registration process. In this paper we consider the registration of photographs to 3D models even when no texture information is available. This is very challenging as we cannot rely on texture gradients, and even shading gradients are hard to estimate since the lighting conditions are unknown. To that end, we propose average shading gradients, a rendering technique that estimates the average gradient magnitude over all lighting directions under Lambertian shading. We use this gradient representation as the building block of a registration pipeline based on matching sparse features. To cope with inevitable false matches due to the missing texture information and to increase robustness, the pose of the 3D model is estimated in two stages. 
Coarse pose hypotheses are first obtained from a single correct match each, subsequently refined using SIFT flow, and finally verified. We apply our algorithm to registering images of real-world objects to untextured 3D meshes of limited accuracy.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Plotz_Registering_Images_to_ICCV_2015_paper.pdf", @@ -12630,14 +13030,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Plotz_Registering_Images_to_ICCV_2015_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", + "aff_unique_norm": "Technische Universität Darmstadt", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TU Darmstadt", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darmstadt", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Plotz_2015_ICCV,\n \n author = {\n Plotz,\n Tobias and Roth,\n Stefan\n},\n title = {\n Registering Images to Untextured Geometry Using Average Shading Gradients\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "24e796c07d", @@ -12671,7 +13072,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Tulyakov_2015_ICCV,\n \n author = {\n Tulyakov,\n Sergey and Sebe,\n Nicu\n},\n title = {\n Regressing a 3D Face Shape From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f66954db66", @@ -12684,7 +13086,7 @@ "author": "Gee-Sern Hsu; Kai-Hsiang Chang; Shih-Chieh Huang", "abstract": "Although the Tree Structured Model 
(TSM) is proven effective for solving face detection, pose estimation and landmark localization in an unified model, its sluggish run time makes it unfavorable in practical applications, especially when dealing with cases of multiple faces. We propose the Regressive Tree Structure Model (RTSM) to improve the run-time speed and localization accuracy. The RTSM is composed of two component TSMs, the coarse TSM (c-TSM) and the refined TSM (r-TSM), and a Bilateral Support Vector Regressor (BSVR). The c-TSM is built on the low-resolution octaves of samples so that it provides coarse but fast face detection. The r-TSM is built on the mid-resolution octaves so that it can locate the landmarks on the face candidates given by the c-TSM and improve precision. The r-TSM based landmarks are used in the forward BSVR as references to locate the dense set of landmarks, which are then used in the backward BSVR to relocate the landmarks with large localization errors. The forward and backward regression goes on iteratively until convergence. 
The performance of the RTSM is validated on three benchmark databases, the Multi-PIE, LFPW and AFW, and compared with the latest TSM to demonstrate its efficacy.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Hsu_Regressive_Tree_Structured_ICCV_2015_paper.pdf", - "aff": "Arti\ufb01cial Vision Lab., Dept Mechanical Engineering, National Taiwan University of Science and Technology; Arti\ufb01cial Vision Lab., Dept Mechanical Engineering, National Taiwan University of Science and Technology; Arti\ufb01cial Vision Lab., Dept Mechanical Engineering, National Taiwan University of Science and Technology", + "aff": "Artificial Vision Lab., Dept Mechanical Engineering, National Taiwan University of Science and Technology; Artificial Vision Lab., Dept Mechanical Engineering, National Taiwan University of Science and Technology; Artificial Vision Lab., Dept Mechanical Engineering, National Taiwan University of Science and Technology", "project": "", "github": "", "supp": "", @@ -12705,7 +13107,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hsu_2015_ICCV,\n \n author = {\n Hsu,\n Gee-Sern and Chang,\n Kai-Hsiang and Huang,\n Shih-Chieh\n},\n title = {\n Regressive Tree Structured Model for Facial Landmark Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1bc98e5134", @@ -12739,7 +13142,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Xinggang and Zhu,\n Zhuotun and Yao,\n Cong and Bai,\n Xiang\n},\n title = {\n Relaxed Multiple-Instance SVM With Application to Object Discovery\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f52c84ca48", @@ -12766,14 +13170,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fu_Relaxing_From_Vocabulary_ICCV_2015_paper.html", "aff_unique_index": "0+1;2;1;0;0;1", - "aff_unique_norm": "Chinese Academy of Sciences;Microsoft;University of Science and Technology of China", - "aff_unique_dep": "Institute of Automation;Microsoft Research;", + "aff_unique_norm": "Chinese Academy of Sciences;Microsoft Research;University of Science and Technology of China", + "aff_unique_dep": "Institute of Automation;;", "aff_unique_url": "http://www.ia.cas.cn;https://www.microsoft.com/en-us/research/group/microsoft-research-asia;http://www.ustc.edu.cn", "aff_unique_abbr": "CAS;MSR;USTC", "aff_campus_unique_index": "0+0;1;0;0;0;0", "aff_campus_unique": "Beijing;Hefei", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fu_2015_ICCV,\n \n author = {\n Fu,\n Jianlong and Wu,\n Yue and Mei,\n Tao and Wang,\n Jinqiao and Lu,\n Hanqing and Rui,\n Yong\n},\n title = {\n Relaxing From Vocabulary: Robust Weakly-Supervised Deep Learning for Vocabulary-Free Image Tagging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a89940b2c0", @@ -12807,7 +13212,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Guangzhou;", "aff_country_unique_index": "0+1;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Luo_2015_ICCV,\n \n author = {\n Luo,\n Yu and Xu,\n Yong and Ji,\n Hui\n},\n title = {\n Removing Rain From a Single Image via Discriminative Sparse Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4c92c449e8", @@ -12832,7 +13238,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Su_Render_for_CNN_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Su_Render_for_CNN_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Su_2015_ICCV,\n \n author = {\n Su,\n Hao and Qi,\n Charles R. and Li,\n Yangyan and Guibas,\n Leonidas J.\n},\n title = {\n Render for CNN: Viewpoint Estimation in Images Using CNNs Trained With Rendered 3D Model Views\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d48886cfe6", @@ -12841,7 +13248,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Erroll Wood, Tadas Baltru\u0161aitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, Andreas Bulling", + "author_site": "Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, Andreas Bulling", "author": "Erroll Wood; Tadas Baltrusaitis; Xucong Zhang; Yusuke Sugano; Peter Robinson; Andreas Bulling", "abstract": "Images of the eye are key in several computer vision problems, such as shape registration and gaze estimation. Recent large-scale supervised methods for these problems require time-consuming data collection and manual annotation, which can be unreliable. We propose synthesizing perfectly labelled photo-realistic training data in a fraction of the time. We used computer graphics techniques to build a collection of dynamic eye-region models from head scan geometry. These were randomly posed to synthesize close-up eye images for a wide range of head poses, gaze directions, and illumination conditions. We used our model's controllability to verify the importance of realistic illumination and shape variations in eye-region training data. 
Finally, we demonstrate the benefits of our synthesized training data (SynthesEyes) by out-performing state-of-the-art methods for eye-shape registration as well as cross-dataset appearance-based gaze estimation in the wild.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Wood_Rendering_of_Eyes_ICCV_2015_paper.pdf", @@ -12866,7 +13273,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;1;1;0;1", - "aff_country_unique": "United Kingdom;Germany" + "aff_country_unique": "United Kingdom;Germany", + "bibtex": "@InProceedings{Wood_2015_ICCV,\n \n author = {\n Wood,\n Erroll and Baltrusaitis,\n Tadas and Zhang,\n Xucong and Sugano,\n Yusuke and Robinson,\n Peter and Bulling,\n Andreas\n},\n title = {\n Rendering of Eyes for Eye-Shape Registration and Gaze Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "fd0d575af0", @@ -12893,14 +13301,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yang_Resolving_Scale_Ambiguity_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0+2", - "aff_unique_norm": "University of Delaware;Microsoft;ShanghaiTech University", + "aff_unique_norm": "University of Delaware;Microsoft Corporation;ShanghaiTech University", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://www.udel.edu;https://www.microsoft.com/en-us/research;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "UD;MSR;ShanghaiTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Yang_2015_ICCV,\n \n author = {\n Yang,\n Wei and Lin,\n Haiting and Kang,\n Sing Bing and Yu,\n Jingyi\n},\n title = {\n Resolving Scale Ambiguity Via XSlit Aspect Ratio Analysis\n},\n booktitle = 
{\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8d9e55ac95", @@ -12925,7 +13334,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wu_Robust_Facial_Landmark_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wu_Robust_Facial_Landmark_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wu_2015_ICCV,\n \n author = {\n Wu,\n Yue and Ji,\n Qiang\n},\n title = {\n Robust Facial Landmark Detection Under Significant Head Poses and Occlusion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7f43d55761", @@ -12959,7 +13369,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Lam_2015_ICCV,\n \n author = {\n Lam,\n Antony and Kuno,\n Yoshinori\n},\n title = {\n Robust Heart Rate Measurement From Video Using Select Random Patches\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "40784309ca", @@ -12986,14 +13397,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fu_Robust_Image_Segmentation_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "University of Southern California;Microsoft", + "aff_unique_norm": "University of Southern California;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.usc.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "USC;MSR", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + 
"aff_country_unique": "United States", + "bibtex": "@InProceedings{Fu_2015_ICCV,\n \n author = {\n Fu,\n Xiang and Wang,\n Chien-Yi and Chen,\n Chen and Wang,\n Changhu and Kuo,\n C.-C. Jay\n},\n title = {\n Robust Image Segmentation Using Contour-Guided Color Palettes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "75b7451cec", @@ -13020,14 +13432,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Meyer_Robust_Model-Based_3D_ICCV_2015_paper.html", "aff_unique_index": "0+1;1;1;1;1", - "aff_unique_norm": "University of Illinois Urbana-Champaign;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.nvidia.com", "aff_unique_abbr": "UIUC;NVIDIA", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Meyer_2015_ICCV,\n \n author = {\n Meyer,\n Gregory P. 
and Gupta,\n Shalini and Frosio,\n Iuri and Reddy,\n Dikpal and Kautz,\n Jan\n},\n title = {\n Robust Model-Based 3D Head Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f3021f3d94", @@ -13054,14 +13467,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Guo_Robust_Non-Rigid_Motion_ICCV_2015_paper.html", "aff_unique_index": "0+0;0+0;1;0+0;0+0", - "aff_unique_norm": "Tsinghua University;Microsoft", + "aff_unique_norm": "Tsinghua University;Microsoft Corporation", "aff_unique_dep": "National Laboratory for Information Science and Technology;Microsoft Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "THU;MSR", "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0+0;1;0+0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Guo_2015_ICCV,\n \n author = {\n Guo,\n Kaiwen and Xu,\n Feng and Wang,\n Yangang and Liu,\n Yebin and Dai,\n Qionghai\n},\n title = {\n Robust Non-Rigid Motion Tracking and Surface Reconstruction Using L0 Regularization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0daa6d42c1", @@ -13085,7 +13499,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_Robust_Nonrigid_Registration_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_Robust_Nonrigid_Registration_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Qifeng and Koltun,\n Vladlen\n},\n title = {\n Robust Nonrigid Registration by Convex Optimization\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "da71981258", @@ -13098,7 +13513,7 @@ "author": "Vasileios Belagiannis; Christian Rupprecht; Gustavo Carneiro; Nassir Navab", "abstract": "Convolutional Neural Networks (ConvNets) have successfully contributed to improve the accuracy of regression-based methods for computer vision tasks such as human pose estimation, landmark localization, and object detection. The network optimization has been usually performed with L2 loss and without considering the impact of outliers on the training process, where an outlier in this context is defined by a sample estimation that lies at an abnormal distance from the other training sample estimations in the objective space. In this work, we propose a regression model with ConvNets that achieves robustness to such outliers by minimizing Tukey's biweight function, an M-estimator robust to outliers, as the loss function for the ConvNet. In addition to the robust loss, we introduce a coarse-to-fine model, which processes input images of progressively higher resolutions for improving the accuracy of the regressed values. In our experiments, we demonstrate faster convergence and better generalization of our robust loss function for the tasks of human pose estimation and age estimation from face images. 
We also show that the combination of the robust loss function with the coarse-to-fine model produces comparable or better results than current state-of-the-art approaches in four publicly available human pose estimation datasets.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Belagiannis_Robust_Optimization_for_ICCV_2015_paper.pdf", - "aff": "Computer Aided Medical Procedures, Technische Universit\u00e4t M\u00fcnchen + Visual Geometry Group, Department of Engineering Science, University of Oxford; Computer Aided Medical Procedures, Technische Universit\u00e4t M\u00fcnchen + Johns Hopkins University; Australian Centre for Visual Technologies, University of Adelaide; Computer Aided Medical Procedures, Technische Universit\u00e4t M\u00fcnchen + Johns Hopkins University", + "aff": "Computer Aided Medical Procedures, Technische Universität München + Visual Geometry Group, Department of Engineering Science, University of Oxford; Computer Aided Medical Procedures, Technische Universität München + Johns Hopkins University; Australian Centre for Visual Technologies, University of Adelaide; Computer Aided Medical Procedures, Technische Universität München + Johns Hopkins University", "project": "", "github": "", "supp": "", @@ -13112,14 +13527,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Belagiannis_Robust_Optimization_for_ICCV_2015_paper.html", "aff_unique_index": "0+1;0+2;3;0+2", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;University of Oxford;Johns Hopkins University;University of Adelaide", + "aff_unique_norm": "Technische Universität München;University of Oxford;Johns Hopkins University;University of Adelaide", "aff_unique_dep": "Computer Aided Medical Procedures;Department of Engineering Science;;Australian Centre for Visual Technologies", "aff_unique_url": "https://www.tum.de;https://www.ox.ac.uk;https://www.jhu.edu;https://www.adelaide.edu.au", "aff_unique_abbr": "TUM;Oxford;JHU;Adelaide", 
"aff_campus_unique_index": "1;;2;", "aff_campus_unique": ";Oxford;Adelaide", "aff_country_unique_index": "0+1;0+2;3;0+2", - "aff_country_unique": "Germany;United Kingdom;United States;Australia" + "aff_country_unique": "Germany;United Kingdom;United States;Australia", + "bibtex": "@InProceedings{Belagiannis_2015_ICCV,\n \n author = {\n Belagiannis,\n Vasileios and Rupprecht,\n Christian and Carneiro,\n Gustavo and Navab,\n Nassir\n},\n title = {\n Robust Optimization for Deep Regression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a4a99611d4", @@ -13146,14 +13562,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Shahid_Robust_Principal_Component_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "EPFL;Universita della Svizzera Italiana", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;Universita della Svizzera Italiana", "aff_unique_dep": "Signal Processing Laboratory (LTS2);", "aff_unique_url": "https://www.epfl.ch;https://www.usi.ch", "aff_unique_abbr": "EPFL;USI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Shahid_2015_ICCV,\n \n author = {\n Shahid,\n Nauman and Kalofolias,\n Vassilis and Bresson,\n Xavier and Bronstein,\n Michael and Vandergheynst,\n Pierre\n},\n title = {\n Robust Principal Component Analysis on Graphs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "80e117b36e", @@ -13187,7 +13604,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Station", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + 
"bibtex": "@InProceedings{Lu_2015_ICCV,\n \n author = {\n Lu,\n Yan and Song,\n Dezhen\n},\n title = {\n Robust RGB-D Odometry Using Point and Line Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "24f2c974ab", @@ -13216,12 +13634,13 @@ "aff_unique_index": "0;0;0;0+1", "aff_unique_norm": "Imperial College London;University of Twente", "aff_unique_dep": "Department of Computing;EEMCS", - "aff_unique_url": "https://www.imperial.ac.uk;https://www.utwente.nl/en/", + "aff_unique_url": "https://www.imperial.ac.uk;https://www.utwente.nl", "aff_unique_abbr": "Imperial;UT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "United Kingdom;Netherlands" + "aff_country_unique": "United Kingdom;Netherlands", + "bibtex": "@InProceedings{Sagonas_2015_ICCV,\n \n author = {\n Sagonas,\n Christos and Panagakis,\n Yannis and Zafeiriou,\n Stefanos and Pantic,\n Maja\n},\n title = {\n Robust Statistical Face Frontalization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "eaa7b3e8ab", @@ -13230,7 +13649,7 @@ "status": "Oral", "track": "main", "pid": "", - "author_site": "Danda Pani Paudel, Adlane Habed, C\u00e9dric Demonceaux, Pascal Vasseur", + "author_site": "Danda Pani Paudel, Adlane Habed, Cédric Demonceaux, Pascal Vasseur", "author": "Danda Pani Paudel; Adlane Habed; Cedric Demonceaux; Pascal Vasseur", "abstract": "This paper deals with the problem of registering a known structured 3D scene and its metric Structure-from-Motion (SfM) counterpart. The proposed work relies on a prior plane segmentation of the 3D scene and aligns the data obtained from both modalities by solving the point-to-plane assignment problem. 
An inliers-maximization approach within a Branch-and-Bound (BnB) search scheme is adopted. For the first time in this paper, a Sum-of-Squares optimization theory framework is employed for identifying point-to-plane mismatches (i.e. outliers) with certainty. This allows us to iteratively build potential inliers sets and converge to the solution satisfied by the largest number of point-to-plane assignments. Furthermore, our approach is boosted by new plane visibility conditions which are also introduced in this paper. Using this framework, we solve the registration problem in two cases: (i) a set of putative point-to-plane correspondences (with possibly overwhelmingly many outliers) is given as input and (ii) no initial correspondences are given. In both cases, our approach yields outstanding results in terms of robustness and optimality.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Paudel_Robust_and_Optimal_ICCV_2015_paper.pdf", @@ -13246,7 +13665,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Paudel_Robust_and_Optimal_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Paudel_Robust_and_Optimal_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Paudel_2015_ICCV,\n \n author = {\n Paudel,\n Danda Pani and Habed,\n Adlane and Demonceaux,\n Cedric and Vasseur,\n Pascal\n},\n title = {\n Robust and Optimal Sum-of-Squares-Based Point-to-Plane Registration of Image Sets and Structured Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e4b829973e", @@ -13280,7 +13700,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chennai", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Punnappurath_2015_ICCV,\n \n author = {\n Punnappurath,\n Abhijith 
and Rengarajan,\n Vijay and Rajagopalan,\n A.N.\n},\n title = {\n Rolling Shutter Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "fee80fd955", @@ -13305,7 +13726,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Huang_SALICON_Reducing_the_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Huang_SALICON_Reducing_the_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Huang_2015_ICCV,\n \n author = {\n Huang,\n Xun and Shen,\n Chengyao and Boix,\n Xavier and Zhao,\n Qi\n},\n title = {\n SALICON: Reducing the Semantic Gap in Saliency Prediction by Adapting Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "510a4a8443", @@ -13332,14 +13754,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_SOWP_Spatially_Ordered_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Korea University;Samsung;Ulsan National Institute of Science and Technology", - "aff_unique_dep": "School of Electrical Engineering;Samsung Electronics;School of Electrical and Computer Engineering", + "aff_unique_norm": "Korea University;Samsung Electronics;Ulsan National Institute of Science and Technology", + "aff_unique_dep": "School of Electrical Engineering;;School of Electrical and Computer Engineering", "aff_unique_url": "https://www.korea.ac.kr;https://www.samsung.com;https://www.unist.ac.kr", "aff_unique_abbr": "KU;Samsung;UNIST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ulsan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n 
Han-Ul and Lee,\n Dae-Youn and Sim,\n Jae-Young and Kim,\n Chang-Su\n},\n title = {\n SOWP: Spatially Ordered and Weighted Patch Descriptor for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e0447c2e0f", @@ -13366,14 +13789,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Li_SPM-BP_Sped-up_PatchMatch_ICCV_2015_paper.html", "aff_unique_index": "0+1;2;1;3;0", - "aff_unique_norm": "Advanced Digital Sciences Center;National University of Singapore;Chungnam National University;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Advanced Digital Sciences Center;National University of Singapore;Chungnam National University;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";;;", "aff_unique_url": ";https://www.nus.edu.sg;http://www.cnu.ac.kr;https://illinois.edu", "aff_unique_abbr": ";NUS;CNU;UIUC", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0+0;1;0;2;0", - "aff_country_unique": "Singapore;South Korea;United States" + "aff_country_unique": "Singapore;South Korea;United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Yu and Min,\n Dongbo and Brown,\n Michael S. and Do,\n Minh N. 
and Lu,\n Jiangbo\n},\n title = {\n SPM-BP: Sped-up PatchMatch Belief Propagation for Continuous MRFs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9beb735c05", @@ -13407,7 +13831,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Sharma_2015_ICCV,\n \n author = {\n Sharma,\n Gaurav and Schiele,\n Bernt\n},\n title = {\n Scalable Nonlinear Embeddings for Semantic Category-Based Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bdbcd8cb25", @@ -13432,7 +13857,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zheng_Scalable_Person_Re-Identification_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zheng_Scalable_Person_Re-Identification_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Zheng_2015_ICCV,\n \n author = {\n Zheng,\n Liang and Shen,\n Liyue and Tian,\n Lu and Wang,\n Shengjin and Wang,\n Jingdong and Tian,\n Qi\n},\n title = {\n Scalable Person Re-Identification: A Benchmark\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "57ca70fd31", @@ -13445,7 +13871,7 @@ "author": "Zhou Ren; Chaohui Wang; Alan L. Yuille", "abstract": "In this paper, we are interested in enhancing the expressivity and robustness of part-based models for object representation, in the common scenario where the training data are based on 2D images. 
To this end, we propose scene-domain active part models (SDAPM), which reconstruct and characterize the 3D geometric statistics between object's parts in 3D scene-domain by using 2D training data in the image-domain alone. And on top of this, we explicitly model and handle occlusions in SDAPM. Together with the developed learning and inference algorithms, such a model provides rich object descriptions, including 2D object and parts localization, 3D landmark shape and camera viewpoint, which offers an effective representation to various image understanding tasks, such as object and parts detection, 3D landmark shape and viewpoint estimation from images. Experiments on the above tasks show that SDAPM outperforms previous part-based models, and thus demonstrates the potential of the proposed technique.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Ren_Scene-Domain_Active_Part_ICCV_2015_paper.pdf", - "aff": "University of California, Los Angeles; Universit \u00b4e Paris-Est, LIGM - CNRS UMR 8049; University of California, Los Angeles", + "aff": "University of California, Los Angeles; Universit ´e Paris-Est, LIGM - CNRS UMR 8049; University of California, Los Angeles", "project": "", "github": "", "supp": "", @@ -13459,14 +13885,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ren_Scene-Domain_Active_Part_ICCV_2015_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "University of California, Los Angeles;Universit\u00e9 Paris-Est", + "aff_unique_norm": "University of California, Los Angeles;Université Paris-Est", "aff_unique_dep": ";LIGM - CNRS UMR 8049", "aff_unique_url": "https://www.ucla.edu;https://www.univ-Paris12.fr", "aff_unique_abbr": "UCLA;UPE", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;France" + "aff_country_unique": "United States;France", + "bibtex": "@InProceedings{Ren_2015_ICCV,\n \n author = 
{\n Ren,\n Zhou and Wang,\n Chaohui and Yuille,\n Alan L.\n},\n title = {\n Scene-Domain Active Part Models for Object Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f6de649196", @@ -13479,7 +13906,7 @@ "author": "Meng Tang; Ismail Ben Ayed; Dmitrii Marin; Yuri Boykov", "abstract": "The log-likelihood energy term in popular model-fitting segmentation methods, e.g. Zhu&Yuille, Chan-Vese, GrabCut, is presented as a generalized \"probabilistic K-means\" energy for color space clustering. This interpretation reveals some limitations, e.g. over-fitting. We propose an alternative approach to color clustering using kernel K-means energy with well-known properties such as non-linear separation and scalability to higher-dimensional feature spaces. Our bound formulation for kernel K-means allows to combine general pair-wise feature clustering methods with image grid regularization using graph cuts, similarly to standard color model fitting techniques for segmentation. Unlike histogram or GMM fitting, our approach is closely related to average association and normalized cut. But, in contrast to previous pairwise clustering algorithms, our approach can incorporate any standard geometric regularization in the image domain. We analyze extreme cases for kernel bandwidth (e.g. Gini bias) and demonstrate effectiveness of KNN-based adaptive bandwidth strategies. 
Our kernel K-means approach to segmentation benefits from higher-dimensional features where standard model fitting fails.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Tang_Secrets_of_GrabCut_ICCV_2015_paper.pdf", - "aff": "Computer Science Department, University of Western Ontario, Canada; \u00b4Ecole de Technologie Sup\u00e9rieure, University of Quebec, Canada; Computer Science Department, University of Western Ontario, Canada; Computer Science Department, University of Western Ontario, Canada", + "aff": "Computer Science Department, University of Western Ontario, Canada; ´Ecole de Technologie Supérieure, University of Quebec, Canada; Computer Science Department, University of Western Ontario, Canada; Computer Science Department, University of Western Ontario, Canada", "project": "", "github": "", "supp": "", @@ -13498,9 +13925,10 @@ "aff_unique_url": "https://www.uwo.ca;https://www.etsmtl.ca", "aff_unique_abbr": "UWO;ETS", "aff_campus_unique_index": "1", - "aff_campus_unique": ";Ecole de Technologie Sup\u00e9rieure", + "aff_campus_unique": ";Ecole de Technologie Supérieure", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Tang_2015_ICCV,\n \n author = {\n Tang,\n Meng and Ben Ayed,\n Ismail and Marin,\n Dmitrii and Boykov,\n Yuri\n},\n title = {\n Secrets of GrabCut and Kernel K-Means\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b1f3a5219a", @@ -13528,13 +13956,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Hong_Secrets_of_Matrix_ICCV_2015_paper.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Cambridge;Microsoft", - "aff_unique_dep": ";Microsoft", + "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.microsoft.com", "aff_unique_abbr": "Cambridge;MSFT", "aff_campus_unique_index": 
"0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Hong_2015_ICCV,\n \n author = {\n Hong,\n Je Hyeong and Fitzgibbon,\n Andrew\n},\n title = {\n Secrets of Matrix Factorization: Approximations,\n Numerics,\n Manifold Optimization and Random Restarts\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0caab4e385", @@ -13547,7 +13976,7 @@ "author": "Wei-Chen Chiu; Mario Fritz", "abstract": "The Histogram of Oriented Gradient (HOG) descriptor has led to many advances in computer vision over the last decade and is still part of many state of the art approaches. We realize that the associated feature computation is piecewise differentiable and therefore many pipelines which build on HOG can be made differentiable. This lends to advanced introspection as well as opportunities for end-to-end optimization. We present our implementation of [?]HOG based on the auto-differentiation toolbox Chumpy and show applications to pre-image visualization and pose estimation which extends the existing differentiable renderer OpenDR pipeline. 
Both applications improve on the respective state-of-the-art HOG approaches.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Chiu_See_the_Difference_ICCV_2015_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarbr \u00a8ucken, Germany; Max Planck Institute for Informatics, Saarbr \u00a8ucken, Germany", + "aff": "Max Planck Institute for Informatics, Saarbr ¨ucken, Germany; Max Planck Institute for Informatics, Saarbr ¨ucken, Germany", "project": "", "github": "", "supp": "", @@ -13566,9 +13995,10 @@ "aff_unique_url": "https://mpi-inf.mpg.de", "aff_unique_abbr": "MPII", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "Saarbr\u00fccken", + "aff_campus_unique": "Saarbrücken", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Chiu_2015_ICCV,\n \n author = {\n Chiu,\n Wei-Chen and Fritz,\n Mario\n},\n title = {\n See the Difference: Direct Pre-Image Reconstruction and Pose Estimation by Differentiating HOG\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b03a24335c", @@ -13581,7 +14011,7 @@ "author": "Feihu Zhang; Longquan Dai; Shiming Xiang; Xiaopeng Zhang", "abstract": "In this paper, we design a new edge-aware structure, named segment graph, to represent the image and we further develop a novel double weighted average image filter (SGF) based on the segment graph. In our SGF, we use the tree distance on the segment graph to define the internal weight function of the filtering kernel, which enables the filter to smooth out high-contrast details and textures while preserving major image structures very well. While for the external weight function, we introduce a user specified smoothing window to balance the smoothing effects from each node of the segment graph. Moreover, we also set a threshold to adjust the edge-preserving performance. 
These advantages make the SGF more flexible in various applications and overcome the \"halo\" and \"leak\" problems appearing in most of the state-of-the-art approaches. Finally and importantly, we develop a linear algorithm for the implementation of our SGF, which has an O(N) time complexity for both gray-scale and high dimensional images, regardless of the kernel size and the intensity range. Typically, as one of the fastest edge-preserving filters, our CPU implementation achieves 0.15s per megapixel when performing filtering for 3-channel color images. The strength of the proposed filter is demonstrated by various applications, including stereo matching, optical flow, joint depth map upsampling, edge-preserving smoothing, edges detection, image abstraction and texture editing.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Zhang_Segment_Graph_Based_ICCV_2015_paper.pdf", - "aff": "School of Computer Science, Northwestern Polytechnical University, Xi\u2019an, China+NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China; NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China; NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China; NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China", + "aff": "School of Computer Science, Northwestern Polytechnical University, Xi’an, China+NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China; NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China; NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China; NLPR-LIAMA, Institute of Automation, Chinese Academy of Sciences, Beijing, China", "project": "", "github": "", "supp": "", @@ -13602,7 +14032,8 @@ "aff_campus_unique_index": "0+1;1;1;1", "aff_campus_unique": "Xi'an;Beijing", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + 
"bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Feihu and Dai,\n Longquan and Xiang,\n Shiming and Zhang,\n Xiaopeng\n},\n title = {\n Segment Graph Based Image Filtering: Fast Structure-Preserving Smoothing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "05e7fde4d7", @@ -13615,7 +14046,7 @@ "author": "Hamid Izadinia; Fereshteh Sadeghi; Santosh K. Divvala; Hannaneh Hajishirzi; Yejin Choi; Ali Farhadi", "abstract": "We introduce Segment-Phrase Table (SPT), a large collection of bijective associations between textual phrases and their corresponding segmentations. Leveraging recent progress in object recognition and natural language semantics, we show how we can successfully build a high-quality segment-phrase table using minimal human supervision. More importantly, we demonstrate the unique value unleashed by this rich bimodal resource, for both vision as well as natural language understanding. First, we show that fine-grained textual labels facilitate contextual reasoning that helps in satisfying semantic constraints across image segments. This feature enables us to achieve state-of-the-art segmentation results on benchmark datasets. Next, we show that the association of high-quality segmentations to textual phrases aids in richer semantic understanding and reasoning of these textual phrases. 
Leveraging this feature, we motivate the problem of visual entailment and visual paraphrasing, and demonstrate its utility on a large dataset.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Izadinia_Segment-Phrase_Table_for_ICCV_2015_paper.pdf", - "aff": "University of Washington\u2020; University of Washington\u2020; University of Washington\u2021,\u2020; University of Washington\u2020; University of Washington\u2020; The Allen Institute for AI\u2021,\u2020", + "aff": "University of Washington†; University of Washington†; University of Washington‡,†; University of Washington†; University of Washington†; The Allen Institute for AI‡,†", "project": "", "github": "", "supp": "", @@ -13636,7 +14067,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Izadinia_2015_ICCV,\n \n author = {\n Izadinia,\n Hamid and Sadeghi,\n Fereshteh and Divvala,\n Santosh K. and Hajishirzi,\n Hannaneh and Choi,\n Yejin and Farhadi,\n Ali\n},\n title = {\n Segment-Phrase Table for Semantic Segmentation,\n Visual Entailment and Paraphrasing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a43c3368bb", @@ -13670,7 +14102,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Singh_2015_ICCV,\n \n author = {\n Singh,\n Bharat and Han,\n Xintong and Wu,\n Zhe and Morariu,\n Vlad I. 
and Davis,\n Larry S.\n},\n title = {\n Selecting Relevant Web Trained Concepts for Automated Event Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1cd861ed20", @@ -13704,7 +14137,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Ang and Morariu,\n Vlad and Davis,\n Larry S.\n},\n title = {\n Selective Encoding for Recognizing Unreliably Localized Faces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "fb6d2f45fb", @@ -13713,7 +14147,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Michael Hirsch, Bernhard Sch\u00f6lkopf", + "author_site": "Michael Hirsch, Bernhard Schölkopf", "author": "Michael Hirsch; Bernhard Scholkopf", "abstract": "Even high-quality lenses suffer from optical aberrations, especially when used at full aperture. Furthermore, there are significant lens-to-lens deviations due to manufacturing tolerances, often rendering current software solutions like DxO, Lightroom, and PTLens insufficient as they don't adapt and only include generic lens blur models. We propose a method that enables the self-calibration of lenses from a natural image, or a set of such images. 
To this end we develop a machine learning framework that is able to exploit several recorded images and distills the available information into an accurate model of the considered lens.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Hirsch_Self-Calibration_of_Optical_ICCV_2015_paper.pdf", @@ -13738,7 +14172,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Hirsch_2015_ICCV,\n \n author = {\n Hirsch,\n Michael and Scholkopf,\n Bernhard\n},\n title = {\n Self-Calibration of Optical Lenses\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c2b58e550e", @@ -13772,7 +14207,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+1;1;0", - "aff_country_unique": "United States;Saudi Arabia" + "aff_country_unique": "United States;Saudi Arabia", + "bibtex": "@InProceedings{Yang_2015_ICCV,\n \n author = {\n Yang,\n Yanchao and Sundaramoorthi,\n Ganesh and Soatto,\n Stefano\n},\n title = {\n Self-Occlusions and Disocclusions in Causal Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "127b9fd718", @@ -13806,7 +14242,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Murdock_2015_ICCV,\n \n author = {\n Murdock,\n Calvin and De la Torre,\n Fernando\n},\n title = {\n Semantic Component Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d943c88855", 
@@ -13833,14 +14270,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liu_Semantic_Image_Segmentation_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Information Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2015_ICCV,\n \n author = {\n Liu,\n Ziwei and Li,\n Xiaoxiao and Luo,\n Ping and Loy,\n Chen-Change and Tang,\n Xiaoou\n},\n title = {\n Semantic Image Segmentation via Deep Parsing Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "76b07edd8e", @@ -13853,7 +14291,7 @@ "author": "Jeremie Papon; Markus Schoeler", "abstract": "In this work we address the problem of indoor scene understanding from RGB-D images. Specifically, we propose to find instances of common furniture classes, their spatial extent, and their pose with respect to generalized class models. To accomplish this, we use a deep, wide, multi-output convolutional neural network (CNN) that predicts class, pose, and location of possible objects simultaneously. To overcome the lack of large annotated RGB-D training sets (especially those with pose), we use an on-the-fly rendering pipeline that generates realistic cluttered room scenes in parallel to training. We then perform transfer learning on the relatively small amount of publicly available annotated RGB-D data, and find that our model is able to successfully annotate even highly challenging real scenes. 
Importantly, our trained network is able to understand noisy and sparse observations of highly cluttered scenes with a remarkable degree of accuracy, inferring class and pose from a very limited set of cues. Additionally, our neural network is only moderately deep and computes class, pose and position in tandem, so the overall run-time is significantly faster than existing methods, estimating all output parameters simultaneously in parallel.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Papon_Semantic_Pose_Using_ICCV_2015_paper.pdf", - "aff": "Bernstein Center for Computational Neuroscience (BCCN); III. Physikalisches Institut - Biophysik, Georg-August University of G\u00f6ttingen", + "aff": "Bernstein Center for Computational Neuroscience (BCCN); III. Physikalisches Institut - Biophysik, Georg-August University of Göttingen", "project": "", "github": "", "supp": "", @@ -13867,14 +14305,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Papon_Semantic_Pose_Using_ICCV_2015_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Bernstein Center for Computational Neuroscience;Georg-August University of G\u00f6ttingen", + "aff_unique_norm": "Bernstein Center for Computational Neuroscience;Georg-August University of Göttingen", "aff_unique_dep": "Computational Neuroscience;III. 
Physikalisches Institut - Biophysik", "aff_unique_url": ";https://www.uni-goettingen.de", - "aff_unique_abbr": "BCCN;Uni G\u00f6ttingen", + "aff_unique_abbr": "BCCN;Uni Göttingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Papon_2015_ICCV,\n \n author = {\n Papon,\n Jeremie and Schoeler,\n Markus\n},\n title = {\n Semantic Pose Using Deep Networks Trained on Synthetic RGB-D\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b12c8053f1", @@ -13901,14 +14340,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Qi_Semantic_Segmentation_With_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qi_2015_ICCV,\n \n author = {\n Qi,\n Xiaojuan and Shi,\n Jianping and Liu,\n Shu and Liao,\n Renjie and Jia,\n Jiaya\n},\n title = {\n Semantic Segmentation With Object Clique Potential\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7b5d06a84b", @@ -13942,7 +14382,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Deng_2015_ICCV,\n \n author = {\n Deng,\n Zhuo and Todorovic,\n Sinisa and Latecki,\n Longin 
Jan\n},\n title = {\n Semantic Segmentation of RGBD Images With Mutex Constraints\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6200429a5a", @@ -13976,7 +14417,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Yuncheng and Yang,\n Xitong and Luo,\n Jiebo\n},\n title = {\n Semantic Video Entity Linking Based on Visual Content and Metadata\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6d03fcbf44", @@ -14010,7 +14452,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cabezas_2015_ICCV,\n \n author = {\n Cabezas,\n Randi and Straub,\n Julian and Fisher,\n III,\n John W.\n},\n title = {\n Semantically-Aware Aerial Reconstruction From Multi-Modal Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f37aab6ccf", @@ -14044,7 +14487,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Rochester", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chew_2015_ICCV,\n \n author = {\n Chew,\n Selene E. 
and Cahill,\n Nathan D.\n},\n title = {\n Semi-Supervised Normalized Cuts for Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3f449aba89", @@ -14078,7 +14522,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Xin and Guo,\n Yuhong and Schuurmans,\n Dale\n},\n title = {\n Semi-Supervised Zero-Shot Classification With Label Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2b92bbfa44", @@ -14112,7 +14557,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Zheng_2015_ICCV,\n \n author = {\n Zheng,\n Yinqiang and Fu,\n Ying and Lam,\n Antony and Sato,\n Imari and Sato,\n Yoichi\n},\n title = {\n Separating Fluorescent and Reflective Components by Using a Single Hyperspectral Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f6df1f45cb", @@ -14137,7 +14583,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Venugopalan_Sequence_to_Sequence_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Venugopalan_Sequence_to_Sequence_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Venugopalan_2015_ICCV,\n \n author = {\n Venugopalan,\n Subhashini and Rohrbach,\n Marcus and Donahue,\n Jeffrey and Mooney,\n Raymond and Darrell,\n Trevor and 
Saenko,\n Kate\n},\n title = {\n Sequence to Sequence - Video to Text\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3ed4b39fcb", @@ -14162,7 +14609,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ji_Shape_Interaction_Matrix_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ji_Shape_Interaction_Matrix_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ji_2015_ICCV,\n \n author = {\n Ji,\n Pan and Salzmann,\n Mathieu and Li,\n Hongdong\n},\n title = {\n Shape Interaction Matrix Revisited and Robustified: Efficient Subspace Clustering With Corrupted and Incomplete Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "70ea233ec3", @@ -14189,14 +14637,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zhang_Shell_PCA_Statistical_ICCV_2015_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "University of York;University of Bonn", + "aff_unique_norm": "The University of York;University of Bonn", "aff_unique_dep": "Department of Computer Science;Institute for Numerical Simulation", "aff_unique_url": "https://www.york.ac.uk;https://www.uni-bonn.de", "aff_unique_abbr": "York;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "United Kingdom;Germany" + "aff_country_unique": "United Kingdom;Germany", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Chao and Heeren,\n Behrend and Rumpf,\n Martin and Smith,\n William A. 
P.\n},\n title = {\n Shell PCA: Statistical Shape Modelling in Shell Space\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "40d4fdb94c", @@ -14225,12 +14674,13 @@ "aff_unique_index": "0+1;1;0+1;2", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;University of Texas at San Antonio", "aff_unique_dep": "Key Lab of Big Data Mining and Knowledge Management;Key Lab of Intell. Info. Process., Inst. of Comput. Tech;Department of Computer Science", - "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.ac.cn;https://www.utsa.edu", + "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.cas.cn;https://www.utsa.edu", "aff_unique_abbr": "UCAS;CAS;UTSA", "aff_campus_unique_index": "0+0;0;0+0;1", "aff_campus_unique": "Beijing;San Antonio", "aff_country_unique_index": "0+0;0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Song_2015_ICCV,\n \n author = {\n Song,\n Guoli and Wang,\n Shuhui and Huang,\n Qingming and Tian,\n Qi\n},\n title = {\n Similarity Gaussian Process Latent Variable Model for Multi-Modal Data Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "286a0d3905", @@ -14264,7 +14714,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Tempe", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Venkatesan_2015_ICCV,\n \n author = {\n Venkatesan,\n Ragav and Chandakkar,\n Parag and Li,\n Baoxin\n},\n title = {\n Simpler Non-Parametric Methods Provide as Good or Better Results to Multiple-Instance Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n 
month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "69b0587e89", @@ -14298,7 +14749,8 @@ "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Berkeley;Lowell", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tzeng_2015_ICCV,\n \n author = {\n Tzeng,\n Eric and Hoffman,\n Judy and Darrell,\n Trevor and Saenko,\n Kate\n},\n title = {\n Simultaneous Deep Transfer Across Domains and Tasks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1edde6be22", @@ -14307,11 +14759,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Jaemyun Kim, Ad\u00edn Ram\u00edrez Rivera, Byungyong Ryu, Oksam Chae", + "author_site": "Jaemyun Kim, Adín Ramírez Rivera, Byungyong Ryu, Oksam Chae", "author": "Jaemyun Kim; Adin Ramirez Rivera; Byungyong Ryu; Oksam Chae", "abstract": "In this paper, we propose a hybrid background model that relies on edge and non-edge features of the image to produce the model. We encode these features into a coding scheme, that we called Local Hybrid Pattern (LHP), that selectively models edges and non-edges features of each pixel. Furthermore, we model each pixel with an adaptive code dictionary to represent the background dynamism, and update it by adding stable codes and discarding unstable ones. We weight each code in the dictionary to enhance its description of the pixel it models. The foreground is detected as the incoming codes that deviate from the dictionary. We can detect (as foreground or background) and classify (as edge or inner region) each pixel simultaneously. We tested our proposed method in existing databases with promising results.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kim_Simultaneous_Foreground_Detection_ICCV_2015_paper.pdf", - "aff": "Dept. 
of Computer Engineering, Kyung Hee University, Gyeonggi-do, South Korea; Escuela Inform\u00e1tica y Telecomunicaciones, Universidad Diego Portales, Santiago, Chile; Dept. of Computer Engineering, Kyung Hee University, Gyeonggi-do, South Korea; Dept. of Computer Engineering, Kyung Hee University, Gyeonggi-do, South Korea", + "aff": "Dept. of Computer Engineering, Kyung Hee University, Gyeonggi-do, South Korea; Escuela Informática y Telecomunicaciones, Universidad Diego Portales, Santiago, Chile; Dept. of Computer Engineering, Kyung Hee University, Gyeonggi-do, South Korea; Dept. of Computer Engineering, Kyung Hee University, Gyeonggi-do, South Korea", "project": "", "github": "", "supp": "", @@ -14326,13 +14778,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kim_Simultaneous_Foreground_Detection_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Kyung Hee University;Universidad Diego Portales", - "aff_unique_dep": "Dept. of Computer Engineering;Escuela Inform\u00e1tica y Telecomunicaciones", + "aff_unique_dep": "Dept. 
of Computer Engineering;Escuela Informática y Telecomunicaciones", "aff_unique_url": "http://www.khu.ac.kr;https://www.udp.cl", "aff_unique_abbr": "KHU;", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Gyeonggi-do;Santiago", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "South Korea;Chile" + "aff_country_unique": "South Korea;Chile", + "bibtex": "@InProceedings{Kim_2015_ICCV,\n \n author = {\n Kim,\n Jaemyun and Rivera,\n Adin Ramirez and Ryu,\n Byungyong and Chae,\n Oksam\n},\n title = {\n Simultaneous Foreground Detection and Classification With Hybrid Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a3ebaf111d", @@ -14356,7 +14809,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lu_Simultaneous_Local_Binary_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lu_Simultaneous_Local_Binary_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Lu_2015_ICCV,\n \n author = {\n Lu,\n Jiwen and Liong,\n Venice Erin and Zhou,\n Jie\n},\n title = {\n Simultaneous Local Binary Feature Learning and Encoding for Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "bf82e8fcd2", @@ -14381,7 +14835,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fouhey_Single_Image_3D_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Fouhey_Single_Image_3D_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Fouhey_2015_ICCV,\n \n author = {\n Fouhey,\n David F. 
and Hussain,\n Wajahat and Gupta,\n Abhinav and Hebert,\n Martial\n},\n title = {\n Single Image 3D Without a Single 3D Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "124f1cb55a", @@ -14415,7 +14870,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2015_ICCV,\n \n author = {\n Zhu,\n Menglong and Zhou,\n Xiaowei and Daniilidis,\n Kostas\n},\n title = {\n Single Image Pop-Up From Discriminatively Learned Parts\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "2c7512d740", @@ -14449,7 +14905,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Meng_2015_ICCV,\n \n author = {\n Meng,\n Lingfei and Lu,\n Liyang and Bedard,\n Noah and Berkner,\n Kathrin\n},\n title = {\n Single-Shot Specular Surface Reconstruction With Gonio-Plenoptic Imaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9ea3e32df2", @@ -14483,7 +14940,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2015_ICCV,\n \n author = {\n Zheng,\n Enliang and Ji,\n Dinghuang and Dunn,\n Enrique and Frahm,\n Jan-Michael\n},\n title = {\n Sparse Dynamic 3D Reconstruction From Unsynchronized Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference 
on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "301bd39333", @@ -14508,7 +14966,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Mrowca_Spatial_Semantic_Regularisation_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Mrowca_Spatial_Semantic_Regularisation_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Mrowca_2015_ICCV,\n \n author = {\n Mrowca,\n Damian and Rohrbach,\n Marcus and Hoffman,\n Judy and Hu,\n Ronghang and Saenko,\n Kate and Darrell,\n Trevor\n},\n title = {\n Spatial Semantic Regularisation for Large Scale Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e0c220435a", @@ -14542,7 +15001,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jo_2015_ICCV,\n \n author = {\n Jo,\n Kensei and Gupta,\n Mohit and Nayar,\n Shree K.\n},\n title = {\n SpeDo: 6 DOF Ego-Motion Sensor Using Speckle Defocus Imaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a78b7d4b85", @@ -14567,7 +15027,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lu_Square_Localization_for_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Lu_Square_Localization_for_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Lu_2015_ICCV,\n \n author = {\n Lu,\n Cewu and Lu,\n Yongyi and Chen,\n Hao and Tang,\n Chi-Keung\n},\n title = {\n Square Localization for Efficient and Accurate Object Detection\n},\n booktitle = {\n Proceedings of the 
IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "52b5022b76", @@ -14591,7 +15052,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ju_StereoSnakes_Contour_Based_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ju_StereoSnakes_Contour_Based_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Ju_2015_ICCV,\n \n author = {\n Ju,\n Ran and Ren,\n Tongwei and Wu,\n Gangshan\n},\n title = {\n StereoSnakes: Contour Based Consistent Object Extraction For Stereo Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ae9874be22", @@ -14625,7 +15087,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Xiong_2015_ICCV,\n \n author = {\n Xiong,\n Bo and Kim,\n Gunhee and Sigal,\n Leonid\n},\n title = {\n Storyline Representation of Egocentric Videos With an Applications to Story-Based Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "de8d4d9868", @@ -14650,7 +15113,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Hayder_Structural_Kernel_Learning_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Hayder_Structural_Kernel_Learning_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Hayder_2015_ICCV,\n \n author = {\n Hayder,\n Zeeshan and He,\n Xuming and Salzmann,\n Mathieu\n},\n title = {\n Structural Kernel Learning for Large Scale Multiclass Object Co-Detection\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e8e88ecb15", @@ -14678,13 +15142,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Zheng_Structure_From_Motion_ICCV_2015_paper.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of North Carolina at Chapel Hill;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.unc.edu;https://www.google.com", "aff_unique_abbr": "UNC Chapel Hill;Google", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Chapel Hill;Mountain View", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2015_ICCV,\n \n author = {\n Zheng,\n Enliang and Wu,\n Changchang\n},\n title = {\n Structure From Motion Using Structure-Less Resection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3cd388e835", @@ -14718,7 +15183,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2015_ICCV,\n \n author = {\n Gao,\n Tian and Wang,\n Ziheng and Ji,\n Qiang\n},\n title = {\n Structured Feature Selection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a64d4388cb", @@ -14752,7 +15218,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "St. 
Louis", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ikehata_2015_ICCV,\n \n author = {\n Ikehata,\n Satoshi and Yang,\n Hang and Furukawa,\n Yasutaka\n},\n title = {\n Structured Indoor Modeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "564c85a989", @@ -14779,14 +15246,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ji_Synthesizing_Illumination_Mosaics_ICCV_2015_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of North Carolina at Chapel Hill", + "aff_unique_norm": "The University of North Carolina at Chapel Hill", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.unc.edu", "aff_unique_abbr": "UNC Chapel Hill", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ji_2015_ICCV,\n \n author = {\n Ji,\n Dinghuang and Dunn,\n Enrique and Frahm,\n Jan-Michael\n},\n title = {\n Synthesizing Illumination Mosaics From Internet Photo-Collections\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4f62729e56", @@ -14820,7 +15288,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Nottingham", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Xiaomeng and Valstar,\n Michel and Martinez,\n Brais and Khan,\n Muhammad Haris and Pridmore,\n Tony\n},\n title = {\n TRIC-track: Tracking by Regression With Incrementally Learned Cascades\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d5337243f1", @@ -14854,7 +15323,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Xie_2015_ICCV,\n \n author = {\n Xie,\n Guo-Sen and Zhang,\n Xu-Yao and Shu,\n Xiangbo and Yan,\n Shuicheng and Liu,\n Cheng-Lin\n},\n title = {\n Task-Driven Feature Pooling for Image Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0f8dcdd70f", @@ -14888,7 +15358,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2015_ICCV,\n \n author = {\n Zhou,\n Yipin and Berg,\n Tamara L.\n},\n title = {\n Temporal Perception and Prediction in Ego-Centric Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c6ae70348e", @@ -14922,7 +15393,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", - "aff_country_unique": ";United States" + "aff_country_unique": ";United States", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Sheng and Li,\n Kang and Fu,\n Yun\n},\n title = {\n Temporal Subspace Clustering for Human Motion Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c66514de6f", @@ -14949,14 +15421,15 @@ "author_num": 6, "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Tian_Text_Flow_A_ICCV_2015_paper.html", "aff_unique_index": "0;1;1;2;1;0", - "aff_unique_norm": "National University of Singapore;Baidu;Institute for Infocomm Research", + "aff_unique_norm": "National University of Singapore;Baidu Research;Institute for Infocomm Research", "aff_unique_dep": "School of Computing;Institute of Deep Learning;Visual Computing Department", "aff_unique_url": "https://www.nus.edu.sg;https://baidu.com;https://www.i2r.a-star.edu.sg", "aff_unique_abbr": "NUS;Baidu;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Tian_2015_ICCV,\n \n author = {\n Tian,\n Shangxuan and Pan,\n Yifeng and Huang,\n Chang and Lu,\n Shijian and Yu,\n Kai and Tan,\n Chew Lim\n},\n title = {\n Text Flow: A Unified Text Detection System in Natural Scene Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "199aeb12e1", @@ -14980,7 +15453,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Honauer_The_HCI_Stereo_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Honauer_The_HCI_Stereo_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Honauer_2015_ICCV,\n \n author = {\n Honauer,\n Katrin and Maier-Hein,\n Lena and Kondermann,\n Daniel\n},\n title = {\n The HCI Stereo Metrics: Geometry-Aware Performance Analysis of Stereo Algorithms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f39c49aef4", @@ -14993,7 +15467,7 @@ "author": "Matthew Trager; Martial Hebert; Jean Ponce", "abstract": "Given multiple perspective photographs, 
point correspondences form the \"joint image\", effectively a replica of three dimensional space distributed across its two-dimensional projections. This set can be characterized by multilinear equations over image coordinates, such as epipolar and trifocal constraints. We revisit in this paper the geometric and algebraic properties of the joint image, and address fundamental questions such as how many and which multilinearities are necessary and/or sufficient to determine camera geometry and/or image correspondences. The new theoretical results in this paper answer these questions in a very general setting and, in turn, are intended to serve as a \"handbook\" reference about multilinearities for practitioners.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Trager_The_Joint_Image_ICCV_2015_paper.pdf", - "aff": "Inria; Carnegie Mellon University; \u00b4Ecole Normale Sup\u00e9rieure / PSL Research University", + "aff": "Inria; Carnegie Mellon University; ´Ecole Normale Supérieure / PSL Research University", "project": "", "github": "", "supp": "", @@ -15007,14 +15481,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Trager_The_Joint_Image_ICCV_2015_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "INRIA;Carnegie Mellon University;Ecole Normale Sup\u00e9rieure", + "aff_unique_norm": "Inria;Carnegie Mellon University;Ecole Normale Supérieure", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;https://www.cmu.edu;https://www.ens.fr", "aff_unique_abbr": "Inria;CMU;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Trager_2015_ICCV,\n \n author = {\n Trager,\n Matthew and Hebert,\n Martial and Ponce,\n Jean\n},\n title = {\n The Joint Image Handbook\n},\n booktitle = {\n Proceedings of the IEEE International Conference on 
Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1b30237320", @@ -15048,7 +15523,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1", - "aff_country_unique": "Switzerland;United Kingdom" + "aff_country_unique": "Switzerland;United Kingdom", + "bibtex": "@InProceedings{Cohen_2015_ICCV,\n \n author = {\n Cohen,\n Andrea and Zach,\n Christopher\n},\n title = {\n The Likelihood-Ratio Test and Efficient Robust Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0e07e8f67a", @@ -15082,7 +15558,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Humayun_2015_ICCV,\n \n author = {\n Humayun,\n Ahmad and Li,\n Fuxin and Rehg,\n James M.\n},\n title = {\n The Middle Child Problem: Revisiting Parametric Min-Cut and Seeds for Object Proposals\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "d6260bb4a7", @@ -15091,6 +15568,7 @@ "status": "Poster", "track": "main", "pid": "", + "author_site": "Mikael Nilsson", "author": "Mikael Nilsson", "abstract": "The purpose of this paper is threefold. Firstly, the paper introduces the One Triangle Three Parallelograms (OTTP) sampling strategy, which can be viewed as a way to index pixels from a given shape and image. Secondly, a framework for cascaded shape regression, including the OTTP sampling, is presented. In short, this framework involves binary pixel tests for appearance features combined with shape features followed by a large linear system for each regression stage in the cascade. 
The proposed solution is found to produce state-of-the-art results on the task of facial landmark estimation. Thirdly, the dependence of accuracy of the landmark predictions and the placement of the mean shape within the detection box is discussed and a method to visualize it is presented.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Nilsson_The_One_Triangle_ICCV_2015_paper.pdf", @@ -15115,7 +15593,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Lund", "aff_country_unique_index": "0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Nilsson_2015_ICCV,\n \n author = {\n Nilsson,\n Mikael\n},\n title = {\n The One Triangle Three Parallelograms Sampling Strategy and Its Application in Shape Regression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "81ccce51f7", @@ -15149,7 +15628,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Marin_2015_ICCV,\n \n author = {\n Marin,\n Dmitrii and Zhong,\n Yuchen and Drangova,\n Maria and Boykov,\n Yuri\n},\n title = {\n Thin Structure Estimation With Curvature Regularization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "b6ccce36f1", @@ -15183,7 +15663,8 @@ "aff_campus_unique_index": "0;;0;2", "aff_campus_unique": "San Diego;;T. J. Watson", "aff_country_unique_index": "0;1+0;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Song_2015_ICCV,\n \n author = {\n Song,\n Dongjin and Liu,\n Wei and Ji,\n Rongrong and Meyer,\n David A. 
and Smith,\n John R.\n},\n title = {\n Top Rank Supervised Binary Coding for Visual Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "3c3cca3e2e", @@ -15210,14 +15691,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liang_Towards_Computational_Baby_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;0;1;0+3", - "aff_unique_norm": "National University of Singapore;Sun Yat-sen University;Beijing Jiao Tong University;Chinese Academy of Sciences", + "aff_unique_norm": "National University of Singapore;Sun Yat-sen University;Beijing Jiaotong University;Chinese Academy of Sciences", "aff_unique_dep": ";;;State Key Laboratory of Information Security, Institute of Information Engineering", "aff_unique_url": "https://www.nus.edu.sg;http://www.sysu.edu.cn/;http://www.bjtu.edu.cn;http://www.cas.cn", "aff_unique_abbr": "NUS;SYSU;BJTU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1;0+1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Liang_2015_ICCV,\n \n author = {\n Liang,\n Xiaodan and Liu,\n Si and Wei,\n Yunchao and Liu,\n Luoqi and Lin,\n Liang and Yan,\n Shuicheng\n},\n title = {\n Towards Computational Baby Learning: A Weakly-Supervised Approach for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c6acd3c564", @@ -15230,7 +15712,7 @@ "author": "Irina Nurutdinova; Andrew Fitzgibbon", "abstract": "Modern structure from motion (SfM) remains dependent on point features to recover camera positions, meaning that reconstruction is severely hampered in low-texture environments, for example scanning a plain coffee cup on an uncluttered table. 
We show how 3D curves can be used to refine camera position estimation in challenging low-texture scenes. In contrast to previous work, we allow the curves to be partially observed in all images, meaning that for the first time, curve-based SfM can be demonstrated in realistic scenes. The algorithm is based on bundle adjustment, so needs an initial estimate, but even a poor estimate from a few point correspondences can be substantially improved by including curves, suggesting that this method would benefit many existing systems.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Nurutdinova_Towards_Pointless_Structure_ICCV_2015_paper.pdf", - "aff": "Technische Universit \u00a8at Berlin; Microsoft, Cambridge, UK", + "aff": "Technische Universit ¨at Berlin; Microsoft, Cambridge, UK", "project": "", "github": "", "supp": "", @@ -15244,14 +15726,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Nurutdinova_Towards_Pointless_Structure_ICCV_2015_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Technische Universit\u00e4t Berlin;Microsoft", - "aff_unique_dep": ";Microsoft", + "aff_unique_norm": "Technische Universität Berlin;Microsoft", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tu-berlin.de;https://www.microsoft.com", "aff_unique_abbr": "TU Berlin;MSFT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Nurutdinova_2015_ICCV,\n \n author = {\n Nurutdinova,\n Irina and Fitzgibbon,\n Andrew\n},\n title = {\n Towards Pointless Structure From Motion: 3D Reconstruction and Camera Parameters From General 3D Curves\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0e934f5ad5", @@ -15285,7 +15768,8 @@ 
"aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Son_2015_ICCV,\n \n author = {\n Son,\n Jeany and Jung,\n Ilchae and Park,\n Kayoung and Han,\n Bohyung\n},\n title = {\n Tracking-by-Segmentation With Online Gradient Boosting Decision Tree\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cb022188b4", @@ -15319,7 +15803,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Graz", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Oberweger_2015_ICCV,\n \n author = {\n Oberweger,\n Markus and Wohlhart,\n Paul and Lepetit,\n Vincent\n},\n title = {\n Training a Feedback Loop for Hand Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "1c09a645f8", @@ -15353,7 +15838,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Xu_2015_ICCV,\n \n author = {\n Xu,\n Yichao and Nagahara,\n Hajime and Shimada,\n Atsushi and Taniguchi,\n Rin-ichiro\n},\n title = {\n TransCut: Transparent Object Segmentation From a Light-Field Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a3412ca085", @@ -15377,7 +15863,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Li_Two_Birds_One_ICCV_2015_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_iccv_2015/html/Li_Two_Birds_One_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Li_2015_ICCV,\n \n author = {\n Li,\n Yan and Wang,\n Ruiping and Liu,\n Haomiao and Jiang,\n Huajie and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n Two Birds,\n One Stone: Jointly Learning Binary Code for Large-Scale Face Image Retrieval and Attributes Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "9e591e1403", @@ -15386,7 +15873,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Elisa Ricci, Jagannadan Varadarajan, Ramanathan Subramanian, Samuel Rota Bul\u00f2, Narendra Ahuja, Oswald Lanz", + "author_site": "Elisa Ricci, Jagannadan Varadarajan, Ramanathan Subramanian, Samuel Rota Bulò, Narendra Ahuja, Oswald Lanz", "author": "Elisa Ricci; Jagannadan Varadarajan; Ramanathan Subramanian; Samuel Rota Bulo; Narendra Ahuja; Oswald Lanz", "abstract": "We present a novel approach for jointly estimating tar- gets' head, body orientations and conversational groups called F-formations from a distant social scene (e.g., a cocktail party captured by surveillance cameras). Differing from related works that have (i) coupled head and body pose learning by exploiting the limited range of orientations that the two can jointly take, or (ii) determined F-formations based on the mutual head (but not body) orientations of in- teractors, we present a unified framework to jointly infer both (i) and (ii). Apart from exploiting spatial and orien- tation relationships, we also integrate cues pertaining to temporal consistency and occlusions, which are beneficial while handling low-resolution data under surveillance set- tings. 
Efficacy of the joint inference framework reflects via increased head, body pose and F-formation estimation ac- curacy over the state-of-the-art, as confirmed by extensive experiments on two social datasets.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Ricci_Uncovering_Interactions_and_ICCV_2015_paper.pdf", @@ -15404,14 +15891,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Ricci_Uncovering_Interactions_and_ICCV_2015_paper.html", "aff_unique_index": "0+1;2;2;0;2+2;0", - "aff_unique_norm": "Fondazione Bruno Kessler;University of Perugia;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Fondazione Bruno Kessler;University of Perugia;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";Department of Engineering;Advanced Digital Sciences Center", "aff_unique_url": "https://www.fbk.eu;https://www.unipg.it;https://illinois.edu", "aff_unique_abbr": "FBK;;UIUC", "aff_campus_unique_index": "0;2;2;0;2+2;0", "aff_campus_unique": "Trento;;Urbana-Champaign", "aff_country_unique_index": "0+0;1;1;0;1+1;0", - "aff_country_unique": "Italy;United States" + "aff_country_unique": "Italy;United States", + "bibtex": "@InProceedings{Ricci_2015_ICCV,\n \n author = {\n Ricci,\n Elisa and Varadarajan,\n Jagannadan and Subramanian,\n Ramanathan and Bulo,\n Samuel Rota and Ahuja,\n Narendra and Lanz,\n Oswald\n},\n title = {\n Uncovering Interactions and Interactors: Joint Estimation of Head,\n Body Orientation and F-Formations From Surveillance Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "89642743cf", @@ -15424,7 +15912,7 @@ "author": "Mathieu Aubry; Bryan C. Russell", "abstract": "We introduce an approach for analyzing the variation of features generated by convolutional neural networks (CNNs) trained on large image datasets with respect to scene factors that occur in natural images. 
Such factors may include object style, 3D viewpoint, color, and scene lighting configuration. Our approach analyzes CNN feature responses with respect to different scene factors by controlling for them via rendering using a large database of 3D CAD models. The rendered images are presented to a trained CNN and responses for different layers are studied with respect to the input scene factors. We perform a linear decomposition of the responses based on knowledge of the input scene factors and analyze the resulting components. In particular, we quantify their relative importance in the CNN responses and visualize them using principal component analysis. We show qualitative and quantitative results of our study on three trained CNNs: AlexNet [??], Places [??], and Oxford VGG [??]. We observe important differences across the different networks and CNN layers with respect to different scene factors and object categories. Finally, we demonstrate that our analysis based on computer-generated imagery translates to the network representation of natural images.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Aubry_Understanding_Deep_Features_ICCV_2015_paper.pdf", - "aff": "UC Berkeley Universit \u00b4e Paris-Est, LIGM (UMR CNRS 8049), ENPC; Adobe Research", + "aff": "UC Berkeley Universit ´e Paris-Est, LIGM (UMR CNRS 8049), ENPC; Adobe Research", "project": "", "github": "", "supp": "", @@ -15445,7 +15933,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aubry_2015_ICCV,\n \n author = {\n Aubry,\n Mathieu and Russell,\n Bryan C.\n},\n title = {\n Understanding Deep Features With Computer-Generated Imagery\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "df4a482e36", @@ -15454,11 
+15943,11 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Gr\u00e9gory Rogez, James S. Supan\u010di\u010d III, Deva Ramanan", + "author_site": "Grégory Rogez, James S. Supančič III, Deva Ramanan", "author": "Gregory Rogez; James S. Supancic III; Deva Ramanan", "abstract": "We analyze functional manipulations of handheld objects, formalizing the problem as one of fine-grained grasp classification. To do so, we make use of a recently developed fine-grained taxonomy of human-object grasps. We introduce a large dataset of 12000 RGB-D images covering 71 everyday grasps in natural interactions. Our dataset is different from past work (typically addressed from a robotics perspective) in terms of its scale, diversity, and combination of RGB and depth data. From a computer-vision perspective, our dataset allows for exploration of contact and force prediction (crucial concepts in functional grasp analysis) from perceptual cues. We present extensive experimental results with state-of-the-art baselines, illustrating the role of segmentation, object context, and 3D-understanding in functional grasp analysis. 
We demonstrate a near 2X improvement over prior work and a naive deep baseline, while pointing out important directions for improvement.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Rogez_Understanding_Everyday_Hands_ICCV_2015_paper.pdf", - "aff": "Inria Rh\u02c6one-Alpes; University of California, Irvine; Carnegie Mellon University", + "aff": "Inria Rhˆone-Alpes; University of California, Irvine; Carnegie Mellon University", "project": "", "github": "", "supp": "", @@ -15472,14 +15961,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Rogez_Understanding_Everyday_Hands_ICCV_2015_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "INRIA;University of California, Irvine;Carnegie Mellon University", + "aff_unique_norm": "Inria;University of California, Irvine;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;https://www.uci.edu;https://www.cmu.edu", "aff_unique_abbr": "Inria;UCI;CMU", "aff_campus_unique_index": "0;1", - "aff_campus_unique": "Rh\u00f4ne-Alpes;Irvine;", + "aff_campus_unique": "Rhône-Alpes;Irvine;", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Rogez_2015_ICCV,\n \n author = {\n Rogez,\n Gregory and Supancic,\n III,\n James S. 
and Ramanan,\n Deva\n},\n title = {\n Understanding Everyday Hands in Action From RGB-D Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6e30a39f92", @@ -15513,7 +16003,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Naiyan and Shi,\n Jianping and Yeung,\n Dit-Yan and Jia,\n Jiaya\n},\n title = {\n Understanding and Diagnosing Visual Tracking Systems\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "6979fb1108", @@ -15547,7 +16038,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khosla_2015_ICCV,\n \n author = {\n Khosla,\n Aditya and Raju,\n Akhil S. 
and Torralba,\n Antonio and Oliva,\n Aude\n},\n title = {\n Understanding and Predicting Image Memorability at a Large Scale\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5de38943a3", @@ -15581,7 +16073,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "College Park;Princeton", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Vemulapalli_2015_ICCV,\n \n author = {\n Vemulapalli,\n Raviteja and Van Nguyen,\n Hien and Zhou,\n Shaohua Kevin\n},\n title = {\n Unsupervised Cross-Modal Synthesis of Subject-Specific Scans\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ca1563e61c", @@ -15615,7 +16108,8 @@ "aff_campus_unique_index": "0;0;1;0;0;0", "aff_campus_unique": "Taiwan;Pittsburgh", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Hsu_2015_ICCV,\n \n author = {\n Hsu,\n Tzu Ming Harry and Chen,\n Wei Yu and Hou,\n Cheng-An and Tsai,\n Yao-Hung Hubert and Yeh,\n Yi-Ren and Wang,\n Yu-Chiang Frank\n},\n title = {\n Unsupervised Domain Adaptation With Imbalanced Cross-Domain Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0a8a8262e8", @@ -15649,7 +16143,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Kodirov_2015_ICCV,\n \n author = {\n Kodirov,\n Elyor and Xiang,\n Tao and Fu,\n Zhenyong and Gong,\n Shaogang\n},\n title = 
{\n Unsupervised Domain Adaptation for Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "73828eceaf", @@ -15675,15 +16170,16 @@ "email": "sjtu.edu.cn;microsoft.com;microsoft.com;microsoft.com;sjtu.edu.cn;microsoft.com", "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Yang_Unsupervised_Extraction_of_ICCV_2015_paper.html", - "aff_unique_index": "0;1;1;1;0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft", - "aff_unique_dep": ";Technology & Research", - "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com", - "aff_unique_abbr": "SJTU;Microsoft", + "aff_unique_index": "0;1;2;2;0;2", + "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft;Microsoft Corporation", + "aff_unique_dep": ";Technology & Research;Microsoft Research", + "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com;https://www.microsoft.com/en-us/research", + "aff_unique_abbr": "SJTU;Microsoft;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2015_ICCV,\n \n author = {\n Yang,\n Huan and Wang,\n Baoyuan and Lin,\n Stephen and Wipf,\n David and Guo,\n Minyi and Guo,\n Baining\n},\n title = {\n Unsupervised Extraction of Video Highlights Via Robust Recurrent Auto-Encoders\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "517e7804f1", @@ -15717,7 +16213,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Freiburg", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Sedaghat_2015_ICCV,\n \n author = {\n Sedaghat,\n 
Nima and Brox,\n Thomas\n},\n title = {\n Unsupervised Generation of a Viewpoint Annotated Car Dataset From Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cf0740538f", @@ -15745,13 +16242,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Goroshin_Unsupervised_Learning_of_ICCV_2015_paper.html", "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "New York University;University of California, Berkeley;Google;Courant Institute of Mathematical Sciences, New York University", - "aff_unique_dep": "Courant Institute;;Google;Department of Mathematics", + "aff_unique_dep": "Courant Institute;;;Department of Mathematics", "aff_unique_url": "https://www.courant.nyu.edu;https://www.berkeley.edu;https://www.google.com;https://www.courant.nyu.edu", "aff_unique_abbr": "NYU;UC Berkeley;Google;Courant", "aff_campus_unique_index": "0;1;2;0;0", "aff_campus_unique": "New York;Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Goroshin_2015_ICCV,\n \n author = {\n Goroshin,\n Ross and Bruna,\n Joan and Tompson,\n Jonathan and Eigen,\n David and LeCun,\n Yann\n},\n title = {\n Unsupervised Learning of Spatiotemporally Coherent Metrics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7f700a105b", @@ -15775,7 +16273,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Unsupervised_Learning_of_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Unsupervised_Learning_of_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Xiaolong and Gupta,\n Abhinav\n},\n title = {\n 
Unsupervised Learning of Visual Representations Using Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "61d2e67ce2", @@ -15788,7 +16287,7 @@ "author": "Suha Kwak; Minsu Cho; Ivan Laptev; Jean Ponce; Cordelia Schmid", "abstract": "This paper addresses the problem of automatically localizing dominant objects as spatio-temporal tubes in a noisy collection of videos with minimal or even no supervision. We formulate the problem as a combination of two complementary processes: discovery and tracking. The first one establishes correspondences between prominent regions across videos, and the second one associates similar object regions within the same video. Interestingly, our algorithm also discovers the implicit topology of frames associated with instances of the same object class across different videos, a role normally left to supervisory information in the form of class labels in conventional image and video understanding methods. Indeed, as demonstrated by our experiments, our method can handle video collections featuring multiple object classes, and substantially outperforms the state of the art in colocalization, even though it tackles a broader problem with much less supervision.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Kwak_Unsupervised_Object_Discovery_ICCV_2015_paper.pdf", - "aff": "Inria; Inria; Inria; Ecole Normale Sup\u00e9rieure / PSL Research University; Inria+Inria Grenoble Rh\u00f4ne-Alpes, Laboratoire Jean Kuntzmann, CNRS, Univ. Grenoble Alpes, France", + "aff": "Inria; Inria; Inria; Ecole Normale Supérieure / PSL Research University; Inria+Inria Grenoble Rhône-Alpes, Laboratoire Jean Kuntzmann, CNRS, Univ. 
Grenoble Alpes, France", "project": "", "github": "", "supp": "", @@ -15802,14 +16301,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kwak_Unsupervised_Object_Discovery_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;0+2", - "aff_unique_norm": "INRIA;Ecole Normale Sup\u00e9rieure;INRIA Grenoble Rh\u00f4ne-Alpes", + "aff_unique_norm": "Inria;Ecole Normale Supérieure;Inria Grenoble Rhône-Alpes", "aff_unique_dep": ";;Laboratoire Jean Kuntzmann", "aff_unique_url": "https://www.inria.fr;https://www.ens.fr;https://www.inria.fr/grenoble", "aff_unique_abbr": "Inria;ENS;Inria", "aff_campus_unique_index": "1", "aff_campus_unique": ";Grenoble", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Kwak_2015_ICCV,\n \n author = {\n Kwak,\n Suha and Cho,\n Minsu and Laptev,\n Ivan and Ponce,\n Jean and Schmid,\n Cordelia\n},\n title = {\n Unsupervised Object Discovery and Tracking in Video Collections\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "01a788d1a7", @@ -15836,14 +16336,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Sener_Unsupervised_Semantic_Parsing_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1+2", - "aff_unique_norm": "Stanford University;Cornell University;Brain Of Things", + "aff_unique_norm": "Stanford University;Cornell University;Brain of Things", "aff_unique_dep": ";;Inc.", "aff_unique_url": "https://www.stanford.edu;https://www.cornell.edu;https://www.brainofthings.com", "aff_unique_abbr": "Stanford;Cornell;BOT", "aff_campus_unique_index": "0;0;0;", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sener_2015_ICCV,\n \n author = {\n Sener,\n Ozan and 
Zamir,\n Amir R. and Savarese,\n Silvio and Saxena,\n Ashutosh\n},\n title = {\n Unsupervised Semantic Parsing of Video Collections\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "61d0ad21b9", @@ -15877,7 +16378,8 @@ "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Pittsburgh;;Beijing", "aff_country_unique_index": "0+0;1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chu_2015_ICCV,\n \n author = {\n Chu,\n Wen-Sheng and Zeng,\n Jiabei and De la Torre,\n Fernando and Cohn,\n Jeffrey F. and Messinger,\n Daniel S.\n},\n title = {\n Unsupervised Synchrony Discovery in Human Interaction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c281d557de", @@ -15911,7 +16413,8 @@ "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Atlanta;", "aff_country_unique_index": "0+0;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Xu_2015_ICCV,\n \n author = {\n Xu,\n Hongteng and Zhou,\n Yang and Lin,\n Weiyao and Zha,\n Hongyuan\n},\n title = {\n Unsupervised Trajectory Clustering via Adaptive Multi-Kernel-Based Shrinkage\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "0eb4798595", @@ -15945,7 +16448,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0", - "aff_country_unique": "Italy;Serbia" + "aff_country_unique": "Italy;Serbia", + "bibtex": "@InProceedings{Puscas_2015_ICCV,\n \n author = {\n Puscas,\n Mihai Marian and Sangineto,\n Enver and Culibrk,\n Dubravko and Sebe,\n Nicu\n},\n title = {\n Unsupervised Tube 
Extraction Using Transductive Learning and Dense Trajectories\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "fc4ab321de", @@ -15970,7 +16474,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Doersch_Unsupervised_Visual_Representation_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Doersch_Unsupervised_Visual_Representation_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Doersch_2015_ICCV,\n \n author = {\n Doersch,\n Carl and Gupta,\n Abhinav and Efros,\n Alexei A.\n},\n title = {\n Unsupervised Visual Representation Learning by Context Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e4b119cefe", @@ -15997,14 +16502,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Antol_VQA_Visual_Question_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;0;1;0", - "aff_unique_norm": "Virginia Tech;Microsoft", + "aff_unique_norm": "Virginia Tech;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.vt.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "VT;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Antol_2015_ICCV,\n \n author = {\n Antol,\n Stanislaw and Agrawal,\n Aishwarya and Lu,\n Jiasen and Mitchell,\n Margaret and Batra,\n Dhruv and Zitnick,\n C. 
Lawrence and Parikh,\n Devi\n},\n title = {\n VQA: Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "e385f4835c", @@ -16013,7 +16519,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "David Ferstl, Matthias R\u00fcther, Horst Bischof", + "author_site": "David Ferstl, Matthias Rüther, Horst Bischof", "author": "David Ferstl; Matthias Ruther; Horst Bischof", "abstract": "In this paper we propose a novel method for depth image superresolution which combines recent advances in example based upsampling with variational superresolution based on a known blur kernel. Most traditional depth superresolution approaches try to use additional high resolution intensity images as guidance for superresolution. In our method we learn a dictionary of edge priors from an external database of high and low resolution examples. In a novel variational sparse coding approach this dictionary is used to infer strong edge priors. Additionally to the traditional sparse coding constraints the difference in the overlap of neighboring edge patches is minimized in our optimization. These edge priors are used in a novel variational superresolution as anisotropic guidance of the higher order regularization. Both the sparse coding and the variational superresolution of the depth are solved based on a primal-dual formulation. 
In an exhaustive numerical and visual evaluation we show that our method clearly outperforms existing approaches on multiple real and synthetic datasets.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Ferstl_Variational_Depth_Superresolution_ICCV_2015_paper.pdf", @@ -16038,7 +16544,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Ferstl_2015_ICCV,\n \n author = {\n Ferstl,\n David and Ruther,\n Matthias and Bischof,\n Horst\n},\n title = {\n Variational Depth Superresolution Using Example-Based Edge Representations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "ead04a502d", @@ -16051,7 +16558,7 @@ "author": "Philipp Heise; Brian Jensen; Sebastian Klose; Alois Knoll", "abstract": "In this work we propose a novel approach to the problem of multi-view stereo reconstruction. Building upon the previously proposed PatchMatch stereo and PM-Huber algorithm we introduce an extension to the multi-view scenario that employs an iterative refinement scheme. Our proposed approach uses an extended and robustified volumetric truncated signed distance function representation, which is advantageous for the fusion of refined depth maps and also for raycasting the current reconstruction estimation together with estimated depth normals into arbitrary camera views. We formulate the combined multi-view stereo reconstruction and refinement as a variational optimization problem. The newly introduced plane based smoothing term in the energy formulation is guided by the current reconstruction confidence and the image contents. Further we propose an extension of the PatchMatch scheme with an additional KLT step to avoid unnecessary sampling iterations. 
Improper camera poses are corrected by a direct image aligment step that performs robust outlier compensation by means of a recently proposed kernel lifting framework. To speed up the optimization of the variational formulation an adapted scheme is used for faster convergence.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Heise_Variational_PatchMatch_MultiView_ICCV_2015_paper.pdf", - "aff": "Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany; Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany; Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany; Department of Informatics, Technische Universit \u00a8at M \u00a8unchen, Germany", + "aff": "Department of Informatics, Technische Universit ¨at M ¨unchen, Germany; Department of Informatics, Technische Universit ¨at M ¨unchen, Germany; Department of Informatics, Technische Universit ¨at M ¨unchen, Germany; Department of Informatics, Technische Universit ¨at M ¨unchen, Germany", "project": "", "github": "", "supp": "", @@ -16065,14 +16572,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Heise_Variational_PatchMatch_MultiView_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen", + "aff_unique_norm": "Technische Universität München", "aff_unique_dep": "Department of Informatics", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Heise_2015_ICCV,\n \n author = {\n Heise,\n Philipp and Jensen,\n Brian and Klose,\n Sebastian and Knoll,\n Alois\n},\n title = {\n Variational PatchMatch MultiView Reconstruction and Refinement\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n 
month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "127bb8978d", @@ -16106,7 +16614,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zou_2015_ICCV,\n \n author = {\n Zou,\n Dongqing and Chen,\n Xiaowu and Cao,\n Guangying and Wang,\n Xiaogang\n},\n title = {\n Video Matting via Sparse and Low-Rank Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "04cbaf83f8", @@ -16140,7 +16649,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "Canada;China" + "aff_country_unique": "Canada;China", + "bibtex": "@InProceedings{Wu_2015_ICCV,\n \n author = {\n Wu,\n Xiaolin and Li,\n Zhenhao and Deng,\n Xiaowei\n},\n title = {\n Video Restoration Against Yin-Yang Phasing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "cbcd6ea644", @@ -16174,7 +16684,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Freiburg", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Nagaraja_2015_ICCV,\n \n author = {\n Nagaraja,\n Naveen Shankar and Schmidt,\n Frank R. 
and Brox,\n Thomas\n},\n title = {\n Video Segmentation With Just a Few Strokes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f6b28da01e", @@ -16201,14 +16712,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Liao_Video_Super-Resolution_via_ICCV_2015_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Chinese Academy of Sciences", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.ucas.ac.cn", "aff_unique_abbr": "CUHK;UCAS", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liao_2015_ICCV,\n \n author = {\n Liao,\n Renjie and Tao,\n Xin and Li,\n Ruiyu and Ma,\n Ziyang and Jia,\n Jiaya\n},\n title = {\n Video Super-Resolution via Deep Draft-Ensemble Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "50e61016e5", @@ -16242,7 +16754,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yu_2015_ICCV,\n \n author = {\n Yu,\n Licheng and Park,\n Eunbyung and Berg,\n Alexander C. 
and Berg,\n Tamara L.\n},\n title = {\n Visual Madlibs: Fill in the Blank Description Generation and Question Answering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "72cfa2c14b", @@ -16267,7 +16780,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kumar_Visual_Phrases_for_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kumar_Visual_Phrases_for_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Kumar_2015_ICCV,\n \n author = {\n Kumar,\n Vijay and Namboodiri,\n Anoop and Jawahar,\n C. V.\n},\n title = {\n Visual Phrases for Exemplar Face Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c8196b3985", @@ -16292,7 +16806,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Visual_Tracking_With_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Wang_Visual_Tracking_With_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Wang_2015_ICCV,\n \n author = {\n Wang,\n Lijun and Ouyang,\n Wanli and Wang,\n Xiaogang and Lu,\n Huchuan\n},\n title = {\n Visual Tracking With Fully Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "41c5937bd3", @@ -16305,7 +16820,7 @@ "author": "Yuri Boykov; Hossam Isack; Carl Olsson; Ismail Ben Ayed", "abstract": "Many standard optimization methods for segmentation and reconstruction compute ML model estimates for appearance or geometry of segments, e.g. Zhu-Yuille 1996, Torr 1998, Chan-Vese 2001, GrabCut 2004, Delong et al. 2012. 
We observe that the standard likelihood term in these formulations corresponds to a generalized probabilistic K-means energy. In learning it is well known that this energy has a strong bias to clusters of equal size, which we express as a penalty for KL divergence from a uniform distribution of cardinalities. However, this volumetric bias has been mostly ignored in computer vision. We demonstrate significant artifacts in standard segmentation and reconstruction methods due to this bias. Moreover, we propose binary and multi-label optimization techniques that either (a) remove this bias or (b) replace it by a KL divergence term for any given target volume distribution. Our general ideas apply to continuous or discrete energy formulations in segmentation, stereo, and other reconstruction problems.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Boykov_Volumetric_Bias_in_ICCV_2015_paper.pdf", - "aff": "University of Western Ontario, Canada; University of Western Ontario, Canada; Lund University, Sweden; \u00c9cole de Technologie Sup\u00e9rieure, University of Quebec, Canada", + "aff": "University of Western Ontario, Canada; University of Western Ontario, Canada; Lund University, Sweden; École de Technologie Supérieure, University of Quebec, Canada", "project": "", "github": "", "supp": "", @@ -16324,9 +16839,10 @@ "aff_unique_url": "https://www.uwo.ca;https://www.lunduniversity.lu.se;https://www.etsmtl.ca", "aff_unique_abbr": "UWO;LU;ETS", "aff_campus_unique_index": "1", - "aff_campus_unique": ";\u00c9cole de Technologie Sup\u00e9rieure", + "aff_campus_unique": ";École de Technologie Supérieure", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Canada;Sweden" + "aff_country_unique": "Canada;Sweden", + "bibtex": "@InProceedings{Boykov_2015_ICCV,\n \n author = {\n Boykov,\n Yuri and Isack,\n Hossam and Olsson,\n Carl and Ben Ayed,\n Ismail\n},\n title = {\n Volumetric Bias in Segmentation and Reconstruction: Secrets and Solutions\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f1bf0bf418", @@ -16360,7 +16876,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pourian_2015_ICCV,\n \n author = {\n Pourian,\n Niloufar and Karthikeyan,\n S. and Manjunath,\n B.S.\n},\n title = {\n Weakly Supervised Graph Based Semantic Segmentation by Learning Communities of Image-Parts\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "c2d988d5ae", @@ -16388,13 +16905,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Papandreou_Weakly-_and_Semi-Supervised_ICCV_2015_paper.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Google;University of California, Los Angeles", - "aff_unique_dep": "Google;", + "aff_unique_dep": ";", "aff_unique_url": "https://www.google.com;https://www.ucla.edu", "aff_unique_abbr": "Google;UCLA", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Mountain View;Los Angeles", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Papandreou_2015_ICCV,\n \n author = {\n Papandreou,\n George and Chen,\n Liang-Chieh and Murphy,\n Kevin P. 
and Yuille,\n Alan L.\n},\n title = {\n Weakly- and Semi-Supervised Learning of a Deep Convolutional Network for Semantic Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "427f4e76cd", @@ -16403,7 +16921,7 @@ "status": "Poster", "track": "main", "pid": "", - "author_site": "Piotr Bojanowski, R\u00e9mi Lajugie, Edouard Grave, Francis Bach, Ivan Laptev, Jean Ponce, Cordelia Schmid", + "author_site": "Piotr Bojanowski, Rémi Lajugie, Edouard Grave, Francis Bach, Ivan Laptev, Jean Ponce, Cordelia Schmid", "author": "Piotr Bojanowski; Remi Lajugie; Edouard Grave; Francis Bach; Ivan Laptev; Jean Ponce; Cordelia Schmid", "abstract": "Suppose that we are given a set of videos, along with natural language descriptions in the form of multiple sentences (e.g., manual annotations, movie scripts, sport summaries etc.), and that these sentences appear in the same temporal order as their visual counterparts. We propose in this paper a method for aligning the two modalities, i.e., automatically providing a time (frame) stamp for every sentence. Given vectorial features for both video and text, this can be cast as a temporal assignment problem, with an implicit linear mapping between the two feature modalities. We formulate this problem as an integer quadratic program, and solve its continuous convex relaxation using an efficient conditional gradient algorithm. Several rounding procedures are proposed to construct the final integer solution. 
After demonstrating significant improvements over the state of the art on the related task of aligning video with symbolic labels, we evaluate our method on a challenging dataset of videos with associated textual descriptions, and explore bag-of-words and continuous representations for text.", "pdf": "http://openaccess.thecvf.com/content_iccv_2015/papers/Bojanowski_Weakly-Supervised_Alignment_of_ICCV_2015_paper.pdf", @@ -16419,7 +16937,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bojanowski_Weakly-Supervised_Alignment_of_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Bojanowski_Weakly-Supervised_Alignment_of_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Bojanowski_2015_ICCV,\n \n author = {\n Bojanowski,\n Piotr and Lajugie,\n Remi and Grave,\n Edouard and Bach,\n Francis and Laptev,\n Ivan and Ponce,\n Jean and Schmid,\n Cordelia\n},\n title = {\n Weakly-Supervised Alignment of Video With Text\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5e60364166", @@ -16444,7 +16963,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Carneiro_Weakly-Supervised_Structured_Output_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Carneiro_Weakly-Supervised_Structured_Output_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Carneiro_2015_ICCV,\n \n author = {\n Carneiro,\n Gustavo and Peng,\n Tingying and Bayer,\n Christine and Navab,\n Nassir\n},\n title = {\n Weakly-Supervised Structured Output Learning With Flexible and Latent Graphs Using High-Order Loss Functions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "58e0786449", 
@@ -16469,7 +16989,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Avrithis_Web-Scale_Image_Clustering_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Avrithis_Web-Scale_Image_Clustering_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Avrithis_2015_ICCV,\n \n author = {\n Avrithis,\n Yannis and Kalantidis,\n Yannis and Anagnostopoulos,\n Evangelos and Emiris,\n Ioannis Z.\n},\n title = {\n Web-Scale Image Clustering Revisited\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "a5bf5afaa6", @@ -16493,7 +17014,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_Webly_Supervised_Learning_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Chen_Webly_Supervised_Learning_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Chen_2015_ICCV,\n \n author = {\n Chen,\n Xinlei and Gupta,\n Abhinav\n},\n title = {\n Webly Supervised Learning of Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "14e8b92c42", @@ -16518,7 +17040,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Suwajanakorn_What_Makes_Tom_ICCV_2015_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Suwajanakorn_What_Makes_Tom_ICCV_2015_paper.html", + "bibtex": "@InProceedings{Suwajanakorn_2015_ICCV,\n \n author = {\n Suwajanakorn,\n Supasorn and Seitz,\n Steven M. 
and Kemelmacher-Shlizerman,\n Ira\n},\n title = {\n What Makes Tom Hanks Look Like Tom Hanks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "482462f5b8", @@ -16552,7 +17075,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Merced", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "Saudi Arabia;United States" + "aff_country_unique": "Saudi Arabia;United States", + "bibtex": "@InProceedings{Dubey_2015_ICCV,\n \n author = {\n Dubey,\n Rachit and Peterson,\n Joshua and Khosla,\n Aditya and Yang,\n Ming-Hsuan and Ghanem,\n Bernard\n},\n title = {\n What Makes an Object Memorable?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "8d32c44d2e", @@ -16579,14 +17103,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Kiapour_Where_to_Buy_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "University of North Carolina;University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of North Carolina;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.unc.edu;https://illinois.edu", "aff_unique_abbr": "UNC;UIUC", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Chapel Hill;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kiapour_2015_ICCV,\n \n author = {\n Kiapour,\n M. Hadi and Han,\n Xufeng and Lazebnik,\n Svetlana and Berg,\n Alexander C. 
and Berg,\n Tamara L.\n},\n title = {\n Where to Buy It: Matching Street Clothing Photos in Online Shops\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "4f66cacc1c", @@ -16613,14 +17138,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2015/html/Galun_Wide_Baseline_Stereo_ICCV_2015_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Weizmann Institute of Science;Open University", + "aff_unique_norm": "Weizmann Institute of Science;The Open University", "aff_unique_dep": ";", - "aff_unique_url": "https://www.weizmann.ac.il;https://www.openu.ac.il", + "aff_unique_url": "https://www.weizmann.org.il;https://www.openu.ac.il", "aff_unique_abbr": "Weizmann;OpenU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Galun_2015_ICCV,\n \n author = {\n Galun,\n Meirav and Amir,\n Tal and Hassner,\n Tal and Basri,\n Ronen and Lipman,\n Yaron\n},\n title = {\n Wide Baseline Stereo Matching With Convex Bounded Distortion Constraints\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "7f87b33046", @@ -16654,7 +17180,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Charlotte", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Workman_2015_ICCV,\n \n author = {\n Workman,\n Scott and Souvenir,\n Richard and Jacobs,\n Nathan\n},\n title = {\n Wide-Area Image Geolocalization With Aerial Reference Imagery\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": 
"4a14e7e7fd", @@ -16688,7 +17215,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chu_2015_ICCV,\n \n author = {\n Chu,\n Hang and Kim,\n Dong Ki and Chen,\n Tsuhan\n},\n title = {\n You Are Here: Mimicking the Human Thinking Process in Reading Floor-Plans\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "f101111d81", @@ -16722,457 +17250,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" - }, - { - "id": "site_d4091d8f25", - "title": "[From Oral 1A] Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books", - "session": "recognition, low-level vision, and biomedical image analysis", - "author": "Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, Sanja Fidler", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_626dc25a44", - "title": "[From Oral 1A] Ask Your Neurons: A Neural-Based Approach to Answering Questions About Images", - "session": "recognition, low-level vision, and biomedical image analysis", - "author": "Mateusz Malinowski, Marcus Rohrbach, Mario Fritz", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_6f706f9856", - "title": "[From Oral 1A] Learning Query and Image Similarities With Ranking Canonical Correlation Analysis", - "session": "recognition, low-level vision, and biomedical image analysis", - "author": "Ting Yao, Tao Mei, Chong-Wah Ngo", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_3bae571f19", - "title": "[From Oral 1A] Segment-Phrase Table for Semantic Segmentation, Visual Entailment and Paraphrasing", - "session": 
"recognition, low-level vision, and biomedical image analysis", - "author": "Hamid Izadinia, Fereshteh Sadeghi, Santosh K. Divvala, Hannaneh Hajishirzi, Yejin Choi, Ali Farhadi", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_141f62200b", - "title": "[From Oral 1B] 3D Time-Lapse Reconstruction From Internet Photos", - "session": "recognition and 3d computer vision i", - "author": "Ricardo Martin-Brualla, David Gallup, Steven M. Seitz", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_5f6e6133f9", - "title": "[From Oral 1B] Global, Dense Multiscale Reconstruction for a Billion Points", - "session": "recognition and 3d computer vision i", - "author": "Benjamin Ummenhofer, Thomas Brox", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_3dd13c6620", - "title": "[From Oral 1B] On the Visibility of Point Clouds", - "session": "recognition and 3d computer vision i", - "author": "Sagi Katz, Ayellet Tal", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_b59da822e6", - "title": "[From Oral 1B] Structured Indoor Modeling", - "session": "recognition and 3d computer vision i", - "author": "Satoshi Ikehata, Hang Yang, Yasutaka Furukawa", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_1c0c64f352", - "title": "[From Oral 2A] Holistically-Nested Edge Detection", - "session": "optimization, segmentation, and recognition", - "author": "Saining Xie, Zhuowen Tu", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_cae1b568de", - "title": "[From Oral 2A] Human Parsing With Contextualized Convolutional Neural Network", - "session": "optimization, segmentation, and recognition", - "author": "Xiaodan Liang, Chunyan Xu, Xiaohui Shen, Jianchao Yang, Si Liu, Jinhui Tang, Liang Lin, Shuicheng Yan", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_9824f21472", - "title": "[From Oral 2A] Minimum Barrier 
Salient Object Detection at 80 FPS", - "session": "optimization, segmentation, and recognition", - "author": "Jianming Zhang, Stan Sclaroff, Zhe Lin, Xiaohui Shen, Brian Price, Radom\u00edr M\u011bch", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_f14d6ffca3", - "title": "[From Oral 2A] Piecewise Flat Embedding for Image Segmentation", - "session": "optimization, segmentation, and recognition", - "author": "Yizhou Yu, Chaowei Fang, Zicheng Liao", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_9dfc56300e", - "title": "[From Oral 2A] Semantic Image Segmentation via Deep Parsing Network", - "session": "optimization, segmentation, and recognition", - "author": "Ziwei Liu, Xiaoxiao Li, Ping Luo, Chen-Change Loy, Xiaoou Tang", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_7fa75cd8ad", - "title": "[From Oral 2A] Weakly Supervised Graph Based Semantic Segmentation by Learning Communities of Image-Parts", - "session": "optimization, segmentation, and recognition", - "author": "Niloufar Pourian, S. Karthikeyan, B.S. 
Manjunath", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_7b4310e234", - "title": "[From Oral 2B] Bilinear CNN Models for Fine-Grained Visual Recognition", - "session": "optimization, segmentation, and recognition", - "author": "Tsung-Yu Lin, Aruni RoyChowdhury, Subhransu Maji", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_6bfcff158a", - "title": "[From Oral 2B] Discovering the Spatial Extent of Relative Attributes", - "session": "optimization, segmentation, and recognition", - "author": "Fanyi Xiao, Yong Jae Lee", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_03986ff9eb", - "title": "[From Oral 2B] Fast R-CNN", - "session": "optimization, segmentation, and recognition", - "author": "Ross Girshick", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_7dfe282072", - "title": "[From Oral 2B] Learning Image Representations Tied to Ego-Motion", - "session": "optimization, segmentation, and recognition", - "author": "Dinesh Jayaraman, Kristen Grauman", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_4b8dd904a4", - "title": "[From Oral 2B] Unsupervised Visual Representation Learning by Context Prediction", - "session": "optimization, segmentation, and recognition", - "author": "Carl Doersch, Abhinav Gupta, Alexei A. 
Efros", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_ba30f9a1d7", - "title": "[From Oral 2B] Webly Supervised Learning of Convolutional Networks", - "session": "optimization, segmentation, and recognition", - "author": "Xinlei Chen, Abhinav Gupta", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_fc8483865e", - "title": "[From Oral 2C] Deep Fried Convnets", - "session": "optimization, segmentation, and recognition", - "author": "Zichao Yang, Marcin Moczulski, Misha Denil, Nando de Freitas, Alex Smola, Le Song, Ziyu Wang", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_05fbd12da5", - "title": "[From Oral 2C] Deep Neural Decision Forests", - "session": "optimization, segmentation, and recognition", - "author": "Peter Kontschieder, Madalina Fiterau, Antonio Criminisi, Samuel Rota Bul\u00f2", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_0a5799d93e", - "title": "[From Oral 2C] Learning Discriminative Reconstructions for Unsupervised Outlier Removal", - "session": "optimization, segmentation, and recognition", - "author": "Yan Xia, Xudong Cao, Fang Wen, Gang Hua, Jian Sun", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_c458742e48", - "title": "[From Oral 2C] Low-Rank Matrix Factorization Under General Mixture Noise Distributions", - "session": "optimization, segmentation, and recognition", - "author": "Xiangyong Cao, Yang Chen, Qian Zhao, Deyu Meng, Yao Wang, Dong Wang, Zongben Xu", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_9463238582", - "title": "[From Oral 2C] Semantic Component Analysis", - "session": "optimization, segmentation, and recognition", - "author": "Calvin Murdock, Fernando De la Torre", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_79a048dc5a", - "title": "[From Oral 2C] Web-Scale Image Clustering Revisited", - "session": "optimization, 
segmentation, and recognition", - "author": "Yannis Avrithis, Yannis Kalantidis, Evangelos Anagnostopoulos, Ioannis Z. Emiris", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_ceb05cb77f", - "title": "[From Oral 3A] CV-HAZOP: Introducing Test Data Validation for Computer Vision", - "session": "recognition and 3d computer vision ii", - "author": "Oliver Zendel, Markus Murschitz, Martin Humenberger, Wolfgang Herzner", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_3bd285a6a0", - "title": "[From Oral 3A] MeshStereo: A Global Stereo Model With Mesh Alignment Regularization for View Interpolation", - "session": "recognition and 3d computer vision ii", - "author": "Chi Zhang, Zhiwei Li, Yanhua Cheng, Rui Cai, Hongyang Chao, Yong Rui", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_1ca77c0888", - "title": "[From Oral 3A] Registering Images to Untextured Geometry Using Average Shading Gradients", - "session": "recognition and 3d computer vision ii", - "author": "Tobias Pl\u00f6tz, Stefan Roth", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_09a3fb5bd0", - "title": "[From Oral 3A] Robust Nonrigid Registration by Convex Optimization", - "session": "recognition and 3d computer vision ii", - "author": "Qifeng Chen, Vladlen Koltun", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_5c3c0b4a3f", - "title": "[From Oral 3A] Robust and Optimal Sum-of-Squares-Based Point-to-Plane Registration of Image Sets and Structured Scenes", - "session": "recognition and 3d computer vision ii", - "author": "Danda Pani Paudel, Adlane Habed, C\u00e9dric Demonceaux, Pascal Vasseur", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_6f50d5b048", - "title": "[From Oral 3B] 3D-Assisted Feature Synthesis for Novel Views of an Object", - "session": "recognition and 3d computer vision ii", - "author": "Hao Su, Fan Wang, Eric Yi, Leonidas 
J. Guibas", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_c539d4dd62", - "title": "[From Oral 3B] Camera Pose Voting for Large-Scale Image-Based Localization", - "session": "recognition and 3d computer vision ii", - "author": "Bernhard Zeisl, Torsten Sattler, Marc Pollefeys", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_7ffc9e93ba", - "title": "[From Oral 3B] Lost Shopping! Monocular Localization in Large Indoor Spaces", - "session": "recognition and 3d computer vision ii", - "author": "Shenlong Wang, Sanja Fidler, Raquel Urtasun", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_31b44d6e08", - "title": "[From Oral 3B] Render for CNN: Viewpoint Estimation in Images Using CNNs Trained With Rendered 3D Model Views", - "session": "statistical methods and learning, motion and tracking, and video analysis i", - "author": "Hao Su, Charles R. Qi, Yangyan Li, Leonidas J. Guibas", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_274e21e6ea", - "title": "[From Oral 3C] Learning Complexity-Aware Cascades for Deep Pedestrian Detection", - "session": "statistical methods and learning, motion and tracking, and video analysis i", - "author": "Zhaowei Cai, Mohammad Saberian, Nuno Vasconcelos", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_cc772e8082", - "title": "[From Oral 3C] Multi-Task Recurrent Neural Network for Immediacy Prediction", - "session": "statistical methods and learning, motion and tracking, and video analysis i", - "author": "Xiao Chu, Wanli Ouyang, Wei Yang, Xiaogang Wang", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_3cef0b7168", - "title": "[From Oral 3C] Opening the Black Box: Hierarchical Sampling Optimization for Estimating Human Hand Pose", - "session": "statistical methods and learning, motion and tracking, and video analysis i", - "author": "Danhang Tang, Jonathan Taylor, Pushmeet 
Kohli, Cem Keskin, Tae-Kyun Kim, Jamie Shotton", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_c2128236f0", - "title": "[From Oral 3C] Panoptic Studio: A Massively Multiview System for Social Motion Capture", - "session": "statistical methods and learning, motion and tracking, and video analysis i", - "author": "Hanbyul Joo, Hao Liu, Lei Tan, Lin Gui, Bart Nabbe, Iain Matthews, Takeo Kanade, Shohei Nobuhara, Yaser Sheikh", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_5f05f33ae2", - "title": "[From Oral 3C] Training a Feedback Loop for Hand Pose Estimation", - "session": "statistical methods and learning, motion and tracking, and video analysis i", - "author": "Markus Oberweger, Paul Wohlhart, Vincent Lepetit", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_0728e649f4", - "title": "[From Oral 3C] Where to Buy It: Matching Street Clothing Photos in Online Shops", - "session": "statistical methods and learning, motion and tracking, and video analysis i", - "author": "M. Hadi Kiapour, Xufeng Han, Svetlana Lazebnik, Alexander C. Berg, Tamara L. Berg", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_7661935dcb", - "title": "[From Oral 4A] Airborne Three-Dimensional Cloud Tomography", - "session": "computational photography, face and gesture, and vision for x", - "author": "Aviad Levis, Yoav Y. Schechner, Amit Aides, Anthony B. Davis", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_69dccb5522", - "title": "[From Oral 4A] Leave-One-Out Kernel Optimization for Shadow Detection", - "session": "computational photography, face and gesture, and vision for x", - "author": "Tomas F. 
Yago Vicente, Minh Hoai, Dimitris Samaras", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_2c52f7a9a8", - "title": "[From Oral 4A] Mutual-Structure for Joint Filtering", - "session": "computational photography, face and gesture, and vision for x", - "author": "Xiaoyong Shen, Chao Zhou, Li Xu, Jiaya Jia", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_8994a45611", - "title": "[From Oral 4A] Polarized 3D: High-Quality Depth Sensing With Polarization Cues", - "session": "computational photography, face and gesture, and vision for x", - "author": "Achuta Kadambi, Vage Taamazyan, Boxin Shi, Ramesh Raskar", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_bfc327d719", - "title": "[From Oral 4A] Removing Rain From a Single Image via Discriminative Sparse Coding", - "session": "computational photography, face and gesture, and vision for x", - "author": "Yu Luo, Yong Xu, Hui Ji", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_bcd21d6446", - "title": "[From Oral 4B] Dense Semantic Correspondence Where Every Pixel is a Classifier", - "session": "computational photography, face and gesture, and vision for x", - "author": "Hilton Bristow, Jack Valmadre, Simon Lucey", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_c53fc339ff", - "title": "[From Oral 4B] Flow Fields: Dense Correspondence Fields for Highly Accurate Large Displacement Optical Flow Estimation", - "session": "computational photography, face and gesture, and vision for x", - "author": "Christian Bailer, Bertram Taetz, Didier Stricker", - "status": "Poster", - "track": "main", - "pid": "" - }, - { - "id": "site_1aff8e6d24", - "title": "[From Oral 4B] Multi-Image Matching via Fast Alternating Minimization", - "session": "computational photography, face and gesture, and vision for x", - "author": "Xiaowei Zhou, Menglong Zhu, Kostas Daniilidis", - "status": "Poster", - "track": 
"main", - "pid": "" - }, - { - "id": "site_588e566296", - "title": "[From Oral 4B] SPM-BP: Sped-up PatchMatch Belief Propagation for Continuous MRFs", - "session": "computational photography, face and gesture, and vision for x", - "author": "Yu Li, Dongbo Min, Michael S. Brown, Minh N. Do, Jiangbo Lu", - "status": "Poster", - "track": "main", - "pid": "" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2015_ICCV,\n \n author = {\n Zhang,\n Ziming and Saligrama,\n Venkatesh\n},\n title = {\n Zero-Shot Learning via Semantic Similarity Embedding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" }, { "id": "5ec3fdd6a1", @@ -17206,6 +17285,7 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ding_2015_ICCV,\n \n author = {\n Ding,\n Kun and Huo,\n Chunlei and Fan,\n Bin and Pan,\n Chunhong\n},\n title = {\n kNN Hashing With Factorized Neighborhood Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n December\n},\n year = {\n 2015\n} \n}" } ] \ No newline at end of file diff --git a/iccv/iccv2017.json b/iccv/iccv2017.json index 2fdcd6e..9e78d9f 100644 --- a/iccv/iccv2017.json +++ b/iccv/iccv2017.json @@ -5,6 +5,7 @@ "status": "Poster", "track": "main", "pid": "478", + "author_site": "Pan Ji; Hongdong Li; Yuchao Dai; Ian Reid", "author": "Pan Ji; Hongdong Li; Yuchao Dai; Ian Reid", "abstract": "Rigid structure-from-motion (RSfM) and non-rigid structure-from-motion (NRSfM) have long been treated in the literature as separate (different) problems. 
Inspired by a previous work which solved directly for 3D scene structure by factoring the relative camera poses out, we revisit the principle of \"maximizing rigidity\" in structure-from-motion literature, and develop a unified theory which is applicable to both rigid and non-rigid structure reconstruction in a rigidity-agnostic way. We formulate these problems as a convex semi-definite program, imposing constraints that seek to apply the principle of minimizing non-rigidity. Our results demonstrate the efficacy of the approach, with state-of-the-art accuracy on various 3D reconstruction problems.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ji_Maximizing_Rigidity_Revisited_ICCV_2017_paper.pdf", @@ -20,7 +21,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ji_Maximizing_Rigidity_Revisited_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ji_Maximizing_Rigidity_Revisited_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Ji_2017_ICCV,\n \n author = {\n Ji,\n Pan and Li,\n Hongdong and Dai,\n Yuchao and Reid,\n Ian\n},\n title = {\n \"Maximizing Rigidity\" Revisited: A Convex Programming Approach for Generic 3D Shape Reconstruction From Multiple Perspective Views\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "2D-Driven 3D Object Detection in RGB-D Images", @@ -28,6 +30,7 @@ "status": "Poster", "track": "main", "pid": "2164", + "author_site": "Jean Lahoud; Bernard Ghanem", "author": "Jean Lahoud; Bernard Ghanem", "abstract": "In this paper, we present a technique that places 3D bounding boxes around objects in an RGB-D scene. Our approach makes best use of the 2D information to quickly reduce the search space in 3D, benefiting from state-of-the-art 2D object detection techniques. 
We then use the 3D information to orient, place, and score bounding boxes around objects. We independently estimate the orientation for every object, using previous techniques that utilize normal information. Object locations and sizes in 3D are learned using a multilayer perceptron (MLP). In the final step, we refine our detections based on object class relations within a scene. When compared to state-of-the-art detection methods that operate almost entirely in the sparse 3D domain, extensive experiments on the well-known SUN RGB-D dataset show that our proposed method is much faster (4.1s per image) in detecting 3D objects in RGB-D images and performs better (3 mAP higher) than the state-of-the-art method that is 4.7 times slower and comparably to the method that is two orders of magnitude slower. This work hints at the idea that 2D-driven object detection in 3D should be further explored, especially in cases where the 3D input is sparse.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lahoud_2D-Driven_3D_Object_ICCV_2017_paper.pdf", @@ -52,7 +55,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Lahoud_2017_ICCV,\n \n author = {\n Lahoud,\n Jean and Ghanem,\n Bernard\n},\n title = {\n 2D-Driven 3D Object Detection in RGB-D Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "3D Graph Neural Networks for RGBD Semantic Segmentation", @@ -60,6 +64,7 @@ "status": "Oral", "track": "main", "pid": "1548", + "author_site": "Xiaojuan Qi; Renjie Liao; Jiaya Jia; Sanja Fidler; Raquel Urtasun", "author": "Xiaojuan Qi; Renjie Liao; Jiaya Jia; Sanja Fidler; Raquel Urtasun", "abstract": "RGBD semantic segmentation requires joint reasoning about 2D appearance and 3D geometric information. 
In this paper we propose a 3D graph neural network (3DGNN) that builds a k-nearest neighbor graph on top of 3D point cloud. Each node in the graph corresponds to a set of points and is associated with a hidden representation vector initialized with an appearance feature extracted by a unary CNN from 2D images. Relying on recurrent functions, every node dynamically updates its hidden representation based on the current status and incoming messages from its neighbors. This propagation model is unrolled for a certain number of time steps and the final per-node representation is used for predicting the semantic class of each pixel. We use back-propagation through time to train the model. Extensive experiments on NYUD2 and SUN-RGBD datasets demonstrate the effectiveness of our approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Qi_3D_Graph_Neural_ICCV_2017_paper.pdf", @@ -77,14 +82,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Qi_3D_Graph_Neural_ICCV_2017_paper.html", "aff_unique_index": "0;1+2;0+3;1+2;1", - "aff_unique_norm": "Chinese University of Hong Kong;University of Toronto;Uber;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Toronto;Uber;Tencent", "aff_unique_dep": ";;Advanced Technologies Group;Youtu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.utoronto.ca;https://www.uber.com;https://www.tencent.com", "aff_unique_abbr": "CUHK;U of T;Uber ATG;Tencent", "aff_campus_unique_index": "0;;0;", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1+2;0+0;1+2;1", - "aff_country_unique": "China;Canada;United States" + "aff_country_unique": "China;Canada;United States", + "bibtex": "@InProceedings{Qi_2017_ICCV,\n \n author = {\n Qi,\n Xiaojuan and Liao,\n Renjie and Jia,\n Jiaya and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n 3D Graph Neural Networks for RGBD Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE 
International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "3D Surface Detail Enhancement From a Single Normal Map", @@ -92,6 +98,7 @@ "status": "Poster", "track": "main", "pid": "1033", + "author_site": "Wuyuan Xie; Miaohui Wang; Xianbiao Qi; Lei Zhang", "author": "Wuyuan Xie; Miaohui Wang; Xianbiao Qi; Lei Zhang", "abstract": "In 3D reconstruction, the obtained surface details are mainly limited to the visual sensor due to sampling and quantization in the digitalization process. How to get a fine-grained 3D surface with low-cost is still a challenging obstacle in terms of experience, equipment and easy-to-obtain. This work introduces a novel framework for enhancing surfaces reconstructed from normal map, where the assumptions on hardware (e.g., photometric stereo setup) and reflection model (e.g., Lambertion reflection) are not necessarily needed. We propose to use a new measure, angle profile, to infer the hidden micro-structure from existing surfaces. In addition, the inferred results are further improved in the domain of discrete geometry processing (DGP) which is able to achieve a stable surface structure under a selectable enhancement setting. 
Extensive simulation results show that the proposed method obtains significantly improvements over uniform sharpening method in terms of both subjective visual assessment and objective quality metric.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xie_3D_Surface_Detail_ICCV_2017_paper.pdf", @@ -106,7 +113,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xie_3D_Surface_Detail_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xie_3D_Surface_Detail_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Xie_2017_ICCV,\n \n author = {\n Xie,\n Wuyuan and Wang,\n Miaohui and Qi,\n Xianbiao and Zhang,\n Lei\n},\n title = {\n 3D Surface Detail Enhancement From a Single Normal Map\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "3D-PRNN: Generating Shape Primitives With Recurrent Neural Networks", @@ -114,6 +122,7 @@ "status": "Poster", "track": "main", "pid": "384", + "author_site": "Chuhang Zou; Ersin Yumer; Jimei Yang; Duygu Ceylan; Derek Hoiem", "author": "Chuhang Zou; Ersin Yumer; Jimei Yang; Duygu Ceylan; Derek Hoiem", "abstract": "The success of various applications including robotics, digital content creation, and visualization demand a structured and abstract representation of the 3D world from limited sensor data. Inspired by the nature of human perception of 3D shapes as a collection of simple parts, we explore such an abstract shape representation based on primitives. Given a single depth image of an object, we present 3D-PRNN, a generative recurrent neural network that synthesizes multiple plausible shapes composed of a set of primitives. 
Our generative model encodes symmetry characteristics of common man-made objects, preserves long-range structural coherence, and describes objects of varying complexity with a compact representation. We also propose a method based on Gaussian Fields to generate a large scale dataset of primitive-based shape representations to train our network. We evaluate our approach on a wide range of examples and show that it outperforms nearest-neighbor based shape retrieval methods and is on-par with voxel-based generative models while using a significantly reduced parameter space.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zou_3D-PRNN_Generating_Shape_ICCV_2017_paper.pdf", @@ -131,14 +140,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zou_3D-PRNN_Generating_Shape_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Adobe", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Adobe", "aff_unique_dep": ";Adobe Research", "aff_unique_url": "https://illinois.edu;https://research.adobe.com", "aff_unique_abbr": "UIUC;Adobe", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zou_2017_ICCV,\n \n author = {\n Zou,\n Chuhang and Yumer,\n Ersin and Yang,\n Jimei and Ceylan,\n Duygu and Hoiem,\n Derek\n},\n title = {\n 3D-PRNN: Generating Shape Primitives With Recurrent Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "3DCNN-DQN-RNN: A Deep Reinforcement Learning Framework for Semantic Parsing of Large-Scale 3D Point Clouds", @@ -146,6 +156,7 @@ "status": "Poster", "track": "main", "pid": "2931", + "author_site": "Fangyu Liu; Shuaipeng Li; Liqiang Zhang; 
Chenghu Zhou; Rongtian Ye; Yuebin Wang; Jiwen Lu", "author": "Fangyu Liu; Shuaipeng Li; Liqiang Zhang; Chenghu Zhou; Rongtian Ye; Yuebin Wang; Jiwen Lu", "abstract": "Semantic parsing of large-scale 3D point clouds is an important research topic in computer vision and remote sensing fields. Most existing approaches utilize hand-crafted features for each modality independently and combine them in a heuristic manner. They often fail to consider the consistency and complementary information among features adequately, which makes them difficult to capture high-level semantic structures. The features learned by most of the current deep learning methods can obtain high-quality image classification results. However, these methods are hard to be applied to recognize 3D point clouds due to unorganized distribution and various point density of data. In this paper, we propose a 3DCNN-DQN-RNN method which fuses the 3D convolutional neural network (CNN), Deep Q-Network (DQN) and Residual recurrent neural network (RNN) for an efficient semantic parsing of large-scale 3D point clouds. In our method, an eye window under control of the 3D CNN and DQN can localize and segment the points of the class objects efficiently. The 3D CNN and Residual RNN further extract robust and discriminative features of the points in the eye window, and thus greatly enhance the parsing accuracy of large-scale point clouds. Our method provides an automatic process that maps the raw data to the classification results. It also integrates object localization, segmentation and classification into one framework. 
Experimental results demonstrate that the proposed method outperforms the state-of-the-art point cloud classification methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_3DCNN-DQN-RNN_A_Deep_ICCV_2017_paper.pdf", @@ -161,7 +172,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_3DCNN-DQN-RNN_A_Deep_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_3DCNN-DQN-RNN_A_Deep_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Fangyu and Li,\n Shuaipeng and Zhang,\n Liqiang and Zhou,\n Chenghu and Ye,\n Rongtian and Wang,\n Yuebin and Lu,\n Jiwen\n},\n title = {\n 3DCNN-DQN-RNN: A Deep Reinforcement Learning Framework for Semantic Parsing of Large-Scale 3D Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A 3D Morphable Model of Craniofacial Shape and Texture Variation", @@ -169,6 +181,7 @@ "status": "Poster", "track": "main", "pid": "1290", + "author_site": "Hang Dai; Nick Pears; William A. P. Smith; Christian Duncan", "author": "Hang Dai; Nick Pears; William A. P. Smith; Christian Duncan", "abstract": "We present a fully automatic pipeline to train 3D Morphable Models (3DMMs), with contributions in pose normalisation, dense correspondence using both shape and texture information, and high quality, high resolution texture mapping. We propose a dense correspondence system, combining a hierarchical parts-based template morphing framework in the shape channel and a refining optical flow in the texture channel. The texture map is generated using raw texture images from five views. We employ a pixel-embedding method to maintain the texture map at the same high resolution as the raw texture images, rather than using per-vertex color maps. 
The high quality texture map is then used for statistical texture modelling. The Headspace dataset used for training includes demographic information about each subject, allowing for the construction of both global 3DMMs and models tailored for specific gender and age groups. We build both global craniofacial 3DMMs and demographic sub-population 3DMMs from more than 1200 distinct identities. To our knowledge, we present the first public 3DMM of the full human head in both shape and texture: the Liverpool-York Head Model. Furthermore, we analyse the 3DMMs in terms of a range of performance metrics. Our evaluations reveal that the training pipeline constructs state-of-the-art models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dai_A_3D_Morphable_ICCV_2017_paper.pdf", @@ -193,7 +206,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Liverpool", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Dai_2017_ICCV,\n \n author = {\n Dai,\n Hang and Pears,\n Nick and Smith,\n William A. P. and Duncan,\n Christian\n},\n title = {\n A 3D Morphable Model of Craniofacial Shape and Texture Variation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Coarse-Fine Network for Keypoint Localization", @@ -201,6 +215,7 @@ "status": "Spotlight", "track": "main", "pid": "1609", + "author_site": "Shaoli Huang; Mingming Gong; Dacheng Tao", "author": "Shaoli Huang; Mingming Gong; Dacheng Tao", "abstract": "We propose a coarse-fine network (CFN) that exploits multi-level supervisions for keypoint localization. Recently, convolutional neural networks (CNNs)-based methods have achieved great success due to the powerful hierarchical features in CNNs. 
These methods typically use confidence maps generated from ground-truth keypoint locations as supervisory signals. However, while some keypoints can be easily located with high accuracy, many of them are hard to localize due to appearance ambiguity. Thus, using strict supervision often fails to detect keypoints that are difficult to locate accurately. To target this problem, we develop a keypoint localization network composed of several coarse detector branches, each of which is built on top of a feature layer in a CNN, and a fine detector branch built on top of multiple feature layers. We supervise each branch by a specified label map to explicate a certain supervision strictness level. All the branches are unified principally to produce the final accurate keypoint locations. We demonstrate the efficacy, efficiency, and generality of our method on several benchmarks for multiple tasks including bird part localization and human body pose estimation. Especially, our method achieves 72.2% AP on the 2016 COCO Keypoints Challenge dataset, which is an 18% improvement over the winning entry.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_A_Coarse-Fine_Network_ICCV_2017_paper.pdf", @@ -218,14 +233,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Huang_A_Coarse-Fine_Network_ICCV_2017_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "University of Sydney;University of Technology Sydney", + "aff_unique_norm": "The University of Sydney;University of Technology Sydney", "aff_unique_dep": "Sydney AI Centre;Faculty of Engineering and Information Technology", "aff_unique_url": "https://www.sydney.edu.au;https://www.uts.edu.au", "aff_unique_abbr": "USYD;UTS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Huang_2017_ICCV,\n \n author = {\n Huang,\n Shaoli 
and Gong,\n Mingming and Tao,\n Dacheng\n},\n title = {\n A Coarse-Fine Network for Keypoint Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Discriminative View of MRF Pre-Processing Algorithms", @@ -233,6 +249,7 @@ "status": "Poster", "track": "main", "pid": "2712", + "author_site": "Chen Wang; Charles Herrmann; Ramin Zabih", "author": "Chen Wang; Charles Herrmann; Ramin Zabih", "abstract": "While Markov Random Fields (MRFs) are widely used in computer vision, they present a quite challenging inference problem. MRF inference can be accelerated by pre-processing techniques like Dead End Elimination (DEE) or QPBO-based approaches which compute the optimal labeling of a subset of variables. These techniques are guaranteed to never wrongly label a variable but they often leave a large number of variables unlabeled. We address this shortcoming by interpreting pre-processing as a classification problem, which allows us to trade off false positives (i.e., giving a variable an incorrect label) versus false negatives (i.e., failing to label a variable). We describe an efficient discriminative rule that finds optimal solutions for a subset of variables. Our technique provides both per-instance and worst-case guarantees concerning the quality of the solution. Empirical studies were conducted over several benchmark datasets. 
We obtain a speedup factor of 2 to 12 over expansion moves without preprocessing, and on difficult non-submodular energy functions produce slightly lower energy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_A_Discriminative_View_ICCV_2017_paper.pdf", @@ -257,7 +274,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Chen and Herrmann,\n Charles and Zabih,\n Ramin\n},\n title = {\n A Discriminative View of MRF Pre-Processing Algorithms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Generative Model of People in Clothing", @@ -265,10 +283,11 @@ "status": "Spotlight", "track": "main", "pid": "1783", + "author_site": "Christoph Lassner; Gerard Pons-Moll; Peter V. Gehler", "author": "Christoph Lassner; Gerard Pons-Moll; Peter V. Gehler", "abstract": "We present the first image-based generative model of people in clothing for the full body. We sidestep the commonly used complex graphics rendering pipeline and the need for high-quality 3D scans of dressed people. Instead, we learn generative models from a large image database. The main challenge is to cope with the high variance in human pose, shape and appearance. For this reason, pure image-based approaches have not been considered so far. We show that this challenge can be overcome by splitting the generating process in two parts. First, we learn to generate a semantic segmentation of the body and clothing. Second, we learn a conditional model on the resulting segments that creates realistic images. The full model is differentiable and can be conditioned on pose, shape or color. The result are samples of people in different clothing items and styles. 
The proposed model can generate entirely new people with realistic clothing. In several experiments we present encouraging results that suggest an entirely data-driven approach to people generation is possible.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lassner_A_Generative_Model_ICCV_2017_paper.pdf", - "aff": "BCCN, T\u00fcbingen+MPI for Intelligent Systems, T\u00fcbingen; MPI for Intelligent Systems, T\u00fcbingen; University of W\u00fcrzburg", + "aff": "BCCN, Tübingen+MPI for Intelligent Systems, Tübingen; MPI for Intelligent Systems, Tübingen; University of Würzburg", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Lassner_A_Generative_Model_ICCV_2017_supplemental.pdf", @@ -282,14 +301,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lassner_A_Generative_Model_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;2", - "aff_unique_norm": "BCCN;Max Planck Institute for Intelligent Systems;University of W\u00fcrzburg", + "aff_unique_norm": "BCCN;Max Planck Institute for Intelligent Systems;University of Würzburg", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.mpi-is.mpg.de;https://www.uni-wuerzburg.de", "aff_unique_abbr": ";MPI-IS;UWue", "aff_campus_unique_index": "0+0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Lassner_2017_ICCV,\n \n author = {\n Lassner,\n Christoph and Pons-Moll,\n Gerard and Gehler,\n Peter V.\n},\n title = {\n A Generative Model of People in Clothing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Generic Deep Architecture for Single Image Reflection Removal and Image Smoothing", @@ -297,6 +317,7 @@ "status": "Poster", "track": "main", "pid": 
"1309", + "author_site": "Qingnan Fan; Jiaolong Yang; Gang Hua; Baoquan Chen; David Wipf", "author": "Qingnan Fan; Jiaolong Yang; Gang Hua; Baoquan Chen; David Wipf", "abstract": "This paper proposes a deep neural network structure that exploits edge information in addressing representative low-level vision tasks such as layer separation and image filtering. Unlike most other deep learning strategies applied in this context, our approach tackles these challenging problems by estimating edges and reconstructing images using only cascaded convolutional layers arranged such that no handcrafted or application-specific image-processing components are required. We apply the resulting transferrable pipeline to two different problem domains that are both sensitive to edges, namely, single image reflection removal and image smoothing. For the former, using a mild reflection smoothness assumption and a novel synthetic data generation method that acts as a type of weak supervision, our network is able to solve much more difficult reflection cases that cannot be handled by previous methods. For the latter, we also exceed the state-of-the-art quantitative and qualitative results by wide margins. 
In all cases, the proposed framework is simple, fast, and easy to transfer across disparate domains.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Fan_A_Generic_Deep_ICCV_2017_paper.pdf", @@ -314,14 +335,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Fan_A_Generic_Deep_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;0+0;1", - "aff_unique_norm": "Shandong University;Microsoft", + "aff_unique_norm": "Shandong University;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "http://www.sdu.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "SDU;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;1;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fan_2017_ICCV,\n \n author = {\n Fan,\n Qingnan and Yang,\n Jiaolong and Hua,\n Gang and Chen,\n Baoquan and Wipf,\n David\n},\n title = {\n A Generic Deep Architecture for Single Image Reflection Removal and Image Smoothing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Geometric Framework for Statistical Analysis of Trajectories With Distinct Temporal Spans", @@ -329,10 +351,11 @@ "status": "Poster", "track": "main", "pid": "168", + "author_site": "Rudrasis Chakraborty; Vikas Singh; Nagesh Adluru; Baba C. Vemuri", "author": "Rudrasis Chakraborty; Vikas Singh; Nagesh Adluru; Baba C. Vemuri", "abstract": "Analyzing data representing multifarious trajectories is central to the many fields in Science and Engineering; for example, trajectories representing a tennis serve, a gymnast's parallel bar routine, progression/remission of disease and so on. 
We present a novel geometric algorithm for performing statistical analysis of trajectories with distinct number of samples representing longitudinal (or temporal) data. A key feature of our proposal is that unlike existing schemes, our model is deployable in regimes where each participant provides a different number of acquisitions (trajectories have different number of sample points). To achieve this, we develop a novel method involving the parallel transport of the tangent vectors along each given trajectory to the starting point of the respective trajectories and then use the span of the matrix whose columns consist of these vectors, to construct a linear subspace in R^m. We then map these linear subspaces of R^m on to a single high dimensional hypersphere. This enables computing group statistics over trajectories by instead performing statistics on the hypersphere (equipped with a simpler geometry). Given a point on the hypersphere representing a trajectory, we also provide a \"reverse mapping\" algorithm to uniquely (under certain assumptions) reconstruct the subspace that corresponds to this point. 
Finally, by using existing algorithms for recursive Frechet mean and exact principal geodesic analysis on the hypersphere, we present several experiments on synthetic and real (vision and medical) data sets showing how group testing on such diversely sampled longitudinal data is possible by analyzing the reconstructed data in the subspace spanned by the first few PGs.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chakraborty_A_Geometric_Framework_ICCV_2017_paper.pdf", - "aff": "Department of CISE, University of Florida; University of Wisconsin\u2013Madison; University of Wisconsin\u2013Madison; Department of CISE, University of Florida", + "aff": "Department of CISE, University of Florida; University of Wisconsin–Madison; University of Wisconsin–Madison; Department of CISE, University of Florida", "project": "https://youtu.be/PeID4r9SSxM", "github": "", "supp": "", @@ -346,14 +369,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chakraborty_A_Geometric_Framework_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "University of Florida;University of Wisconsin\u2013Madison", + "aff_unique_norm": "University of Florida;University of Wisconsin–Madison", "aff_unique_dep": "Department of CISE;", "aff_unique_url": "https://www.ufl.edu;https://www.wisc.edu", - "aff_unique_abbr": "UF;UW\u2013Madison", + "aff_unique_abbr": "UF;UW–Madison", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chakraborty_2017_ICCV,\n \n author = {\n Chakraborty,\n Rudrasis and Singh,\n Vikas and Adluru,\n Nagesh and Vemuri,\n Baba C.\n},\n title = {\n A Geometric Framework for Statistical Analysis of Trajectories With Distinct Temporal Spans\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n 
Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Joint Intrinsic-Extrinsic Prior Model for Retinex", @@ -361,6 +385,7 @@ "status": "Poster", "track": "main", "pid": "1618", + "author_site": "Bolun Cai; Xianming Xu; Kailing Guo; Kui Jia; Bin Hu; Dacheng Tao", "author": "Bolun Cai; Xianming Xu; Kailing Guo; Kui Jia; Bin Hu; Dacheng Tao", "abstract": "We propose a joint intrinsic-extrinsic prior model to estimate both illumination and reflectance from an observed image. The 2D image formed from 3D object in the scene is affected by the intrinsic properties (shape and texture) and the extrinsic property (illumination). Based on a novel structure-preserving measure called local variation deviation, a joint intrinsic-extrinsic prior model is proposed for better prior representation. Better than conventional Retinex models, the proposed model can preserve the structure information by shape prior, estimate the reflectance with fine details by texture prior, and capture the luminous source by illumination prior. Experimental results demonstrate the effectiveness of the proposed method on simulated and real data. 
Compared with the other Retinex algorithms and state-of-the-art algorithms, the proposed model yields better results on both subjective and objective assessments.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cai_A_Joint_Intrinsic-Extrinsic_ICCV_2017_paper.pdf", @@ -378,14 +403,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Cai_A_Joint_Intrinsic-Extrinsic_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;1;2", - "aff_unique_norm": "South China University of Technology;Lanzhou University;University of Sydney", + "aff_unique_norm": "South China University of Technology;Lanzhou University;The University of Sydney", "aff_unique_dep": "School of Electronic and Information Engineering;Ubiquitous Awareness and Intelligent Solutions Lab;School of IT, FEIT", "aff_unique_url": "http://www.scut.edu.cn;http://www.lzu.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "SCUT;;USYD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Cai_2017_ICCV,\n \n author = {\n Cai,\n Bolun and Xu,\n Xianming and Guo,\n Kailing and Jia,\n Kui and Hu,\n Bin and Tao,\n Dacheng\n},\n title = {\n A Joint Intrinsic-Extrinsic Prior Model for Retinex\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Lightweight Approach for On-The-Fly Reflectance Estimation", @@ -393,7 +419,7 @@ "status": "Oral", "track": "main", "pid": "1392", - "author_site": "Kihwan Kim; Jinwei Gu; Stephen Tyree; Pavlo Molchanov; Matthias Nie\u00c3\u009fner; Jan Kautz", + "author_site": "Kihwan Kim; Jinwei Gu; Stephen Tyree; Pavlo Molchanov; Matthias Nießner; Jan Kautz", "author": "Kihwan Kim; Jinwei Gu; Stephen Tyree; Pavlo Molchanov; Matthias Niessner; Jan Kautz", "abstract": "Estimating 
surface reflectance (BRDF) is one key component for complete 3D scene capture, with wide applications in virtual reality, augmented reality, and human computer interaction. Prior work is either limited to controlled environments (e.g., gonioreflectometers, light stages or multi-camera domes), or requires the joint optimization of shape, illumination, and reflectance, which is often computationally too expensive (e.g., hours of running time) for real-time applications. Moreover, most prior work requires HDR images as input which further complicates the capture process. In this paper, we propose a lightweight, practical approach for surface reflectance estimation directly from 8-bit RGB images in real-time, which can be easily plugged into any 3D scanning-and-fusion system with a commodity RGBD sensor. Our method is learning-based, with an inference time of less than 90ms per scene and a model size of less than 340K bytes. We propose two novel network architectures, HemiCNN and Grouplet, to deal with the unstructured input data from multiple viewpoints under unknown illumination. We further design a loss function to resolve the color-constancy and scale ambiguity. In addition, we have created a large synthetic dataset, SynBRDF, which comprises a total of 500K RGBD images rendered with a physically-based ray tracer under a variety of natural illumination, covering 5000 materials and 5000 shapes. SynBRDF is the first large-scale benchmark dataset for reflectance estimation. 
Experiments on both synthetic data and real data show that the proposed method effectively recovers surface reflectance, and outperforms prior work for reflectance estimation in uncontrolled environments.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kim_A_Lightweight_Approach_ICCV_2017_paper.pdf", @@ -409,7 +435,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_A_Lightweight_Approach_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_A_Lightweight_Approach_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kim_2017_ICCV,\n \n author = {\n Kim,\n Kihwan and Gu,\n Jinwei and Tyree,\n Stephen and Molchanov,\n Pavlo and Niessner,\n Matthias and Kautz,\n Jan\n},\n title = {\n A Lightweight Approach for On-The-Fly Reflectance Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Lightweight Single-Camera Polarization Compass With Covariance Estimation", @@ -417,7 +444,7 @@ "status": "Poster", "track": "main", "pid": "2523", - "author_site": "Wolfgang St\u00c3\u00bcrzl", + "author_site": "Wolfgang Stürzl", "author": "Wolfgang Sturzl", "abstract": "A lightweight visual compass system is presented as well as a direct method for estimating sun direction and its covariance. The optical elements of the system are described enabling estimation of sky polarization in a FOV of approx. 56 degrees with a single standard camera sensor. Using the proposed direct method, the sun direction and its covariance matrix can be estimated based on the polarization measured in the image plane. Experiments prove the applicability of the polarization sensor and the proposed estimation method, even in difficult conditions. 
It is also shown that in case the sensor is not leveled, combination with an IMU allows to determine all degrees of orientation. Due to the low weight of the sensor and the low complexity of the estimation method the polarization system is well suited for MAVs which have limited payload and computational resources. Furthermore, since not just the sun direction but also its covariance is estimated an integration in a multi-sensor navigation framework is straight forward.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sturzl_A_Lightweight_Single-Camera_ICCV_2017_paper.pdf", @@ -440,7 +467,8 @@ "aff_unique_url": "https://www.dlr.de", "aff_unique_abbr": "DLR", "aff_country_unique_index": "0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Sturzl_2017_ICCV,\n \n author = {\n Sturzl,\n Wolfgang\n},\n title = {\n A Lightweight Single-Camera Polarization Compass With Covariance Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Microfacet-Based Reflectance Model for Photometric Stereo With Highly Specular Surfaces", @@ -448,6 +476,7 @@ "status": "Poster", "track": "main", "pid": "1525", + "author_site": "Lixiong Chen; Yinqiang Zheng; Boxin Shi; Art Subpa-Asa; Imari Sato", "author": "Lixiong Chen; Yinqiang Zheng; Boxin Shi; Art Subpa-Asa; Imari Sato", "abstract": "A precise, stable and invertible model for surface reflectance is the key to the success of photometric stereo with real world materials. Recent developments in the field have enabled shape recovery techniques for surfaces of various types, but an effective solution to directly estimating the surface normal in the presence of highly specular reflectance remains elusive. 
In this paper, we derive an analytical isotropic microfacet-based reflectance model, based on which a physically interpretable approximate is tailored for highly specular surfaces. With this approximate, we identify the equivalence between the surface recovery problem and the ellipsoid of revolution fitting problem, where the latter can be described as a system of polynomials. Additionally, we devise a fast, non-iterative and globally optimal solver for this problem. Experimental results on both synthetic and real images validate our model and demonstrate that our solution can stably deliver superior performance in its targeted application domain.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_A_Microfacet-Based_Reflectance_ICCV_2017_paper.pdf", @@ -472,7 +501,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Lixiong and Zheng,\n Yinqiang and Shi,\n Boxin and Subpa-Asa,\n Art and Sato,\n Imari\n},\n title = {\n A Microfacet-Based Reflectance Model for Photometric Stereo With Highly Specular Surfaces\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Multilayer-Based Framework for Online Background Subtraction With Freely Moving Cameras", @@ -480,6 +510,7 @@ "status": "Poster", "track": "main", "pid": "2111", + "author_site": "Yizhe Zhu; Ahmed Elgammal", "author": "Yizhe Zhu; Ahmed Elgammal", "abstract": "The exponentially increasing use of moving platforms for video capture introduces the urgent need to develop the general background subtraction algorithms with the capability to deal with the moving background. 
In this paper, we propose a multilayer-based framework for online background subtraction for videos captured by moving cameras. Unlike the previous treatments of the problem, the proposed method is not restricted to binary segmentation of background and foreground, but formulates it as a multi-label segmentation problem by modeling multiple foreground objects in different layers when they appear simultaneously in the scene. We assign an independent processing layer to each foreground object, as well as the background, where both motion and appearance models are estimated, and a probability map is inferred using a Bayesian filtering framework. Finally, Multi-label Graph-cut on Markov Random Field is employed to perform pixel-wise labeling. Extensive evaluation results show that the proposed method outperforms state-of-the-art methods on challenging video sequences.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_A_Multilayer-Based_Framework_ICCV_2017_paper.pdf", @@ -504,7 +535,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Piscataway", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Yizhe and Elgammal,\n Ahmed\n},\n title = {\n A Multilayer-Based Framework for Online Background Subtraction With Freely Moving Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Multimodal Deep Regression Bayesian Network for Affective Video Content Analyses", @@ -512,6 +544,7 @@ "status": "Poster", "track": "main", "pid": "2056", + "author_site": "Quan Gan; Shangfei Wang; Longfei Hao; Qiang Ji", "author": "Quan Gan; Shangfei Wang; Longfei Hao; Qiang Ji", "abstract": "The inherent dependencies between visual elements and aural elements are crucial for affective video content analyses, yet have not 
been successfully exploited. Therefore, we propose a multimodal deep regression Bayesian network (MMDRBN) to capture the dependencies between visual elements and aural elements for affective video content analyses. The regression Bayesian network (RBN) is a directed graphical model consisting of one latent layer and one visible layer. Due to the explaining away effect in Bayesian networks (BN), RBN is able to capture both the dependencies among the latent variables given the observation and the dependencies among visible variables. We propose a fast learning algorithm to learn the RBN. For the MMDRBN, first, we learn several RBNs layer-wisely from visual modality and audio modality respectively. Then we stack these RBNs and obtain two deep networks. After that, a joint representation is extracted from the top layers of the two deep networks, and thus captures the high order dependencies between visual modality and audio modality. In order to predict the valence or arousal score of video contents, we initialize a feed-forward inference network from the MMDRBN whose inference is intractable by minimizing the KullbackLeibler (KL)divergence between the two networks. The back propagation algorithm is adopted for finetuning the inference network. 
Experimental results on the LIRIS-ACCEDE database demonstrate that the proposed MMDRBN successfully captures the dependencies between visual and audio elements, and thus achieves better performance compared with state of the art work.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gan_A_Multimodal_Deep_ICCV_2017_paper.pdf", @@ -536,7 +569,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Gan_2017_ICCV,\n \n author = {\n Gan,\n Quan and Wang,\n Shangfei and Hao,\n Longfei and Ji,\n Qiang\n},\n title = {\n A Multimodal Deep Regression Bayesian Network for Affective Video Content Analyses\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Novel Space-Time Representation on the Positive Semidefinite Cone for Facial Expression Recognition", @@ -544,6 +578,7 @@ "status": "Poster", "track": "main", "pid": "1339", + "author_site": "Anis Kacem; Mohamed Daoudi; Boulbaba Ben Amor; Juan Carlos Alvarez-Paiva", "author": "Anis Kacem; Mohamed Daoudi; Boulbaba Ben Amor; Juan Carlos Alvarez-Paiva", "abstract": "In this paper, we study the problem of facial expression recognition using a novel space-time geometric representation. We describe the temporal evolution of facial landmarks as parametrized trajectories on the Riemannian manifold of positive semidefinite matrices of fixed-rank. Our representation has the advantage to bring naturally a second desirable quantity when comparing shapes -- the spatial covariance -- in addition to the conventional affine-shape representation. We derive then geometric and computational tools for rate-invariant analysis and adaptive re-sampling of trajectories, grounding on the Riemannian geometry of the manifold. 
Specifically, our approach involves three steps: 1) facial landmarks are first mapped into the Riemannian manifold of positive semidefinite matrices of rank 2, to build time-parameterized trajectories; 2) a temporal alignment is performed on the trajectories, providing a geometry-aware (dis-)similarity measure between them; 3) finally, pairwise proximity function SVM (ppfSVM) is used to classify them, incorporating the latter (dis-)similarity measure into the kernel function. We show the effectiveness of the proposed approach on four publicly available benchmarks (CK+, MMI, Oulu-CASIA, and AFEW). The results of the proposed approach are comparable to or better than the state-of-the-art methods when involving only facial landmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kacem_A_Novel_Space-Time_ICCV_2017_paper.pdf", @@ -559,7 +594,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kacem_A_Novel_Space-Time_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kacem_A_Novel_Space-Time_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kacem_2017_ICCV,\n \n author = {\n Kacem,\n Anis and Daoudi,\n Mohamed and Ben Amor,\n Boulbaba and Carlos Alvarez-Paiva,\n Juan\n},\n title = {\n A Novel Space-Time Representation on the Positive Semidefinite Cone for Facial Expression Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Read-Write Memory Network for Movie Story Understanding", @@ -567,6 +603,7 @@ "status": "Poster", "track": "main", "pid": "164", + "author_site": "Seil Na; Sangho Lee; Jisung Kim; Gunhee Kim", "author": "Seil Na; Sangho Lee; Jisung Kim; Gunhee Kim", "abstract": "We propose a novel memory network model named Read-Write Memory Network (RWMN) to perform question and answering tasks for large-scale, multimodal 
movie story understanding. The key focus of our RWMN model is to design the read network and the write network that consist of multiple convolutional layers, which enable memory read and write operations to have high capacity and flexibility. While existing memory-augmented network models treat each memory slot as an independent block, our use of multi-layered CNNs allows the model to read and write sequential memory cells as chunks, which is more reasonable to represent a sequential story because adjacent memory blocks often have strong correlations. For evaluation, we apply our model to all the six tasks of the MovieQA benchmark, and achieve the best accuracies on several tasks, especially on the visual QA task. Our model shows a potential to better understand not only the content in the story, but also more abstract information, such as relationships between characters and the reasons for their actions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Na_A_Read-Write_Memory_ICCV_2017_paper.pdf", @@ -581,7 +618,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Na_A_Read-Write_Memory_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Na_A_Read-Write_Memory_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Na_2017_ICCV,\n \n author = {\n Na,\n Seil and Lee,\n Sangho and Kim,\n Jisung and Kim,\n Gunhee\n},\n title = {\n A Read-Write Memory Network for Movie Story Understanding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Revisit of Sparse Coding Based Anomaly Detection in Stacked RNN Framework", @@ -589,6 +627,7 @@ "status": "Poster", "track": "main", "pid": "62", + "author_site": "Weixin Luo; Wen Liu; Shenghua Gao", "author": "Weixin Luo; Wen Liu; Shenghua Gao", "abstract": "Motivated by the capability of sparse coding based 
anomaly detection, we propose a Temporally-coherent Sparse Coding (TSC) where we enforce similar neighbouring frames be encoded with similar reconstruction coefficients. Then we map the TSC with a special type of stacked Recurrent Neural Network (sRNN). By taking advantage sRNN in learning all parameters simultaneously, the nontrivial hyper-parameter selection to TSC can be avoided, meanwhile with a shallow sRNN, the reconstruction coefficients can be inferred within a forward pass, which reduces the computational cost for learning sparse coefficients. The contributions of this paper are two-fold: i) We propose a TSC, which can be mapped to a sRNN which facilitates the parameter optimization and accelerates the anomaly prediction. ii) We build a very large dataset which is even larger than the summation of all existing dataset for anomaly detection in terms of both the volume of data and the diversity of scenes. Extensive experiments on both a toy dataset and real datasets demonstrate that our TSC based and sRNN based method consistently outperform existing methods, which validates the effectiveness of our method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Luo_A_Revisit_of_ICCV_2017_paper.pdf", @@ -613,7 +652,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2017_ICCV,\n \n author = {\n Luo,\n Weixin and Liu,\n Wen and Gao,\n Shenghua\n},\n title = {\n A Revisit of Sparse Coding Based Anomaly Detection in Stacked RNN Framework\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Self-Balanced Min-Cut Algorithm for Image Clustering", @@ -621,6 +661,7 @@ "status": "Poster", "track": "main", "pid": "713", + "author_site": "Xiaojun Chen; Joshua Zhexue Haung; Feiping Nie; Renjie Chen; Qingyao 
Wu", "author": "Xiaojun Chen; Joshua Zhexue Haung; Feiping Nie; Renjie Chen; Qingyao Wu", "abstract": "Many spectral clustering algorithms have been proposed and successfully applied to image data analysis such as content based image retrieval, image annotation, and image indexing. Conventional spectral clustering algorithms usually involve a two-stage process: eigendecomposition of similarity matrix and clustering assignments from eigenvectors by k-means or spectral rotation. However, the final clustering assignments obtained by the two-stage process may deviate from the assignments by directly optimize the original objective function. Moreover, most of these methods usually have very high computational complexities. In this paper, we propose a new min-cut algorithm for image clustering, which scales linearly to the data size. In the new method, a self-balanced min-cut model is proposed in which the Exclusive Lasso is implicitly introduced as a balance regularizer in order to produce balanced partition. We propose an iterative algorithm to solve the new model, which has a time complexity of O(n) where n is the number of samples. Theoretical analysis reveals that the new method can simultaneously minimize the graph cut and balance the partition across all clusters. 
A series of experiments were conducted on both synthetic and benchmark data sets and the experimental results show the superior performance of the new method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_A_Self-Balanced_Min-Cut_ICCV_2017_paper.pdf", @@ -645,7 +686,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Xiaojun and Zhexue Haung,\n Joshua and Nie,\n Feiping and Chen,\n Renjie and Wu,\n Qingyao\n},\n title = {\n A Self-Balanced Min-Cut Algorithm for Image Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Simple yet Effective Baseline for 3D Human Pose Estimation", @@ -653,6 +695,7 @@ "status": "Poster", "track": "main", "pid": "1154", + "author_site": "Julieta Martinez; Rayat Hossain; Javier Romero; James J. Little", "author": "Julieta Martinez; Rayat Hossain; Javier Romero; James J. Little", "abstract": "Following the success of deep convolutional networks, state-of-the-art methods for 3d human pose estimation have focused on deep end-to-end systems that predict 3d joint locations given raw image pixels. Despite their excellent performance, it is often not easy to understand whether their remaining error stems from a limited 2d pose (visual) understanding, or from a failure to map 2d poses into 3-dimensional positions. With the goal of understanding these sources of error, we set out to build a system that given 2d joint locations predicts 3d positions. 
Much to our surprise, we have found that, with current technology, \"lifting\" ground truth 2d joint locations to 3d space is a task that can be solved with a remarkably low error rate: a relatively simple deep feed-forward network outperforms the best reported result by about 30% on Human3.6M, the largest publicly available 3d pose estimation benchmark. Furthermore, training our system on the output of an off-the-shelf state-of-the-art 2d detector (i.e., using images as input) yields state of the art results -- this includes an array of systems that have been trained end-to-end specifically for this task. Our results indicate that a large portion of the error of modern deep 3d pose estimation systems stems from their visual analysis, and suggests directions to further advance the state of the art in 3d human pose estimation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Martinez_A_Simple_yet_ICCV_2017_paper.pdf", @@ -677,7 +720,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Martinez_2017_ICCV,\n \n author = {\n Martinez,\n Julieta and Hossain,\n Rayat and Romero,\n Javier and Little,\n James J.\n},\n title = {\n A Simple yet Effective Baseline for 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Spatiotemporal Oriented Energy Network for Dynamic Texture Recognition", @@ -685,6 +729,7 @@ "status": "Spotlight", "track": "main", "pid": "2680", + "author_site": "Isma Hadji; Richard P. Wildes", "author": "Isma Hadji; Richard P. Wildes", "abstract": "This paper presents a novel hierarchical spatiotemporal orientation representation for spacetime image analysis. 
It is designed to combine the benefits of the multilayer architecture of ConvNets and a more controlled approach to spacetime analysis. A distinguishing aspect of the approach is that unlike most contemporary convolutional networks no learning is involved; rather, all design decisions are specified analytically with theoretical motivations. This approach makes it possible to understand what information is being extracted at each stage and layer of processing as well as to minimize heuristic choices in design. Another key aspect of the network is its recurrent nature, whereby the output of each layer of processing feeds back to the input. To keep the network size manageable across layers, a novel cross-channel feature pooling is proposed. The multilayer architecture that results systematically reveals hierarchical image structure in terms of multiscale, multiorientation properties of visual spacetime. To illustrate its utility, the network has been applied to the task of dynamic texture recognition. 
Empirical evaluation on multiple standard datasets shows that it sets a new state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hadji_A_Spatiotemporal_Oriented_ICCV_2017_paper.pdf", @@ -709,7 +754,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Hadji_2017_ICCV,\n \n author = {\n Hadji,\n Isma and Wildes,\n Richard P.\n},\n title = {\n A Spatiotemporal Oriented Energy Network for Dynamic Texture Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Stagewise Refinement Model for Detecting Salient Objects in Images", @@ -717,6 +763,7 @@ "status": "Poster", "track": "main", "pid": "1709", + "author_site": "Tiantian Wang; Ali Borji; Lihe Zhang; Pingping Zhang; Huchuan Lu", "author": "Tiantian Wang; Ali Borji; Lihe Zhang; Pingping Zhang; Huchuan Lu", "abstract": "Deep convolutional neural networks (CNNs) have been successfully applied to a wide variety of problems in computer vision, including salient object detection. To detect and segment salient objects accurately, it is necessary to extract and combine high-level semantic features with low-level fine details simultaneously. This happens to be a challenge for CNNs as repeated subsampling operations such as pooling and convolution lead to a significant decrease in the initial image resolution, which results in loss of spatial details and finer structures. To remedy this problem, here we propose to augment feedforward neural networks with a novel pyramid pooling module and a multi-stage refinement mechanism for saliency detection. First, our deep feedward net is used to generate a coarse prediction map with much detailed structures lost. 
Then, refinement nets are integrated with local context information to refine the preceding saliency maps generated in the master branch in a stagewise manner. Further, a pyramid pooling module is applied for different region-based global context aggregation. Empirical evaluations over five benchmark datasets show that our proposed method compares favorably against the state-of-the-art approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_A_Stagewise_Refinement_ICCV_2017_paper.pdf", @@ -731,7 +778,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_A_Stagewise_Refinement_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_A_Stagewise_Refinement_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Tiantian and Borji,\n Ali and Zhang,\n Lihe and Zhang,\n Pingping and Lu,\n Huchuan\n},\n title = {\n A Stagewise Refinement Model for Detecting Salient Objects in Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Two Stream Siamese Convolutional Neural Network for Person Re-Identification", @@ -739,6 +787,7 @@ "status": "Poster", "track": "main", "pid": "920", + "author_site": "Dahjung Chung; Khalid Tahboub; Edward J. Delp", "author": "Dahjung Chung; Khalid Tahboub; Edward J. Delp", "abstract": "Person re-identification is an important task in video surveillance systems. It can be formally defined as establishing the correspondence between images of a person taken from different cameras at different times. In this pa- per, we present a two stream convolutional neural network where each stream is a Siamese network. This architecture can learn spatial and temporal information separately. 
We also propose a weighted two stream training objective function which combines the Siamese cost of the spatial and temporal streams with the objective of predicting a person's identity. We evaluate our proposed method on the publicly available PRID2011 and iLIDS-VID datasets and demonstrate the efficacy of our proposed method. On average, the top rank matching accuracy is 4% higher than the accuracy achieved by the cross-view quadratic discriminant analysis used in combination with the hierarchical Gaussian descriptor (GOG+XQDA), and 5% higher than the recurrent neural network method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chung_A_Two_Stream_ICCV_2017_paper.pdf", @@ -763,7 +812,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "West Lafayette", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chung_2017_ICCV,\n \n author = {\n Chung,\n Dahjung and Tahboub,\n Khalid and Delp,\n Edward J.\n},\n title = {\n A Two Stream Siamese Convolutional Neural Network for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Two-Streamed Network for Estimating Fine-Scaled Depth Maps From Single RGB Images", @@ -771,6 +821,7 @@ "status": "Poster", "track": "main", "pid": "1286", + "author_site": "Jun Li; Reinhard Klein; Angela Yao", "author": "Jun Li; Reinhard Klein; Angela Yao", "abstract": "Estimating depth from a single RGB image is an ill-posed and inherently ambiguous problem. State-of-the-art deep learning methods can now estimate accurate 2D depth maps, but when the maps are projected into 3D, they lack local detail and are often highly distorted. We propose a fast-to-train two-streamed CNN that predicts depth and depth gradients, which are then fused together into an accurate and detailed depth map. 
We also define a novel set loss over multiple images; by regularizing the estimation between a common set of images, the network is less prone to over-fitting and achieves better accuracy than competing methods. Experiments on the NYU Depth v2 dataset shows that our depth predictions are competitive with state-of-the-art and lead to faithful 3D projections.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_A_Two-Streamed_Network_ICCV_2017_paper.pdf", @@ -795,7 +846,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "Germany;China" + "aff_country_unique": "Germany;China", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Jun and Klein,\n Reinhard and Yao,\n Angela\n},\n title = {\n A Two-Streamed Network for Estimating Fine-Scaled Depth Maps From Single RGB Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "A Unified Model for Near and Remote Sensing", @@ -803,6 +855,7 @@ "status": "Poster", "track": "main", "pid": "1102", + "author_site": "Scott Workman; Menghua Zhai; David J. Crandall; Nathan Jacobs", "author": "Scott Workman; Menghua Zhai; David J. Crandall; Nathan Jacobs", "abstract": "We propose a novel convolutional neural network architecture for estimating geospatial functions such as population density, land cover, or land use. In our approach, we combine overhead and ground-level images in an end-to-end trainable neural network, which uses kernel regression and density estimation to convert features extracted from the ground-level images into a dense feature map. The output of this network is a dense estimate of the geospatial function in the form of a pixel-level labeling of the overhead image. 
To evaluate our approach, we created a large dataset of overhead and ground-level images from a major urban area with three sets of labels: land use, building function, and building age. We find that our approach is more accurate for all tasks, in some cases dramatically so.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Workman_A_Unified_Model_ICCV_2017_paper.pdf", @@ -827,7 +880,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Bloomington", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Workman_2017_ICCV,\n \n author = {\n Workman,\n Scott and Zhai,\n Menghua and Crandall,\n David J. and Jacobs,\n Nathan\n},\n title = {\n A Unified Model for Near and Remote Sensing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "AMAT: Medial Axis Transform for Natural Images", @@ -835,6 +889,7 @@ "status": "Poster", "track": "main", "pid": "1142", + "author_site": "Stavros Tsogkas; Sven Dickinson", "author": "Stavros Tsogkas; Sven Dickinson", "abstract": "We introduce Appearance-MAT (AMAT), a generalization of the medial axis transform for natural images, that is framed as a weighted geometric set cover problem. We make the following contributions: i) we extend previous medial point detection methods for color images, by associating each medial point with a local scale; ii) inspired by the invertibility property of the binary MAT, we also associate each medial point with a local encoding that allows us to invert the AMAT, reconstructing the input image; iii) we describe a clustering scheme that takes advantage of the additional scale and appearance information to group individual points into medial branches, providing a shape decomposition of the underlying image regions. 
In our experiments, we show state-of-the-art performance in medial point detection on Berkeley Medial AXes (BMAX500), a new dataset of medial axes based on the BSDS500 database, and good generalization on the SK506 and WH-SYMMAX datasets. We also measure the quality of reconstructed images from BMAX500, obtained by inverting their computed AMAT. Our approach delivers significantly better reconstruction quality wrt to three baselines, using just 10% of the image pixels. Our code and annotations are available at https://github.com/tsogkas/amat .", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tsogkas_AMAT_Medial_Axis_ICCV_2017_paper.pdf", @@ -859,7 +914,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Tsogkas_2017_ICCV,\n \n author = {\n Tsogkas,\n Stavros and Dickinson,\n Sven\n},\n title = {\n AMAT: Medial Axis Transform for Natural Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "AMTnet: Action-Micro-Tube Regression by End-To-End Trainable Deep Architecture", @@ -867,6 +923,7 @@ "status": "Poster", "track": "main", "pid": "1935", + "author_site": "Suman Saha; Gurkirt Singh; Fabio Cuzzolin", "author": "Suman Saha; Gurkirt Singh; Fabio Cuzzolin", "abstract": "Dominant approaches to action detection can only provide sub-optimal solutions to the problem, as they rely on seeking frame-level detections, to later compose them into \"action tubes\" in a post-processing step. With this paper we radically depart from current practice, and take a first step towards the design and implementation of a deep network architecture able to classify and regress whole video subsets, so providing a truly optimal solution of the action detection problem. 
In this work, in particular, we propose a novel deep net framework able to regress and classify 3D region proposals spanning two successive video frames, whose core is an evolution of classical region proposal networks (RPNs). As such, our 3D-RPN net is able to effectively encode the temporal aspect of actions by purely exploiting appearance, as opposed to methods which heavily rely on expensive flow maps. The proposed model is end-to-end trainable and can be jointly optimised for action localisation and classification in a single step. At test time the network predicts \"micro-tubes\" encompassing two successive frames, which are linked up into complete action tubes via a new algorithm which exploits the temporal encoding learned by the network and cuts computation time by 50%. Promising results on the J-HMDB-21 and UCF-101 action detection datasets show that our model does outperform the state-of-the-art when relying purely on appearance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Saha_AMTnet_Action-Micro-Tube_Regression_ICCV_2017_paper.pdf", @@ -887,11 +944,12 @@ "aff_unique_norm": "Oxford Brookes University", "aff_unique_dep": "", "aff_unique_url": "https://www.oxfordbrookes.ac.uk", - "aff_unique_abbr": "", + "aff_unique_abbr": "OBU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Saha_2017_ICCV,\n \n author = {\n Saha,\n Suman and Singh,\n Gurkirt and Cuzzolin,\n Fabio\n},\n title = {\n AMTnet: Action-Micro-Tube Regression by End-To-End Trainable Deep Architecture\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "AOD-Net: All-In-One Dehazing Network", @@ -899,6 +957,7 @@ "status": "Poster", "track": "main", "pid": "2275", + "author_site": "Boyi Li; Xiulian Peng; 
Zhangyang Wang; Jizheng Xu; Dan Feng", "author": "Boyi Li; Xiulian Peng; Zhangyang Wang; Jizheng Xu; Dan Feng", "abstract": "This paper proposes an image dehazing model built with a convolutional neural network (CNN), called All-in-One Dehazing Network (AOD-Net). It is designed based on a re-formulated atmospheric scattering model. Instead of estimating the transmission matrix and the atmospheric light separately as most previous models did, AOD-Net directly generates the clean image through a light-weight CNN. Such a novel end-to-end design makes it easy to embed AOD-Net into other deep models, e.g., Faster R-CNN, for improving high-level tasks on hazy images. Experimental results on both synthesized and natural hazy image datasets demonstrate our superior performance than the state-of-the-art in terms of PSNR, SSIM and the subjective visual quality. Furthermore, when concatenating AOD-Net with Faster R-CNN, we witness a large improvement of the object detection performance on hazy images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_AOD-Net_All-In-One_Dehazing_ICCV_2017_paper.pdf", @@ -913,7 +972,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_AOD-Net_All-In-One_Dehazing_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_AOD-Net_All-In-One_Dehazing_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Boyi and Peng,\n Xiulian and Wang,\n Zhangyang and Xu,\n Jizheng and Feng,\n Dan\n},\n title = {\n AOD-Net: All-In-One Dehazing Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Action Tubelet Detector for Spatio-Temporal Action Localization", @@ -921,6 +981,7 @@ "status": "Poster", "track": "main", "pid": "1873", + "author_site": "Vicky Kalogeiton; Philippe Weinzaepfel; Vittorio 
Ferrari; Cordelia Schmid", "author": "Vicky Kalogeiton; Philippe Weinzaepfel; Vittorio Ferrari; Cordelia Schmid", "abstract": "Current state-of-the-art approaches for spatio-temporal action localization rely on detections at the frame level that are then linked or tracked across time. In this paper, we leverage the temporal continuity of videos instead of operating at the frame level. We propose the ACtion Tubelet detector (ACT-detector) that takes as input a sequence of frames and outputs tubelets, ie, sequences of bounding boxes with associated scores. The same way state-of-the-art object detectors rely on anchor boxes, our ACT-detector is based on anchor cuboids. We build upon the SSD framework. Convolutional features are extracted for each frame, while scores and regressions are based on the temporal stacking of these features, thus exploiting information from a sequence. Our experimental results show that leveraging sequences of frames significantly improves detection performance over using individual frames. The gain of our tubelet detector can be explained by both more accurate scores and more precise localization. 
Our ACT-detector outperforms the state-of-the-art methods for frame-mAP and video-mAP on the J-HMDB and UCF-101 datasets, in particular at high overlap thresholds.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kalogeiton_Action_Tubelet_Detector_ICCV_2017_paper.pdf", @@ -936,7 +997,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kalogeiton_Action_Tubelet_Detector_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kalogeiton_Action_Tubelet_Detector_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kalogeiton_2017_ICCV,\n \n author = {\n Kalogeiton,\n Vicky and Weinzaepfel,\n Philippe and Ferrari,\n Vittorio and Schmid,\n Cordelia\n},\n title = {\n Action Tubelet Detector for Spatio-Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Active Decision Boundary Annotation With Deep Generative Models", @@ -944,6 +1006,7 @@ "status": "Spotlight", "track": "main", "pid": "1673", + "author_site": "Miriam Huijser; Jan C. van Gemert", "author": "Miriam Huijser; Jan C. van Gemert", "abstract": "This paper is on active learning where the goal is to reduce the data annotation burden by interacting with a (human) oracle during training. Standard active learning methods ask the oracle to annotate data samples. Instead, we take a profoundly different approach: we ask for annotations of the decision boundary. We achieve this using a deep generative model to create novel instances along a 1d vector. A point on the decision boundary is revealed where the instances change class. 
Experimentally we show on three datasets that our method can be plugged-in to other active learning schemes, that human oracles can effectively annotate point on the decision boundary, and that decision boundary annotations improve over single sample instance annotations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huijser_Active_Decision_Boundary_ICCV_2017_paper.pdf", @@ -968,7 +1031,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Delft", "aff_country_unique_index": "0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Huijser_2017_ICCV,\n \n author = {\n Huijser,\n Miriam and van Gemert,\n Jan C.\n},\n title = {\n Active Decision Boundary Annotation With Deep Generative Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Active Learning for Human Pose Estimation", @@ -976,6 +1040,7 @@ "status": "Poster", "track": "main", "pid": "1986", + "author_site": "Buyu Liu; Vittorio Ferrari", "author": "Buyu Liu; Vittorio Ferrari", "abstract": "Annotating human poses in realistic scenes is very time consuming, yet necessary for training human pose estimators. We propose to address this problem in an active learning framework, which alternates between requesting the most useful annotations among a large set of unlabelled images, and re-training the pose estimator. 
To this end, (1) we propose an uncertainty estimator specific for body joint predictions, which takes into account the spatial distribution of the responses of the current pose estimator on the unlabelled images; (2) we propose a dynamic combination of influence and uncertainty cues, where their weights vary during the active learning process according to the reliability of the current pose estimator; (3) we introduce a computer assisted annotation interface, which reduces the time necessary for a human annotator to click on a joint by discretizing the image into regions generated by the current pose estimator. Experiments using the MPII and LSP datasets with both simulated and real annotators show that (1) the proposed active selection scheme outperforms several baselines; (2) our computer-assisted interface can further reduce annotation effort; and (3) our technique can further improve the performance of a pose estimator even when starting from an already strong one.performance in 23% annotation time.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Active_Learning_for_ICCV_2017_paper.pdf", @@ -1000,7 +1065,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Buyu and Ferrari,\n Vittorio\n},\n title = {\n Active Learning for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Adaptive Feeding: Achieving Fast and Accurate Detections by Adaptively Combining Object Detectors", @@ -1008,6 +1074,7 @@ "status": "Poster", "track": "main", "pid": "1544", + "author_site": "Hong-Yu Zhou; Bin-Bin Gao; Jianxin Wu", "author": "Hong-Yu Zhou; Bin-Bin Gao; Jianxin Wu", "abstract": "Object detection aims at high speed and accuracy 
simultaneously. However, fast models are usually less accurate, while accurate models cannot satisfy our need for speed. A fast model can be 10 times faster but 50% less accurate than an accurate model. In this paper, we propose Adaptive Feeding (AF) to combine a fast (but less accurate) detector and an accurate (but slow) detector, by adaptively determining whether an image is easy or hard and choosing an appropriate detector for it. In practice, we build a cascade of detectors, including the AF classifier which make the easy vs. hard decision and the two detectors. The AF classifier can be tuned to obtain different tradeoff between speed and accuracy, which has negligible training time and requires no additional training data. Experimental results on the PASCAL VOC, MS COCO and Caltech Pedestrian datasets confirm that AF has the ability to achieve comparable speed as the fast detector and comparable accuracy as the accurate one at the same time. As an example, by combining the fast SSD300 with the accurate SSD500 detector, AF leads to 50% speedup over SSD500 with the same precision on the VOC2007 test set.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhou_Adaptive_Feeding_Achieving_ICCV_2017_paper.pdf", @@ -1032,7 +1099,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2017_ICCV,\n \n author = {\n Zhou,\n Hong-Yu and Gao,\n Bin-Bin and Wu,\n Jianxin\n},\n title = {\n Adaptive Feeding: Achieving Fast and Accurate Detections by Adaptively Combining Object Detectors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Adaptive RNN Tree for Large-Scale Human Action Recognition", @@ -1040,6 +1108,7 @@ "status": "Poster", "track": "main", "pid": "562", + "author_site": "Wenbo Li; Longyin Wen; 
Ming-Ching Chang; Ser Nam Lim; Siwei Lyu", "author": "Wenbo Li; Longyin Wen; Ming-Ching Chang; Ser Nam Lim; Siwei Lyu", "abstract": "In this work, we present the RNN Tree (RNN-T), an adaptive learning framework for skeleton based human action recognition. Our method categorizes action classes and uses multiple Recurrent Neural Networks (RNNs) in a tree-like hierarchy. The RNNs in RNN-T are co-trained with the action category hierarchy, which determines the structure of RNN-T. Actions in skeletal representations are recognized via a hierarchical inference process, during which individual RNNs differentiate finer-grained action classes with increasing confidence. Inference in RNN-T ends when any RNN in the tree recognizes the action with high confidence, or a leaf node is reached. RNN-T effectively addresses two main challenges of large-scale action recognition: (i) able to distinguish fine-grained action classes that are intractable using a single network, and (ii) adaptive to new action classes by augmenting an existing model. 
We demonstrate the effectiveness of RNN-T/ACH method and compare it with the state-of-the-art methods on a large-scale dataset and several existing benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Adaptive_RNN_Tree_ICCV_2017_paper.pdf", @@ -1064,7 +1133,8 @@ "aff_campus_unique_index": "0;;0;;0", "aff_campus_unique": "Albany;", "aff_country_unique_index": "0;0+0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Wenbo and Wen,\n Longyin and Chang,\n Ming-Ching and Nam Lim,\n Ser and Lyu,\n Siwei\n},\n title = {\n Adaptive RNN Tree for Large-Scale Human Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Adversarial Examples Detection in Deep Networks With Convolutional Filter Statistics", @@ -1072,6 +1142,7 @@ "status": "Poster", "track": "main", "pid": "3060", + "author_site": "Xin Li; Fuxin Li", "author": "Xin Li; Fuxin Li", "abstract": "Deep learning has greatly improved visual recognition in recent years. However, recent research has shown that there exist many adversarial examples that can negatively impact the performance of such an architecture. This paper focuses on detecting those adversarial examples by analyzing whether they come from the same distribution as the normal examples. Instead of directly training a deep neural network to detect adversarials, a much simpler approach was proposed based on statistics on outputs from convolutional layers. A cascade classifier was designed to efficiently detect adversarials. Furthermore, trained from one particular adversarial generating mechanism, the resulting classifier can successfully detect adversarials from a completely different mechanism as well. 
The resulting classifier is non-subdifferentiable, hence creates a difficulty for adversaries to attack by using the gradient of the classifier. After detecting adversarial examples, we show that many of them can be recovered by simply performing a small average filter on the image. Those findings should lead to more insights about the classification mechanisms in deep convolutional neural networks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Adversarial_Examples_Detection_ICCV_2017_paper.pdf", @@ -1086,7 +1157,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Adversarial_Examples_Detection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Adversarial_Examples_Detection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Xin and Li,\n Fuxin\n},\n title = {\n Adversarial Examples Detection in Deep Networks With Convolutional Filter Statistics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Adversarial Examples for Semantic Segmentation and Object Detection", @@ -1094,6 +1166,7 @@ "status": "Poster", "track": "main", "pid": "568", + "author_site": "Cihang Xie; Jianyu Wang; Zhishuai Zhang; Yuyin Zhou; Lingxi Xie; Alan Yuille", "author": "Cihang Xie; Jianyu Wang; Zhishuai Zhang; Yuyin Zhou; Lingxi Xie; Alan Yuille", "abstract": "It has been well demonstrated that adversarial examples, i.e., natural images with visually imperceptible perturbations added, cause deep networks to fail on image classification. In this paper, we extend adversarial examples to semantic segmentation and object detection which are much more difficult. 
Our observation is that both segmentation and detection are based on classifying multiple targets on an image (e.g., the target is a pixel or a receptive field in segmentation, and an object proposal in detection). This inspires us to optimize a loss function over a set of targets for generating adversarial perturbations. Based on this, we propose a novel algorithm named Dense Adversary Generation (DAG), which applies to the state-of-the-art networks for segmentation and detection. We find that the adversarial perturbations can be transferred across networks with different training data, based on different architectures, and even for different recognition tasks. In particular, the transfer ability across networks with the same architecture is more significant than in other cases. Besides, we show that summing up heterogeneous perturbations often leads to better transfer performance, which provides an effective method of black-box adversarial attack.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xie_Adversarial_Examples_for_ICCV_2017_paper.pdf", @@ -1111,14 +1184,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xie_Adversarial_Examples_for_ICCV_2017_paper.html", "aff_unique_index": "0;1+0;0;0;0;0", - "aff_unique_norm": "Johns Hopkins University;Baidu", + "aff_unique_norm": "The Johns Hopkins University;Baidu Research", "aff_unique_dep": "Department of Computer Science;Research", "aff_unique_url": "https://www.jhu.edu;https://research.baidu.com", "aff_unique_abbr": "JHU;Baidu", "aff_campus_unique_index": "1", "aff_campus_unique": ";USA", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xie_2017_ICCV,\n \n author = {\n Xie,\n Cihang and Wang,\n Jianyu and Zhang,\n Zhishuai and Zhou,\n Yuyin and Xie,\n Lingxi and Yuille,\n Alan\n},\n title = {\n Adversarial Examples for Semantic Segmentation and Object 
Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Adversarial Image Perturbation for Privacy Protection -- A Game Theory Perspective", @@ -1126,10 +1200,11 @@ "status": "Poster", "track": "main", "pid": "489", + "author_site": "Seong Joon Oh; Mario Fritz; Bernt Schiele", "author": "Seong Joon Oh; Mario Fritz; Bernt Schiele", "abstract": "Users like sharing personal photos with others through social media. At the same time, they might want to make automatic identification in such photos difficult or even impossible. Classic obfuscation methods such as blurring are not only unpleasant but also not as effective as one would expect. Recent studies on adversarial image perturbations (AIP) suggest that it is possible to confuse recognition systems effectively without unpleasant artifacts. However, in the presence of counter measures against AIPs, it is unclear how effective AIP would be in particular when the choice of counter measure is unknown. Game theory provides tools for studying the interaction between agents with uncertainties in the strategies. We introduce a general game theoretical framework for the user-recogniser dynamics, and present a case study that involves current state of the art AIP and person recognition techniques. We derive the optimal strategy for the user that assures an upper bound on the recognition rate independent of the recogniser's counter measure. 
Code is available at https://goo.gl/hgvbNK.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Oh_Adversarial_Image_Perturbation_ICCV_2017_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany", + "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany", "project": "https://goo.gl/hgvbNK", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Oh_Adversarial_Image_Perturbation_ICCV_2017_supplemental.pdf", @@ -1148,9 +1223,10 @@ "aff_unique_url": "https://mpi-inf.mpg.de", "aff_unique_abbr": "MPII", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Saarbr\u00fccken", + "aff_campus_unique": "Saarbrücken", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Oh_2017_ICCV,\n \n author = {\n Joon Oh,\n Seong and Fritz,\n Mario and Schiele,\n Bernt\n},\n title = {\n Adversarial Image Perturbation for Privacy Protection -- A Game Theory Perspective\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Adversarial Inverse Graphics Networks: Learning 2D-To-3D Lifting and Image-To-Image Translation From Unpaired Supervision", @@ -1158,6 +1234,7 @@ "status": "Poster", "track": "main", "pid": "1967", + "author_site": "Hsiao-Yu Fish Tung; Adam W. Harley; William Seto; Katerina Fragkiadaki", "author": "Hsiao-Yu Fish Tung; Adam W. 
Harley; William Seto; Katerina Fragkiadaki", "abstract": "Researchers have developed excellent feed-forward models that learn to map images to desired outputs, such as to the images' latent factors, or to other images, using supervised learning. Learning such mappings from unlabelled data, or improving upon supervised models by exploiting unlabelled data, remains elusive. We argue that there are two important parts to learning without annotations: (i) matching the predictions to the input observations, and (ii) matching the predictions to known priors. We propose Adversarial Inverse Graphics networks (AIGNs): weakly supervised neural network models that combine feedback from rendering their predictions, with distribution matching between their predictions and a collection of ground-truth factors. We apply AIGNs to 3D human pose estimation and 3D structure and egomotion estimation, and outperform models supervised by only paired annotations. We further apply AIGNs to facial image transformation using super-resolution and inpainting renderers, while deliberately adding biases in the ground-truth datasets. Our model seamlessly incorporates such biases, rendering input faces towards young, old, feminine, masculine or Tom Cruise-like equivalents (depending on the chosen bias), or adding lip and nose augmentations while inpainting concealed lips and noses.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tung_Adversarial_Inverse_Graphics_ICCV_2017_paper.pdf", @@ -1182,7 +1259,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tung_2017_ICCV,\n \n author = {\n Fish Tung,\n Hsiao-Yu and Harley,\n Adam W. 
and Seto,\n William and Fragkiadaki,\n Katerina\n},\n title = {\n Adversarial Inverse Graphics Networks: Learning 2D-To-3D Lifting and Image-To-Image Translation From Unpaired Supervision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Adversarial PoseNet: A Structure-Aware Convolutional Network for Human Pose Estimation", @@ -1190,6 +1268,7 @@ "status": "Poster", "track": "main", "pid": "373", + "author_site": "Yu Chen; Chunhua Shen; Xiu-Shen Wei; Lingqiao Liu; Jian Yang", "author": "Yu Chen; Chunhua Shen; Xiu-Shen Wei; Lingqiao Liu; Jian Yang", "abstract": "For human pose estimation in monocular images, joint occlusions and overlapping upon human bodies often result in deviated pose predictions. Under these circumstances, bi- ologically implausible pose predictions may be produced. In contrast, human vision is able to predict poses by exploiting geometric constraints of joint inter-connectivity. To address the problem by incorporating priors about the structure of human bodies, we propose a novel structure-aware convo- lutional network to implicitly take such priors into account during training of the deep network. Explicit learning of such constraints is typically challenging. Instead, we design discriminators to distinguish the real poses from the fake ones (such as biologically implausible ones). If the pose generator (G) generates results that the discriminator fails to distinguish from real ones, the network successfully learns the priors. To better capture the structure dependency of human body joints, the generator G is designed in a stacked multi-task manner to predict poses as well as occlusion heatmaps. Then, the pose and occlusion heatmaps are sent to the discrimina- tors to predict the likelihood of the pose being real. Training of the network follows the strategy of conditional Generative Adversarial Networks (GANs). 
The effectiveness of the pro- posed network is evaluated on two widely used human pose estimation benchmark datasets. Our approach significantly outperforms the state-of-the-art methods and almost always generates plausible human pose predictions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Adversarial_PoseNet_A_ICCV_2017_paper.pdf", @@ -1214,7 +1293,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Yu and Shen,\n Chunhua and Wei,\n Xiu-Shen and Liu,\n Lingqiao and Yang,\n Jian\n},\n title = {\n Adversarial PoseNet: A Structure-Aware Convolutional Network for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Aesthetic Critiques Generation for Photos", @@ -1222,6 +1302,7 @@ "status": "Poster", "track": "main", "pid": "1559", + "author_site": "Kuang-Yu Chang; Kung-Hung Lu; Chu-Song Chen", "author": "Kuang-Yu Chang; Kung-Hung Lu; Chu-Song Chen", "abstract": "It is said that a picture is worth a thousand words. Thus, there are various ways to describe an image, especially in aesthetic quality analysis. Although aesthetic quality assessment has generated a great deal of interest in the last decade, most studies focus on providing a quality rating of good or bad for an image. In this work, we extend the task to produce captions related to photo aesthetics and/or photography skills. To the best of our knowledge, this is the first study that deals with aesthetics captioning instead of AQ scoring. In contrast to common image captioning tasks that depict the objects or their relations in a picture, our approach can select a particular aesthetics aspect and generate captions with respect to the aspect chosen. 
Meanwhile, the proposed aspect-fusion method further uses an attention mechanism to generate more abundant aesthetics captions. We also introduce a new dataset for aesthetics captioning called the Photo Critique Captioning Dataset (PCCD), which contains pair-wise image-comment data from professional photographers. The results of experiments on PCCD demonstrate that our approaches outperform existing methods for generating aesthetic-oriented captions for images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chang_Aesthetic_Critiques_Generation_ICCV_2017_paper.pdf", @@ -1246,7 +1327,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chang_2017_ICCV,\n \n author = {\n Chang,\n Kuang-Yu and Lu,\n Kung-Hung and Chen,\n Chu-Song\n},\n title = {\n Aesthetic Critiques Generation for Photos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks", @@ -1254,6 +1336,7 @@ "status": "Poster", "track": "main", "pid": "1997", + "author_site": "Tanmay Gupta; Kevin Shih; Saurabh Singh; Derek Hoiem", "author": "Tanmay Gupta; Kevin Shih; Saurabh Singh; Derek Hoiem", "abstract": "An important goal of computer vision is to build systems that learn visual representations over time that can be applied to many tasks. In this paper, we investigate a vision-language embedding as a core representation and show that it leads to better cross-task transfer than standard multi-task learning. In particular, the task of visual recognition is aligned to the task of visual question answering by forcing each to use the same word-region embeddings. 
We show this leads to greater inductive transfer from recognition to VQA than standard multitask learning. Visual recognition also improves, especially for categories that have relatively few recognition training labels but appear often in the VQA setting. Thus, our paper takes a small step towards creating more general vision systems by showing the benefit of interpretable, flexible, and trainable core representations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gupta_Aligned_Image-Word_Representations_ICCV_2017_paper.pdf", @@ -1268,7 +1351,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gupta_Aligned_Image-Word_Representations_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gupta_Aligned_Image-Word_Representations_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Gupta_2017_ICCV,\n \n author = {\n Gupta,\n Tanmay and Shih,\n Kevin and Singh,\n Saurabh and Hoiem,\n Derek\n},\n title = {\n Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Am I a Baller? Basketball Performance Assessment From First-Person Videos", @@ -1276,6 +1360,7 @@ "status": "Poster", "track": "main", "pid": "883", + "author_site": "Gedas Bertasius; Hyun Soo Park; Stella X. Yu; Jianbo Shi", "author": "Gedas Bertasius; Hyun Soo Park; Stella X. Yu; Jianbo Shi", "abstract": "This paper presents a method to assess a basketball player's performance from his/her first-person video. A key challenge lies in the fact that the evaluation metric is highly subjective and specific to a particular evaluator. We leverage the first-person camera to address this challenge. 
The spatiotemporal visual semantics provided by a first-person view allows us to reason about the camera wearer's actions while he/she is participating in an unscripted basketball game. Our method takes a player's first-person video and provides a player's performance measure that is specific to an evaluator's preference. To achieve this goal, we first use a convolutional LSTM network to detect atomic basketball events from first-person videos. Our network's ability to zoom-in to the salient regions addresses the issue of a severe camera wearer's head movement in first-person videos. The detected atomic events are then passed through the Gaussian mixtures to construct a highly non-linear visual spatiotemporal basketball assessment feature. Finally, we use this feature to learn a basketball assessment model from pairs of labeled first-person basketball videos, for which a basketball expert indicates, which of the two players is better. We demonstrate that despite not knowing the basketball evaluator's criterion, our model learns to accurately assess the players in real-world games. Furthermore, our model can also discover basketball events that contribute positively and negatively to a player's performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bertasius_Am_I_a_ICCV_2017_paper.pdf", @@ -1300,7 +1385,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bertasius_2017_ICCV,\n \n author = {\n Bertasius,\n Gedas and Soo Park,\n Hyun and Yu,\n Stella X. and Shi,\n Jianbo\n},\n title = {\n Am I a Baller? 
Basketball Performance Assessment From First-Person Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Amulet: Aggregating Multi-Level Convolutional Features for Salient Object Detection", @@ -1308,6 +1394,7 @@ "status": "Poster", "track": "main", "pid": "102", + "author_site": "Pingping Zhang; Dong Wang; Huchuan Lu; Hongyu Wang; Xiang Ruan", "author": "Pingping Zhang; Dong Wang; Huchuan Lu; Hongyu Wang; Xiang Ruan", "abstract": "Fully convolutional neural networks (FCNs) have shown outstanding performance in many dense labeling problems. One key pillar of these successes is mining relevant information from features in convolutional layers. However, how to better aggregate multi-level convolutional feature maps for salient object detection is underexplored. In this work, we present Amulet, a generic aggregating multi-level convolutional feature framework for salient object detection. Our framework first integrates multi-level feature maps into multiple resolutions, which simultaneously incorporate coarse semantics and fine details. Then it adaptively learns to combine these feature maps at each resolution and predict saliency maps with the combined features. Finally, the predicted results are efficiently fused to generate the final saliency map. In addition, to achieve accurate boundary inference and semantic enhancement, edge-aware feature maps in low-level layers and the predicted results of low resolution features are recursively embedded into the learning framework. By aggregating multi-level convolutional features in this efficient and flexible manner, the proposed saliency model provides accurate salient object labeling. 
Comprehensive experiments demonstrate that our method performs favorably against state-of-the-art approaches in terms of near all compared evaluation metrics.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Amulet_Aggregating_Multi-Level_ICCV_2017_paper.pdf", @@ -1332,7 +1419,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0;1", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Pingping and Wang,\n Dong and Lu,\n Huchuan and Wang,\n Hongyu and Ruan,\n Xiang\n},\n title = {\n Amulet: Aggregating Multi-Level Convolutional Features for Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "An Analysis of Visual Question Answering Algorithms", @@ -1340,6 +1428,7 @@ "status": "Poster", "track": "main", "pid": "899", + "author_site": "Kushal Kafle; Christopher Kanan", "author": "Kushal Kafle; Christopher Kanan", "abstract": "In visual question answering (VQA), an algorithm must answer text-based questions about images. While multiple datasets for VQA have been created since late 2014, they all have flaws in both their content and the way algorithms are evaluated on them. As a result, evaluation scores are inflated and predominantly determined by answering easier questions, making it difficult to compare different methods. In this paper, we analyze existing VQA algorithms using a new dataset called the Task Driven Image Understanding Challenge (TDIUC), which has over 1.6 million questions organized into 12 different categories. We also introduce questions that are meaningless for a given image to force a VQA system to reason about image content. 
We propose new evaluation schemes that compensate for over-represented question-types and make it easier to study the strengths and weaknesses of algorithms. We analyze the performance of both baseline and state-of-the-art VQA models, including multi-modal compact bilinear pooling (MCB), neural module networks, and recurrent answering units. Our experiments establish how attention helps certain categories more than others, determine which models work better than others, and explain how simple models (e.g. MLP) can surpass more complex models (MCB) by simply learning to answer large, easy question categories.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kafle_An_Analysis_of_ICCV_2017_paper.pdf", @@ -1364,7 +1453,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kafle_2017_ICCV,\n \n author = {\n Kafle,\n Kushal and Kanan,\n Christopher\n},\n title = {\n An Analysis of Visual Question Answering Algorithms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "An Empirical Study of Language CNN for Image Captioning", @@ -1372,6 +1462,7 @@ "status": "Poster", "track": "main", "pid": "393", + "author_site": "Jiuxiang Gu; Gang Wang; Jianfei Cai; Tsuhan Chen", "author": "Jiuxiang Gu; Gang Wang; Jianfei Cai; Tsuhan Chen", "abstract": "Language models based on recurrent neural networks have dominated recent image caption generation tasks. In this paper, we introduce a Language CNN model which is suitable for statistical language modeling tasks and shows competitive performance in image captioning. 
In contrast to previous models which predict next word based on one previous word and hidden state, our language CNN is fed with all the previous words and can model the long-range dependencies in history words, which are critical for image captioning. The effectiveness of our approach is validated on two datasets: Flickr30K and MS COCO. Our extensive experimental results show that our method outperforms the vanilla recurrent neural network based language models and is competitive with the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gu_An_Empirical_Study_ICCV_2017_paper.pdf", @@ -1396,7 +1487,8 @@ "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Singapore;Hangzhou", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Gu_2017_ICCV,\n \n author = {\n Gu,\n Jiuxiang and Wang,\n Gang and Cai,\n Jianfei and Chen,\n Tsuhan\n},\n title = {\n An Empirical Study of Language CNN for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "An Optimal Transportation Based Univariate Neuroimaging Index", @@ -1404,6 +1496,7 @@ "status": "Poster", "track": "main", "pid": "246", + "author_site": "Liang Mi; Wen Zhang; Junwei Zhang; Yonghui Fan; Dhruman Goradia; Kewei Chen; Eric M. Reiman; Xianfeng Gu; Yalin Wang", "author": "Liang Mi; Wen Zhang; Junwei Zhang; Yonghui Fan; Dhruman Goradia; Kewei Chen; Eric M. Reiman; Xianfeng Gu; Yalin Wang", "abstract": "The alterations of brain structures and functions have been considered closely correlated to the change of cognitive performance due to neurodegenerative diseases such as Alzheimer's disease. 
In this paper, we introduce a variational framework to compute the optimal transformation (OT) in 3D space and propose a univariate neuroimaging index based on OT to measure such alterations. We compute the OT from each image to a template and measure the Wasserstein distance between them. By comparing the distances from all the images to the common template, we obtain a concise and informative index for each image. Our framework makes use of the Newton's method, which reduces the computational cost and enables itself to be applicable to large-scale datasets. The proposed work is a generic approach and thus may be applicable to various volumetric brain images, including structural magnetic resonance (sMR) and fluorodeoxyglucose positron emission tomography (FDG-PET) images. In the classification between Alzheimer's disease patients and healthy controls, our method achieves an accuracy of 82.30% on the Alzheimer's Disease Neuroimaging Initiative (ADNI) baseline sMRI dataset and outperforms several other indices. On FDG-PET dataset, we boost the accuracy to 88.37% by leveraging pairwise Wasserstein distances. In a longitudinal study, we obtain a 5% significance with p-value = 0.0000113 in a t-test on FDG-PET. The results demonstrate a great potential of the proposed index for neuroimage analysis and the precision medicine research.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mi_An_Optimal_Transportation_ICCV_2017_paper.pdf", @@ -1418,7 +1511,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Mi_An_Optimal_Transportation_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Mi_An_Optimal_Transportation_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Mi_2017_ICCV,\n \n author = {\n Mi,\n Liang and Zhang,\n Wen and Zhang,\n Junwei and Fan,\n Yonghui and Goradia,\n Dhruman and Chen,\n Kewei and Reiman,\n Eric M. 
and Gu,\n Xianfeng and Wang,\n Yalin\n},\n title = {\n An Optimal Transportation Based Univariate Neuroimaging Index\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Anchored Regression Networks Applied to Age Estimation and Super Resolution", @@ -1426,6 +1520,7 @@ "status": "Poster", "track": "main", "pid": "848", + "author_site": "Eirikur Agustsson; Radu Timofte; Luc Van Gool", "author": "Eirikur Agustsson; Radu Timofte; Luc Van Gool", "abstract": "We propose the Anchored Regression Network (ARN), a nonlinear regression network which can be seamlessly integrated into various networks or can be used stand-alone when the features have already been fixed. Our ARN is a smoothed relaxation of a piecewise linear regressor through the combination of multiple linear regressors over soft assignments to anchor points. When the anchor points are fixed the optimal ARN regressors can be obtained with a closed form global solution, otherwise ARN admits end-to-end learning with standard gradient based methods. We demonstrate the power of the ARN by applying it to two very diverse and challenging tasks: age prediction from face images and image super-resolution. 
In both cases, ARNs yield strong results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Agustsson_Anchored_Regression_Networks_ICCV_2017_paper.pdf", @@ -1450,7 +1545,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0+1;0+2", - "aff_country_unique": "Switzerland;Germany;Belgium" + "aff_country_unique": "Switzerland;Germany;Belgium", + "bibtex": "@InProceedings{Agustsson_2017_ICCV,\n \n author = {\n Agustsson,\n Eirikur and Timofte,\n Radu and Van Gool,\n Luc\n},\n title = {\n Anchored Regression Networks Applied to Age Estimation and Super Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "AnnArbor: Approximate Nearest Neighbors Using Arborescence Coding", @@ -1458,6 +1554,7 @@ "status": "Poster", "track": "main", "pid": "2207", + "author_site": "Artem Babenko; Victor Lempitsky", "author": "Artem Babenko; Victor Lempitsky", "abstract": "To compress large datasets of high-dimensional descriptors, modern quantization schemes learn multiple codebooks and then represent individual descriptors as combinations of codewords. Once the codebooks are learned, these schemes encode descriptors independently. In contrast to that, we present a new coding scheme that arranges dataset descriptors into a set of arborescence graphs, and then encodes non-root descriptors by quantizing their displacements with respect to their parent nodes. By optimizing the structure of arborescences, our coding scheme can decrease the quantization error considerably, while incurring only minimal overhead on the memory footprint and the speed of nearest neighbor search in the compressed dataset compared to the independent quantization. 
The advantage of the proposed scheme is demonstrated in a series of experiments with datasets of SIFT and deep descriptors.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Babenko_AnnArbor_Approximate_Nearest_ICCV_2017_paper.pdf", @@ -1482,7 +1579,8 @@ "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Moscow", "aff_country_unique_index": "0+0;0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Babenko_2017_ICCV,\n \n author = {\n Babenko,\n Artem and Lempitsky,\n Victor\n},\n title = {\n AnnArbor: Approximate Nearest Neighbors Using Arborescence Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Anticipating Daily Intention Using On-Wrist Motion Triggered Sensing", @@ -1490,6 +1588,7 @@ "status": "Spotlight", "track": "main", "pid": "316", + "author_site": "Tz-Ying Wu; Ting-An Chien; Cheng-Sheng Chan; Chan-Wei Hu; Min Sun", "author": "Tz-Ying Wu; Ting-An Chien; Cheng-Sheng Chan; Chan-Wei Hu; Min Sun", "abstract": "Anticipating human intention by observing one's actions has many applications. For instance, picking up a cellphone, then a charger (actions) implies that one wants to charge the cellphone (intention). By anticipating the intention, an intelligent system can guide the user to the closest power outlet. We propose an on-wrist motion triggered sensing system for anticipating daily intentions, where the on-wrist sensors help us to persistently observe one's actions. The core of the system is a novel Recurrent Neural Network (RNN) and Policy Network (PN), where the RNN encodes visual and motion observation to anticipate intention, and the PN parsimoniously triggers the process of visual observation to reduce computation requirement. We jointly trained the whole network using policy gradient and cross-entropy loss. 
To evaluate, we collect the first daily \"intention\" dataset consisting of 2379 videos with 34 intentions and 164 unique action sequences. Our method achieves 92.68%, 90.85%, 97.56% accuracy on three users while processing only 29% of the visual observation on average.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wu_Anticipating_Daily_Intention_ICCV_2017_paper.pdf", @@ -1504,7 +1603,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wu_Anticipating_Daily_Intention_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wu_Anticipating_Daily_Intention_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wu_2017_ICCV,\n \n author = {\n Wu,\n Tz-Ying and Chien,\n Ting-An and Chan,\n Cheng-Sheng and Hu,\n Chan-Wei and Sun,\n Min\n},\n title = {\n Anticipating Daily Intention Using On-Wrist Motion Triggered Sensing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Approximate Grassmannian Intersections: Subspace-Valued Subspace Learning", @@ -1512,6 +1612,7 @@ "status": "Poster", "track": "main", "pid": "1831", + "author_site": "Calvin Murdock; Fernando De la Torre", "author": "Calvin Murdock; Fernando De la Torre", "abstract": "Subspace learning is one of the most foundational tasks in computer vision with applications ranging from dimensionality reduction to data denoising. As geometric objects, subspaces have also been successfully used for efficiently representing certain types of invariant data. However, methods for subspace learning from subspace-valued data have been notably absent due to incompatibilities with standard problem formulations. 
To fill this void, we introduce Approximate Grassmannian Intersections (AGI), a novel geometric interpretation of subspace learning posed as finding the approximate intersection of constraint sets on a Grassmann manifold. Our approach can naturally be applied to input subspaces of varying dimension while reducing to standard subspace learning in the case of vector-valued data. Despite the nonconvexity of our problem, its globally-optimal solution can be found using a singular value decomposition. Furthermore, we also propose an efficient, general optimization approach that can incorporate additional constraints to encourage properties such as robustness. Alongside standard subspace applications, AGI also enables the novel task of transfer learning via subspace completion. We evaluate our approach on a variety of applications, demonstrating improved invariance and generalization over vector-valued alternatives.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Murdock_Approximate_Grassmannian_Intersections_ICCV_2017_paper.pdf", @@ -1526,7 +1627,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Murdock_Approximate_Grassmannian_Intersections_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Murdock_Approximate_Grassmannian_Intersections_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Murdock_2017_ICCV,\n \n author = {\n Murdock,\n Calvin and De la Torre,\n Fernando\n},\n title = {\n Approximate Grassmannian Intersections: Subspace-Valued Subspace Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Arbitrary Style Transfer in Real-Time With Adaptive Instance Normalization", @@ -1534,6 +1636,7 @@ "status": "Oral", "track": "main", "pid": "180", + "author_site": "Xun Huang; Serge Belongie", "author": "Xun Huang; Serge Belongie", 
"abstract": "Gatys et al. recently introduced a neural algorithm that renders a content image in the style of another image, achieving so-called style transfer. However, their framework requires a slow iterative optimization process, which limits its practical application. Fast approximations with feed-forward neural networks have been proposed to speed up neural style transfer. Unfortunately, the speed improvement comes at a cost: the network is usually tied to a fixed set of styles and cannot adapt to arbitrary new styles. In this paper, we present a simple yet effective approach that for the first time enables arbitrary style transfer in real-time. At the heart of our method is a novel adaptive instance normalization (AdaIN) layer that aligns the mean and variance of the content features with those of the style features. Our method achieves speed comparable to the fastest existing approach, without the restriction to a pre-defined set of styles. In addition, our approach allows flexible user controls such as content-style trade-off, style interpolation, color & spatial controls, all using a single feed-forward neural network.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Arbitrary_Style_Transfer_ICCV_2017_paper.pdf", @@ -1548,7 +1651,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Huang_Arbitrary_Style_Transfer_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Huang_Arbitrary_Style_Transfer_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Huang_2017_ICCV,\n \n author = {\n Huang,\n Xun and Belongie,\n Serge\n},\n title = {\n Arbitrary Style Transfer in Real-Time With Adaptive Instance Normalization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Areas of Attention for Image Captioning", @@ -1556,10 +1660,11 @@ "status": 
"Poster", "track": "main", "pid": "410", + "author_site": "Marco Pedersoli; Thomas Lucas; Cordelia Schmid; Jakob Verbeek", "author": "Marco Pedersoli; Thomas Lucas; Cordelia Schmid; Jakob Verbeek", "abstract": "We propose \"Areas of Attention\", a novel attention-based model for automatic image captioning. Our approach models the dependencies between image regions, caption words, and the state of an RNN language model, using three pairwise interactions. In contrast to previous attention-based approaches that associate image regions to the RNN state, our method allows a direct association between caption words and image regions. During training these associations are inferred from image-level captions, akin to weakly-supervised object detector training. These associations help to improve captioning by localizing the corresponding regions during testing. We also propose and compare different ways of generating attention areas: CNN activation grids, object proposals, and spatial transformers nets applied in a convolutional fashion. Spatial transformers give the best results, since they allow for image specific attention areas, and can be trained jointly with the rest of the network. Our attention mechanism and spatial transformer attention areas together yield state-of-the-art results on the MSCOCO dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Pedersoli_Areas_of_Attention_ICCV_2017_paper.pdf", - "aff": "\u00b4Ecole de technologie sup\u00e9rieure, Montr\u00e9al, Canada; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France", + "aff": "École de technologie supérieure, Montréal, Canada; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France; Univ. 
Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Pedersoli_Areas_of_Attention_ICCV_2017_supplemental.pdf", @@ -1573,14 +1678,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Pedersoli_Areas_of_Attention_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "Ecole de technologie sup\u00e9rieure;Universite Grenoble Alpes", + "aff_unique_norm": "Ecole de technologie supérieure;Universite Grenoble Alpes", "aff_unique_dep": ";", "aff_unique_url": "https://www.etsmtl.ca;https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "ETS;UGA", "aff_campus_unique_index": "0;1;1;1", - "aff_campus_unique": "Montr\u00e9al;Grenoble", + "aff_campus_unique": "Montréal;Grenoble", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Canada;France" + "aff_country_unique": "Canada;France", + "bibtex": "@InProceedings{Pedersoli_2017_ICCV,\n \n author = {\n Pedersoli,\n Marco and Lucas,\n Thomas and Schmid,\n Cordelia and Verbeek,\n Jakob\n},\n title = {\n Areas of Attention for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Associative Domain Adaptation", @@ -1588,6 +1694,7 @@ "status": "Poster", "track": "main", "pid": "1052", + "author_site": "Philip Haeusser; Thomas Frerix; Alexander Mordvintsev; Daniel Cremers", "author": "Philip Haeusser; Thomas Frerix; Alexander Mordvintsev; Daniel Cremers", "abstract": "We propose \"associative domain adaptation\", a novel technique for end-to-end domain adaptation with neural networks, the task of inferring class labels for an unlabeled target domain based on the statistical properties of a labeled source domain. 
Our training scheme follows the paradigm that in order to effectively derive class labels for the target domain, a network should produce statistically domain invariant embeddings, while minimizing the classification error on the labeled source domain. We accomplish this by reinforcing \"associations\" between source and target data directly in embedding space. Our method can easily be added to any existing classification network with no structural and almost no computational overhead. We demonstrate the effectiveness of our approach on various benchmarks and achieve state-of-the-art results across the board with a generic convolutional neural network architecture not specifically tuned to the respective tasks. Finally, we show that the proposed association loss produces embeddings that are more effective for domain adaptation compared to methods employing maximum mean discrepancy as a similarity measure in embedding space.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Haeusser_Associative_Domain_Adaptation_ICCV_2017_paper.pdf", @@ -1606,13 +1713,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Haeusser_Associative_Domain_Adaptation_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;1;0", "aff_unique_norm": "Technical University of Munich;Google", - "aff_unique_dep": "Department of Informatics;Google", + "aff_unique_dep": "Department of Informatics;", "aff_unique_url": "https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;Google", "aff_campus_unique_index": "0+1;0;1;0", "aff_campus_unique": "Munich;Mountain View", "aff_country_unique_index": "0+1;0;1;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Haeusser_2017_ICCV,\n \n author = {\n Haeusser,\n Philip and Frerix,\n Thomas and Mordvintsev,\n Alexander and Cremers,\n Daniel\n},\n title = {\n Associative Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE International 
Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Attention-Aware Deep Reinforcement Learning for Video Face Recognition", @@ -1620,6 +1728,7 @@ "status": "Poster", "track": "main", "pid": "1598", + "author_site": "Yongming Rao; Jiwen Lu; Jie Zhou", "author": "Yongming Rao; Jiwen Lu; Jie Zhou", "abstract": "In this paper, we propose an attention-aware deep reinforcement learning (ADRL) method for video face recognition, which aims to discard the misleading and confounding frames and find the focuses of attention in face videos for person recognition. We formulate the process of finding the attentions of videos as a Markov decision process and train the attention model through a deep reinforcement learning framework without using extra labels. Unlike existing attention models, our method takes information from both the image space and the feature space as the input to make better use of face information that is discarded in the feature learning process. Besides, our approach is attention-aware, which seeks different attentions of videos for the verification of different pairs of videos. 
Our approach achieves very competitive video face recognition performance on three widely used video face datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rao_Attention-Aware_Deep_Reinforcement_ICCV_2017_paper.pdf", @@ -1644,7 +1753,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Rao_2017_ICCV,\n \n author = {\n Rao,\n Yongming and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Attention-Aware Deep Reinforcement Learning for Video Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Attention-Based Multimodal Fusion for Video Description", @@ -1652,6 +1762,7 @@ "status": "Poster", "track": "main", "pid": "1945", + "author_site": "Chiori Hori; Takaaki Hori; Teng-Yok Lee; Ziming Zhang; Bret Harsham; John R. Hershey; Tim K. Marks; Kazuhiko Sumi", "author": "Chiori Hori; Takaaki Hori; Teng-Yok Lee; Ziming Zhang; Bret Harsham; John R. Hershey; Tim K. Marks; Kazuhiko Sumi", "abstract": "Current methods for video description are based on encoder-decoder sentence generation using recurrent neural networks (RNNs). Recent work has demonstrated the advantages of integrating temporal attention mechanisms into these models, in which the decoder network predicts each word in the description by selectively giving more weight to encoded features from specific time frames. Such methods typically use two different types of features: image features (from an object classification model), and motion features (from an action recognition model), combined by naive concatenation in the model input. 
Because different feature modalities may carry task-relevant information at different times, fusing them by naive concatenation may limit the model's ability to dynamically determine the relevance of each type of feature to different parts of the description. In this paper, we incorporate audio features in addition to the image and motion features. To fuse these three modalities, we introduce a multimodal attention model that can selectively utilize features from different modalities for each word in the output description. Combining our new multimodal attention model with standard temporal attention outperforms state-of-the-art methods on two standard datasets: YouTube2Text and MSR-VTT.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hori_Attention-Based_Multimodal_Fusion_ICCV_2017_paper.pdf", @@ -1676,7 +1787,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;1+0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Hori_2017_ICCV,\n \n author = {\n Hori,\n Chiori and Hori,\n Takaaki and Lee,\n Teng-Yok and Zhang,\n Ziming and Harsham,\n Bret and Hershey,\n John R. and Marks,\n Tim K. and Sumi,\n Kazuhiko\n},\n title = {\n Attention-Based Multimodal Fusion for Video Description\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Attentive Semantic Video Generation Using Captions", @@ -1684,6 +1796,7 @@ "status": "Poster", "track": "main", "pid": "510", + "author_site": "Tanya Marwah; Gaurav Mittal; Vineeth N. Balasubramanian", "author": "Tanya Marwah; Gaurav Mittal; Vineeth N. Balasubramanian", "abstract": "This paper proposes a network architecture to perform variable length semantic video generation using captions. 
We adopt a new perspective towards video generation where we allow the captions to be combined with the long-term and short-term dependencies between video frames and thus generate a video in an incremental manner. Our experiments demonstrate our network architecture's ability to distinguish between objects, actions and interactions in a video and combine them to generate videos for unseen captions. The network also exhibits the capability to perform spatio-temporal style transfer when asked to generate videos for a sequence of captions. We also show that the network's ability to learn a latent representation allows it generate videos in an unsupervised manner and perform other tasks such as action recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Marwah_Attentive_Semantic_Video_ICCV_2017_paper.pdf", @@ -1708,7 +1821,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hyderabad", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Marwah_2017_ICCV,\n \n author = {\n Marwah,\n Tanya and Mittal,\n Gaurav and Balasubramanian,\n Vineeth N.\n},\n title = {\n Attentive Semantic Video Generation Using Captions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Attribute Recognition by Joint Recurrent Learning of Context and Correlation", @@ -1716,6 +1830,7 @@ "status": "Poster", "track": "main", "pid": "262", + "author_site": "Jingya Wang; Xiatian Zhu; Shaogang Gong; Wei Li", "author": "Jingya Wang; Xiatian Zhu; Shaogang Gong; Wei Li", "abstract": "Recognising semantic pedestrian attributes in surveillance images is a challenging task for computer vision, particularly when the imaging quality is poor with complex background clutter and uncontrolled viewing conditions, and the number of labelled training data is small. 
In this work, we formulate a Joint Recurrent Learning (JRL) model for exploring attribute context and correlation in order to improve attribute recognition given small sized training data with poor quality images. The JRL model learns jointly pedestrian attribute correlations in a pedestrian image and in particular their sequential ordering dependencies (latent high-order correlation) in an end-to-end encoder/decoder recurrent network. We demonstrate the performance advantage and robustness of the JRL model over a wide range of state-of-the-art deep models for pedestrian attribute recognition, multi-label image classification, and multi-person image annotation on two largest pedestrian attribute benchmarks PETA and RAP.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Attribute_Recognition_by_ICCV_2017_paper.pdf", @@ -1740,7 +1855,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom;" + "aff_country_unique": "United Kingdom;", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Jingya and Zhu,\n Xiatian and Gong,\n Shaogang and Li,\n Wei\n},\n title = {\n Attribute Recognition by Joint Recurrent Learning of Context and Correlation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Attribute-Enhanced Face Recognition With Neural Tensor Fusion Networks", @@ -1748,10 +1864,11 @@ "status": "Spotlight", "track": "main", "pid": "232", + "author_site": "Guosheng Hu; Yang Hua; Yang Yuan; Zhihong Zhang; Zheng Lu; Sankha S. Mukherjee; Timothy M. Hospedales; Neil M. Robertson; Yongxin Yang", "author": "Guosheng Hu; Yang Hua; Yang Yuan; Zhihong Zhang; Zheng Lu; Sankha S. Mukherjee; Timothy M. Hospedales; Neil M. 
Robertson; Yongxin Yang", "abstract": "Deep learning has achieved great success in face recognition, however deep-learned features still have limited invariance to strong intra-personal variations such as large pose. It is observed that some facial attributes (e.g. eyebrow thickness, gender) are invariant to such variations. We present the first work to systematically explore how the fusion of face recognition feature (FRF) and facial attribute feature (FAF) can enhance face recognition performance in various challenging scenarios. Despite this helpfulness of FAF, in practice, we find the existing fusion methods cannot reliably improve the recognition performance. Thus, we develop a powerful tensor-based framework which formulates this fusion as a low-rank tensor optimisation problem. It is non-trivial to directly optimise this tensor due to the large number of parameters to optimise. To solve this problem, we establish a theoretical equivalence between tensor optimisation and a two-stream gated neural network. This equivalence allows tractable computation and the use of standard neural network optimisation tools, leading to an accurate and stable optimisation. Experimental results show the fused feature works better than individual features thus proving for the first time that facial attributes aid face recognition. 
We achieve state-of-the-art performance on databases such as MultiPIE, CASIA NIR-VIR2.0 and LFW.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hu_Attribute-Enhanced_Face_Recognition_ICCV_2017_paper.pdf", - "aff": "AnyVision; AnyVision+Queen\u2019s University Belfast; AnyVision; Xiamen University; AnyVision; AnyVision; The University of Edinburgh; AnyVision+Queen\u2019s University Belfast; Queen Mary University of London+Yang\u2019s Accounting Consultancy Ltd", + "aff": "AnyVision; AnyVision+Queen’s University Belfast; AnyVision; Xiamen University; AnyVision; AnyVision; The University of Edinburgh; AnyVision+Queen’s University Belfast; Queen Mary University of London+Yang’s Accounting Consultancy Ltd", "project": "", "github": "", "supp": "", @@ -1765,14 +1882,15 @@ "author_num": 9, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hu_Attribute-Enhanced_Face_Recognition_ICCV_2017_paper.html", "aff_unique_index": "0;0+1;0;2;0;0;3;0+1;4+5", - "aff_unique_norm": "Anyvision;Queen's University Belfast;Xiamen University;University of Edinburgh;Queen Mary University of London;Yang\u2019s Accounting Consultancy Ltd", + "aff_unique_norm": "AnyVision;Queen's University Belfast;Xiamen University;University of Edinburgh;Queen Mary University of London;Yang’s Accounting Consultancy Ltd", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.anyvision.ai;https://www.qub.ac.uk;https://www.xmu.edu.cn;https://www.ed.ac.uk;https://www.qmul.ac.uk;", "aff_unique_abbr": ";QUB;XMU;Edinburgh;QMUL;", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0+1;0;2;0;0;1;0+1;1+1", - "aff_country_unique": "Israel;United Kingdom;China" + "aff_country_unique": "Israel;United Kingdom;China", + "bibtex": "@InProceedings{Hu_2017_ICCV,\n \n author = {\n Hu,\n Guosheng and Hua,\n Yang and Yuan,\n Yang and Zhang,\n Zhihong and Lu,\n Zheng and Mukherjee,\n Sankha S. and Hospedales,\n Timothy M. and Robertson,\n Neil M. 
and Yang,\n Yongxin\n},\n title = {\n Attribute-Enhanced Face Recognition With Neural Tensor Fusion Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Attributes2Classname: A Discriminative Model for Attribute-Based Unsupervised Zero-Shot Learning", @@ -1780,6 +1898,7 @@ "status": "Poster", "track": "main", "pid": "403", + "author_site": "Berkan Demirel; Ramazan Gokberk Cinbis; Nazli Ikizler-Cinbis", "author": "Berkan Demirel; Ramazan Gokberk Cinbis; Nazli Ikizler-Cinbis", "abstract": "We propose a novel approach for unsupervised zero-shot learning (ZSL) of classes based on their names. Most existing unsupervised ZSL methods aim to learn a model for directly comparing image features and class names. However, this proves to be a difficult task due to dominance of non-visual semantics in underlying vector-space embeddings of class names. To address this issue, we discriminatively learn a word representation such that the similarities between class and combination of attribute names fall in line with the visual similarity. Contrary to the traditional zero-shot learning approaches that are built upon attribute presence, our approach bypasses the laborious attribute-class relation annotations for unseen classes. In addition, our proposed approach renders text-only training possible, hence, the training can be augmented without the need to collect additional image data. 
The experimental results show that our method yields state-of-the-art results for unsupervised ZSL in three benchmark datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Demirel_Attributes2Classname_A_Discriminative_ICCV_2017_paper.pdf", @@ -1804,7 +1923,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1", - "aff_country_unique": "United States;T\u00fcrkiye" + "aff_country_unique": "United States;Turkey", + "bibtex": "@InProceedings{Demirel_2017_ICCV,\n \n author = {\n Demirel,\n Berkan and Gokberk Cinbis,\n Ramazan and Ikizler-Cinbis,\n Nazli\n},\n title = {\n Attributes2Classname: A Discriminative Model for Attribute-Based Unsupervised Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "AutoDIAL: Automatic DomaIn Alignment Layers", @@ -1812,7 +1932,7 @@ "status": "Poster", "track": "main", "pid": "2300", - "author_site": "Fabio Maria Carlucci; Lorenzo Porzi; Barbara Caputo; Elisa Ricci; Samuel Rota Bul\u00c3\u00b2", + "author_site": "Fabio Maria Carlucci; Lorenzo Porzi; Barbara Caputo; Elisa Ricci; Samuel Rota Bulò", "author": "Fabio Maria Carlucci; Lorenzo Porzi; Barbara Caputo; Elisa Ricci; Samuel Rota Bulo", "abstract": "Classifiers trained on given databases perform poorly when tested on data acquired in different settings. This is explained in domain adaptation through a shift among distributions of the source and target domains. Attempts to align them have traditionally resulted in works reducing the domain shift by introducing appropriate loss terms, measuring the discrepancies between source and target distributions, in the objective function. 
Here we take a different route, proposing to align the learned representations by embedding in any given network specific Domain Alignment Layers, designed to match the source and target feature distributions to a reference one. Opposite to previous works which define a priori in which layers adaptation should be performed, our method is able to automatically learn the degree of feature alignment required at different levels of the deep network. Thorough experiments on different public benchmarks, in the unsupervised setting, confirm the power of our approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Carlucci_AutoDIAL_Automatic_DomaIn_ICCV_2017_paper.pdf", @@ -1830,14 +1950,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Carlucci_AutoDIAL_Automatic_DomaIn_ICCV_2017_paper.html", "aff_unique_index": "0;1+2;0;3+4;3+2", - "aff_unique_norm": "Sapienza University of Rome;Institut de Rob\u00f2tica i Inform\u00e0tica Industrial;Mapillary;Fondazione Bruno Kessler;University of Perugia", + "aff_unique_norm": "Sapienza University of Rome;Institut de Robòtica i Informàtica Industrial;Mapillary;Fondazione Bruno Kessler;University of Perugia", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.uniroma1.it;https://www.iri.upc.edu/;https://www.mapillary.com;https://www.fbk.eu;https://www.unipg.it", "aff_unique_abbr": "Sapienza;IRI;;FBK;Unipg", "aff_campus_unique_index": "0;1+2;0;3;3+2", "aff_campus_unique": "Roma;Barcelona;Graz;Trento;", "aff_country_unique_index": "0;1+2;0;0+0;0+2", - "aff_country_unique": "Italy;Spain;Austria" + "aff_country_unique": "Italy;Spain;Austria", + "bibtex": "@InProceedings{Carlucci_2017_ICCV,\n \n author = {\n Maria Carlucci,\n Fabio and Porzi,\n Lorenzo and Caputo,\n Barbara and Ricci,\n Elisa and Rota Bulo,\n Samuel\n},\n title = {\n AutoDIAL: Automatic DomaIn Alignment Layers\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = 
{\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Automatic Content-Aware Projection for 360deg Videos", @@ -1845,6 +1966,7 @@ "status": "Poster", "track": "main", "pid": "2106", + "author_site": "Yeong Won Kim; Chang-Ryeol Lee; Dae-Yong Cho; Yong Hoon Kwon; Hyeok-Jae Choi; Kuk-Jin Yoon", "author": "Yeong Won Kim; Chang-Ryeol Lee; Dae-Yong Cho; Yong Hoon Kwon; Hyeok-Jae Choi; Kuk-Jin Yoon", "abstract": "To watch 360 videos on normal 2D displays, we need to project the selected part of the 360 image onto the 2D display plane. In this paper, we propose a fully-automated framework for generating content-aware 2D normal-view perspective videos from 360 videos. Especially, we focus on the projection step preserving important image contents and reducing image distortion. Basically, our projection method is based on Pannini projection model. At first, the salient contents such as linear structures and salient regions in the image are preserved by optimizing the single Panini projection model. Then, the multiple Panini projection models at salient regions are interpolated to suppress image distortion globally. Finally, the temporal consistency for image projection is enforced for producing temporally stable normal-view videos. Our proposed projection method does not require any user-interaction and is much faster than previous content-preserving methods. It can be applied to not only images but also videos taking the temporal consistency of projection into account. 
Experiments on various 360 videos show the superiority of the proposed projection method quantitatively and qualitatively.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kim_Automatic_Content-Aware_Projection_ICCV_2017_paper.pdf", @@ -1859,7 +1981,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_Automatic_Content-Aware_Projection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_Automatic_Content-Aware_Projection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kim_2017_ICCV,\n \n author = {\n Won Kim,\n Yeong and Lee,\n Chang-Ryeol and Cho,\n Dae-Yong and Hoon Kwon,\n Yong and Choi,\n Hyeok-Jae and Yoon,\n Kuk-Jin\n},\n title = {\n Automatic Content-Aware Projection for 360deg Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Automatic Spatially-Aware Fashion Concept Discovery", @@ -1867,6 +1990,7 @@ "status": "Poster", "track": "main", "pid": "343", + "author_site": "Xintong Han; Zuxuan Wu; Phoenix X. Huang; Xiao Zhang; Menglong Zhu; Yuan Li; Yang Zhao; Larry S. Davis", "author": "Xintong Han; Zuxuan Wu; Phoenix X. Huang; Xiao Zhang; Menglong Zhu; Yuan Li; Yang Zhao; Larry S. Davis", "abstract": "This paper proposes an automatic spatially-aware concept discovery approach using weakly labeled image-text data from shopping websites. We first fine-tune GoogleNet by jointly modeling clothing images and their corresponding descriptions in a visual-semantic embedding space. Then, for each attribute (word), we generate its spatially-aware representation by combining its semantic word vector representation with its spatial representation derived from the convolutional maps of the fine-tuned network. 
The resulting spatially-aware representations are further used to cluster attributes into multiple groups to form spatially-aware concepts (e.g., the neckline concept might consist of attributes like v-neck, round-neck, etc). Finally, we decompose the visual-semantic embedding space into multiple concept-specific subspaces, which facilitates structured browsing and attribute-feedback product retrieval by exploiting multimodal linguistic regularities. We conducted extensive experiments on our newly collected Fashion200K dataset, and results on clustering quality evaluation and attribute-feedback product retrieval task demonstrate the effectiveness of our automatically discovered spatially-aware concepts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.pdf", @@ -1885,13 +2009,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Han_Automatic_Spatially-Aware_Fashion_ICCV_2017_paper.html", "aff_unique_index": "0;1;2+0;2+1;2;2;2;0", "aff_unique_norm": "University of Maryland;Snap Inc.;Google", - "aff_unique_dep": ";;Google", + "aff_unique_dep": ";;", "aff_unique_url": "https://www/umd.edu;https://www.snapinc.com;https://www.google.com", "aff_unique_abbr": "UMD;Snap;Google", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0+0;0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Han_2017_ICCV,\n \n author = {\n Han,\n Xintong and Wu,\n Zuxuan and Huang,\n Phoenix X. and Zhang,\n Xiao and Zhu,\n Menglong and Li,\n Yuan and Zhao,\n Yang and Davis,\n Larry S.\n},\n title = {\n Automatic Spatially-Aware Fashion Concept Discovery\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "BAM! 
The Behance Artistic Media Dataset for Recognition Beyond Photography", @@ -1899,6 +2024,7 @@ "status": "Poster", "track": "main", "pid": "353", + "author_site": "Michael J. Wilber; Chen Fang; Hailin Jin; Aaron Hertzmann; John Collomosse; Serge Belongie", "author": "Michael J. Wilber; Chen Fang; Hailin Jin; Aaron Hertzmann; John Collomosse; Serge Belongie", "abstract": "Computer vision systems are designed to work well within the context of everyday photography. However, artists often render the world around them in ways that do not resemble photographs. Artwork produced by people is not constrained to mimic the physical world, making it more challenging for machines to recognize. This work is a step toward teaching machines how to categorize images in ways that are valuable to humans. First, we collect a large-scale dataset of contemporary artwork from Behance, a website containing millions of portfolios from professional and commercial artists. We annotate Behance imagery with rich attribute labels for content, emotions, and artistic media. Furthermore, we carry out baseline experiments to show the value of this dataset for artistic style prediction, for improving the generality of existing object classifiers, and for the study of visual domain adaptation. We believe our Behance Artistic Media dataset will be a good starting point for researchers wishing to study artistic imagery and relevant problems. This dataset can be found at https://bam-dataset.org/", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wilber_BAM_The_Behance_ICCV_2017_paper.pdf", @@ -1923,7 +2049,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";New York City", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wilber_2017_ICCV,\n \n author = {\n Wilber,\n Michael J. 
and Fang,\n Chen and Jin,\n Hailin and Hertzmann,\n Aaron and Collomosse,\n John and Belongie,\n Serge\n},\n title = {\n BAM! The Behance Artistic Media Dataset for Recognition Beyond Photography\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "BB8: A Scalable, Accurate, Robust to Partial Occlusion Method for Predicting the 3D Poses of Challenging Objects Without Using Depth", @@ -1931,6 +2058,7 @@ "status": "Poster", "track": "main", "pid": "1701", + "author_site": "Mahdi Rad; Vincent Lepetit", "author": "Mahdi Rad; Vincent Lepetit", "abstract": "We introduce a novel method for 3D object detection and pose estimation from color images only. We first use segmentation to detect the objects of interest in 2D even in presence of partial occlusions and cluttered background. By contrast with recent patch-based methods, we rely on a \"holistic\" approach: We apply to the detected objects a Convolutional Neural Network (CNN) trained to predict their 3D poses in the form of 2D projections of the corners of their 3D bounding boxes. This, however, is not sufficient for handling objects from the recent T-LESS dataset: These objects exhibit an axis of rotational symmetry, and the similarity of two images of such an object under two different poses makes training the CNN challenging. We solve this problem by restricting the range of poses used for training, and by introducing a classifier to identify the range of a pose at run-time before estimating it. We also use an optional additional step that refines the predicted poses. We improve the state-of-the-art on the LINEMOD dataset from 73.7% to 89.3% of correctly registered RGB frames. We are also the first to report results on the Occlusion dataset using color images only. 
We obtain 54% of frames passing the Pose 6D criterion on average on several sequences of the T-LESS dataset, compared to the 67% of the state-of-the-art on the same sequences which uses both color and depth. The full approach is also scalable, as a single network can be trained for multiple objects simultaneously.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rad_BB8_A_Scalable_ICCV_2017_paper.pdf", @@ -1946,7 +2074,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rad_BB8_A_Scalable_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rad_BB8_A_Scalable_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Rad_2017_ICCV,\n \n author = {\n Rad,\n Mahdi and Lepetit,\n Vincent\n},\n title = {\n BB8: A Scalable,\n Accurate,\n Robust to Partial Occlusion Method for Predicting the 3D Poses of Challenging Objects Without Using Depth\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "BIER - Boosting Independent Embeddings Robustly", @@ -1954,6 +2083,7 @@ "status": "Oral", "track": "main", "pid": "888", + "author_site": "Michael Opitz; Georg Waltner; Horst Possegger; Horst Bischof", "author": "Michael Opitz; Georg Waltner; Horst Possegger; Horst Bischof", "abstract": "Learning similarity functions between image pairs with deep neural networks yields highly correlated activations of large embeddings. In this work, we show how to improve the robustness of embeddings by exploiting independence in ensembles. We divide the last embedding layer of a deep network into an embedding ensemble and formulate training this ensemble as an online gradient boosting problem. Each learner receives a reweighted training sample from the previous learners. 
This leverages large embedding sizes more effectively by significantly reducing correlation of the embedding and consequently increases retrieval accuracy of the embedding. Our method does not introduce any additional parameters and works with any differentiable loss function. We evaluate our metric learning method on image retrieval tasks and show that it improves over state-of-the-art methods on the CUB-200-2011, Cars-196, Stanford Online Products, In-Shop Clothes Retrieval and VehicleID datasets by a significant margin.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Opitz_BIER_-_Boosting_ICCV_2017_paper.pdf", @@ -1978,7 +2108,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Graz", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Opitz_2017_ICCV,\n \n author = {\n Opitz,\n Michael and Waltner,\n Georg and Possegger,\n Horst and Bischof,\n Horst\n},\n title = {\n BIER - Boosting Independent Embeddings Robustly\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Be Your Own Prada: Fashion Synthesis With Structural Coherence", @@ -1986,6 +2117,7 @@ "status": "Poster", "track": "main", "pid": "795", + "author_site": "Shizhan Zhu; Raquel Urtasun; Sanja Fidler; Dahua Lin; Chen Change Loy", "author": "Shizhan Zhu; Raquel Urtasun; Sanja Fidler; Dahua Lin; Chen Change Loy", "abstract": "We present a novel and effective approach for generating new clothing on a wearer through generative adversarial learning. Given an input image of a person and a sentence describing a different outfit, our model \"redresses\" the person as desired, while at the same time keeping the wearer and her/his pose unchanged. Generating new outfits with precise regions conforming to a language description while retaining wearer's body structure is a new challenging task. 
Existing generative adversarial networks are not ideal in ensuring global coherence of structure given both the input photograph and language description as conditions. We address this challenge by decomposing the complex generative process into two conditional stages. In the first stage, we generate a plausible semantic segmentation map that obeys the wearer's pose as a latent spatial arrangement. An effective spatial constraint is formulated to guide the generation of this semantic segmentation map. In the second stage, a generative model with a newly proposed compositional mapping layer is used to render the final image with precise regions and textures conditioned on this map. We extended the DeepFashion dataset [8] by collecting sentence descriptions for 79K images. We demonstrate the effectiveness of our approach through both quantitative and qualitative evaluations. A user study is also conducted.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Be_Your_Own_ICCV_2017_paper.pdf", @@ -2003,14 +2135,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Be_Your_Own_ICCV_2017_paper.html", "aff_unique_index": "0;1+2;1+2+3;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Toronto;Vector Institute;Uber", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Toronto;Vector Institute;Uber", "aff_unique_dep": "Department of Information Engineering;;;Advanced Technologies Group", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.uber.com", "aff_unique_abbr": "CUHK;U of T;Vector Institute;Uber ATG", "aff_campus_unique_index": "0;;;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1+1;1+1+2;0;0", - "aff_country_unique": "China;Canada;United States" + "aff_country_unique": "China;Canada;United States", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Shizhan and Urtasun,\n Raquel 
and Fidler,\n Sanja and Lin,\n Dahua and Change Loy,\n Chen\n},\n title = {\n Be Your Own Prada: Fashion Synthesis With Structural Coherence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Benchmarking Single-Image Reflection Removal Algorithms", @@ -2018,6 +2151,7 @@ "status": "Poster", "track": "main", "pid": "1660", + "author_site": "Renjie Wan; Boxin Shi; Ling-Yu Duan; Ah-Hwee Tan; Alex C. Kot", "author": "Renjie Wan; Boxin Shi; Ling-Yu Duan; Ah-Hwee Tan; Alex C. Kot", "abstract": "Removing undesired reflections from a photo taken in front of a glass is of great importance for enhancing the efficiency of visual computing systems. Various approaches have been proposed and shown to be visually plausible on small datasets collected by their authors. A quantitative comparison of existing approaches using the same dataset has never been conducted due to the lack of suitable benchmark data with ground truth. This paper presents the first captured Single-image Reflection Removal dataset 'SIR2' with 40 controlled and 100 wild scenes, ground truth of background and reflection. For each controlled scene, we further provide ten sets of images under varying aperture settings and glass thicknesses. We perform quantitative and visual quality comparisons for four state-of-the-art singleimage reflection removal algorithms using four error metrics. 
Open problems for improving reflection removal algorithms are discussed at the end.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wan_Benchmarking_Single-Image_Reflection_ICCV_2017_paper.pdf", @@ -2032,7 +2166,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wan_Benchmarking_Single-Image_Reflection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wan_Benchmarking_Single-Image_Reflection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wan_2017_ICCV,\n \n author = {\n Wan,\n Renjie and Shi,\n Boxin and Duan,\n Ling-Yu and Tan,\n Ah-Hwee and Kot,\n Alex C.\n},\n title = {\n Benchmarking Single-Image Reflection Removal Algorithms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Benchmarking and Error Diagnosis in Multi-Instance Pose Estimation", @@ -2040,6 +2175,7 @@ "status": "Poster", "track": "main", "pid": "67", + "author_site": "Matteo Ruggero Ronchi; Pietro Perona", "author": "Matteo Ruggero Ronchi; Pietro Perona", "abstract": "We propose a new method to analyze the impact of errors in algorithms for multi-instance pose estimation and a principled benchmark that can be used to compare them. We define and characterize three classes of errors - localization, scoring, and background - study how they are influenced by instance attributes and their impact on an algorithm's performance. Our technique is applied to compare the two leading methods for human pose estimation on the COCO Dataset, measure the sensitivity of pose estimation with respect to instance size, type and number of visible keypoints, clutter due to multiple instances, and the relative score of instances. 
The performance of algorithms, and the types of error they make, are highly dependent on all these variables, but mostly on the number of keypoints and the clutter. The analysis and software tools we propose offer a novel and insightful approach for understanding the behavior of pose estimation algorithms and an effective method for measuring their strengths and weaknesses.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ronchi_Benchmarking_and_Error_ICCV_2017_paper.pdf", @@ -2064,7 +2200,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pasadena", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ronchi_2017_ICCV,\n \n author = {\n Ruggero Ronchi,\n Matteo and Perona,\n Pietro\n},\n title = {\n Benchmarking and Error Diagnosis in Multi-Instance Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Beyond Face Rotation: Global and Local Perception GAN for Photorealistic and Identity Preserving Frontal View Synthesis", @@ -2072,6 +2209,7 @@ "status": "Poster", "track": "main", "pid": "1029", + "author_site": "Rui Huang; Shu Zhang; Tianyu Li; Ran He", "author": "Rui Huang; Shu Zhang; Tianyu Li; Ran He", "abstract": "Photorealistic frontal view synthesis from a single face image has a wide range of applications in the field of face recognition. Although data-driven deep learning methods have been proposed to address this problem by seeking solutions from ample face data, this problem is still challenging because it is intrinsically ill-posed. This paper proposes a Two-Pathway Generative Adversarial Network (TP-GAN) for photorealistic frontal view synthesis by simultaneously perceiving global structures and local details. 
Four landmark located patch networks are proposed to attend to local textures in addition to the commonly used global encoder-decoder network. Except for the novel architecture, we make this ill-posed problem well constrained by introducing a combination of adversarial loss, symmetry loss and identity preserving loss. The combined loss function leverages both frontal face distribution and pre-trained discriminative deep face models to guide an identity preserving inference of frontal views from profiles. Different from previous deep learning methods that mainly rely on intermediate features for recognition, our method directly leverages the synthesized identity preserving image for downstream tasks like face recognition and attribution estimation. Experimental results demonstrate that our method not only presents compelling perceptual results but also outperforms state-of-the-art results on large pose face recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Beyond_Face_Rotation_ICCV_2017_paper.pdf", @@ -2096,7 +2234,8 @@ "aff_campus_unique_index": "1;1;;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0+0;0+0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2017_ICCV,\n \n author = {\n Huang,\n Rui and Zhang,\n Shu and Li,\n Tianyu and He,\n Ran\n},\n title = {\n Beyond Face Rotation: Global and Local Perception GAN for Photorealistic and Identity Preserving Frontal View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Beyond Planar Symmetry: Modeling Human Perception of Reflection and Rotation Symmetries in the Wild", @@ -2104,6 +2243,7 @@ "status": "Oral", "track": "main", "pid": "1128", + "author_site": "Christopher Funk; Yanxi Liu", "author": "Christopher Funk; Yanxi Liu", "abstract": "Humans take advantage of real world 
symmetries for various tasks, yet capturing their superb symmetry perception mechanism with a computational model remains elusive. Motivated by a new study demonstrating the extremely high inter-person accuracy of human perceived symmetries in the wild, we have constructed the first deep-learning neural network for reflection and rotation symmetry detection (Sym-NET), trained on photos from MS-COCO (Microsoft-Common Object in COntext) dataset with nearly 11K consistent symmetry-labels from more than 400 human observers. We employ novel methods to convert discrete human labels into symmetry heatmaps, capture symmetry densely in an image and quantitatively evaluate Sym-NET against multiple existing computer vision algorithms. On CVPR 2013 symmetry competition testsets and unseen MS-COCO photos, Sym-NET significantly outperforms all other competitors. Beyond mathematically well-defined symmetries on a plane, Sym-NET demonstrates abilities to identify viewpoint-varied 3D symmetries, partially occluded symmetrical objects, and symmetries at a semantic level.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Funk_Beyond_Planar_Symmetry_ICCV_2017_paper.pdf", @@ -2121,14 +2261,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Funk_Beyond_Planar_Symmetry_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Pennsylvania State University", + "aff_unique_norm": "The Pennsylvania State University", "aff_unique_dep": "School of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.psu.edu", "aff_unique_abbr": "PSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "University Park", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Funk_2017_ICCV,\n \n author = {\n Funk,\n Christopher and Liu,\n Yanxi\n},\n title = {\n Beyond Planar Symmetry: Modeling Human Perception of Reflection and Rotation 
Symmetries in the Wild\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Beyond Standard Benchmarks: Parameterizing Performance Evaluation in Visual Object Tracking", @@ -2136,7 +2277,7 @@ "status": "Poster", "track": "main", "pid": "1352", - "author_site": "Luka \u00c4\u008cehovin Zajc; Alan Luke\u00c5\u00bei\u00c4\u008d; Ale\u00c5\u00a1 Leonardis; Matej Kristan", + "author_site": "Luka Čehovin Zajc; Alan Lukežič; Aleš Leonardis; Matej Kristan", "author": "Luka Cehovin Zajc; Alan Lukezic; Ales Leonardis; Matej Kristan", "abstract": "Object-to-camera motion produces a variety of apparent motion patterns that significantly affect performance of short-term visual trackers. Despite being crucial for designing robust trackers, their influence is poorly explored in standard benchmarks due to weakly defined, biased and overlapping attribute annotations. In this paper we propose to go beyond pre-recorded benchmarks with post-hoc annotations by presenting an approach that utilizes omnidirectional videos to generate realistic, consistently annotated, short-term tracking scenarios with exactly parameterized motion patterns. We have created an evaluation system, constructed a fully annotated dataset of omnidirectional videos and generators for typical motion patterns. 
We provide an in-depth analysis of major tracking paradigms which is complementary to the standard benchmarks and confirms the expressiveness of our evaluation approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zajc_Beyond_Standard_Benchmarks_ICCV_2017_paper.pdf", @@ -2161,7 +2302,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Slovenia" + "aff_country_unique": "Slovenia", + "bibtex": "@InProceedings{Zajc_2017_ICCV,\n \n author = {\n Cehovin Zajc,\n Luka and Lukezic,\n Alan and Leonardis,\n Ales and Kristan,\n Matej\n},\n title = {\n Beyond Standard Benchmarks: Parameterizing Performance Evaluation in Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Binarized Convolutional Landmark Localizers for Human Pose Estimation and Face Alignment With Limited Resources", @@ -2169,6 +2311,7 @@ "status": "Oral", "track": "main", "pid": "531", + "author_site": "Adrian Bulat; Georgios Tzimiropoulos", "author": "Adrian Bulat; Georgios Tzimiropoulos", "abstract": "Our goal is to design architectures that retain the groundbreaking performance of CNNs for landmark localization and at the same time are lightweight, compact and suitable for applications with limited computational resources. To this end, we make the following contributions: (a) we are the first to study the effect of neural network binarization on localization tasks, namely human pose estimation and face alignment. We exhaustively evaluate various design choices, identify performance bottlenecks, and more importantly propose multiple orthogonal ways to boost performance. 
(b) Based on our analysis, we propose a novel hierarchical, parallel and multi-scale residual architecture that yields large performance improvement over the standard bottleneck block while having the same number of parameters, thus bridging the gap between the original network and its binarized counterpart. (c) We perform a large number of ablation studies that shed light on the properties and the performance of the proposed block. (d) We present results for experiments on the most challenging datasets for human pose estimation and face alignment, reporting in many cases state-of-the-art performance. Code can be downloaded from https://www.adrianbulat.com/binary-cnn-landmarks", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bulat_Binarized_Convolutional_Landmark_ICCV_2017_paper.pdf", @@ -2186,14 +2329,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Bulat_Binarized_Convolutional_Landmark_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Nottingham", + "aff_unique_norm": "The University of Nottingham", "aff_unique_dep": "Computer Vision Laboratory", "aff_unique_url": "https://www.nottingham.ac.uk", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bulat_2017_ICCV,\n \n author = {\n Bulat,\n Adrian and Tzimiropoulos,\n Georgios\n},\n title = {\n Binarized Convolutional Landmark Localizers for Human Pose Estimation and Face Alignment With Limited Resources\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Blind Image Deblurring With Outlier Handling", @@ -2201,6 +2345,7 @@ "status": "Poster", "track": "main", "pid": "996", + "author_site": "Jiangxin Dong; Jinshan Pan; Zhixun Su; Ming-Hsuan Yang", 
"author": "Jiangxin Dong; Jinshan Pan; Zhixun Su; Ming-Hsuan Yang", "abstract": "Deblurring images with outliers has attracted considerable attention recently. However, existing algorithms usually involve complex operations which increase the difficulty of blur kernel estimation. In this paper, we propose a simple yet effective blind image deblurring algorithm to handle blurred images with outliers. The proposed method is motivated by the observation that outliers in the blurred images significantly affect the goodness-of-fit in function approximation. Therefore, we propose an algorithm to model the data fidelity term so that the outliers have little effect on kernel estimation. The proposed algorithm does not require any heuristic outlier detection step, which is critical to the state-of-the-art blind deblurring methods for images with outliers. We analyze the relationship between the proposed algorithm and other blind deblurring methods with outlier handling and show how to estimate intermediate latent images for blur kernel estimation principally. We show that the proposed method can be applied to generic image deblurring as well as non-uniform deblurring. 
Experimental results demonstrate that the proposed algorithm performs favorably against the state-of-the-art blind image deblurring methods on both synthetic and real-world images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dong_Blind_Image_Deblurring_ICCV_2017_paper.pdf", @@ -2225,7 +2370,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dong_2017_ICCV,\n \n author = {\n Dong,\n Jiangxin and Pan,\n Jinshan and Su,\n Zhixun and Yang,\n Ming-Hsuan\n},\n title = {\n Blind Image Deblurring With Outlier Handling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "BlitzNet: A Real-Time Deep Network for Scene Understanding", @@ -2233,6 +2379,7 @@ "status": "Poster", "track": "main", "pid": "1824", + "author_site": "Nikita Dvornik; Konstantin Shmelkov; Julien Mairal; Cordelia Schmid", "author": "Nikita Dvornik; Konstantin Shmelkov; Julien Mairal; Cordelia Schmid", "abstract": "Real-time scene understanding has become crucial in many applications such as autonomous driving. In this paper, we propose a deep architecture, called BlitzNet, that jointly performs object detection and semantic segmentation in one forward pass, allowing real-time computations. Besides the computational gain of having a single network to perform several tasks, we show that object detection and semantic segmentation benefit from each other in terms of accuracy. 
Experimental results for VOC and COCO datasets show state-of-the-art performance for object detection and segmentation among real time systems.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dvornik_BlitzNet_A_Real-Time_ICCV_2017_paper.pdf", @@ -2248,7 +2395,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dvornik_BlitzNet_A_Real-Time_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dvornik_BlitzNet_A_Real-Time_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Dvornik_2017_ICCV,\n \n author = {\n Dvornik,\n Nikita and Shmelkov,\n Konstantin and Mairal,\n Julien and Schmid,\n Cordelia\n},\n title = {\n BlitzNet: A Real-Time Deep Network for Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Blob Reconstruction Using Unilateral Second Order Gaussian Kernels With Application to High-ISO Long-Exposure Image Denoising", @@ -2256,6 +2404,7 @@ "status": "Poster", "track": "main", "pid": "2461", + "author_site": "Gang Wang; Carlos Lopez-Molina; Bernard De Baets", "author": "Gang Wang; Carlos Lopez-Molina; Bernard De Baets", "abstract": "Blob detection and image denoising are fundamental, and sometimes related, tasks in computer vision. In this paper, we propose a blob reconstruction method using scale-invariant normalized unilateral second order Gaussian kernels. Unlike other blob detection methods, our method suppresses non-blob structures while also identifying blob parameters, i.e., position, prominence and scale, thereby facilitating blob reconstruction. 
We present an algorithm for high-ISO long-exposure noise removal that results from the combination of our blob reconstruction method and state-of-the-art denoising methods, i.e., the non-local means algorithm (NLM) and the color version of block-matching and 3-D filtering (CBM3D). Experiments on standard images corrupted by real high-ISO long-exposure noise and real-world noisy images demonstrate that our schemes incorporating the blob reduction procedure outperform both the original NLM and CBM3D.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Blob_Reconstruction_Using_ICCV_2017_paper.pdf", @@ -2275,12 +2424,13 @@ "aff_unique_index": "0;0+1;0", "aff_unique_norm": "Ghent University;Universidad Publica de Navarra", "aff_unique_dep": "Department of Mathematical Modelling, Statistics and Bioinformatics;Dpto. Automatica y Computacion", - "aff_unique_url": "https://www.ugent.be;https://www.unav.es", + "aff_unique_url": "https://www.ugent.be;https://www.unavarra.es", "aff_unique_abbr": "UGent;", "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "Ghent;Pamplona", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "Belgium;Spain" + "aff_country_unique": "Belgium;Spain", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Gang and Lopez-Molina,\n Carlos and De Baets,\n Bernard\n},\n title = {\n Blob Reconstruction Using Unilateral Second Order Gaussian Kernels With Application to High-ISO Long-Exposure Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Blur-Invariant Deep Learning for Blind-Deblurring", @@ -2288,6 +2438,7 @@ "status": "Poster", "track": "main", "pid": "2134", + "author_site": "T. M. Nimisha; Akash Kumar Singh; A. N. Rajagopalan", "author": "T. M. Nimisha; Akash Kumar Singh; A. N. 
Rajagopalan", "abstract": "In this paper, we investigate deep neural networks for blind motion deblurring. Instead of regressing for the motion blur kernel and performing non-blind deblurring out- side of the network (as most methods do), we propose a compact and elegant end-to-end deblurring network. Inspired by the data-driven sparse-coding approaches that are capable of capturing linear dependencies in data, we generalize this notion by embedding non-linearities into the learning process. We propose a new architecture for blind motion deblurring that consists of an autoencoder that learns the data prior, and an adversarial network that attempts to generate and discriminate between clean and blurred features. Once the network is trained, the generator learns a blur-invariant data representation which when fed through the decoder results in the final deblurred output.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nimisha_Blur-Invariant_Deep_Learning_ICCV_2017_paper.pdf", @@ -2312,7 +2463,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Nimisha_2017_ICCV,\n \n author = {\n Nimisha,\n T. M. and Kumar Singh,\n Akash and Rajagopalan,\n A. 
N.\n},\n title = {\n Blur-Invariant Deep Learning for Blind-Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "BodyFusion: Real-Time Capture of Human Motion and Surface Geometry Using a Single Depth Camera", @@ -2320,6 +2472,7 @@ "status": "Poster", "track": "main", "pid": "415", + "author_site": "Tao Yu; Kaiwen Guo; Feng Xu; Yuan Dong; Zhaoqi Su; Jianhui Zhao; Jianguo Li; Qionghai Dai; Yebin Liu", "author": "Tao Yu; Kaiwen Guo; Feng Xu; Yuan Dong; Zhaoqi Su; Jianhui Zhao; Jianguo Li; Qionghai Dai; Yebin Liu", "abstract": "We propose BodyFusion, a novel real-time geometry fusion method that can track and reconstruct non-rigid surface motion of a human performance using a single consumer-grade depth camera. To reduce the ambiguities of the non-rigid deformation parameterization on the surface graph nodes, we take advantage of the internal articulated motion prior for human performance and contribute a skeleton-embedded surface fusion (SSF) method. The key feature of our method is that it jointly solves for both the skeleton and graph-node deformations based on information of the attachments between the skeleton and the graph nodes. The attachments are also updated frame by frame based on the fused surface geometry and the computed deformations. Overall, our method enables increasingly denoised, detailed, and complete surface reconstruction as well as the updating of the skeleton and attachments as the temporal depth frames are fused. Experimental results show that our method exhibits substantially improved nonrigid motion fusion performance and tracking robustness compared with previous state-of-the-art fusion methods. 
We also contribute a dataset for the quantitative evaluation of fusion-based dynamic scene reconstruction algorithms using a single depth camera.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_BodyFusion_Real-Time_Capture_ICCV_2017_paper.pdf", @@ -2334,7 +2487,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yu_BodyFusion_Real-Time_Capture_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yu_BodyFusion_Real-Time_Capture_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Tao and Guo,\n Kaiwen and Xu,\n Feng and Dong,\n Yuan and Su,\n Zhaoqi and Zhao,\n Jianhui and Li,\n Jianguo and Dai,\n Qionghai and Liu,\n Yebin\n},\n title = {\n BodyFusion: Real-Time Capture of Human Motion and Surface Geometry Using a Single Depth Camera\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Boosting Image Captioning With Attributes", @@ -2342,6 +2496,7 @@ "status": "Poster", "track": "main", "pid": "2215", + "author_site": "Ting Yao; Yingwei Pan; Yehao Li; Zhaofan Qiu; Tao Mei", "author": "Ting Yao; Yingwei Pan; Yehao Li; Zhaofan Qiu; Tao Mei", "abstract": "Automatically describing an image with a natural language has been an emerging challenge in both fields of computer vision and natural language processing. In this paper, we present Long Short-Term Memory with Attributes (LSTM-A) - a novel architecture that integrates attributes into the successful Convolutional Neural Networks (CNNs) plus Recurrent Neural Networks (RNNs) image captioning framework, by training them in an end-to-end manner. Particularly, the learning of attributes is strengthened by integrating inter-attribute correlations into Multiple Instance Learning (MIL). 
To incorporate attributes into captioning, we construct variants of architectures by feeding image representations and attributes into RNNs in different ways to explore the mutual but also fuzzy relationship between them. Extensive experiments are conducted on COCO image captioning dataset and our framework shows clear improvements when compared to state-of-the-art deep models. More remarkably, we obtain METEOR/CIDEr-D of 25.5%/100.2% on testing data of widely used and publicly available splits in [10] when extracting image representations by GoogleNet and achieve superior performance on COCO captioning Leaderboard.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yao_Boosting_Image_Captioning_ICCV_2017_paper.pdf", @@ -2359,14 +2514,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yao_Boosting_Image_Captioning_ICCV_2017_paper.html", "aff_unique_index": "0;1+0;2;1;0", - "aff_unique_norm": "Microsoft;University of Science and Technology of China;Sun Yat-sen University", - "aff_unique_dep": "Microsoft Research;;", + "aff_unique_norm": "Microsoft Research;University of Science and Technology of China;Sun Yat-Sen University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/microsoft-research-asia;http://www.ustc.edu.cn;http://www.sysu.edu.cn/", "aff_unique_abbr": "MSR;USTC;SYSU", "aff_campus_unique_index": "0;1+0;2;1;0", "aff_campus_unique": "Beijing;Hefei;Guangzhou", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2017_ICCV,\n \n author = {\n Yao,\n Ting and Pan,\n Yingwei and Li,\n Yehao and Qiu,\n Zhaofan and Mei,\n Tao\n},\n title = {\n Boosting Image Captioning With Attributes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Bounding Boxes, Segmentations and Object 
Coordinates: How Important Is Recognition for 3D Scene Flow Estimation in Autonomous Driving Scenarios?", @@ -2374,10 +2530,11 @@ "status": "Poster", "track": "main", "pid": "1149", + "author_site": "Aseem Behl; Omid Hosseini Jafari; Siva Karthik Mustikovela; Hassan Abu Alhaija; Carsten Rother; Andreas Geiger", "author": "Aseem Behl; Omid Hosseini Jafari; Siva Karthik Mustikovela; Hassan Abu Alhaija; Carsten Rother; Andreas Geiger", "abstract": "Existing methods for 3D scene flow estimation often fail in the presence of large displacement or local ambiguities, e.g., at texture-less or reflective surfaces. However, these challenges are omnipresent in dynamic road scenes, which is the focus of this work. Our main contribution is to overcome these 3D motion estimation problems by exploiting recognition. In particular, we investigate the importance of recognition granularity, from coarse 2D bounding box estimates over 2D instance segmentations to fine-grained 3D object part predictions. We compute these cues using CNNs trained on a newly annotated dataset of stereo images and integrate them into a CRF-based model for robust 3D scene flow estimation - an approach we term Instance Scene Flow. We analyze the importance of each recognition cue in an ablation study and observe that the instance segmentation cue is by far strongest, in our setting. 
We demonstrate the effectiveness of our method on the challenging KITTI 2015 scene flow benchmark where we achieve state-of-the-art performance at the time of submission.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Behl_Bounding_Boxes_Segmentations_ICCV_2017_paper.pdf", - "aff": "Autonomous Vision Group, MPI for Intelligent Systems T\u00fcbingen; Computer Vision Lab, TU Dresden; Computer Vision Lab, TU Dresden; Computer Vision Lab, TU Dresden; Computer Vision Lab, TU Dresden; Autonomous Vision Group, MPI for Intelligent Systems T\u00fcbingen + Computer Vision and Geometry Group, ETH Z\u00fcrich", + "aff": "Autonomous Vision Group, MPI for Intelligent Systems Tübingen; Computer Vision Lab, TU Dresden; Computer Vision Lab, TU Dresden; Computer Vision Lab, TU Dresden; Computer Vision Lab, TU Dresden; Autonomous Vision Group, MPI for Intelligent Systems Tübingen + Computer Vision and Geometry Group, ETH Zürich", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Behl_Bounding_Boxes_Segmentations_ICCV_2017_supplemental.pdf", @@ -2391,14 +2548,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Behl_Bounding_Boxes_Segmentations_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;1;1;0+2", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Technische Universit\u00e4t Dresden;ETH Zurich", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Technische Universität Dresden;ETH Zürich", "aff_unique_dep": "Autonomous Vision Group;Computer Vision Lab;Computer Vision and Geometry Group", "aff_unique_url": "https://www.mpituebingen.mpg.de;https://www.tu-dresden.de;https://www.ethz.ch", "aff_unique_abbr": "MPI-IS;TUD;ETHZ", "aff_campus_unique_index": "0;1;1;1;1;0", - "aff_campus_unique": "T\u00fcbingen;Dresden;", + "aff_campus_unique": "Tübingen;Dresden;", "aff_country_unique_index": "0;0;0;0;0;0+1", - "aff_country_unique": "Germany;Switzerland" + 
"aff_country_unique": "Germany;Switzerland", + "bibtex": "@InProceedings{Behl_2017_ICCV,\n \n author = {\n Behl,\n Aseem and Hosseini Jafari,\n Omid and Karthik Mustikovela,\n Siva and Abu Alhaija,\n Hassan and Rother,\n Carsten and Geiger,\n Andreas\n},\n title = {\n Bounding Boxes,\n Segmentations and Object Coordinates: How Important Is Recognition for 3D Scene Flow Estimation in Autonomous Driving Scenarios?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Bringing Background Into the Foreground: Making All Classes Equal in Weakly-Supervised Video Semantic Segmentation", @@ -2406,7 +2564,7 @@ "status": "Poster", "track": "main", "pid": "679", - "author_site": "Fatemeh Sadat Saleh; Mohammad Sadegh Aliakbarian; Mathieu Salzmann; Lars Petersson; Jose M. \u00c3\u0081lvarez", + "author_site": "Fatemeh Sadat Saleh; Mohammad Sadegh Aliakbarian; Mathieu Salzmann; Lars Petersson; Jose M. Álvarez", "author": "Fatemeh Sadat Saleh; Mohammad Sadegh Aliakbarian; Mathieu Salzmann; Lars Petersson; Jose M. Alvarez", "abstract": "Pixel-level annotations are expensive and time-consuming to obtain. Hence, weak supervision using only image tags could have a significant impact in semantic segmentation. Recent years have seen great progress in weakly-supervised semantic segmentation, whether from a single image or from videos. However, most existing methods are designed to handle a single background class. In practical applications, such as autonomous navigation, it is often crucial to reason about multiple background classes. In this paper, we introduce an approach to doing so by making use of classifier heatmaps. We then develop a two-stream deep architecture that jointly leverages appearance and motion, and design a loss based on our heatmaps to train it. 
Our experiments demonstrate the benefits of our classifier heatmaps and of our two-stream architecture on challenging urban scene datasets and on the YouTube-Objects benchmark, where we obtain state-of-the-art results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Saleh_Bringing_Background_Into_ICCV_2017_paper.pdf", @@ -2424,14 +2582,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Saleh_Bringing_Background_Into_ICCV_2017_paper.html", "aff_unique_index": "0+1;0+1;2;0+1;0+1", - "aff_unique_norm": "Australian National University;CSIRO;EPFL", + "aff_unique_norm": "Australian National University;CSIRO;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";Smart Vision Systems;CVLab", "aff_unique_url": "https://www.anu.edu.au;https://www.csiro.au;https://cvlab.epfl.ch", "aff_unique_abbr": "ANU;CSIRO;EPFL", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;0+0;0+0", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + "bibtex": "@InProceedings{Saleh_2017_ICCV,\n \n author = {\n Sadat Saleh,\n Fatemeh and Sadegh Aliakbarian,\n Mohammad and Salzmann,\n Mathieu and Petersson,\n Lars and Alvarez,\n Jose M.\n},\n title = {\n Bringing Background Into the Foreground: Making All Classes Equal in Weakly-Supervised Video Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "CAD Priors for Accurate and Flexible Instance Reconstruction", @@ -2439,6 +2598,7 @@ "status": "Poster", "track": "main", "pid": "79", + "author_site": "Tolga Birdal; Slobodan Ilic", "author": "Tolga Birdal; Slobodan Ilic", "abstract": "We present an efficient and automatic approach for accurate reconstruction of instances of big 3D objects from multiple, unorganized and unstructured point clouds, in presence of dynamic clutter and 
occlusions. In contrast to conventional scanning, where the background is assumed to be rather static, we aim at handling dynamic clutter where background drastically changes during the object scanning. Currently, it is tedious to solve this with available methods unless the object of interest is first segmented out from the rest of the scene. We address the problem by assuming the availability of a prior CAD model, roughly resembling the object to be reconstructed. This assumption almost always holds in applications such as industrial inspection or reverse engineering. With aid of this prior acting as a proxy, we propose a fully enhanced pipeline, capable of automatically detecting and segmenting the object of interest from scenes and creating a pose graph, online, with linear complexity. This allows initial scan alignment to the CAD model space, which is then refined without the CAD constraint to fully recover a high fidelity 3D reconstruction, accurate up to the sensor noise level. We also contribute a novel object detection method, local implicit shape models (LISM) and give a fast verification scheme. 
We evaluate our method on multiple datasets, demonstrating the ability to accurately reconstruct objects from small sizes up to 125m3.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Birdal_CAD_Priors_for_ICCV_2017_paper.pdf", @@ -2453,7 +2613,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Birdal_CAD_Priors_for_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Birdal_CAD_Priors_for_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Birdal_2017_ICCV,\n \n author = {\n Birdal,\n Tolga and Ilic,\n Slobodan\n},\n title = {\n CAD Priors for Accurate and Flexible Instance Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "CDTS: Collaborative Detection, Tracking, and Segmentation for Online Multiple Object Segmentation in Videos", @@ -2461,6 +2622,7 @@ "status": "Poster", "track": "main", "pid": "1297", + "author_site": "Yeong Jun Koh; Chang-Su Kim", "author": "Yeong Jun Koh; Chang-Su Kim", "abstract": "A novel online algorithm to segment multiple objects in a video sequence is proposed in this work. We develop the collaborative detection, tracking, and segmentation (CDTS) technique to extract multiple segment tracks accurately. First, we jointly use object detector and tracker to generate multiple bounding box tracks for objects. Second, we transform each bounding box into a pixel-wise segment, by employing the alternate shrinking and expansion(ASE) segmentation. Third, we refine the segment tracks, by detecting object disappearance and reappearance cases and merging overlapping segment tracks. 
Experimental results show that the proposed algorithm significantly surpasses the state-of-the-art conventional algorithms on benchmark datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Koh_CDTS_Collaborative_Detection_ICCV_2017_paper.pdf", @@ -2485,7 +2647,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Koh_2017_ICCV,\n \n author = {\n Jun Koh,\n Yeong and Kim,\n Chang-Su\n},\n title = {\n CDTS: Collaborative Detection,\n Tracking,\n and Segmentation for Online Multiple Object Segmentation in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "CREST: Convolutional Residual Learning for Visual Tracking", @@ -2493,6 +2656,7 @@ "status": "Poster", "track": "main", "pid": "1118", + "author_site": "Yibing Song; Chao Ma; Lijun Gong; Jiawei Zhang; Rynson W. H. Lau; Ming-Hsuan Yang", "author": "Yibing Song; Chao Ma; Lijun Gong; Jiawei Zhang; Rynson W. H. Lau; Ming-Hsuan Yang", "abstract": "Discriminative correlation filters (DCFs) have \\ryn been shown to perform superiorly in visual tracking. They \\ryn only need a small set of training samples from the initial frame to generate an appearance model. However, existing DCFs learn the filters separately from feature extraction, and update these filters using a moving average operation with an empirical weight. These DCF trackers hardly benefit from the end-to-end training. In this paper, we propose the CREST algorithm to reformulate DCFs as a one-layer convolutional neural network. Our method integrates feature extraction, response map generation as well as model update into the neural networks for an end-to-end training. 
To reduce model degradation during online update, we apply residual learning to take appearance changes into account. Extensive experiments on the benchmark datasets demonstrate that our CREST tracker performs favorably against state-of-the-art trackers.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Song_CREST_Convolutional_Residual_ICCV_2017_paper.pdf", @@ -2507,7 +2671,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Song_CREST_Convolutional_Residual_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Song_CREST_Convolutional_Residual_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Song_2017_ICCV,\n \n author = {\n Song,\n Yibing and Ma,\n Chao and Gong,\n Lijun and Zhang,\n Jiawei and Lau,\n Rynson W. H. and Yang,\n Ming-Hsuan\n},\n title = {\n CREST: Convolutional Residual Learning for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "CVAE-GAN: Fine-Grained Image Generation Through Asymmetric Training", @@ -2515,6 +2680,7 @@ "status": "Poster", "track": "main", "pid": "997", + "author_site": "Jianmin Bao; Dong Chen; Fang Wen; Houqiang Li; Gang Hua", "author": "Jianmin Bao; Dong Chen; Fang Wen; Houqiang Li; Gang Hua", "abstract": "We present variational generative adversarial networks, a general learning framework that combines a variational auto-encoder with a generative adversarial network, for synthesizing images in fine-grained categories, such as faces of a specific person or objects in a category. Our approach models an image as a composition of label and latent attributes in a probabilistic model. By varying the fine-grained category label fed into the resulting generative model, we can generate images in a specific category with randomly drawn values on a latent attribute vector. 
Our approach has two novel aspects. First, we adopt a cross entropy loss for the discriminative and classifier network, but a mean discrepancy objective for the generative network. This kind of asymmetric loss function makes the GAN training more stable. Second, we adopt an encoder network to learn the relationship between the latent space and the real image space, and use pairwise feature matching to keep the structure of generated images. We experiment with natural images of faces, flowers, and birds, and demonstrate that the proposed models are capable of generating realistic and diverse samples with fine-grained category labels. We further show that our models can be applied to other tasks, such as image inpainting, super-resolution, and data augmentation for training better face recognition models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bao_CVAE-GAN_Fine-Grained_Image_ICCV_2017_paper.pdf", @@ -2532,14 +2698,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Bao_CVAE-GAN_Fine-Grained_Image_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;0;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "USTC;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Bao_2017_ICCV,\n \n author = {\n Bao,\n Jianmin and Chen,\n Dong and Wen,\n Fang and Li,\n Houqiang and Hua,\n Gang\n},\n title = {\n CVAE-GAN: Fine-Grained Image Generation Through Asymmetric Training\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year 
= {\n 2017\n} \n}" }, { "title": "Camera Calibration by Global Constraints on the Motion of Silhouettes", @@ -2547,6 +2714,7 @@ "status": "Poster", "track": "main", "pid": "2908", + "author_site": "Gil Ben-Artzi", "author": "Gil Ben-Artzi", "abstract": "We address the problem of epipolar geometry using the motion of silhouettes. Such methods match epipolar lines or frontier points across views, which are then used as the set of putative correspondences. We introduce an approach that improves by two orders of magnitude the performance over state-of-the-art methods, by significantly reducing the number of outliers in the putative matching. We model the frontier points' correspondence problem as constrained flow optimization, requiring small differences between their coordinates over consecutive frames. Our approach is formulated as a Linear Integer Program and we show that due to the nature of our problem, it can be solved efficiently in an iterative manner. Our method was validated on four standard datasets providing accurate calibrations across very different viewpoints.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ben-Artzi_Camera_Calibration_by_ICCV_2017_paper.pdf", @@ -2562,7 +2730,8 @@ "aff_domain": "", "email": "", "author_num": 1, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ben-Artzi_Camera_Calibration_by_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ben-Artzi_Camera_Calibration_by_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Ben-Artzi_2017_ICCV,\n \n author = {\n Ben-Artzi,\n Gil\n},\n title = {\n Camera Calibration by Global Constraints on the Motion of Silhouettes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Cascaded Feature Network for Semantic Segmentation of RGB-D Images", @@ -2570,6 +2739,7 @@ "status": "Poster", "track": "main", "pid": "620", + 
"author_site": "Di Lin; Guangyong Chen; Daniel Cohen-Or; Pheng-Ann Heng; Hui Huang", "author": "Di Lin; Guangyong Chen; Daniel Cohen-Or; Pheng-Ann Heng; Hui Huang", "abstract": "Fully convolutional network (FCN) has been successfully applied in semantic segmentation of scenes represented with RGB images. Images augmented with depth channel provide more understanding of the geometric information of the scene in the image. The question is how to best exploit this additional information to improve the segmentation performance. In this paper, we present a neural network with multiple branches for segmenting RGB-D images. Our approach is to use the available depth to split the image into layers with common visual characteristic of objects/scenes, or common \"scene-resolution\". We introduce context-aware receptive field (CaRF) which provides a better control on the relevant contextual information of the learned features. Equipped with CaRF, each branch of the network semantically segments relevant similar scene-resolution, leading to a more focused domain which is easier to learn. Furthermore, our network is cascaded with features from one branch augmenting the features of adjacent branch. We show that such cascading of features enriches the contextual information of each branch and enhances the overall performance. 
The accuracy that our network achieves outperforms the state-of-the-art methods on two public datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lin_Cascaded_Feature_Network_ICCV_2017_paper.pdf", @@ -2587,14 +2757,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lin_Cascaded_Feature_Network_ICCV_2017_paper.html", "aff_unique_index": "0;1;2+3;3;0+3", - "aff_unique_norm": "Shenzhen University;Chinese University of Hong Kong;Tel Aviv University;Shenzhen Institute of Advanced Technology", + "aff_unique_norm": "Shenzhen University;The Chinese University of Hong Kong;Tel Aviv University;Shenzhen Institute of Advanced Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.szu.edu.cn;https://www.cuhk.edu.hk;https://www.tau.ac.il;http://www.siat.ac.cn", "aff_unique_abbr": "SZU;CUHK;TAU;SIAT", "aff_campus_unique_index": "1;;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1+0;0;0+0", - "aff_country_unique": "China;Israel" + "aff_country_unique": "China;Israel", + "bibtex": "@InProceedings{Lin_2017_ICCV,\n \n author = {\n Lin,\n Di and Chen,\n Guangyong and Cohen-Or,\n Daniel and Heng,\n Pheng-Ann and Huang,\n Hui\n},\n title = {\n Cascaded Feature Network for Semantic Segmentation of RGB-D Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Catadioptric HyperSpectral Light Field Imaging", @@ -2602,6 +2773,7 @@ "status": "Poster", "track": "main", "pid": "580", + "author_site": "Yujia Xue; Kang Zhu; Qiang Fu; Xilin Chen; Jingyi Yu", "author": "Yujia Xue; Kang Zhu; Qiang Fu; Xilin Chen; Jingyi Yu", "abstract": "The complete plenoptic function records radiance of rays from every location, at every angle, for every wavelength and at every time. The signal is multi-dimensional and has long relied on multi-modal sensing such as hybrid light field camera arrays. 
In this paper, we present a single camera hyperspectral light field imaging solution that we call Snapshot Plenoptic Imager (SPI). SPI uses spectral coded catadioptric mirror arrays for simultaneously acquiring the spatial, angular and spectral dimensions. We further apply a learning-based approach to improve the spectral resolution from very few measurements. Specifically, we demonstrate and then employ a new spectral sparsity prior that allows the hyperspectral profiles to be sparsely represented under a pre-trained dictionary. Comprehensive experiments on synthetic and real data show that our technique is effective, reliable, and accurate. In particular, we are able to produce the first wide FoV multi-spectral light field database.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xue_Catadioptric_HyperSpectral_Light_ICCV_2017_paper.pdf", @@ -2626,7 +2798,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;1+1;1;1;0+1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Xue_2017_ICCV,\n \n author = {\n Xue,\n Yujia and Zhu,\n Kang and Fu,\n Qiang and Chen,\n Xilin and Yu,\n Jingyi\n},\n title = {\n Catadioptric HyperSpectral Light Field Imaging\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Centered Weight Normalization in Accelerating Training of Deep Neural Networks", @@ -2634,6 +2807,7 @@ "status": "Poster", "track": "main", "pid": "1176", + "author_site": "Lei Huang; Xianglong Liu; Yang Liu; Bo Lang; Dacheng Tao", "author": "Lei Huang; Xianglong Liu; Yang Liu; Bo Lang; Dacheng Tao", "abstract": "Training deep neural networks is difficult for the pathological curvature problem. 
Re-parameterization is an effective way to relieve the problem by learning the curvature approximately or constraining the solutions of weights with good properties for optimization. This paper proposes to re-parameterize the input weight of each neuron in deep neural networks by normalizing it with zero-mean and unit-norm, followed by a learnable scalar parameter to adjust the norm of the weight. This technique effectively stabilizes the distribution implicitly. Besides, it improves the conditioning of the optimization problem and thus accelerates the training of deep neural networks. It can be wrapped as a linear module in practice and plugged in any architecture to replace the standard linear module. We highlight the benefits of our method on both multi-layer perceptrons and convolutional neural networks, and demonstrate its scalability and efficiency on SVHN, CIFAR-10, CIFAR-100 and ImageNet datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Centered_Weight_Normalization_ICCV_2017_paper.pdf", @@ -2651,14 +2825,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Huang_Centered_Weight_Normalization_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;1", - "aff_unique_norm": "Beihang University;University of Sydney", + "aff_unique_norm": "Beihang University;The University of Sydney", "aff_unique_dep": "State Key Laboratory of Software Development Environment;School of IT, FEIT", "aff_unique_url": "http://www.buaa.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "Beihang;USYD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Huang_2017_ICCV,\n \n author = {\n Huang,\n Lei and Liu,\n Xianglong and Liu,\n Yang and Lang,\n Bo and Tao,\n Dacheng\n},\n title = {\n Centered Weight Normalization in Accelerating Training of Deep Neural 
Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Chained Cascade Network for Object Detection", @@ -2666,6 +2841,7 @@ "status": "Poster", "track": "main", "pid": "798", + "author_site": "Wanli Ouyang; Kun Wang; Xin Zhu; Xiaogang Wang", "author": "Wanli Ouyang; Kun Wang; Xin Zhu; Xiaogang Wang", "abstract": "Cascade is a widely used approach that rejects obvious negative samples at early stages for learning better classifier and faster inference. This paper presents chained cascade network (CC-Net). In this CC-Net, there are many cascade stages. Preceding cascade stages are placed at shallow layers. Easy hard examples are rejected at shallow layers so that the computation for deeper or wider layers is not required. In this way, features and classifiers at latter stages handle more difficult samples with the help of features and classifiers in previous stages. It yields consistent boost in detection performance on PASCAL VOC 2007 and ImageNet for both fast RCNN and Faster RCNN. CC-Net saves computation for both training and testing. 
Code is available on.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ouyang_Chained_Cascade_Network_ICCV_2017_paper.pdf", @@ -2683,14 +2859,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ouyang_Chained_Cascade_Network_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Sydney", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Sydney", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sydney.edu.au", "aff_unique_abbr": "CUHK;USYD", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+1;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Ouyang_2017_ICCV,\n \n author = {\n Ouyang,\n Wanli and Wang,\n Kun and Zhu,\n Xin and Wang,\n Xiaogang\n},\n title = {\n Chained Cascade Network for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Chained Multi-Stream Networks Exploiting Pose, Motion, and Appearance for Action Classification and Detection", @@ -2698,6 +2875,7 @@ "status": "Poster", "track": "main", "pid": "1104", + "author_site": "Mohammadreza Zolfaghari; Gabriel L. Oliveira; Nima Sedaghat; Thomas Brox", "author": "Mohammadreza Zolfaghari; Gabriel L. Oliveira; Nima Sedaghat; Thomas Brox", "abstract": "General human action recognition requires understanding of various visual cues. In this paper, we propose a network architecture that computes and integrates the most important visual cues for action recognition: pose, motion, and the raw images. For the integration, we introduce a Markov chain model which adds cues successively. 
The resulting approach is efficient and applicable to action classification as well as to spatial and temporal action localization. The two contributions clearly improve the performance over respective baselines. The overall approach achieves state-of-the-art action classification performance on HMDB51, J-HMDB and NTU RGB+D datasets. Moreover, it yields state-of-the-art spatio-temporal action localization results on UCF101 and J-HMDB.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zolfaghari_Chained_Multi-Stream_Networks_ICCV_2017_paper.pdf", @@ -2722,7 +2900,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Zolfaghari_2017_ICCV,\n \n author = {\n Zolfaghari,\n Mohammadreza and Oliveira,\n Gabriel L. and Sedaghat,\n Nima and Brox,\n Thomas\n},\n title = {\n Chained Multi-Stream Networks Exploiting Pose,\n Motion,\n and Appearance for Action Classification and Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Channel Pruning for Accelerating Very Deep Neural Networks", @@ -2730,10 +2909,11 @@ "status": "Poster", "track": "main", "pid": "600", + "author_site": "Yihui He; Xiangyu Zhang; Jian Sun", "author": "Yihui He; Xiangyu Zhang; Jian Sun", "abstract": "In this paper, we introduce a new channel pruning method to accelerate very deep convolutional neural networks.Given a trained CNN model, we propose an iterative two-step algorithm to effectively prune each layer, by a LASSO regression based channel selection and least square reconstruction. We further generalize this algorithm to multi-layer and multi-branch cases. Our method reduces the accumulated error and enhance the compatibility with various architectures. 
Our pruned VGG-16 achieves the state-of-the-art results by 5x speed-up along with only 0.3% increase of error. More importantly, our method is able to accelerate modern networks like ResNet, Xception and suffers only 1.4%, 1.0% accuracy loss under 2x speed-up respectively, which is significant.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/He_Channel_Pruning_for_ICCV_2017_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Megvii Inc.; Megvii Inc.", + "aff": "Xi’an Jiaotong University; Megvii Inc.; Megvii Inc.", "project": "", "github": "", "supp": "", @@ -2747,14 +2927,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/He_Channel_Pruning_for_ICCV_2017_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Megvii Technology", + "aff_unique_norm": "Xi'an Jiaotong University;Megvii Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.megvii.com/", "aff_unique_abbr": "XJTU;Megvii", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2017_ICCV,\n \n author = {\n He,\n Yihui and Zhang,\n Xiangyu and Sun,\n Jian\n},\n title = {\n Channel Pruning for Accelerating Very Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Characterizing and Improving Stability in Neural Style Transfer", @@ -2762,6 +2943,7 @@ "status": "Poster", "track": "main", "pid": "2027", + "author_site": "Agrim Gupta; Justin Johnson; Alexandre Alahi; Li Fei-Fei", "author": "Agrim Gupta; Justin Johnson; Alexandre Alahi; Li Fei-Fei", "abstract": "Recent progress in style transfer on images has focused on improving the quality of stylized images and speed of methods. 
However, real-time methods are highly unstable resulting in visible flickering when applied to videos. In this work we characterize the instability of these methods by examining the solution set of the style transfer objective. We show that the trace of the Gram matrix representing style is inversely related to the stability of the method. Then, we present a recurrent convolutional network for real-time video style transfer which incorporates a temporal consistency loss and overcomes the instability of prior methods. Our networks can be applied at any resolution, do not require optical flow at test time, and produce high quality, temporally consistent stylized videos in real-time.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gupta_Characterizing_and_Improving_ICCV_2017_paper.pdf", @@ -2777,7 +2959,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gupta_Characterizing_and_Improving_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gupta_Characterizing_and_Improving_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Gupta_2017_ICCV,\n \n author = {\n Gupta,\n Agrim and Johnson,\n Justin and Alahi,\n Alexandre and Fei-Fei,\n Li\n},\n title = {\n Characterizing and Improving Stability in Neural Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "ChromaTag: A Colored Marker and Fast Detection Algorithm", @@ -2785,6 +2968,7 @@ "status": "Poster", "track": "main", "pid": "430", + "author_site": "Joseph DeGol; Timothy Bretl; Derek Hoiem", "author": "Joseph DeGol; Timothy Bretl; Derek Hoiem", "abstract": "Current fiducial marker detection algorithms rely on marker IDs for false positive rejection. Time is wasted on potential detections that will eventually be rejected as false positives. 
We introduce ChromaTag, a fiducial marker and detection algorithm designed to use opponent colors to limit and quickly reject initial false detections and grayscale for precise localization. Through experiments, we show that ChromaTag is significantly faster than current fiducial markers while achieving similar or better detection accuracy. We also show how tag size and viewing direction effect detection accuracy. Our contribution is significant because fiducial markers are often used in real-time applications (e.g. marker assisted robot navigation) where heavy computation is required by other parts of the system.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/DeGol_ChromaTag_A_Colored_ICCV_2017_paper.pdf", @@ -2802,14 +2986,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/DeGol_ChromaTag_A_Colored_ICCV_2017_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{DeGol_2017_ICCV,\n \n author = {\n DeGol,\n Joseph and Bretl,\n Timothy and Hoiem,\n Derek\n},\n title = {\n ChromaTag: A Colored Marker and Fast Detection Algorithm\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Class Rectification Hard Mining for Imbalanced Deep Learning", @@ -2817,6 +3002,7 @@ "status": "Poster", "track": "main", "pid": "695", + "author_site": "Qi Dong; Shaogang Gong; Xiatian Zhu", "author": "Qi Dong; Shaogang Gong; Xiatian Zhu", "abstract": "Recognising detailed facial or clothing attributes in 
images of people is a challenging task for computer vision, especially when the training data are both in very large scale and extremely imbalanced among different attribute classes. To address this problem, we formulate a novel scheme for batch incremental hard sample mining of minority attribute classes from imbalanced large scale training data. We develop an end-to-end deep learning framework capable of avoiding the dominant effect of majority classes by discovering sparsely sampled boundaries of minority classes. This is made possible by introducing a Class Rectification Loss (CRL) regularising algorithm. We demonstrate the advantages and scalability of CRL over existing state-of-the-art attribute recognition and imbalanced data learning models on two large scale imbalanced benchmark datasets, the CelebA facial attribute dataset and the X-Domain clothing attribute dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dong_Class_Rectification_Hard_ICCV_2017_paper.pdf", @@ -2841,7 +3027,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom;" + "aff_country_unique": "United Kingdom;", + "bibtex": "@InProceedings{Dong_2017_ICCV,\n \n author = {\n Dong,\n Qi and Gong,\n Shaogang and Zhu,\n Xiatian\n},\n title = {\n Class Rectification Hard Mining for Imbalanced Deep Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Click Here: Human-Localized Keypoints as Guidance for Viewpoint Estimation", @@ -2849,6 +3036,7 @@ "status": "Poster", "track": "main", "pid": "862", + "author_site": "Ryan Szeto; Jason J. Corso", "author": "Ryan Szeto; Jason J. 
Corso", "abstract": "We motivate and address a human-in-the-loop variant of the monocular viewpoint estimation task in which the location and class of one semantic object keypoint is available at test time. In order to leverage the keypoint information, we devise a Convolutional Neural Network called Click-Here CNN (CH-CNN) that integrates the keypoint information with activations from the layers that process the image. It transforms the keypoint information into a 2D map that can be used to weigh features from certain parts of the image more heavily. The weighted sum of these spatial features is combined with global image features to provide relevant information to the prediction layers. To train our network, we collect a novel dataset of 3D keypoint annotations on thousands of CAD models, and synthetically render millions of images with 2D keypoint information. On test instances from PASCAL 3D+, our model achieves a mean class accuracy of 90.7%, whereas the state-of-the-art baseline only obtains 85.7% mean class accuracy, justifying our argument for human-in-the-loop inference.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Szeto_Click_Here_Human-Localized_ICCV_2017_paper.pdf", @@ -2873,7 +3061,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Szeto_2017_ICCV,\n \n author = {\n Szeto,\n Ryan and Corso,\n Jason J.\n},\n title = {\n Click Here: Human-Localized Keypoints as Guidance for Viewpoint Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Coherent Online Video Style Transfer", @@ -2881,6 +3070,7 @@ "status": "Poster", "track": "main", "pid": "615", + "author_site": "Dongdong Chen; Jing Liao; Lu Yuan; Nenghai Yu; Gang Hua", "author": "Dongdong Chen; Jing Liao; Lu Yuan; 
Nenghai Yu; Gang Hua", "abstract": "Training a feed-forward network for the fast neural style transfer of images has proven successful, but the naive extension of processing videos frame by frame is prone to producing flickering results. We propose the first end-to-end network for online video style transfer, which generates temporally coherent stylized video sequences in near real-time. Two key ideas include an efficient network by incorporating short-term coherence, and propagating short-term coherence to long-term, which ensures consistency over a longer period of time. Our network can incorporate different image stylization networks and clearly outperforms the per-frame baseline both qualitatively and quantitatively. Moreover, it can achieve visually comparable coherence to optimization-based video style transfer, but is three orders of magnitude faster.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Coherent_Online_Video_ICCV_2017_paper.pdf", @@ -2895,7 +3085,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Coherent_Online_Video_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Coherent_Online_Video_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Dongdong and Liao,\n Jing and Yuan,\n Lu and Yu,\n Nenghai and Hua,\n Gang\n},\n title = {\n Coherent Online Video Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Colored Point Cloud Registration Revisited", @@ -2903,6 +3094,7 @@ "status": "Poster", "track": "main", "pid": "183", + "author_site": "Jaesik Park; Qian-Yi Zhou; Vladlen Koltun", "author": "Jaesik Park; Qian-Yi Zhou; Vladlen Koltun", "abstract": "We present an algorithm for tightly aligning two colored point clouds. 
The key idea is to optimize a joint photometric and geometric objective that locks the alignment along both the normal direction and the tangent plane. We extend a photometric objective for aligning RGB-D images to point clouds, by locally parameterizing the point cloud with a virtual camera. Experiments demonstrate that our algorithm is more accurate and more robust than prior point cloud registration algorithms, including those that utilize color information. We use the presented algorithms to enhance a state-of-the-art scene reconstruction system. The accuracy of the resulting system is demonstrated on real-world scenes with accurate ground-truth models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Park_Colored_Point_Cloud_ICCV_2017_paper.pdf", @@ -2918,7 +3110,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Park_Colored_Point_Cloud_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Park_Colored_Point_Cloud_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Park_2017_ICCV,\n \n author = {\n Park,\n Jaesik and Zhou,\n Qian-Yi and Koltun,\n Vladlen\n},\n title = {\n Colored Point Cloud Registration Revisited\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Common Action Discovery and Localization in Unconstrained Videos", @@ -2926,6 +3119,7 @@ "status": "Poster", "track": "main", "pid": "750", + "author_site": "Jiong Yang; Junsong Yuan", "author": "Jiong Yang; Junsong Yuan", "abstract": "Similar to common object discovery in images or videos, it is of great interests to discover and locate common actions in videos, which can benefit many video analytics applications such as video summarization, search, and understanding. 
In this work, we tackle the problem of common action discovery and localization in unconstrained videos, where we do not assume to know the types, numbers or locations of the common actions in the videos. Furthermore, each video can contain zero, one or several common action instances. To perform automatic discovery and localization in such challenging scenarios, we first generate action proposals using human prior. By building an affinity graph among all action proposals, we formulate the common action discovery as a subgraph density maximization problem to select the proposals containing common actions. To avoid enumerating in the exponentially large solution space, we propose an efficient polynomial time optimization algorithm. It solves the problem up to a user specified error bound with respect to the global optimal solution. The experimental results on several datasets show that even without any prior knowledge of common actions, our method can robustly locate the common actions in a collection of videos.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yang_Common_Action_Discovery_ICCV_2017_paper.pdf", @@ -2950,7 +3144,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Singapore", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Yang_2017_ICCV,\n \n author = {\n Yang,\n Jiong and Yuan,\n Junsong\n},\n title = {\n Common Action Discovery and Localization in Unconstrained Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Complex Event Detection by Identifying Reliable Shots From Untrimmed Videos", @@ -2958,10 +3153,11 @@ "status": "Poster", "track": "main", "pid": "326", + "author_site": "Hehe Fan; Xiaojun Chang; De Cheng; Yi Yang; Dong Xu; Alexander G. 
Hauptmann", "author": "Hehe Fan; Xiaojun Chang; De Cheng; Yi Yang; Dong Xu; Alexander G. Hauptmann", "abstract": "The goal of complex event detection is to automatically detect whether an event of interest happens in temporally untrimmed long videos which usually consist of multiple video shots. Observing some video shots in positive (resp. negative) videos are irrelevant (resp. relevant) to the given event class, we formulate this task as a multi-instance learning (MIL) problem by taking each video as a bag and the video shots in each video as instances. To this end, we propose a new MIL method, which simultaneously learns a linear SVM classifier and infers a binary indicator for each instance in order to select reliable training instances from each positive or negative bag. In our new objective function, we balance the weighted training errors and a l1-l2 mixed-norm regularization term which adaptively selects reliable shots as training instances from different videos to have them as diverse as possible. We also develop an alternating optimization approach that can efficiently solve our proposed objective function. 
Extensive experiments on the challenging real-world Multimedia Event Detection (MED) datasets MEDTest-14, MEDTest-13 and CCV clearly demonstrate the effectiveness of our proposed MIL approach for complex event detection.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Fan_Complex_Event_Detection_ICCV_2017_paper.pdf", - "aff": "Centre for Arti\ufb01cial Intelligence, University of Technology Sydney, Sydney, Australia; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA; Institute of arti\ufb01cial intelligence and robotics, Xi\u2019an Jiaotong University, Xi\u2019an, China; Centre for Arti\ufb01cial Intelligence, University of Technology Sydney, Sydney, Australia; School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA", + "aff": "Centre for Artificial Intelligence, University of Technology Sydney, Sydney, Australia; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA; Institute of artificial intelligence and robotics, Xi’an Jiaotong University, Xi’an, China; Centre for Artificial Intelligence, University of Technology Sydney, Sydney, Australia; School of Electrical and Information Engineering, The University of Sydney, Sydney, Australia; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA", "project": "", "github": "", "supp": "", @@ -2975,14 +3171,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Fan_Complex_Event_Detection_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0;3;1", - "aff_unique_norm": "University of Technology Sydney;Carnegie Mellon University;Xi'an Jiao Tong University;University of Sydney", - "aff_unique_dep": "Centre for Arti\ufb01cial Intelligence;School of Computer Science;Institute of Artificial Intelligence and Robotics;School of Electrical and Information Engineering", + "aff_unique_norm": "University of Technology 
Sydney;Carnegie Mellon University;Xi'an Jiaotong University;The University of Sydney", + "aff_unique_dep": "Centre for Artificial Intelligence;School of Computer Science;Institute of Artificial Intelligence and Robotics;School of Electrical and Information Engineering", "aff_unique_url": "https://www.uts.edu.au;https://www.cmu.edu;http://www.xjtu.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "UTS;CMU;XJTU;USYD", "aff_campus_unique_index": "0;1;2;0;0;1", "aff_campus_unique": "Sydney;Pittsburgh;Xi'an", "aff_country_unique_index": "0;1;2;0;0;1", - "aff_country_unique": "Australia;United States;China" + "aff_country_unique": "Australia;United States;China", + "bibtex": "@InProceedings{Fan_2017_ICCV,\n \n author = {\n Fan,\n Hehe and Chang,\n Xiaojun and Cheng,\n De and Yang,\n Yi and Xu,\n Dong and Hauptmann,\n Alexander G.\n},\n title = {\n Complex Event Detection by Identifying Reliable Shots From Untrimmed Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Composite Focus Measure for High Quality Depth Maps", @@ -2990,6 +3187,7 @@ "status": "Poster", "track": "main", "pid": "754", + "author_site": "Parikshit Sakurikar; P. J. Narayanan", "author": "Parikshit Sakurikar; P. J. Narayanan", "abstract": "Depth from focus is a highly accessible method to estimate the 3D structure of everyday scenes. Today's DSLR and mobile cameras facilitate the easy capture of multiple focused images of a scene. Focus measures (FMs) that estimate the amount of focus at each pixel form the basis of depth-from-focus methods. Several FMs have been proposed in the past and new ones will emerge in the future, each with their own strengths. We estimate a weighted combination of standard FMs that outperforms others on a wide range of scene types. The resulting composite focus measure consists of FMs that are in consensus with one another but not in chorus. 
Our two-stage pipeline first estimates fine depth at each pixel using the composite focus measure. A cost-volume propagation step then assigns depths from confident pixels to others. We can generate high quality depth maps using just the top five FMs from our composite focus measure. This is a positive step towards depth estimation of everyday scenes with no special equipment.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sakurikar_Composite_Focus_Measure_ICCV_2017_paper.pdf", @@ -3014,7 +3212,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hyderabad", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Sakurikar_2017_ICCV,\n \n author = {\n Sakurikar,\n Parikshit and Narayanan,\n P. J.\n},\n title = {\n Composite Focus Measure for High Quality Depth Maps\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Compositional Human Pose Regression", @@ -3022,6 +3221,7 @@ "status": "Poster", "track": "main", "pid": "971", + "author_site": "Xiao Sun; Jiaxiang Shang; Shuang Liang; Yichen Wei", "author": "Xiao Sun; Jiaxiang Shang; Shuang Liang; Yichen Wei", "abstract": "Regression based methods are not performing as well as detection based methods for human pose estimation. A central problem is that the structural information in the pose is not well exploited in the previous regression methods. In this work, we propose a structure-aware regression approach. It adopts a reparameterized pose representation using bones instead of joints. It exploits the joint connection structure to define a compositional loss function that encodes the long range interactions in the pose. It is simple, effective, and general for both 2D and 3D pose estimation in a unified setting. Comprehensive evaluation validates the effectiveness of our approach. 
It significantly advances the state-of-the-art on Human3.6M and is competitive with state-of-the-art results on MPII.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sun_Compositional_Human_Pose_ICCV_2017_paper.pdf", @@ -3039,14 +3239,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sun_Compositional_Human_Pose_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Microsoft;Tongji University", + "aff_unique_norm": "Microsoft Corporation;Tongji University", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.tongji.edu.cn", "aff_unique_abbr": "MSR;Tongji", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Sun_2017_ICCV,\n \n author = {\n Sun,\n Xiao and Shang,\n Jiaxiang and Liang,\n Shuang and Wei,\n Yichen\n},\n title = {\n Compositional Human Pose Regression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Compressive Quantization for Fast Object Instance Search in Videos", @@ -3054,6 +3255,7 @@ "status": "Poster", "track": "main", "pid": "299", + "author_site": "Tan Yu; Zhenzhen Wang; Junsong Yuan", "author": "Tan Yu; Zhenzhen Wang; Junsong Yuan", "abstract": "Most of current visual search systems focus on image-to-image (point-to-point) search such as image and object retrieval. Nevertheless, fast image-to-video (point-to-set) search is much less exploited. This paper tackles object instance search in videos, where efficient point-to-set matching is essential. Through jointly optimizing vector quantization and hashing, we propose compressive quantization method to compress M object proposals extracted from each video into only k binary codes, where k<< M. 
Then the similarity between the query object and the whole video can be determined by the Hamming distance between the query's binary code and the video's best-matched binary code. Our compressive quantization not only enables fast search but also significantly reduces the memory cost of storing the video features. Despite the high compression ratio, our proposed compressive quantization still can effectively retrieve small objects in large video datasets. Systematic experiments on three benchmark datasets verify the effectiveness and efficiency of our compressive quantization.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_Compressive_Quantization_for_ICCV_2017_paper.pdf", @@ -3078,7 +3280,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Singapore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Tan and Wang,\n Zhenzhen and Yuan,\n Junsong\n},\n title = {\n Compressive Quantization for Fast Object Instance Search in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Consensus Convolutional Sparse Coding", @@ -3086,6 +3289,7 @@ "status": "Poster", "track": "main", "pid": "1686", + "author_site": "Biswarup Choudhury; Robin Swanson; Felix Heide; Gordon Wetzstein; Wolfgang Heidrich", "author": "Biswarup Choudhury; Robin Swanson; Felix Heide; Gordon Wetzstein; Wolfgang Heidrich", "abstract": "Convolutional sparse coding (CSC) is a promising direction for unsupervised learning in computer vision. In contrast to recent supervised methods, CSC allows for convolutional image representations to be learned that are equally useful for high-level vision tasks and low-level image reconstruction and can be applied to a wide range of tasks without problem-specific retraining. 
Due to their extreme memory requirements, however, existing CSC solvers have so far been limited to low-dimensional problems and datasets using a handful of low-resolution example images at a time. In this paper, we propose a new approach to solving CSC as a consensus optimization problem, which lifts these limitations. By learning CSC features from large-scale image datasets for the first time, we achieve significant quality improvements in a number of imaging tasks. Moreover, the proposed method enables new applications in high-dimensional feature learning that has been intractable using existing CSC methods. This is demonstrated for a variety of reconstruction problems across diverse problem domains, including 3D multispectral demosaicing and 4D light field view synthesis.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Choudhury_Consensus_Convolutional_Sparse_ICCV_2017_paper.pdf", @@ -3110,7 +3314,8 @@ "aff_campus_unique_index": ";1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0+1;2;2;0", - "aff_country_unique": "Saudi Arabia;Canada;United States" + "aff_country_unique": "Saudi Arabia;Canada;United States", + "bibtex": "@InProceedings{Choudhury_2017_ICCV,\n \n author = {\n Choudhury,\n Biswarup and Swanson,\n Robin and Heide,\n Felix and Wetzstein,\n Gordon and Heidrich,\n Wolfgang\n},\n title = {\n Consensus Convolutional Sparse Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Constrained Convolutional Sparse Coding for Parametric Based Reconstruction of Line Drawings", @@ -3118,6 +3323,7 @@ "status": "Poster", "track": "main", "pid": "1769", + "author_site": "Sara Shaheen; Lama Affara; Bernard Ghanem", "author": "Sara Shaheen; Lama Affara; Bernard Ghanem", "abstract": "Convolutional sparse coding (CSC) plays an essential role in many computer vision applications ranging from image compression to deep 
learning. In this work, we spot the light on a new application where CSC can effectively serve, namely line drawing analysis. The process of drawing a line drawing can be approximated as the sparse spatial localization of a number of typical basic strokes, which in turn can be cast as a non-standard CSC model that considers the line drawing formation process from parametric curves. These curves are learned to optimize the fit between the model and a specific set of line drawings. Parametric representation of sketches is vital in enabling automatic sketch analysis, synthesis and manipulation. A couple of sketch manipulation examples are demonstrated in this work. Consequently, our novel method is expected to provide a reliable and automatic method for parametric sketch description. Through experiments, we empirically validate the convergence of our method to a feasible solution.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shaheen_Constrained_Convolutional_Sparse_ICCV_2017_paper.pdf", @@ -3142,7 +3348,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Shaheen_2017_ICCV,\n \n author = {\n Shaheen,\n Sara and Affara,\n Lama and Ghanem,\n Bernard\n},\n title = {\n Constrained Convolutional Sparse Coding for Parametric Based Reconstruction of Line Drawings\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Convergence Analysis of MAP Based Blur Kernel Estimation", @@ -3150,6 +3357,7 @@ "status": "Poster", "track": "main", "pid": "2441", + "author_site": "Sunghyun Cho; Seungyong Lee", "author": "Sunghyun Cho; Seungyong Lee", "abstract": "One popular approach for blind deconvolution is to formulate a maximum a posteriori (MAP) problem with sparsity priors on the gradients of the latent 
image, and then alternatingly estimate the blur kernel and the latent image. While several successful MAP based methods have been proposed, there has been much controversy and confusion about their convergence, because sparsity priors have been shown to prefer blurry images to sharp natural images. In this paper, we revisit this problem and provide an analysis on the convergence of MAP based approaches. We first introduce a slight modification to a conventional joint energy function for blind deconvolution. The reformulated energy function yields the same alternating estimation process, but more clearly reveals how blind deconvolution works. We then show the energy function can actually favor the right solution instead of the no-blur solution under certain conditions, which explains the success of previous MAP based approaches. The reformulated energy function and our conditions for the convergence also provide a way to compare the qualities of different blur kernels, and we demonstrate its applicability to automatic blur kernel size selection, blur kernel estimation using light streaks, and defocus estimation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cho_Convergence_Analysis_of_ICCV_2017_paper.pdf", @@ -3174,7 +3382,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cho_2017_ICCV,\n \n author = {\n Cho,\n Sunghyun and Lee,\n Seungyong\n},\n title = {\n Convergence Analysis of MAP Based Blur Kernel Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Convolutional Dictionary Learning via Local Processing", @@ -3182,6 +3391,7 @@ "status": "Spotlight", "track": "main", "pid": "1684", + "author_site": "Vardan Papyan; Yaniv Romano; Jeremias Sulam; Michael Elad", "author": 
"Vardan Papyan; Yaniv Romano; Jeremias Sulam; Michael Elad", "abstract": "Convolutional Sparse Coding (CSC) is an increasingly popular model in the signal and image processing communities, tackling some of the limitations of traditional patch-based sparse representations. Although several works have addressed the dictionary learning problem under this model, these relied on an ADMM formulation in the Fourier domain, losing the sense of locality and the relation to the traditional patch-based sparse pursuit. A recent work suggested a novel theoretical analysis of this global model, providing guarantees that rely on a localized sparsity measure. Herein, we extend this local-global relation by showing how one can efficiently solve the convolutional sparse pursuit problem and train the filters involved, while operating locally on image patches. Our approach provides an intuitive algorithm that can leverage standard techniques from the sparse representations field. The proposed method is fast to train, simple to implement, and flexible enough that it can be easily deployed in a variety of applications. 
We demonstrate the proposed training scheme for image inpainting and image separation, while achieving state-of-the-art results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Papyan_Convolutional_Dictionary_Learning_ICCV_2017_paper.pdf", @@ -3196,7 +3406,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Papyan_Convolutional_Dictionary_Learning_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Papyan_Convolutional_Dictionary_Learning_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Papyan_2017_ICCV,\n \n author = {\n Papyan,\n Vardan and Romano,\n Yaniv and Sulam,\n Jeremias and Elad,\n Michael\n},\n title = {\n Convolutional Dictionary Learning via Local Processing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Coordinating Filters for Faster Deep Neural Networks", @@ -3204,6 +3415,7 @@ "status": "Poster", "track": "main", "pid": "279", + "author_site": "Wei Wen; Cong Xu; Chunpeng Wu; Yandan Wang; Yiran Chen; Hai Li", "author": "Wei Wen; Cong Xu; Chunpeng Wu; Yandan Wang; Yiran Chen; Hai Li", "abstract": "Very large-scale Deep Neural Networks (DNNs) have achieved remarkable successes in a large variety of computer vision tasks. However, the high computation intensity of DNNs makes it challenging to deploy these models on resource-limited systems. Some studies used low-rank approaches that approximate the filters by low-rank basis to accelerate the testing. Those works directly decomposed the pre-trained DNNs by Low-Rank Approximations (LRA). How to train DNNs toward lower-rank space for more efficient DNNs, however, remains as an open area. To solve the issue, in this work, we propose Force Regularization, which uses attractive forces to enforce filters so as to coordinate more weight information into lower-rank space. 
We mathematically and empirically verify that after applying our technique, standard LRA methods can reconstruct filters using much lower basis and thus result in faster DNNs. The effectiveness of our approach is comprehensively evaluated in ResNets, AlexNet, and GoogLeNet. In AlexNet, for example, Force Regularization gains 2x speedup on modern GPU without accuracy loss and 4.05x speedup on CPU by paying small accuracy degradation. Moreover, Force Regularization better initializes the low-rank DNNs such that the fine-tuning can converge faster toward higher accuracy. The obtained lower-rank DNNs can be further sparsified, proving that Force Regularization can be integrated with state-of-the-art sparsity-based acceleration methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wen_Coordinating_Filters_for_ICCV_2017_paper.pdf", @@ -3228,7 +3440,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wen_2017_ICCV,\n \n author = {\n Wen,\n Wei and Xu,\n Cong and Wu,\n Chunpeng and Wang,\n Yandan and Chen,\n Yiran and Li,\n Hai\n},\n title = {\n Coordinating Filters for Faster Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Corner-Based Geometric Calibration of Multi-Focus Plenoptic Cameras", @@ -3236,7 +3449,7 @@ "status": "Poster", "track": "main", "pid": "622", - "author_site": "Sotiris Nousias; Fran\u00c3\u00a7ois Chadebecq; Jonas Pichat; Pearse Keane; S\u00c3\u00a9bastien Ourselin; Christos Bergeles", + "author_site": "Sotiris Nousias; François Chadebecq; Jonas Pichat; Pearse Keane; Sébastien Ourselin; Christos Bergeles", "author": "Sotiris Nousias; Francois Chadebecq; Jonas Pichat; Pearse Keane; Sebastien Ourselin; Christos Bergeles", "abstract": "We 
propose a method for geometric calibration of multi-focus plenoptic cameras using raw images. Multi-focus plenoptic cameras feature several types of micro-lenses spatially aligned in front of the camera sensor to generate micro-images at different magnifications. This multi-lens arrangement provides computational-photography benefits but complicates calibration. Our methodology achieves the detection of the type of micro-lenses, the retrieval of their spatial arrangement, and the estimation of intrinsic and extrinsic camera parameters therefore fully characterising this specialised camera class. Motivated from classic pinhole camera calibration, the presented algorithm operates based on a checker-board's corners, retrieved by a custom micro-image corner detector. This approach enables the introduction of a re-projection error that is used in a minimisation framework. Our algorithm compares favourably to the state-of-the-art, as demonstrated by controlled and free-hand experiments, making it a first step towards accurate 3D reconstruction and Structure-from-Motion.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nousias_Corner-Based_Geometric_Calibration_ICCV_2017_paper.pdf", @@ -3261,7 +3474,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0+0;0+0", "aff_campus_unique": "London;", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Nousias_2017_ICCV,\n \n author = {\n Nousias,\n Sotiris and Chadebecq,\n Francois and Pichat,\n Jonas and Keane,\n Pearse and Ourselin,\n Sebastien and Bergeles,\n Christos\n},\n title = {\n Corner-Based Geometric Calibration of Multi-Focus Plenoptic Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "CoupleNet: Coupling Global Structure With Local Parts for Object Detection", @@ -3269,6 +3483,7 @@ 
"status": "Poster", "track": "main", "pid": "1727", + "author_site": "Yousong Zhu; Chaoyang Zhao; Jinqiao Wang; Xu Zhao; Yi Wu; Hanqing Lu", "author": "Yousong Zhu; Chaoyang Zhao; Jinqiao Wang; Xu Zhao; Yi Wu; Hanqing Lu", "abstract": "The region-based Convolutional Neural Network (CNN) detectors such as Faster R-CNN or R-FCN have already shown promising results for object detection by combining the region proposal subnetwork and the classification subnetwork together. Although R-FCN has achieved higher detection speed while keeping the detection performance, the global structure information is ignored by the position-sensitive score maps. To fully explore the local and global properties, in this paper, we propose a novel fully convolutional network, named as CoupleNet, to couple the global structure with local parts for object detection. Specifically, the object proposals obtained by the Region Proposal Network (RPN) are fed into the the coupling module which consists of two branches. One branch adopts the position-sensitive RoI (PSRoI) pooling to capture the local part information of the object, while the other employs the RoI pooling to encode the global and context information. Next, we design different coupling strategies and normalization ways to make full use of the complementary advantages between the global and local branches. Extensive experiments demonstrate the effectiveness of our approach. We achieve state-of-the-art results on all three challenging datasets, i.e. 
a mAP of 82.7% on VOC07, 80.4% on VOC12, and 34.4% on COCO.Codes will be made publicly available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_CoupleNet_Coupling_Global_ICCV_2017_paper.pdf", @@ -3289,11 +3504,12 @@ "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Nanjing Audit University;Indiana University", "aff_unique_dep": "Institute of Automation;;School of Technology;Department of Medicine", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;http://www.nau.edu.cn/;https://iu.edu", - "aff_unique_abbr": "CAS;UCAS;NAU;IU", + "aff_unique_abbr": "CAS;UCAS;;IU", "aff_campus_unique_index": "0;0;0;0;2+3;0", "aff_campus_unique": "Beijing;;Nanjing;Indianapolis", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Yousong and Zhao,\n Chaoyang and Wang,\n Jinqiao and Zhao,\n Xu and Wu,\n Yi and Lu,\n Hanqing\n},\n title = {\n CoupleNet: Coupling Global Structure With Local Parts for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Cross-Modal Deep Variational Hashing", @@ -3301,6 +3517,7 @@ "status": "Poster", "track": "main", "pid": "1593", + "author_site": "Venice Erin Liong; Jiwen Lu; Yap-Peng Tan; Jie Zhou", "author": "Venice Erin Liong; Jiwen Lu; Yap-Peng Tan; Jie Zhou", "abstract": "In this paper, we propose a cross-modal deep variational hashing (CMDVH) method to learn compact binary codes for cross-modality multimedia retrieval. 
Unlike most existing cross-modal hashing methods which learn a single pair of projections to map each example into a binary vector, we design a deep fusion neural network to learn non-linear transformations from image-text input pairs, such that a unified binary code is achieved in a discrete and discriminative manner using a classification-based hinge-loss criterion. We then design modality-specific neural networks in a probabilistic manner such that we model a latent variable to be close as possible from the inferred binary codes, at the same time approximated by a posterior distribution regularized by a known prior, which is suitable for out-of-sample extension. Experimental results on three benchmark datasets show the efficacy of the proposed approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liong_Cross-Modal_Deep_Variational_ICCV_2017_paper.pdf", @@ -3315,7 +3532,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liong_Cross-Modal_Deep_Variational_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liong_Cross-Modal_Deep_Variational_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Liong_2017_ICCV,\n \n author = {\n Erin Liong,\n Venice and Lu,\n Jiwen and Tan,\n Yap-Peng and Zhou,\n Jie\n},\n title = {\n Cross-Modal Deep Variational Hashing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Cross-View Asymmetric Metric Learning for Unsupervised Person Re-Identification", @@ -3323,6 +3541,7 @@ "status": "Poster", "track": "main", "pid": "378", + "author_site": "Hong-Xing Yu; Ancong Wu; Wei-Shi Zheng", "author": "Hong-Xing Yu; Ancong Wu; Wei-Shi Zheng", "abstract": "While metric learning is important for Person re-identification (RE-ID), a significant problem in visual surveillance for cross-view pedestrian matching, existing 
metric models for RE-ID are mostly based on supervised learning that requires quantities of labeled samples in all pairs of camera views for training. However, this limits their scalabilities to realistic applications, in which a large amount of data over multiple disjoint camera views is available but not labelled. To overcome the problem, we propose an unsupervised asymmetric metric learning model for unsupervised RE-ID. Our model aims to learn an asymmetric metric, i.e., specific projection for each view, effectively based on clustering on cross-view person images. Our model finds a shared space where view-specific bias is alleviated and thus better matching performance can be achieved. Extensive experiments have been conducted on a baseline and five large-scale RE-ID datasets to demonstrate the effectiveness of the proposed model. Through the comparison, we show that our unsupervised asymmetric metric model works much more suitable for unsupervised RE-ID as compared to classical unsupervised metric learning models. 
We also compare existing unsupervised RE-ID methods, and our model outperforms them with notable margins, and especially we report the performance on large-scale unlabelled RE-ID dataset, which is unfortunately less concerned in literatures.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_Cross-View_Asymmetric_Metric_ICCV_2017_paper.pdf", @@ -3347,7 +3566,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Hong-Xing and Wu,\n Ancong and Zheng,\n Wei-Shi\n},\n title = {\n Cross-View Asymmetric Metric Learning for Unsupervised Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Curriculum Domain Adaptation for Semantic Segmentation of Urban Scenes", @@ -3355,6 +3575,7 @@ "status": "Poster", "track": "main", "pid": "725", + "author_site": "Yang Zhang; Philip David; Boqing Gong", "author": "Yang Zhang; Philip David; Boqing Gong", "abstract": "During the last half decade, convolutional neural networks (CNNs) have triumphed over semantic segmentation, which is a core task of various emerging industrial applications such as autonomous driving and medical imaging. However, to train CNNs requires a huge amount of data, which is difficult to collect and laborious to annotate. Recent advances in computer graphics make it possible to train CNN models on photo-realistic synthetic data with computer-generated annotations. Despite this, the domain mismatch between the real images and the synthetic data significantly decreases the models' performance. Hence we propose a curriculum-style learning approach to minimize the domain gap in semantic segmentation. 
The curriculum domain adaptation solves easy tasks first in order to infer some necessary properties about the target domain; in particular, the first task is to learn global label distributions over images and local distributions over landmark superpixels. These are easy to estimate because images of urban traffic scenes have strong idiosyncrasies (e.g., the size and spatial relations of buildings, streets, cars, etc.). We then train the segmentation network in such a way that the network predictions in the target domain follow those inferred properties. In experiments, our method significantly outperforms the baselines as well as the only known existing approach to the same problem.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Curriculum_Domain_Adaptation_ICCV_2017_paper.pdf", @@ -3379,7 +3600,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Orlando;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Yang and David,\n Philip and Gong,\n Boqing\n},\n title = {\n Curriculum Domain Adaptation for Semantic Segmentation of Urban Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Curriculum Dropout", @@ -3387,11 +3609,11 @@ "status": "Poster", "track": "main", "pid": "1346", - "author_site": "Pietro Morerio; Jacopo Cavazza; Riccardo Volpi; Ren\u00c3\u00a9 Vidal; Vittorio Murino", + "author_site": "Pietro Morerio; Jacopo Cavazza; Riccardo Volpi; René Vidal; Vittorio Murino", "author": "Pietro Morerio; Jacopo Cavazza; Riccardo Volpi; Rene Vidal; Vittorio Murino", "abstract": "Dropout is a very effective way of regularizing neural networks. 
Stochastically dropping out units with a certain probability discourages over-specific co-adaptations of feature detectors, preventing overfitting and improving network generalization. Besides, Dropout can be interpreted as an approximate model aggregation technique, where an exponential number of smaller networks are averaged in order to get a more powerful ensemble. In this paper, we show that using a fixed dropout probability during training is a suboptimal choice. We thus propose a time scheduling for the probability of retaining neurons in the network. This induces an adaptive regularization scheme that smoothly increases the difficulty of the optimization problem. This idea of starting easy and adaptively increasing the difficulty of the learning problem has its roots in curriculum learning and allows one to train better models. Indeed, we prove that our optimization strategy implements a very general curriculum scheme, by gradually adding noise to both the input and intermediate feature representations within the network architecture. 
Experiments on seven image classification datasets and different network architectures show that our method, named Curriculum Dropout, frequently yields to better generalization and, at worst, performs just as well as the standard Dropout method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Morerio_Curriculum_Dropout_ICCV_2017_paper.pdf", - "aff": "Pattern Analysis & Computer Vision (PA VIS) \u2013 Istituto Italiano di Tecnologia \u2013 Genova, 16163, Italy; Pattern Analysis & Computer Vision (PA VIS) \u2013 Istituto Italiano di Tecnologia \u2013 Genova, 16163, Italy + Electrical, Electronics and Telecommunication Engineering and Naval Architecture Department (DITEN) \u2013 Universit \u00e0 degli Studi di Genova \u2013 Genova, 16145, Italy; Pattern Analysis & Computer Vision (PA VIS) \u2013 Istituto Italiano di Tecnologia \u2013 Genova, 16163, Italy + Electrical, Electronics and Telecommunication Engineering and Naval Architecture Department (DITEN) \u2013 Universit \u00e0 degli Studi di Genova \u2013 Genova, 16145, Italy; Department of Biomedical Engineering \u2013 Johns Hopkins University \u2013 Baltimore, MD 21218, USA; Pattern Analysis & Computer Vision (PA VIS) \u2013 Istituto Italiano di Tecnologia \u2013 Genova, 16163, Italy + Computer Science Department \u2013 Universit \u00e0 di Verona \u2013 Verona, 37134, Italy", + "aff": "Pattern Analysis & Computer Vision (PA VIS) – Istituto Italiano di Tecnologia – Genova, 16163, Italy; Pattern Analysis & Computer Vision (PA VIS) – Istituto Italiano di Tecnologia – Genova, 16163, Italy + Electrical, Electronics and Telecommunication Engineering and Naval Architecture Department (DITEN) – Universit à degli Studi di Genova – Genova, 16145, Italy; Pattern Analysis & Computer Vision (PA VIS) – Istituto Italiano di Tecnologia – Genova, 16163, Italy + Electrical, Electronics and Telecommunication Engineering and Naval Architecture Department (DITEN) – Universit à degli Studi di Genova – Genova, 16145, 
Italy; Department of Biomedical Engineering – Johns Hopkins University – Baltimore, MD 21218, USA; Pattern Analysis & Computer Vision (PA VIS) – Istituto Italiano di Tecnologia – Genova, 16163, Italy + Computer Science Department – Universit à di Verona – Verona, 37134, Italy", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Morerio_Curriculum_Dropout_ICCV_2017_supplemental.pdf", @@ -3405,14 +3627,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Morerio_Curriculum_Dropout_ICCV_2017_paper.html", "aff_unique_index": "0;0+1;0+1;2;0+3", - "aff_unique_norm": "Istituto Italiano di Tecnologia;Universit\u00e0 degli Studi di Genova;Johns Hopkins University;Universit\u00e0 di Verona", + "aff_unique_norm": "Istituto Italiano di Tecnologia;Università degli Studi di Genova;Johns Hopkins University;Università di Verona", "aff_unique_dep": "Pattern Analysis & Computer Vision (PA VIS);Electrical, Electronics and Telecommunication Engineering and Naval Architecture Department (DITEN);Department of Biomedical Engineering;Computer Science Department", "aff_unique_url": "https://www.iit.it;https://www.unige.it;https://www.jhu.edu;https://www.univr.it", "aff_unique_abbr": "IIT;;JHU;", "aff_campus_unique_index": "0;0+0;0+0;1;0+2", "aff_campus_unique": "Genova;Baltimore;Verona", "aff_country_unique_index": "0;0+0;0+0;1;0+0", - "aff_country_unique": "Italy;United States" + "aff_country_unique": "Italy;United States", + "bibtex": "@InProceedings{Morerio_2017_ICCV,\n \n author = {\n Morerio,\n Pietro and Cavazza,\n Jacopo and Volpi,\n Riccardo and Vidal,\n Rene and Murino,\n Vittorio\n},\n title = {\n Curriculum Dropout\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Cut, Paste and Learn: Surprisingly Easy Synthesis for Instance Detection", @@ -3420,6 +3643,7 @@ "status": "Poster", "track": 
"main", "pid": "588", + "author_site": "Debidatta Dwibedi; Ishan Misra; Martial Hebert", "author": "Debidatta Dwibedi; Ishan Misra; Martial Hebert", "abstract": "A major impediment in rapidly deploying object detection models for instance detection is the lack of large annotated datasets. For example, finding a large labeled dataset containing instances in a particular kitchen is unlikely. Each new environment with new instances requires expensive data collection and annotation. In this paper, we propose a simple approach to generate large annotated instance datasets with minimal effort. Our key insight is that ensuring only patch-level realism provides enough training signal for current object detector models. We automatically `cut' object instances and `paste' them on random backgrounds. A naive way to do this results in pixel artifacts which result in poor performance for trained models. We show how to make detectors ignore these artifacts during training and generate data that gives competitive performance on real data. Our method outperforms existing synthesis approaches and when combined with real images improves relative performance by more than 21% on benchmark datasets. 
In a cross-domain setting, our synthetic data combined with just 10% real data outperforms models trained on all real data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dwibedi_Cut_Paste_and_ICCV_2017_paper.pdf", @@ -3444,7 +3668,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dwibedi_2017_ICCV,\n \n author = {\n Dwibedi,\n Debidatta and Misra,\n Ishan and Hebert,\n Martial\n},\n title = {\n Cut,\n Paste and Learn: Surprisingly Easy Synthesis for Instance Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DCTM: Discrete-Continuous Transformation Matching for Semantic Flow", @@ -3452,6 +3677,7 @@ "status": "Oral", "track": "main", "pid": "2581", + "author_site": "Seungryong Kim; Dongbo Min; Stephen Lin; Kwanghoon Sohn", "author": "Seungryong Kim; Dongbo Min; Stephen Lin; Kwanghoon Sohn", "abstract": "Techniques for dense semantic correspondence have provided limited ability to deal with the geometric variations that commonly exist between semantically similar images. While variations due to scale and rotation have been examined, there is a lack of practical solutions for more complex deformations such as affine transformations because of the tremendous size of the associated solution space. To address this problem, we present a discrete-continuous transformation matching (DCTM) framework where dense affine transformation fields are inferred through a discrete label optimization in which the labels are iteratively updated via continuous regularization. 
In this way, our approach draws solutions from the continuous space of affine transformations in a manner that can be computed efficiently through constant-time edge-aware filtering and a proposed affine-varying CNN-based descriptor. Experimental results show that this model outperforms the state-of-the-art methods for dense semantic correspondence on various benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kim_DCTM_Discrete-Continuous_Transformation_ICCV_2017_paper.pdf", @@ -3469,14 +3695,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_DCTM_Discrete-Continuous_Transformation_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Yonsei University;Chungnam National University;Microsoft", + "aff_unique_norm": "Yonsei University;Chungnam National University;Microsoft Corporation", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.yonsei.ac.kr;http://www.cnu.ac.kr;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Yonsei;CNU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2017_ICCV,\n \n author = {\n Kim,\n Seungryong and Min,\n Dongbo and Lin,\n Stephen and Sohn,\n Kwanghoon\n},\n title = {\n DCTM: Discrete-Continuous Transformation Matching for Semantic Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DSLR-Quality Photos on Mobile Devices With Deep Convolutional Networks", @@ -3484,10 +3711,11 @@ "status": "Poster", "track": "main", "pid": "1365", + "author_site": "Andrey Ignatov; Nikolay Kobyshev; Radu Timofte; Kenneth Vanhoey; Luc Van Gool", "author": "Andrey Ignatov; Nikolay Kobyshev; Radu Timofte; Kenneth Vanhoey; Luc Van Gool", 
"abstract": "Despite a rapid rise in the quality of built-in smartphone cameras, their physical limitations - small sensor size, compact lenses and the lack of specific hardware, - impede them to achieve the quality results of DSLR cameras. In this work we present an end-to-end deep learning approach that bridges this gap by translating ordinary photos into DSLR-quality images. We propose learning the translation function using a residual convolutional neural network that improves both color rendition and image sharpness. Since the standard mean squared loss is not well suited for measuring perceptual image quality, we introduce a composite perceptual error function that combines content, color and texture losses. The first two losses are defined analytically, while the texture loss is learned in an adversarial fashion. We also present DPED, a large-scale dataset that consists of real photos captured from three different phones and one high-end reflex camera. Our quantitative and qualitative assessments reveal that the enhanced image quality is comparable to that of DSLR-taken photos, while the methodology is generalized to any type of digital camera.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ignatov_DSLR-Quality_Photos_on_ICCV_2017_paper.pdf", - "aff": "Computer Vision Laboratory, ETH Z\u00fcrich, Switzerland; Computer Vision Laboratory, ETH Z\u00fcrich, Switzerland; Computer Vision Laboratory, ETH Z\u00fcrich, Switzerland; Computer Vision Laboratory, ETH Z\u00fcrich, Switzerland; Computer Vision Laboratory, ETH Z\u00fcrich, Switzerland+ESAT - PSI, KU Leuven, Belgium", + "aff": "Computer Vision Laboratory, ETH Zürich, Switzerland; Computer Vision Laboratory, ETH Zürich, Switzerland; Computer Vision Laboratory, ETH Zürich, Switzerland; Computer Vision Laboratory, ETH Zürich, Switzerland; Computer Vision Laboratory, ETH Zürich, Switzerland+ESAT - PSI, KU Leuven, Belgium", "project": "", "github": "", "supp": 
"http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Ignatov_DSLR-Quality_Photos_on_ICCV_2017_supplemental.pdf", @@ -3501,14 +3729,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ignatov_DSLR-Quality_Photos_on_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0+1", - "aff_unique_norm": "ETH Zurich;KU Leuven", + "aff_unique_norm": "ETH Zürich;KU Leuven", "aff_unique_dep": "Computer Vision Laboratory;ESAT - PSI", "aff_unique_url": "https://www.ethz.ch;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;KU Leuven", "aff_campus_unique_index": "0;0;0;0;0", - "aff_campus_unique": "Z\u00fcrich;", + "aff_campus_unique": "Zürich;", "aff_country_unique_index": "0;0;0;0;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Ignatov_2017_ICCV,\n \n author = {\n Ignatov,\n Andrey and Kobyshev,\n Nikolay and Timofte,\n Radu and Vanhoey,\n Kenneth and Van Gool,\n Luc\n},\n title = {\n DSLR-Quality Photos on Mobile Devices With Deep Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DSOD: Learning Deeply Supervised Object Detectors From Scratch", @@ -3516,6 +3745,7 @@ "status": "Poster", "track": "main", "pid": "767", + "author_site": "Zhiqiang Shen; Zhuang Liu; Jianguo Li; Yu-Gang Jiang; Yurong Chen; Xiangyang Xue", "author": "Zhiqiang Shen; Zhuang Liu; Jianguo Li; Yu-Gang Jiang; Yurong Chen; Xiangyang Xue", "abstract": "We present Deeply Supervised Object Detector (DSOD), a framework that can learn object detectors from scratch. State-of-the-art object objectors rely heavily on the off-the-shelf networks pre-trained on large-scale classification datasets like ImageNet, which incurs learning bias due to the difference on both the loss functions and the category distributions between classification and detection tasks. 
Model fine-tuning for the detection task could alleviate this bias to some extent but not fundamentally. Besides, transferring pre-trained models from classification to detection between discrepant domains is even more difficult (e.g. RGB to depth images). A better solution to tackle these two critical problems is to train object detectors from scratch, which motivates our proposed DSOD. Previous efforts in this direction mostly failed due to much more complicated loss functions and limited training data in object detection. In DSOD, we contribute a set of design principles for training object detectors from scratch. One of the key findings is that deep supervision, enabled by dense layer-wise connections, plays a critical role in learning a good detector. Combining with several other principles, we develop DSOD following the single-shot detection (SSD) framework. Experiments on PASCAL VOC 2007, 2012 and MS COCO datasets demonstrate that DSOD can achieve better results than the state-of-the-art solutions with much more compact models. 
For instance, DSOD outperforms SSD on all three benchmarks with real-time detection speed, while requires only 1/2 parameters to SSD and 1/10 parameters to Faster RCNN.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shen_DSOD_Learning_Deeply_ICCV_2017_paper.pdf", @@ -3533,14 +3763,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shen_DSOD_Learning_Deeply_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0;2;0", - "aff_unique_norm": "Fudan University;Tsinghua University;Intel", + "aff_unique_norm": "Fudan University;Tsinghua University;Intel Corporation", "aff_unique_dep": ";;Intel Labs", "aff_unique_url": "https://www.fudan.edu.cn;https://www.tsinghua.edu.cn;https://www.intel.cn", "aff_unique_abbr": "Fudan;THU;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shen_2017_ICCV,\n \n author = {\n Shen,\n Zhiqiang and Liu,\n Zhuang and Li,\n Jianguo and Jiang,\n Yu-Gang and Chen,\n Yurong and Xue,\n Xiangyang\n},\n title = {\n DSOD: Learning Deeply Supervised Object Detectors From Scratch\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DeNet: Scalable Real-Time Object Detection With Directed Sparse Sampling", @@ -3548,6 +3779,7 @@ "status": "Poster", "track": "main", "pid": "142", + "author_site": "Lachlan Tychsen-Smith; Lars Petersson", "author": "Lachlan Tychsen-Smith; Lars Petersson", "abstract": "We define the object detection from imagery problem as estimating a very large but extremely sparse bounding box dependent probability distribution. Subsequently we identify a sparse distribution estimation scheme, Directed Sparse Sampling, and employ it in a single end-to-end CNN based detection model. 
This methodology extends and formalizes previous state-of-the-art detection models with an additional emphasis on high evaluation rates and reduced manual engineering. We introduce two novelties, a corner based region-of-interest estimator and a deconvolution based CNN model. The resulting model is scene adaptive, does not require manually defined reference bounding boxes and produces highly competitive results on MSCOCO, Pascal VOC 2007 and Pascal VOC 2012 with real-time evaluation rates. Further analysis suggests our model performs particularly well when finegrained object localization is desirable. We argue that this advantage stems from the significantly larger set of available regions-of-interest relative to other methods. Source-code is available from: https://github.com/lachlants/denet", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tychsen-Smith_DeNet_Scalable_Real-Time_ICCV_2017_paper.pdf", @@ -3572,7 +3804,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Tychsen-Smith_2017_ICCV,\n \n author = {\n Tychsen-Smith,\n Lachlan and Petersson,\n Lars\n},\n title = {\n DeNet: Scalable Real-Time Object Detection With Directed Sparse Sampling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Decoder Network Over Lightweight Reconstructed Feature for Fast Semantic Style Transfer", @@ -3580,6 +3813,7 @@ "status": "Poster", "track": "main", "pid": "945", + "author_site": "Ming Lu; Hao Zhao; Anbang Yao; Feng Xu; Yurong Chen; Li Zhang", "author": "Ming Lu; Hao Zhao; Anbang Yao; Feng Xu; Yurong Chen; Li Zhang", "abstract": "Recently, the community of style transfer is trying to incorporate semantic information into traditional system. 
This practice achieves better perceptual results by transferring the style between semantically-corresponding regions. Yet, few efforts are invested to address the computation bottleneck of back-propagation. In this paper, we propose a new framework for fast semantic style transfer. Our method decomposes the semantic style transfer problem into feature reconstruction part and feature decoder part. The reconstruction part tactfully solves the optimization problem of content loss and style loss in feature space by particularly reconstructed feature. This significantly reduces the computation of propagating the loss through the whole network. The decoder part transforms the reconstructed feature into the stylized image. Through a careful bridging of the two modules, the proposed approach not only achieves competitive results as backward optimization methods but also is about two orders of magnitude faster.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lu_Decoder_Network_Over_ICCV_2017_paper.pdf", @@ -3597,14 +3831,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lu_Decoder_Network_Over_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0;1;0", - "aff_unique_norm": "Tsinghua University;Intel", + "aff_unique_norm": "Tsinghua University;Intel Labs China", "aff_unique_dep": "Department of Electronic Engineering;Cognitive Computing Laboratory", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.intel.com/content/www/us/en/research/labs.html", "aff_unique_abbr": "THU;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2017_ICCV,\n \n author = {\n Lu,\n Ming and Zhao,\n Hao and Yao,\n Anbang and Xu,\n Feng and Chen,\n Yurong and Zhang,\n Li\n},\n title = {\n Decoder Network Over Lightweight Reconstructed Feature for Fast Semantic Style Transfer\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Adaptive Image Clustering", @@ -3612,6 +3847,7 @@ "status": "Oral", "track": "main", "pid": "69", + "author_site": "Jianlong Chang; Lingfeng Wang; Gaofeng Meng; Shiming Xiang; Chunhong Pan", "author": "Jianlong Chang; Lingfeng Wang; Gaofeng Meng; Shiming Xiang; Chunhong Pan", "abstract": "Image clustering is a crucial but challenging task in machine learning and computer vision. Existing methods often ignore the combination between feature learning and clustering. To tackle this problem, we propose Deep Adaptive Clustering (DAC) that recasts the clustering problem into a binary pairwise-classification framework to judge whether pairs of images belong to the same clusters. In DAC, the similarities are calculated as the cosine distance between label features of images which are generated by a deep convolutional network (ConvNet). By introducing a constraint into DAC, the learned label features tend to be one-hot vectors that can be utilized for clustering images. The main challenge is that the ground-truth similarities are unknown in image clustering. We handle this issue by presenting an alternating iterative Adaptive Learning algorithm where each iteration alternately selects labeled samples and trains the ConvNet. Conclusively, images are automatically clustered based on the label features. 
Experimental results show that DAC achieves state-of-the-art performance on five popular datasets, e.g., yielding 97.75% clustering accuracy on MNIST, 52.18% on CIFAR-10 and 46.99% on STL-10.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chang_Deep_Adaptive_Image_ICCV_2017_paper.pdf", @@ -3636,7 +3872,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chang_2017_ICCV,\n \n author = {\n Chang,\n Jianlong and Wang,\n Lingfeng and Meng,\n Gaofeng and Xiang,\n Shiming and Pan,\n Chunhong\n},\n title = {\n Deep Adaptive Image Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Binaries: Encoding Semantic-Rich Cues for Efficient Textual-Visual Cross Retrieval", @@ -3644,6 +3881,7 @@ "status": "Poster", "track": "main", "pid": "1648", + "author_site": "Yuming Shen; Li Liu; Ling Shao; Jingkuan Song", "author": "Yuming Shen; Li Liu; Ling Shao; Jingkuan Song", "abstract": "Cross-modal hashing is usually regarded as an effective technique for large-scale textual-visual cross retrieval, where data from different modalities are mapped into a shared Hamming space for matching. Most of the traditional textual-visual binary encoding methods only consider holistic image representations and fail to model descriptive sentences. This renders existing methods inappropriate to handle the rich semantics of informative cross-modal data for quality textual-visual search tasks. To address the problem of hashing cross-modal data with semantic-rich cues, in this paper, a novel integrated deep architecture is developed to effectively encode the detailed semantics of informative images and long descriptive sentences, named as Textual-Visual Deep Binaries (TVDB). 
In particular, region-based convolutional networks with long short-term memory units are introduced to fully explore image regional details while semantic cues of sentences are modeled by a text convolutional network. Additionally, we propose a stochastic batch-wise training routine, where high-quality binary codes and deep encoding functions are efficiently optimized in an alternating manner. Experiments are conducted on three multimedia datasets, i.e. Microsoft COCO, IAPR TC-12, and INRIA Web Queries, where the proposed TVDB model significantly outperforms state-of-the-art binary coding methods in the task of cross-modal retrieval.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shen_Deep_Binaries_Encoding_ICCV_2017_paper.pdf", @@ -3668,7 +3906,8 @@ "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Norwich;;Chengdu", "aff_country_unique_index": "0;0+1;0;1", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Shen_2017_ICCV,\n \n author = {\n Shen,\n Yuming and Liu,\n Li and Shao,\n Ling and Song,\n Jingkuan\n},\n title = {\n Deep Binaries: Encoding Semantic-Rich Cues for Efficient Textual-Visual Cross Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Clustering via Joint Convolutional Autoencoder Embedding and Relative Entropy Minimization", @@ -3676,6 +3915,7 @@ "status": "Poster", "track": "main", "pid": "2643", + "author_site": "Kamran Ghasedi Dizaji; Amirhossein Herandi; Cheng Deng; Weidong Cai; Heng Huang", "author": "Kamran Ghasedi Dizaji; Amirhossein Herandi; Cheng Deng; Weidong Cai; Heng Huang", "abstract": "In this paper, we propose a new clustering model, called DEeP Embedded RegularIzed ClusTering (DEPICT), which efficiently maps data into a discriminative embedding subspace and precisely predicts cluster assignments. 
DEPICT generally consists of a multinomial logistic regression function stacked on top of a multi-layer convolutional autoencoder. We define a clustering objective function using relative entropy (KL divergence) minimization, regularized by a prior for the frequency of cluster assignments. An alternating strategy is then derived to optimize the objective by updating parameters and estimating cluster assignments. Furthermore, we employ the reconstruction loss functions in our autoencoder, as a data-dependent regularization term, to prevent the deep embedding function from overfitting. In order to benefit from end-to-end optimization and eliminate the necessity for layer-wise pretraining, we introduce a joint learning framework to minimize the unified clustering and reconstruction loss functions together and train all network layers simultaneously. Experimental results indicate the superiority and faster running time of DEPICT in real-world clustering tasks, where no labeled data is available for hyper-parameter tuning.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dizaji_Deep_Clustering_via_ICCV_2017_paper.pdf", @@ -3700,7 +3940,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Arlington;Sydney", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "United States;China;Australia" + "aff_country_unique": "United States;China;Australia", + "bibtex": "@InProceedings{Dizaji_2017_ICCV,\n \n author = {\n Ghasedi Dizaji,\n Kamran and Herandi,\n Amirhossein and Deng,\n Cheng and Cai,\n Weidong and Huang,\n Heng\n},\n title = {\n Deep Clustering via Joint Convolutional Autoencoder Embedding and Relative Entropy Minimization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Cropping via Attention Box Prediction and Aesthetics Assessment", @@ -3708,6 +3949,7 @@ "status": "Poster", "track": "main", "pid": "712", + 
"author_site": "Wenguan Wang; Jianbing Shen", "author": "Wenguan Wang; Jianbing Shen", "abstract": "We model the photo cropping problem as a cascade of attention box regression and aesthetic quality classification, based on deep learning. A neural network is designed that has two branches for predicting attention bounding box and analyzing aesthetics, respectively. The predicted attention box is treated as an initial crop window where a set of cropping candidates are generated around it, without missing important information. Then, aesthetics assessment is employed to select the final crop as the one with the best aesthetic quality. With our network, cropping candidates share features within full-image convolutional feature maps, thus avoiding repeated feature computation and leading to higher computation efficiency. Via leveraging rich data for attention prediction and aesthetics assessment, the proposed method produces high-quality cropping results, even with the limited availability of training data for photo cropping. 
The experimental results demonstrate the competitive results and fast processing speed (5 fps with all steps).", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Deep_Cropping_via_ICCV_2017_paper.pdf", @@ -3732,7 +3974,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Wenguan and Shen,\n Jianbing\n},\n title = {\n Deep Cropping via Attention Box Prediction and Aesthetics Assessment\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Determinantal Point Process for Large-Scale Multi-Label Classification", @@ -3740,10 +3983,11 @@ "status": "Poster", "track": "main", "pid": "200", + "author_site": "Pengtao Xie; Ruslan Salakhutdinov; Luntian Mou; Eric P. Xing", "author": "Pengtao Xie; Ruslan Salakhutdinov; Luntian Mou; Eric P. Xing", "abstract": "We study large-scale multi-label classification (MLC) on two recently released datasets: Youtube-8M and Open Images that contain millions of data instances and thousands of classes. The unprecedented problem scale poses great challenges for MLC. First, finding out the correct label subset out of exponentially many choices incurs substantial ambiguity and uncertainty. Second, the large data-size and class-size entail considerable computational cost. To address the first challenge, we investigate two strategies: capturing label-correlations from the training data and incorporating label co-occurrence relations obtained from external knowledge, which effectively eliminate semantically inconsistent labels and provide contextual clues to differentiate visually ambiguous labels. 
Specifically, we propose a Deep Determinantal Point Process (DDPP) model which seamlessly integrates a DPP with deep neural networks (DNNs) and supports end-to-end multi-label learning and deep representation learning. The DPP is able to capture label-correlations of any order with a polynomial computational cost, while the DNNs learn hierarchical features of images/videos and capture the dependency between input data and labels. To incorporate external knowledge about label co-occurrence relations, we impose a relational regularization over the kernel matrix in DDPP. To address the second challenge, we study an efficient low-rank kernel learning algorithm based on inducing point methods. Experiments on the two datasets demonstrate the efficacy and efficiency of the proposed methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xie_Deep_Determinantal_Point_ICCV_2017_paper.pdf", - "aff": "Machine Learning Department, Carnegie Mellon University, USA+Petuum Inc.; Machine Learning Department, Carnegie Mellon University, USA+Petuum Inc.; Beijing Key Laboratory of Traf\ufb01c Engineering, Beijing University of Technology, China; Petuum Inc.", + "aff": "Machine Learning Department, Carnegie Mellon University, USA+Petuum Inc.; Machine Learning Department, Carnegie Mellon University, USA+Petuum Inc.; Beijing Key Laboratory of Traffic Engineering, Beijing University of Technology, China; Petuum Inc.", "project": "", "github": "", "supp": "", @@ -3758,13 +4002,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xie_Deep_Determinantal_Point_ICCV_2017_paper.html", "aff_unique_index": "0+1;0+1;2;1", "aff_unique_norm": "Carnegie Mellon University;Petuum Inc.;Beijing University of Technology", - "aff_unique_dep": "Machine Learning Department;;Beijing Key Laboratory of Traf\ufb01c Engineering", + "aff_unique_dep": "Machine Learning Department;;Beijing Key Laboratory of Traffic Engineering", "aff_unique_url": 
"https://www.cmu.edu;https://www.petuum.com;http://www.bjut.edu.cn", "aff_unique_abbr": "CMU;;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Xie_2017_ICCV,\n \n author = {\n Xie,\n Pengtao and Salakhutdinov,\n Ruslan and Mou,\n Luntian and Xing,\n Eric P.\n},\n title = {\n Deep Determinantal Point Process for Large-Scale Multi-Label Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Direct Regression for Multi-Oriented Scene Text Detection", @@ -3772,6 +4017,7 @@ "status": "Poster", "track": "main", "pid": "44", + "author_site": "Wenhao He; Xu-Yao Zhang; Fei Yin; Cheng-Lin Liu", "author": "Wenhao He; Xu-Yao Zhang; Fei Yin; Cheng-Lin Liu", "abstract": "In this paper, we first provide a new perspective to divide existing high performance object detection methods into direct and indirect regressions. Direct regression performs boundary regression by predicting the offsets from a given point, while indirect regression predicts the offsets from some bounding box proposals. In the context of multi-oriented scene text detection, we analyze the drawbacks of indirect regression, which covers the state-of-the-art detection structures Faster-RCNN and SSD as instances, and point out the potential superiority of direct regression. To verify this point of view, we propose a deep direct regression based method for multi-oriented scene text detection. Our detection framework is simple and effective with a fully convolutional network and one-step post processing. 
The fully convolutional network is optimized in an end-to-end way and has bi-task outputs where one is pixel-wise classification between text and non-text, and the other is direct regression to determine the vertex coordinates of quadrilateral text boundaries. The proposed method is particularly beneficial to localize incidental scene texts. On the ICDAR2015 Incidental Scene Text benchmark, our method achieves the F-measure of 81%, which is a new state-of-the-art and significantly outperforms previous approaches. On other standard datasets with focused scene texts, our method also reaches the state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/He_Deep_Direct_Regression_ICCV_2017_paper.pdf", @@ -3796,7 +4042,8 @@ "aff_campus_unique_index": "0+0;0;0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2017_ICCV,\n \n author = {\n He,\n Wenhao and Zhang,\n Xu-Yao and Yin,\n Fei and Liu,\n Cheng-Lin\n},\n title = {\n Deep Direct Regression for Multi-Oriented Scene Text Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Dual Learning for Semantic Image Segmentation", @@ -3804,6 +4051,7 @@ "status": "Poster", "track": "main", "pid": "1192", + "author_site": "Ping Luo; Guangrun Wang; Liang Lin; Xiaogang Wang", "author": "Ping Luo; Guangrun Wang; Liang Lin; Xiaogang Wang", "abstract": "Deep neural networks have advanced many computer vision tasks, because of their compelling capacities to learn from large amount of labeled data. However, their performances are not fully exploited in semantic image segmentation as the scale of training set is limited, where per-pixel labelmaps are expensive to obtain. 
To reduce labeling efforts, a natural solution is to collect additional images from Internet that are associated with image-level tags. Unlike existing works that treated labelmaps and tags as independent supervisions, we present a novel learning setting, namely dual image segmentation (DIS), which consists of two complementary learning problems that are jointly solved. One predicts labelmaps and tags from images, and the other reconstructs the images using the predicted labelmaps. DIS has three appealing properties. 1) Given an image with tags only, its labelmap can be inferred by leveraging the images and tags as constraints. The estimated labelmaps that capture accurate object classes and boundaries are used as ground truths in training to boost performance. 2) DIS is able to clean tags that have noises. 3) DIS significantly reduces the number of per-pixel annotations in training, while still achieves state-of-the-art performance. Extensive experiments demonstrate the effectiveness of DIS, which outperforms an existing best-performing baseline by 12.6% on Pascal VOC 2012 test set, without any post-processing such as CRF/MRF smoothing.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Luo_Deep_Dual_Learning_ICCV_2017_paper.pdf", @@ -3821,14 +4069,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Luo_Deep_Dual_Learning_ICCV_2017_paper.html", "aff_unique_index": "0;1+0;1+2;0", - "aff_unique_norm": "Chinese University of Hong Kong;Sun Yat-sen University;SenseTime Group", + "aff_unique_norm": "The Chinese University of Hong Kong;Sun Yat-Sen University;SenseTime Group", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.sysu.edu.cn/;https://www.sensetime.com", "aff_unique_abbr": "CUHK;SYSU;SenseTime", "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Luo_2017_ICCV,\n \n author = {\n Luo,\n Ping and Wang,\n Guangrun and Lin,\n Liang and Wang,\n Xiaogang\n},\n title = {\n Deep Dual Learning for Semantic Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Facial Action Unit Recognition From Partially Labeled Data", @@ -3836,6 +4085,7 @@ "status": "Poster", "track": "main", "pid": "1633", + "author_site": "Shan Wu; Shangfei Wang; Bowen Pan; Qiang Ji", "author": "Shan Wu; Shangfei Wang; Bowen Pan; Qiang Ji", "abstract": "Current work on facial action unit (AU) recognition requires AU-labeled facial images. Although large amounts of facial images are readily available, AU annotation is expensive and time consuming. To address this, we propose a deep facial action unit recognition approach learning from partially AU-labeled data. The proposed approach makes full use of both partly available ground-truth AU labels and the readily available large scale facial images without annotation. Specifically, we propose to learn label distribution from the ground-truth AU labels, and then train the AU classifiers from the large-scale facial images by maximizing the log likelihood of the mapping functions of AUs with regard to the learnt label distribution for all training data and minimizing the error between predicted AUs and ground-truth AUs for labeled data simultaneously. A restricted Boltzmann machine is adopted to model AU label distribution, a deep neural network is used to learn facial representation from facial images, and the support vector machine is employed as the classifier. 
Experiments on two benchmark databases demonstrate the effectiveness of the proposed approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wu_Deep_Facial_Action_ICCV_2017_paper.pdf", @@ -3860,7 +4110,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wu_2017_ICCV,\n \n author = {\n Wu,\n Shan and Wang,\n Shangfei and Pan,\n Bowen and Ji,\n Qiang\n},\n title = {\n Deep Facial Action Unit Recognition From Partially Labeled Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Free-Form Deformation Network for Object-Mask Registration", @@ -3868,6 +4119,7 @@ "status": "Poster", "track": "main", "pid": "1631", + "author_site": "Haoyang Zhang; Xuming He", "author": "Haoyang Zhang; Xuming He", "abstract": "This paper addresses the problem of object-mask registration, which aligns a shape mask to a target object instance. Prior work typically formulate the problem as an object segmentation task with mask prior, which is challenging to solve. In this work, we take a transformation based approach that predicts a 2D non-rigid spatial transform and warps the shape mask onto the target object. In particular, we propose a deep spatial transformer network that learns free-form deformations (FFDs) to non-rigidly warp the shape mask based on a multi-level dual mask feature pooling strategy. The FFD transforms are based on B-splines and parameterized by the offsets of predefined control points, which are differentiable. Therefore, we are able to train the entire network in an end-to-end manner based on L2 matching loss. 
We evaluate our FFD network on a challenging object-mask alignment task, which aims to refine a set of object segment proposals, and our approach achieves the state-of-the-art performance on the Cityscapes, the PASCAL VOC and the MSCOCO datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Deep_Free-Form_Deformation_ICCV_2017_paper.pdf", @@ -3892,7 +4144,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Canberra;Shanghai", "aff_country_unique_index": "0;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Haoyang and He,\n Xuming\n},\n title = {\n Deep Free-Form Deformation Network for Object-Mask Registration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Functional Maps: Structured Prediction for Dense Shape Correspondence", @@ -3900,7 +4153,7 @@ "status": "Poster", "track": "main", "pid": "2649", - "author_site": "Or Litany; Tal Remez; Emanuele Rodol\u00c3\u00a0; Alex Bronstein; Michael Bronstein", + "author_site": "Or Litany; Tal Remez; Emanuele Rodolà; Alex Bronstein; Michael Bronstein", "author": "Or Litany; Tal Remez; Emanuele Rodola; Alex Bronstein; Michael Bronstein", "abstract": "We introduce a new framework for learning dense correspondence between deformable 3D shapes. Existing learning based approaches model shape correspondence as a labelling problem, where each point of a query shape receives a label identifying a point on some reference domain; the correspondence is then constructed a posteriori by composing the label predictions of two input shapes. We propose a paradigm shift and design a structured prediction model in the space of functional maps, linear operators that provide a compact representation of the correspondence. 
We model the learning process via a deep residual network which takes dense descriptor fields defined on two shapes as input, and outputs a soft map between the two given objects. The resulting correspondence is shown to be accurate on several challenging benchmarks comprising multiple categories, synthetic models, real scans with acquisition artifacts, topological noise, and partiality.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Litany_Deep_Functional_Maps_ICCV_2017_paper.pdf", @@ -3918,14 +4171,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Litany_Deep_Functional_Maps_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;2+3;1+4;1+2", - "aff_unique_norm": "Tel Aviv University;Intel;Universit\u00e0 della Svizzera italiana;Sapienza University of Rome;Technion - Israel Institute of Technology", - "aff_unique_dep": ";Intel Corporation;;;", + "aff_unique_norm": "Tel Aviv University;Intel Corporation;Università della Svizzera italiana;Sapienza University of Rome;Technion - Israel Institute of Technology", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.tau.ac.il;https://www.intel.com;https://www.usi.ch;https://www.uniroma1.it;https://www.technion.ac.il/en/", "aff_unique_abbr": "TAU;Intel;USI;Sapienza;Technion", "aff_campus_unique_index": ";1+2;;1", "aff_campus_unique": ";Lugano;Rome", "aff_country_unique_index": "0+1;0;2+3;1+0;1+2", - "aff_country_unique": "Israel;United States;Switzerland;Italy" + "aff_country_unique": "Israel;United States;Switzerland;Italy", + "bibtex": "@InProceedings{Litany_2017_ICCV,\n \n author = {\n Litany,\n Or and Remez,\n Tal and Rodola,\n Emanuele and Bronstein,\n Alex and Bronstein,\n Michael\n},\n title = {\n Deep Functional Maps: Structured Prediction for Dense Shape Correspondence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Generative Adversarial 
Compression Artifact Removal", @@ -3933,6 +4187,7 @@ "status": "Poster", "track": "main", "pid": "2490", + "author_site": "Leonardo Galteri; Lorenzo Seidenari; Marco Bertini; Alberto Del Bimbo", "author": "Leonardo Galteri; Lorenzo Seidenari; Marco Bertini; Alberto Del Bimbo", "abstract": "Compression artifacts arise in images whenever a lossy compression algorithm is applied. These artifacts eliminate details present in the original image, or add noise and small structures; because of these effects they make images less pleasant for the human eye, and may also lead to decreased performance of computer vision algorithms such as object detectors. To eliminate such artifacts, when decompressing an image, it is required to recover the original image from a disturbed version. To this end, we present a feed-forward fully convolutional residual network model trained using a generative adversarial framework. To provide a baseline, we show that our model can be also trained optimizing the Structural Similarity (SSIM), which is a better loss with respect to the simpler Mean Squared Error (MSE). Our GAN is able to produce images with more photorealistic details than MSE or SSIM based networks. Moreover we show that our approach can be used as a pre-processing step for object detection in case images are degraded by compression to a point that state-of-the art detectors fail. 
In this task, our GAN method obtains better performance than MSE or SSIM trained networks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Galteri_Deep_Generative_Adversarial_ICCV_2017_paper.pdf", @@ -3948,7 +4203,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Galteri_Deep_Generative_Adversarial_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Galteri_Deep_Generative_Adversarial_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Galteri_2017_ICCV,\n \n author = {\n Galteri,\n Leonardo and Seidenari,\n Lorenzo and Bertini,\n Marco and Del Bimbo,\n Alberto\n},\n title = {\n Deep Generative Adversarial Compression Artifact Removal\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Globally Constrained MRFs for Human Pose Estimation", @@ -3956,6 +4212,7 @@ "status": "Poster", "track": "main", "pid": "1466", + "author_site": "Ioannis Marras; Petar Palasek; Ioannis Patras", "author": "Ioannis Marras; Petar Palasek; Ioannis Patras", "abstract": "This work introduces a novel Convolutional Network architecture (ConvNet) for the task of human pose estimation, that is the localization of body joints in a single static image. We propose a coarse to fine architecture that addresses shortcomings of the baseline architecture in [26] that stem from the fact that large inaccuracies of its coarse ConvNet cannot be corrected by the refinement ConvNet that refines the estimation within small windows of the coarse prediction. We overcome this by introducing a Markov Random Field (MRF)-based spatial model network between the coarse and the refinement model that introduces geometric constraints on the relative locations of the body joints. 
We propose an architecture in which a) the filters that implement the message passing in the MRF inference are factored in a way that constrains them by a low dimensional pose manifold the projection to which is estimated by a separate branch of the proposed ConvNet and b) the strengths of the pairwise joint constraints are modeled by weights that are jointly estimated by the other parameters of the network. The proposed network is trained in an end-to-end fashion. Experimental results show that the proposed method improves the baseline model and provides state of the art results on very challenging benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Marras_Deep_Globally_Constrained_ICCV_2017_paper.pdf", @@ -3980,7 +4237,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Marras_2017_ICCV,\n \n author = {\n Marras,\n Ioannis and Palasek,\n Petar and Patras,\n Ioannis\n},\n title = {\n Deep Globally Constrained MRFs for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Growing Learning", @@ -3988,6 +4246,7 @@ "status": "Poster", "track": "main", "pid": "1203", + "author_site": "Guangcong Wang; Xiaohua Xie; Jianhuang Lai; Jiaxuan Zhuo", "author": "Guangcong Wang; Xiaohua Xie; Jianhuang Lai; Jiaxuan Zhuo", "abstract": "Semi-supervised learning (SSL) is an import paradigm to make full use of a large amount of unlabeled data in machine learning. A bottleneck of SSL is the overfitting problem when training over the limited labeled data, especially on a complex model like a deep neural network. To get around this bottleneck, we propose a bio-inspired SSL framework on deep neural network, namely Deep Growing Learning (DGL). 
Specifically, we formulate the SSL as an EM-like process, where the deep network alternately iterates between automatically growing convolutional layers and selecting reliable pseudo-labeled data for training. The DGL guarantees that a shallow neural network is trained with labeled data, while a deeper neural network is trained with growing amount of reliable pseudo-labeled data, so as to alleviate the overfitting problem. Experiments on different visual recognition tasks have verified the effectiveness of DGL.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Deep_Growing_Learning_ICCV_2017_paper.pdf", @@ -4007,12 +4266,13 @@ "aff_unique_index": "0+1+2;0+1+2;0+1+2;0", "aff_unique_norm": "Sun Yat-sen University;Guangdong Key Laboratory of Information Security Technology;Ministry of Education", "aff_unique_dep": "School of Data and Computer Science;Information Security Technology;Key Laboratory of Machine Intelligence and Advanced Computing", - "aff_unique_url": "http://www.sysu.edu.cn/;;http://www.moe.gov.cn/", - "aff_unique_abbr": "SYSU;;MOE", + "aff_unique_url": "http://www.sysu.edu.cn/;;", + "aff_unique_abbr": "SYSU;;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Guangcong and Xie,\n Xiaohua and Lai,\n Jianhuang and Zhuo,\n Jiaxuan\n},\n title = {\n Deep Growing Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Metric Learning With Angular Loss", @@ -4020,6 +4280,7 @@ "status": "Poster", "track": "main", "pid": "966", + "author_site": "Jian Wang; Feng Zhou; Shilei Wen; Xiao Liu; Yuanqing Lin", "author": "Jian Wang; Feng Zhou; Shilei Wen; Xiao Liu; Yuanqing Lin", "abstract": "The modern image search system requires 
semantic understanding of image, and a key yet under-addressed problem is to learn a good metric for measuring the similarity between images. While deep metric learning has yielded impressive performance gains by extracting high level abstractions from image data, a proper objective loss function becomes the central issue to boost the performance. In this paper, we propose a novel angular loss, which takes angle relationship into account, for learning better similarity metric. Whereas previous metric learning methods focus on optimizing the similarity (contrastive loss) or relative similarity (triplet loss) of image pairs, our proposed method aims at constraining the angle at the negative point of triplet triangles. Several favorable properties are observed when compared with conventional methods. First, scale invariance is introduced, improving the robustness of objective against feature variance. Second, a third-order geometric constraint is inherently imposed, capturing additional local structure of triplet triangles than contrastive loss or triplet loss. 
Third, better convergence has been demonstrated by experiments on three publicly available datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Deep_Metric_Learning_ICCV_2017_paper.pdf", @@ -4044,7 +4305,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Jian and Zhou,\n Feng and Wen,\n Shilei and Liu,\n Xiao and Lin,\n Yuanqing\n},\n title = {\n Deep Metric Learning With Angular Loss\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Occlusion Reasoning for Multi-Camera Multi-Target Detection", @@ -4052,7 +4314,7 @@ "status": "Poster", "track": "main", "pid": "633", - "author_site": "Pierre Baqu\u00c3\u00a9; Fran\u00c3\u00a7ois Fleuret; Pascal Fua", + "author_site": "Pierre Baqué; François Fleuret; Pascal Fua", "author": "Pierre Baque; Francois Fleuret; Pascal Fua", "abstract": "People detection in 2D images has improved greatly in recent years. However, comparatively little of this progress has percolated into multi-camera multi-people tracking algorithms, whose performance still degrades severely when scenes become very crowded. In this work, we introduce a new architecture that combines Convolutional Neural Nets and Conditional Random Fields to explicitly resolve ambiguities. One of its key ingredients are high-order CRF terms that model potential occlusions and give our approach its robustness even when many people are present. 
Our model is trained end-to-end and we show that it outperforms several state-of-the-art algorithms on challenging scenes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Baque_Deep_Occlusion_Reasoning_ICCV_2017_paper.pdf", @@ -4070,14 +4332,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Baque_Deep_Occlusion_Reasoning_ICCV_2017_paper.html", "aff_unique_index": "0;0+1;0", - "aff_unique_norm": "EPFL;IDIAP", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;IDIAP", "aff_unique_dep": "CVLab;", "aff_unique_url": "https://www.epfl.ch;https://www.idiap.ch", "aff_unique_abbr": "EPFL;", "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "Lausanne;Martigny", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Baque_2017_ICCV,\n \n author = {\n Baque,\n Pierre and Fleuret,\n Francois and Fua,\n Pascal\n},\n title = {\n Deep Occlusion Reasoning for Multi-Camera Multi-Target Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Scene Image Classification With the MFAFVNet", @@ -4085,6 +4348,7 @@ "status": "Poster", "track": "main", "pid": "2683", + "author_site": "Yunsheng Li; Mandar Dixit; Nuno Vasconcelos", "author": "Yunsheng Li; Mandar Dixit; Nuno Vasconcelos", "abstract": "The problem of transferring a deep convolutional network trained for object recognition to the task of scene image classification is considered. An embedded implementation of the recently proposed mixture of factor analyzers Fisher vector (MFA-FV) is proposed. This enables the design of a network architecture, the MFAFVNet, that can be trained in an end to end manner. 
The new architecture involves the design of an MFA-FV layer that implements a statistically correct version of the MFA-FV, through a combination of network computations and regularization. When compared to previous neural implementations of Fisher vectors, the MFAFVNet relies on a more powerful statistical model and a more accurate implementation. When compared to previous non-embedded models, the MFAFVNet relies on a state of the art model, which is now embedded into a CNN. This enables end to end training. Experiments show that the MFAFVNet has state of the art performance on scene classification.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Deep_Scene_Image_ICCV_2017_paper.pdf", @@ -4099,7 +4363,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Deep_Scene_Image_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Deep_Scene_Image_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Yunsheng and Dixit,\n Mandar and Vasconcelos,\n Nuno\n},\n title = {\n Deep Scene Image Classification With the MFAFVNet\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep Spatial-Semantic Attention for Fine-Grained Sketch-Based Image Retrieval", @@ -4107,6 +4372,7 @@ "status": "Poster", "track": "main", "pid": "2667", + "author_site": "Jifei Song; Qian Yu; Yi-Zhe Song; Tao Xiang; Timothy M. Hospedales", "author": "Jifei Song; Qian Yu; Yi-Zhe Song; Tao Xiang; Timothy M. Hospedales", "abstract": "Human sketches are unique in being able to capture both the spatial topology of a visual object, as well as its subtle appearance details. Fine-grained sketch-based image retrieval (FG-SBIR) importantly leverages on such fine-grained characteristics of sketches to conduct instance-level retrieval of photos. 
Nevertheless, human sketches are often highly abstract and iconic, resulting in severe misalignments with candidate photos which in turn make subtle visual detail matching difficult. Existing FG-SBIR approaches focus only on coarse holistic matching via deep cross-domain representation learning, yet ignore explicitly accounting for fine-grained details and their spatial context. In this paper, a novel deep FG-SBIR model is proposed which differs significantly from the existing models in that: (1) It is spatially aware, achieved by introducing an attention module that is sensitive to the spatial position of visual details; (2) It combines coarse and fine semantic information via a shortcut connection fusion block; and (3) It models feature correlation and is robust to misalignments between the extracted features across the two domains by introducing a novel higher order learnable energy function (HOLEF) based loss. Extensive experiments show that the proposed deep spatial-semantic attention model significantly outperforms the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Song_Deep_Spatial-Semantic_Attention_ICCV_2017_paper.pdf", @@ -4131,7 +4397,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Song_2017_ICCV,\n \n author = {\n Song,\n Jifei and Yu,\n Qian and Song,\n Yi-Zhe and Xiang,\n Tao and Hospedales,\n Timothy M.\n},\n title = {\n Deep Spatial-Semantic Attention for Fine-Grained Sketch-Based Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deep TextSpotter: An End-To-End Trainable Scene Text Localization and Recognition Framework", @@ -4139,7 +4406,7 @@ "status": "Poster", "track": "main", "pid": "772", - "author_site": 
"Michal Bu\u00c5\u00a1ta; Luk\u00c3\u00a1\u00c5\u00a1 Neumann; Ji\u00c5\u0099\u00c3\u00ad Matas", + "author_site": "Michal Bušta; Lukáš Neumann; Jiří Matas", + "author": "Michal Busta; Lukas Neumann; Jiri Matas", "abstract": "A method for scene text localization and recognition is proposed. The novelties include: training of both text detection and recognition in a single end-to-end pass, the structure of the recognition CNN and the geometry of its input layer that preserves the aspect of the text and adapts its resolution to the data. The proposed method achieves state-of-the-art accuracy in the end-to-end text recognition on two standard datasets - ICDAR 2013 and ICDAR 2015, whilst being an order of magnitude faster than competing methods - the whole pipeline runs at 10 frames per second on an NVidia K80 GPU.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Busta_Deep_TextSpotter_An_ICCV_2017_paper.pdf", @@ -4164,7 +4431,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Busta_2017_ICCV,\n \n author = {\n Busta,\n Michal and Neumann,\n Lukas and Matas,\n Jiri\n},\n title = {\n Deep TextSpotter: An End-To-End Trainable Scene Text Localization and Recognition Framework\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DeepCD: Learning Deep Complementary Descriptors for Patch Representations", @@ -4172,6 +4440,7 @@ "status": "Poster", "track": "main", "pid": "1557", + "author_site": "Tsun-Yi Yang; Jo-Han Hsu; Yen-Yu Lin; Yung-Yu Chuang", "author": "Tsun-Yi Yang; Jo-Han Hsu; Yen-Yu Lin; Yung-Yu Chuang", "abstract": "This paper presents the DeepCD framework which learns a pair of complementary descriptors jointly for a patch by employing deep learning techniques.
It can be achieved by taking any descriptor learning architecture for learning a leading descriptor and augmenting the architecture with an additional network stream for learning a complementary descriptor. To enforce the complementary property, a new network layer, called data-dependent modulation (DDM) layer, is introduced for adaptively learning the augmented network stream with the emphasis on the training data that are not well handled by the leading stream. By optimizing the proposed joint loss function with late fusion, the obtained descriptors are complementary to each other and their fusion improves performance. Experiments on several problems and datasets show that the proposed method is simple yet effective, outperforming state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yang_DeepCD_Learning_Deep_ICCV_2017_paper.pdf", @@ -4196,7 +4465,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2017_ICCV,\n \n author = {\n Yang,\n Tsun-Yi and Hsu,\n Jo-Han and Lin,\n Yen-Yu and Chuang,\n Yung-Yu\n},\n title = {\n DeepCD: Learning Deep Complementary Descriptors for Patch Representations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DeepCoder: Semi-Parametric Variational Autoencoders for Automatic Facial Action Coding", @@ -4204,7 +4474,7 @@ "status": "Poster", "track": "main", "pid": "1409", - "author_site": "Dieu Linh Tran; Robert Walecki; Ognjen (Oggi) Rudovic; Stefanos Eleftheriadis; Bj\u00c3\u00b6rn Schuller; Maja Pantic", + "author_site": "Dieu Linh Tran; Robert Walecki; Ognjen (Oggi) Rudovic; Stefanos Eleftheriadis; Björn Schuller; Maja Pantic", "author": "Dieu Linh Tran; Robert Walecki; Ognjen (Oggi) Rudovic; Stefanos Eleftheriadis; Bjorn 
Schuller; Maja Pantic", "abstract": "Human face exhibits an inherent hierarchy in its representations (i.e., holistic facial expressions can be encoded via a set of facial action units (AUs) and their intensity). Variational (deep) auto-encoders (VAE) have shown great results in unsupervised extraction of hierarchical latent representations from large amounts of image data, while being robust to noise and other undesired artifacts. Potentially, this makes VAEs a suitable approach for learning facial features for AU intensity estimation. Yet, most existing VAE-based methods apply classifiers learned separately from the encoded features. By contrast, the non-parametric (probabilistic) approaches, such as Gaussian Processes (GPs), typically outperform their parametric counterparts, but cannot deal easily with large amounts of data. To this end, we propose a novel VAE semi-parametric modeling framework, named DeepCoder, which combines the modeling power of parametric (convolutional) and non-parametric (ordinal GPs) VAEs, for joint learning of (1) latent representations at multiple levels in a task hierarchy, and (2) classification of multiple ordinal outputs. 
We show on benchmark datasets for AU intensity estimation that the proposed DeepCoder outperforms the state-of-the-art approaches, and related VAEs and deep learning models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tran_DeepCoder_Semi-Parametric_Variational_ICCV_2017_paper.pdf", @@ -4219,7 +4489,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tran_DeepCoder_Semi-Parametric_Variational_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tran_DeepCoder_Semi-Parametric_Variational_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Tran_2017_ICCV,\n \n author = {\n Linh Tran,\n Dieu and Walecki,\n Robert and (Oggi) Rudovic,\n Ognjen and Eleftheriadis,\n Stefanos and Schuller,\n Bjorn and Pantic,\n Maja\n},\n title = {\n DeepCoder: Semi-Parametric Variational Autoencoders for Automatic Facial Action Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DeepContext: Context-Encoding Neural Pathways for 3D Holistic Scene Understanding", @@ -4227,6 +4498,7 @@ "status": "Poster", "track": "main", "pid": "350", + "author_site": "Yinda Zhang; Mingru Bai; Pushmeet Kohli; Shahram Izadi; Jianxiong Xiao", "author": "Yinda Zhang; Mingru Bai; Pushmeet Kohli; Shahram Izadi; Jianxiong Xiao", "abstract": "3D context has been shown to be an extremely important cue for scene understanding, yet very little research has been done on integrating context information with deep models. This paper presents an approach to embed 3D context into the topology of a neural network trained to perform holistic scene understanding. Given a depth image depicting a 3D scene, our network aligns the observed scene with a predefined 3D scene template, and then reasons about the existence and location of each object within the scene template. 
In doing so, our model recognizes multiple objects in a single forward pass of a 3D convolutional neural network, capturing both global scene and local object information simultaneously. To create training data for this 3D network, we generate partly hallucinated depth images which are rendered by replacing real objects with a repository of CAD models of the same object category. Extensive experiments demonstrate the effectiveness of our algorithm compared to the state of the art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_DeepContext_Context-Encoding_Neural_ICCV_2017_paper.pdf", @@ -4244,14 +4516,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_DeepContext_Context-Encoding_Neural_ICCV_2017_paper.html", "aff_unique_index": "0;0;1+2;3+2;0+4+2", - "aff_unique_norm": "Princeton University;DeepMind;Microsoft;perceptiveIO;AutoX", + "aff_unique_norm": "Princeton University;DeepMind;Microsoft Corporation;PerceptiveIO;AutoX", "aff_unique_dep": ";;Microsoft Research;;", "aff_unique_url": "https://www.princeton.edu;https://deepmind.com;https://www.microsoft.com/en-us/research;;https://www.autox.ai", "aff_unique_abbr": "Princeton;DeepMind;MSR;;AutoX", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+0;0;0+3+0", - "aff_country_unique": "United States;United Kingdom;;China" + "aff_country_unique": "United States;United Kingdom;;China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Yinda and Bai,\n Mingru and Kohli,\n Pushmeet and Izadi,\n Shahram and Xiao,\n Jianxiong\n},\n title = {\n DeepContext: Context-Encoding Neural Pathways for 3D Holistic Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DeepFuse: A Deep Unsupervised Approach for Exposure Fusion With Extreme Exposure Image Pairs", @@ -4259,6 +4532,7 @@ 
"status": "Poster", "track": "main", "pid": "2329", + "author_site": "K. Ram Prabhakar; V Sai Srikar; R. Venkatesh Babu", "author": "K. Ram Prabhakar; V Sai Srikar; R. Venkatesh Babu", "abstract": "We present a novel deep learning architecture for fusing static multi-exposure images. Current multi-exposure fusion (MEF) approaches use hand-crafted features to fuse input sequence. However, the weak hand-crafted representations are not robust to varying input conditions. Moreover, they perform poorly for extreme exposure image pairs. Thus, it is highly desirable to have a method that is robust to varying input conditions and capable of handling extreme exposure without artifacts. Deep representations have known to be robust to input conditions and have shown phenomenal performance in a supervised setting. However, the stumbling block in using deep learning for MEF was the lack of sufficient training data and an oracle to provide the ground-truth for supervision. To address the above issues, we have gathered a large dataset of multi-exposure image stacks for training and to circumvent the need for ground truth images, we propose an unsupervised deep learning framework for MEF utilizing a no-reference quality metric as loss function. The proposed approach uses a novel CNN architecture trained to learn the fusion operation without reference ground truth image. The model fuses a set of common low level features extracted from each image to generate artifact-free perceptually pleasing results. 
We perform extensive quantitative and qualitative evaluation and show that the proposed technique outperforms existing state-of-the-art approaches for a variety of natural images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Prabhakar_DeepFuse_A_Deep_ICCV_2017_paper.pdf", @@ -4274,7 +4548,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Prabhakar_DeepFuse_A_Deep_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Prabhakar_DeepFuse_A_Deep_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Prabhakar_2017_ICCV,\n \n author = {\n Ram Prabhakar,\n K. and Sai Srikar,\n V and Venkatesh Babu,\n R.\n},\n title = {\n DeepFuse: A Deep Unsupervised Approach for Exposure Fusion With Extreme Exposure Image Pairs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DeepRoadMapper: Extracting Road Topology From Aerial Images", @@ -4282,7 +4557,7 @@ "status": "Poster", "track": "main", "pid": "1418", - "author_site": "Gell\u00c3\u00a9rt M\u00c3\u00a1ttyus; Wenjie Luo; Raquel Urtasun", + "author_site": "Gellért Máttyus; Wenjie Luo; Raquel Urtasun", "author": "Gellert Mattyus; Wenjie Luo; Raquel Urtasun", "abstract": "Creating road maps is essential to the success of many applications such as autonomous driving and city planning. Most approaches in industry focus on leveraging expensive sensors mounted on top of a fleet of cars. This results in very accurate estimates when using techniques that involve a user in the loop. However, these solutions are very expensive and have small coverage. In contrast, in this paper we propose an approach that directly estimates road topology from aerial images. This provides us with an affordable solution which has large coverage. 
Towards this goal, we take advantage of the latest developments in deep learning to have an initial segmentation of the aerial images. We then propose an algorithm that reasons about missing connections in the extracted road topology as a shortest path problem which can be solved efficiently. We demonstrate the effectiveness of our approach in the challenging TorontoCity dataset and show very significant improvements over the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mattyus_DeepRoadMapper_Extracting_Road_ICCV_2017_paper.pdf", @@ -4307,7 +4582,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Mattyus_2017_ICCV,\n \n author = {\n Mattyus,\n Gellert and Luo,\n Wenjie and Urtasun,\n Raquel\n},\n title = {\n DeepRoadMapper: Extracting Road Topology From Aerial Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DeepSetNet: Predicting Sets With Deep Neural Networks", @@ -4315,6 +4591,7 @@ "status": "Spotlight", "track": "main", "pid": "1160", + "author_site": "S. Hamid Rezatofighi; Vijay Kumar B G; Anton Milan; Ehsan Abbasnejad; Anthony Dick; Ian Reid", "author": "S. Hamid Rezatofighi; Vijay Kumar B G; Anton Milan; Ehsan Abbasnejad; Anthony Dick; Ian Reid", "abstract": "This paper addresses the task of set prediction using deep learning. This is important because the output of many computer vision tasks, including image tagging and object detection, are naturally expressed as sets of entities rather than vectors. As opposed to a vector, the size of a set is not fixed in advance, and it is invariant to the ordering of entities within it. We define a likelihood for a set distribution and learn its parameters using a deep neural network. 
We also derive a loss for predicting a discrete distribution corresponding to set cardinality. Set prediction is demonstrated on the problem of multi-class image classification. Moreover, we show that the proposed cardinality loss can also trivially be applied to the tasks of object counting and pedestrian detection. Our approach outperforms existing methods in all three cases on standard datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rezatofighi_DeepSetNet_Predicting_Sets_ICCV_2017_paper.pdf", @@ -4330,7 +4607,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rezatofighi_DeepSetNet_Predicting_Sets_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rezatofighi_DeepSetNet_Predicting_Sets_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Rezatofighi_2017_ICCV,\n \n author = {\n Hamid Rezatofighi,\n S. and Kumar B G,\n Vijay and Milan,\n Anton and Abbasnejad,\n Ehsan and Dick,\n Anthony and Reid,\n Ian\n},\n title = {\n DeepSetNet: Predicting Sets With Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deeper, Broader and Artier Domain Generalization", @@ -4338,6 +4616,7 @@ "status": "Poster", "track": "main", "pid": "2654", + "author_site": "Da Li; Yongxin Yang; Yi-Zhe Song; Timothy M. Hospedales", "author": "Da Li; Yongxin Yang; Yi-Zhe Song; Timothy M. Hospedales", "abstract": "The problem of domain generalization is to learn from multiple training domains, and extract a domain-agnostic model that can then be applied to an unseen domain. Domain generalization (DG) has a clear motivation in contexts where there are target domains with distinct characteristics, yet sparse data for training. For example recognition in sketch images, which are distinctly more abstract and rarer than photos. 
Nevertheless, DG methods have primarily been evaluated on photo-only benchmarks focusing on alleviating the dataset bias where both problems of domain distinctiveness and data sparsity can be minimal. We argue that these benchmarks are overly straightforward, and show that simple deep learning baselines perform surprisingly well on them. In this paper, we make two main contributions: Firstly, we build upon the favorable domain shift-robust properties of deep learning methods, and develop a low-rank parameterized CNN model for end-to-end DG learning. Secondly, we develop a DG benchmark dataset covering photo, sketch, cartoon and painting domains. This is both more practically relevant, and harder (bigger domain shift) than existing benchmarks. The results show that our method outperforms existing DG alternatives, and our dataset provides a more significant DG challenge to drive future research.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Deeper_Broader_and_ICCV_2017_paper.pdf", @@ -4362,7 +4641,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Da and Yang,\n Yongxin and Song,\n Yi-Zhe and Hospedales,\n Timothy M.\n},\n title = {\n Deeper,\n Broader and Artier Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deeply-Learned Part-Aligned Representations for Person Re-Identification", @@ -4370,10 +4650,11 @@ "status": "Poster", "track": "main", "pid": "1556", + "author_site": "Liming Zhao; Xi Li; Yueting Zhuang; Jingdong Wang", "author": "Liming Zhao; Xi Li; Yueting Zhuang; Jingdong Wang", "abstract": "In this paper, we address the problem of person re-identification, which refers to associating the 
persons captured from different cameras. We propose a simple yet effective human part-aligned representation for handling the body part misalignment problem. Our approach decomposes the human body into regions (parts) which are discriminative for person matching, accordingly computes the representations over the regions, and aggregates the similarities computed between the corresponding regions of a pair of probe and gallery images as the overall matching score. Our formulation, inspired by attention models, is a deep neural network modeling the three steps together, which is learnt through minimizing the triplet loss function without requiring body part labeling information. Unlike most existing deep learning algorithms that learn a global or spatial partition-based local representation, our approach performs human body partition, and thus is more robust to pose changes and various human spatial distributions in the person bounding box. Our approach shows state-of-the-art results over standard datasets, Market-1501, CUHK03, CUHK01 and VIPeR.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhao_Deeply-Learned_Part-Aligned_Representations_ICCV_2017_paper.pdf", - "aff": "Zhejiang University\u2020; Zhejiang University\u2020; Zhejiang University\u2020; Microsoft Research\u2021", + "aff": "Zhejiang University†; Zhejiang University†; Zhejiang University†; Microsoft Research‡", "project": "", "github": "", "supp": "", @@ -4387,14 +4668,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhao_Deeply-Learned_Part-Aligned_Representations_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Zhejiang University;Microsoft", - "aff_unique_dep": ";Microsoft Research", + "aff_unique_norm": "Zhejiang University;Microsoft Research", + "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "ZJU;MSR", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhao_2017_ICCV,\n \n author = {\n Zhao,\n Liming and Li,\n Xi and Zhuang,\n Yueting and Wang,\n Jingdong\n},\n title = {\n Deeply-Learned Part-Aligned Representations for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deformable Convolutional Networks", @@ -4402,6 +4684,7 @@ "status": "Oral", "track": "main", "pid": "133", + "author_site": "Jifeng Dai; Haozhi Qi; Yuwen Xiong; Yi Li; Guodong Zhang; Han Hu; Yichen Wei", "author": "Jifeng Dai; Haozhi Qi; Yuwen Xiong; Yi Li; Guodong Zhang; Han Hu; Yichen Wei", "abstract": "Convolutional neural networks (CNNs) are inherently limited to model geometric transformations due to the fixed geometric structures in its building modules. In this work, we introduce two new modules to enhance the transformation modeling capacity of CNNs, namely, deformable convolution and deformable RoI pooling. Both are based on the idea of augmenting the spatial sampling locations in the modules with additional offsets and learning the offsets from target tasks, without additional supervision. The new modules can readily replace their plain counterparts in existing CNNs and can be easily trained end-to-end by standard back-propagation, giving rise to deformable convolutional networks. Extensive experiments validate the effectiveness of our approach on sophisticated vision tasks of object detection and semantic segmentation. 
The code would be released.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dai_Deformable_Convolutional_Networks_ICCV_2017_paper.pdf", @@ -4419,14 +4702,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dai_Deformable_Convolutional_Networks_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Research", "aff_unique_dep": "Research", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "MSR Asia", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Asia", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dai_2017_ICCV,\n \n author = {\n Dai,\n Jifeng and Qi,\n Haozhi and Xiong,\n Yuwen and Li,\n Yi and Zhang,\n Guodong and Hu,\n Han and Wei,\n Yichen\n},\n title = {\n Deformable Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Deltille Grids for Geometric Camera Calibration", @@ -4434,10 +4718,11 @@ "status": "Poster", "track": "main", "pid": "2913", + "author_site": "Hyowon Ha; Michal Perdoch; Hatem Alismail; In So Kweon; Yaser Sheikh", "author": "Hyowon Ha; Michal Perdoch; Hatem Alismail; In So Kweon; Yaser Sheikh", "abstract": "The recent proliferation of high resolution cameras presents an opportunity to achieve unprecedented levels of precision in visual 3D reconstruction. Yet the camera calibration pipeline, developed decades ago using checkerboards, has remained the de facto standard. In this paper, we ask the question: are checkerboards the optimal pattern for high precision calibration? We empirically demonstrate that deltille grids (regular triangular tiling) produce the highest precision calibration of the possible tilings of Euclidean plane. 
We posit that they should be the new standard for high-precision calibration and present a complete ecosystem for calibration using deltille grids including: (1) a highly precise corner detection algorithm based on polynomial surface fitting; (2) an indexing scheme based on polarities extracted from the fitted surfaces; and (3) a 2D coding system for deltille grids, which we refer to as DelTags, in lieu of conventional matrix barcodes. We demonstrate state-of-the-art performance and apply the full calibration ecosystem through the use of 3D calibration objects for multiview camera calibration.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ha_Deltille_Grids_for_ICCV_2017_paper.pdf", - "aff": "Korea Advanced Institute of Science and Technology\u2020; Oculus Research\u2021; Oculus Research\u2021; Korea Advanced Institute of Science and Technology\u2020; Oculus Research\u2021", + "aff": "Korea Advanced Institute of Science and Technology†; Oculus Research‡; Oculus Research‡; Korea Advanced Institute of Science and Technology†; Oculus Research‡", "project": "", "github": "", "supp": "", @@ -4458,7 +4743,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Ha_2017_ICCV,\n \n author = {\n Ha,\n Hyowon and Perdoch,\n Michal and Alismail,\n Hatem and So Kweon,\n In and Sheikh,\n Yaser\n},\n title = {\n Deltille Grids for Geometric Camera Calibration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Delving Into Salient Object Subitizing and Detection", @@ -4466,6 +4752,7 @@ "status": "Poster", "track": "main", "pid": "369", + "author_site": "Shengfeng He; Jianbo Jiao; Xiaodan Zhang; Guoqiang Han; Rynson W.H. 
Lau", "author": "Shengfeng He; Jianbo Jiao; Xiaodan Zhang; Guoqiang Han; Rynson W.H. Lau", "abstract": "Subitizing (i.e., instant judgement on the number) and detection of salient objects are human inborn abilities. These two tasks influence each other in the human visual system. In this paper, we delve into the complementarity of these two tasks. We propose a multi-task deep neural network with weight prediction for salient object detection, where the parameters of an adaptive weight layer are dynamically determined by an auxiliary subitizing network. The numerical representation of salient objects is therefore embedded into the spatial representation. The proposed joint network can be trained end-to-end using back-propagation. Experiments show that the proposed multi-task network outperforms existing multi-task architectures, and the auxiliary subitizing network provides strong guidance to salient object detection by reducing false positives and producing coherent saliency maps. Moreover, the proposed method is an unconstrained method able to handle images with/without salient objects. 
Finally, we show state-of-theart performance on different salient object datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/He_Delving_Into_Salient_ICCV_2017_paper.pdf", @@ -4490,7 +4777,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2017_ICCV,\n \n author = {\n He,\n Shengfeng and Jiao,\n Jianbo and Zhang,\n Xiaodan and Han,\n Guoqiang and Lau,\n Rynson W.H.\n},\n title = {\n Delving Into Salient Object Subitizing and Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Dense Non-Rigid Structure-From-Motion and Shading With Unknown Albedos", @@ -4498,10 +4786,11 @@ "status": "Poster", "track": "main", "pid": "1848", + "author_site": "Mathias Gallardo; Toby Collins; Adrien Bartoli", "author": "Mathias Gallardo; Toby Collins; Adrien Bartoli", "abstract": "Significant progress has been recently made in Non-Rigid Structure-from-Motion (NRSfM). However, existing methods do not handle poorly-textured surfaces that deform non-smoothly. These are nonetheless common occurrence in real-world applications. An important unanswered question is whether shading can be used to robustly handle these cases. Shading is complementary to motion because it constrains reconstruction densely at textureless regions, and has been used in several other reconstruction problems. The challenge we face is to simultaneously and densely estimate non-smooth, non-rigid shape from each image together with non-smooth, spatially-varying surface albedo (which is required to use shading). We tackle this using an energy-based formulation that combines a physical, discontinuity-preserving deformation prior with motion, shading and contour information. 
This is a largescale, highly non-convex optimization problem, and we propose a cascaded optimization that converges well without an initial estimate. Our approach works on both unorganized and organized small-sized image sets, and has been empirically validated on four real-world datasets for which all state-of-the-art approaches fail.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gallardo_Dense_Non-Rigid_Structure-From-Motion_ICCV_2017_paper.pdf", - "aff": "EnCoV, IP, UMR 6602 CNRS, Universit\u00e9 Clermont Auvergne, SIGMA, France; IRCAD, Strasbourg, France; EnCoV, IP, UMR 6602 CNRS, Universit\u00e9 Clermont Auvergne, SIGMA, France", + "aff": "EnCoV, IP, UMR 6602 CNRS, Université Clermont Auvergne, SIGMA, France; IRCAD, Strasbourg, France; EnCoV, IP, UMR 6602 CNRS, Université Clermont Auvergne, SIGMA, France", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Gallardo_Dense_Non-Rigid_Structure-From-Motion_ICCV_2017_supplemental.pdf", @@ -4515,14 +4804,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gallardo_Dense_Non-Rigid_Structure-From-Motion_ICCV_2017_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Universit\u00e9 Clermont Auvergne;Institut de Recherche en Canc\u00e9rologie et en Anatomopathologie", + "aff_unique_norm": "Université Clermont Auvergne;Institut de Recherche en Cancérologie et en Anatomopathologie", "aff_unique_dep": "EnCoV, IP, UMR 6602 CNRS, SIGMA;", - "aff_unique_url": "https://www.uca.fr;https://www ircad fr", + "aff_unique_url": "https://www.uca.fr;https://www.ircad.fr", "aff_unique_abbr": ";IRCAD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Strasbourg", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Gallardo_2017_ICCV,\n \n author = {\n Gallardo,\n Mathias and Collins,\n Toby and Bartoli,\n Adrien\n},\n title = {\n Dense Non-Rigid 
Structure-From-Motion and Shading With Unknown Albedos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Dense and Low-Rank Gaussian CRFs Using Deep Embeddings", @@ -4530,10 +4820,11 @@ "status": "Poster", "track": "main", "pid": "2495", + "author_site": "Siddhartha Chandra; Nicolas Usunier; Iasonas Kokkinos", "author": "Siddhartha Chandra; Nicolas Usunier; Iasonas Kokkinos", "abstract": "In this work we introduce a structured prediction model that endows the Deep Gaussian Conditional Random Field (G-CRF) with a densely connected graph structure. We keep memory and computational complexity under control by expressing the pairwise interactions as inner products of low-dimensional, learnable embeddings. The G-CRF system matrix is therefore low-rank, allowing us to solve the resulting system in a few milliseconds on the GPU by using conjugate gradients. As in G-CRF, inference is exact, the unary and pairwise terms are jointly trained end-to-end by using analytic expressions for the gradients, while we also develop even faster, Potts-type variants of our embeddings. We show that the learned embeddings capture pixel-to-pixel affinities in a task-specific manner, while our approach achieves state of the art results on three challenging benchmarks, namely semantic segmentation, human part segmentation, and saliency estimation. 
Our implementation is fully GPU based, built on top of the Caffe library, and is available at https://github.com/siddharthachandra/gcrf-v2.0", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chandra_Dense_and_Low-Rank_ICCV_2017_paper.pdf", - "aff": "INRIA GALEN, CentraleSup\u00e9lec; Facebook AI Research, Paris; Facebook AI Research, Paris", + "aff": "INRIA GALEN, CentraleSupélec; Facebook AI Research, Paris; Facebook AI Research, Paris", "project": "", "github": "https://github.com/siddharthachandra/gcrf-v2.0", "supp": "", @@ -4547,14 +4838,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chandra_Dense_and_Low-Rank_ICCV_2017_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "INRIA;Meta", + "aff_unique_norm": "INRIA;Facebook", "aff_unique_dep": "GALEN;Facebook AI Research", "aff_unique_url": "https://www.inria.fr;https://research.facebook.com", "aff_unique_abbr": "INRIA;FAIR", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Chandra_2017_ICCV,\n \n author = {\n Chandra,\n Siddhartha and Usunier,\n Nicolas and Kokkinos,\n Iasonas\n},\n title = {\n Dense and Low-Rank Gaussian CRFs Using Deep Embeddings\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Dense-Captioning Events in Videos", @@ -4562,6 +4854,7 @@ "status": "Poster", "track": "main", "pid": "271", + "author_site": "Ranjay Krishna; Kenji Hata; Frederic Ren; Li Fei-Fei; Juan Carlos Niebles", "author": "Ranjay Krishna; Kenji Hata; Frederic Ren; Li Fei-Fei; Juan Carlos Niebles", "abstract": "Most natural videos contain numerous events. For example, in a video of a \"man playing a piano\", the video might also contain \"another man dancing\" or \"a crowd clapping\". 
We introduce the task of dense-captioning events, which involves both detecting and describing events in a video. We propose a new model that is able to identify all such events in a single pass of the video while simultaneously describing the detected events with natural language. Our model introduces a variant of an existing proposal module that is designed to capture both short as well as long events that span minutes. To capture the dependencies between the events in a video, our model introduces a new captioning module that uses contextual information from past and future events to jointly describe all events. We also introduce ActivityNet Captions, a large-scale benchmark for dense-captioning events. ActivityNet Captions contains 20k videos amounting to 849 video hours with 100k total descriptions, each with it's unique start and end time. Finally, we report performances of our model for dense-captioning events, video retrieval and localization.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Krishna_Dense-Captioning_Events_in_ICCV_2017_paper.pdf", @@ -4586,7 +4879,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Krishna_2017_ICCV,\n \n author = {\n Krishna,\n Ranjay and Hata,\n Kenji and Ren,\n Frederic and Fei-Fei,\n Li and Carlos Niebles,\n Juan\n},\n title = {\n Dense-Captioning Events in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Depth Estimation Using Structured Light Flow -- Analysis of Projected Pattern Flow on an Object's Surface", @@ -4594,6 +4888,7 @@ "status": "Poster", "track": "main", "pid": "2257", + "author_site": "Ryo Furukawa; Ryusuke Sagawa; Hiroshi Kawasaki", "author": "Ryo Furukawa; Ryusuke Sagawa; Hiroshi Kawasaki", 
"abstract": "Shape reconstruction techniques using structured light have been widely researched and developed due to their robustness, high precision, and density. Because the techniques are based on decoding a pattern to find correspondences, it implicitly requires that the projected patterns be clearly captured by an image sensor, i.e., to avoid defocus and motion blur of the projected pattern. Although intensive researches have been conducted for solving defocus blur, few researches for motion blur and only solution is to capture with extremely fast shutter speed. In this paper, unlike the previous approaches, we actively utilize motion blur, which we refer to as a light flow, to estimate depth. Analysis reveals that minimum two light flows, which are retrieved from two projected patterns on the object, are required for depth estimation. To retrieve two light flows at the same time, two sets of parallel line patterns are illuminated from two video projectors and the size of motion blur of each line is precisely measured. By analyzing the light flows, i.e. lengths of the blurs, scene depth information is estimated. 
In the experiments, 3D shapes of fast moving objects, which are inevitably captured with motion blur, are successfully reconstructed by our technique.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Furukawa_Depth_Estimation_Using_ICCV_2017_paper.pdf", @@ -4609,7 +4904,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Furukawa_Depth_Estimation_Using_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Furukawa_Depth_Estimation_Using_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Furukawa_2017_ICCV,\n \n author = {\n Furukawa,\n Ryo and Sagawa,\n Ryusuke and Kawasaki,\n Hiroshi\n},\n title = {\n Depth Estimation Using Structured Light Flow -- Analysis of Projected Pattern Flow on an Object's Surface\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Depth and Image Restoration From Light Field in a Scattering Medium", @@ -4617,6 +4913,7 @@ "status": "Poster", "track": "main", "pid": "1043", + "author_site": "Jiandong Tian; Zachary Murez; Tong Cui; Zhen Zhang; David Kriegman; Ravi Ramamoorthi", "author": "Jiandong Tian; Zachary Murez; Tong Cui; Zhen Zhang; David Kriegman; Ravi Ramamoorthi", "abstract": "Traditional imaging methods and computer vision algorithms are often ineffective when images are acquired in scattering media, such as underwater, fog, and biological tissue. Here, we explore the use of light field imaging and algorithms for image restoration and depth estimation that address the image degradation from the medium. Towards this end, we make the following three contributions. First, we present a new single image restoration algorithm which removes backscatter and attenuation from images better than existing methods, and apply it to each view in the light field. 
Second, we combine a novel transmission based depth cue with existing correspondence and defocus cues to improve light field depth estimation. In densely scattering media, our transmission depth cue is critical for depth estimation since the images have low signal to noise ratios which significantly degrades the performance of the correspondence and defocus cues. Finally, we propose shearing and refocusing multiple views of the light field to recover a single image of higher quality than what is possible from a single view. We demonstrate the benefits of our method through extensive experimental results in a water tank.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tian_Depth_and_Image_ICCV_2017_paper.pdf", @@ -4641,7 +4938,8 @@ "aff_campus_unique_index": "1;;;1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;0+0+0;0+0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tian_2017_ICCV,\n \n author = {\n Tian,\n Jiandong and Murez,\n Zachary and Cui,\n Tong and Zhang,\n Zhen and Kriegman,\n David and Ramamoorthi,\n Ravi\n},\n title = {\n Depth and Image Restoration From Light Field in a Scattering Medium\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Detail-Revealing Deep Video Super-Resolution", @@ -4649,6 +4947,7 @@ "status": "Oral", "track": "main", "pid": "1345", + "author_site": "Xin Tao; Hongyun Gao; Renjie Liao; Jue Wang; Jiaya Jia", "author": "Xin Tao; Hongyun Gao; Renjie Liao; Jue Wang; Jiaya Jia", "abstract": "Previous CNN-based video super-resolution approaches need to align multiple frames to the reference. In this paper, we show that proper frame alignment and motion compensation is crucial for achieving high quality results. We accordingly propose a 'sub-pixel motion compensation' (SPMC) layer in a CNN framework. 
Analysis and experiments show the suitability of this layer in video SR. The final end-to-end, scalable CNN framework effectively incorporates the SPMC layer and fuses multiple frames to reveal image details. Our implementation can generate visually and quantitatively high-quality results, superior to current state-of-the-arts, without the need of parameter tuning.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tao_Detail-Revealing_Deep_Video_ICCV_2017_paper.pdf", @@ -4666,14 +4965,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tao_Detail-Revealing_Deep_Video_ICCV_2017_paper.html", "aff_unique_index": "0;0;1+2;3;4", - "aff_unique_norm": "Chinese University of Hong Kong;University of Toronto;Uber;Megvii Technology;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Toronto;Uber;Megvii Technology;Tencent", "aff_unique_dep": ";;Advanced Technologies Group;;Youtu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.utoronto.ca;https://www.uber.com;https://www.megvii.com/;https://www.tencent.com", "aff_unique_abbr": "CUHK;U of T;Uber ATG;Megvii;Tencent", "aff_campus_unique_index": "0;0;", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1+2;0;0", - "aff_country_unique": "China;Canada;United States" + "aff_country_unique": "China;Canada;United States", + "bibtex": "@InProceedings{Tao_2017_ICCV,\n \n author = {\n Tao,\n Xin and Gao,\n Hongyun and Liao,\n Renjie and Wang,\n Jue and Jia,\n Jiaya\n},\n title = {\n Detail-Revealing Deep Video Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Detailed Surface Geometry and Albedo Recovery From RGB-D Video Under Natural Illumination", @@ -4681,6 +4981,7 @@ "status": "Poster", "track": "main", "pid": "1473", + "author_site": "Xinxin Zuo; Sen Wang; Jiangbin Zheng; Ruigang Yang", "author": 
"Xinxin Zuo; Sen Wang; Jiangbin Zheng; Ruigang Yang", "abstract": "In this paper we present a novel approach for depth map enhancement from an RGB-D video sequence. The basic idea is to exploit the photometric information in the color sequence. Instead of making any assumption about surface albedo or controlled object motion and lighting, we use the lighting variations introduced by casual object movement. We are effectively calculating photometric stereo from a moving object under natural illuminations. The key technical challenge is to establish correspondences over the entire image set. We therefore develop a lighting insensitive robust pixel matching technique that out-performs optical flow method in presence of lighting variations. In addition we present an expectation-maximization framework to recover the surface normal and albedo simultaneously, without any regularization term. We have validated our method on both synthetic and real datasets to show its superior performance on both surface details recovery and intrinsic decomposition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zuo_Detailed_Surface_Geometry_ICCV_2017_paper.pdf", @@ -4698,14 +4999,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zuo_Detailed_Surface_Geometry_ICCV_2017_paper.html", "aff_unique_index": "0+1;1+0;1;0+2", - "aff_unique_norm": "University of Kentucky;Northwestern Polytechnical University;Baidu", - "aff_unique_dep": ";;Baidu, Inc.", + "aff_unique_norm": "University of Kentucky;Northwestern Polytechnical University;Baidu, Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.uky.edu;https://www.nwpu.edu.cn;https://www.baidu.com", "aff_unique_abbr": "UK;NWPU;Baidu", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1+0;1;0+1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zuo_2017_ICCV,\n \n author = {\n 
Zuo,\n Xinxin and Wang,\n Sen and Zheng,\n Jiangbin and Yang,\n Ruigang\n},\n title = {\n Detailed Surface Geometry and Albedo Recovery From RGB-D Video Under Natural Illumination\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Detect to Track and Track to Detect", @@ -4713,6 +5015,7 @@ "status": "Spotlight", "track": "main", "pid": "1806", + "author_site": "Christoph Feichtenhofer; Axel Pinz; Andrew Zisserman", "author": "Christoph Feichtenhofer; Axel Pinz; Andrew Zisserman", "abstract": "Recent approaches for high accuracy detection and tracking of object categories in video consist of complex multistage solutions that become more cumbersome each year. In this paper we propose a ConvNet architecture that jointly performs detection and tracking, solving the task in a simple and effective way. Our contributions are threefold: (i) we set up a ConvNet architecture for simultaneous detection and tracking, using a multi-task objective for frame-based object detection and across-frame track regression; (ii) we introduce correlation features that represent object co-occurrences across time to aid the ConvNet during tracking; and (iii) we link the frame level detections based on our across-frame tracklets to produce high accuracy detections at the video level. Our ConvNet architecture for spatiotemporal object detection is evaluated on the large-scale ImageNet VID dataset where it achieves state-of-the-art results. Our approach provides better single model performance than the winning method of the last ImageNet challenge while being conceptually much simpler. 
Finally, we show that by increasing the temporal stride we can dramatically increase the tracker speed.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Feichtenhofer_Detect_to_Track_ICCV_2017_paper.pdf", @@ -4737,7 +5040,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Austria;United Kingdom" + "aff_country_unique": "Austria;United Kingdom", + "bibtex": "@InProceedings{Feichtenhofer_2017_ICCV,\n \n author = {\n Feichtenhofer,\n Christoph and Pinz,\n Axel and Zisserman,\n Andrew\n},\n title = {\n Detect to Track and Track to Detect\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Detecting Faces Using Inside Cascaded Contextual CNN", @@ -4745,6 +5049,7 @@ "status": "Poster", "track": "main", "pid": "1331", + "author_site": "Kaipeng Zhang; Zhanpeng Zhang; Hao Wang; Zhifeng Li; Yu Qiao; Wei Liu", "author": "Kaipeng Zhang; Zhanpeng Zhang; Hao Wang; Zhifeng Li; Yu Qiao; Wei Liu", "abstract": "Deep Convolutional Neural Networks (CNNs) achieve substantial improvements in face detection in the wild. Classical CNN-based face detection methods simply stack successive layers of filters where an input sample should pass through all layers before reaching a face/non-face decision. Inspired by the fact that for face detection, filters in deeper layers can discriminate between difficult face/non-face samples while those in shallower layers can efficiently reject simple non-face samples, we propose Inside Cascaded Structure that introduces face/non-face classifiers at different layers within the same CNN. In the training phase, we propose data routing mechanism which enables different layers to be trained by different types of samples, and thus deeper layers can focus on handling more difficult samples compared with traditional architecture. 
In addition, we introduce a two-stream contextual CNN architecture that leverages body part information adaptively to enhance face detection. Extensive experiments on the challenging FDDB and WIDER FACE benchmarks demonstrate that our method achieves competitive accuracy to the state-of-the-art techniques while keeps real time performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Detecting_Faces_Using_ICCV_2017_paper.pdf", @@ -4769,7 +5074,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Kaipeng and Zhang,\n Zhanpeng and Wang,\n Hao and Li,\n Zhifeng and Qiao,\n Yu and Liu,\n Wei\n},\n title = {\n Detecting Faces Using Inside Cascaded Contextual CNN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Directionally Convolutional Networks for 3D Shape Segmentation", @@ -4777,6 +5083,7 @@ "status": "Poster", "track": "main", "pid": "1103", + "author_site": "Haotian Xu; Ming Dong; Zichun Zhong", "author": "Haotian Xu; Ming Dong; Zichun Zhong", "abstract": "Previous approaches on 3D shape segmentation mostly rely on heuristic processing and hand-tuned geometric descriptors. In this paper, we propose a novel 3D shape representation learning approach, Directionally Convolutional Network (DCN), to solve the shape segmentation problem. DCN extends convolution operations from images to the surface mesh of 3D shapes. With DCN, we learn effective shape representations from raw geometric features, i.e., face normals and distances, to achieve robust segmentation. 
More specifically, a two-stream segmentation framework is proposed: one stream is made up by the proposed DCN with the face normals as the input, and the other stream is implemented by a neural network with the face distance histogram as the input. The learned shape representations from the two streams are fused by an element-wise product. Finally, Conditional Random Field (CRF) is applied to optimize the segmentation. Through extensive experiments conducted on benchmark datasets, we demonstrate that our approach outperforms the current state-of-the-arts (both classic and deep learning-based) on a large variety of 3D shapes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xu_Directionally_Convolutional_Networks_ICCV_2017_paper.pdf", @@ -4801,7 +5108,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2017_ICCV,\n \n author = {\n Xu,\n Haotian and Dong,\n Ming and Zhong,\n Zichun\n},\n title = {\n Directionally Convolutional Networks for 3D Shape Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Distributed Very Large Scale Bundle Adjustment by Global Camera Consensus", @@ -4809,6 +5117,7 @@ "status": "Oral", "track": "main", "pid": "1019", + "author_site": "Runze Zhang; Siyu Zhu; Tian Fang; Long Quan", "author": "Runze Zhang; Siyu Zhu; Tian Fang; Long Quan", "abstract": "The increasing scale of Structure-from-Motion is fundamentally limited by the conventional optimization framework for the all-in-one global bundle adjustment. In this paper, we propose a distributed approach to coping with this global bundle adjustment for very large scale Structure-from-Motion computation. 
First, we derive the distributed formulation from the classical optimization algorithm ADMM, Alternating Direction Method of Multipliers, based on the global camera consensus. Then, we analyze the conditions under which the convergence of this distributed optimization would be guaranteed. In particular, we adopt over-relaxation and self-adaption schemes to improve the convergence rate. After that, we propose to split the large scale camera-point visibility graph in order to reduce the communication overheads of the distributed computing. The experiments on both public large scale SfM data-sets and our very large scale aerial photo sets demonstrate that the proposed distributed method clearly outperforms the state-of-the-art method in efficiency and accuracy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Distributed_Very_Large_ICCV_2017_paper.pdf", @@ -4826,14 +5135,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Distributed_Very_Large_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Shenzhen Zhuke Innovation Technology", + "aff_unique_norm": "The Hong Kong University of Science and Technology;Shenzhen Zhuke Innovation Technology", "aff_unique_dep": "Department of Computer Science and Engineering;", "aff_unique_url": "https://www.ust.hk;", "aff_unique_abbr": "HKUST;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Runze and Zhu,\n Siyu and Fang,\n Tian and Quan,\n Long\n},\n title = {\n Distributed Very Large Scale Bundle Adjustment by Global Camera Consensus\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": 
"Domain-Adaptive Deep Network Compression", @@ -4841,7 +5151,7 @@ "status": "Poster", "track": "main", "pid": "1797", - "author_site": "Marc Masana; Joost van de Weijer; Luis Herranz; Andrew D. Bagdanov; Jose M. \u00c3\u0081lvarez", + "author_site": "Marc Masana; Joost van de Weijer; Luis Herranz; Andrew D. Bagdanov; Jose M. Álvarez", "author": "Marc Masana; Joost van de Weijer; Luis Herranz; Andrew D. Bagdanov; Jose M. Alvarez", "abstract": "Deep Neural Networks trained on large datasets can be easily transferred to new domains with far fewer labeled examples by a process called fine-tuning. This has the advantage that representations learned in the large source domain can be exploited on smaller target domains. However, networks designed to be optimal for the source task are often prohibitively large for the target task. In this work we address the compression of networks after domain transfer. We focus on compression algorithms based on low-rank matrix decomposition. Existing methods base compression solely on learned network weights and ignore the statistics of network activations. We show that domain transfer leads to large shifts in network activations and that it is desirable to take this into account when compressing. We demonstrate that considering activation statistics when compressing weights leads to a rank-constrained regression problem with a closed-form solution. Because our method takes into account the target domain, it can more optimally remove the redundancy in the weights. Experiments show that our Domain Adaptive Low Rank (DALR) method significantly outperforms existing low-rank compression techniques. With our approach, the fc6 layer of VGG19 can be compressed more than 4x more than using truncated SVD alone -- with only a minor or no loss in accuracy. 
When applied to domain-transferred networks it allows for compression down to only 5-20% of the original number of parameters with only a minor drop in performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Masana_Domain-Adaptive_Deep_Network_ICCV_2017_paper.pdf", @@ -4866,7 +5176,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2", - "aff_country_unique": "Spain;Italy;United States" + "aff_country_unique": "Spain;Italy;United States", + "bibtex": "@InProceedings{Masana_2017_ICCV,\n \n author = {\n Masana,\n Marc and van de Weijer,\n Joost and Herranz,\n Luis and Bagdanov,\n Andrew D. and Alvarez,\n Jose M.\n},\n title = {\n Domain-Adaptive Deep Network Compression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Drone-Based Object Counting by Spatially Regularized Regional Proposal Network", @@ -4874,6 +5185,7 @@ "status": "Poster", "track": "main", "pid": "1775", + "author_site": "Meng-Ru Hsieh; Yen-Liang Lin; Winston H. Hsu", "author": "Meng-Ru Hsieh; Yen-Liang Lin; Winston H. Hsu", "abstract": "Existing counting methods often adopt regression-based approaches and cannot precisely localize the target objects, which hinders the further analysis (e.g., high-level understanding and fine-grained classification). In addition, most of prior work mainly focus on counting objects in static environments with fixed cameras. Motivated by the advent of unmanned flying vehicles (i.e., drones), we are interested in detecting and counting objects in such dynamic environments. We propose Layout Proposal Networks (LPNs) and spatial kernels to simultaneously count and localize target objects (e.g., cars) in videos recorded by the drone. 
Different from the conventional region proposal methods, we leverage the spatial layout information (e.g., cars often park regularly) and introduce these spatially regularized constraints into our network to improve the localization accuracy. To evaluate our counting method, we present a new large-scale car parking lot dataset (CARPK) that contains nearly 90,000 cars captured from different parking lots. To the best of our knowledge, it is the first and the largest drone view dataset that supports object counting, and provides the bounding box annotations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hsieh_Drone-Based_Object_Counting_ICCV_2017_paper.pdf", @@ -4898,7 +5210,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Hsieh_2017_ICCV,\n \n author = {\n Hsieh,\n Meng-Ru and Lin,\n Yen-Liang and Hsu,\n Winston H.\n},\n title = {\n Drone-Based Object Counting by Spatially Regularized Regional Proposal Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Dual Motion GAN for Future-Flow Embedded Video Prediction", @@ -4906,6 +5219,7 @@ "status": "Poster", "track": "main", "pid": "157", + "author_site": "Xiaodan Liang; Lisa Lee; Wei Dai; Eric P. Xing", "author": "Xiaodan Liang; Lisa Lee; Wei Dai; Eric P. Xing", "abstract": "Future frame prediction in videos is a promising avenue for unsupervised video representation learning. Video frames are naturally generated by the inherent pixel flows from preceding frames based on the appearance and motion dynamics in the video. However, existing methods focus on directly hallucinating pixel values, resulting in blurry predictions. 
In this paper, we develop a dual motion Generative Adversarial Net (GAN) architecture, which learns to explicitly enforce future-frame predictions to be consistent with the pixel-wise flows in the video through a dual-learning mechanism. The primal future-frame prediction and dual future-flow prediction form a closed loop, generating informative feedback signals to each other for better video prediction. To make both synthesized future frames and flows indistinguishable from reality, a dual adversarial training method is proposed to ensure that the future-flow prediction is able to help infer realistic future-frames, while the future-frame prediction in turn leads to realistic optical flows. Our dual motion GAN also handles natural motion uncertainty in different pixel locations with a new probabilistic motion encoder, which is based on variational autoencoders. Extensive experiments demonstrate that the proposed dual motion GAN significantly outperforms state-of-the-art approaches on synthesizing new video frames and predicting future flows. 
Our model generalizes well across diverse visual scenes and shows superiority in unsupervised video representation learning.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liang_Dual_Motion_GAN_ICCV_2017_paper.pdf", @@ -4930,7 +5244,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liang_2017_ICCV,\n \n author = {\n Liang,\n Xiaodan and Lee,\n Lisa and Dai,\n Wei and Xing,\n Eric P.\n},\n title = {\n Dual Motion GAN for Future-Flow Embedded Video Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Dual-Glance Model for Deciphering Social Relationships", @@ -4938,6 +5253,7 @@ "status": "Poster", "track": "main", "pid": "1163", + "author_site": "Junnan Li; Yongkang Wong; Qi Zhao; Mohan S. Kankanhalli", "author": "Junnan Li; Yongkang Wong; Qi Zhao; Mohan S. Kankanhalli", "abstract": "Since the beginning of early civilizations, social relationships derived from each individual fundamentally form the basis of social structure in our daily life. In the computer vision literature, much progress has been made in scene understanding, such as object detection and scene parsing. Recent research focuses on the relationship between objects based on its functionality and geometrical relations. In this work, we aim to study the problem of social relationship recognition, in still images. We have proposed a dual-glance model for social relationship recognition, where the first glance fixates at the individual pair of interest and the second glance deploys attention mechanism to explore contextual cues. We have also collected a new large scale People in Social Context (PISC) dataset, which comprises of 22,670 images and 76,568 annotated samples from 9 types of social relationship. 
We provide benchmark results on the PISC dataset, and qualitatively demonstrate the efficacy of the proposed model.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Dual-Glance_Model_for_ICCV_2017_paper.pdf", @@ -4952,7 +5268,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Dual-Glance_Model_for_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Dual-Glance_Model_for_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Junnan and Wong,\n Yongkang and Zhao,\n Qi and Kankanhalli,\n Mohan S.\n},\n title = {\n Dual-Glance Model for Deciphering Social Relationships \n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DualGAN: Unsupervised Dual Learning for Image-To-Image Translation", @@ -4960,6 +5277,7 @@ "status": "Poster", "track": "main", "pid": "1281", + "author_site": "Zili Yi; Hao Zhang; Ping Tan; Minglun Gong", "author": "Zili Yi; Hao Zhang; Ping Tan; Minglun Gong", "abstract": "Conditional Generative Adversarial Networks (GANs) for cross-domain image-to-image translation have made much progress recently. Depending on the task complexity, thousands to millions of labeled image pairs are needed to train a conditional GAN. However, human labeling is expensive, even impractical, and large quantities of data may not always be available. Inspired by dual learning from natural language translation, we develop a novel mechanism, which enables image translators to be trained from two sets of images from two domains. In our architecture, the primal GAN learns to translate images from domain U to those in domain V, while the dual GAN learns to invert the task. The closed loop made by the primal and dual tasks allows images from either domain to be translated and then reconstructed. 
Hence a loss function that accounts for the reconstruction error of images can be used to train the translators. Experiments on multiple image translation tasks with unlabeled data show considerable performance gain of DualGAN over a single GAN. For some tasks, DualGAN can even achieve comparable or slightly better results than conditional GAN trained on fully labeled data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yi_DualGAN_Unsupervised_Dual_ICCV_2017_paper.pdf", @@ -4975,7 +5293,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yi_DualGAN_Unsupervised_Dual_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yi_DualGAN_Unsupervised_Dual_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Yi_2017_ICCV,\n \n author = {\n Yi,\n Zili and Zhang,\n Hao and Tan,\n Ping and Gong,\n Minglun\n},\n title = {\n DualGAN: Unsupervised Dual Learning for Image-To-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "DualNet: Learn Complementary Features for Image Recognition", @@ -4983,6 +5302,7 @@ "status": "Poster", "track": "main", "pid": "227", + "author_site": "Saihui Hou; Xu Liu; Zilei Wang", "author": "Saihui Hou; Xu Liu; Zilei Wang", "abstract": "In this work we propose a novel framework named DualNet aiming at learning more accurate representation for image recognition. Here two parallel neural networks are coordinated to learn complementary features and thus a wider network is constructed. Specifically, we logically divide an end-to-end deep convolutional neural network into two functional parts, i.e., feature extractor and image classifier. The extractors of two subnetworks are placed side by side, which exactly form the feature extractor of DualNet. 
Then the two-stream features are aggregated to the final classifier for overall classification, while two auxiliary classifiers are appended behind the feature extractor of each subnetwork to make the separately learned features discriminative alone. The complementary constraint is imposed by weighting the three classifiers, which is indeed the key of DualNet. The corresponding training strategy is also proposed, consisting of iterative training and joint finetuning, to make the two subnetworks cooperate well with each other. Finally, DualNet based on the well-known CaffeNet, VGGNet, NIN and ResNet are thoroughly investigated and experimentally evaluated on multiple datasets including CIFAR-100, Stanford Dogs and UEC FOOD-100. The results demonstrate that DualNet can really help learn more accurate image representation, and thus result in higher accuracy for recognition. In particular, the performance on CIFAR-100 is state-of-the-art compared to the recent works.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hou_DualNet_Learn_Complementary_ICCV_2017_paper.pdf", @@ -5007,7 +5327,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hou_2017_ICCV,\n \n author = {\n Hou,\n Saihui and Liu,\n Xu and Wang,\n Zilei\n},\n title = {\n DualNet: Learn Complementary Features for Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Dynamic Label Graph Matching for Unsupervised Video Re-Identification", @@ -5015,6 +5336,7 @@ "status": "Poster", "track": "main", "pid": "2120", + "author_site": "Mang Ye; Andy J. Ma; Liang Zheng; Jiawei Li; Pong C. Yuen", "author": "Mang Ye; Andy J. Ma; Liang Zheng; Jiawei Li; Pong C. 
Yuen", "abstract": "Label estimation is an important component in an unsupervised person re-identification (re-ID) system. This paper focuses on cross-camera label estimation, which can be subsequently used in feature learning to learn robust re-ID models. Specifically, we propose to construct a graph for samples in each camera, and then graph matching scheme is introduced for cross-camera labeling association. While labels directly output from existing graph matching methods may be noisy and inaccurate due to significant cross-camera variations, this paper propose a dynamic graph matching (DGM) method. DGM iteratively updates the image graph and the label estimation process by learning a better feature space with intermediate estimated labels. DGM is advantageous in two aspects: 1) the accuracy of estimated labels is improved significantly with the iterations; 2) DGM is robust to noisy initial training data. Extensive experiments conducted on three benchmarks including the large-scale MARS dataset show that DGM yields competitive performance to fully supervised baselines, and outperforms competing unsupervised learning methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ye_Dynamic_Label_Graph_ICCV_2017_paper.pdf", @@ -5039,7 +5361,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Ye_2017_ICCV,\n \n author = {\n Ye,\n Mang and Ma,\n Andy J. 
and Zheng,\n Liang and Li,\n Jiawei and Yuen,\n Pong C.\n},\n title = {\n Dynamic Label Graph Matching for Unsupervised Video Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Dynamics Enhanced Multi-Camera Motion Segmentation From Unsynchronized Videos", @@ -5047,6 +5370,7 @@ "status": "Poster", "track": "main", "pid": "2374", + "author_site": "Xikang Zhang; Bengisu Ozbay; Mario Sznaier; Octavia Camps", "author": "Xikang Zhang; Bengisu Ozbay; Mario Sznaier; Octavia Camps", "abstract": "This paper considers the multi-camera motion segmentation problem using unsynchronized videos. Specifically, given two video clips containing several moving objects, captured by unregistered, unsynchronized cameras with different viewpoints, our goal is to assign features to moving objects in the scene. This problem challenges existing methods, due to the lack of registration information and correspondences across cameras. To solve it, we propose a new method that exploits both shape and dynamical information and does not require spatio-temporal registration or shared features. As shown in the paper, the combination of shape and dynamical information results in improved performance even in the single camera case, and allows for solving the multi-camera segmentation problem with a computational cost similar to that of existing single-view techniques. 
These results are illustrated using both the existing Hopkins 155 data set and a new multi-camera data set, the RSL-12.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Dynamics_Enhanced_Multi-Camera_ICCV_2017_paper.pdf", @@ -5071,7 +5395,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Xikang and Ozbay,\n Bengisu and Sznaier,\n Mario and Camps,\n Octavia\n},\n title = {\n Dynamics Enhanced Multi-Camera Motion Segmentation From Unsynchronized Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Editable Parametric Dense Foliage From 3D Capture", @@ -5079,6 +5404,7 @@ "status": "Poster", "track": "main", "pid": "2513", + "author_site": "Gaurav Chaurasia; Paul Beardsley", "author": "Gaurav Chaurasia; Paul Beardsley", "abstract": "We present an algorithm to compute parametric models of dense foliage. The guiding principles of our work are automatic reconstruction and compact artist friendly representation. We use Bezier patches to model leaf surface, which we compute from images and point clouds of dense foliage. We present an algorithm to segment individual leaves from colour and depth data. We then reconstruct the Bezier representation from segmented leaf points clouds using non-linear optimisation. Unlike previous work, we do not require laboratory scanned exemplars or user intervention. We also demonstrate intuitive manipulators to edit the reconstructed parametric models. 
We believe our work is a step towards making captured data more accessible to artists for foliage modelling.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chaurasia_Editable_Parametric_Dense_ICCV_2017_paper.pdf", @@ -5103,7 +5429,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Chaurasia_2017_ICCV,\n \n author = {\n Chaurasia,\n Gaurav and Beardsley,\n Paul\n},\n title = {\n Editable Parametric Dense Foliage From 3D Capture\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Efficient Algorithms for Moral Lineage Tracing", @@ -5111,6 +5438,7 @@ "status": "Poster", "track": "main", "pid": "2324", + "author_site": "Markus Rempfler; Jan-Hendrik Lange; Florian Jug; Corinna Blasse; Eugene W. Myers; Bjoern H. Menze; Bjoern Andres", "author": "Markus Rempfler; Jan-Hendrik Lange; Florian Jug; Corinna Blasse; Eugene W. Myers; Bjoern H. Menze; Bjoern Andres", "abstract": "Lineage tracing, the joint segmentation and tracking of living cells as they move and divide in a sequence of light microscopy images, is a challenging task. Jug et al. have proposed a mathematical abstraction of this task, the moral lineage tracing problem (MLTP), whose feasible solutions define both a segmentation of every image and a lineage forest of cells. Their branch-and-cut algorithm, however, is prone to many cuts and slow convergence for large instances. 
To address this problem, we make three contributions: (i) we devise the first efficient primal feasible local search algorithms for the MLTP, (ii) we improve the branch-and-cut algorithm by separating tighter cutting planes and by incorporating our primal algorithms, (iii) we show in experiments that our algorithms find accurate solutions on the problem instances of Jug et al. and scale to larger instances, leveraging moral lineage tracing to practical significance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rempfler_Efficient_Algorithms_for_ICCV_2017_paper.pdf", @@ -5126,7 +5454,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rempfler_Efficient_Algorithms_for_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rempfler_Efficient_Algorithms_for_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Rempfler_2017_ICCV,\n \n author = {\n Rempfler,\n Markus and Lange,\n Jan-Hendrik and Jug,\n Florian and Blasse,\n Corinna and Myers,\n Eugene W. and Menze,\n Bjoern H. and Andres,\n Bjoern\n},\n title = {\n Efficient Algorithms for Moral Lineage Tracing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Efficient Global 2D-3D Matching for Camera Localization in a Large-Scale 3D Map", @@ -5134,10 +5463,11 @@ "status": "Poster", "track": "main", "pid": "1229", + "author_site": "Liu Liu; Hongdong Li; Yuchao Dai", "author": "Liu Liu; Hongdong Li; Yuchao Dai", "abstract": "Given an image of a street scene in a city, this paper develops a new method that can quickly and precisely pinpoint at which location (as well as viewing direction) the image was taken, against a pre-stored large-scale 3D point-cloud map of the city. We adopt the recently developed 2D-3D direct feature matching framework for this task [23,31,32,42-44]. 
This is a challenging task especially for large-scale problems. As the map size grows bigger, many 3D points in the wider geographical area can be visually very similar-or even identical-causing severe ambiguities in 2D-3D feature matching. The key is to quickly and unambiguously find the correct matches between a query image and the large 3D map. Existing methods solve this problem mainly via comparing individual features' visual similarities in a local and per feature manner, thus only local solutions can be found, inadequate for large-scale applications. In this paper, we introduce a global method which harnesses global contextual information exhibited both within the query image and among all the 3D points in the map. This is achieved by a novel global ranking algorithm, applied to a Markov network built upon the 3D map, which takes account of not only visual similarities between individual 2D-3D matches, but also their global compatibilities (as measured by co-visibility) among all matching pairs found in the scene. 
Tests on standard benchmark datasets show that our method achieved both higher precision and comparable recall, compared with the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Efficient_Global_2D-3D_ICCV_2017_paper.pdf", - "aff": "Northwestern Polytechnical University, Xi\u2019an, China+Australian National University, Canberra, Australia; Australian National University, Canberra, Australia+Australia Centre for Robotic Vision; Northwestern Polytechnical University, Xi\u2019an, China+Australian National University, Canberra, Australia", + "aff": "Northwestern Polytechnical University, Xi’an, China+Australian National University, Canberra, Australia; Australian National University, Canberra, Australia+Australia Centre for Robotic Vision; Northwestern Polytechnical University, Xi’an, China+Australian National University, Canberra, Australia", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Liu_Efficient_Global_2D-3D_ICCV_2017_supplemental.pdf", @@ -5153,12 +5483,13 @@ "aff_unique_index": "0+1;1+2;0+1", "aff_unique_norm": "Northwestern Polytechnical University;Australian National University;Australia Centre for Robotic Vision", "aff_unique_dep": ";;", - "aff_unique_url": "http://www.nwpu.edu.cn;https://www.anu.edu.au;https://roboticvision.org/", - "aff_unique_abbr": "NPU;ANU;ACRV", + "aff_unique_url": "http://www.nwpu.edu.cn;https://www.anu.edu.au;", + "aff_unique_abbr": "NWPU;ANU;", "aff_campus_unique_index": "0+1;1;0+1", "aff_campus_unique": "Xi'an;Canberra;", "aff_country_unique_index": "0+1;1+1;0+1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Liu and Li,\n Hongdong and Dai,\n Yuchao\n},\n title = {\n Efficient Global 2D-3D Matching for Camera Localization in a Large-Scale 3D Map\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Efficient Global Illumination for Morphable Models", @@ -5166,7 +5497,7 @@ "status": "Poster", "track": "main", "pid": "1823", - "author_site": "Andreas Schneider; Sandro Sch\u00c3\u00b6nborn; Lavrenti Frobeen; Bernhard Egger; Thomas Vetter", + "author_site": "Andreas Schneider; Sandro Schönborn; Lavrenti Frobeen; Bernhard Egger; Thomas Vetter", "author": "Andreas Schneider; Sandro Schonborn; Lavrenti Frobeen; Bernhard Egger; Thomas Vetter", "abstract": "We propose an efficient self-shadowing illumination model for Morphable Models. Simulating self-shadowing with ray casting is computationally expensive which makes them impractical in Analysis-by-Synthesis methods for object reconstruction from single images. Therefore, we propose to learn self-shadowing for Morphable Model parameters directly with a linear model. Radiance transfer functions are a powerful way to represent self-shadowing used within the precomputed radiance transfer framework (PRT). We build on PRT to render deforming objects with self-shadowing at interactive frame rates. It can be illuminated efficiently by environment maps represented with spherical harmonics. The result is an efficient global illumination method for Morphable Models, exploiting an approximated radiance transfer. 
We apply the method to fitting Morphable Model parameters to a single image of a face and demonstrate that considering self-shadowing improves shape reconstruction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Schneider_Efficient_Global_Illumination_ICCV_2017_paper.pdf", @@ -5191,7 +5522,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Schneider_2017_ICCV,\n \n author = {\n Schneider,\n Andreas and Schonborn,\n Sandro and Frobeen,\n Lavrenti and Egger,\n Bernhard and Vetter,\n Thomas\n},\n title = {\n Efficient Global Illumination for Morphable Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Efficient Low Rank Tensor Ring Completion", @@ -5199,6 +5531,7 @@ "status": "Poster", "track": "main", "pid": "2575", + "author_site": "Wenqi Wang; Vaneet Aggarwal; Shuchin Aeron", "author": "Wenqi Wang; Vaneet Aggarwal; Shuchin Aeron", "abstract": "Using the matrix product state (MPS) representation of the recently proposed tensor ring (TR) decompositions, in this paper we propose a TR completion algorithm, which is an alternating minimization algorithm that alternates over the factors in the MPS representation. This development is motivated in part by the success of matrix completion algorithms that alternate over the (low-rank) factors. We propose a novel initialization method and analyze the computational complexity of the TR completion algorithm. 
The numerical comparison between the TR completion algorithm and the existing algorithms that employ a low rank tensor train (TT) approximation for data completion shows that our method outperforms the existing ones for a variety of real computer vision settings, and thus demonstrates the improved expressive power of tensor ring as compared to tensor train.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Efficient_Low_Rank_ICCV_2017_paper.pdf", @@ -5223,7 +5556,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "West Lafayette;Medford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Wenqi and Aggarwal,\n Vaneet and Aeron,\n Shuchin\n},\n title = {\n Efficient Low Rank Tensor Ring Completion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Efficient Online Local Metric Adaptation via Negative Samples for Person Re-Identification", @@ -5231,6 +5565,7 @@ "status": "Poster", "track": "main", "pid": "938", + "author_site": "Jiahuan Zhou; Pei Yu; Wei Tang; Ying Wu", "author": "Jiahuan Zhou; Pei Yu; Wei Tang; Ying Wu", "abstract": "Many existing person re-identification (PRID) methods typically attempt to train a faithful global metric offline to cover the enormous visual appearance variations, so as to directly use it online on various probes for identity matching. However, their need for a huge set of positive training pairs is very demanding in practice. In contrast to these methods, this paper advocates a different paradigm: part of the learning can be performed online but with nominal costs, so as to achieve online metric adaptation for different input probes. A major challenge here is that no positive training pairs are available for the probe anymore. 
By only exploiting easily-available negative samples, we propose a novel solution to achieve local metric adaptation effectively and efficiently. For each probe at the test time, it learns a strictly positive semi-definite dedicated local metric. Comparing to offline global metric learning, its computational cost is negligible. The insight of this new method is that the local hard negative samples can actually provide tight constraints to fine tune the metric locally. This new local metric adaptation method is generally applicable, as it can be used on top of any global metric to enhance its performance. In addition, this paper gives in-depth theoretical analysis and justification of the new method. We prove that our new method guarantees the reduction of the classification error asymptotically, and prove that it actually learns the optimal local metric to best approximate the asymptotic case by a finite number of training data. Extensive experiments and comparative studies on almost all major benchmarks (VIPeR, QMUL GRID, CUHK Campus, CUHK03 and Market-1501) have confirmed the effectiveness and superiority of our method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhou_Efficient_Online_Local_ICCV_2017_paper.pdf", @@ -5255,7 +5590,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2017_ICCV,\n \n author = {\n Zhou,\n Jiahuan and Yu,\n Pei and Tang,\n Wei and Wu,\n Ying\n},\n title = {\n Efficient Online Local Metric Adaptation via Negative Samples for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Egocentric Gesture Recognition Using Recurrent 3D Convolutional Neural Networks With Spatiotemporal Transformer Modules", @@ -5263,6 +5599,7 @@ 
"status": "Spotlight", "track": "main", "pid": "964", + "author_site": "Congqi Cao; Yifan Zhang; Yi Wu; Hanqing Lu; Jian Cheng", "author": "Congqi Cao; Yifan Zhang; Yi Wu; Hanqing Lu; Jian Cheng", "abstract": "Gesture is a natural interface in interacting with wearable devices such as VR/AR helmet and glasses. The main challenge of gesture recognition in egocentric vision arises from the global camera motion caused by the spontaneous head movement of the device wearer. In this paper, we address the problem by a novel recurrent 3D convolutional neural network for end-to-end learning. We specially design a spatiotemporal transformer module with recurrent connections between neighboring time slices which can actively transform a 3D feature map into a canonical view in both spatial and temporal dimensions. To validate our method, we introduce a new dataset with sufficient size, variation and reality, which contains 83 gestures designed for interaction with wearable devices, and more than 24,000 RGB-D gesture samples from 50 subjects captured in 6 scenes. On this dataset, we show that the proposed network outperforms competing state-of-the-art algorithms. 
Moreover, our method can achieve state-of-the-art performance on the challenging GTEA egocentric action dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cao_Egocentric_Gesture_Recognition_ICCV_2017_paper.pdf", @@ -5287,7 +5624,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+1;0+0;0+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Cao_2017_ICCV,\n \n author = {\n Cao,\n Congqi and Zhang,\n Yifan and Wu,\n Yi and Lu,\n Hanqing and Cheng,\n Jian\n},\n title = {\n Egocentric Gesture Recognition Using Recurrent 3D Convolutional Neural Networks With Spatiotemporal Transformer Modules\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Embedding 3D Geometric Features for Rigid Object Part Segmentation", @@ -5295,6 +5633,7 @@ "status": "Poster", "track": "main", "pid": "163", + "author_site": "Yafei Song; Xiaowu Chen; Jia Li; Qinping Zhao", "author": "Yafei Song; Xiaowu Chen; Jia Li; Qinping Zhao", "abstract": "Object part segmentation is a challenging and fundamental problem in computer vision. Its difficulties may be caused by the varying viewpoints, poses, and topological structures, which can be attributed to an essential reason, i.e., a specific object is a 3D model rather than a 2D figure. Therefore, we conjecture that not only 2D appearance features but also 3D geometric features could be helpful. With this in mind, we propose a 2-stream FCN. One stream, named AppNet, is to extract 2D appearance features from the input image. The other stream, named GeoNet, is to extract 3D geometric features. However, the problem is that the input is just an image. To this end, we design a 2D-convolution based CNN structure to extract 3D geometric features from 3D volume, which is named VolNet. 
Then a teacher-student strategy is adopted and VolNet teaches GeoNet how to extract 3D geometric features from an image. To perform this teaching process, we synthesize training data using 3D models. Each training sample consists of an image and its corresponding volume. A perspective voxelization algorithm is further proposed to align them. Experimental results verify our conjecture and the effectiveness of both the proposed 2-stream CNN and VolNet.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Song_Embedding_3D_Geometric_ICCV_2017_paper.pdf", @@ -5319,7 +5658,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2017_ICCV,\n \n author = {\n Song,\n Yafei and Chen,\n Xiaowu and Li,\n Jia and Zhao,\n Qinping\n},\n title = {\n Embedding 3D Geometric Features for Rigid Object Part Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Encoder Based Lifelong Learning", @@ -5327,6 +5667,7 @@ "status": "Poster", "track": "main", "pid": "432", + "author_site": "Amal Rannen; Rahaf Aljundi; Matthew B. Blaschko; Tinne Tuytelaars", "author": "Amal Rannen; Rahaf Aljundi; Matthew B. Blaschko; Tinne Tuytelaars", "abstract": "This paper introduces a new lifelong learning solution where a single model is trained for a sequence of tasks. The main challenge that vision systems face in this context is catastrophic forgetting: as they tend to adapt to the most recently seen task, they lose performance on the tasks that were learned previously. Our method aims at preserving the knowledge of the previous tasks while learning a new one by using autoencoders. For each task, an under-complete autoencoder is learned, capturing the features that are crucial for its achievement. 
When a new task is presented to the system, we prevent the reconstructions of the features with these autoencoders from changing, which has the effect of preserving the information on which the previous tasks are mainly relying. At the same time, the features are given space to adjust to the most recent environment as only their projection into a low dimension submanifold is controlled. The proposed system is evaluated on image classification tasks and shows a reduction of forgetting over the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rannen_Encoder_Based_Lifelong_ICCV_2017_paper.pdf", @@ -5351,7 +5692,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Rannen_2017_ICCV,\n \n author = {\n Rannen,\n Amal and Aljundi,\n Rahaf and Blaschko,\n Matthew B. and Tuytelaars,\n Tinne\n},\n title = {\n Encoder Based Lifelong Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Encouraging LSTMs to Anticipate Actions Very Early", @@ -5359,6 +5701,7 @@ "status": "Poster", "track": "main", "pid": "181", + "author_site": "Mohammad Sadegh Aliakbarian; Fatemeh Sadat Saleh; Mathieu Salzmann; Basura Fernando; Lars Petersson; Lars Andersson", "author": "Mohammad Sadegh Aliakbarian; Fatemeh Sadat Saleh; Mathieu Salzmann; Basura Fernando; Lars Petersson; Lars Andersson", "abstract": "In contrast to the widely studied problem of recognizing an action given a complete sequence, action anticipation aims to identify the action from only partially available videos. As such, it is therefore key to the success of computer vision applications requiring to react as early as possible, such as autonomous navigation. 
In this paper, we propose a new action anticipation method that achieves high prediction accuracy even in the presence of a very small percentage of a video sequence. To this end, we develop a multi-stage LSTM architecture that leverages context-aware and action-aware features, and introduce a novel loss function that encourages the model to predict the correct class as early as possible. Our experiments on standard benchmark datasets evidence the benefits of our approach; We outperform the state-of-the-art action anticipation methods for early prediction by a relative increase in accuracy of 22.0% on JHMDB-21, 14.0% on UT-Interaction and 49.9% on UCF-101.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Aliakbarian_Encouraging_LSTMs_to_ICCV_2017_paper.pdf", @@ -5376,14 +5719,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Aliakbarian_Encouraging_LSTMs_to_ICCV_2017_paper.html", "aff_unique_index": "0+1;0+1;2;0;0+1;1", - "aff_unique_norm": "Australian National University;CSIRO;EPFL", + "aff_unique_norm": "Australian National University;CSIRO;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";Smart Vision Systems;CVLab", "aff_unique_url": "https://www.anu.edu.au;https://www.csiro.au;https://cvlab.epfl.ch", "aff_unique_abbr": "ANU;CSIRO;EPFL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;0;0+0;0", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + "bibtex": "@InProceedings{Aliakbarian_2017_ICCV,\n \n author = {\n Sadegh Aliakbarian,\n Mohammad and Sadat Saleh,\n Fatemeh and Salzmann,\n Mathieu and Fernando,\n Basura and Petersson,\n Lars and Andersson,\n Lars\n},\n title = {\n Encouraging LSTMs to Anticipate Actions Very Early\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "End-To-End Face 
Detection and Cast Grouping in Movies Using Erdos-Renyi Clustering", @@ -5391,6 +5735,7 @@ "status": "Spotlight", "track": "main", "pid": "1422", + "author_site": "SouYoung Jin; Hang Su; Chris Stauffer; Erik Learned-Miller", "author": "SouYoung Jin; Hang Su; Chris Stauffer; Erik Learned-Miller", "abstract": "We present an end-to-end system for detecting and clustering faces by identity in full-length movies. Unlike works that start with a predefined set of detected faces, we consider the end-to-end problem of detection and clustering together. We make three separate contributions. First, we combine a state-of-the-art face detector with a generic tracker to extract high quality face tracklets. We then introduce a novel clustering method, motivated by the classic graph theory results of Erdos and Renyi. It is based on the observations that large clusters can be fully connected by joining just a small fraction of their point pairs, while just a single connection between two different people can lead to poor clustering results. This suggests clustering using a verification system with very few false positives but perhaps moderate recall. We introduce a novel verification method, rank-1 counts verification, that has this property, and use it in a link-based clustering scheme. Finally, we define a novel end-to-end detection and clustering evaluation metric allowing us to assess the accuracy of the entire end-to-end system. 
We present state-of-the-art results on multiple video data sets and also on standard face databases.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jin_End-To-End_Face_Detection_ICCV_2017_paper.pdf", @@ -5406,7 +5751,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jin_End-To-End_Face_Detection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jin_End-To-End_Face_Detection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Jin_2017_ICCV,\n \n author = {\n Jin,\n SouYoung and Su,\n Hang and Stauffer,\n Chris and Learned-Miller,\n Erik\n},\n title = {\n End-To-End Face Detection and Cast Grouping in Movies Using Erdos-Renyi Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "End-To-End Learning of Geometry and Context for Deep Stereo Regression", @@ -5414,6 +5760,7 @@ "status": "Spotlight", "track": "main", "pid": "1007", + "author_site": "Alex Kendall; Hayk Martirosyan; Saumitro Dasgupta; Peter Henry; Ryan Kennedy; Abraham Bachrach; Adam Bry", "author": "Alex Kendall; Hayk Martirosyan; Saumitro Dasgupta; Peter Henry; Ryan Kennedy; Abraham Bachrach; Adam Bry", "abstract": "We propose a novel deep learning architecture for regressing disparity from a rectified pair of stereo images. We leverage knowledge of the problem's geometry to form a cost volume using deep feature representations. We learn to incorporate contextual information using 3-D convolutions over this volume. Disparity values are regressed from the cost volume using a proposed differentiable soft argmin operation, which allows us to train our method end-to-end to sub-pixel accuracy without any additional post-processing or regularization. 
We evaluate our method on the Scene Flow and KITTI datasets and on KITTI we set a new state-of-the-art benchmark, while being significantly faster than competing approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kendall_End-To-End_Learning_of_ICCV_2017_paper.pdf", @@ -5438,7 +5785,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kendall_2017_ICCV,\n \n author = {\n Kendall,\n Alex and Martirosyan,\n Hayk and Dasgupta,\n Saumitro and Henry,\n Peter and Kennedy,\n Ryan and Bachrach,\n Abraham and Bry,\n Adam\n},\n title = {\n End-To-End Learning of Geometry and Context for Deep Stereo Regression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "EnhanceNet: Single Image Super-Resolution Through Automated Texture Synthesis", @@ -5446,7 +5794,7 @@ "status": "Oral", "track": "main", "pid": "500", - "author_site": "Mehdi S. M. Sajjadi; Bernhard Sch\u00c3\u00b6lkopf; Michael Hirsch", + "author_site": "Mehdi S. M. Sajjadi; Bernhard Schölkopf; Michael Hirsch", "author": "Mehdi S. M. Sajjadi; Bernhard Scholkopf; Michael Hirsch", "abstract": "Single image super-resolution is the task of inferring a high-resolution image from a single low-resolution input. Traditionally, the performance of algorithms for this task is measured using pixel-wise reconstruction measures such as peak signal-to-noise ratio (PSNR) which have been shown to correlate poorly with the human perception of image quality. As a result, algorithms minimizing these metrics tend to produce over-smoothed images that lack high-frequency textures and do not look natural despite yielding high PSNR values. 
We propose a novel application of automated texture synthesis in combination with a perceptual loss focusing on creating realistic textures rather than optimizing for a pixel-accurate reproduction of ground truth images during training. By using feed-forward fully convolutional neural networks in an adversarial training setting, we achieve a significant boost in image quality at high magnification ratios. Extensive experiments on a number of datasets show the effectiveness of our approach, yielding state-of-the-art results in both quantitative and qualitative benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sajjadi_EnhanceNet_Single_Image_ICCV_2017_paper.pdf", @@ -5471,7 +5819,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Sajjadi_2017_ICCV,\n \n author = {\n Sajjadi,\n Mehdi S. M. and Scholkopf,\n Bernhard and Hirsch,\n Michael\n},\n title = {\n EnhanceNet: Single Image Super-Resolution Through Automated Texture Synthesis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Ensemble Deep Learning for Skeleton-Based Action Recognition Using Temporal Sliding LSTM Networks", @@ -5479,6 +5828,7 @@ "status": "Poster", "track": "main", "pid": "457", + "author_site": "Inwoong Lee; Doyoung Kim; Seoungyoon Kang; Sanghoon Lee", "author": "Inwoong Lee; Doyoung Kim; Seoungyoon Kang; Sanghoon Lee", "abstract": "This paper addresses the problems of feature representation of skeleton joints and the modeling of temporal dynamics to recognize human actions. Traditional methods generally use relative coordinate systems dependent on some joints, and model only the long-term dependency, while excluding short-term and medium term dependencies. 
Instead of taking raw skeletons as the input, we transform the skeletons into another coordinate system to obtain the robustness to scale, rotation and translation, and then extract salient motion features from them. Considering that Long Short-term Memory (LSTM) networks with various time-step sizes can model various attributes well, we propose novel ensemble Temporal Sliding LSTM (TS-LSTM) networks for skeleton-based action recognition. The proposed network is composed of multiple parts containing short-term, medium-term and long-term TS-LSTM networks, respectively. In our network, we utilize an average ensemble among multiple parts as a final feature to capture various temporal dependencies. We evaluate the proposed networks and the additional other architectures to verify the effectiveness of the proposed networks, and also compare them with several other methods on five challenging datasets. The experimental results demonstrate that our network models achieve the state-of-the-art performance through various temporal features. 
Additionally, we analyze a relation between the recognized actions and the multi-term TS-LSTM features by visualizing the softmax features of multiple parts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_Ensemble_Deep_Learning_ICCV_2017_paper.pdf", @@ -5503,7 +5853,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Inwoong and Kim,\n Doyoung and Kang,\n Seoungyoon and Lee,\n Sanghoon\n},\n title = {\n Ensemble Deep Learning for Skeleton-Based Action Recognition Using Temporal Sliding LSTM Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Ensemble Diffusion for Retrieval", @@ -5511,6 +5862,7 @@ "status": "Oral", "track": "main", "pid": "693", + "author_site": "Song Bai; Zhichao Zhou; Jingdong Wang; Xiang Bai; Longin Jan Latecki; Qi Tian", "author": "Song Bai; Zhichao Zhou; Jingdong Wang; Xiang Bai; Longin Jan Latecki; Qi Tian", "abstract": "As a postprocessing procedure, diffusion process has demonstrated its ability of substantially improving the performance of various visual retrieval systems. Whereas, great efforts are also devoted to similarity (or metric) fusion, seeing that only one individual type of similarity cannot fully reveal the intrinsic relationship between objects. This stimulates a great research interest of considering similarity fusion in the framework of diffusion process (i.e., fusion with diffusion) for robust retrieval. In this paper, we firstly revisit representative methods about fusion with diffusion, and provide new insights which are ignored by previous researchers. 
Then, observing that existing algorithms are susceptible to noisy similarities, the proposed Regularized Ensemble Diffusion (RED) is bundled with an automatic weight learning paradigm, so that the negative impacts of noisy similarities are suppressed. At last, we integrate several recently-proposed similarities with the proposed framework. The experimental results suggest that we can achieve new state-of-the-art performances on various retrieval tasks, including 3D shape retrieval on ModelNet dataset, and image retrieval on Holidays and Ukbench dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bai_Ensemble_Diffusion_for_ICCV_2017_paper.pdf", @@ -5528,14 +5880,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Bai_Ensemble_Diffusion_for_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0;2;3", - "aff_unique_norm": "Huazhong University of Science and Technology;Microsoft;Temple University;University of Texas at San Antonio", + "aff_unique_norm": "Huazhong University of Science and Technology;Microsoft Research;Temple University;University of Texas at San Antonio", "aff_unique_dep": ";Research;;", "aff_unique_url": "http://www.hust.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.temple.edu;https://www.utsa.edu", "aff_unique_abbr": "HUST;MSR Asia;Temple;UTSA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Asia;San Antonio", "aff_country_unique_index": "0;0;0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Bai_2017_ICCV,\n \n author = {\n Bai,\n Song and Zhou,\n Zhichao and Wang,\n Jingdong and Bai,\n Xiang and Jan Latecki,\n Longin and Tian,\n Qi\n},\n title = {\n Ensemble Diffusion for Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Escape From Cells: Deep Kd-Networks for the 
Recognition of 3D Point Cloud Models", @@ -5543,6 +5896,7 @@ "status": "Spotlight", "track": "main", "pid": "2204", + "author_site": "Roman Klokov; Victor Lempitsky", "author": "Roman Klokov; Victor Lempitsky", "abstract": "We present a new deep learning architecture (called Kd-network) that is designed for 3D model recognition tasks and works with unstructured point clouds. The new architecture performs multiplicative transformations and shares parameters of these transformations according to the subdivisions of the point clouds imposed onto them by kd-trees. Unlike the currently dominant convolutional architectures that usually require rasterization on uniform two-dimensional or three-dimensional grids, Kd-networks do not rely on such grids in any way and therefore avoid poor scaling behavior. In a series of experiments with popular shape recognition benchmarks, Kd-networks demonstrate competitive performance in a number of shape recognition tasks such as shape classification, shape retrieval and shape part segmentation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Klokov_Escape_From_Cells_ICCV_2017_paper.pdf", @@ -5567,7 +5921,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Klokov_2017_ICCV,\n \n author = {\n Klokov,\n Roman and Lempitsky,\n Victor\n},\n title = {\n Escape From Cells: Deep Kd-Networks for the Recognition of 3D Point Cloud Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Estimating Defocus Blur via Rank of Local Patches", @@ -5575,6 +5930,7 @@ "status": "Poster", "track": "main", "pid": "3078", + "author_site": "Guodong Xu; Yuhui Quan; Hui Ji", "author": "Guodong Xu; Yuhui Quan; Hui Ji", "abstract": "This paper addresses the problem of defocus map 
estimation from a single image. We present a fast yet effective approach to estimate the spatially varying amounts of defocus blur at edge locations, which is based on the maximum ranks of the corresponding local patches with different orientations in gradient domain. Such an approach is motivated by the theoretical analysis which reveals the connection between the rank of a local patch blurred by a defocus blur kernel and the blur amount by the kernel. After the amounts of defocus blur at edge locations are obtained, a complete defocus map is generated by a standard propagation procedure. The proposed method is extensively evaluated on real image datasets, and the experimental results show its superior performance to existing approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xu_Estimating_Defocus_Blur_ICCV_2017_paper.pdf", @@ -5599,7 +5955,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Xu_2017_ICCV,\n \n author = {\n Xu,\n Guodong and Quan,\n Yuhui and Ji,\n Hui\n},\n title = {\n Estimating Defocus Blur via Rank of Local Patches\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Exploiting Multi-Grain Ranking Constraints for Precisely Searching Visually-Similar Vehicles", @@ -5607,6 +5964,7 @@ "status": "Poster", "track": "main", "pid": "330", + "author_site": "Ke Yan; Yonghong Tian; Yaowei Wang; Wei Zeng; Tiejun Huang", "author": "Ke Yan; Yonghong Tian; Yaowei Wang; Wei Zeng; Tiejun Huang", "abstract": "Precise search of visually-similar vehicles poses a great challenge in computer vision, which needs to find exactly
the same vehicle among a massive vehicles with visually similar appearances for a given query image. In this paper, we model the relationship of vehicle images as multiple grains. Following this, we propose two approaches to alleviate the precise vehicle search problem by exploiting multi-grain ranking constraints. One is Generalized Pairwise Ranking, which generalizes the conventional pairwise from considering only binary similar/dissimilar relations to multiple relations. The other is Multi-Grain based List Ranking, which introduces permutation probability to score a permutation of a multi-grain list, and further optimizes the ranking by the likelihood loss function. We implement the two approaches with multi-attribute classification in a multi-task deep learning framework. To further facilitate the research on precise vehicle search, we also contribute two high-quality and well-annotated vehicle datasets, named VD1 and VD2, which are collected from two different cities with diverse annotated attributes. As two of the largest publicly available precise vehicle search datasets, they contain 1,097,649 and 807,260 vehicle images respectively. 
Experimental results show that our approaches achieve the state-of-the-art performance on both datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yan_Exploiting_Multi-Grain_Ranking_ICCV_2017_paper.pdf", @@ -5631,7 +5989,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2017_ICCV,\n \n author = {\n Yan,\n Ke and Tian,\n Yonghong and Wang,\n Yaowei and Zeng,\n Wei and Huang,\n Tiejun\n},\n title = {\n Exploiting Multi-Grain Ranking Constraints for Precisely Searching Visually-Similar Vehicles\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Exploiting Spatial Structure for Localizing Manipulated Image Regions", @@ -5639,6 +5998,7 @@ "status": "Poster", "track": "main", "pid": "2085", + "author_site": "Jawadul H. Bappy; Amit K. Roy-Chowdhury; Jason Bunk; Lakshmanan Nataraj; B. S. Manjunath", "author": "Jawadul H. Bappy; Amit K. Roy-Chowdhury; Jason Bunk; Lakshmanan Nataraj; B. S. Manjunath", "abstract": "The advent of high-tech journaling tools facilitates an image to be manipulated in a way that can easily evade state-of-the-art image tampering detection approaches. The recent success of the deep learning approaches in different recognition tasks inspires us to develop a high confidence detection framework which can localize manipulated regions in an image. Unlike semantic object segmentation where all meaningful regions (objects) are segmented, the localization of image manipulation focuses only the possible tampered region which makes the problem even more challenging. In order to formulate the framework, we employ a hybrid CNN-LSTM model to capture discriminative features between manipulated and non-manipulated regions. 
One of the key properties of manipulated regions is that they exhibit discriminative features in boundaries shared with neighboring non-manipulated pixels. Our motivation is to learn the boundary discrepancy, i.e., the spatial structure, between manipulated and non-manipulated regions with the combination of LSTM and convolution layers. We perform end-to-end training of the network to learn the parameters through back-propagation given groundtruth mask information. The overall framework is capable of detecting different types of image manipulations, including copy-move, removal and splicing. Our model shows promising results in localizing manipulated regions, which is demonstrated through rigorous experimentation on three diverse datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bappy_Exploiting_Spatial_Structure_ICCV_2017_paper.pdf", @@ -5663,7 +6023,8 @@ "aff_campus_unique_index": "0;0;1;1;1+1", "aff_campus_unique": "Riverside;Santa Barbara", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bappy_2017_ICCV,\n \n author = {\n Bappy,\n Jawadul H. and Roy-Chowdhury,\n Amit K. and Bunk,\n Jason and Nataraj,\n Lakshmanan and Manjunath,\n B. S.\n},\n title = {\n Exploiting Spatial Structure for Localizing Manipulated Image Regions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Extreme Clicking for Efficient Object Annotation", @@ -5671,6 +6032,7 @@ "status": "Poster", "track": "main", "pid": "2429", + "author_site": "Dim P. Papadopoulos; Jasper R. R. Uijlings; Frank Keller; Vittorio Ferrari", "author": "Dim P. Papadopoulos; Jasper R. R. 
Uijlings; Frank Keller; Vittorio Ferrari", "abstract": "Manually annotating object bounding boxes is central to building computer vision datasets, and it is very time consuming (annotating ILSVRC [53] took 35s for one high-quality box [62]). It involves clicking on imaginary corners of a tight box around the object. This is difficult as these corners are often outside the actual object and several adjustments are required to obtain a tight box. We propose extreme clicking instead: we ask the annotator to click on four physical points on the object: the top, bottom, left- and right-most points. This task is more natural and these points are easy to find. We crowd-source extreme point annotations for PASCAL VOC 2007 and 2012 and show that (1) annotation time is only 7s per box, 5x faster than the traditional way of drawing boxes [62]; (2) the quality of the boxes is as good as the original ground-truth drawn the traditional way; (3) detectors trained on our annotations are as accurate as those trained on the original ground-truth. Moreover, our extreme clicking strategy not only yields box coordinates, but also four accurate boundary points. We show (4) how to incorporate them into GrabCut to obtain more accurate segmentations than those delivered when initializing it from bounding boxes; (5) semantic segmentations models trained on these segmentations outperform those trained on segmentations derived from bounding boxes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Papadopoulos_Extreme_Clicking_for_ICCV_2017_paper.pdf", @@ -5695,7 +6057,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0+1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Papadopoulos_2017_ICCV,\n \n author = {\n Papadopoulos,\n Dim P. and Uijlings,\n Jasper R. R. 
and Keller,\n Frank and Ferrari,\n Vittorio\n},\n title = {\n Extreme Clicking for Efficient Object Annotation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "FCN-rLSTM: Deep Spatio-Temporal Neural Networks for Vehicle Counting in City Cameras", @@ -5703,7 +6066,7 @@ "status": "Poster", "track": "main", "pid": "1569", - "author_site": "Shanghang Zhang; Guanhang Wu; Jo\u00c3\u00a3o P. Costeira; Jos\u00c3\u00a9 M. F. Moura", + "author_site": "Shanghang Zhang; Guanhang Wu; João P. Costeira; José M. F. Moura", "author": "Shanghang Zhang; Guanhang Wu; Joao P. Costeira; Jose M. F. Moura", "abstract": "In this paper, we develop deep spatio-temporal neural networks to sequentially count vehicles from low quality videos captured by city cameras (citycams). Citycam videos have low resolution, low frame rate, high occlusion and large perspective, making most existing methods lose their efficacy. To overcome limitations of existing methods and incorporate the temporal information of traffic video, we design a novel FCN-rLSTM network to jointly estimate vehicle density and vehicle count by connecting fully convolutional neural networks (FCN) with long short term memory networks (LSTM) in a residual learning fashion. Such design leverages the strengths of FCN for pixel-level prediction and the strengths of LSTM for learning complex temporal dynamics. The residual learning connection reformulates the vehicle count regression as learning residual functions with reference to the sum of densities in each frame, which significantly accelerates the training of networks. To preserve feature map resolution, we propose a Hyper-Atrous combination to integrate atrous convolution in FCN and combine feature maps of different convolution layers. FCN-rLSTM enables refined feature representation and a novel end-to-end trainable mapping from pixels to vehicle count. 
We extensively evaluated the proposed method on different counting tasks with three datasets, with experimental results demonstrating their effectiveness and robustness. In particular, FCN-rLSTM reduces the mean absolute error (MAE) from 5.31 to 4.21 on TRANCOS; and reduces the MAE from 2.74 to 1.53 on WebCamT. Training process is accelerated by 5 times on average.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_FCN-rLSTM_Deep_Spatio-Temporal_ICCV_2017_paper.pdf", @@ -5728,7 +6091,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;Portugal" + "aff_country_unique": "United States;Portugal", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Shanghang and Wu,\n Guanhang and Costeira,\n Joao P. and Moura,\n Jose M. F.\n},\n title = {\n FCN-rLSTM: Deep Spatio-Temporal Neural Networks for Vehicle Counting in City Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "FLaME: Fast Lightweight Mesh Estimation Using Variational Smoothing on Delaunay Graphs", @@ -5736,6 +6100,7 @@ "status": "Poster", "track": "main", "pid": "2488", + "author_site": "W. Nicholas Greene; Nicholas Roy", "author": "W. Nicholas Greene; Nicholas Roy", "abstract": "We propose a lightweight method for dense online monocular depth estimation capable of reconstructing 3D meshes on computationally constrained platforms. Our main contribution is to pose the reconstruction problem as a non-local variational optimization over a time-varying Delaunay graph of the scene geometry, which allows for an efficient, keyframeless approach to depth estimation. The graph can be tuned to favor reconstruction quality or speed and is continuously smoothed and augmented as the camera explores the scene. 
Unlike keyframe-based approaches, the optimized surface is always available at the current pose, which is necessary for low-latency obstacle avoidance. FLaME (Fast Lightweight Mesh Estimation) can generate mesh reconstructions at upwards of 230 Hz using less than one Intel i7 CPU core, which enables operation on size, weight, and power-constrained platforms. We present results from both benchmark datasets and experiments running FLaME in-the-loop onboard a small flying quadrotor.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Greene_FLaME_Fast_Lightweight_ICCV_2017_paper.pdf", @@ -5760,7 +6125,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Greene_2017_ICCV,\n \n author = {\n Nicholas Greene,\n W. and Roy,\n Nicholas\n},\n title = {\n FLaME: Fast Lightweight Mesh Estimation Using Variational Smoothing on Delaunay Graphs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Face Sketch Matching via Coupled Deep Transform Learning", @@ -5768,6 +6134,7 @@ "status": "Poster", "track": "main", "pid": "3220", + "author_site": "Shruti Nagpal; Maneet Singh; Richa Singh; Mayank Vatsa; Afzel Noore; Angshul Majumdar", "author": "Shruti Nagpal; Maneet Singh; Richa Singh; Mayank Vatsa; Afzel Noore; Angshul Majumdar", "abstract": "Face sketch to digital image matching is an important challenge of face recognition that involves matching across different domains. Current research efforts have primarily focused on extracting domain invariant representations or learning a mapping from one domain to the other. In this research, we propose a novel transform learning based approach termed as DeepTransformer, which learns a transformation and mapping function between the features of two domains. 
The proposed formulation is independent of the input information and can be applied with any existing learned or hand-crafted feature. Since the mapping function is directional in nature, we propose two variants of DeepTransformer: (i) semi-coupled and (ii) symmetrically-coupled deep transform learning. This research also uses a novel IIIT-D Composite Sketch with Age (CSA) variations database which contains sketch images of 150 subjects along with age-separated digital photos. The performance of the proposed models is evaluated on a novel application of sketch-to-sketch matching, along with sketch-to-digital photo matching. Experimental results demonstrate the robustness of the proposed models in comparison to existing state-of-the-art sketch matching algorithms and a commercial face recognition system.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nagpal_Face_Sketch_Matching_ICCV_2017_paper.pdf", @@ -5792,7 +6159,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Delhi;", "aff_country_unique_index": "0;0;0+1;0+1;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Nagpal_2017_ICCV,\n \n author = {\n Nagpal,\n Shruti and Singh,\n Maneet and Singh,\n Richa and Vatsa,\n Mayank and Noore,\n Afzel and Majumdar,\n Angshul\n},\n title = {\n Face Sketch Matching via Coupled Deep Transform Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Factorized Bilinear Models for Image Recognition", @@ -5800,6 +6168,7 @@ "status": "Poster", "track": "main", "pid": "820", + "author_site": "Yanghao Li; Naiyan Wang; Jiaying Liu; Xiaodi Hou", "author": "Yanghao Li; Naiyan Wang; Jiaying Liu; Xiaodi Hou", "abstract": "Although Deep Convolutional Neural Networks (CNNs) have liberated their power in various computer vision tasks, the most important components of 
CNN, convolutional layers and fully connected layers, are still limited to linear transformations. In this paper, we propose a novel Factorized Bilinear (FB) layer to model the pairwise feature interactions by considering the quadratic terms in the transformations. Compared with existing methods that tried to incorporate complex non-linearity structures into CNNs, the factorized parameterization makes our FB layer only require a linear increase of parameters and affordable computational cost. To further reduce the risk of overfitting of the FB layer, a specific remedy called DropFactor is devised during the training process. We also analyze the connection between FB layer and some existing models, and show FB layer is a generalization to them. Finally, we validate the effectiveness of FB layer on several widely adopted datasets including CIFAR-10, CIFAR-100 and ImageNet, and demonstrate superior results compared with various state-of-the-art deep models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Factorized_Bilinear_Models_ICCV_2017_paper.pdf", @@ -5824,7 +6193,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Yanghao and Wang,\n Naiyan and Liu,\n Jiaying and Hou,\n Xiaodi\n},\n title = {\n Factorized Bilinear Models for Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Fashion Forward: Forecasting Visual Style in Fashion", @@ -5832,6 +6202,7 @@ "status": "Poster", "track": "main", "pid": "110", + "author_site": "Ziad Al-Halah; Rainer Stiefelhagen; Kristen Grauman", "author": "Ziad Al-Halah; Rainer Stiefelhagen; Kristen Grauman", "abstract": "What is the future of fashion? 
Tackling this question from a data-driven vision perspective, we propose to forecast visual style trends before they occur. We introduce the first approach to predict the future popularity of styles discovered from fashion images in an unsupervised manner. Using these styles as a basis, we train a forecasting model to represent their trends over time. The resulting model can hypothesize new mixtures of styles that will become popular in the future, discover style dynamics (trendy vs. classic), and name the key visual attributes that will dominate tomorrow's fashion. We demonstrate our idea applied to three datasets encapsulating 80,000 fashion products sold across six years on Amazon. Results indicate that fashion forecasting benefits greatly from visual analysis, much more than textual or meta-data cues surrounding products.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Al-Halah_Fashion_Forward_Forecasting_ICCV_2017_paper.pdf", @@ -5856,7 +6227,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Al-Halah_2017_ICCV,\n \n author = {\n Al-Halah,\n Ziad and Stiefelhagen,\n Rainer and Grauman,\n Kristen\n},\n title = {\n Fashion Forward: Forecasting Visual Style in Fashion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Fast Face-Swap Using Convolutional Neural Networks", @@ -5864,6 +6236,7 @@ "status": "Poster", "track": "main", "pid": "1366", + "author_site": "Iryna Korshunova; Wenzhe Shi; Joni Dambre; Lucas Theis", "author": "Iryna Korshunova; Wenzhe Shi; Joni Dambre; Lucas Theis", "abstract": "We consider the problem of face swapping in images, where an input identity is transformed into a target identity while preserving pose, facial expression and 
lighting. To perform this mapping, we use convolutional neural networks trained to capture the appearance of the target identity from an unstructured collection of his/her photographs. This approach is enabled by framing the face swapping problem in terms of style transfer, where the goal is to render an image in the style of another one. Building on recent advances in this area, we devise a new loss function that enables the network to produce highly photorealistic results. By combining neural networks with simple pre- and post-processing steps, we aim at making face swap work in real-time with no input from the user.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Korshunova_Fast_Face-Swap_Using_ICCV_2017_paper.pdf", @@ -5888,7 +6261,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1;0", - "aff_country_unique": "United States;Belgium" + "aff_country_unique": "United States;Belgium", + "bibtex": "@InProceedings{Korshunova_2017_ICCV,\n \n author = {\n Korshunova,\n Iryna and Shi,\n Wenzhe and Dambre,\n Joni and Theis,\n Lucas\n},\n title = {\n Fast Face-Swap Using Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Fast Image Processing With Fully-Convolutional Networks", @@ -5896,6 +6270,7 @@ "status": "Poster", "track": "main", "pid": "1134", + "author_site": "Qifeng Chen; Jia Xu; Vladlen Koltun", "author": "Qifeng Chen; Jia Xu; Vladlen Koltun", "abstract": "We present an approach to accelerating a wide variety of image processing operators. Our approach uses a fully-convolutional network that is trained on input-output pairs that demonstrate the operator's action. After training, the original operator need not be run at all. The trained network operates at full resolution and runs in constant time. 
We investigate the effect of network architecture on approximation accuracy, runtime, and memory footprint, and identify a specific architecture that balances these considerations. We evaluate the presented approach on ten advanced image processing operators, including multiple variational models, multiscale tone and detail manipulation, photographic style transfer, nonlocal dehazing, and nonphotorealistic stylization. All operators are approximated by the same model. Experiments demonstrate that the presented approach is significantly more accurate than prior approximation schemes. It increases approximation accuracy as measured by PSNR across the evaluated operators by 8.5 dB on the MIT-Adobe dataset (from 27.5 to 36 dB) and reduces DSSIM by a multiplicative factor of 3 compared to the most accurate prior approximation scheme, while being the fastest. We show that our models generalize across datasets and across resolutions, and investigate a number of extensions of the presented approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Fast_Image_Processing_ICCV_2017_paper.pdf", @@ -5913,14 +6288,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Fast_Image_Processing_ICCV_2017_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Intel", + "aff_unique_norm": "Intel Corporation", "aff_unique_dep": "Intel Labs", "aff_unique_url": "https://www.intel.com", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Qifeng and Xu,\n Jia and Koltun,\n Vladlen\n},\n title = {\n Fast Image Processing With Fully-Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": 
"Fast Multi-Image Matching via Density-Based Clustering", @@ -5928,6 +6304,7 @@ "status": "Poster", "track": "main", "pid": "1852", + "author_site": "Roberto Tron; Xiaowei Zhou; Carlos Esteves; Kostas Daniilidis", "author": "Roberto Tron; Xiaowei Zhou; Carlos Esteves; Kostas Daniilidis", "abstract": "We consider the problem of finding consistent matches across multiple images. Current state-of-the-art solutions use constraints on cycles of matches together with convex optimization, leading to computationally intensive iterative algorithms. In this paper, we instead propose a clustering-based formulation: we first rigorously show its equivalence with traditional approaches, and then propose QuickMatch, a novel algorithm that identifies multi-image matches from a density function in feature space. Specifically, QuickMatch uses the density estimate to order the points in a tree, and then extracts the matches by breaking this tree using feature distances and measures of distinctiveness. Our algorithm outperforms previous state-of-the-art methods (such as MatchALS) in accuracy, and it is significantly faster (up to 62 times faster on some benchmarks), and can scale to large datasets (with more than twenty thousands features).", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tron_Fast_Multi-Image_Matching_ICCV_2017_paper.pdf", @@ -5952,7 +6329,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tron_2017_ICCV,\n \n author = {\n Tron,\n Roberto and Zhou,\n Xiaowei and Esteves,\n Carlos and Daniilidis,\n Kostas\n},\n title = {\n Fast Multi-Image Matching via Density-Based Clustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Faster Than Real-Time Facial Alignment: A 3D Spatial 
Transformer Network Approach in Unconstrained Poses", @@ -5960,6 +6338,7 @@ "status": "Poster", "track": "main", "pid": "1901", + "author_site": "Chandrasekhar Bhagavatula; Chenchen Zhu; Khoa Luu; Marios Savvides", "author": "Chandrasekhar Bhagavatula; Chenchen Zhu; Khoa Luu; Marios Savvides", "abstract": "Facial alignment involves finding a set of landmark points on an image with a known semantic meaning. However, this semantic meaning of landmark points is often lost in 2D approaches where landmarks are either moved to visible boundaries or ignored as the pose of the face changes. In order to extract consistent alignment points across large poses, the 3D structure of the face must be considered in the alignment step. However, extracting a 3D structure from a single 2D image usually requires alignment in the first place. We present our novel approach to simultaneously extract the 3D shape of the face and the semantically consistent 2D alignment through a 3D Spatial Transformer Network (3DSTN) to model both the camera projection matrix and the warping parameters of a 3D model. By utilizing a generic 3D model and a Thin Plate Spline (TPS) warping function, we are able to generate subject specific 3D shapes without the need for a large 3D shape basis. In addition, our proposed network can be trained in an end-to-end framework on entirely synthetic data from the 300W-LP dataset. Unlike other 3D methods, our approach only requires one pass through the network resulting in a faster than real-time alignment. 
Evaluations of our model on the Annotated Facial Landmarks in the Wild (AFLW) and AFLW2000-3D datasets show our method achieves state-of-the-art performance over other 3D approaches to alignment.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bhagavatula_Faster_Than_Real-Time_ICCV_2017_paper.pdf", @@ -5984,7 +6363,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bhagavatula_2017_ICCV,\n \n author = {\n Bhagavatula,\n Chandrasekhar and Zhu,\n Chenchen and Luu,\n Khoa and Savvides,\n Marios\n},\n title = {\n Faster Than Real-Time Facial Alignment: A 3D Spatial Transformer Network Approach in Unconstrained Poses\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Filter Selection for Hyperspectral Estimation", @@ -5992,6 +6372,7 @@ "status": "Poster", "track": "main", "pid": "1427", + "author_site": "Boaz Arad; Ohad Ben-Shahar", "author": "Boaz Arad; Ohad Ben-Shahar", "abstract": "While recovery of hyperspectral signals from natural RGB images has been a recent subject of exploration, little to no consideration has been given to the camera response profiles used in the recovery process. In this paper we demonstrate that optimal selection of camera response filters may improve hyperspectral estimation accuracy by over 33%, emphasizing the importance of considering and selecting these response profiles wisely. 
Additionally, we present an evolutionary optimization methodology for optimal filter set selection from very large filter spaces, an approach that facilitates practical selection from families of customizable filters or filter optimization for multispectral cameras with more than 3 channels.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Arad_Filter_Selection_for_ICCV_2017_paper.pdf", @@ -6016,7 +6397,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Arad_2017_ICCV,\n \n author = {\n Arad,\n Boaz and Ben-Shahar,\n Ohad\n},\n title = {\n Filter Selection for Hyperspectral Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Fine-Grained Recognition in the Wild: A Multi-Task Domain Adaptation Approach", @@ -6024,6 +6406,7 @@ "status": "Poster", "track": "main", "pid": "558", + "author_site": "Timnit Gebru; Judy Hoffman; Li Fei-Fei", "author": "Timnit Gebru; Judy Hoffman; Li Fei-Fei", "abstract": "While fine-grained object recognition is an important problem in computer vision, current models are unlikely to accurately classify objects in the wild. These fully supervised models need additional annotated images to classify objects in every new scenario, a task that is infeasible. However, sources such as e-commerce websites and field guides provide annotated images for many classes. In this work, we study fine-grained domain adaptation as a step towards overcoming the dataset shift between easily acquired annotated images and the real world. Adaptation has not been studied in the fine-grained setting where annotations such as attributes could be used to increase performance. 
Our work uses an attribute based multitask adaptaion loss to increase accuracy from a baseline of 3.4% to 19% in the semi-supervised adaptation case. Prior domain adaptation works have been benchmarked on small datasets such as [45] with a total of 795 images for some domains, or simplistic datasets such as [40] consisting of digits. We perform experiments on a new challenging fine-grained dataset of cars consisting of 1, 095, 021 images of 2, 657 categories of cars drawn from e-commerce websites and Google Street View.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gebru_Fine-Grained_Recognition_in_ICCV_2017_paper.pdf", @@ -6048,7 +6431,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gebru_2017_ICCV,\n \n author = {\n Gebru,\n Timnit and Hoffman,\n Judy and Fei-Fei,\n Li\n},\n title = {\n Fine-Grained Recognition in the Wild: A Multi-Task Domain Adaptation Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "First-Person Activity Forecasting With Online Inverse Reinforcement Learning", @@ -6056,6 +6440,7 @@ "status": "Oral", "track": "main", "pid": "249", + "author_site": "Nicholas Rhinehart; Kris M. Kitani", "author": "Nicholas Rhinehart; Kris M. Kitani", "abstract": "We address the problem of incrementally modeling and forecasting long-term goals of a first-person camera wearer: what the user will do, where they will go, and what goal they seek. In contrast to prior work in trajectory forecasting, our algorithm, Darko, goes further to reason about semantic states (will I pick up an object?), and future goal states that are far both in terms of space and time. 
Darko learns and forecasts from first-person visual observations of the user's daily behaviors via an Online Inverse Reinforcement Learning (IRL) approach. Classical IRL discovers only the rewards in a batch setting, whereas Darko discovers the states, transitions, rewards, and goals of a user from streaming data. Among other results, we show Darko forecasts goals better than competing methods in both noisy and ideal settings, and our approach is theoretically and empirically no-regret.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rhinehart_First-Person_Activity_Forecasting_ICCV_2017_paper.pdf", @@ -6071,7 +6456,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rhinehart_First-Person_Activity_Forecasting_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rhinehart_First-Person_Activity_Forecasting_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Rhinehart_2017_ICCV,\n \n author = {\n Rhinehart,\n Nicholas and Kitani,\n Kris M.\n},\n title = {\n First-Person Activity Forecasting With Online Inverse Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Flip-Invariant Motion Representation", @@ -6079,6 +6465,7 @@ "status": "Poster", "track": "main", "pid": "3079", + "author_site": "Takumi Kobayashi", "author": "Takumi Kobayashi", "abstract": "In action recognition, local motion descriptors contribute to effectively representing video sequences where target actions appear in localized spatio-temporal regions. For robust recognition, those fundamental descriptors are required to be invariant against horizontal (mirror) flipping in video frames which frequently occurs due to changes of camera viewpoints and action directions, deteriorating classification performance. 
In this paper, we propose methods to render flip invariance to the local motion descriptors by two approaches. One method leverages local motion flows to ensure the invariance on input patches where the descriptors are computed. The other derives a invariant form theoretically from the flipping transformation applied to hand-crafted descriptors. The method is also extended so as to deal with ConvNet descriptors through learning the invariant form based on data. The experimental results on human action classification show that the proposed methods favorably improve performance both of the handcrafted and the ConvNet descriptors.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kobayashi_Flip-Invariant_Motion_Representation_ICCV_2017_paper.pdf", @@ -6101,7 +6488,8 @@ "aff_unique_url": "https://www.aist.go.jp", "aff_unique_abbr": "AIST", "aff_country_unique_index": "0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Kobayashi_2017_ICCV,\n \n author = {\n Kobayashi,\n Takumi\n},\n title = {\n Flip-Invariant Motion Representation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Flow-Guided Feature Aggregation for Video Object Detection", @@ -6109,6 +6497,7 @@ "status": "Poster", "track": "main", "pid": "132", + "author_site": "Xizhou Zhu; Yujie Wang; Jifeng Dai; Lu Yuan; Yichen Wei", "author": "Xizhou Zhu; Yujie Wang; Jifeng Dai; Lu Yuan; Yichen Wei", "abstract": "Extending state-of-the-art object detectors from image to video is challenging. The accuracy of detection suffers from degenerated object appearances in videos, e.g., motion blur, video defocus, rare poses, etc. Existing work attempts to exploit temporal information on box level, but such methods are not trained end-to-end. 
We present flow-guided feature aggregation, an accurate and end-to-end learning framework for video object detection. It leverages temporal coherence on feature level instead. It improves the per-frame features by aggregation of nearby features along the motion paths, and thus improves the video recognition accuracy. Our method significantly improves upon strong single-frame baselines in ImageNet VID, especially for more challenging fast moving objects. Our framework is principled, and on par with the best engineered systems winning the ImageNet VID challenges 2016, without additional bells-and-whistles.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Flow-Guided_Feature_Aggregation_ICCV_2017_paper.pdf", @@ -6123,7 +6512,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Flow-Guided_Feature_Aggregation_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Flow-Guided_Feature_Aggregation_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Xizhou and Wang,\n Yujie and Dai,\n Jifeng and Yuan,\n Lu and Wei,\n Yichen\n},\n title = {\n Flow-Guided Feature Aggregation for Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Focal Loss for Dense Object Detection", @@ -6131,7 +6521,7 @@ "status": "Oral", "track": "main", "pid": "1902", - "author_site": "Tsung-Yi Lin; Priya Goyal; Ross Girshick; Kaiming He; Piotr Doll\u00c3\u00a1r", + "author_site": "Tsung-Yi Lin; Priya Goyal; Ross Girshick; Kaiming He; Piotr Dollár", "author": "Tsung-Yi Lin; Priya Goyal; Ross Girshick; Kaiming He; Piotr Dollar", "abstract": "The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object 
locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. In this paper, we investigate why this is the case. We discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. We propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. Our novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. To evaluate the effectiveness of our loss, we design and train a simple dense detector we call RetinaNet. Our results show that when trained with the focal loss, RetinaNet is able to match the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lin_Focal_Loss_for_ICCV_2017_paper.pdf", @@ -6149,14 +6539,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lin_Focal_Loss_for_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2017_ICCV,\n \n author = {\n Lin,\n Tsung-Yi and Goyal,\n Priya and Girshick,\n Ross and He,\n Kaiming and Dollar,\n Piotr\n},\n title = {\n Focal Loss for Dense Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Focal Track: Depth and Accommodation With Oscillating Lens Deformation", @@ -6164,6 +6555,7 @@ "status": "Poster", "track": "main", "pid": "461", + "author_site": "Qi Guo; Emma Alexander; Todd Zickler", "author": "Qi Guo; Emma Alexander; Todd Zickler", "abstract": "The focal track sensor is a monocular and computationally efficient depth sensor that is based on defocus controlled by a liquid membrane lens. It synchronizes small lens oscillations with a photosensor to produce real-time depth maps by means of differential defocus, and it couples these oscillations with bigger lens deformations that adapt the defocus working range to track objects over large axial distances. To create the focal track sensor, we derive a texture-invariant family of equations that relate image derivatives to scene depth when a lens changes its focal length differentially. Based on these equations, we design a feed-forward sequence of computations that: robustly incorporates image derivatives at multiple scales; produces confidence maps along with depth; and can be trained end-to-end to mitigate against noise, aberrations, and other non-idealities. 
Our prototype with 1-inch optics produces depth and confidence maps at 100 frames per second over an axial range of more than 75cm.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Guo_Focal_Track_Depth_ICCV_2017_paper.pdf", @@ -6188,7 +6580,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Guo_2017_ICCV,\n \n author = {\n Guo,\n Qi and Alexander,\n Emma and Zickler,\n Todd\n},\n title = {\n Focal Track: Depth and Accommodation With Oscillating Lens Deformation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Focusing Attention: Towards Accurate Text Recognition in Natural Images", @@ -6196,6 +6589,7 @@ "status": "Poster", "track": "main", "pid": "2376", + "author_site": "Zhanzhan Cheng; Fan Bai; Yunlu Xu; Gang Zheng; Shiliang Pu; Shuigeng Zhou", "author": "Zhanzhan Cheng; Fan Bai; Yunlu Xu; Gang Zheng; Shiliang Pu; Shuigeng Zhou", "abstract": "Scene text recognition has been a hot research topic in computer vision due to its various applications. The state of the art is the attention-based encoder-decoder framework that learns the mapping between input images and output sequences in a purely data-driven way. However, we observe that existing attention-based methods perform poorly on complicated and/or low-quality images. One major reason is that existing methods cannot get accurate alignments between feature areas and targets for such images. We call this phenomenon \"attention drift\". To tackle this problem, in this paper we propose the FAN (the abbreviation of Focusing Attention Network) method that employs a focusing attention mechanism to automatically draw back the drifted attention. 
FAN consists of two major components: an attention network (AN) that is responsible for recognizing character targets as in the existing methods, and a focusing network (FN) that is responsible for adjusting attention by evaluating whether AN pays attention properly on the target areas in the images. Furthermore, different from the existing methods, we adopt a ResNet-based network to enrich deep representations of scene text images. Extensive experiments on various benchmarks, including the IIIT5k, SVT and ICDAR datasets, show that the FAN method substantially outperforms the existing methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cheng_Focusing_Attention_Towards_ICCV_2017_paper.pdf", @@ -6213,14 +6607,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Cheng_Focusing_Attention_Towards_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0;0;1", - "aff_unique_norm": "Hikvision Research Institute;Fudan University;Shanghai Jiao Tong University", + "aff_unique_norm": "Hikvision Research Institute;Fudan University;Shanghai Jiaotong University", "aff_unique_dep": ";School of Computer Science;", "aff_unique_url": "https://www.hikvision.com/cn/;https://www.fudan.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "Hikvision;Fudan;SJTU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2017_ICCV,\n \n author = {\n Cheng,\n Zhanzhan and Bai,\n Fan and Xu,\n Yunlu and Zheng,\n Gang and Pu,\n Shiliang and Zhou,\n Shuigeng\n},\n title = {\n Focusing Attention: Towards Accurate Text Recognition in Natural Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Following Gaze in Video", @@ -6228,7 +6623,7 @@ "status": "Poster", "track": "main", "pid": 
"561", - "author_site": "Adri\u00c3\u00a0 Recasens; Carl Vondrick; Aditya Khosla; Antonio Torralba", + "author_site": "Adrià Recasens; Carl Vondrick; Aditya Khosla; Antonio Torralba", "author": "Adria Recasens; Carl Vondrick; Aditya Khosla; Antonio Torralba", "abstract": "Following the gaze of people inside videos is an important signal for understanding people and their actions. In this paper, we present an approach for following gaze in video by predicting where a person (in the video) is looking even when the object is in a different frame. We collect VideoGaze, a new dataset which we use as a benchmark to both train and evaluate models. Given one frame with a person in it, our model estimates a density for gaze location in every frame and the probability that the person is looking in that particular frame. A key aspect of our approach is an end-to-end model that jointly estimates: saliency, gaze pose, and geometric relationships between views while only using gaze as supervision. Visualizations suggest that the model learns to internally solve these intermediate tasks automatically without additional supervision. 
Experiments show that our approach follows gaze in video better than existing approaches, enabling a richer understanding of human activities in video.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Recasens_Following_Gaze_in_ICCV_2017_paper.pdf", @@ -6253,7 +6648,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Recasens_2017_ICCV,\n \n author = {\n Recasens,\n Adria and Vondrick,\n Carl and Khosla,\n Aditya and Torralba,\n Antonio\n},\n title = {\n Following Gaze in Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "FoveaNet: Perspective-Aware Urban Scene Parsing", @@ -6261,6 +6657,7 @@ "status": "Oral", "track": "main", "pid": "830", + "author_site": "Xin Li; Zequn Jie; Wei Wang; Changsong Liu; Jimei Yang; Xiaohui Shen; Zhe Lin; Qiang Chen; Shuicheng Yan; Jiashi Feng", "author": "Xin Li; Zequn Jie; Wei Wang; Changsong Liu; Jimei Yang; Xiaohui Shen; Zhe Lin; Qiang Chen; Shuicheng Yan; Jiashi Feng", "abstract": "Parsing urban scene images is critical for self-driving. Most of current solutions employ generic image parsing models that treat all scales and locations in the images equally and do not consider the geometry property of car-captured urban scene images. Thus, they suffer from heterogeneous object scales caused by perspective projection of cameras on actual scenes and inevitably encounter parsing failures on distant objects as well as other boundary and recognition errors. In this work, we propose a new FoveaNet model to fully exploit the perspective geometry of scene images and address the common failures of generic parsing models. 
FoveaNet estimates the perspective geometry of a scene image through a convolutional network which integrates supportive evidence from contextual objects within the image. Based on the perspective geometry information, FoveaNet \"undoes\" the camera perspective projection--analyzing regions in the space of the actual scene, and thus provides much more reliable parsing results. Furthermore, to effectively address the recognition errors, FoveaNet introduces a new dense CRF model that takes the perspective geometry as a prior potential. We evaluate FoveaNet on two urban scene parsing datasets, Cityspaces and CamVid, which demonstrates that FoveaNet can outperform all the well-established baselines and provide new state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_FoveaNet_Perspective-Aware_Urban_ICCV_2017_paper.pdf", @@ -6276,7 +6673,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_FoveaNet_Perspective-Aware_Urban_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_FoveaNet_Perspective-Aware_Urban_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Xin and Jie,\n Zequn and Wang,\n Wei and Liu,\n Changsong and Yang,\n Jimei and Shen,\n Xiaohui and Lin,\n Zhe and Chen,\n Qiang and Yan,\n Shuicheng and Feng,\n Jiashi\n},\n title = {\n FoveaNet: Perspective-Aware Urban Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "From Point Clouds to Mesh Using Regression", @@ -6284,11 +6682,11 @@ "status": "Poster", "track": "main", "pid": "1895", - "author_site": "\u00c4\u00bdubor Ladick\u00c3\u00bd; Olivier Saurer; SoHyeon Jeong; Fabio Maninchedda; Marc Pollefeys", + "author_site": "Ľubor Ladický; Olivier Saurer; SoHyeon Jeong; Fabio Maninchedda; Marc 
Pollefeys", "author": "Lubor Ladicky; Olivier Saurer; SoHyeon Jeong; Fabio Maninchedda; Marc Pollefeys", "abstract": "Surface reconstruction from a point cloud is a standard subproblem in many algorithms for dense 3D reconstruction from RGB images or depth maps. Methods, performing only local operations in the vicinity of individual points, are very fast, but reconstructed models typically contain lots of visually unpleasant holes. On the other hand, regularized volumetric approaches, formulated as a global optimization, are typically too slow for real-time interactive applications. We propose to use a regression forest based method, which predicts the projection of a grid point to the surface, depending on the spatial configuration of point density in the grid point neighborhood. We designed a suitable feature vector and efficient oct-tree based GPU evaluation, capable of predicting surface of high resolution 3D models in milliseconds. Our method learns and predicts surfaces from an observed point cloud sparser than the evaluation grid, and therefore effectively acts as a regularizer.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ladicky_From_Point_Clouds_ICCV_2017_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich+Microsoft", + "aff": "ETH Zürich; ETH Zürich; ETH Zürich; ETH Zürich; ETH Zürich+Microsoft", "project": "", "github": "", "supp": "", @@ -6302,14 +6700,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ladicky_From_Point_Clouds_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0+1", - "aff_unique_norm": "ETH Zurich;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "ETH Zürich;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.microsoft.com", "aff_unique_abbr": "ETHZ;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0;0+1", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Ladicky_2017_ICCV,\n \n author = {\n Ladicky,\n Lubor and Saurer,\n Olivier and Jeong,\n SoHyeon and Maninchedda,\n Fabio and Pollefeys,\n Marc\n},\n title = {\n From Point Clouds to Mesh Using Regression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "From RGB to Spectrum for Natural Scenes via Manifold-Based Mapping", @@ -6317,6 +6716,7 @@ "status": "Poster", "track": "main", "pid": "2236", + "author_site": "Yan Jia; Yinqiang Zheng; Lin Gu; Art Subpa-Asa; Antony Lam; Yoichi Sato; Imari Sato", "author": "Yan Jia; Yinqiang Zheng; Lin Gu; Art Subpa-Asa; Antony Lam; Yoichi Sato; Imari Sato", "abstract": "Spectral analysis of natural scenes can provide much more detailed information about the scene than an ordinary RGB camera. The richer information provided by hyperspectral images has been beneficial to numerous applications, such as understanding natural environmental changes and classifying plants and soils in agriculture based on their spectral properties. In this paper, we present an efficient manifold learning based method for accurately reconstructing a hyperspectral image from a single RGB image captured by a commercial camera with known spectral response. By applying a nonlinear dimensionality reduction technique to a large set of natural spectra, we show that the spectra of natural scenes lie on an intrinsically low dimensional manifold. This allows us to map an RGB vector to its corresponding hyperspectral vector accurately via our proposed novel manifold-based reconstruction pipeline. 
Experiments using both synthesized RGB images using hyperspectral datasets and real world data demonstrate our method outperforms the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jia_From_RGB_to_ICCV_2017_paper.pdf", @@ -6332,7 +6732,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jia_From_RGB_to_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jia_From_RGB_to_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Jia_2017_ICCV,\n \n author = {\n Jia,\n Yan and Zheng,\n Yinqiang and Gu,\n Lin and Subpa-Asa,\n Art and Lam,\n Antony and Sato,\n Yoichi and Sato,\n Imari\n},\n title = {\n From RGB to Spectrum for Natural Scenes via Manifold-Based Mapping\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "From Square Pieces to Brick Walls: The Next Challenge in Solving Jigsaw Puzzles", @@ -6340,6 +6741,7 @@ "status": "Poster", "track": "main", "pid": "1722", + "author_site": "Shir Gur; Ohad Ben-Shahar", "author": "Shir Gur; Ohad Ben-Shahar", "abstract": "Research into computational jigsaw puzzle solving, an emerging theoretical problem with numerous applications, has focused in recent years on puzzles that constitute square pieces only. In this paper we wish to extend the scientific scope of appearance-based puzzle solving and consider \"brick wall\" jigsaw puzzles - rectangular pieces who may have different sizes, and could be placed next to each other at arbitrary offset along their abutting edge -- a more explicit configuration with propertie of real world puzzles. We present the new challenges that arise in brick wall puzzles and address them in two stages. First we concentrate on the reconstruction of the puzzle (with or without missing pieces) assuming an oracle for offset assignments. 
We show that despite the increased complexity of the problem, under these conditions performance can be made comparable to the state-of-the-art in solving the simpler square piece puzzles, and thereby argue that solving brick wall puzzles may be reduced to finding the correct offset between two neighboring pieces. We then move on to focus on implementing the oracle computationally using a mixture of dissimilarity metrics and correlation matching. We show results on various brick wall puzzles and discuss how our work may start a new research path for the puzzle solving community.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gur_From_Square_Pieces_ICCV_2017_paper.pdf", @@ -6364,7 +6766,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Gur_2017_ICCV,\n \n author = {\n Gur,\n Shir and Ben-Shahar,\n Ohad\n},\n title = {\n From Square Pieces to Brick Walls: The Next Challenge in Solving Jigsaw Puzzles\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "GANs for Biological Image Synthesis", @@ -6372,10 +6775,11 @@ "status": "Spotlight", "track": "main", "pid": "606", + "author_site": "Anton Osokin; Anatole Chessel; Rafael E. Carazo Salas; Federico Vaggi", "author": "Anton Osokin; Anatole Chessel; Rafael E. Carazo Salas; Federico Vaggi", "abstract": "In this paper, we propose a novel application of Generative Adversarial Networks (GAN) to the synthesis of cells imaged by fluorescence microscopy. Compared to natural images, cells tend to have a simpler and more geometric global structure that facilitates image generation. 
However, the correlation between the spatial pattern of different fluorescent proteins reflects important biological functions, and synthesized images have to capture these relationships to be relevant for biological applications. We adapt GANs to the task at hand and propose new models with casual dependencies between image channels that can generate multi-channel images, which would be impossible to obtain experimentally. We evaluate our approach using two independent techniques and compare it against sensible baselines. Finally, we demonstrate that by interpolating across the latent space we can mimic the known changes in protein localization that occur through time during the cell cycle, allowing us to predict temporal evolution from static images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Osokin_GANs_for_Biological_ICCV_2017_paper.pdf", - "aff": "INRIA/ENS\u2217, France+HSE\u2020, Russia; \u00b4Ecole Polytechnique\u2021, France; University of Bristol, UK; ENS\u2217, France+Amazon, USA", + "aff": "INRIA/ENS∗, France+HSE†, Russia; École Polytechnique‡, France; University of Bristol, UK; ENS∗, France+Amazon, USA", "project": "", "github": "", "supp": "", @@ -6389,14 +6793,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Osokin_GANs_for_Biological_ICCV_2017_paper.html", "aff_unique_index": "0+1;2;3;4+5", - "aff_unique_norm": "INRIA;Higher School of Economics;Ecole Polytechnique;University of Bristol;\u00c9cole Normale Sup\u00e9rieure;Amazon", - "aff_unique_dep": ";;;;;Amazon.com, Inc.", + "aff_unique_norm": "INRIA;Higher School of Economics;Ecole Polytechnique;University of Bristol;École Normale Supérieure;Amazon.com, Inc.", + "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.inria.fr;https://www.hse.ru;https://www.polytechnique.edu;https://www.bristol.ac.uk;https://www.ens.fr;https://www.amazon.com", "aff_unique_abbr": "INRIA;HSE;Polytechnique;Bristol;ENS;Amazon", "aff_campus_unique_index": ";", 
"aff_campus_unique": "", "aff_country_unique_index": "0+1;0;2;0+3", - "aff_country_unique": "France;Russian Federation;United Kingdom;United States" + "aff_country_unique": "France;Russia;United Kingdom;United States", + "bibtex": "@InProceedings{Osokin_2017_ICCV,\n \n author = {\n Osokin,\n Anton and Chessel,\n Anatole and Carazo Salas,\n Rafael E. and Vaggi,\n Federico\n},\n title = {\n GANs for Biological Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "GPLAC: Generalizing Vision-Based Robotic Skills Using Weakly Labeled Images", @@ -6404,6 +6809,7 @@ "status": "Poster", "track": "main", "pid": "2638", + "author_site": "Avi Singh; Larry Yang; Sergey Levine", "author": "Avi Singh; Larry Yang; Sergey Levine", "abstract": "We tackle the problem of learning robotic sensorimotor control policies that can generalize to visually diverse and unseen environments. Achieving broad generalization typically requires large datasets, which are difficult to obtain for task-specific interactive processes such as reinforcement learning or learning from demonstration. However, much of the visual diversity in the world can be captured through passively collected datasets of images or videos. In our method, which we refer to as GPLAC (Generalized Policy Learning with Attentional Classifier), we use both interaction data and weakly labeled image data to augment the generalization capacity of sensorimotor policies. Our method combines multitask learning on action selection and an auxiliary binary classification objective, together with a convolutional neural network architecture that uses an attentional mechanism to avoid distractors. 
We show that pairing interaction data from just a single environment with a diverse dataset of weakly labeled data results in greatly improved generalization to unseen environments, and show that this generalization depends on both the auxiliary objective and the attentional architecture that we propose. We demonstrate our results in both simulation and on a real robotic manipulator, and demonstrate substantial improvement over standard convolutional architectures and domain adaptation methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Singh_GPLAC_Generalizing_Vision-Based_ICCV_2017_paper.pdf", @@ -6419,7 +6825,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Singh_GPLAC_Generalizing_Vision-Based_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Singh_GPLAC_Generalizing_Vision-Based_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Singh_2017_ICCV,\n \n author = {\n Singh,\n Avi and Yang,\n Larry and Levine,\n Sergey\n},\n title = {\n GPLAC: Generalizing Vision-Based Robotic Skills Using Weakly Labeled Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Generalized Orderless Pooling Performs Implicit Salient Matching", @@ -6427,6 +6834,7 @@ "status": "Poster", "track": "main", "pid": "2483", + "author_site": "Marcel Simon; Yang Gao; Trevor Darrell; Joachim Denzler; Erik Rodner", "author": "Marcel Simon; Yang Gao; Trevor Darrell; Joachim Denzler; Erik Rodner", "abstract": "Most recent CNN architectures use average pooling as a final feature encoding step. In the field of fine-grained recognition, however, recent global representations like bilinear pooling offer improved performance. In this paper, we generalize average and bilinear pooling to \"alpha-pooling\", allowing for learning the pooling strategy during training. 
In addition, we present a novel way to visualize decisions made by these approaches. We identify parts of training images having the highest influence on the prediction of a given test image. This allows for justifying decisions to users and also for analyzing the influence of semantic parts. For example, we can show that the higher capacity VGG16 model focuses much more on the bird's head than, e.g., the lower-capacity VGG-M model when recognizing fine-grained bird categories. Both contributions allow us to analyze the difference when moving between average and bilinear pooling. In addition, experiments show that our generalized approach can outperform both across a variety of standard datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Simon_Generalized_Orderless_Pooling_ICCV_2017_paper.pdf", @@ -6451,7 +6859,8 @@ "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Jena;Berkeley;", "aff_country_unique_index": "0;1;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Simon_2017_ICCV,\n \n author = {\n Simon,\n Marcel and Gao,\n Yang and Darrell,\n Trevor and Denzler,\n Joachim and Rodner,\n Erik\n},\n title = {\n Generalized Orderless Pooling Performs Implicit Salient Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Generating High-Quality Crowd Density Maps Using Contextual Pyramid CNNs", @@ -6459,6 +6868,7 @@ "status": "Poster", "track": "main", "pid": "717", + "author_site": "Vishwanath A. Sindagi; Vishal M. Patel", "author": "Vishwanath A. Sindagi; Vishal M. Patel", "abstract": "We present a novel method called Contextual Pyramid CNN (CP-CNN) for generating high-quality crowd density and count estimation by explicitly incorporating global and local contextual information of crowd images. 
The proposed CP-CNN consists of four modules: Global Context Estimator (GCE), Local Context Estimator (LCE), Density Map Estimator (DME) and a Fusion-CNN (F-CNN). GCE is a VGG-16 based CNN that encodes global context and it is trained to classify input images into different density classes, whereas LCE is another CNN that encodes local context information and it is trained to perform patch-wise classification of input images into different density classes. DME is a multi-column architecture-based CNN that aims to generate high-dimensional feature maps from the input image which are fused with the contextual information estimated by GCE and LCE using F-CNN. To generate high resolution and high-quality density maps, F-CNN uses a set of convolutional and fractionally-strided convolutional layers and it is trained along with the DME in an end-to-end fashion using a combination of adversarial loss and pixel-level Euclidean loss. Extensive experiments on highly challenging datasets show that the proposed method achieves significant improvements over the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sindagi_Generating_High-Quality_Crowd_ICCV_2017_paper.pdf", @@ -6474,7 +6884,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sindagi_Generating_High-Quality_Crowd_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sindagi_Generating_High-Quality_Crowd_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Sindagi_2017_ICCV,\n \n author = {\n Sindagi,\n Vishwanath A. 
and Patel,\n Vishal M.\n},\n title = {\n Generating High-Quality Crowd Density Maps Using Contextual Pyramid CNNs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Generative Adversarial Networks Conditioned by Brain Signals", @@ -6482,6 +6893,7 @@ "status": "Poster", "track": "main", "pid": "1371", + "author_site": "Simone Palazzo; Concetto Spampinato; Isaak Kavasidis; Daniela Giordano; Mubarak Shah", "author": "Simone Palazzo; Concetto Spampinato; Isaak Kavasidis; Daniela Giordano; Mubarak Shah", "abstract": "Recent advancements in generative adversarial networks (GANs), using deep convolutional models, have supported the development of image generation techniques able to reach satisfactory levels of realism. Further improvements have been proposed to condition GANs to generate images matching a specific object category or a short text description. In this work, we build on the latter class of approaches and investigate the possibility of driving and conditioning the image generation process by means of brain signals recorded, through an electroencephalograph (EEG), while users look at images from a set of 40 ImageNet object categories with the objective of generating the seen images. To accomplish this task, we first demonstrate that brain activity EEG signals encode visually-related information that allows us to accurately discriminate between visual object categories and, accordingly, we extract a more compact class-dependent representation of EEG data using recurrent neural networks. Afterwards, we use the learned EEG manifold to condition image generation employing GANs, which, during inference, will read EEG signals and convert them into images. We tested our generative approach using EEG signals recorded from six subjects while looking at images of the aforementioned 40 visual classes. 
The results show that for classes represented by well-defined visual patterns (e.g., pandas, airplane, etc.), the generated images are realistic and highly resemble those evoking the EEG signals used for conditioning GANs, resulting in an actual reading-the-mind process.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Palazzo_Generative_Adversarial_Networks_ICCV_2017_paper.pdf", @@ -6506,7 +6918,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Central Florida", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "Italy;United States" + "aff_country_unique": "Italy;United States", + "bibtex": "@InProceedings{Palazzo_2017_ICCV,\n \n author = {\n Palazzo,\n Simone and Spampinato,\n Concetto and Kavasidis,\n Isaak and Giordano,\n Daniela and Shah,\n Mubarak\n},\n title = {\n Generative Adversarial Networks Conditioned by Brain Signals\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Generative Modeling of Audible Shapes for Object Perception", @@ -6514,6 +6927,7 @@ "status": "Poster", "track": "main", "pid": "426", + "author_site": "Zhoutong Zhang; Jiajun Wu; Qiujia Li; Zhengjia Huang; James Traer; Josh H. McDermott; Joshua B. Tenenbaum; William T. Freeman", "author": "Zhoutong Zhang; Jiajun Wu; Qiujia Li; Zhengjia Huang; James Traer; Josh H. McDermott; Joshua B. Tenenbaum; William T. Freeman", "abstract": "Humans infer rich knowledge of objects from both auditory and visual cues. Building a machine of such competency, however, is very challenging, due to the great difficulty in capturing large-scale, clean data of objects with both their appearance and the sound they make. In this paper, we present a novel, open-source pipeline that generates audio-visual data, purely from 3D object shapes and their physical properties. 
Through comparison with audio recordings and human behavioral studies, we validate the accuracy of the sounds it generates. Using this generative model, we are able to construct a synthetic audio-visual dataset, namely Sound-20K, for object perception tasks. We demonstrate that auditory and visual information play complementary roles in object perception, and further, that the representation learned on synthetic audio-visual data can transfer to real-world scenarios.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Generative_Modeling_of_ICCV_2017_paper.pdf", @@ -6529,7 +6943,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Generative_Modeling_of_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Generative_Modeling_of_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Zhoutong and Wu,\n Jiajun and Li,\n Qiujia and Huang,\n Zhengjia and Traer,\n James and McDermott,\n Josh H. and Tenenbaum,\n Joshua B. and Freeman,\n William T.\n},\n title = {\n Generative Modeling of Audible Shapes for Object Perception\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Genetic CNN", @@ -6537,6 +6952,7 @@ "status": "Poster", "track": "main", "pid": "569", + "author_site": "Lingxi Xie; Alan Yuille", "author": "Lingxi Xie; Alan Yuille", "abstract": "The deep convolutional neural network (CNN) is the state-of-the-art solution for large-scale visual recognition. Following some basic principles such as increasing network depth and constructing highway connections, researchers have manually designed a lot of fixed network architectures and verified their effectiveness. In this paper, we discuss the possibility of learning deep network structures automatically. 
Note that the number of possible network structures increases exponentially with the number of layers in the network, which motivates us to adopt the genetic algorithm to efficiently explore this large search space. The core idea is to propose an encoding method to represent each network structure in a fixed-length binary string. The genetic algorithm is initialized by generating a set of randomized individuals. In each generation, we define standard genetic operations, e.g., selection, mutation and crossover, to generate competitive individuals and eliminate weak ones. The competitiveness of each individual is defined as its recognition accuracy, which is obtained via a standalone training process on a reference dataset. We run the genetic process on CIFAR10, a small-scale dataset, demonstrating its ability to find high-quality structures which are little studied before. The learned powerful structures are also transferrable to the ILSVRC2012 dataset for large-scale visual recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xie_Genetic_CNN_ICCV_2017_paper.pdf", @@ -6551,7 +6967,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xie_Genetic_CNN_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xie_Genetic_CNN_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Xie_2017_ICCV,\n \n author = {\n Xie,\n Lingxi and Yuille,\n Alan\n},\n title = {\n Genetic CNN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Globally-Optimal Inlier Set Maximisation for Simultaneous Camera Pose and Feature Correspondence", @@ -6559,10 +6976,11 @@ "status": "Oral", "track": "main", "pid": "2218", + "author_site": "Dylan Campbell; Lars Petersson; Laurent Kneip; Hongdong Li", "author": "Dylan Campbell; Lars Petersson; Laurent Kneip; Hongdong Li", "abstract": 
"Estimating the 6-DoF pose of a camera from a single image relative to a pre-computed 3D point-set is an important task for many computer vision applications. Perspective-n-Point (PnP) solvers are routinely used for camera pose estimation, provided that a good quality set of 2D-3D feature correspondences are known beforehand. However, finding optimal correspondences between 2D key-points and a 3D point-set is non-trivial, especially when only geometric (position) information is known. Existing approaches to the simultaneous pose and correspondence problem use local optimisation, and are therefore unlikely to find the optimal solution without a good pose initialisation, or introduce restrictive assumptions. Since a large proportion of outliers are common for this problem, we instead propose a globally-optimal inlier set cardinality maximisation approach which jointly estimates optimal camera pose and optimal correspondences. Our approach employs branch-and-bound to search the 6D space of camera poses, guaranteeing global optimality without requiring a pose prior. The geometry of SE(3) is used to find novel upper and lower bounds for the number of inliers and local optimisation is integrated to accelerate convergence. 
The evaluation empirically supports the optimality proof and shows that the method performs much more robustly than existing approaches, including on a large-scale outdoor data-set.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Campbell_Globally-Optimal_Inlier_Set_ICCV_2017_paper.pdf", - "aff": "Australian National University* + Data61 \u2013 CSIRO; Australian National University* + Data61 \u2013 CSIRO; Australian National University*; Australian National University*", + "aff": "Australian National University* + Data61 – CSIRO; Australian National University* + Data61 – CSIRO; Australian National University*; Australian National University*", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Campbell_Globally-Optimal_Inlier_Set_ICCV_2017_supplemental.pdf", @@ -6583,7 +7001,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Campbell_2017_ICCV,\n \n author = {\n Campbell,\n Dylan and Petersson,\n Lars and Kneip,\n Laurent and Li,\n Hongdong\n},\n title = {\n Globally-Optimal Inlier Set Maximisation for Simultaneous Camera Pose and Feature Correspondence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Going Unconstrained With Rolling Shutter Deblurring", @@ -6591,6 +7010,7 @@ "status": "Poster", "track": "main", "pid": "1689", + "author_site": "Mahesh Mohan M. R.; A. N. Rajagopalan; Gunasekaran Seetharaman", "author": "Mahesh Mohan M. R.; A. N. Rajagopalan; Gunasekaran Seetharaman", "abstract": "Most present-day imaging devices are equipped with CMOS sensors. Motion blur is a common artifact in hand-held cameras. Because CMOS sensors mostly employ a rolling shutter (RS), the motion deblurring problem takes on a new dimension. 
Although few works have recently addressed this problem, they suffer from many constraints including heavy computational cost, need for precise sensor information, and inability to deal with wide-angle systems (which most cell-phone and drone cameras are) and irregular camera trajectory. In this work, we propose a model for RS blind motion deblurring that mitigates these issues significantly. Comprehensive comparisons with state-of-the-art methods reveal that our approach not only exhibits significant computational gains and unconstrained functionality but also leads to improved deblurring performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/R._Going_Unconstrained_With_ICCV_2017_paper.pdf", @@ -6615,7 +7035,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madras;", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{R._2017_ICCV,\n \n author = {\n Mohan,\n Mahesh M. R. and Rajagopalan,\n A. N. and Seetharaman,\n Gunasekaran\n},\n title = {\n Going Unconstrained With Rolling Shutter Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Grad-CAM: Visual Explanations From Deep Networks via Gradient-Based Localization", @@ -6623,6 +7044,7 @@ "status": "Poster", "track": "main", "pid": "222", + "author_site": "Ramprasaath R. Selvaraju; Michael Cogswell; Abhishek Das; Ramakrishna Vedantam; Devi Parikh; Dhruv Batra", "author": "Ramprasaath R. Selvaraju; Michael Cogswell; Abhishek Das; Ramakrishna Vedantam; Devi Parikh; Dhruv Batra", "abstract": "We propose a technique for producing 'visual explanations' for decisions from a large class of Convolutional Neural Network (CNN)-based models, making them more transparent. 
Our approach - Gradient-weighted Class Activation Mapping (Grad-CAM), uses the gradients of any target concept (say logits for 'dog' or even a caption), flowing into the final convolutional layer to produce a coarse localization map highlighting the important regions in the image for predicting the concept. Unlike previous approaches, Grad-CAM is applicable to a wide variety of CNN model-families: (1) CNNs with fully-connected layers (e.g. VGG), (2) CNNs used for structured outputs (e.g. captioning), (3) CNNs used in tasks with multi-modal inputs (e.g. VQA) or reinforcement learning, and needs no architectural changes or re-training. We combine Grad-CAM with existing fine-grained visualizations to create a high-resolution class-discriminative visualization and apply it to image classification, image captioning, and visual question answering (VQA) models, including ResNet-based architectures. In the context of image classification models, our visualizations (a) lend insights into failure modes of these models (showing that seemingly unreasonable predictions have reasonable explanations), (b) outperform previous methods on the ILSVRC-15 weakly-supervised localization task, (c) are more faithful to the underlying model, and (d) help achieve model generalization by identifying dataset bias. For image captioning and VQA, our visualizations show that even non-attention based models can localize inputs. Finally, we design and conduct human studies to measure if Grad-CAM explanations help users establish appropriate trust in predictions from deep networks and show that Grad-CAM helps untrained users successfully discern a 'stronger' deep network from a 'weaker' one even when both make identical predictions. 
Our code is available at https://github.com/ramprs/grad-cam/ along with a demo on CloudCV [2] 1 and video at youtu.be/COjUB9Izk6E.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Selvaraju_Grad-CAM_Visual_Explanations_ICCV_2017_paper.pdf", @@ -6640,14 +7062,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Selvaraju_Grad-CAM_Visual_Explanations_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0+1;0+1", - "aff_unique_norm": "Georgia Institute of Technology;Meta", + "aff_unique_norm": "Georgia Institute of Technology;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "Georgia Tech;FAIR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Selvaraju_2017_ICCV,\n \n author = {\n Selvaraju,\n Ramprasaath R. and Cogswell,\n Michael and Das,\n Abhishek and Vedantam,\n Ramakrishna and Parikh,\n Devi and Batra,\n Dhruv\n},\n title = {\n Grad-CAM: Visual Explanations From Deep Networks via Gradient-Based Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Group Re-Identification via Unsupervised Transfer of Sparse Features Encoding", @@ -6655,6 +7078,7 @@ "status": "Poster", "track": "main", "pid": "1037", + "author_site": "Giuseppe Lisanti; Niki Martinel; Alberto Del Bimbo; Gian Luca Foresti", "author": "Giuseppe Lisanti; Niki Martinel; Alberto Del Bimbo; Gian Luca Foresti", "abstract": "Person re-identification is best known as the problem of associating a single person that is observed from one or more disjoint cameras. 
The existing literature has mainly addressed such an issue, neglecting the fact that people usually move in groups, like in crowded scenarios. We believe that the additional information carried by neighboring individuals provides a relevant visual context that can be exploited to obtain a more robust match of single persons within the group. Despite this, re-identifying groups of people compound the common single person re-identification problems by introducing changes in the relative position of persons within the group and severe self-occlusions. In this paper, we propose a solution for group re-identification that grounds on transferring knowledge from single person re-identification to group re-identification by exploiting sparse dictionary learning. First, a dictionary of sparse atoms is learned using patches extracted from single person images. Then, the learned dictionary is exploited to obtain a sparsity-driven residual group representation, which is finally matched to perform the re-identification. 
Extensive experiments on the i-LIDS groups and two newly collected datasets show that the proposed solution outperforms state-of-the-art approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lisanti_Group_Re-Identification_via_ICCV_2017_paper.pdf", @@ -6679,7 +7103,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Lisanti_2017_ICCV,\n \n author = {\n Lisanti,\n Giuseppe and Martinel,\n Niki and Del Bimbo,\n Alberto and Foresti,\n Gian Luca\n},\n title = {\n Group Re-Identification via Unsupervised Transfer of Sparse Features Encoding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Guided Perturbations: Self-Corrective Behavior in Convolutional Neural Networks", @@ -6687,6 +7112,7 @@ "status": "Poster", "track": "main", "pid": "1444", + "author_site": "Swami Sankaranarayanan; Arpit Jain; Ser Nam Lim", "author": "Swami Sankaranarayanan; Arpit Jain; Ser Nam Lim", "abstract": "Convolutional Neural Networks have been a subject of great importance over the past decade and great strides have been made in their utility for producing state of the art performance in many computer vision problems. However, the behavior of deep networks is yet to be fully understood and is still an active area of research. In this work, we present an intriguing behavior: pre-trained CNNs can be made to improve their predictions by structurally perturbing the input. We observe that these perturbations - referred as Guided Perturbations - enable a trained network to improve its prediction performance without any learning or change in network weights. We perform various ablative experiments to understand how these perturbations affect the local context and feature representations. 
Furthermore, we demonstrate that this idea can improve performance of several existing approaches on semantic segmentation and scene labeling tasks on the PASCAL VOC dataset and supervised classification tasks on MNIST and CIFAR10 datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sankaranarayanan_Guided_Perturbations_Self-Corrective_ICCV_2017_paper.pdf", @@ -6711,7 +7137,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sankaranarayanan_2017_ICCV,\n \n author = {\n Sankaranarayanan,\n Swami and Jain,\n Arpit and Lim,\n Ser Nam\n},\n title = {\n Guided Perturbations: Self-Corrective Behavior in Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Hard-Aware Deeply Cascaded Embedding", @@ -6719,6 +7146,7 @@ "status": "Spotlight", "track": "main", "pid": "1013", + "author_site": "Yuhui Yuan; Kuiyuan Yang; Chao Zhang", "author": "Yuhui Yuan; Kuiyuan Yang; Chao Zhang", "abstract": "Riding on the waves of deep neural networks, deep metric learning has achieved promising results in various tasks by using triplet network or Siamese network. Though the basic goal of making images from the same category closer than the ones from different categories is intuitive, it is hard to optimize the objective directly due to the quadratic or cubic sample size. Hard example mining is widely used to solve the problem, which spends the expensive computation on a subset of samples that are considered hard. However, hard is defined relative to a specific model. Then complex models will treat most samples as easy ones and vice versa for simple models, both of which are not good for training. 
It is difficult to define a model with the just right complexity and choose hard examples adequately as different samples are of diverse hard levels. This motivates us to propose the novel framework named Hard-Aware Deeply Cascaded Embedding(HDC) to ensemble a set of models with different complexities in cascaded manner to mine hard examples at multiple levels. A sample is judged by a series of models with increasing complexities and only updates models that consider the sample as a hard case. The HDC is evaluated on CARS196, CUB-200-2011, Stanford Online Products, VehicleID and DeepFashion datasets, and outperforms state-of-the-art methods by a large margin.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yuan_Hard-Aware_Deeply_Cascaded_ICCV_2017_paper.pdf", @@ -6736,14 +7164,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yuan_Hard-Aware_Deeply_Cascaded_ICCV_2017_paper.html", "aff_unique_index": "0+1;2;3+1", - "aff_unique_norm": "Peking University;Shanghai Jiao Tong University;DeepMotion;Microsoft", + "aff_unique_norm": "Peking University;Shanghai Jiao Tong University;DeepMotion;Microsoft Corporation", "aff_unique_dep": "Key Laboratory of Machine Perception;Cooperative Medianet Innovation Center;;Microsoft Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.sjtu.edu.cn;;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "PKU;SJTU;;MSR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;2+0", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Yuan_2017_ICCV,\n \n author = {\n Yuan,\n Yuhui and Yang,\n Kuiyuan and Zhang,\n Chao\n},\n title = {\n Hard-Aware Deeply Cascaded Embedding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "HashNet: Deep Learning to Hash by 
Continuation", @@ -6751,6 +7180,7 @@ "status": "Poster", "track": "main", "pid": "3025", + "author_site": "Zhangjie Cao; Mingsheng Long; Jianmin Wang; Philip S. Yu", "author": "Zhangjie Cao; Mingsheng Long; Jianmin Wang; Philip S. Yu", "abstract": "Learning to hash has been widely applied to approximate nearest neighbor search for large-scale multimedia retrieval, due to its computation efficiency and retrieval quality. Deep learning to hash, which improves retrieval quality by end-to-end representation learning and hash encoding, has received increasing attention recently. Subject to the ill-posed gradient difficulty in the optimization with sign activations, existing deep learning to hash methods need to first learn continuous representations and then generate binary hash codes in a separated binarization step, which suffer from substantial loss of retrieval quality. This work presents HashNet, a novel deep architecture for deep learning to hash by continuation method with convergence guarantees, which learns exactly binary hash codes from imbalanced similarity data. The key idea is to attack the ill-posed gradient problem in optimizing deep networks with non-smooth binary activations by continuation method, in which we begin from learning an easier network with smoothed activation function and let it evolve during the training, until it eventually goes back to being the original, difficult to optimize, deep network with the sign activation function. 
Comprehensive empirical evidence shows that HashNet can generate exactly binary hash codes and yield state-of-the-art multimedia retrieval performance on standard benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cao_HashNet_Deep_Learning_ICCV_2017_paper.pdf", @@ -6766,7 +7196,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Cao_HashNet_Deep_Learning_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Cao_HashNet_Deep_Learning_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Cao_2017_ICCV,\n \n author = {\n Cao,\n Zhangjie and Long,\n Mingsheng and Wang,\n Jianmin and Yu,\n Philip S.\n},\n title = {\n HashNet: Deep Learning to Hash by Continuation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Hide-And-Seek: Forcing a Network to Be Meticulous for Weakly-Supervised Object and Action Localization", @@ -6774,6 +7205,7 @@ "status": "Poster", "track": "main", "pid": "1583", + "author_site": "Krishna Kumar Singh; Yong Jae Lee", "author": "Krishna Kumar Singh; Yong Jae Lee", "abstract": "We propose 'Hide-and-Seek', a weakly-supervised framework that aims to improve object localization in images and action localization in videos. Most existing weakly-supervised methods localize only the most discriminative parts of an object rather than all relevant parts, which leads to suboptimal performance. Our key idea is to hide patches in a training image randomly, forcing the network to seek other relevant parts when the most discriminative part is hidden. Our approach only needs to modify the input image and can work with any network designed for object localization. During testing, we do not need to hide any patches. 
Our Hide-and-Seek approach obtains superior performance compared to previous methods for weakly-supervised object localization on the ILSVRC dataset. We also demonstrate that our framework can be easily extended to weakly-supervised action localization.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Singh_Hide-And-Seek_Forcing_a_ICCV_2017_paper.pdf", @@ -6798,7 +7230,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Singh_2017_ICCV,\n \n author = {\n Singh,\n Krishna Kumar and Lee,\n Yong Jae\n},\n title = {\n Hide-And-Seek: Forcing a Network to Be Meticulous for Weakly-Supervised Object and Action Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Hierarchical Multimodal LSTM for Dense Visual-Semantic Embedding", @@ -6806,10 +7239,11 @@ "status": "Poster", "track": "main", "pid": "732", + "author_site": "Zhenxing Niu; Mo Zhou; Le Wang; Xinbo Gao; Gang Hua", "author": "Zhenxing Niu; Mo Zhou; Le Wang; Xinbo Gao; Gang Hua", "abstract": "We address the problem of dense visual-semantic embedding that maps not only full sentences and whole images but also phrases within sentences and salient regions within images into a multimodal embedding space. As a result, we can produce several region-oriented and expressive phrases rather than just an overview sentence to describe an image. In particular, we present a hierarchical structured recurrent neural network (RNN), namely Hierarchical Multimodal LSTM (HM-LSTM) model. Different from chain structured RNN, our model presents a hierarchical structure so that it can naturally build representations for phrases and image regions, and further exploit their hierarchical relations. 
Moreover, the fine-grained correspondences between phrases and image regions can be automatically learned and utilized to boost the learning of the dense embedding space. Extensive experiments on several datasets validate the efficacy of our proposed method, which compares favorably with the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Niu_Hierarchical_Multimodal_LSTM_ICCV_2017_paper.pdf", - "aff": "Alibaba Group; Xidian University; Xi\u2019an Jiaotong University; Xidian University; Microsoft Research", + "aff": "Alibaba Group; Xidian University; Xi’an Jiaotong University; Xidian University; Microsoft Research", "project": "", "github": "", "supp": "", @@ -6823,14 +7257,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Niu_Hierarchical_Multimodal_LSTM_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;1;3", - "aff_unique_norm": "Alibaba Group;Xidian University;Xi'an Jiao Tong University;Microsoft", + "aff_unique_norm": "Alibaba Group;Xidian University;Xi'an Jiaotong University;Microsoft Corporation", "aff_unique_dep": ";;;Microsoft Research", "aff_unique_url": "https://www.alibaba.com;http://www.xidian.edu.cn/;https://www.xjtu.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Alibaba;Xidian;XJTU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Niu_2017_ICCV,\n \n author = {\n Niu,\n Zhenxing and Zhou,\n Mo and Wang,\n Le and Gao,\n Xinbo and Hua,\n Gang\n},\n title = {\n Hierarchical Multimodal LSTM for Dense Visual-Semantic Embedding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "High Order Tensor Formulation for Convolutional Sparse Coding", @@ -6838,6 +7273,7 @@ "status": "Poster", 
"track": "main", "pid": "871", + "author_site": "Adel Bibi; Bernard Ghanem", "author": "Adel Bibi; Bernard Ghanem", "abstract": "Convolutional sparse coding (CSC) has gained attention for its successful role as a reconstruction and a classification tool in the computer vision and machine learning community. Current CSC methods can only reconstruct single-feature 2D images independently. However, learning multi-dimensional dictionaries and sparse codes for the reconstruction of multi-dimensional data is very important, as it examines correlations among all the data jointly. This provides more capacity for the learned dictionaries to better reconstruct data. In this paper, we propose a generic and novel formulation for the CSC problem that can handle an arbitrary order tensor of data. Backed with experimental results, our proposed formulation can not only tackle applications that are not possible with standard CSC solvers, including colored video reconstruction (5D- tensors), but it also performs favorably in reconstruction with much fewer parameters as compared to naive extensions of standard CSC to multiple features/channels.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bibi_High_Order_Tensor_ICCV_2017_paper.pdf", @@ -6862,7 +7298,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Bibi_2017_ICCV,\n \n author = {\n Bibi,\n Adel and Ghanem,\n Bernard\n},\n title = {\n High Order Tensor Formulation for Convolutional Sparse Coding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "High-Quality Correspondence and Segmentation Estimation for Dual-Lens Smart-Phone Portraits", @@ -6870,6 +7307,7 @@ "status": "Poster", "track": "main", "pid": "1350", + "author_site": "Xiaoyong Shen; Hongyun Gao; 
Xin Tao; Chao Zhou; Jiaya Jia", "author": "Xiaoyong Shen; Hongyun Gao; Xin Tao; Chao Zhou; Jiaya Jia", "abstract": "Estimating correspondence between two images and extracting the foreground object are two challenges in computer vision. With dual-lens smart phones, such as iPhone 7Plus and Huawei P9, coming into the market, two images of slightly different views provide us new information to unify the two topics. We propose a joint method to tackle them simultaneously via a joint fully connected conditional random field (CRF) framework. The regional correspondence is used to handle textureless regions in matching and make our CRF system computationally efficient. Our method is evaluated over 2,000 new image pairs, and produces promising results on challenging portrait images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shen_High-Quality_Correspondence_and_ICCV_2017_paper.pdf", @@ -6887,14 +7325,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shen_High-Quality_Correspondence_and_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;1;0+1", - "aff_unique_norm": "Tencent;Chinese University of Hong Kong", + "aff_unique_norm": "Tencent;The Chinese University of Hong Kong", "aff_unique_dep": "Youtu Lab;", "aff_unique_url": "https://www.tencent.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "Tencent;CUHK", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shen_2017_ICCV,\n \n author = {\n Shen,\n Xiaoyong and Gao,\n Hongyun and Tao,\n Xin and Zhou,\n Chao and Jia,\n Jiaya\n},\n title = {\n High-Quality Correspondence and Segmentation Estimation for Dual-Lens Smart-Phone Portraits\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "High-Resolution 
Shape Completion Using Deep Neural Networks for Global Structure and Local Geometry Inference", @@ -6902,6 +7341,7 @@ "status": "Spotlight", "track": "main", "pid": "1474", + "author_site": "Xiaoguang Han; Zhen Li; Haibin Huang; Evangelos Kalogerakis; Yizhou Yu", "author": "Xiaoguang Han; Zhen Li; Haibin Huang; Evangelos Kalogerakis; Yizhou Yu", "abstract": "We propose a data-driven method for recovering missing parts of 3D shapes. Our method is based on a new deep learning architecture consisting of two sub-networks: a global structure inference network and a local geometry refinement network. The global structure inference network incorporates a long short-term memorized context fusion module (LSTM-CF) that infers the global structure of the shape based on multi-view depth information provided as part of the input. It also includes a 3D fully convolutional (3DFCN) module that further enriches the global structure representation according to volumetric information in the input. Under the guidance of the global structure network, the local geometry refinement network takes as input local 3D patches around missing regions, and progressively produces a high-resolution, complete surface through a volumetric encoder-decoder architecture. Our method jointly trains the global structure inference and local geometry refinement networks in an end-to-end manner. 
We perform qualitative and quantitative evaluations on six object categories, demonstrating that our method outperforms existing state-of-the-art work on shape completion.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Han_High-Resolution_Shape_Completion_ICCV_2017_paper.pdf", @@ -6919,14 +7359,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Han_High-Resolution_Shape_Completion_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;1;0", - "aff_unique_norm": "University of Hong Kong;University of Massachusetts Amherst", + "aff_unique_norm": "The University of Hong Kong;University of Massachusetts Amherst", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.umass.edu", "aff_unique_abbr": "HKU;UMass Amherst", "aff_campus_unique_index": "0;0;1;1;0", "aff_campus_unique": "Hong Kong SAR;Amherst", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Han_2017_ICCV,\n \n author = {\n Han,\n Xiaoguang and Li,\n Zhen and Huang,\n Haibin and Kalogerakis,\n Evangelos and Yu,\n Yizhou\n},\n title = {\n High-Resolution Shape Completion Using Deep Neural Networks for Global Structure and Local Geometry Inference\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Higher-Order Integration of Hierarchical Convolutional Activations for Fine-Grained Visual Categorization", @@ -6934,6 +7375,7 @@ "status": "Poster", "track": "main", "pid": "240", + "author_site": "Sijia Cai; Wangmeng Zuo; Lei Zhang", "author": "Sijia Cai; Wangmeng Zuo; Lei Zhang", "abstract": "The success of fine-grained visual categorization (FGVC) extremely relies on the modeling of appearance and interactions of various semantic parts. 
This makes FGVC very challenging because: (i) part annotation and detection require expert guidance and are very expensive; (ii) parts are of different sizes; and (iii) the part interactions are complex and of higher-order. To address these issues, we propose an end-to-end framework based on higher-order integration of hierarchical convolutional activations for FGVC. By treating the convolutional activations as local descriptors, hierarchical convolutional activations can serve as a representation of local parts from different scales. A polynomial kernel based predictor is proposed to capture higher-order statistics of convolutional activations for modeling part interaction. To model inter-layer part interactions, we extend polynomial predictor to integrate hierarchical activations via kernel fusion. Our work also provides a new perspective for combining convolutional activations from multiple layers. While hypercolumns simply concatenate maps from different layers, and holistically-nested network uses weighted fusion to combine side-outputs, our approach exploits higher-order intra-layer and inter-layer relations for better integration of hierarchical convolutional features. The proposed framework yields more discriminative representation and achieves competitive results on the widely used FGVC datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cai_Higher-Order_Integration_of_ICCV_2017_paper.pdf", @@ -6951,14 +7393,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Cai_Higher-Order_Integration_of_ICCV_2017_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Hong Kong Polytechnic University;Harbin Institute of Technology", + "aff_unique_norm": "The Hong Kong Polytechnic University;Harbin Institute of Technology", "aff_unique_dep": "Dept. 
of Computing;School of Computer Science and Technology", "aff_unique_url": "https://www.polyu.edu.hk;http://www.hit.edu.cn/", "aff_unique_abbr": "PolyU;HIT", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Hong Kong;Harbin", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cai_2017_ICCV,\n \n author = {\n Cai,\n Sijia and Zuo,\n Wangmeng and Zhang,\n Lei\n},\n title = {\n Higher-Order Integration of Hierarchical Convolutional Activations for Fine-Grained Visual Categorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Higher-Order Minimum Cost Lifted Multicuts for Motion Segmentation", @@ -6966,6 +7409,7 @@ "status": "Poster", "track": "main", "pid": "1621", + "author_site": "Margret Keuper", "author": "Margret Keuper", "abstract": "Most state-of-the-art motion segmentation algorithms draw their potential from modeling motion differences of local entities such as point trajectories in terms of pairwise potentials in graphical models. Inference in instances of minimum cost multicut problems defined on such graphs allows to optimize the number of the resulting segments along with the segment assignment. However, pairwise potentials limit the discriminative power of the employed motion models to translational differences. More complex models such as Euclidean or affine transformations call for higher-order potentials and a tractable inference in the resulting higher-order graphical models. In this paper, we (1) introduce a generalization of the minimum cost lifted multicut problem to hypergraphs, and (2) propose a simple primal feasible heuristic that allows for a reasonably efficient inference in instances of higher-order lifted multicut problem instances defined on point trajectory hypergraphs for motion segmentation. 
The resulting motion segmentations improve over the state-of-the-art on the FBMS-59 dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Keuper_Higher-Order_Minimum_Cost_ICCV_2017_paper.pdf", @@ -6988,7 +7432,8 @@ "aff_unique_url": "https://www.uni-mannheim.de", "aff_unique_abbr": "", "aff_country_unique_index": "0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Keuper_2017_ICCV,\n \n author = {\n Keuper,\n Margret\n},\n title = {\n Higher-Order Minimum Cost Lifted Multicuts for Motion Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "How Far Are We From Solving the 2D & 3D Face Alignment Problem? (And a Dataset of 230,000 3D Facial Landmarks)", @@ -6996,6 +7441,7 @@ "status": "Poster", "track": "main", "pid": "560", + "author_site": "Adrian Bulat; Georgios Tzimiropoulos", "author": "Adrian Bulat; Georgios Tzimiropoulos", "abstract": "This paper investigates how far a very deep neural network is from attaining close to saturating performance on existing 2D and 3D face alignment datasets. To this end, we make the following 5 contributions: (a) we construct, for the first time, a very strong baseline by combining a state-of-the-art architecture for landmark localization with a state-of-the-art residual block, train it on a very large yet synthetically expanded 2D facial landmark dataset and finally evaluate it on all other 2D facial landmark datasets. (b) We create a guided by 2D landmarks network which converts 2D landmark annotations to 3D and unifies all existing datasets, leading to the creation of LS3D-W, the largest and most challenging 3D facial landmark dataset to date 230,000 images. (c) Following that, we train a neural network for 3D face alignment and evaluate it on the newly introduced LS3D-W. 
(d) We further look into the effect of all \"traditional\" factors affecting face alignment performance like large pose, initialization and resolution, and introduce a \"new\" one, namely the size of the network. (e) We show that both 2D and 3D face alignment networks achieve performance of remarkable accuracy which is probably close to saturating the datasets used. Training and testing code as well as the dataset can be downloaded from https://www.adrianbulat.com/face-alignment/", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bulat_How_Far_Are_ICCV_2017_paper.pdf", @@ -7013,14 +7459,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Bulat_How_Far_Are_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Nottingham", + "aff_unique_norm": "The University of Nottingham", "aff_unique_dep": "Computer Vision Laboratory", "aff_unique_url": "https://www.nottingham.ac.uk", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bulat_2017_ICCV,\n \n author = {\n Bulat,\n Adrian and Tzimiropoulos,\n Georgios\n},\n title = {\n How Far Are We From Solving the 2D \\& 3D Face Alignment Problem? 
(And a Dataset of 230,000 3D Facial Landmarks)\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Human Pose Estimation Using Global and Local Normalization", @@ -7028,6 +7475,7 @@ "status": "Poster", "track": "main", "pid": "3023", + "author_site": "Ke Sun; Cuiling Lan; Junliang Xing; Wenjun Zeng; Dong Liu; Jingdong Wang", "author": "Ke Sun; Cuiling Lan; Junliang Xing; Wenjun Zeng; Dong Liu; Jingdong Wang", "abstract": "In this paper, we address the problem of estimating the positions of human joints, i.e., articulated pose estimation. Recent state-of-the-art solutions model two key issues, joint detection and spatial configuration refinement, together using convolutional neural networks. Our work mainly focuses on spatial configuration refinement by reducing variations of human poses statistically, which is motivated by the observation that the scattered distribution of the relative locations of joints (e.g., the left wrist is distributed nearly uniformly in a circular area around the left shoulder) makes the learning of convolutional spatial models hard. We present a two-stage normalization scheme, human body normalization and limb normalization, to make the distribution of the relative joint locations compact, resulting in easier learning of convolutional spatial models and more accurate pose estimation. In addition, our empirical results show that incorporating multi-scale supervision and multi-scale fusion into the joint detection network is beneficial. 
Experiment results demonstrate that our method consistently outperforms state-of-the-art methods on the benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sun_Human_Pose_Estimation_ICCV_2017_paper.pdf", @@ -7042,7 +7490,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sun_Human_Pose_Estimation_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sun_Human_Pose_Estimation_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Sun_2017_ICCV,\n \n author = {\n Sun,\n Ke and Lan,\n Cuiling and Xing,\n Junliang and Zeng,\n Wenjun and Liu,\n Dong and Wang,\n Jingdong\n},\n title = {\n Human Pose Estimation Using Global and Local Normalization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "HydraPlus-Net: Attentive Deep Features for Pedestrian Analysis", @@ -7050,6 +7499,7 @@ "status": "Poster", "track": "main", "pid": "23", + "author_site": "Xihui Liu; Haiyu Zhao; Maoqing Tian; Lu Sheng; Jing Shao; Shuai Yi; Junjie Yan; Xiaogang Wang", "author": "Xihui Liu; Haiyu Zhao; Maoqing Tian; Lu Sheng; Jing Shao; Shuai Yi; Junjie Yan; Xiaogang Wang", "abstract": "Pedestrian analysis plays a vital role in intelligent video surveillance and is a key component for security-centric computer vision systems. Despite that the convolutional neural networks are remarkable in learning discriminative features from images, the learning of comprehensive features of pedestrians for fine-grained tasks remains an open problem. In this study, we propose a new attention-based deep neural network, named as HydraPlus-Net (HP-net), that multi-directionally feeds the multi-level attention maps to different feature layers. 
The attentive deep features learned from the proposed HP-net bring unique advantages: (1) the model is capable of capturing multiple attentions from low-level to semantic-level, and (2) it explores the multi-scale selectiveness of attentive features to enrich the final feature representations for a pedestrian image. We demonstrate the effectiveness and generality of the proposed HP-net for pedestrian analysis on two tasks, i.e. pedestrian attribute recognition and person re-identification. Intensive experimental results have been provided to prove that the HP-net outperforms the state-of-the-art methods on various datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_HydraPlus-Net_Attentive_Deep_ICCV_2017_paper.pdf", @@ -7067,14 +7517,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_HydraPlus-Net_Attentive_Deep_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;1;0;1;1;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime Group Limited", + "aff_unique_norm": "The Chinese University of Hong Kong;SenseTime Group Limited", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com", "aff_unique_abbr": "CUHK;SenseTime", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Xihui and Zhao,\n Haiyu and Tian,\n Maoqing and Sheng,\n Lu and Shao,\n Jing and Yi,\n Shuai and Yan,\n Junjie and Wang,\n Xiaogang\n},\n title = {\n HydraPlus-Net: Attentive Deep Features for Pedestrian Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Identity-Aware Textual-Visual Matching With Latent Co-Attention", @@ -7082,6 +7533,7 @@ "status": "Poster", "track": 
"main", "pid": "735", + "author_site": "Shuang Li; Tong Xiao; Hongsheng Li; Wei Yang; Xiaogang Wang", "author": "Shuang Li; Tong Xiao; Hongsheng Li; Wei Yang; Xiaogang Wang", "abstract": "Textual-visual matching aims at measuring similarities between sentence descriptions and images. Most existing methods tackle this problem without effectively utilizing identity-level annotations. In this paper, we propose an identity-aware two-stage framework for the textual-visual matching problem. Our stage-1 CNN-LSTM network learns to embed cross-modal features with a novel Cross-Modal Cross-Entropy (CMCE) loss. The stage-1 network is able to efficiently screen easy incorrect matchings and also provide initial training point for the stage-2 training. The stage-2 CNN-LSTM network refines the matching results with a latent co-attention mechanism. The spatial attention relates each word with corresponding image regions while the latent semantic attention aligns different sentence structures to make the matching results more robust to sentence structure variations. 
Extensive experiments on three datasets with identity-level annotations show that our framework outperforms state-of-the-art approaches by large margins.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Identity-Aware_Textual-Visual_Matching_ICCV_2017_paper.pdf", @@ -7099,14 +7551,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Identity-Aware_Textual-Visual_Matching_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Electronic Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Shuang and Xiao,\n Tong and Li,\n Hongsheng and Yang,\n Wei and Wang,\n Xiaogang\n},\n title = {\n Identity-Aware Textual-Visual Matching With Latent Co-Attention\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Illuminating Pedestrians via Simultaneous Detection & Segmentation", @@ -7114,6 +7567,7 @@ "status": "Poster", "track": "main", "pid": "2435", + "author_site": "Garrick Brazil; Xi Yin; Xiaoming Liu", "author": "Garrick Brazil; Xi Yin; Xiaoming Liu", "abstract": "Pedestrian detection is a critical problem in computer vision with significant impact on safety in urban autonomous driving. In this work, we explore how semantic segmentation can be used to boost pedestrian detection accuracy while having little to no impact on network efficiency. We propose a segmentation infusion network to enable joint supervision on semantic segmentation and pedestrian detection. 
When placed properly, the additional supervision helps guide features in shared layers to become more sophisticated and helpful for the downstream pedestrian detector. Using this approach, we find weakly annotated boxes to be sufficient for considerable performance gains. We provide an in-depth analysis to demonstrate how shared layers are shaped by the segmentation supervision. In doing so, we show that the resulting feature maps become more semantically meaningful and robust to shape and occlusion. Overall, our simultaneous detection and segmentation framework achieves a considerable gain over the state-of-the-art on the Caltech pedestrian dataset, competitive performance on KITTI, and executes 2x faster than competitive methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Brazil_Illuminating_Pedestrians_via_ICCV_2017_paper.pdf", @@ -7138,7 +7592,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Brazil_2017_ICCV,\n \n author = {\n Brazil,\n Garrick and Yin,\n Xi and Liu,\n Xiaoming\n},\n title = {\n Illuminating Pedestrians via Simultaneous Detection \\& Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Image Super-Resolution Using Dense Skip Connections", @@ -7146,6 +7601,7 @@ "status": "Poster", "track": "main", "pid": "2424", + "author_site": "Tong Tong; Gen Li; Xiejie Liu; Qinquan Gao", "author": "Tong Tong; Gen Li; Xiejie Liu; Qinquan Gao", "abstract": "Recent studies have shown that the performance of single-image super-resolution methods can be significantly boosted by using deep convolutional neural networks. In this study, we present a novel single-image super-resolution method by introducing dense skip connections in a very deep network. 
In the proposed network, the feature maps of each layer are propagated into all subsequent layers, providing an effective way to combine the low-level features and high-level features to boost the reconstruction performance. In addition, the dense skip connections in the network enable short paths to be built directly from the output to each layer, alleviating the vanishing-gradient problem of very deep networks. Moreover, deconvolution layers are integrated into the network to learn the upsampling filters and to speedup the reconstruction process. Further, the proposed method substantially reduces the number of parameters, enhancing the computational efficiency. We evaluate the proposed method using images from four benchmark datasets and set a new state of the art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tong_Image_Super-Resolution_Using_ICCV_2017_paper.pdf", @@ -7170,7 +7626,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tong_2017_ICCV,\n \n author = {\n Tong,\n Tong and Li,\n Gen and Liu,\n Xiejie and Gao,\n Qinquan\n},\n title = {\n Image Super-Resolution Using Dense Skip Connections\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Image-Based Localization Using LSTMs for Structured Feature Correlation", @@ -7178,11 +7635,11 @@ "status": "Poster", "track": "main", "pid": "229", - "author_site": "Florian Walch; Caner Hazirbas; Laura Leal-Taix\u00c3\u00a9; Torsten Sattler; Sebastian Hilsenbeck; Daniel Cremers", + "author_site": "Florian Walch; Caner Hazirbas; Laura Leal-Taixé; Torsten Sattler; Sebastian Hilsenbeck; Daniel Cremers", "author": "Florian Walch; Caner Hazirbas; Laura Leal-Taixe; Torsten Sattler; Sebastian Hilsenbeck; Daniel Cremers", "abstract": "In this work we propose 
a new CNN+LSTM architecture for camera pose regression for indoor and outdoor scenes. CNNs allow us to learn suitable feature representations for localization that are robust against motion blur and illumination changes. We make use of LSTM units on the CNN output, which play the role of a structured dimensionality reduction on the feature vector, leading to drastic improvements in localization performance. We provide extensive quantitative comparison of CNN-based and SIFT-based localization methods, showing the weaknesses and strengths of each. Furthermore, we present a new large-scale indoor dataset with accurate ground truth from a laser scanner. Experimental results on both indoor and outdoor public datasets show our method outperforms existing deep architectures, and can localize images in hard conditions, e.g., in the presence of mostly textureless surfaces, where classic SIFT-based methods fail.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Walch_Image-Based_Localization_Using_ICCV_2017_paper.pdf", - "aff": "Technical University of Munich; Technical University of Munich; Technical University of Munich; Department of Computer Science, ETH Z \u00a8urich; NavVis; Technical University of Munich", + "aff": "Technical University of Munich; Technical University of Munich; Technical University of Munich; Department of Computer Science, ETH Zürich; NavVis; Technical University of Munich", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Walch_Image-Based_Localization_Using_ICCV_2017_supplemental.pdf", @@ -7196,14 +7653,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Walch_Image-Based_Localization_Using_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;1;2;0", - "aff_unique_norm": "Technical University of Munich;ETH Zurich;NavVis", + "aff_unique_norm": "Technical University of Munich;ETH Zürich;NavVis", "aff_unique_dep": ";Department of Computer Science;", 
"aff_unique_url": "https://www.tum.de;https://www.ethz.ch;https://www.navvis.com", "aff_unique_abbr": "TUM;ETHZ;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "Germany;Switzerland" + "aff_country_unique": "Germany;Switzerland", + "bibtex": "@InProceedings{Walch_2017_ICCV,\n \n author = {\n Walch,\n Florian and Hazirbas,\n Caner and Leal-Taixe,\n Laura and Sattler,\n Torsten and Hilsenbeck,\n Sebastian and Cremers,\n Daniel\n},\n title = {\n Image-Based Localization Using LSTMs for Structured Feature Correlation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Image2song: Song Retrieval via Bridging Image Content and Lyric Words", @@ -7211,10 +7669,11 @@ "status": "Poster", "track": "main", "pid": "3101", + "author_site": "Xuelong Li; Di Hu; Xiaoqiang Lu", "author": "Xuelong Li; Di Hu; Xiaoqiang Lu", "abstract": "Image is usually taken for expressing some kinds of emotions or purposes, such as love, celebrating Christmas. There is another better way that combines the image and relevant song to amplify the expression, which has drawn much attention in the social network recently. Hence, the automatic selection of songs should be expected. In this paper, we propose to retrieve semantic relevant songs just by an image query, which is named as the image2song problem. Motivated by the requirements of establishing correlation in semantic/content, we build a semantic-based song retrieval framework, which learns the correlation between image content and lyric words. This model uses a convolutional neural network to generate rich tags from image regions, a recurrent neural network to model lyric, and then establishes correlation via a multi-layer perceptron. 
To reduce the content gap between image and lyric, we propose to make the lyric modeling focus on the main image content via a tag attention. We collect a dataset from the social-sharing multimodal data to study the proposed problem, which consists of (image, music clip, lyric) triplets. We demonstrate that our proposed model shows noticeable results in the image2song retrieval task and provides suitable songs. Besides, the song2image task is also performed.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Image2song_Song_Retrieval_ICCV_2017_paper.pdf", - "aff": "Xi\u2019an Institute of Optics and Precision Mechanics, Chinese Academy of Sciences, Xi\u2019an 710119, P. R. China+School of Computer Science and Center for OPTical IMagery Analysis and Learning (OPTIMAL), Northwestern Polytechnical University, Xi\u2019an 710072, P. R. China; School of Computer Science and Center for OPTical IMagery Analysis and Learning (OPTIMAL), Northwestern Polytechnical University, Xi\u2019an 710072, P. R. China; Xi\u2019an Institute of Optics and Precision Mechanics, Chinese Academy of Sciences, Xi\u2019an 710119, P. R. China+School of Computer Science and Center for OPTical IMagery Analysis and Learning (OPTIMAL), Northwestern Polytechnical University, Xi\u2019an 710072, P. R. China", + "aff": "Xi’an Institute of Optics and Precision Mechanics, Chinese Academy of Sciences, Xi’an 710119, P. R. China+School of Computer Science and Center for OPTical IMagery Analysis and Learning (OPTIMAL), Northwestern Polytechnical University, Xi’an 710072, P. R. China; School of Computer Science and Center for OPTical IMagery Analysis and Learning (OPTIMAL), Northwestern Polytechnical University, Xi’an 710072, P. R. China; Xi’an Institute of Optics and Precision Mechanics, Chinese Academy of Sciences, Xi’an 710119, P. R. China+School of Computer Science and Center for OPTical IMagery Analysis and Learning (OPTIMAL), Northwestern Polytechnical University, Xi’an 710072, P. R. 
China", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Li_Image2song_Song_Retrieval_ICCV_2017_supplemental.pdf", @@ -7228,14 +7687,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Image2song_Song_Retrieval_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;0+1", - "aff_unique_norm": "Xi\u2019an Institute of Optics and Precision Mechanics;Northwestern Polytechnical University", + "aff_unique_norm": "Xi’an Institute of Optics and Precision Mechanics;Northwestern Polytechnical University", "aff_unique_dep": "Chinese Academy of Sciences;School of Computer Science", "aff_unique_url": ";https://www.nwpu.edu.cn", "aff_unique_abbr": ";NWPU", "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Xuelong and Hu,\n Di and Lu,\n Xiaoqiang\n},\n title = {\n Image2song: Song Retrieval via Bridging Image Content and Lyric Words\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Improved Image Captioning via Policy Gradient Optimization of SPIDEr", @@ -7243,6 +7703,7 @@ "status": "Spotlight", "track": "main", "pid": "2633", + "author_site": "Siqi Liu; Zhenhai Zhu; Ning Ye; Sergio Guadarrama; Kevin Murphy", "author": "Siqi Liu; Zhenhai Zhu; Ning Ye; Sergio Guadarrama; Kevin Murphy", "abstract": "Current image captioning methods are usually trained via maximum likelihood estimation. However, the log-likelihood score of a caption does not correlate well with human assessments of quality. Standard syntactic evaluation metrics, such as BLEU, METEOR and ROUGE, are also not well correlated. The newer SPICE and CIDEr metrics are better correlated, but have traditionally been hard to optimize for. 
In this paper, we show how to use a policy gradient (PG) method to directly optimize a linear combination of SPICE and CIDEr (a combination we call SPIDEr): the SPICE score ensures our captions are semantically faithful to the image, while CIDEr score ensures our captions are syntactically fluent. The PG method we propose improves on the prior MIXER approach, by using Monte Carlo rollouts instead of mixing MLE training with PG. We show empirically that our algorithm leads to easier optimization and improved results compared to MIXER. Finally, we show that using our PG method we can optimize any of the metrics, including the proposed SPIDEr metric which results in image captions that are strongly preferred by human raters compared to captions generated by the same model but trained to optimize MLE or the COCO metrics.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Improved_Image_Captioning_ICCV_2017_paper.pdf", @@ -7261,13 +7722,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Improved_Image_Captioning_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;1;1;1", "aff_unique_norm": "University of Oxford;Google", - "aff_unique_dep": "Department of Computer Science;Google", + "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ox.ac.uk;https://www.google.com", "aff_unique_abbr": "Oxford;Google", "aff_campus_unique_index": "0+1;1;1;1;1", "aff_campus_unique": "Oxford;Mountain View", "aff_country_unique_index": "0+1;1;1;1;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Siqi and Zhu,\n Zhenhai and Ye,\n Ning and Guadarrama,\n Sergio and Murphy,\n Kevin\n},\n title = {\n Improved Image Captioning via Policy Gradient Optimization of SPIDEr\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 
2017\n} \n}" }, { "title": "Increasing CNN Robustness to Occlusions by Reducing Filter Support", @@ -7275,6 +7737,7 @@ "status": "Poster", "track": "main", "pid": "306", + "author_site": "Elad Osherov; Michael Lindenbaum", "author": "Elad Osherov; Michael Lindenbaum", "abstract": "Convolutional neural networks (CNNs) provide the current state of the art in visual object classification, but they are far less accurate when classifying partially occluded objects. A straightforward way to improve classification under occlusion conditions is to train the classifier using partially occluded object examples. However, training the network on many combinations of object instances and occlusions may be computationally expensive. This work proposes an alternative approach to increasing the robustness of CNNs to occlusion. We start by studying the effect of partial occlusions on the trained CNN and show, empirically, that training on partially occluded examples reduces the spatial support of the filters. Building upon this finding, we argue that smaller filter support is beneficial for occlusion robustness. We propose a training process that uses a special regularization term that acts to shrink the spatial support of the filters. We consider three possible regularization terms that are based on second central moments, group sparsity, and mutually reweighted L1, respectively. When trained on normal (unoccluded) examples, the resulting classifier is highly robust to occlusions. 
For large training sets and limited training time, the proposed classifier is even more accurate than standard classifiers trained on occluded object examples.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Osherov_Increasing_CNN_Robustness_ICCV_2017_paper.pdf", @@ -7299,7 +7762,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Osherov_2017_ICCV,\n \n author = {\n Osherov,\n Elad and Lindenbaum,\n Michael\n},\n title = {\n Increasing CNN Robustness to Occlusions by Reducing Filter Support\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Incremental Learning of Object Detectors Without Catastrophic Forgetting", @@ -7307,6 +7771,7 @@ "status": "Poster", "track": "main", "pid": "1369", + "author_site": "Konstantin Shmelkov; Cordelia Schmid; Karteek Alahari", "author": "Konstantin Shmelkov; Cordelia Schmid; Karteek Alahari", "abstract": "Despite their success for object detection, convolutional neural networks are ill-equipped for incremental learning, i.e., adapting the original model trained on a set of classes to additionally detect objects of new classes, in the absence of the initial training data. They suffer from \"catastrophic forgetting\" - an abrupt degradation of performance on the original set of classes, when the training objective is adapted to the new classes. We present a method to address this issue, and learn object detectors incrementally, when neither the original training data nor annotations for the original classes in the new training set are available. 
The core of our proposed solution is a loss function to balance the interplay between predictions on the new classes and a new distillation loss which minimizes the discrepancy between responses for old classes from the original and the updated networks. This incremental learning can be performed multiple times, for a new set of classes in each step, with a moderate drop in performance compared to the baseline network trained on the ensemble of data. We present object detection results on the PASCAL VOC 2007 and COCO datasets, along with a detailed empirical analysis of the approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shmelkov_Incremental_Learning_of_ICCV_2017_paper.pdf", @@ -7322,7 +7787,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shmelkov_Incremental_Learning_of_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shmelkov_Incremental_Learning_of_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Shmelkov_2017_ICCV,\n \n author = {\n Shmelkov,\n Konstantin and Schmid,\n Cordelia and Alahari,\n Karteek\n},\n title = {\n Incremental Learning of Object Detectors Without Catastrophic Forgetting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Infant Footprint Recognition", @@ -7330,6 +7796,7 @@ "status": "Poster", "track": "main", "pid": "896", + "author_site": "Eryun Liu", "author": "Eryun Liu", "abstract": "Infant recognition has received increasing attention in recent years in many applications, such as tracking child vaccination and identifying missing children. Due to the lack of efficient identification methods for infants and newborns, the current methods of infant recognition rely on identification of parents or certificates of identity. 
While biometric recognition technologies (e.g., face and fingerprint recognition) have been widely deployed in many applications for recognizing adults and teenagers, no such recognition systems yet exist for infants or newborns. One of the major problems is that the biometric traits of infants and newborns are either not permanent (e.g., face) or difficult to capture (e.g., fingerprint) due to lack of appropriate sensors. In this paper, we investigate the feasibility of infant recognition by their footprint using a 500 ppi commodity friction ridge sensor. We collected an infant footprint dataset in three sessions, consisting of 60 subjects, with age range from 1 to 9 months. We proposed a new minutia descriptor based on deep convolutional neural network for measuring minutiae similarity. The descriptor is compact and highly discriminative. We conducted verification experiments for both single enrolled template and fusion of multiple enrolled templates, and show the impact of age and time gap on matching performance. 
Comparison experiments with state of the art algorithm show the advantage of the proposed minutia descriptor.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Infant_Footprint_Recognition_ICCV_2017_paper.pdf", @@ -7344,7 +7811,8 @@ "aff_domain": "", "email": "", "author_num": 1, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Infant_Footprint_Recognition_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Infant_Footprint_Recognition_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Eryun\n},\n title = {\n Infant Footprint Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Inferring and Executing Programs for Visual Reasoning", @@ -7352,6 +7820,7 @@ "status": "Oral", "track": "main", "pid": "1138", + "author_site": "Justin Johnson; Bharath Hariharan; Laurens van der Maaten; Judy Hoffman; Li Fei-Fei; C. Lawrence Zitnick; Ross Girshick", "author": "Justin Johnson; Bharath Hariharan; Laurens van der Maaten; Judy Hoffman; Li Fei-Fei; C. Lawrence Zitnick; Ross Girshick", "abstract": "Existing methods for visual reasoning attempt to directly map inputs to outputs using black-box architectures without explicitly modeling the underlying reasoning processes. As a result, these black-box models often learn to exploit biases in the data rather than learning to perform visual reasoning. Inspired by module networks, this paper proposes a model for visual reasoning that consists of a program generator that constructs an explicit representation of the reasoning process to be performed, and an execution engine that executes the resulting program to produce an answer. Both the program generator and the execution engine are implemented by neural networks, and are trained using a combination of backpropagation and REINFORCE. 
Using the CLEVR benchmark for visual reasoning, we show that our model significantly outperforms strong baselines and generalizes better in a variety of settings.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Johnson_Inferring_and_Executing_ICCV_2017_paper.pdf", @@ -7367,7 +7836,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Johnson_Inferring_and_Executing_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Johnson_Inferring_and_Executing_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Johnson_2017_ICCV,\n \n author = {\n Johnson,\n Justin and Hariharan,\n Bharath and van der Maaten,\n Laurens and Hoffman,\n Judy and Fei-Fei,\n Li and Lawrence Zitnick,\n C. and Girshick,\n Ross\n},\n title = {\n Inferring and Executing Programs for Visual Reasoning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Infinite Latent Feature Selection: A Probabilistic Latent Graph-Based Ranking Approach", @@ -7375,6 +7845,7 @@ "status": "Poster", "track": "main", "pid": "626", + "author_site": "Giorgio Roffo; Simone Melzi; Umberto Castellani; Alessandro Vinciarelli", "author": "Giorgio Roffo; Simone Melzi; Umberto Castellani; Alessandro Vinciarelli", "abstract": "Feature selection is playing an increasingly significant role with respect to many computer vision applications spanning from object recognition to visual object tracking. However, most of the recent solutions in feature selection are not robust across different and heterogeneous set of data. In this paper, we address this issue proposing a robust probabilistic latent graph-based feature selection algorithm that performs the ranking step while considering all the possible subsets of features, as paths on a graph, bypassing the combinatorial problem analytically. 
An appealing characteristic of the approach is that it aims to discover an abstraction behind low-level sensory data, that is, relevancy. Relevancy is modelled as a latent variable in a PLSA-inspired generative process that allows the investigation of the importance of a feature when injected into an arbitrary set of cues. The proposed method has been tested on ten diverse benchmarks, and compared against eleven state of the art feature selection methods. Results show that the proposed approach attains the highest performance levels across many different scenarios and difficulties, thereby confirming its strong robustness while setting a new state of the art in feature selection domain.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Roffo_Infinite_Latent_Feature_ICCV_2017_paper.pdf", @@ -7389,7 +7860,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Roffo_Infinite_Latent_Feature_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Roffo_Infinite_Latent_Feature_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Roffo_2017_ICCV,\n \n author = {\n Roffo,\n Giorgio and Melzi,\n Simone and Castellani,\n Umberto and Vinciarelli,\n Alessandro\n},\n title = {\n Infinite Latent Feature Selection: A Probabilistic Latent Graph-Based Ranking Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Interleaved Group Convolutions", @@ -7397,6 +7869,7 @@ "status": "Poster", "track": "main", "pid": "1996", + "author_site": "Ting Zhang; Guo-Jun Qi; Bin Xiao; Jingdong Wang", "author": "Ting Zhang; Guo-Jun Qi; Bin Xiao; Jingdong Wang", "abstract": "In this paper, we present a simple and modularized neural network architecture, named interleaved group convolutional neural networks (IGCNets). 
The main point lies in a novel building block, a pair of two successive interleaved group convolutions: primary group convolution and secondary group convolution. The two group convolutions are complementary: (i) the convolution on each partition in primary group convolution is a spatial convolution, while on each partition in secondary group convolution, the convolution is a point-wise convolution; (ii) the channels in the same secondary partition come from different primary partitions. We discuss one representative advantage: Wider than a regular convolution with the number of parameters and the computation complexity preserved. We also show that regular convolutions, group convolution with summation fusion, and the Xception block are special cases of interleaved group convolutions. Empirical results over standard benchmarks, CIFAR-10, CIFAR-100, SVHN and ImageNet demonstrate that our networks are more efficient in using parameters and computation complexity with similar or higher accuracy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Interleaved_Group_Convolutions_ICCV_2017_paper.pdf", @@ -7414,14 +7887,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Interleaved_Group_Convolutions_ICCV_2017_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "Microsoft;University of Central Florida", + "aff_unique_norm": "Microsoft Corporation;University of Central Florida", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ucf.edu", "aff_unique_abbr": "MSR;UCF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Ting and Qi,\n Guo-Jun and Xiao,\n Bin and Wang,\n Jingdong\n},\n title = {\n Interleaved Group Convolutions\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Interpretable Explanations of Black Boxes by Meaningful Perturbation", @@ -7429,6 +7903,7 @@ "status": "Poster", "track": "main", "pid": "1393", + "author_site": "Ruth C. Fong; Andrea Vedaldi", "author": "Ruth C. Fong; Andrea Vedaldi", "abstract": "As machine learning algorithms are increasingly applied to high impact yet high risk tasks, such as medical diagnosis or autonomous driving, it is critical that researchers can explain how such algorithms arrived at their predictions. In recent years, a number of image saliency methods have been developed to summarize where highly complex neural networks \"look\" in an image for evidence for their predictions. However, these techniques are limited by their heuristic nature and architectural constraints. In this paper, we make two main contributions: First, we propose a general framework for learning different kinds of explanations for any black box algorithm. Second, we specialise the framework to find the part of an image most responsible for a classifier decision. Unlike previous works, our method is model-agnostic and testable because it is grounded in explicit and interpretable image perturbations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Fong_Interpretable_Explanations_of_ICCV_2017_paper.pdf", @@ -7453,7 +7928,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Fong_2017_ICCV,\n \n author = {\n Fong,\n Ruth C. 
and Vedaldi,\n Andrea\n},\n title = {\n Interpretable Explanations of Black Boxes by Meaningful Perturbation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Interpretable Learning for Self-Driving Cars by Visualizing Causal Attention", @@ -7461,6 +7937,7 @@ "status": "Poster", "track": "main", "pid": "1161", + "author_site": "Jinkyu Kim; John Canny", "author": "Jinkyu Kim; John Canny", "abstract": "Deep neural perception and control networks are likely to be a key component of self-driving vehicles. These models need to be explainable - they should provide easy-to-interpret rationales for their behavior - so that passengers, insurance companies, law enforcement, developers etc., can understand what triggered a particular behavior. Here we explore the use of visual explanations. These explanations take the form of real-time highlighted regions of an image that causally influence the network's output (steering control). Our approach is two-stage. In the first stage, we use a visual attention model to train a convolution network end-to-end from images to steering angle. The attention model highlights image regions that potentially influence the network's output. Some of these are true influences, but some are spurious. We then apply a causal filtering step to determine which input regions actually influence the output. This produces more succinct visual explanations and more accurately exposes the network's behavior. We demonstrate the effectiveness of our model on three datasets totaling 16 hours of driving. We first show that training with attention does not degrade the performance of the end-to-end network. 
Then we show that the network causally cues on a variety of features that are used by humans while driving.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kim_Interpretable_Learning_for_ICCV_2017_paper.pdf", @@ -7485,7 +7962,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kim_2017_ICCV,\n \n author = {\n Kim,\n Jinkyu and Canny,\n John\n},\n title = {\n Interpretable Learning for Self-Driving Cars by Visualizing Causal Attention\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Interpretable Transformations With Encoder-Decoder Networks", @@ -7493,6 +7971,7 @@ "status": "Poster", "track": "main", "pid": "2632", + "author_site": "Daniel E. Worrall; Stephan J. Garbin; Daniyar Turmukhambetov; Gabriel J. Brostow", "author": "Daniel E. Worrall; Stephan J. Garbin; Daniyar Turmukhambetov; Gabriel J. Brostow", "abstract": "Deep feature spaces have the capacity to encode complex transformations of their input data. However, understanding the relative feature-space relationship between two transformed encoded images is difficult. For instance, what is the relative feature space relationship between two rotated images? What is decoded when we interpolate in feature space? Ideally, we want to disentangle confounding factors, such as pose, appearance, and illumination, from object identity. Disentangling these is difficult because they interact in very nonlinear ways. We propose a simple method to construct a deep feature space, with explicitly disentangled representations of several known transformations. A person or algorithm can then manipulate the disentangled representation, for example, to re-render an image with explicit control over parameterized degrees of freedom. 
The feature space is constructed using a transforming encoder-decoder network with a custom feature transform layer, acting on the hidden representations. We demonstrate the advantages of explicit disentangling on a variety of datasets and transformations, and as an aid for traditional tasks, such as classification.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Worrall_Interpretable_Transformations_With_ICCV_2017_paper.pdf", @@ -7508,7 +7987,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Worrall_Interpretable_Transformations_With_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Worrall_Interpretable_Transformations_With_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Worrall_2017_ICCV,\n \n author = {\n Worrall,\n Daniel E. and Garbin,\n Stephan J. and Turmukhambetov,\n Daniyar and Brostow,\n Gabriel J.\n},\n title = {\n Interpretable Transformations With Encoder-Decoder Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Intrinsic 3D Dynamic Surface Tracking Based on Dynamic Ricci Flow and Teichmuller Map", @@ -7516,6 +7996,7 @@ "status": "Poster", "track": "main", "pid": "2559", + "author_site": "Xiaokang Yu; Na Lei; Yalin Wang; Xianfeng Gu", "author": "Xiaokang Yu; Na Lei; Yalin Wang; Xianfeng Gu", "abstract": "3D dynamic surface tracking is an important research problem and plays a vital role in many computer vision and medical imaging applications. However, it is still challenging to efficiently register surface sequences which has large deformations and strong noise. In this paper, we propose a novel automatic method for non-rigid 3D dynamic surface tracking with surface Ricci flow and Teichmuller map methods. 
According to quasi-conformal Teichmuller theory, the Techmuller map minimizes the maximal dilation so that our method is able to automatically register surfaces with large deformations. Besides, the adoption of Delaunay triangulation and quadrilateral meshes makes our method applicable to low quality meshes. In our work, the 3D dynamic surfaces are acquired by a high speed 3D scanner. We first identified sparse surface features using machine learning methods in the texture space. Then we assign landmark features with different curvature settings and the Riemannian metric of the surface is computed by the dynamic Ricci flow method, such that all the curvatures are concentrated on the feature points and the surface is flat everywhere else. The registration among frames is computed by the Teichmuller mappings, which aligns the feature points with least angle distortions. We apply our new method to multiple sequences of 3D facial surfaces with large expression deformations and compare them with two other state-of-the-art tracking methods. 
The effectiveness of our method is demonstrated by the clearly improved accuracy and efficiency.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_Intrinsic_3D_Dynamic_ICCV_2017_paper.pdf", @@ -7540,7 +8021,8 @@ "aff_campus_unique_index": "0;1;2;3", "aff_campus_unique": "Qingdao;Dalian;Arizona;Stony Brook", "aff_country_unique_index": "0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Xiaokang and Lei,\n Na and Wang,\n Yalin and Gu,\n Xianfeng\n},\n title = {\n Intrinsic 3D Dynamic Surface Tracking Based on Dynamic Ricci Flow and Teichmuller Map\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Intrinsic3D: High-Quality 3D Reconstruction by Joint Appearance and Geometry Optimization With Spatially-Varying Lighting", @@ -7548,7 +8030,7 @@ "status": "Poster", "track": "main", "pid": "1395", - "author_site": "Robert Maier; Kihwan Kim; Daniel Cremers; Jan Kautz; Matthias Nie\u00c3\u009fner", + "author_site": "Robert Maier; Kihwan Kim; Daniel Cremers; Jan Kautz; Matthias Nießner", "author": "Robert Maier; Kihwan Kim; Daniel Cremers; Jan Kautz; Matthias Niessner", "abstract": "We introduce a novel method to obtain high-quality 3D reconstructions from consumer RGB-D sensors. Our core idea is to simultaneously optimize for geometry encoded in a signed distance field (SDF), textures from automatically-selected keyframes, and their camera poses along with material and scene lighting. To this end, we propose a joint surface reconstruction approach that is based on Shape-from-Shading (SfS) techniques and utilizes the estimation of spatially-varying spherical harmonics (SVSH) from subvolumes of the reconstructed scene. 
Through extensive examples and evaluations, we demonstrate that our method dramatically increases the level of detail in the reconstructed scene geometry and contributes highly to consistent surface texture recovery.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Maier_Intrinsic3D_High-Quality_3D_ICCV_2017_paper.pdf", @@ -7566,14 +8048,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Maier_Intrinsic3D_High-Quality_3D_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;1;0;1+2", - "aff_unique_norm": "NVIDIA;Technical University of Munich;Stanford University", - "aff_unique_dep": "NVIDIA Corporation;;", + "aff_unique_norm": "NVIDIA Corporation;Technical University of Munich;Stanford University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.nvidia.com;https://www.tum.de;https://www.stanford.edu", "aff_unique_abbr": "NVIDIA;TUM;Stanford", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0+1;0;1;0;1+0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Maier_2017_ICCV,\n \n author = {\n Maier,\n Robert and Kim,\n Kihwan and Cremers,\n Daniel and Kautz,\n Jan and Niessner,\n Matthias\n},\n title = {\n Intrinsic3D: High-Quality 3D Reconstruction by Joint Appearance and Geometry Optimization With Spatially-Varying Lighting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Introspective Neural Networks for Generative Modeling", @@ -7581,6 +8064,7 @@ "status": "Poster", "track": "main", "pid": "1119", + "author_site": "Justin Lazarow; Long Jin; Zhuowen Tu", "author": "Justin Lazarow; Long Jin; Zhuowen Tu", "abstract": "We study unsupervised learning by developing a generative model built from progressively learned deep convolutional neural networks. 
The resulting generator is additionally a discriminator, capable of \"introspection\" in a sense --- being able to self-evaluate the difference between its generated samples and the given training data. Through repeated discriminative learning, desirable properties of modern discriminative classifiers are directly inherited by the generator. Specifically, our model learns a sequence of CNN classifiers using a synthesis-by-classification algorithm. In the experiments, we observe encouraging results on a number of applications including texture modeling, artistic style transferring, face modeling, and unsupervised feature learning.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lazarow_Introspective_Neural_Networks_ICCV_2017_paper.pdf", @@ -7605,7 +8089,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lazarow_2017_ICCV,\n \n author = {\n Lazarow,\n Justin and Jin,\n Long and Tu,\n Zhuowen\n},\n title = {\n Introspective Neural Networks for Generative Modeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Is Second-Order Information Helpful for Large-Scale Visual Recognition?", @@ -7613,6 +8098,7 @@ "status": "Poster", "track": "main", "pid": "733", + "author_site": "Peihua Li; Jiangtao Xie; Qilong Wang; Wangmeng Zuo", "author": "Peihua Li; Jiangtao Xie; Qilong Wang; Wangmeng Zuo", "abstract": "By stacking layers of convolution and nonlinearity, convolutional networks (ConvNets) effectively learn from low-level to high-level features and discriminative representations. 
Since the end goal of large-scale recognition is to delineate complex boundaries of thousands of classes, adequate exploration of feature distributions is important for realizing full potentials of ConvNets. However, state-of-the-art works concentrate only on deeper or wider architecture design, while rarely exploring feature statistics higher than first-order. We take a step towards addressing this problem. Our method consists in covariance pooling, instead of the most commonly used first-order pooling, of high-level convolutional features. The main challenges involved are robust covariance estimation given a small sample of large-dimensional features and usage of the manifold structure of covariance matrices. To address these challenges, we present a Matrix Power Normalized Covariance (MPN-COV) method. We develop forward and backward propagation formulas regarding the nonlinear matrix functions such that MPN-COV can be trained end-to-end. In addition, we analyze both qualitatively and quantitatively its advantage over the well-known Log-Euclidean metric. On the ImageNet 2012 validation set, by combining MPN-COV we achieve over 4%, 3% and 2.5% gains for AlexNet, VGG-M and VGG-16, respectively; integration of MPN-COV into 50-layer ResNet outperforms ResNet-101 and is comparable to ResNet-152. 
The source code will be available on the project page: http://www.peihuali.org/MPN-COV.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Is_Second-Order_Information_ICCV_2017_paper.pdf", @@ -7627,7 +8113,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Is_Second-Order_Information_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Is_Second-Order_Information_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Peihua and Xie,\n Jiangtao and Wang,\n Qilong and Zuo,\n Wangmeng\n},\n title = {\n Is Second-Order Information Helpful for Large-Scale Visual Recognition?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Adaptive Sparsity and Low-Rankness on the Fly: An Online Tensor Reconstruction Scheme for Video Denoising", @@ -7635,6 +8122,7 @@ "status": "Poster", "track": "main", "pid": "252", + "author_site": "Bihan Wen; Yanjun Li; Luke Pfister; Yoram Bresler", "author": "Bihan Wen; Yanjun Li; Luke Pfister; Yoram Bresler", "abstract": "Recent works on adaptive sparse and low-rank signal modeling have demonstrated their usefulness, especially in image/video processing applications. While a patch-based sparse model imposes local structure, low-rankness of the grouped patches exploits non-local correlation. Applying either approach alone usually limits performance in various low-level vision tasks. In this work, we propose a novel video denoising method, based on an online tensor reconstruction scheme with a joint adaptive sparse and low-rank model, dubbed SALT. An efficient and unsupervised online unitary sparsifying transform learning method is introduced to impose adaptive sparsity on the fly. 
We develop an efficient 3D spatio-temporal data reconstruction framework based on the proposed online learning method, which exhibits low latency and can potentially handle streaming videos. To the best of our knowledge, this is the first work that combines adaptive sparsity and low-rankness for video denoising, and the first work of solving the proposed problem in an online fashion. We demonstrate video denoising results over commonly used videos from public datasets. Numerical experiments show that the proposed video denoising method outperforms competing methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wen_Joint_Adaptive_Sparsity_ICCV_2017_paper.pdf", @@ -7649,7 +8137,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wen_Joint_Adaptive_Sparsity_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wen_Joint_Adaptive_Sparsity_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wen_2017_ICCV,\n \n author = {\n Wen,\n Bihan and Li,\n Yanjun and Pfister,\n Luke and Bresler,\n Yoram\n},\n title = {\n Joint Adaptive Sparsity and Low-Rankness on the Fly: An Online Tensor Reconstruction Scheme for Video Denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Bi-Layer Optimization for Single-Image Rain Streak Removal", @@ -7657,6 +8146,7 @@ "status": "Poster", "track": "main", "pid": "1199", + "author_site": "Lei Zhu; Chi-Wing Fu; Dani Lischinski; Pheng-Ann Heng", "author": "Lei Zhu; Chi-Wing Fu; Dani Lischinski; Pheng-Ann Heng", "abstract": "We present a novel method for removing rain streaks from a single input image by decomposing it into a rain-free background layer B and a rain-streak layer R. 
A joint optimization process is used that alternates between removing rain-streak details from B and removing non-streak details from R. The process is assisted by three novel image priors. Observing that rain streaks typically span a narrow range of directions, we first analyze the local gradient statistics in the rain image to identify image regions that are dominated by rain streaks. From these regions, we estimate the dominant rain streak direction and extract a collection of rain-dominated patches. Next, we define two priors on the background layer B, one based on a centralized sparse representation and another based on the estimated rain direction. A third prior is defined on the rain-streak layer R, based on similarity of patches to the extracted rain patches. Both visual and quantitative comparisons demonstrate that our method outperforms the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Joint_Bi-Layer_Optimization_ICCV_2017_paper.pdf", @@ -7674,14 +8164,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Joint_Bi-Layer_Optimization_ICCV_2017_paper.html", "aff_unique_index": "0+1;0+1;2;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Chinese Academy of Sciences;Hebrew University of Jerusalem", + "aff_unique_norm": "The Chinese University of Hong Kong;Chinese Academy of Sciences;The Hebrew University of Jerusalem", "aff_unique_dep": ";Shenzhen Key Laboratory of Virtual Reality and Human Interaction Technology;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.cas.cn;https://www.huji.ac.il", "aff_unique_abbr": "CUHK;CAS;HUJI", "aff_campus_unique_index": "0+1;0+1;0+1", "aff_campus_unique": "Hong Kong SAR;Shenzhen;", "aff_country_unique_index": "0+0;0+0;1;0+0", - "aff_country_unique": "China;Israel" + "aff_country_unique": "China;Israel", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Lei and Fu,\n Chi-Wing and Lischinski,\n Dani and Heng,\n 
Pheng-Ann\n},\n title = {\n Joint Bi-Layer Optimization for Single-Image Rain Streak Removal\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Convolutional Analysis and Synthesis Sparse Representation for Single Image Layer Separation", @@ -7689,6 +8180,7 @@ "status": "Poster", "track": "main", "pid": "829", + "author_site": "Shuhang Gu; Deyu Meng; Wangmeng Zuo; Lei Zhang", "author": "Shuhang Gu; Deyu Meng; Wangmeng Zuo; Lei Zhang", "abstract": "Analysis sparse representation (ASR) and synthesis sparse representation (SSR) are two representative approaches for sparsity-based image modeling. An image is described mainly by the non-zero coefficients in SSR, while it is characterized by the indices of zeros in ASR. To exploit the complementary representation mechanisms of ASR and SSR, we integrate the two models and propose a joint convolutional analysis and synthesis (JCAS) sparse representation model. The convolutional implementation is adopted to more effectively exploit the image global information. In JCAS, a single image is decomposed into two layers, one is approximated by ASR to represent image large-scale structures, and the other by SSR to represent image fine-scale textures. The synthesis dictionary is adaptively learned in JCAS to describe the texture patterns for different single image layer separation tasks. We evaluate the proposed JCAS model on a variety of applications, including rain streak removal, high dynamic range image tone mapping, etc. 
The results show that our JCAS method outperforms state-ofthe-arts in those applications in terms of both quantitative measure and visual perception quality.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gu_Joint_Convolutional_Analysis_ICCV_2017_paper.pdf", @@ -7706,14 +8198,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gu_Joint_Convolutional_Analysis_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Hong Kong Polytechnic University;Xi'an Jiao Tong University;Harbin Institute of Technology", + "aff_unique_norm": "The Hong Kong Polytechnic University;Xi'an Jiaotong University;Harbin Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polyu.edu.hk;https://www.xjtu.edu.cn;http://www.hit.edu.cn/", "aff_unique_abbr": "PolyU;XJTU;HIT", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Hong Kong SAR;;Harbin", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gu_2017_ICCV,\n \n author = {\n Gu,\n Shuhang and Meng,\n Deyu and Zuo,\n Wangmeng and Zhang,\n Lei\n},\n title = {\n Joint Convolutional Analysis and Synthesis Sparse Representation for Single Image Layer Separation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Detection and Recounting of Abnormal Events by Learning Deep Generic Knowledge", @@ -7721,6 +8214,7 @@ "status": "Poster", "track": "main", "pid": "1303", + "author_site": "Ryota Hinami; Tao Mei; Shin'ichi Satoh", "author": "Ryota Hinami; Tao Mei; Shin'ichi Satoh", "abstract": "This paper addresses the problem of joint detection and recounting of abnormal events in videos. 
Recounting of abnormal events, i.e., explaining why they are judged to be abnormal, is an unexplored but critical task in video surveillance, because it helps human observers quickly judge if they are false alarms or not. To describe the events in the human-understandable form for event recounting, learning generic knowledge about visual concepts (e.g., object and action) is crucial. Although convolutional neural networks (CNNs) have achieved promising results in learning such concepts, it remains an open question as to how to effectively use CNNs for abnormal event detection, mainly due to the environment-dependent nature of the anomaly detection. In this paper, we tackle this problem by integrating a generic CNN model and environment-dependent anomaly detectors. Our approach first learns CNN with multiple visual tasks to exploit semantic information that is useful for detecting and recounting abnormal events. By appropriately plugging the model into anomaly detectors, we can detect and recount abnormal events while taking advantage of the discriminative power of CNNs. 
Our approach outperforms the state-of-the-art on Avenue and UCSD Ped2 benchmarks for abnormal event detection and also produces promising results of abnormal event recounting.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hinami_Joint_Detection_and_ICCV_2017_paper.pdf", @@ -7738,14 +8232,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hinami_Joint_Detection_and_ICCV_2017_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "University of Tokyo;National Institute of Informatics;Microsoft", + "aff_unique_norm": "University of Tokyo;National Institute of Informatics;Microsoft Research", "aff_unique_dep": ";;Research", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.nii.ac.jp/;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "UTokyo;NII;MSR Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Japan;China" + "aff_country_unique": "Japan;China", + "bibtex": "@InProceedings{Hinami_2017_ICCV,\n \n author = {\n Hinami,\n Ryota and Mei,\n Tao and Satoh,\n Shin'ichi\n},\n title = {\n Joint Detection and Recounting of Abnormal Events by Learning Deep Generic Knowledge\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Discovery of Object States and Manipulation Actions", @@ -7753,6 +8248,7 @@ "status": "Poster", "track": "main", "pid": "703", + "author_site": "Jean-Baptiste Alayrac; Ivan Laptev; Josef Sivic; Simon Lacoste-Julien", "author": "Jean-Baptiste Alayrac; Ivan Laptev; Josef Sivic; Simon Lacoste-Julien", "abstract": "Many human activities involve object manipulations aiming to modify the object state. Examples of common state changes include full/empty bottle, open/closed door, and attached/detached car wheel. 
In this work, we seek to automatically discover the states of objects and the associated manipulation actions. Given a set of videos for a particular task, we propose a joint model that learns to identify object states and to localize state-modifying actions. Our model is formulated as a discriminative clustering cost with constraints. We assume a consistent temporal order for the changes in object states and manipulation actions, and introduce new optimization techniques to learn model parameters without additional supervision. We demonstrate successful discovery of seven manipulation actions and corresponding object states on a new dataset of videos depicting real-life object manipulations. We show that our joint formulation results in an improvement of object state discovery by action recognition and vice versa.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Alayrac_Joint_Discovery_of_ICCV_2017_paper.pdf", @@ -7768,7 +8264,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Alayrac_Joint_Discovery_of_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Alayrac_Joint_Discovery_of_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Alayrac_2017_ICCV,\n \n author = {\n Alayrac,\n Jean-Baptiste and Laptev,\n Ivan and Sivic,\n Josef and Lacoste-Julien,\n Simon\n},\n title = {\n Joint Discovery of Object States and Manipulation Actions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Estimation of Camera Pose, Depth, Deblurring, and Super-Resolution From a Blurred Image Sequence", @@ -7776,6 +8273,7 @@ "status": "Poster", "track": "main", "pid": "2144", + "author_site": "Haesol Park; Kyoung Mu Lee", "author": "Haesol Park; Kyoung Mu Lee", "abstract": "The conventional methods for estimating camera poses and scene structures from severely 
blurry or low resolution images often result in failure. The off-the-shelf deblurring or super resolution methods may show visually pleasing results. However, applying each technique independently before matching is generally unprofitable because this naive series of procedures ignores the consistency between images. In this paper, we propose a pioneering unified framework that solves four problems simultaneously, namely, dense depth reconstruction, camera pose estimation, super resolution, and deblurring. By reflecting a physical imaging process, we formulate a cost minimization problem and solve it using an alternating optimization technique. The experimental results on both synthetic and real videos show high-quality depth maps derived from severely degraded images that contrast the failures of naive multi-view stereo methods. Our proposed method also produces outstanding deblurred and super-resolved images unlike the independent application or combination of conventional video deblurring, super resolution methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Park_Joint_Estimation_of_ICCV_2017_paper.pdf", @@ -7800,7 +8298,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2017_ICCV,\n \n author = {\n Park,\n Haesol and Mu Lee,\n Kyoung\n},\n title = {\n Joint Estimation of Camera Pose,\n Depth,\n Deblurring,\n and Super-Resolution From a Blurred Image Sequence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Layout Estimation and Global Multi-View Registration for Indoor Reconstruction", @@ -7808,6 +8307,7 @@ "status": "Poster", "track": "main", "pid": "305", + "author_site": "Jeong-Kyun Lee; Jaewon Yea; Min-Gyu Park; Kuk-Jin Yoon", "author": "Jeong-Kyun Lee; 
Jaewon Yea; Min-Gyu Park; Kuk-Jin Yoon", "abstract": "In this paper, we propose an approach to jointly solve scene layout estimation and global registration problems for accurate indoor 3D reconstruction. Given a sequence of range data, we build a set of scene fragments using KinectFusion and register them through pose graph optimization. Afterwards, we alternate layout estimation and layout-based global registration processes in iterative fashion to complement each other. We extract the scene layout through hierarchical agglomerative clustering and energy-based multi-model fitting in consideration of noisy measurements. Having the estimated scene layout in one hand, we register all the range data through the global iterative closest point algorithm where the positions of 3D points that belong to the layout such as walls and a ceiling are constrained to be close to the layout. We experimentally verify the proposed method with the publicly available synthetic and real-world datasets in both quantitative and qualitative ways.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_Joint_Layout_Estimation_ICCV_2017_paper.pdf", @@ -7825,14 +8325,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Joint_Layout_Estimation_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Gwangju Institute of Science and Technology;LG;Korea Electronics Technology Institute", - "aff_unique_dep": "School of Electrical Engineering and Computer Science;LG Electronics;", + "aff_unique_norm": "Gwangju Institute of Science and Technology;LG Electronics;Korea Electronics Technology Institute", + "aff_unique_dep": "School of Electrical Engineering and Computer Science;;", "aff_unique_url": "https://www.gist.ac.kr;https://www.lg.com;https://www.keti.re.kr", "aff_unique_abbr": "GIST;LG;KETI", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Gwangju;Incheon;Seongnam", "aff_country_unique_index": "0;0;0;0", - 
"aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Jeong-Kyun and Yea,\n Jaewon and Park,\n Min-Gyu and Yoon,\n Kuk-Jin\n},\n title = {\n Joint Layout Estimation and Global Multi-View Registration for Indoor Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Learning of Object and Action Detectors", @@ -7840,6 +8341,7 @@ "status": "Poster", "track": "main", "pid": "1842", + "author_site": "Vicky Kalogeiton; Philippe Weinzaepfel; Vittorio Ferrari; Cordelia Schmid", "author": "Vicky Kalogeiton; Philippe Weinzaepfel; Vittorio Ferrari; Cordelia Schmid", "abstract": "While most existing approaches for detection in videos focus on objects or human actions separately, we aim at jointly detecting objects performing actions, such as cat eating or dog jumping. We introduce an end-to-end multitask objective that jointly learns object-action relationships. We compare it with different training objectives, validate its effectiveness for detecting objects-actions in videos, and show that both tasks of object and action detection benefit from this joint learning. Moreover, the proposed architecture can be used for zero-shot learning of actions: our multitask objective leverages the commonalities of an action performed by different objects, eg. dog and cat jumping, enabling to detect actions of an object without training with these object-actions pairs. In experiments on the A2D dataset, we obtain state-of-the-art results on segmentation of object-action pairs. 
We finally apply our multitask architecture to detect visual relationships between objects in images of the VRD dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kalogeiton_Joint_Learning_of_ICCV_2017_paper.pdf", @@ -7855,7 +8357,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kalogeiton_Joint_Learning_of_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kalogeiton_Joint_Learning_of_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kalogeiton_2017_ICCV,\n \n author = {\n Kalogeiton,\n Vicky and Weinzaepfel,\n Philippe and Ferrari,\n Vittorio and Schmid,\n Cordelia\n},\n title = {\n Joint Learning of Object and Action Detectors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Joint Prediction of Activity Labels and Starting Times in Untrimmed Videos", @@ -7863,6 +8366,7 @@ "status": "Poster", "track": "main", "pid": "2550", + "author_site": "Tahmida Mahmud; Mahmudul Hasan; Amit K. Roy-Chowdhury", "author": "Tahmida Mahmud; Mahmudul Hasan; Amit K. Roy-Chowdhury", "abstract": "Most of the existing works on human activity analysis focus on recognition or early recognition of the activity labels from complete or partial observations. Predicting the labels of future unobserved activities where no frames of the predicted activities have been observed is a challenging problem, with important applications, which has not been explored much. Associated with the future label prediction problem is the problem of predicting the starting time of the next activity. In this work, we propose a system that is able to infer about the labels and the starting times of future activities. Activities are characterized by the previous activity sequence (which is observed), as well as the objects present in the scene during their occurrence. 
We propose a network similar to a hybrid Siamese network with three branches to jointly learn both the future label and the starting time. The first branch takes visual features from the objects present in the scene using a fully connected network, the second branch takes previous activity features using a LSTM network to model long-term sequential relationships and the third branch captures the last observed activity features to model the context of inter-activity time using another fully connected network. These concatenated features are used for both label and time prediction. Experiments on two challenging datasets demonstrate that our framework for joint prediction of activity label and starting time improves the performance of both, and outperforms the state-of-the-arts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mahmud_Joint_Prediction_of_ICCV_2017_paper.pdf", @@ -7887,7 +8391,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Riverside;Washington", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mahmud_2017_ICCV,\n \n author = {\n Mahmud,\n Tahmida and Hasan,\n Mahmudul and Roy-Chowdhury,\n Amit K.\n},\n title = {\n Joint Prediction of Activity Labels and Starting Times in Untrimmed Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Jointly Attentive Spatial-Temporal Pooling Networks for Video-Based Person Re-Identification", @@ -7895,6 +8400,7 @@ "status": "Poster", "track": "main", "pid": "2485", + "author_site": "Shuangjie Xu; Yu Cheng; Kang Gu; Yang Yang; Shiyu Chang; Pan Zhou", "author": "Shuangjie Xu; Yu Cheng; Kang Gu; Yang Yang; Shiyu Chang; Pan Zhou", "abstract": "Person Re-Identification (person re-id) is a crucial task as its applications in visual surveillance and human-computer interaction. 
In this work, we present a novel joint Spatial and Temporal Attention Pooling Network (ASTPN) for video-based person re-identification, which enables the feature extractor to be aware of the current input video sequences, in a way that interdependency from the matching items can directly influence the computation of each other's representation. Specifically, the spatial pooling layer is able to select regions from each frame, while the attention temporal pooling performed can select informative frames over the sequence, both pooling guided by the information from distance matching. Experiments are conduced on the iLIDS-VID, PRID-2011 and MARS datasets and the results demonstrate that this approach outperforms existing state-of-art methods. We also analyze how the joint pooling in both dimensions can boost the person re-id performance more effectively than using either of them separately.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xu_Jointly_Attentive_Spatial-Temporal_ICCV_2017_paper.pdf", @@ -7909,7 +8415,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xu_Jointly_Attentive_Spatial-Temporal_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xu_Jointly_Attentive_Spatial-Temporal_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Xu_2017_ICCV,\n \n author = {\n Xu,\n Shuangjie and Cheng,\n Yu and Gu,\n Kang and Yang,\n Yang and Chang,\n Shiyu and Zhou,\n Pan\n},\n title = {\n Jointly Attentive Spatial-Temporal Pooling Networks for Video-Based Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Jointly Recognizing Object Fluents and Tasks in Egocentric Videos", @@ -7917,6 +8424,7 @@ "status": "Poster", "track": "main", "pid": "1223", + "author_site": "Yang Liu; Ping Wei; Song-Chun Zhu", "author": "Yang Liu; 
Ping Wei; Song-Chun Zhu", "abstract": "This paper addresses the problem of jointly recognizing object fluents and tasks in egocentric videos. Fluents are the changeable attributes of objects. Tasks are goal-oriented human activities which interact with objects and aim to change some attributes of the objects. The process of executing a task is a process to change the object fluents over time. We propose a hierarchical model to represent tasks as concurrent and sequential object fluents. In a task, different fluents closely interact with each other both in spatial and temporal domains. Given an egocentric video, a beam search algorithm is applied to jointly recognizing the object fluents in each frame, and the task of the entire video. We collected a large scale egocentric video dataset of tasks and fluents. This dataset contains 14 categories of tasks, 25 object classes, 21 categories of object fluents, 809 video sequences, and approximately 333,000 video frames. The experimental results on this dataset prove the strength of our method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Jointly_Recognizing_Object_ICCV_2017_paper.pdf", @@ -7934,14 +8442,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Jointly_Recognizing_Object_ICCV_2017_paper.html", "aff_unique_index": "0;1+0;0", - "aff_unique_norm": "University of California, Los Angeles;Xi'an Jiao Tong University", + "aff_unique_norm": "University of California, Los Angeles;Xi'an Jiaotong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;http://en.xjtu.edu.cn/", "aff_unique_abbr": "UCLA;XJTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Yang and Wei,\n Ping and Zhu,\n Song-Chun\n},\n title = {\n Jointly 
Recognizing Object Fluents and Tasks in Egocentric Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Large Pose 3D Face Reconstruction From a Single Image via Direct Volumetric CNN Regression", @@ -7949,6 +8458,7 @@ "status": "Poster", "track": "main", "pid": "603", + "author_site": "Aaron S. Jackson; Adrian Bulat; Vasileios Argyriou; Georgios Tzimiropoulos", "author": "Aaron S. Jackson; Adrian Bulat; Vasileios Argyriou; Georgios Tzimiropoulos", "abstract": "3D face reconstruction is a fundamental Computer Vision problem of extraordinary difficulty. Current systems often assume the availability of multiple facial images (sometimes from the same subject) as input, and must address a number of methodological challenges such as establishing dense correspondences across large facial poses, expressions, and non-uniform illumination. In general these methods require complex and inefficient pipelines for model building and fitting. In this work, we propose to address many of these limitations by training a Convolutional Neural Network (CNN) on an appropriate dataset consisting of 2D images and 3D facial models or scans. Our CNN works with just a single 2D facial image, does not require accurate alignment nor establishes dense correspondence between images, works for arbitrary facial poses and expressions, and can be used to reconstruct the whole 3D facial geometry (including the non-visible parts of the face) bypassing the construction (during training) and fitting (during testing) of a 3D Morphable Model. We achieve this via a simple CNN architecture that performs direct regression of a volumetric representation of the 3D facial geometry from a single 2D image. 
We also demonstrate how the related task of facial landmark localization can be incorporated into the proposed framework and help improve reconstruction quality, especially for the cases of large poses and facial expressions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jackson_Large_Pose_3D_ICCV_2017_paper.pdf", @@ -7966,14 +8476,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jackson_Large_Pose_3D_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "University of Nottingham;Kingston University", + "aff_unique_norm": "The University of Nottingham;Kingston University", "aff_unique_dep": ";", "aff_unique_url": "https://www.nottingham.ac.uk;https://www.kingston.ac.uk", "aff_unique_abbr": "Nottingham;KU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Jackson_2017_ICCV,\n \n author = {\n Jackson,\n Aaron S. and Bulat,\n Adrian and Argyriou,\n Vasileios and Tzimiropoulos,\n Georgios\n},\n title = {\n Large Pose 3D Face Reconstruction From a Single Image via Direct Volumetric CNN Regression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Large-Scale Image Retrieval With Attentive Deep Local Features", @@ -7981,6 +8492,7 @@ "status": "Poster", "track": "main", "pid": "1433", + "author_site": "Hyeonwoo Noh; Andre Araujo; Jack Sim; Tobias Weyand; Bohyung Han", "author": "Hyeonwoo Noh; Andre Araujo; Jack Sim; Tobias Weyand; Bohyung Han", "abstract": "We propose an attentive local feature descriptor suitable for large-scale image retrieval, referred to as DELF (DEep Local Feature). The new feature is based on convolutional neural networks, which are trained only with image-level annotations on a landmark image dataset. 
To identify semantically useful local features for image retrieval, we also propose an attention mechanism for keypoint selection, which shares most network layers with the descriptor. This framework can be used for image retrieval as a drop-in replacement for other keypoint detectors and descriptors, enabling more accurate feature matching and geometric verification. Our system produces reliable confidence scores to reject false positives---in particular, it is robust against queries that have no correct match in the database. To evaluate the proposed descriptor, we introduce a new large-scale dataset, referred to as Google-Landmarks dataset, which involves challenges in both database and query such as background clutter, partial occlusion, multiple landmarks, objects in variable scales, etc.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Noh_Large-Scale_Image_Retrieval_ICCV_2017_paper.pdf", @@ -7999,13 +8511,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Noh_Large-Scale_Image_Retrieval_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Pohang University of Science and Technology;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.postech.ac.kr;https://www.google.com", "aff_unique_abbr": "POSTECH;Google", "aff_campus_unique_index": "0;1;1;1;0", "aff_campus_unique": "Pohang;Mountain View", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Noh_2017_ICCV,\n \n author = {\n Noh,\n Hyeonwoo and Araujo,\n Andre and Sim,\n Jack and Weyand,\n Tobias and Han,\n Bohyung\n},\n title = {\n Large-Scale Image Retrieval With Attentive Deep Local Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Lattice Long Short-Term Memory for Human 
Action Recognition", @@ -8013,6 +8526,7 @@ "status": "Poster", "track": "main", "pid": "736", + "author_site": "Lin Sun; Kui Jia; Kevin Chen; Dit-Yan Yeung; Bertram E. Shi; Silvio Savarese", "author": "Lin Sun; Kui Jia; Kevin Chen; Dit-Yan Yeung; Bertram E. Shi; Silvio Savarese", "abstract": "Human actions captured in video sequences are three-dimensional signals characterizing visual appearance and motion dynamics. To learn action patterns, existing methods adopt Convolutional and/or Recurrent Neural Networks (CNNs and RNNs). CNN based methods are effective in learning spatial appearances, but are limited in modeling long-term motion dynamics. RNNs, especially Long Short-Term Memory (LSTM), are able to learn temporal motion dynamics. However, naively applying RNNs to video sequences in a convolutional manner implicitly assumes that motions in videos are stationary across different spatial locations. This assumption is valid for short-term motions but invalid when the duration of the motion is long. In this work, we propose Lattice-LSTM, which extends LSTM by learning independent hidden state transitions of memory cells for individual spatial locations. This method effectively enhances the ability to model dynamics across time and addresses the non-stationary issue of long-term motion dynamics without significantly increasing the model complexity. Additionally, we introduce a novel multi-modal training procedure for training our network. Unlike traditional two-stream architectures which use RGB and optical flow information as input, our two-stream model leverages both modalities to jointly train both input gates and both forget gates in the network rather than treating the two streams as separate entities with no information about the other. We apply this end-to-end system to benchmark datasets (UCF-101 and HMDB-51) of human action recognition. 
Experiments show that on both datasets, our proposed method outperforms all existing ones that are based on LSTM and/or CNNs of similar model complexities.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sun_Lattice_Long_Short-Term_ICCV_2017_paper.pdf", @@ -8027,7 +8541,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sun_Lattice_Long_Short-Term_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sun_Lattice_Long_Short-Term_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Sun_2017_ICCV,\n \n author = {\n Sun,\n Lin and Jia,\n Kui and Chen,\n Kevin and Yeung,\n Dit-Yan and Shi,\n Bertram E. and Savarese,\n Silvio\n},\n title = {\n Lattice Long Short-Term Memory for Human Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learned Multi-Patch Similarity", @@ -8035,6 +8550,7 @@ "status": "Poster", "track": "main", "pid": "839", + "author_site": "Wilfried Hartmann; Silvano Galliani; Michal Havlena; Luc Van Gool; Konrad Schindler", "author": "Wilfried Hartmann; Silvano Galliani; Michal Havlena; Luc Van Gool; Konrad Schindler", "abstract": "Estimating a depth map from multiple views of a scene is a fundamental task in computer vision. As soon as more than two viewpoints are available, one faces the very basic question how to measure similarity across >2 image patches. Surprisingly, no direct solution exists, instead it is common to fall back to more or less robust averaging of two-view similarities. Encouraged by the success of machine learning, and in particular convolutional neural networks, we propose to learn a matching function which directly maps multiple image patches to a scalar similarity score. 
Experiments on several multi-view datasets demonstrate that this approach has advantages over methods based on pairwise patch similarity.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hartmann_Learned_Multi-Patch_Similarity_ICCV_2017_paper.pdf", @@ -8059,7 +8575,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+2;0", - "aff_country_unique": "Switzerland;Austria;Belgium" + "aff_country_unique": "Switzerland;Austria;Belgium", + "bibtex": "@InProceedings{Hartmann_2017_ICCV,\n \n author = {\n Hartmann,\n Wilfried and Galliani,\n Silvano and Havlena,\n Michal and Van Gool,\n Luc and Schindler,\n Konrad\n},\n title = {\n Learned Multi-Patch Similarity\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learned Watershed: End-To-End Learning of Seeded Segmentation", @@ -8067,7 +8584,7 @@ "status": "Poster", "track": "main", "pid": "718", - "author_site": "Steffen Wolf; Lukas Schott; Ullrich K\u00c3\u00b6the; Fred Hamprecht", + "author_site": "Steffen Wolf; Lukas Schott; Ullrich Köthe; Fred Hamprecht", "author": "Steffen Wolf; Lukas Schott; Ullrich Kothe; Fred Hamprecht", "abstract": "Learned boundary maps are known to outperform hand-crafted ones as a basis for the watershed algorithm. We show, for the first time, how to train watershed computation jointly with boundary map prediction. The estimator for the merging priorities is cast as a neural network that is convolutional (over space) and recurrent (over iterations). The latter allows learning of complex shape priors. 
The method gives the best known seeded segmentation results on the CREMI segmentation challenge.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wolf_Learned_Watershed_End-To-End_ICCV_2017_paper.pdf", @@ -8092,7 +8609,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Wolf_2017_ICCV,\n \n author = {\n Wolf,\n Steffen and Schott,\n Lukas and Kothe,\n Ullrich and Hamprecht,\n Fred\n},\n title = {\n Learned Watershed: End-To-End Learning of Seeded Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning 3D Object Categories by Looking Around Them", @@ -8100,6 +8618,7 @@ "status": "Oral", "track": "main", "pid": "1114", + "author_site": "David Novotny; Diane Larlus; Andrea Vedaldi", "author": "David Novotny; Diane Larlus; Andrea Vedaldi", "abstract": "Traditional approaches for learning 3D object categories use either synthetic data or manual supervision. In this paper, we propose a method which does not require manual annotations and is instead cued by observing objects from a moving vantage point. Our system builds on two innovations: a Siamese viewpoint factorization network that robustly aligns different videos together without explicitly comparing 3D shapes; and a 3D shape completion network that can extract the full shape of an object from partial observations. We also demonstrate the benefits of configuring networks to perform probabilistic predictions as well as of geometry-aware data augmentation schemes. 
We obtain state-of-the-art results on publicly-available benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Novotny_Learning_3D_Object_ICCV_2017_paper.pdf", @@ -8117,14 +8636,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Novotny_Learning_3D_Object_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;0", - "aff_unique_norm": "University of Oxford;NAVER LABS Europe", + "aff_unique_norm": "University of Oxford;Naver Labs Europe", "aff_unique_dep": "Dept. of Engineering Science;Computer Vision Group", "aff_unique_url": "https://www.ox.ac.uk;https://labs.naver.com", "aff_unique_abbr": "Oxford;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0+1;1;0", - "aff_country_unique": "United Kingdom;Unknown" + "aff_country_unique": "United Kingdom;Unknown", + "bibtex": "@InProceedings{Novotny_2017_ICCV,\n \n author = {\n Novotny,\n David and Larlus,\n Diane and Vedaldi,\n Andrea\n},\n title = {\n Learning 3D Object Categories by Looking Around Them\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Action Recognition Model From Depth and Skeleton Videos", @@ -8132,6 +8652,7 @@ "status": "Poster", "track": "main", "pid": "3016", + "author_site": "Hossein Rahmani; Mohammed Bennamoun", "author": "Hossein Rahmani; Mohammed Bennamoun", "abstract": "Depth sensors open up possibilities of dealing with the human action recognition problem by providing 3D human skeleton data and depth images of the scene. Analysis of human actions based on 3D skeleton data has become popular recently, due to its robustness and view-invariant representation. However, the skeleton alone is insufficient to distinguish actions which involve human-object interactions. 
In this paper, we propose a deep model which efficiently models human-object interactions and intra-class variations under viewpoint changes. First, a human body-part model is introduced to transfer the depth appearances of body-parts to a shared view-invariant space. Second, an end-to-end learning framework is proposed which is able to effectively combine the view-invariant body-part representation from skeletal and depth images, and learn the relations between the human body-parts and the environmental objects, the interactions between different human body-parts, and the temporal structure of human actions. We have evaluated the performance of our proposed model against 15 existing techniques on two large benchmark human action recognition datasets including NTU RGB+D and UWA3DII. The Experimental results show that our technique provides a significant improvement over state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rahmani_Learning_Action_Recognition_ICCV_2017_paper.pdf", @@ -8149,14 +8670,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rahmani_Learning_Action_Recognition_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Western Australia", + "aff_unique_norm": "The University of Western Australia", "aff_unique_dep": "School of Computer Science and Software Engineering", "aff_unique_url": "https://www.uwa.edu.au", "aff_unique_abbr": "UWA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Rahmani_2017_ICCV,\n \n author = {\n Rahmani,\n Hossein and Bennamoun,\n Mohammed\n},\n title = {\n Learning Action Recognition Model From Depth and Skeleton Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": 
"Learning Background-Aware Correlation Filters for Visual Tracking", @@ -8164,6 +8686,7 @@ "status": "Poster", "track": "main", "pid": "436", + "author_site": "Hamed Kiani Galoogahi; Ashton Fagg; Simon Lucey", "author": "Hamed Kiani Galoogahi; Ashton Fagg; Simon Lucey", "abstract": "Correlation Filters (CFs) have recently demonstrated excellent performance in terms of rapidly tracking objects under challenging photometric and geometric variations. The strength of the approach comes from its ability to efficiently learn - \"on the fly\" - how the object is changing over time. A fundamental drawback to CFs, however, is that the background of the target is not modeled over time which can result in suboptimal performance. Recent tracking algorithms have suggested to resolve this drawback by either learning CFs from more discriminative deep features (e.g. DeepSRDCF and CCOT) or learning complex deep trackers (e.g. MDNet and FCNT). While such methods have been shown to work well, they suffer from high complexity: extracting deep features or applying deep tracking frameworks is very computationally expensive. This limits the real-time performance of such methods, even on high-end GPUs. This work proposes a Background-Aware CF based on hand-crafted features (HOG) that can efficiently model how both the foreground and background of the object varies over time. 
Our approach, like conventional CFs, is extremely computationally efficient- and extensive experiments over multiple tracking benchmarks demonstrate the superior accuracy and real-time performance of our method compared to the state-of-the-art trackers.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Galoogahi_Learning_Background-Aware_Correlation_ICCV_2017_paper.pdf", @@ -8188,7 +8711,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Galoogahi_2017_ICCV,\n \n author = {\n Kiani Galoogahi,\n Hamed and Fagg,\n Ashton and Lucey,\n Simon\n},\n title = {\n Learning Background-Aware Correlation Filters for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Bag-Of-Features Pooling for Deep Convolutional Neural Networks", @@ -8196,6 +8720,7 @@ "status": "Poster", "track": "main", "pid": "2919", + "author_site": "Nikolaos Passalis; Anastasios Tefas", "author": "Nikolaos Passalis; Anastasios Tefas", "abstract": "Convolutional Neural Networks (CNNs) are well established models capable of achieving state-of-the-art classification accuracy for various computer vision tasks. However, they are becoming increasingly larger, using millions of parameters, while they are restricted to handling images of fixed size. In this paper, a quantization-based approach, inspired from the well-known Bag-of-Features model, is proposed to overcome these limitations. The proposed approach, called Convolutional BoF (CBoF), uses RBF neurons to quantize the information extracted from the convolutional layers and it is able to natively classify images of various sizes as well as to significantly reduce the number of parameters in the network. 
In contrast to other global pooling operators and CNN compression techniques the proposed method utilizes a trainable pooling layer that it is end-to-end differentiable, allowing the network to be trained using regular back-propagation and to achieve greater distribution shift invariance than competitive methods. The ability of the proposed method to reduce the parameters of the network and increase the classification accuracy over other state-of-the-art techniques is demonstrated using three image datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Passalis_Learning_Bag-Of-Features_Pooling_ICCV_2017_paper.pdf", @@ -8220,7 +8745,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Thessaloniki", "aff_country_unique_index": "0;0", - "aff_country_unique": "Greece" + "aff_country_unique": "Greece", + "bibtex": "@InProceedings{Passalis_2017_ICCV,\n \n author = {\n Passalis,\n Nikolaos and Tefas,\n Anastasios\n},\n title = {\n Learning Bag-Of-Features Pooling for Deep Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Blind Motion Deblurring", @@ -8228,7 +8754,7 @@ "status": "Poster", "track": "main", "pid": "237", - "author_site": "Patrick Wieschollek; Michael Hirsch; Bernhard Sch\u00c3\u00b6lkopf; Hendrik P. A. Lensch", + "author_site": "Patrick Wieschollek; Michael Hirsch; Bernhard Schölkopf; Hendrik P. A. Lensch", "author": "Patrick Wieschollek; Michael Hirsch; Bernhard Scholkopf; Hendrik P. A. Lensch", "abstract": "As handheld video cameras are now commonplace and available in every smartphone images and videos can be recorded almost everywhere at any time. However, taking a quick shot frequently ends up in a blurry result due to unwanted camera shake during recording or moving objects in the scene. 
Removing these artifacts from the blurry recordings is a highly ill-posed problem as neither the sharp image nor the motion blur is known. Propagating information between multiple consecutive blurry observations can help to restore the desired sharp image or video. Solutions for blind deconvolution based on neural networks rely on a massive amount of ground-truth data which was difficult to acquire. In this work, we propose an efficient approach to produce a significant amount of realistic training data and introduce a novel recurrent network architecture to deblur frames, which can efficiently handle arbitrary spatial and temporal input sizes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wieschollek_Learning_Blind_Motion_ICCV_2017_paper.pdf", @@ -8244,7 +8770,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wieschollek_Learning_Blind_Motion_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wieschollek_Learning_Blind_Motion_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wieschollek_2017_ICCV,\n \n author = {\n Wieschollek,\n Patrick and Hirsch,\n Michael and Scholkopf,\n Bernhard and Lensch,\n Hendrik P. A.\n},\n title = {\n Learning Blind Motion Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Compact Geometric Features", @@ -8252,6 +8779,7 @@ "status": "Poster", "track": "main", "pid": "254", + "author_site": "Marc Khoury; Qian-Yi Zhou; Vladlen Koltun", "author": "Marc Khoury; Qian-Yi Zhou; Vladlen Koltun", "abstract": "We present an approach to learning features that represent the local geometry around a point in an unstructured point cloud. Such features play a central role in geometric registration, which supports diverse applications in robotics and 3D vision. 
Current state-of-the-art local features for unstructured point clouds have been manually crafted and none combines the desirable properties of precision, compactness, and robustness. We show that features with these properties can be learned from data, by optimizing deep networks that map high-dimensional histograms into low-dimensional Euclidean spaces. The presented approach yields a family of features, parameterized by dimension, that are both more compact and more accurate than existing descriptors.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Khoury_Learning_Compact_Geometric_ICCV_2017_paper.pdf", @@ -8267,7 +8795,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Khoury_Learning_Compact_Geometric_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Khoury_Learning_Compact_Geometric_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Khoury_2017_ICCV,\n \n author = {\n Khoury,\n Marc and Zhou,\n Qian-Yi and Koltun,\n Vladlen\n},\n title = {\n Learning Compact Geometric Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Cooperative Visual Dialog Agents With Deep Reinforcement Learning", @@ -8275,7 +8804,7 @@ "status": "Oral", "track": "main", "pid": "175", - "author_site": "Abhishek Das; Satwik Kottur; Jos\u00c3\u00a9 M. F. Moura; Stefan Lee; Dhruv Batra", + "author_site": "Abhishek Das; Satwik Kottur; José M. F. Moura; Stefan Lee; Dhruv Batra", "author": "Abhishek Das; Satwik Kottur; Jose M. F. Moura; Stefan Lee; Dhruv Batra", "abstract": "We introduce the first goal-driven training for visual question answering and dialog agents. 
Specifically, we pose a cooperative `image guessing' game between two agents -- Qbot and Abot -- who communicate in natural language dialog so that Qbot can select an unseen image from a lineup of images. We use deep reinforcement learning (RL) to end-to-end learn the policies of these agents -- from pixels to multi-agent multi-round dialog to game reward. We demonstrate two experimental results. First, as a `sanity check' demonstration of pure RL (from scratch), we show results on a synthetic world, where the agents communicate in ungrounded vocabulary, ie, symbols with no pre-specified meanings (X, Y, Z). We find that two bots invent their own communication protocol and start using certain symbols to ask/answer about certain visual attributes (shape/color/size). Thus, we demonstrate the emergence of grounded language and communication among `visual' dialog agents with no human supervision at all. Second, we conduct large-scale real-image experiments on the VisDial dataset, where we pretrain on dialog data and show that the RL fine-tuned agents significantly outperform supervised pretraining. 
Interestingly, the RL Qbot learns to ask questions that Abot is good at, ultimately resulting in more informative dialog and a better team.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Das_Learning_Cooperative_Visual_ICCV_2017_paper.pdf", @@ -8293,14 +8822,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Das_Learning_Cooperative_Visual_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;2;0+3", - "aff_unique_norm": "Georgia Institute of Technology;Carnegie Mellon University;Virginia Tech;Meta", + "aff_unique_norm": "Georgia Institute of Technology;Carnegie Mellon University;Virginia Tech;Facebook", "aff_unique_dep": ";;;Facebook AI Research", "aff_unique_url": "https://www.gatech.edu;https://www.cmu.edu;https://www.vt.edu;https://research.facebook.com", "aff_unique_abbr": "Georgia Tech;CMU;VT;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Das_2017_ICCV,\n \n author = {\n Das,\n Abhishek and Kottur,\n Satwik and Moura,\n Jose M. F. and Lee,\n Stefan and Batra,\n Dhruv\n},\n title = {\n Learning Cooperative Visual Dialog Agents With Deep Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Deep Neural Networks for Vehicle Re-ID With Visual-Spatio-Temporal Path Proposals", @@ -8308,6 +8838,7 @@ "status": "Poster", "track": "main", "pid": "747", + "author_site": "Yantao Shen; Tong Xiao; Hongsheng Li; Shuai Yi; Xiaogang Wang", "author": "Yantao Shen; Tong Xiao; Hongsheng Li; Shuai Yi; Xiaogang Wang", "abstract": "Vehicle re-identification is an important problem and has many applications in video surveillance and intelligent transportation. 
It gains increasing attention because of the recent advances of person re-identification techniques. However, unlike person re-identification, the visual differences between pairs of vehicle images are usually subtle and even challenging for humans to distinguish. Incorporating additional spatio-temporal information is vital for solving the challenging re-identification task. Existing vehicle re-identification methods ignored or used over-simplified models for the spatio-temporal relations between vehicle images. In this paper, we propose a two-stage framework that incorporates complex spatio-temporal information for effectively regularizing the re-identification results. Given a pair of vehicle images with their spatio-temporal information, a candidate visual-spatio-temporal path is first generated by a chain MRF model with a deeply learned potential function, where each visual-spatio-temporal state corresponds to an actual vehicle image with its spatio-temporal information. A Siamese-CNN+Path-LSTM model takes the candidate path as well as the pairwise queries to generate their similarity score. 
Extensive experiments and analysis show the effectiveness of our proposed method and individual components.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shen_Learning_Deep_Neural_ICCV_2017_paper.pdf", @@ -8322,7 +8853,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shen_Learning_Deep_Neural_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shen_Learning_Deep_Neural_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Shen_2017_ICCV,\n \n author = {\n Shen,\n Yantao and Xiao,\n Tong and Li,\n Hongsheng and Yi,\n Shuai and Wang,\n Xiaogang\n},\n title = {\n Learning Deep Neural Networks for Vehicle Re-ID With Visual-Spatio-Temporal Path Proposals\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Dense Facial Correspondences in Unconstrained Images", @@ -8330,6 +8862,7 @@ "status": "Poster", "track": "main", "pid": "2042", + "author_site": "Ronald Yu; Shunsuke Saito; Haoxiang Li; Duygu Ceylan; Hao Li", "author": "Ronald Yu; Shunsuke Saito; Haoxiang Li; Duygu Ceylan; Hao Li", "abstract": "We present a minimalistic but effective neural network that computes dense facial correspondences in highly unconstrained RGB images. Our network learns a per-pixel flow and a matchability mask between 2D input photographs of a person and the projection of a textured 3D face model. To train such a network, we generate a massive dataset of synthetic faces with dense labels using renderings of a morphable face model with variations in pose, expressions, lighting, and occlusions. We found that a training refinement using real photographs is required to drastically improve the ability to handle real images. 
When combined with a facial detection and 3D face fitting step, we show that our approach outperforms the state-of-the-art face alignment methods in terms of accuracy and speed. By directly estimating dense correspondences, we do not rely on the full visibility of sparse facial landmarks and are not limited to the model space of regression-based approaches. We also assess our method on video frames and demonstrate successful per-frame processing under extreme pose variations, occlusions, and lighting conditions. Compared to existing 3D facial tracking techniques, our fitting does not rely on previous frames or frontal facial initialization and is robust to imperfect face detections.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_Learning_Dense_Facial_ICCV_2017_paper.pdf", @@ -8354,7 +8887,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+1+0;0+1+0;0;0;0+1+0", - "aff_country_unique": "United States;Israel" + "aff_country_unique": "United States;Israel", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Ronald and Saito,\n Shunsuke and Li,\n Haoxiang and Ceylan,\n Duygu and Li,\n Hao\n},\n title = {\n Learning Dense Facial Correspondences in Unconstrained Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Discriminative Aggregation Network for Video-Based Face Recognition", @@ -8362,6 +8896,7 @@ "status": "Spotlight", "track": "main", "pid": "1600", + "author_site": "Yongming Rao; Ji Lin; Jiwen Lu; Jie Zhou", "author": "Yongming Rao; Ji Lin; Jiwen Lu; Jie Zhou", "abstract": "In this paper, we propose a discriminative aggregation network (DAN) for video face recognition, which aims to integrate information from video frames effectively and efficiently. 
Different from existing aggregation methods, our method aggregates raw video frames directly instead of the features obtained by complex processing. By combining the idea of metric learning and adversarial learning, we learn an aggregation network that produces more discriminative synthesized images compared to input frames. Our framework reduces the number of frames to be processed and greatly speed up the recognition procedure. Furthermore, low-quality frames containing misleading information are denoised during the aggregation process, making the system more robust and discriminative. Experimental results show that our framework can generate discriminative images from video clips and improve the overall recognition performance in both the speed and accuracy on three widely used datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rao_Learning_Discriminative_Aggregation_ICCV_2017_paper.pdf", @@ -8386,7 +8921,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Rao_2017_ICCV,\n \n author = {\n Rao,\n Yongming and Lin,\n Ji and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Learning Discriminative Aggregation Network for Video-Based Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Discriminative Data Fitting Functions for Blind Image Deblurring", @@ -8394,6 +8930,7 @@ "status": "Poster", "track": "main", "pid": "376", + "author_site": "Jinshan Pan; Jiangxin Dong; Yu-Wing Tai; Zhixun Su; Ming-Hsuan Yang", "author": "Jinshan Pan; Jiangxin Dong; Yu-Wing Tai; Zhixun Su; Ming-Hsuan Yang", "abstract": "Solving blind image deblurring usually requires defining a data fitting function and image priors. 
While existing algorithms mainly focus on developing image priors for blur kernel estimation and non-blind deconvolution, only a few methods consider the effect of data fitting functions. In contrast to the state-of-the-art methods that use a single or a fixed data fitting term, we propose a data-driven approach to learn effective data fitting functions from a large set of motion blurred images with associated ground truth blur kernels. The learned data fitting function facilitates estimating accurate blur kernels for generic images and domain-specific problems with corresponding image priors. In addition, we extend the learning approach for data fitting function to latent image restoration and non-uniform deblurring. Extensive experiments on challenging motion blurred images demonstrate the proposed algorithm performs favorably against the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Pan_Learning_Discriminative_Data_ICCV_2017_paper.pdf", @@ -8418,7 +8955,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Pan_2017_ICCV,\n \n author = {\n Pan,\n Jinshan and Dong,\n Jiangxin and Tai,\n Yu-Wing and Su,\n Zhixun and Yang,\n Ming-Hsuan\n},\n title = {\n Learning Discriminative Data Fitting Functions for Blind Image Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Discriminative Latent Attributes for Zero-Shot Classification", @@ -8426,6 +8964,7 @@ "status": "Poster", "track": "main", "pid": "2001", + "author_site": "Huajie Jiang; Ruiping Wang; Shiguang Shan; Yi Yang; Xilin Chen", "author": "Huajie Jiang; Ruiping Wang; Shiguang Shan; Yi Yang; Xilin Chen", "abstract": "Zero-shot learning (ZSL) aims to transfer 
knowledge from observed classes to the unseen classes, based on the assumption that both the seen and unseen classes share a common semantic space, among which attributes enjoy a great popularity. However, few works study whether the human-designed semantic attributes are discriminative enough to recognize different classes. Moreover, attributes are often correlated with each other, which makes it less desirable to learn each attribute independently. In this paper, we propose to learn a latent attribute space, which is not only discriminative but also semantic-preserving, to perform the ZSL task. Specifically, a dictionary learning framework is exploited to connect the latent attribute space with attribute space and similarity space. Extensive experiments on four benchmark datasets show the effectiveness of the proposed approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jiang_Learning_Discriminative_Latent_ICCV_2017_paper.pdf", @@ -8443,14 +8982,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jiang_Learning_Discriminative_Latent_ICCV_2017_paper.html", "aff_unique_index": "0+1+2;0;0;3;0", - "aff_unique_norm": "Chinese Academy of Sciences;Shanghai Institute of Microsystem and Information Technology;ShanghaiTech University;Huawei", - "aff_unique_dep": "Institute of Computing Technology;;;Huawei Technologies Co., Ltd.", + "aff_unique_norm": "Chinese Academy of Sciences;Shanghai Institute of Microsystem and Information Technology;ShanghaiTech University;Huawei Technologies Co., Ltd.", + "aff_unique_dep": "Institute of Computing Technology;;;", "aff_unique_url": "http://www.cas.ac.cn;;http://www.shanghaitech.edu.cn;https://www.huawei.com", "aff_unique_abbr": "CAS;;ShanghaiTech;Huawei", "aff_campus_unique_index": "0+1+1;0;0;0;0", "aff_campus_unique": "Beijing;Shanghai", "aff_country_unique_index": "0+0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Jiang_2017_ICCV,\n \n author = {\n Jiang,\n Huajie and Wang,\n Ruiping and Shan,\n Shiguang and Yang,\n Yi and Chen,\n Xilin\n},\n title = {\n Learning Discriminative Latent Attributes for Zero-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Discriminative ab-Divergences for Positive Definite Matrices", @@ -8458,6 +8998,7 @@ "status": "Poster", "track": "main", "pid": "1653", + "author_site": "Anoop Cherian; Panagiotis Stanitsas; Mehrtash Harandi; Vassilios Morellas; Nikolaos Papanikolopoulos", "author": "Anoop Cherian; Panagiotis Stanitsas; Mehrtash Harandi; Vassilios Morellas; Nikolaos Papanikolopoulos", "abstract": "Symmetric positive definite (SPD) matrices are useful for capturing second-order statistics of visual data. To compare two SPD matrices, several measures are available, such as the affine-invariant Riemannian metric, Jeffreys divergence, Jensen-Bregman logdet divergence, etc.; however, their behaviors may be application dependent, raising the need of manual selection to achieve the best possible performance. Further and as a result of their overwhelming complexity for large-scale problems, computing pairwise similarities by clever embedding of SPD matrices is often preferred to direct use of the aforementioned measures. In this paper, we propose a discriminative metric learning framework, Information Divergence and Dictionary Learning (IDDL), that not only learns application specific measures on SPD matrices automatically, but also embeds them as vectors using a learned dictionary. To learn the similarity measures (which could potentially be distinct for every dictionary atom), we use the recently introduced alpha-beta-logdet divergence, which is known to unify the measures listed above. 
We propose a novel IDDL objective, that learns the parameters of the divergence and the dictionary atoms jointly in a discriminative setup and is solved efficiently using Riemannian optimization. We showcase extensive experiments on eight computer vision datasets, demonstrating state-of-the-art performances.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cherian_Learning_Discriminative_ab-Divergences_ICCV_2017_paper.pdf", @@ -8482,7 +9023,8 @@ "aff_campus_unique_index": ";;1;1;1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0+0;0+0;1;1;1", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Cherian_2017_ICCV,\n \n author = {\n Cherian,\n Anoop and Stanitsas,\n Panagiotis and Harandi,\n Mehrtash and Morellas,\n Vassilios and Papanikolopoulos,\n Nikolaos\n},\n title = {\n Learning Discriminative ab-Divergences for Positive Definite Matrices\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Dynamic Siamese Network for Visual Object Tracking", @@ -8490,6 +9032,7 @@ "status": "Poster", "track": "main", "pid": "688", + "author_site": "Qing Guo; Wei Feng; Ce Zhou; Rui Huang; Liang Wan; Song Wang", "author": "Qing Guo; Wei Feng; Ce Zhou; Rui Huang; Liang Wan; Song Wang", "abstract": "How to effectively learn temporal variation of target appearance, to exclude the interference of cluttered background, while maintaining real-time response, is an essential problem of visual object tracking. Recently, Siamese networks have shown great potentials of matching based trackers in achieving balanced accuracy and beyond real-time speed. However, they still have a big gap to classification & updating based trackers in tolerating the temporal changes of objects and imaging conditions. 
In this paper, we propose dynamic Siamese network, via a fast transformation learning model that enables effective online learning of target appearance variation and background suppression from previous frames. We then present elementwise multi-layer fusion to adaptively integrate the network outputs using multi-level deep features. Unlike state-of-the-art trackers, our approach allows the usage of any feasible generally- or particularly-trained features, such as SiamFC and VGG. More importantly, the proposed dynamic Siamese network can be jointly trained as a whole directly on the labeled video sequences, thus can take full advantage of the rich spatial temporal information of moving objects. As a result, our approach achieves state-of-the-art performance on OTB-2013 and VOT-2015 benchmarks, while exhibits superiorly balanced accuracy and real-time response over state-of-the-art competitors.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Guo_Learning_Dynamic_Siamese_ICCV_2017_paper.pdf", @@ -8505,7 +9048,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Guo_Learning_Dynamic_Siamese_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Guo_Learning_Dynamic_Siamese_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Guo_2017_ICCV,\n \n author = {\n Guo,\n Qing and Feng,\n Wei and Zhou,\n Ce and Huang,\n Rui and Wan,\n Liang and Wang,\n Song\n},\n title = {\n Learning Dynamic Siamese Network for Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Efficient Convolutional Networks Through Network Slimming", @@ -8513,6 +9057,7 @@ "status": "Poster", "track": "main", "pid": "936", + "author_site": "Zhuang Liu; Jianguo Li; Zhiqiang Shen; Gao Huang; Shoumeng Yan; Changshui Zhang", "author": "Zhuang Liu; Jianguo Li; 
Zhiqiang Shen; Gao Huang; Shoumeng Yan; Changshui Zhang", "abstract": "The deployment of deep convolutional neural networks (CNNs) in many real world applications is largely hindered by their high computational cost. In this paper, we propose a novel learning scheme for CNNs to simultaneously 1) reduce the model size; 2) decrease the run-time memory footprint; and 3) lower the number of computing operations, without compromising accuracy. This is achieved by enforcing channel-level sparsity in the network in a simple but effective way. Different from many existing approaches, the proposed method directly applies to modern CNN architectures, introduces minimum overhead to the training process, and requires no special software/hardware accelerators for the resulting models. We call our approach network slimming, which takes wide and large networks as input models, but during training insignificant channels are automatically identified and pruned afterwards, yielding thin and compact models with comparable accuracy. We empirically demonstrate the effectiveness of our approach with several state-of-the-art CNN models, including VGGNet, ResNet and DenseNet, on various image classification datasets. 
For VGGNet, a multi-pass version of network slimming gives a 20x reduction in model size and a 5x reduction in computing operations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Learning_Efficient_Convolutional_ICCV_2017_paper.pdf", @@ -8530,14 +9075,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Learning_Efficient_Convolutional_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;2;3;1;0", - "aff_unique_norm": "Tsinghua University;Intel;Fudan University;Cornell University", + "aff_unique_norm": "Tsinghua University;Intel Corporation;Fudan University;Cornell University", "aff_unique_dep": "CSAI, TNList;Intel Labs;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.intel.cn;https://www.fudan.edu.cn;https://www.cornell.edu", "aff_unique_abbr": ";Intel;Fudan;Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Zhuang and Li,\n Jianguo and Shen,\n Zhiqiang and Huang,\n Gao and Yan,\n Shoumeng and Zhang,\n Changshui\n},\n title = {\n Learning Efficient Convolutional Networks Through Network Slimming\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Feature Pyramids for Human Pose Estimation", @@ -8545,6 +9091,7 @@ "status": "Poster", "track": "main", "pid": "460", + "author_site": "Wei Yang; Shuang Li; Wanli Ouyang; Hongsheng Li; Xiaogang Wang", "author": "Wei Yang; Shuang Li; Wanli Ouyang; Hongsheng Li; Xiaogang Wang", "abstract": "Articulated human pose estimation is a fundamental yet challenging task in computer vision. The difficulty is particularly pronounced in scale variations of human body parts when camera view changes or severe foreshortening happens. 
Although pyramid methods are widely used to handle scale changes at inference time, learning feature pyramids in deep convolutional neural networks (DCNNs) is still not well explored. In this work, we design a Pyramid Residual Module (PRMs) to enhance the invariance in scales of DCNNs. Given input features, the PRMs learn convolutional filters on various scales of input features, which are obtained with different subsampling ratios in a multi-branch network. Moreover, we observe that it is inappropriate to adopt existing methods to initialize the weights of multi-branch networks, which achieve superior performance than plain networks in many tasks recently. Therefore, we provide theoretic derivation to extend the current weight initialization scheme to multi-branch network structures. We investigate our method on two standard benchmarks for human pose estimation. Our approach obtains state-of-the-art results on both benchmarks. Code is available at https://github.com/bearpaw/PyraNet.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yang_Learning_Feature_Pyramids_ICCV_2017_paper.pdf", @@ -8562,14 +9109,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yang_Learning_Feature_Pyramids_ICCV_2017_paper.html", "aff_unique_index": "0;0;0+1;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Sydney", + "aff_unique_norm": "The Chinese University of Hong Kong;The University of Sydney", "aff_unique_dep": "Department of Electronic Engineering;School of Electrical and Information Engineering", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sydney.edu.au", "aff_unique_abbr": "CUHK;USYD", "aff_campus_unique_index": "0;0;0+1;0;0", "aff_campus_unique": "Hong Kong SAR;Sydney", "aff_country_unique_index": "0;0;0+1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Yang_2017_ICCV,\n \n author = {\n Yang,\n Wei and Li,\n Shuang and Ouyang,\n 
Wanli and Li,\n Hongsheng and Wang,\n Xiaogang\n},\n title = {\n Learning Feature Pyramids for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning From Noisy Labels With Distillation", @@ -8577,6 +9125,7 @@ "status": "Poster", "track": "main", "pid": "752", + "author_site": "Yuncheng Li; Jianchao Yang; Yale Song; Liangliang Cao; Jiebo Luo; Li-Jia Li", "author": "Yuncheng Li; Jianchao Yang; Yale Song; Liangliang Cao; Jiebo Luo; Li-Jia Li", "abstract": "The ability of learning from noisy labels is very useful in many visual recognition tasks, as a vast amount of data with noisy labels are relatively easy to obtain. Traditionally, label noise has been treated as statistical outliers, and techniques such as importance re-weighting and bootstrapping have been proposed to alleviate the problem. According to our observation, the real-world noisy labels exhibit multi-mode characteristics as the true labels, rather than behaving like independent random outliers. In this work, we propose a unified distillation framework to use \"side\" information, including a small clean dataset and label relations in knowledge graph, to \"hedge the risk\" of learning from noisy labels. Unlike the traditional approaches evaluated based on simulated label noises, we propose a suite of new benchmark datasets, in Sports, Species and Artifacts domains, to evaluate the task of learning from noisy labels in the practical setting. 
The empirical study demonstrates the effectiveness of our proposed method in all the domains.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Learning_From_Noisy_ICCV_2017_paper.pdf", @@ -8595,13 +9144,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Learning_From_Noisy_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;2;3;4", "aff_unique_norm": "Snap Inc.;Yahoo;hellovera.ai;University of Rochester;Google", - "aff_unique_dep": ";Yahoo Research;;;Google", + "aff_unique_dep": ";Yahoo Research;;;", "aff_unique_url": "https://www.snapinc.com;https://research.yahoo.com;https://www.hellovera.ai;https://www.rochester.edu;https://www.google.com", "aff_unique_abbr": "Snap;Yahoo Research;;U of R;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Yuncheng and Yang,\n Jianchao and Song,\n Yale and Cao,\n Liangliang and Luo,\n Jiebo and Li,\n Li-Jia\n},\n title = {\n Learning From Noisy Labels With Distillation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning From Video and Text via Large-Scale Discriminative Clustering", @@ -8609,10 +9159,11 @@ "status": "Spotlight", "track": "main", "pid": "1353", + "author_site": "Antoine Miech; Jean-Baptiste Alayrac; Piotr Bojanowski; Ivan Laptev; Josef Sivic", "author": "Antoine Miech; Jean-Baptiste Alayrac; Piotr Bojanowski; Ivan Laptev; Josef Sivic", "abstract": "Discriminative clustering has been successfully applied to a number of weakly supervised learning tasks. Such applications include person and action recognition, text-to-video alignment, object co-segmentation and colocalization in videos and images. 
One drawback of discriminative clustering, however, is its limited scalability. We address this issue and propose an online optimization algorithm based on the Block-Coordinate Frank-Wolfe algorithm. We apply the proposed method to the problem of weakly supervised learning of actions and actors from movies together with corresponding movie scripts. The scaling up of the learning problem to 66 feature-length movies enables us to significantly improve weakly supervised action recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Miech_Learning_From_Video_ICCV_2017_paper.pdf", - "aff": "\u00b4Ecole Normale Sup \u00b4erieure+Inria; \u00b4Ecole Normale Sup \u00b4erieure+Inria; Inria; \u00b4Ecole Normale Sup \u00b4erieure+Inria; \u00b4Ecole Normale Sup \u00b4erieure+Inria+CIIRC", + "aff": "´Ecole Normale Sup ´erieure+Inria; ´Ecole Normale Sup ´erieure+Inria; Inria; ´Ecole Normale Sup ´erieure+Inria; ´Ecole Normale Sup ´erieure+Inria+CIIRC", "project": "", "github": "", "supp": "", @@ -8626,14 +9177,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Miech_Learning_From_Video_ICCV_2017_paper.html", "aff_unique_index": "0+1;0+1;1;0+1;0+1+2", - "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;INRIA;CIIRC", + "aff_unique_norm": "Ecole Normale Supérieure;Inria;CIIRC", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ens.fr;https://www.inria.fr;https://www.ciirc.cvut.cz/", "aff_unique_abbr": "ENS;Inria;CIIRC", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0+1", - "aff_country_unique": "France;Czech Republic" + "aff_country_unique": "France;Czech Republic", + "bibtex": "@InProceedings{Miech_2017_ICCV,\n \n author = {\n Miech,\n Antoine and Alayrac,\n Jean-Baptiste and Bojanowski,\n Piotr and Laptev,\n Ivan and Sivic,\n Josef\n},\n title = {\n Learning From Video and Text via Large-Scale Discriminative Clustering\n},\n booktitle = {\n Proceedings of the IEEE 
International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Gaze Transitions From Depth to Improve Video Saliency Estimation", @@ -8641,6 +9193,7 @@ "status": "Poster", "track": "main", "pid": "825", + "author_site": "George Leifman; Dmitry Rudoy; Tristan Swedish; Eduardo Bayro-Corrochano; Ramesh Raskar", "author": "George Leifman; Dmitry Rudoy; Tristan Swedish; Eduardo Bayro-Corrochano; Ramesh Raskar", "abstract": "In this paper we introduce a novel Depth-Aware Video Saliency approach to predict human focus of attention when viewing videos that contain a depth map (RGBD) on a 2D screen. Saliency estimation in this scenario is highly important since in the near future 3D video content will be easily acquired yet hard to display. Despite considerable progress in 3D display technologies, most are still expensive and require special glasses for viewing, so RGBD content is primarily viewed on 2D screens, removing the depth channel from the final viewing experience. We train a generative convolutional neural network that predicts the 2D viewing saliency map for a given frame using the RGBD pixel values and previous fixation estimates in the video. To evaluate the performance of our approach, we present a new comprehensive database of 2D viewing eye-fixation ground-truth for RGBD videos. Our experiments indicate that it is beneficial to integrate depth into video saliency estimates for content that is viewed on a 2D display. 
We demonstrate that our approach outperforms state-of-the-art methods for video saliency, achieving 15% relative improvement.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Leifman_Learning_Gaze_Transitions_ICCV_2017_paper.pdf", @@ -8658,14 +9211,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Leifman_Learning_Gaze_Transitions_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;3;2", - "aff_unique_norm": "Amazon;Intel;Massachusetts Institute of Technology;CINVESTA", - "aff_unique_dep": "Amazon.com, Inc.;Intel Corporation;Media Lab;", + "aff_unique_norm": "Amazon.com, Inc.;Intel Corporation;Massachusetts Institute of Technology;CINVESTA", + "aff_unique_dep": ";;Media Lab;", "aff_unique_url": "https://www.amazon.com;https://www.intel.com;http://www.media.mit.edu/;", "aff_unique_abbr": "Amazon;Intel;MIT;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;Cuba" + "aff_country_unique": "United States;Cuba", + "bibtex": "@InProceedings{Leifman_2017_ICCV,\n \n author = {\n Leifman,\n George and Rudoy,\n Dmitry and Swedish,\n Tristan and Bayro-Corrochano,\n Eduardo and Raskar,\n Ramesh\n},\n title = {\n Learning Gaze Transitions From Depth to Improve Video Saliency Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Hand Articulations by Hallucinating Heat Distribution", @@ -8673,6 +9227,7 @@ "status": "Poster", "track": "main", "pid": "1394", + "author_site": "Chiho Choi; Sangpil Kim; Karthik Ramani", "author": "Chiho Choi; Sangpil Kim; Karthik Ramani", "abstract": "We propose a robust hand pose estimation method by learning hand articulations from depth features and auxiliary modality features. 
As an additional modality to depth data, we present a function of geometric properties on the surface of the hand described by heat diffusion. The proposed heat distribution descriptor is robust to identify the keypoints on the surface as it incorporates both the local geometry of the hand and global structural representation at multiple time scales. Along this line, we train our heat distribution network to learn the geometrically descriptive representations from the proposed descriptors with the fingertip position labels. Then the hallucination network is guided to mimic the intermediate responses of the heat distribution modality from a paired depth image. We use the resulting geometrically informed responses together with the discriminative depth features estimated from the depth network to regularize the angle parameters in the refinement network. To this end, we conduct extensive evaluations to validate that the proposed framework is powerful as it achieves state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Choi_Learning_Hand_Articulations_ICCV_2017_paper.pdf", @@ -8688,7 +9243,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Choi_Learning_Hand_Articulations_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Choi_Learning_Hand_Articulations_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Choi_2017_ICCV,\n \n author = {\n Choi,\n Chiho and Kim,\n Sangpil and Ramani,\n Karthik\n},\n title = {\n Learning Hand Articulations by Hallucinating Heat Distribution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning High Dynamic Range From Outdoor Panoramas", @@ -8696,11 +9252,11 @@ "status": "Oral", "track": "main", "pid": "1954", - "author_site": "Jinsong Zhang; Jean-Fran\u00c3\u00a7ois Lalonde", + 
"author_site": "Jinsong Zhang; Jean-François Lalonde", "author": "Jinsong Zhang; Jean-Francois Lalonde", "abstract": "Outdoor lighting has extremely high dynamic range. This makes the process of capturing outdoor environment maps notoriously challenging since special equipment must be used. In this work, we propose an alternative approach. We first capture lighting with a regular, LDR omnidirectional camera, and aim to recover the HDR after the fact via a novel, learning-based inverse tonemapping method. We propose a deep autoencoder framework which regresses linear, high dynamic range data from non-linear, saturated, low dynamic range panoramas. We validate our method through a wide set of experiments on synthetic data, as well as on a novel dataset of real photographs with ground truth. Our approach finds applications in a variety of settings, ranging from outdoor light capture to image matching.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Learning_High_Dynamic_ICCV_2017_paper.pdf", - "aff": "Universit\u00b8e Laval, Qu\u00b8ebec, Canada; Universit\u00b8e Laval, Qu\u00b8ebec, Canada", + "aff": "Universit¸e Laval, Qu¸ebec, Canada; Universit¸e Laval, Qu¸ebec, Canada", "project": "http://www.jflalonde.ca/projects/learningHDR", "github": "", "supp": "", @@ -8714,14 +9270,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Learning_High_Dynamic_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Universit\u00e9 Laval", + "aff_unique_norm": "Université Laval", "aff_unique_dep": "", "aff_unique_url": "https://www.ulaval.ca", "aff_unique_abbr": "UL", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "Qu\u00e9bec", + "aff_campus_unique": "Québec", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Jinsong and Lalonde,\n Jean-Francois\n},\n title = {\n Learning High 
Dynamic Range From Outdoor Panoramas\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Long-Term Dependencies for Action Recognition With a Biologically-Inspired Deep Network", @@ -8729,6 +9286,7 @@ "status": "Poster", "track": "main", "pid": "292", + "author_site": "Yemin Shi; Yonghong Tian; Yaowei Wang; Wei Zeng; Tiejun Huang", "author": "Yemin Shi; Yonghong Tian; Yaowei Wang; Wei Zeng; Tiejun Huang", "abstract": "Despite a lot of research efforts devoted in recent years, how to efficiently learn long-term dependencies from sequences still remains a pretty challenging task. As one of the key models for sequence learning, recurrent neural network (RNN) and its variants such as long short term memory (LSTM) and gated recurrent unit (GRU) are still not powerful enough in practice. One possible reason is that they have only feedforward connections, which is different from the biological neural system that is typically composed of both feedforward and feedback connections. To address this problem, this paper proposes a biologically-inspired deep network, called shuttleNet. Technologically, the shuttleNet consists of several processors, each of which is a GRU while associated with multiple groups of hidden states. Unlike traditional RNNs, all processors inside shuttleNet are loop connected to mimic the brain's feedforward and feedback connections, in which they are shared across multiple pathways in the loop connection. Attention mechanism is then employed to select the best information flow pathway. 
Extensive experiments conducted on two benchmark datasets (i.e UCF101 and HMDB51) show that we can beat state-of-the-art methods by simply embedding shuttleNet into a CNN-RNN framework.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shi_Learning_Long-Term_Dependencies_ICCV_2017_paper.pdf", @@ -8753,7 +9311,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2017_ICCV,\n \n author = {\n Shi,\n Yemin and Tian,\n Yonghong and Wang,\n Yaowei and Zeng,\n Wei and Huang,\n Tiejun\n},\n title = {\n Learning Long-Term Dependencies for Action Recognition With a Biologically-Inspired Deep Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Multi-Attention Convolutional Neural Network for Fine-Grained Image Recognition", @@ -8761,6 +9320,7 @@ "status": "Oral", "track": "main", "pid": "1793", + "author_site": "Heliang Zheng; Jianlong Fu; Tao Mei; Jiebo Luo", "author": "Heliang Zheng; Jianlong Fu; Tao Mei; Jiebo Luo", "abstract": "Recognizing fine-grained categories (e.g., bird species) highly relies on discriminative part localization and part-based fine-grained feature learning. Existing approaches predominantly solve these challenges independently, while neglecting the fact that part localization (e.g., head of a bird) and fine-grained feature learning (e.g., head shape) are mutually correlated. In this paper, we propose a novel part learning approach by a multi-attention convolutional neural network (MA-CNN), where part generation and feature learning can reinforce each other. MA-CNN consists of convolution, channel grouping and part classification sub-networks. 
The channel grouping network takes as input feature channels from convolutional layers, and generates multiple parts by clustering, weighting and pooling from spatially-correlated channels. The part classification network further classifies an image by each individual part, through which more discriminative fine-grained features can be learned. Two losses are proposed to guide the multi-task learning of channel grouping and part classification, which encourages MA-CNN to generate more discriminative parts from feature channels and learn better fine-grained features from parts in a mutual reinforced way. MA-CNN does not need bounding box/part annotation and can be trained end-to-end. We incorporate the learned parts from MA-CNN with part-CNN for recognition, and show the best performances on three challenging published fine-grained datasets, e.g., CUB-Birds, FGVC-Aircraft and Stanford-Cars.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zheng_Learning_Multi-Attention_Convolutional_ICCV_2017_paper.pdf", @@ -8778,14 +9338,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zheng_Learning_Multi-Attention_Convolutional_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;2", - "aff_unique_norm": "University of Science and Technology of China;Microsoft;University of Rochester", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Corporation;University of Rochester", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research;https://www.rochester.edu", "aff_unique_abbr": "USTC;MSR;U of R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zheng_2017_ICCV,\n \n author = {\n Zheng,\n Heliang and Fu,\n Jianlong and Mei,\n Tao and Luo,\n Jiebo\n},\n title = {\n Learning 
Multi-Attention Convolutional Neural Network for Fine-Grained Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Policies for Adaptive Tracking With Deep Feature Cascades", @@ -8793,6 +9354,7 @@ "status": "Spotlight", "track": "main", "pid": "1976", + "author_site": "Chen Huang; Simon Lucey; Deva Ramanan", "author": "Chen Huang; Simon Lucey; Deva Ramanan", "abstract": "Visual object tracking is a fundamental and time-critical vision task. Recent years have seen many shallow tracking methods based on real-time pixel-based correlation filters, as well as deep methods that have top performance but need a high-end GPU. In this paper, we learn to improve the speed of deep trackers without losing accuracy. Our fundamental insight is to take an adaptive approach, where easy frames are processed with cheap features (such as pixel values), while challenging frames are processed with invariant but expensive deep features. We formulate the adaptive tracking problem as a decision-making process, and learn an agent to decide whether to locate objects with high confidence on an early layer, or continue processing subsequent layers of a network. This significantly reduces the feed-forward cost for easy frames with distinct or slow-moving objects. We train the agent offline in a reinforcement learning fashion, and further demonstrate that learning all deep layers (so as to provide good features for adaptive tracking) can lead to near real-time average tracking speed of 23 fps on a single CPU while achieving state-of-the-art performance. 
Perhaps most tellingly, our approach provides a 100X speedup for almost 50% of the time, indicating the power of an adaptive approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Learning_Policies_for_ICCV_2017_paper.pdf", @@ -8817,7 +9379,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Huang_2017_ICCV,\n \n author = {\n Huang,\n Chen and Lucey,\n Simon and Ramanan,\n Deva\n},\n title = {\n Learning Policies for Adaptive Tracking With Deep Feature Cascades\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Proximal Operators: Using Denoising Networks for Regularizing Inverse Imaging Problems", @@ -8825,7 +9388,7 @@ "status": "Poster", "track": "main", "pid": "919", - "author_site": "Tim Meinhardt; Michael M\u00c3\u00b6ller; Caner Hazirbas; Daniel Cremers", + "author_site": "Tim Meinhardt; Michael Möller; Caner Hazirbas; Daniel Cremers", "author": "Tim Meinhardt; Michael Moller; Caner Hazirbas; Daniel Cremers", "abstract": "While variational methods have been among the most powerful tools for solving linear inverse problems in imaging, deep (convolutional) neural networks have recently taken the lead in many challenging benchmarks. A remaining drawback of deep learning approaches is their requirement for an expensive retraining whenever the specific problem, the noise level, noise type, or desired measure of fidelity changes. On the contrary, variational methods have a plug-and-play nature as they usually consist of separate data fidelity and regularization terms. In this paper we study the possibility of replacing the proximal operator of the regularization used in many convex energy minimization algorithms by a denoising neural network. 
The latter therefore serves as an implicit natural image prior, while the data term can still be chosen independently. Using a fixed denoising neural network in exemplary problems of image deconvolution with different blur kernels and image demosaicking, we obtain state-of-the-art reconstruction results. These indicate the high generalizability of our approach and a reduction of the need for problem-specific training. Additionally, we discuss novel results on the analysis of possible optimization algorithms to incorporate the network into, as well as the choices of algorithm parameters and their relation to the noise level the neural network is trained on.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Meinhardt_Learning_Proximal_Operators_ICCV_2017_paper.pdf", @@ -8850,7 +9413,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Meinhardt_2017_ICCV,\n \n author = {\n Meinhardt,\n Tim and Moller,\n Michael and Hazirbas,\n Caner and Cremers,\n Daniel\n},\n title = {\n Learning Proximal Operators: Using Denoising Networks for Regularizing Inverse Imaging Problems\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Robust Visual-Semantic Embeddings", @@ -8858,6 +9422,7 @@ "status": "Poster", "track": "main", "pid": "1483", + "author_site": "Yao-Hung Hubert Tsai; Liang-Kang Huang; Ruslan Salakhutdinov", "author": "Yao-Hung Hubert Tsai; Liang-Kang Huang; Ruslan Salakhutdinov", "abstract": "Many of the existing methods for learning joint embedding of images and text use only supervised information from paired images and its textual attributes. 
Taking advantage of the recent success of unsupervised learning in deep neural networks, we propose an end-to-end learning framework that is able to extract more robust multi-modal representations across domains. The proposed method combines representation learning models (i.e., auto-encoders) together with cross-domain learning criteria (i.e., Maximum Mean Discrepancy loss) to learn joint embeddings for semantic and visual features. A novel technique of unsupervised-data adaptation inference is introduced to construct more comprehensive embeddings for both labeled and unlabeled data. We evaluate our method on Animals with Attributes and Caltech-UCSD Birds 200-2011 dataset with a wide range of applications, including zero and few-shot image recognition and retrieval, from inductive to transductive settings. Empirically, we show that our framework improves over the current state of the art on many of the considered tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tsai_Learning_Robust_Visual-Semantic_ICCV_2017_paper.pdf", @@ -8872,7 +9437,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tsai_Learning_Robust_Visual-Semantic_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tsai_Learning_Robust_Visual-Semantic_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Tsai_2017_ICCV,\n \n author = {\n Hubert Tsai,\n Yao-Hung and Huang,\n Liang-Kang and Salakhutdinov,\n Ruslan\n},\n title = {\n Learning Robust Visual-Semantic Embeddings\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Spatio-Temporal Representation With Pseudo-3D Residual Networks", @@ -8880,6 +9446,7 @@ "status": "Poster", "track": "main", "pid": "2620", + "author_site": "Zhaofan Qiu; Ting Yao; Tao Mei", "author": "Zhaofan Qiu; Ting Yao; Tao Mei", "abstract": 
"Convolutional Neural Networks (CNN) have been regarded as a powerful class of models for image recognition problems. Nevertheless, it is not trivial when utilizing a CNN for learning spatio-temporal video representation. A few studies have shown that performing 3D convolutions is a rewarding approach to capture both spatial and temporal dimensions in videos. However, the development of a very deep 3D CNN from scratch results in expensive computational cost and memory demand. A valid question is why not recycle off-the-shelf 2D networks for a 3D CNN. In this paper, we devise multiple variants of bottleneck building blocks in a residual learning framework by simulating 3*3*3 convolutions with 1*3*3 convolutional filters on spatial domain (equivalent to 2D CNN) plus 3*1*1 convolutions to construct temporal connections on adjacent feature maps in time. Furthermore, we propose a new architecture, named Pseudo-3D Residual Net (P3D ResNet), that exploits all the variants of blocks but composes each in different placement of ResNet, following the philosophy that enhancing structural diversity with going deep could improve the power of neural networks. Our P3D ResNet achieves clear improvements on Sports-1M video classification dataset against 3D CNN and frame-based 2D CNN by 5.3% and 1.8%, respectively. 
We further examine the generalization performance of video representation produced by our pre-trained P3D ResNet on five different benchmarks and three different tasks, demonstrating superior performances over several state-of-the-art techniques.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Qiu_Learning_Spatio-Temporal_Representation_ICCV_2017_paper.pdf", @@ -8897,14 +9464,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Qiu_Learning_Spatio-Temporal_Representation_ICCV_2017_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", - "aff_unique_dep": ";Microsoft Research", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research", + "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/microsoft-research-asia", "aff_unique_abbr": "USTC;MSR", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Hefei;Beijing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qiu_2017_ICCV,\n \n author = {\n Qiu,\n Zhaofan and Yao,\n Ting and Mei,\n Tao\n},\n title = {\n Learning Spatio-Temporal Representation With Pseudo-3D Residual Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Spread-Out Local Feature Descriptors", @@ -8912,6 +9480,7 @@ "status": "Spotlight", "track": "main", "pid": "2100", + "author_site": "Xu Zhang; Felix X. Yu; Sanjiv Kumar; Shih-Fu Chang", "author": "Xu Zhang; Felix X. Yu; Sanjiv Kumar; Shih-Fu Chang", "abstract": "We propose a simple, yet powerful regularization technique that can be used to significantly improve both the pairwise and triplet losses in learning local feature descriptors. 
The idea is that in order to fully utilize the expressive power of the descriptor space, good local feature descriptors should be sufficiently \"spread-out\" over the space. In this work, we propose a regularization term to maximize the spread in feature descriptor inspired by the property of uniform distribution. We show that the proposed regularization with triplet loss outperforms existing Euclidean distance based descriptor learning techniques by a large margin. As an extension, the proposed regularization technique can also be used to improve image-level deep feature embedding.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Learning_Spread-Out_Local_ICCV_2017_paper.pdf", @@ -8936,7 +9505,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Xu and Yu,\n Felix X. and Kumar,\n Sanjiv and Chang,\n Shih-Fu\n},\n title = {\n Learning Spread-Out Local Feature Descriptors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Uncertain Convolutional Features for Accurate Saliency Detection", @@ -8944,6 +9514,7 @@ "status": "Poster", "track": "main", "pid": "146", + "author_site": "Pingping Zhang; Dong Wang; Huchuan Lu; Hongyu Wang; Baocai Yin", "author": "Pingping Zhang; Dong Wang; Huchuan Lu; Hongyu Wang; Baocai Yin", "abstract": "Deep convolutional neural networks (CNNs) have delivered superior performance in many computer vision tasks. In this paper, we propose a novel deep fully convolutional network model for accurate salient object detection. The key contribution of this work is to learn deep uncertain convolutional features (UCF), which encourage the robustness and accuracy of saliency detection. 
We achieve this via introducing a reformulated dropout (R-dropout) after specific convolutional layers to construct an uncertain ensemble of internal feature units. In addition, we propose an effective hybrid upsampling method to reduce the checkerboard artifacts of deconvolution operators in our decoder network. The proposed methods can also be applied to other deep convolutional networks. Compared with existing saliency detection methods, the proposed UCF model is able to incorporate uncertainties for more accurate object boundary inference. Extensive experiments demonstrate that our proposed saliency model performs favorably against state-of-the-art approaches. The uncertain feature learning mechanism as well as the upsampling method can significantly improve performance on other pixel-wise vision tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Learning_Uncertain_Convolutional_ICCV_2017_paper.pdf", @@ -8968,7 +9539,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Pingping and Wang,\n Dong and Lu,\n Huchuan and Wang,\n Hongyu and Yin,\n Baocai\n},\n title = {\n Learning Uncertain Convolutional Features for Accurate Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Video Object Segmentation With Visual Memory", @@ -8976,6 +9548,7 @@ "status": "Oral", "track": "main", "pid": "1148", + "author_site": "Pavel Tokmakov; Karteek Alahari; Cordelia Schmid", "author": "Pavel Tokmakov; Karteek Alahari; Cordelia Schmid", "abstract": "This paper addresses the task of segmenting moving objects in unconstrained videos. We introduce a novel two-stream neural network with an explicit memory module to achieve this. 
The two streams of the network encode spatial and temporal features in a video sequence respectively, while the memory module captures the evolution of objects over time. The module to build a 'visual memory' in video, i.e., a joint representation of all the video frames, is realized with a convolutional recurrent unit learned from a small number of training video sequences. Given a video frame as input, our approach assigns each pixel an object or background label based on the learned spatio-temporal features as well as the 'visual memory' specific to the video, acquired automatically without any manually-annotated frames. We evaluate our method extensively on two benchmarks, DAVIS and Freiburg-Berkeley motion segmentation datasets, and show state-of-the-art results. For example, our approach outperforms the top method on the DAVIS dataset by nearly 6%. We also provide an extensive ablative analysis to investigate the influence of each component in the proposed framework.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tokmakov_Learning_Video_Object_ICCV_2017_paper.pdf", @@ -8993,14 +9566,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tokmakov_Learning_Video_Object_ICCV_2017_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "INRIA;Universite Grenoble Alpes", + "aff_unique_norm": "Inria;Universite Grenoble Alpes", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "Inria;UGA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Grenoble", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Tokmakov_2017_ICCV,\n \n author = {\n Tokmakov,\n Pavel and Alahari,\n Karteek and Schmid,\n Cordelia\n},\n title = {\n Learning Video Object Segmentation With Visual Memory\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning View-Invariant Features for Person Identification in Temporally Synchronized Videos Taken by Wearable Cameras", @@ -9008,6 +9582,7 @@ "status": "Poster", "track": "main", "pid": "921", + "author_site": "Kang Zheng; Xiaochuan Fan; Yuewei Lin; Hao Guo; Hongkai Yu; Dazhou Guo; Song Wang", "author": "Kang Zheng; Xiaochuan Fan; Yuewei Lin; Hao Guo; Hongkai Yu; Dazhou Guo; Song Wang", "abstract": "In this paper, we study the problem of Cross-View Person Identification (CVPI), which aims at identifying the same person from temporally synchronized videos taken by different wearable cameras. Our basic idea is to utilize the human motion consistency for CVPI, where human motion can be computed by optical flow. However, optical flow is view-variant -- the same person's optical flow in different videos can be very different due to view angle change. In this paper, we attempt to utilize 3D human-skeleton sequences to learn a model that can extract view-invariant motion features from optical flows in different views. For this purpose, we use 3D Mocap database to build a synthetic optical flow dataset and train a Triplet Network (TN) consisting of three sub-networks: two for optical flow sequences from different views and one for the underlying 3D Mocap skeleton sequence. Finally, sub-networks for optical flows are used to extract view-invariant features for CVPI. Experimental results show that, using only the motion information, the proposed method can achieve comparable performance with the state-of-the-art methods. 
Further combination of the proposed method with an appearance-based method achieves new state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zheng_Learning_View-Invariant_Features_ICCV_2017_paper.pdf", @@ -9032,7 +9607,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2017_ICCV,\n \n author = {\n Zheng,\n Kang and Fan,\n Xiaochuan and Lin,\n Yuewei and Guo,\n Hao and Yu,\n Hongkai and Guo,\n Dazhou and Wang,\n Song\n},\n title = {\n Learning View-Invariant Features for Person Identification in Temporally Synchronized Videos Taken by Wearable Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Visual Attention to Identify People With Autism Spectrum Disorder", @@ -9040,6 +9616,7 @@ "status": "Poster", "track": "main", "pid": "1356", + "author_site": "Ming Jiang; Qi Zhao", "author": "Ming Jiang; Qi Zhao", "abstract": "This paper presents a novel method for quantitative and objective diagnoses of Autism Spectrum Disorder (ASD) using eye tracking and deep neural networks. ASD is prevalent, with 1.5% of people in the US. The lack of clinical resources for early diagnoses has been a long-lasting issue. This work differentiates itself with three unique features: first, the proposed approach is data-driven and free of assumptions, important for new discoveries in understanding ASD as well as other neurodevelopmental disorders. Second, we concentrate our analyses on the differences in eye movement patterns between healthy people and those with ASD. An image selection method based on Fisher scores allows feature learning with the most discriminative contents, leading to efficient and accurate diagnoses. 
Third, we leverage the recent advances in deep neural networks for both prediction and visualization. Experimental results show the superior performance of our method in terms of multiple evaluation metrics used in diagnostic tests.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jiang_Learning_Visual_Attention_ICCV_2017_paper.pdf", @@ -9064,7 +9641,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jiang_2017_ICCV,\n \n author = {\n Jiang,\n Ming and Zhao,\n Qi\n},\n title = {\n Learning Visual Attention to Identify People With Autism Spectrum Disorder\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning Visual N-Grams From Web Data", @@ -9072,6 +9650,7 @@ "status": "Poster", "track": "main", "pid": "1900", + "author_site": "Ang Li; Allan Jabri; Armand Joulin; Laurens van der Maaten", "author": "Ang Li; Allan Jabri; Armand Joulin; Laurens van der Maaten", "abstract": "Real-world image recognition systems need to recognize tens of thousands of classes that constitute a plethora of visual concepts. The traditional approach of annotating thousands of images per class for training is infeasible in such a scenario, prompting the use of webly supervised data. This paper explores the training of image-recognition systems on large numbers of images and associated user comments. In particular, we develop visual n-gram models that can predict arbitrary phrases that are relevant to the content of an image. Our visual n-gram models are feed-forward convolutional networks trained using new loss functions that are inspired by n-gram models commonly used in language modeling. 
We demonstrate the merits of our models in phrase prediction, phrase-based image retrieval, relating images and captions, and zero-shot transfer.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Learning_Visual_N-Grams_ICCV_2017_paper.pdf", @@ -9089,14 +9668,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Learning_Visual_N-Grams_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "University of Maryland;Meta", + "aff_unique_norm": "University of Maryland;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www/umd.edu;https://research.facebook.com", "aff_unique_abbr": "UMD;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Ang and Jabri,\n Allan and Joulin,\n Armand and van der Maaten,\n Laurens\n},\n title = {\n Learning Visual N-Grams From Web Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning a Recurrent Residual Fusion Network for Multimodal Matching", @@ -9104,6 +9684,7 @@ "status": "Poster", "track": "main", "pid": "1703", + "author_site": "Yu Liu; Yanming Guo; Erwin M. Bakker; Michael S. Lew", "author": "Yu Liu; Yanming Guo; Erwin M. Bakker; Michael S. Lew", "abstract": "A major challenge in matching between vision and language is that they typically have completely different features and representations. In this work, we introduce a novel bridge between the modality-specific representations by creating a co-embedding space based on a recurrent residual fusion (RRF) block. Specifically, RRF adapts the recurrent mechanism to residual learning, so that it can recursively improve feature embeddings while retaining the shared parameters. 
Then, a fusion module is used to integrate the intermediate recurrent outputs and generates a more powerful representation. In the matching network, RRF acts as a feature enhancement component to gather visual and textual representations into a more discriminative embedding space where it allows to narrow the cross-modal gap between vision and language. Moreover, we employ a bi-rank loss function to enforce separability of the two modalities in the embedding space. In the experiments, we evaluate the proposed RRF-Net using two multi-modal datasets where it achieves state-of-the-art results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Learning_a_Recurrent_ICCV_2017_paper.pdf", @@ -9128,7 +9709,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Leiden", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Yu and Guo,\n Yanming and Bakker,\n Erwin M. and Lew,\n Michael S.\n},\n title = {\n Learning a Recurrent Residual Fusion Network for Multimodal Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning for Active 3D Mapping", @@ -9136,7 +9718,7 @@ "status": "Oral", "track": "main", "pid": "1812", - "author_site": "Karel Zimmermann; Tom\u00c3\u00a1\u00c5\u00a1 Pet\u00c5\u0099\u00c3\u00ad\u00c4\u008dek; Vojt\u00c4\u009bch \u00c5\u00a0alansk\u00c3\u00bd; Tom\u00c3\u00a1\u00c5\u00a1 Svoboda", + "author_site": "Karel Zimmermann; Tomáš Petříček; Vojtěch Å alanský; Tomáš Svoboda", "author": "Karel Zimmermann; Tomas Petricek; Vojtech Salansky; Tomas Svoboda", "abstract": "We propose an active 3D mapping method for depth sensors, which allow individual control of depth-measuring rays, such as the newly emerging Solid State Lidars. 
The method simultaneously (i) learns to reconstruct a dense 3D voxel-map from sparse depth measurements, and (ii) optimizes the reactive control of depth-measuring rays. To make the first step towards the online control optimization, we propose a fast greedy algorithm, which needs to update its cost function in only a small fraction of possible rays. The approximation ratio of the greedy algorithm is derived. Experimental evaluation on the subset of the Kitti dataset demonstrates significant improvement in the 3D map accuracy when learning-to-reconstruct from sparse measurements is coupled with the optimization where-to-measure.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zimmermann_Learning_for_Active_ICCV_2017_paper.pdf", @@ -9161,7 +9743,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Zimmermann_2017_ICCV,\n \n author = {\n Zimmermann,\n Karel and Petricek,\n Tomas and Salansky,\n Vojtech and Svoboda,\n Tomas\n},\n title = {\n Learning for Active 3D Mapping\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning in an Uncertain World: Representing Ambiguity Through Multiple Hypotheses", @@ -9169,10 +9752,11 @@ "status": "Poster", "track": "main", "pid": "1574", + "author_site": "Christian Rupprecht; Iro Laina; Robert DiPietro; Maximilian Baust; Federico Tombari; Nassir Navab; Gregory D. Hager", "author": "Christian Rupprecht; Iro Laina; Robert DiPietro; Maximilian Baust; Federico Tombari; Nassir Navab; Gregory D. Hager", "abstract": "Many prediction tasks contain uncertainty. In some cases, uncertainty is inherent in the task itself. In future prediction, for example, many distinct outcomes are equally valid. 
In other cases, uncertainty arises from the way data is labeled. For example, in object detection, many objects of interest often go unlabeled, and in human pose estimation, occluded joints are often labeled with ambiguous values. In this work we focus on a principled approach for handling such scenarios. In particular, we propose a framework for reformulating existing single-prediction models as multiple hypothesis prediction (MHP) models and an associated meta loss and optimization procedure to train them. To demonstrate our approach, we consider four diverse applications: human pose estimation, future prediction, image classification and segmentation. We find that MHP models outperform their single-hypothesis counterparts in all cases, and that MHP models simultaneously expose valuable insights into the variability of predictions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Rupprecht_Learning_in_an_ICCV_2017_paper.pdf", - "aff": "Technische Universit \u00a8at M \u00a8unchen, Munich, Germany+Johns Hopkins University, Baltimore MD, USA; Technische Universit \u00a8at M \u00a8unchen, Munich, Germany; Johns Hopkins University, Baltimore MD, USA; Technische Universit \u00a8at M \u00a8unchen, Munich, Germany; Technische Universit \u00a8at M \u00a8unchen, Munich, Germany; Technische Universit \u00a8at M \u00a8unchen, Munich, Germany+Johns Hopkins University, Baltimore MD, USA; Johns Hopkins University, Baltimore MD, USA", + "aff": "Technische Universit ¨at M ¨unchen, Munich, Germany+Johns Hopkins University, Baltimore MD, USA; Technische Universit ¨at M ¨unchen, Munich, Germany; Johns Hopkins University, Baltimore MD, USA; Technische Universit ¨at M ¨unchen, Munich, Germany; Technische Universit ¨at M ¨unchen, Munich, Germany; Technische Universit ¨at M ¨unchen, Munich, Germany+Johns Hopkins University, Baltimore MD, USA; Johns Hopkins University, Baltimore MD, USA", "project": "", "github": "", "supp": "", @@ -9186,14 +9770,15 @@ "author_num": 7, 
"oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Rupprecht_Learning_in_an_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;1;0;0;0+1;1", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Johns Hopkins University", + "aff_unique_norm": "Technische Universität München;Johns Hopkins University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.jhu.edu", "aff_unique_abbr": "TUM;JHU", "aff_campus_unique_index": "0+1;0;1;0;0;0+1;1", "aff_campus_unique": "Munich;Baltimore", "aff_country_unique_index": "0+1;0;1;0;0;0+1;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Rupprecht_2017_ICCV,\n \n author = {\n Rupprecht,\n Christian and Laina,\n Iro and DiPietro,\n Robert and Baust,\n Maximilian and Tombari,\n Federico and Navab,\n Nassir and Hager,\n Gregory D.\n},\n title = {\n Learning in an Uncertain World: Representing Ambiguity Through Multiple Hypotheses\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning the Latent \"Look\": Unsupervised Discovery of a Style-Coherent Embedding From Fashion Images", @@ -9201,6 +9786,7 @@ "status": "Poster", "track": "main", "pid": "1981", + "author_site": "Wei-Lin Hsiao; Kristen Grauman", "author": "Wei-Lin Hsiao; Kristen Grauman", "abstract": "What defines a visual style? Fashion styles emerge organically from how people assemble outfits of clothing, making them difficult to pin down with a computational model. Low-level visual similarity can be too specific to detect stylistically similar images, while manually crafted style categories can be too abstract to capture subtle style differences. We propose an unsupervised approach to learn a style-coherent representation. 
Our method leverages probabilistic polylingual topic models based on visual attributes to discover a set of latent style factors. Given a collection of unlabeled fashion images, our approach mines for the latent styles, then summarizes outfits by how they mix those styles. Our approach can organize galleries of outfits by style without requiring any style labels. Experiments on over 100K images demonstrate its promise for retrieving, mixing, and summarizing fashion images by their style.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hsiao_Learning_the_Latent_ICCV_2017_paper.pdf", @@ -9225,7 +9811,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hsiao_2017_ICCV,\n \n author = {\n Hsiao,\n Wei-Lin and Grauman,\n Kristen\n},\n title = {\n Learning the Latent \"Look\": Unsupervised Discovery of a Style-Coherent Embedding From Fashion Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning to Disambiguate by Asking Discriminative Questions", @@ -9233,6 +9820,7 @@ "status": "Poster", "track": "main", "pid": "1375", + "author_site": "Yining Li; Chen Huang; Xiaoou Tang; Chen Change Loy", "author": "Yining Li; Chen Huang; Xiaoou Tang; Chen Change Loy", "abstract": "The ability to ask questions is a powerful tool to gather information in order to learn about the world and resolve ambiguities. In this paper, we explore a novel problem of generating discriminative questions to help disambiguate visual instances. Our work can be seen as a complement and new extension to the rich research studies on image captioning and question answering. We introduce the first large-scale dataset with over 10,000 carefully annotated images-question tuples to facilitate benchmarking. 
In particular, each tuple consists of a pair of images and 4.6 discriminative questions (as positive samples) and 5.9 non-discriminative questions (as negative samples) on average. In addition, we present an effective method for visual discriminative question generation. The method can be trained in a weakly supervised manner without discriminative images-question tuples but just existing visual question answering datasets. Promising results are shown against representative baselines through quantitative evaluations and user studies.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Learning_to_Disambiguate_ICCV_2017_paper.pdf", @@ -9250,14 +9838,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Learning_to_Disambiguate_ICCV_2017_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;Carnegie Mellon University", + "aff_unique_norm": "The Chinese University of Hong Kong;Carnegie Mellon University", "aff_unique_dep": "Department of Information Engineering;Robotics Institute", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.cmu.edu", "aff_unique_abbr": "CUHK;CMU", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Hong Kong SAR;Pittsburgh", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Yining and Huang,\n Chen and Tang,\n Xiaoou and Change Loy,\n Chen\n},\n title = {\n Learning to Disambiguate by Asking Discriminative Questions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning to Estimate 3D Hand Pose From Single RGB Images", @@ -9265,6 +9854,7 @@ "status": "Poster", "track": "main", "pid": "2245", + "author_site": "Christian Zimmermann; Thomas Brox", "author": "Christian Zimmermann; 
Thomas Brox", "abstract": "Low-cost consumer depth cameras and deep learning have enabled reasonable 3D hand pose estimation from single depth images. In this paper, we present an approach that estimates 3D hand pose from regular RGB images. This task has far more ambiguities due to the missing depth information. To this end, we propose a deep network that learns a network-implicit 3D articulation prior. Together with detected keypoints in the images, this network yields good estimates of the 3D pose. We introduce a large scale 3D hand pose dataset based on synthetic hand models for training the involved networks. Experiments on a variety of test sets, including one on sign language recognition, demonstrate the feasibility of 3D hand pose estimation on single color images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zimmermann_Learning_to_Estimate_ICCV_2017_paper.pdf", @@ -9289,7 +9879,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Zimmermann_2017_ICCV,\n \n author = {\n Zimmermann,\n Christian and Brox,\n Thomas\n},\n title = {\n Learning to Estimate 3D Hand Pose From Single RGB Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning to Fuse 2D and 3D Image Cues for Monocular Body Pose Estimation", @@ -9297,7 +9888,7 @@ "status": "Poster", "track": "main", "pid": "1617", - "author_site": "Bugra Tekin; Pablo M\u00c3\u00a1rquez-Neila; Mathieu Salzmann; Pascal Fua", + "author_site": "Bugra Tekin; Pablo Márquez-Neila; Mathieu Salzmann; Pascal Fua", "author": "Bugra Tekin; Pablo Marquez-Neila; Mathieu Salzmann; Pascal Fua", "abstract": "Most recent approaches to monocular 3D human pose estimation rely on Deep Learning. 
They typically involve regressing from an image to either 3D joint coordinates directly or 2D joint locations from which 3D coordinates are inferred. Both approaches have their strengths and weaknesses and we therefore propose a novel architecture designed to deliver the best of both worlds by performing both simultaneously and fusing the information along the way. At the heart of our framework is a trainable fusion scheme that learns how to fuse the information optimally instead of being hand-designed. This yields significant improvements upon the state-of-the-art on standard 3D human pose estimation benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tekin_Learning_to_Fuse_ICCV_2017_paper.pdf", @@ -9315,14 +9906,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tekin_Learning_to_Fuse_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne", "aff_unique_dep": "CVLab", "aff_unique_url": "https://cvlab.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Tekin_2017_ICCV,\n \n author = {\n Tekin,\n Bugra and Marquez-Neila,\n Pablo and Salzmann,\n Mathieu and Fua,\n Pascal\n},\n title = {\n Learning to Fuse 2D and 3D Image Cues for Monocular Body Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning to Push the Limits of Efficient FFT-Based Image Deconvolution", @@ -9330,6 +9922,7 @@ "status": "Spotlight", "track": "main", "pid": "1789", + "author_site": "Jakob Kruse; Carsten Rother; Uwe Schmidt", "author": "Jakob Kruse; Carsten Rother; Uwe Schmidt", "abstract": "This work addresses the task of non-blind image 
deconvolution. Motivated to keep up with the constant increase in image size, with megapixel images becoming the norm, we aim at pushing the limits of efficient FFT-based techniques. Based on an analysis of traditional and more recent learning-based methods, we generalize existing discriminative approaches by using more powerful regularization, based on convolutional neural networks. Additionally, we propose a simple, yet effective, boundary adjustment method that alleviates the problematic circular convolution assumption, which is necessary for FFT-based deconvolution. We evaluate our approach on two common non-blind deconvolution benchmarks and achieve state-of-the-art results even when including methods which are computationally considerably more expensive.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kruse_Learning_to_Push_ICCV_2017_paper.pdf", @@ -9345,7 +9938,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kruse_Learning_to_Push_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kruse_Learning_to_Push_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kruse_2017_ICCV,\n \n author = {\n Kruse,\n Jakob and Rother,\n Carsten and Schmidt,\n Uwe\n},\n title = {\n Learning to Push the Limits of Efficient FFT-Based Image Deconvolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning to Reason: End-To-End Module Networks for Visual Question Answering", @@ -9353,6 +9947,7 @@ "status": "Spotlight", "track": "main", "pid": "470", + "author_site": "Ronghang Hu; Jacob Andreas; Marcus Rohrbach; Trevor Darrell; Kate Saenko", "author": "Ronghang Hu; Jacob Andreas; Marcus Rohrbach; Trevor Darrell; Kate Saenko", "abstract": "Natural language questions are inherently compositional, and many are most easily answered by reasoning about 
their decomposition into modular sub-problems. For example, to answer \"is there an equal number of balls and boxes?\" we can look for balls, look for boxes, count them, and compare the results. The recently proposed Neural Module Network (NMN) architecture implements this approach to question answering by parsing questions into linguistic substructures and assembling question-specific deep networks from smaller modules that each solve one subtask. However, existing NMN implementations rely on brittle off-the-shelf parsers, and are restricted to the module configurations proposed by these parsers rather than learning them from data. In this paper, we propose End-to-End Module Networks (N2NMNs), which learn to reason by directly predicting instance-specific network layouts without the aid of a parser. Our model learns to generate network structures (by imitating expert demonstrations) while simultaneously learning network parameters (using the downstream task loss). Experimental results on the new CLEVR dataset targeted at compositional question answering show that N2NMNs achieve an error reduction of nearly 50% relative to state-of-the-art attentional approaches, while discovering interpretable network architectures specialized for each question.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hu_Learning_to_Reason_ICCV_2017_paper.pdf", @@ -9370,14 +9965,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hu_Learning_to_Reason_ICCV_2017_paper.html", "aff_unique_index": "0;0;0+1;0;2", - "aff_unique_norm": "University of California, Berkeley;Meta;Boston University", + "aff_unique_norm": "University of California, Berkeley;Facebook;Boston University", "aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.berkeley.edu;https://research.facebook.com;https://www.bu.edu", "aff_unique_abbr": "UC Berkeley;FAIR;BU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley;", 
"aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hu_2017_ICCV,\n \n author = {\n Hu,\n Ronghang and Andreas,\n Jacob and Rohrbach,\n Marcus and Darrell,\n Trevor and Saenko,\n Kate\n},\n title = {\n Learning to Reason: End-To-End Module Networks for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning to Super-Resolve Blurry Face and Text Images", @@ -9385,6 +9981,7 @@ "status": "Poster", "track": "main", "pid": "286", + "author_site": "Xiangyu Xu; Deqing Sun; Jinshan Pan; Yujin Zhang; Hanspeter Pfister; Ming-Hsuan Yang", "author": "Xiangyu Xu; Deqing Sun; Jinshan Pan; Yujin Zhang; Hanspeter Pfister; Ming-Hsuan Yang", "abstract": "We present an algorithm to directly restore a clear high-resolution image from a blurry low-resolution input. This problem is highly ill-posed and the basic assumptions for existing super-resolution methods (requiring clear input) and deblurring methods (requiring high-resolution input) no longer hold. We focus on face and text images and adopt a generative adversarial network (GAN) to learn a category-specific prior to solve this problem. However, the basic GAN formulation does not generate realistic high-resolution images. In this work, we introduce novel training losses that help recover fine details. We also present a multi-class GAN that can process multi-class image restoration tasks, i.e., face and text images, using a single generator network. 
Extensive experiments demonstrate that our method performs favorably against the state-of-the-art methods on both synthetic and real-world images at a lower computational cost.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xu_Learning_to_Super-Resolve_ICCV_2017_paper.pdf", @@ -9400,7 +9997,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xu_Learning_to_Super-Resolve_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xu_Learning_to_Super-Resolve_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Xu_2017_ICCV,\n \n author = {\n Xu,\n Xiangyu and Sun,\n Deqing and Pan,\n Jinshan and Zhang,\n Yujin and Pfister,\n Hanspeter and Yang,\n Ming-Hsuan\n},\n title = {\n Learning to Super-Resolve Blurry Face and Text Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning to Synthesize a 4D RGBD Light Field From a Single Image", @@ -9408,6 +10006,7 @@ "status": "Spotlight", "track": "main", "pid": "753", + "author_site": "Pratul P. Srinivasan; Tongzhou Wang; Ashwin Sreelal; Ravi Ramamoorthi; Ren Ng", "author": "Pratul P. Srinivasan; Tongzhou Wang; Ashwin Sreelal; Ravi Ramamoorthi; Ren Ng", "abstract": "We present a machine learning algorithm that takes as input a 2D RGB image and synthesizes a 4D RGBD light field (color and depth of the scene in each ray direction). For training, we introduce the largest public light field dataset, consisting of over 3300 plenoptic camera light fields of scenes containing flowers and plants. Our synthesis pipeline consists of a convolutional neural network (CNN) that estimates scene geometry, a stage that renders a Lambertian light field using that geometry, and a second CNN that predicts occluded rays and non-Lambertian effects. 
Our algorithm builds on recent view synthesis methods, but is unique in predicting RGBD for each light field ray and improving unsupervised single image depth estimation by enforcing consistency of ray depths that should intersect the same scene point.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Srinivasan_Learning_to_Synthesize_ICCV_2017_paper.pdf", @@ -9432,7 +10031,8 @@ "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Berkeley;San Diego", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Srinivasan_2017_ICCV,\n \n author = {\n Srinivasan,\n Pratul P. and Wang,\n Tongzhou and Sreelal,\n Ashwin and Ramamoorthi,\n Ravi and Ng,\n Ren\n},\n title = {\n Learning to Synthesize a 4D RGBD Light Field From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Learning-Based Cloth Material Recovery From Video", @@ -9440,6 +10040,7 @@ "status": "Poster", "track": "main", "pid": "1788", + "author_site": "Shan Yang; Junbang Liang; Ming C. Lin", "author": "Shan Yang; Junbang Liang; Ming C. Lin", "abstract": "Image understanding enables better reconstruction of the physical world from images and videos. Existing methods focus largely on geometry and visual appearance of the reconstructed scene. In this paper, we extend the frontier in image understanding and present a new technique to recover the material properties of cloth from a video.Previous cloth material recovery methods often require markers or complex experimental set-up to acquire physical properties, or are limited to certain types of images/videos. Our approach takes advantages of the appearance changes of the moving cloth to infer its physical properties. 
To extract information about the cloth, our method characterizes both the motion space and the visual appearance of the cloth geometry. We apply the Convolutional Neural Network (CNN) and the Long Short Term Memory (LSTM) neural network to material recovery of cloth properties from videos. We also exploit simulated data to help statistical learning of mapping between the visual appearance and motion dynamics of the cloth. The effectiveness of our method is demonstrated via validation using simulated datasets and real-life recorded videos.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yang_Learning-Based_Cloth_Material_ICCV_2017_paper.pdf", @@ -9464,7 +10065,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2017_ICCV,\n \n author = {\n Yang,\n Shan and Liang,\n Junbang and Lin,\n Ming C.\n},\n title = {\n Learning-Based Cloth Material Recovery From Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Least Squares Generative Adversarial Networks", @@ -9472,6 +10074,7 @@ "status": "Poster", "track": "main", "pid": "1174", + "author_site": "Xudong Mao; Qing Li; Haoran Xie; Raymond Y.K. Lau; Zhen Wang; Stephen Paul Smolley", "author": "Xudong Mao; Qing Li; Haoran Xie; Raymond Y.K. Lau; Zhen Wang; Stephen Paul Smolley", "abstract": "Unsupervised learning with generative adversarial networks (GANs) has proven hugely successful. Regular GANs hypothesize the discriminator as a classifier with the sigmoid cross entropy loss function. However, we found that this loss function may lead to the vanishing gradients problem during the learning process. 
To overcome such a problem, we propose in this paper the Least Squares Generative Adversarial Networks (LSGANs) which adopt the least squares loss function for the discriminator. We show that minimizing the objective function of LSGAN yields minimizing the Pearson Chi^2 divergence. There are two benefits of LSGANs over regular GANs. First, LSGANs are able to generate higher quality images than regular GANs. Second, LSGANs perform more stable during the learning process. We evaluate LSGANs on LSUN and CIFAR-10 datasets and the experimental results show that the images generated by LSGANs are of better quality than the ones generated by regular GANs. We also conduct two comparison experiments between LSGANs and regular GANs to illustrate the stability of LSGANs.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mao_Least_Squares_Generative_ICCV_2017_paper.pdf", @@ -9489,14 +10092,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Mao_Least_Squares_Generative_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0;2;3", - "aff_unique_norm": "City University of Hong Kong;Education University of Hong Kong;Northwestern Polytechnical University;CodeHatch Corp.", + "aff_unique_norm": "City University of Hong Kong;The Education University of Hong Kong;Northwestern Polytechnical University;CodeHatch Corp.", "aff_unique_dep": "Department of Computer Science;Department of Mathematics and Information Technology;Center for Optical Imagery Analysis and Learning;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.eduhk.hk;https://www.nwpu.edu.cn;", "aff_unique_abbr": "CityU;EdUHK;NWPU;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Mao_2017_ICCV,\n \n author = {\n Mao,\n Xudong and Li,\n Qing and Xie,\n Haoran and Lau,\n Raymond Y.K. 
and Wang,\n Zhen and Paul Smolley,\n Stephen\n},\n title = {\n Least Squares Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Leveraging Weak Semantic Relevance for Complex Video Event Classification", @@ -9504,6 +10108,7 @@ "status": "Poster", "track": "main", "pid": "1482", + "author_site": "Chao Li; Jiewei Cao; Zi Huang; Lei Zhu; Heng Tao Shen", "author": "Chao Li; Jiewei Cao; Zi Huang; Lei Zhu; Heng Tao Shen", "abstract": "Existing video event classification approaches suffer from limited human-labeled semantic annotations. Weak semantic annotations can be harvested from Web-knowledge without involving any human interaction. However such weak annotations are noisy, thus can not be effectively utilized without distinguishing its reliability. In this paper, we propose a novel approach to automatically maximize the utility of weak semantic annotations (formalized as the semantic relevance of video shots to the target event) to facilitate video event classification. A novel attention model is designed to determine the attention scores of video shots, where the weak semantic relevance is considered as attentional guidance. Specifically, our model jointly optimizes two objectives at different levels. The first one is the classification loss corresponding to video-level groundtruth labels, and the second is the shot-level relevance loss corresponding to weak semantic relevance. We use a long short-term memory (LSTM) layer to capture the temporal information carried by the shots of a video. In each timestep, the LSTM employs the attention model to weight the current shot under the guidance of its weak semantic relevance to the event of interest. Thus, we can automatically exploit weak semantic relevance to assist video event classification. 
Extensive experiments have been conducted on three complex large-scale video event datasets i.e., MEDTest14, ActivityNet and FCVID. Our approach achieves the state-of-the-art classification performance on all three datasets. The significant performance improvement upon the conventional attention model also demonstrates the effectiveness of our model.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Leveraging_Weak_Semantic_ICCV_2017_paper.pdf", @@ -9521,14 +10126,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Leveraging_Weak_Semantic_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;1", - "aff_unique_norm": "University of Queensland;University of Electronic Science and Technology of China", + "aff_unique_norm": "The University of Queensland;University of Electronic Science and Technology of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.uq.edu.au;https://www.uestc.edu.cn", "aff_unique_abbr": "UQ;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Chao and Cao,\n Jiewei and Huang,\n Zi and Zhu,\n Lei and Tao Shen,\n Heng\n},\n title = {\n Leveraging Weak Semantic Relevance for Complex Video Event Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Linear Differential Constraints for Photo-Polarimetric Height Estimation", @@ -9536,6 +10142,7 @@ "status": "Spotlight", "track": "main", "pid": "2270", + "author_site": "Silvia Tozza; William A. P. Smith; Dizhong Zhu; Ravi Ramamoorthi; Edwin R. Hancock", "author": "Silvia Tozza; William A. P. Smith; Dizhong Zhu; Ravi Ramamoorthi; Edwin R. 
Hancock", "abstract": "In this paper we present a differential approach to photo-polarimetric shape estimation. We propose several alternative differential constraints based on polarisation and photometric shading information and show how to express them in a unified partial differential system. Our method uses the image ratios technique to combine shading and polarisation information in order to directly reconstruct surface height, without first computing surface normal vectors. Moreover, we are able to remove the non-linearities so that the problem reduces to solving a linear differential problem. We also introduce a new method for estimating a polarisation image from multichannel data and, finally, we show it is possible to estimate the illumination directions in a two source setup, extending the method into an uncalibrated scenario. From a numerical point of view, we use a least-squares formulation of the discrete version of the problem. To the best of our knowledge, this is the first work to consider a unified differential approach to solve photo-polarimetric shape estimation directly for height. Numerical results on synthetic and real-world data confirm the effectiveness of our proposed method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tozza_Linear_Differential_Constraints_ICCV_2017_paper.pdf", @@ -9560,7 +10167,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;1;2;1", - "aff_country_unique": "Italy;United Kingdom;United States" + "aff_country_unique": "Italy;United Kingdom;United States", + "bibtex": "@InProceedings{Tozza_2017_ICCV,\n \n author = {\n Tozza,\n Silvia and Smith,\n William A. P. 
and Zhu,\n Dizhong and Ramamoorthi,\n Ravi and Hancock,\n Edwin R.\n},\n title = {\n Linear Differential Constraints for Photo-Polarimetric Height Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Local-To-Global Point Cloud Registration Using a Dictionary of Viewpoint Descriptors", @@ -9568,6 +10176,7 @@ "status": "Poster", "track": "main", "pid": "342", + "author_site": "David Avidar; David Malah; Meir Barzohar", "author": "David Avidar; David Malah; Meir Barzohar", "abstract": "Local-to global point cloud registration is a challenging task due to the substantial differences between these two types of data, and the different techniques used to acquire them. Global clouds cover large-scale environments and are usually acquired aerially, e.g., 3D modeling of a city using Airborne Laser Scanning (ALS). In contrast, local clouds are often acquired from ground level and at a much smaller range, for example, using Terrestrial Laser Scanning (TLS). The differences are often manifested in point density distribution, occlusions nature, and measurement noise. As a result of these differences, existing point cloud registration approaches, such as keypoint-based registration, tend to fail. We improve upon a different approach, recently proposed, based on converting the global cloud into a viewpoint-based cloud dictionary. We propose a local-to-global registration method where we replace the dictionary clouds with viewpoint descriptors, consisting of panoramic range-images. We then use an efficient dictionary search in the Discrete Fourier Transform (DFT) domain, using phase correlation, to rapidly find plausible transformations from the local to the global reference frame. We demonstrate our method's significant advantages over the previous cloud dictionary approach, in terms of computational efficiency and memory requirements. 
In addition, We show its superior registration performance in comparison to a state-of-the-art, keypoint-based method (FPFH). For the evaluation, we use a challenging dataset of TLS local clouds and an ALS large-scale global cloud, in an urban environment.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Avidar_Local-To-Global_Point_Cloud_ICCV_2017_paper.pdf", @@ -9592,7 +10201,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Avidar_2017_ICCV,\n \n author = {\n Avidar,\n David and Malah,\n David and Barzohar,\n Meir\n},\n title = {\n Local-To-Global Point Cloud Registration Using a Dictionary of Viewpoint Descriptors\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Localizing Moments in Video With Natural Language", @@ -9600,6 +10210,7 @@ "status": "Poster", "track": "main", "pid": "2780", + "author_site": "Lisa Anne Hendricks; Oliver Wang; Eli Shechtman; Josef Sivic; Trevor Darrell; Bryan Russell", "author": "Lisa Anne Hendricks; Oliver Wang; Eli Shechtman; Josef Sivic; Trevor Darrell; Bryan Russell", "abstract": "We consider retrieving a specific temporal segment, or moment, from a video given a natural language text description. Methods designed to retrieve whole video clips with natural language determine what occurs in a video but not when. To address this issue, we propose the Moment Context Network (MCN) which effectively localizes natural language queries in videos by integrating local and global video features over time. A key obstacle to training our MCN model is that current video datasets do not include pairs of localized video segments and referring expressions, or text descriptions which uniquely identify a corresponding moment. 
Therefore, we collect the Distinct Describable Moments (DiDeMo) dataset which consists of over 10,000 unedited, personal videos in diverse visual settings with pairs of localized video segments and referring expressions. We demonstrate that MCN outperforms several baseline methods and believe that our initial results together with release of DiDeMo will inspire further research on localizing video moments with natural language.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hendricks_Localizing_Moments_in_ICCV_2017_paper.pdf", @@ -9624,7 +10235,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "United States;France" + "aff_country_unique": "United States;France", + "bibtex": "@InProceedings{Hendricks_2017_ICCV,\n \n author = {\n Anne Hendricks,\n Lisa and Wang,\n Oliver and Shechtman,\n Eli and Sivic,\n Josef and Darrell,\n Trevor and Russell,\n Bryan\n},\n title = {\n Localizing Moments in Video With Natural Language\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Locally-Transferred Fisher Vectors for Texture Classification", @@ -9632,6 +10244,7 @@ "status": "Poster", "track": "main", "pid": "2255", + "author_site": "Yang Song; Fan Zhang; Qing Li; Heng Huang; Lauren J. O'Donnell; Weidong Cai", "author": "Yang Song; Fan Zhang; Qing Li; Heng Huang; Lauren J. O'Donnell; Weidong Cai", "abstract": "Texture classification has been extensively studied in computer vision. Recent research shows that the combination of Fisher vector (FV) encoding and convolutional neural network (CNN) provides significant improvement in texture classification over the previous feature representation methods. 
However, by truncating the CNN model at the last convolutional layer, the CNN-based FV descriptors would not incorporate the full capability of neural networks in feature learning. In this study, we propose that we can further transform the CNN-based FV descriptors in a neural network model to obtain more discriminative feature representations. In particular, we design a locally-transferred Fisher vector (LFV) method, which involves a multi-layer neural network model containing locally connected layers to transform the input FV descriptors with filters of locally shared weights. The network is optimized based on the hinge loss of classification, and transferred FV descriptors are then used for image classification. Our results on three challenging texture image datasets show improved performance over the state of the art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Song_Locally-Transferred_Fisher_Vectors_ICCV_2017_paper.pdf", @@ -9647,7 +10260,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Song_Locally-Transferred_Fisher_Vectors_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Song_Locally-Transferred_Fisher_Vectors_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Song_2017_ICCV,\n \n author = {\n Song,\n Yang and Zhang,\n Fan and Li,\n Qing and Huang,\n Heng and O'Donnell,\n Lauren J. 
and Cai,\n Weidong\n},\n title = {\n Locally-Transferred Fisher Vectors for Texture Classification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Long Short-Term Memory Kalman Filters: Recurrent Neural Estimators for Pose Regularization", @@ -9655,6 +10269,7 @@ "status": "Poster", "track": "main", "pid": "2544", + "author_site": "Huseyin Coskun; Felix Achilles; Robert DiPietro; Nassir Navab; Federico Tombari", "author": "Huseyin Coskun; Felix Achilles; Robert DiPietro; Nassir Navab; Federico Tombari", "abstract": "One-shot pose estimation for tasks such as body joint localization, camera pose estimation, and object tracking are generally noisy, and temporal filters have been extensively used for regularization. One of the most widely-used methods is the Kalman filter, which is both extremely simple and general. However, Kalman filters require a motion model and measurement model to be specified a priori, which burdens the modeler and simultaneously demands that we use explicit models that are often only crude approximations of reality. For example, in the pose-estimation tasks mentioned above, it is common to use motion models that assume constant velocity or constant acceleration, and we believe that these simplified representations are severely inhibitive. In this work, we propose to instead learn rich, dynamic representations of the motion and noise models. In particular, we propose learning these models from data using long short-term memory, which allows representations that depend on all previous observations and all previous states. 
We evaluate our method using three of the most popular pose estimation tasks in computer vision, and in all cases we obtain state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Coskun_Long_Short-Term_Memory_ICCV_2017_paper.pdf", @@ -9669,7 +10284,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Coskun_Long_Short-Term_Memory_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Coskun_Long_Short-Term_Memory_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Coskun_2017_ICCV,\n \n author = {\n Coskun,\n Huseyin and Achilles,\n Felix and DiPietro,\n Robert and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n Long Short-Term Memory Kalman Filters: Recurrent Neural Estimators for Pose Regularization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Look, Listen and Learn", @@ -9677,7 +10293,7 @@ "status": "Poster", "track": "main", "pid": "191", - "author_site": "Relja Arandjelovi\u00c4\u0087; Andrew Zisserman", + "author_site": "Relja Arandjelović; Andrew Zisserman", "author": "Relja Arandjelovic; Andrew Zisserman", "abstract": "We consider the question: what can be learnt by looking at and listening to a large number of unlabelled videos? There is a valuable, but so far untapped, source of information contained in the video itself -- the correspondence between the visual and the audio streams, and we introduce a novel \"Audio-Visual Correspondence\" learning task that makes use of this. Training visual and audio networks from scratch, without any additional supervision other than the raw unconstrained videos themselves, is shown to successfully solve this task, and, more interestingly, result in good visual and audio representations. 
These features set the new state-of-the-art on two sound classification benchmarks, and perform on par with the state-of-the-art self-supervised approaches on ImageNet classification. We also demonstrate that the network is able to localize objects in both modalities, as well as perform fine-grained recognition tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Arandjelovic_Look_Listen_and_ICCV_2017_paper.pdf", @@ -9702,7 +10318,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Arandjelovic_2017_ICCV,\n \n author = {\n Arandjelovic,\n Relja and Zisserman,\n Andrew\n},\n title = {\n Look,\n Listen and Learn\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Look, Perceive and Segment: Finding the Salient Objects in Images via Two-Stream Fixation-Semantic CNNs", @@ -9710,6 +10327,7 @@ "status": "Poster", "track": "main", "pid": "360", + "author_site": "Xiaowu Chen; Anlin Zheng; Jia Li; Feng Lu", "author": "Xiaowu Chen; Anlin Zheng; Jia Li; Feng Lu", "abstract": "Recently, CNN-based models have achieved remarkable success in image-based salient object detection (SOD). In these models, a key issue is to find a proper network architecture that best fits for the task of SOD. Toward this end, this paper proposes two-stream fixation-semantic CNNs, whose architecture is inspired by the fact that salient objects in complex images can be unambiguously annotated by selecting the pre-segmented semantic objects that receive the highest fixation density in eye-tracking experiments. 
In the two-stream CNNs, a fixation stream is pre-trained on eye-tracking data whose architecture well fits for the task of fixation prediction, and a semantic stream is pre-trained on images with semantic tags that has a proper architecture for semantic perception. By fusing these two streams into an inception-segmentation module and jointly fine-tuning them on images with manually annotated salient objects, the proposed networks show impressive performance in segmenting salient objects. Experimental results show that our approach outperforms 10 state-of-the-art models (5 deep, 5 non-deep) on 4 datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Look_Perceive_and_ICCV_2017_paper.pdf", @@ -9734,7 +10352,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Xiaowu and Zheng,\n Anlin and Li,\n Jia and Lu,\n Feng\n},\n title = {\n Look,\n Perceive and Segment: Finding the Salient Objects in Images via Two-Stream Fixation-Semantic CNNs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Low Compute and Fully Parallel Computer Vision With HashMatch", @@ -9742,6 +10361,7 @@ "status": "Poster", "track": "main", "pid": "1843", + "author_site": "Sean Ryan Fanello; Julien Valentin; Adarsh Kowdle; Christoph Rhemann; Vladimir Tankovich; Carlo Ciliberto; Philip Davidson; Shahram Izadi", "author": "Sean Ryan Fanello; Julien Valentin; Adarsh Kowdle; Christoph Rhemann; Vladimir Tankovich; Carlo Ciliberto; Philip Davidson; Shahram Izadi", "abstract": "Numerous computer vision problems such as stereo depth estimation, object-class segmentation and foreground/background segmentation can be formulated as per-pixel image labeling tasks. 
Given one or many images as input, the desired output of these methods is usually a spatially smooth assignment of labels. The large amount of such computer vision problems has lead to significant research efforts, with the state of art moving from CRF-based approaches to deep CNNs and more recently, hybrids of the two. Although these approaches have significantly advanced the state of the art, the vast majority has solely focused on improving quantitative results and are not designed for low-compute scenarios. In this paper, we present a new general framework for a variety of computer vision labeling tasks, called HashMatch. Our approach is designed to be both fully parallel, i.e. each pixel is independently processed, and low-compute, with a model complexity an order of magnitude less than existing CNN and CRF-based approaches. We evaluate HashMatch extensively on several problems such as disparity estimation, image retrieval, feature approximation and background subtraction, for which HashMatch achieves high computational efficiency while producing high quality results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Fanello_Low_Compute_and_ICCV_2017_paper.pdf", @@ -9766,7 +10386,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", - "aff_country_unique": ";United Kingdom" + "aff_country_unique": ";United Kingdom", + "bibtex": "@InProceedings{Fanello_2017_ICCV,\n \n author = {\n Ryan Fanello,\n Sean and Valentin,\n Julien and Kowdle,\n Adarsh and Rhemann,\n Christoph and Tankovich,\n Vladimir and Ciliberto,\n Carlo and Davidson,\n Philip and Izadi,\n Shahram\n},\n title = {\n Low Compute and Fully Parallel Computer Vision With HashMatch\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Low-Dimensionality Calibration Through Local Anisotropic Scaling for Robust Hand Model Personalization", @@ 
-9774,6 +10395,7 @@ "status": "Poster", "track": "main", "pid": "923", + "author_site": "Edoardo Remelli; Anastasia Tkach; Andrea Tagliasacchi; Mark Pauly", "author": "Edoardo Remelli; Anastasia Tkach; Andrea Tagliasacchi; Mark Pauly", "abstract": "We present a robust algorithm for personalizing a sphere-mesh tracking model to a user from a collection of depth measurements. Our core contribution is to demonstrate how simple geometric reasoning can be exploited to build a shape-space, and how its performance is comparable to shape-spaces constructed from datasets of carefully calibrated models. We achieve this goal by first re-parameterizing the geometry of the tracking template, and introducing a multi-stage calibration optimization. Our novel parameterization decouples the degrees of freedom for pose and shape, resulting in improved convergence properties. Our analytically differentiable multi-stage calibration pipeline optimizes for the model in the natural low-dimensional space of local anisotropic scalings, leading to an effective solution that can be easily embedded in other tracking/calibration algorithms. Compared to existing sphere-mesh calibration algorithms, quantitative experiments assess our algorithm possesses a larger convergence basin, and our personalized models allows to perform motion tracking with superior accuracy. 
Code and data are available at http://github.com/edoRemelli/hadjust", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Remelli_Low-Dimensionality_Calibration_Through_ICCV_2017_paper.pdf", @@ -9789,7 +10411,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Remelli_Low-Dimensionality_Calibration_Through_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Remelli_Low-Dimensionality_Calibration_Through_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Remelli_2017_ICCV,\n \n author = {\n Remelli,\n Edoardo and Tkach,\n Anastasia and Tagliasacchi,\n Andrea and Pauly,\n Mark\n},\n title = {\n Low-Dimensionality Calibration Through Local Anisotropic Scaling for Robust Hand Model Personalization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Low-Rank Tensor Completion: A Pseudo-Bayesian Learning Approach", @@ -9797,6 +10420,7 @@ "status": "Poster", "track": "main", "pid": "1518", + "author_site": "Wei Chen; Nan Song", "author": "Wei Chen; Nan Song", "abstract": "Low rank tensor completion, which solves a linear inverse problem with the principle of parsimony, is a powerful technique used in many application domains in computer vision and pattern recognition. As a surrogate function of the matrix rank that is non-convex and discontinuous, the nuclear norm is often used instead to derive efficient algorithms for recovering missing information in matrices and higher order tensors. However, the nuclear norm is a loose approximation of the matrix rank, and what is more, the tensor nuclear norm is not guaranteed to be the tightest convex envelope of a multilinear rank. 
Alternative algorithms either require specifying/tuning several parameters (e.g., the tensor rank), and/or have a performance far from reaching the theoretical limit where the number of observed elements equals the degree of freedom in the unknown low-rank tensor. In this paper, we propose a pseudo-Bayesian approach, where a Bayesian-inspired cost function is adjusted using appropriate approximations that lead to desirable attributes including concavity and symmetry. Although deviating from the original Bayesian model, the resulting non-convex cost function is proved to have the ability to recover the true tensor with a low multilinear rank. A computational efficient algorithm is derived to solve the resulting non-convex optimization problem. We demonstrate the superior performance of the proposed algorithm in comparison with state-of-the-art alternatives by conducting extensive experiments on both synthetic data and several visual data recovery tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Low-Rank_Tensor_Completion_ICCV_2017_paper.pdf", @@ -9814,14 +10438,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Low-Rank_Tensor_Completion_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Beijing Jiao Tong University", + "aff_unique_norm": "Beijing Jiaotong University", "aff_unique_dep": "State Key Laboratory of Rail Traffic Control and Safety", "aff_unique_url": "http://www.bjtu.edu.cn", "aff_unique_abbr": "BJTU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Wei and Song,\n Nan\n},\n title = {\n Low-Rank Tensor Completion: A Pseudo-Bayesian Learning Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 
2017\n} \n}" }, { "title": "Low-Shot Visual Recognition by Shrinking and Hallucinating Features", @@ -9829,6 +10454,7 @@ "status": "Spotlight", "track": "main", "pid": "1523", + "author_site": "Bharath Hariharan; Ross Girshick", "author": "Bharath Hariharan; Ross Girshick", "abstract": "Low-shot visual learning--the ability to recognize novel object categories from very few examples--is a hallmark of human visual intelligence. Existing machine learning approaches fail to generalize in the same way. To make progress on this foundational problem, we present a low- shot learning benchmark on complex images that mimics challenges faced by recognition systems in the wild. We then propose (1) representation regularization techniques, and (2) techniques to hallucinate additional training examples for data-starved classes. Together, our methods improve the effectiveness of convolutional networks in low-shot learning, improving the one-shot accuracy on novel classes by 2.3x on the challenging ImageNet dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hariharan_Low-Shot_Visual_Recognition_ICCV_2017_paper.pdf", @@ -9843,7 +10469,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hariharan_Low-Shot_Visual_Recognition_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hariharan_Low-Shot_Visual_Recognition_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Hariharan_2017_ICCV,\n \n author = {\n Hariharan,\n Bharath and Girshick,\n Ross\n},\n title = {\n Low-Shot Visual Recognition by Shrinking and Hallucinating Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "MIHash: Online Hashing With Mutual Information", @@ -9851,6 +10478,7 @@ "status": "Poster", "track": "main", "pid": "158", + "author_site": "Fatih Cakir; Kun He; Sarah Adel Bargal; 
Stan Sclaroff", "author": "Fatih Cakir; Kun He; Sarah Adel Bargal; Stan Sclaroff", "abstract": "Learning-based hashing methods are widely used for nearest neighbor retrieval, and recently, online hashing methods have demonstrated good performance-complexity trade-offs by learning hash functions from streaming data. In this paper, we first address a key challenge for online hashing: the binary codes for indexed data must be recomputed to keep pace with updates to the hash functions. We propose an efficient quality measure for hash functions, based on an information-theoretic quantity, mutual information, and use it successfully as a criterion to eliminate unnecessary hash table updates. Next, we also show how to optimize the mutual information objective using stochastic gradient descent. We thus develop a novel hashing method, MIHash, that can be used in both online and batch settings. Experiments on image retrieval benchmarks (including a 2.5M image dataset) confirm the effectiveness of our formulation, both in reducing hash table recomputations and in learning high-quality hash functions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cakir_MIHash_Online_Hashing_ICCV_2017_paper.pdf", @@ -9875,7 +10503,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cakir_2017_ICCV,\n \n author = {\n Cakir,\n Fatih and He,\n Kun and Adel Bargal,\n Sarah and Sclaroff,\n Stan\n},\n title = {\n MIHash: Online Hashing With Mutual Information\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "MUTAN: Multimodal Tucker Fusion for Visual Question Answering", @@ -9883,6 +10512,7 @@ "status": "Poster", "track": "main", "pid": "1112", + "author_site": "Hedi Ben-younes; Remi Cadene; Matthieu 
Cord; Nicolas Thome", "author": "Hedi Ben-younes; Remi Cadene; Matthieu Cord; Nicolas Thome", "abstract": "Bilinear models provide an appealing framework for mixing and merging information in Visual Question Answering (VQA) tasks. They help to learn high level associations between question meaning and visual concepts in the image, but they suffer from huge dimensionality issues. We introduce MUTAN, a multimodal tensor-based Tucker decomposition to efficiently parametrize bilinear interactions between visual and textual representations. Additionally to the Tucker framework, we design a low-rank matrix-based decomposition to explicitly constrain the interaction rank. With MUTAN, we control the complexity of the merging scheme while keeping nice interpretable fusion relations. We show how the Tucker decomposition framework generalizes some of the latest VQA architectures, providing state-of-the-art results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ben-younes_MUTAN_Multimodal_Tucker_ICCV_2017_paper.pdf", @@ -9900,14 +10530,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ben-younes_MUTAN_Multimodal_Tucker_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;0;2", - "aff_unique_norm": "Sorbonne University;Heuritech;Conservatoire National des Arts et M\u00e9tiers", + "aff_unique_norm": "Sorbonne University;Heuritech;Conservatoire National des Arts et Métiers", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sorbonne.universite.fr;https://www.heuritech.com;https://www.cnam.fr", "aff_unique_abbr": "Sorbonne;;CNAM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Ben-younes_2017_ICCV,\n \n author = {\n Ben-younes,\n Hedi and Cadene,\n Remi and Cord,\n Matthieu and Thome,\n Nicolas\n},\n title = {\n MUTAN: Multimodal Tucker Fusion for Visual Question Answering\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Makeup-Go: Blind Reversion of Portrait Edit", @@ -9915,6 +10546,7 @@ "status": "Oral", "track": "main", "pid": "1242", + "author_site": "Ying-Cong Chen; Xiaoyong Shen; Jiaya Jia", "author": "Ying-Cong Chen; Xiaoyong Shen; Jiaya Jia", "abstract": "Virtual face beautification (or markup) becomes common operations in camera or image processing Apps, which is actually deceiving. In this paper, we propose the task of restoring a portrait image from this process. As the first attempt along this line, we assume unknown global operations on human faces and aim to tackle the two issues of skin smoothing and skin color change. These two tasks, intriguingly, impose very different difficulties to estimate subtle details and major color variation. We propose a Component Regression Network (CRN) and address the limitation of using Euclidean loss in blind reversion. CRN maps the edited portrait images back to the original ones without knowing beautification operation details. 
Our experiments demonstrate effectiveness of the system for this novel task.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Makeup-Go_Blind_Reversion_ICCV_2017_paper.pdf", @@ -9932,14 +10564,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Makeup-Go_Blind_Reversion_ICCV_2017_paper.html", "aff_unique_index": "0;1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent", "aff_unique_dep": ";Youtu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Tencent", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Ying-Cong and Shen,\n Xiaoyong and Jia,\n Jiaya\n},\n title = {\n Makeup-Go: Blind Reversion of Portrait Edit\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Making Minimal Solvers for Absolute Pose Estimation Compact and Robust", @@ -9947,6 +10580,7 @@ "status": "Poster", "track": "main", "pid": "986", + "author_site": "Viktor Larsson; Zuzana Kukelova; Yinqiang Zheng", "author": "Viktor Larsson; Zuzana Kukelova; Yinqiang Zheng", "abstract": "In this paper we present new techniques for constructing compact and robust minimal solvers for absolute pose estimation. We focus on the P4Pfr problem, but the methods we propose are applicable to a more general setting. Previous approaches to P4Pfr suffer from artificial degeneracies which come from their formulation and not the geometry of the original problem. In this paper we show how to avoid these false degeneracies to create more robust solvers. 
Combined with recently published techniques for Grobner basis solvers we are also able to construct solvers which are significantly smaller. We evaluate our solvers on both real and synthetic data, and show improved performance compared to competing solvers. Finally we show that our techniques can be directly applied to the P3.5Pf problem to get a non-degenerate solver, which is competitive with the current state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Larsson_Making_Minimal_Solvers_ICCV_2017_paper.pdf", @@ -9971,7 +10605,8 @@ "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Lund;Prague;Tokyo", "aff_country_unique_index": "0;1;2", - "aff_country_unique": "Sweden;Czech Republic;Japan" + "aff_country_unique": "Sweden;Czech Republic;Japan", + "bibtex": "@InProceedings{Larsson_2017_ICCV,\n \n author = {\n Larsson,\n Viktor and Kukelova,\n Zuzana and Zheng,\n Yinqiang\n},\n title = {\n Making Minimal Solvers for Absolute Pose Estimation Compact and Robust\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "MarioQA: Answering Questions by Watching Gameplay Videos", @@ -9979,6 +10614,7 @@ "status": "Poster", "track": "main", "pid": "1022", + "author_site": "Jonghwan Mun; Paul Hongsuck Seo; Ilchae Jung; Bohyung Han", "author": "Jonghwan Mun; Paul Hongsuck Seo; Ilchae Jung; Bohyung Han", "abstract": "We present a framework to analyze various aspects of models for video question answering (VideoQA) using customizable synthetic datasets, which are constructed automatically from gameplay videos. Our work is motivated by the fact that existing models are often tested only on datasets that require excessively high-level reasoning or mostly contain instances accessible through single frame inferences. 
Hence, it is difficult to measure capacity and flexibility of trained models, and existing techniques often rely on ad-hoc implementations of deep neural networks without clear insight into datasets and models. We are particularly interested in understanding temporal relationships between video events to solve VideoQA problems; this is because reasoning temporal dependency is one of the most distinct components in videos from images. To address this objective, we automatically generate a customized synthetic VideoQA dataset using Super Mario Bros. gameplay videos so that it contains events with different levels of reasoning complexity. Using the dataset, we show that properly constructed datasets with events in various complexity levels are critical to learn effective models and improve overall performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mun_MarioQA_Answering_Questions_ICCV_2017_paper.pdf", @@ -10003,7 +10639,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Mun_2017_ICCV,\n \n author = {\n Mun,\n Jonghwan and Hongsuck Seo,\n Paul and Jung,\n Ilchae and Han,\n Bohyung\n},\n title = {\n MarioQA: Answering Questions by Watching Gameplay Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Mask R-CNN", @@ -10011,7 +10648,7 @@ "status": "Oral", "track": "main", "pid": "16", - "author_site": "Kaiming He; Georgia Gkioxari; Piotr Doll\u00c3\u00a1r; Ross Girshick", + "author_site": "Kaiming He; Georgia Gkioxari; Piotr Dollár; Ross Girshick", "author": "Kaiming He; Georgia Gkioxari; Piotr Dollar; Ross Girshick", "abstract": "We present a conceptually simple, flexible, and general framework for object instance segmentation. 
Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without tricks, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code will be made available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/He_Mask_R-CNN_ICCV_2017_paper.pdf", @@ -10027,7 +10664,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/He_Mask_R-CNN_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/He_Mask_R-CNN_ICCV_2017_paper.html", + "bibtex": "@InProceedings{He_2017_ICCV,\n \n author = {\n He,\n Kaiming and Gkioxari,\n Georgia and Dollar,\n Piotr and Girshick,\n Ross\n},\n title = {\n Mask R-CNN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Material Editing Using a Physically Based Rendering Network", @@ -10035,6 +10673,7 @@ "status": "Spotlight", "track": "main", "pid": "1936", + "author_site": "Guilin Liu; Duygu Ceylan; Ersin Yumer; Jimei Yang; Jyh-Ming Lien", "author": "Guilin Liu; Duygu Ceylan; Ersin Yumer; Jimei Yang; Jyh-Ming Lien", 
"abstract": "The ability to edit materials of objects in images is desirable by many content creators. However, this is an extremely challenging task as it requires to disentangle intrinsic physical properties of an image. We propose an end-to-end network architecture that replicates the forward image formation process to accomplish this task. Specifically, given a single image, the network first predicts intrinsic properties, i.e. shape, illumination, and material, which are then provided to a rendering layer. This layer performs in-network image synthesis, thereby enabling the network to understand the physics behind the image formation process. The proposed rendering layer is fully differentiable, supports both diffuse and specular materials, and thus can be applicable in a variety of problem settings. We demonstrate a rich set of visually plausible material editing examples and provide an extensive comparative study.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Material_Editing_Using_ICCV_2017_paper.pdf", @@ -10052,14 +10691,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Material_Editing_Using_ICCV_2017_paper.html", "aff_unique_index": "0+1+2;1;1;1;0", - "aff_unique_norm": "George Mason University;Adobe;NVIDIA", - "aff_unique_dep": ";Adobe Research;NVIDIA Corporation", + "aff_unique_norm": "George Mason University;Adobe;NVIDIA Corporation", + "aff_unique_dep": ";Adobe Research;", "aff_unique_url": "https://www.gmu.edu;https://research.adobe.com;https://www.nvidia.com", "aff_unique_abbr": "GMU;Adobe;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Guilin and Ceylan,\n Duygu and Yumer,\n Ersin and Yang,\n Jimei and Lien,\n Jyh-Ming\n},\n title = {\n Material Editing Using a Physically Based 
Rendering Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "MemNet: A Persistent Memory Network for Image Restoration", @@ -10067,6 +10707,7 @@ "status": "Spotlight", "track": "main", "pid": "812", + "author_site": "Ying Tai; Jian Yang; Xiaoming Liu; Chunyan Xu", "author": "Ying Tai; Jian Yang; Xiaoming Liu; Chunyan Xu", "abstract": "Recently, very deep convolutional neural networks (CNNs) have been attracting considerable attention in image restoration. However, as the depth grows, the long-term dependency problem is rarely realized for these very deep models, which results in the prior states/layers having little influence on the subsequent ones. Motivated by the fact that human thoughts have persistency, we propose a very deep persistent memory network (MemNet) that introduces a memory block, consisting of a recursive unit and a gate unit, to explicitly mine persistent memory through an adaptive learning process. The recursive unit learns multi-level representations of the current state under different receptive fields. The representations and the outputs from the previous memory blocks are concatenated and sent to the gate unit, which adaptively controls how much of the previous states should be reserved, and decides how much of the current state should be stored. We apply MemNet to three image restoration tasks, i.e., image denosing, super-resolution and JPEG deblocking. Comprehensive experiments demonstrate the necessity of the MemNet and its unanimous superiority on all three tasks over the state of the arts. 
Code is available at https://github.com/tyshiwo/MemNet.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tai_MemNet_A_Persistent_ICCV_2017_paper.pdf", @@ -10087,11 +10728,12 @@ "aff_unique_norm": "Nanjing University of Science and Technology;Michigan State University", "aff_unique_dep": "Department of Computer Science and Engineering;Department of Computer Science and Engineering", "aff_unique_url": "http://www.nust.edu.cn;https://www.msu.edu", - "aff_unique_abbr": "NJUST;MSU", + "aff_unique_abbr": "NUST;MSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tai_2017_ICCV,\n \n author = {\n Tai,\n Ying and Yang,\n Jian and Liu,\n Xiaoming and Xu,\n Chunyan\n},\n title = {\n MemNet: A Persistent Memory Network for Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "MirrorFlow: Exploiting Symmetries in Joint Optical Flow and Occlusion Estimation", @@ -10099,6 +10741,7 @@ "status": "Poster", "track": "main", "pid": "230", + "author_site": "Junhwa Hur; Stefan Roth", "author": "Junhwa Hur; Stefan Roth", "abstract": "Optical flow estimation is one of the most studied problems in computer vision, yet recent benchmark datasets continue to reveal problem areas of today's approaches. Occlusions have remained one of the key challenges. In this paper, we propose a symmetric optical flow method to address the well-known chicken-and-egg relation between optical flow and occlusions. In contrast to many state-of-the-art methods that consider occlusions as outliers, possibly filtered out during post-processing, we highlight the importance of joint occlusion reasoning in the optimization and show how to utilize occlusion as an important cue for estimating optical flow. 
The key feature of our model is to fully exploit the symmetry properties that characterize optical flow and occlusions in the two consecutive images. Specifically through utilizing forward-backward consistency and occlusion-disocclusion symmetry in the energy, our model jointly estimates optical flow in both forward and backward direction, as well as consistent occlusion maps in both views. We demonstrate significant performance benefits on standard benchmarks, especially from the occlusion-disocclusion symmetry. On the challenging KITTI dataset we report the most accurate two-frame results to date.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hur_MirrorFlow_Exploiting_Symmetries_ICCV_2017_paper.pdf", @@ -10113,7 +10756,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hur_MirrorFlow_Exploiting_Symmetries_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hur_MirrorFlow_Exploiting_Symmetries_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Hur_2017_ICCV,\n \n author = {\n Hur,\n Junhwa and Roth,\n Stefan\n},\n title = {\n MirrorFlow: Exploiting Symmetries in Joint Optical Flow and Occlusion Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Misalignment-Robust Joint Filter for Cross-Modal Image Pairs", @@ -10121,6 +10765,7 @@ "status": "Poster", "track": "main", "pid": "1501", + "author_site": "Takashi Shibata; Masayuki Tanaka; Masatoshi Okutomi", "author": "Takashi Shibata; Masayuki Tanaka; Masatoshi Okutomi", "abstract": "Although several powerful joint filters for cross-modal image pairs have been proposed, the existing joint filters generate severe artifacts when there are misalignments between a target and a guidance images. 
Our goal is to generate an artifact-free output image even from the misaligned target and guidance images. We propose a novel misalignment-robust joint filter based on weight-volume-based image composition and joint-filter cost volume. Our proposed method first generates a set of translated guidances. Next, the joint-filter cost volume and a set of filtered images are computed from the target image and the set of the translated guidances. Then, a weight volume is obtained from the joint-filter cost volume while considering a spatial smoothness and a label-sparseness. The final output image is composed by fusing the set of the filtered images with the weight volume for the filtered images. The key is to generate the final output image directly from the set of the filtered images by weighted averaging using the weight volume that is obtained from the joint-filter cost volume. The proposed framework is widely applicable and can involve any kind of joint filter. Experimental results show that the proposed method is effective for various applications including image denosing, image up-sampling, haze removal and depth map interpolation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shibata_Misalignment-Robust_Joint_Filter_ICCV_2017_paper.pdf", @@ -10145,7 +10790,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Shibata_2017_ICCV,\n \n author = {\n Shibata,\n Takashi and Tanaka,\n Masayuki and Okutomi,\n Masatoshi\n},\n title = {\n Misalignment-Robust Joint Filter for Cross-Modal Image Pairs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "MoFA: Model-Based Deep Convolutional Face Autoencoder for Unsupervised Monocular Reconstruction", @@ -10153,7 +10799,7 @@ "status": "Oral", "track": "main", "pid": 
"777", - "author_site": "Ayush Tewari; Michael Zollh\u00c3\u00b6fer; Hyeongwoo Kim; Pablo Garrido; Florian Bernard; Patrick P\u00c3\u00a9rez; Christian Theobalt", + "author_site": "Ayush Tewari; Michael Zollhöfer; Hyeongwoo Kim; Pablo Garrido; Florian Bernard; Patrick Pérez; Christian Theobalt", "author": "Ayush Tewari; Michael Zollhofer; Hyeongwoo Kim; Pablo Garrido; Florian Bernard; Patrick Perez; Christian Theobalt", "abstract": "In this work we propose a novel model-based deep convolutional autoencoder that addresses the highly challenging problem of reconstructing a 3D human face from a single in-the-wild color image. To this end, we combine a convolutional encoder network with an expert-designed generative model that serves as decoder. The core innovation is the differentiable parametric decoder that encapsulates image formation analytically based on a generative model. Our decoder takes as input a code vector with exactly defined semantic meaning that encodes detailed face pose, shape, expression, skin reflectance and scene illumination. Due to this new way of combining CNN-based with model-based face reconstruction, the CNN-based encoder learns to extract semantically meaningful parameters from a single monocular input image. For the first time, a CNN encoder and an expert-designed generative model can be trained end-to-end in an unsupervised manner, which renders training on very large (unlabeled) real world data feasible. 
The obtained reconstructions compare favorably to current state-of-the-art approaches in terms of quality and richness of representation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tewari_MoFA_Model-Based_Deep_ICCV_2017_paper.pdf", @@ -10169,7 +10815,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tewari_MoFA_Model-Based_Deep_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tewari_MoFA_Model-Based_Deep_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Tewari_2017_ICCV,\n \n author = {\n Tewari,\n Ayush and Zollhofer,\n Michael and Kim,\n Hyeongwoo and Garrido,\n Pablo and Bernard,\n Florian and Perez,\n Patrick and Theobalt,\n Christian\n},\n title = {\n MoFA: Model-Based Deep Convolutional Face Autoencoder for Unsupervised Monocular Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Modeling Urban Scenes From Pointclouds", @@ -10177,6 +10824,7 @@ "status": "Poster", "track": "main", "pid": "1734", + "author_site": "William Nguatem; Helmut Mayer", "author": "William Nguatem; Helmut Mayer", "abstract": "We present a method for Modeling Urban Scenes from Pointclouds (MUSP). In contrast to existing approaches, MUSP is robust, scalable and provides a more complete description by not making a Manhattan-World assumption and modeling both buildings (with polyhedra) as well as the non-planar ground (using NURBS). First, we segment the scene into consistent patches using a divide-and-conquer based algorithm within a nonparametric Bayesian framework (stick-breaking construction). These patches often correspond to meaningful structures, such as the ground, facades, roofs and roof superstructures. 
We use polygon sweeping to fit predefined templates for buildings, and for the ground, a NURBS surface is fit and uniformly tessellated. Finally, we apply boolean operations to the polygons for buildings, buildings parts and the tesselated ground to clip unnecessary geometry (e.g., facades protrusions below the non-planar ground), leading to the final model. The explicit Bayesian formulation of scene segmentation makes our approach suitable for challenging datasets with varying amounts of noise, outliers, and point density. We demonstrate the robustness of MUSP on 3D pointclouds from image matching as well as LiDAR.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nguatem_Modeling_Urban_Scenes_ICCV_2017_paper.pdf", @@ -10201,7 +10849,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Nguatem_2017_ICCV,\n \n author = {\n Nguatem,\n William and Mayer,\n Helmut\n},\n title = {\n Modeling Urban Scenes From Pointclouds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Modelling the Scene Dependent Imaging in Cameras With a Deep Neural Network", @@ -10209,6 +10858,7 @@ "status": "Poster", "track": "main", "pid": "869", + "author_site": "Seonghyeon Nam; Seon Joo Kim", "author": "Seonghyeon Nam; Seon Joo Kim", "abstract": "We present a novel deep learning framework that models the scene dependent image processing inside cameras. Often called as the radiometric calibration, the process of recovering RAW images from processed images (JPEG format in the sRGB color space) is essential for many computer vision tasks that rely on physically accurate radiance values. 
All previous works rely on the deterministic imaging model where the color transformation stays the same regardless of the scene and thus they can only be applied for images taken under the manual mode. In this paper, we propose a data-driven approach to learn the scene dependent and locally varying image processing inside cameras under the automode. Our method incorporates both the global and the local scene context into pixel-wise features via multi-scale pyramid of learnable histogram layers. The results show that we can model the imaging pipeline of different cameras that operate under the automode accurately in both directions (from RAW to sRGB, from sRGB to RAW) and we show how we can apply our method to improve the performance of image deblurring.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nam_Modelling_the_Scene_ICCV_2017_paper.pdf", @@ -10223,7 +10873,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Nam_Modelling_the_Scene_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Nam_Modelling_the_Scene_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Nam_2017_ICCV,\n \n author = {\n Nam,\n Seonghyeon and Joo Kim,\n Seon\n},\n title = {\n Modelling the Scene Dependent Imaging in Cameras With a Deep Neural Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Monocular 3D Human Pose Estimation by Predicting Depth on Joints", @@ -10231,6 +10882,7 @@ "status": "Poster", "track": "main", "pid": "1425", + "author_site": "Bruce Xiaohan Nie; Ping Wei; Song-Chun Zhu", "author": "Bruce Xiaohan Nie; Ping Wei; Song-Chun Zhu", "abstract": "This paper aims at estimating full-body 3D human poses from monocular images of which the biggest challenge is the inherent ambiguity introduced by lifting the 2D pose into 3D space. 
We propose a novel framework focusing on reducing this ambiguity by predicting the depth of human joints based on 2D human joint locations and body part images. Our approach is built on a two-level hierarchy of Long Short-Term Memory (LSTM) Networks which can be trained end-to-end. The first level consists of two components: 1) a skeleton-LSTM which learns the depth information from global human skeleton features; 2) a patch-LSTM which utilizes the local image evidence around joint locations. The both networks have tree structure defined on the kinematic relation of human skeleton, thus the information at different joints is broadcast through the whole skeleton in a top-down fashion. The two networks are first pre-trained separately on different data sources and then aggregated in the second layer for final depth prediction. The empirical evaluation on Human3.6M and HHOI dataset demonstrates the advantage of combining global 2D skeleton and local image patches for depth prediction, and our superior quantitative and qualitative performance relative to state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nie_Monocular_3D_Human_ICCV_2017_paper.pdf", @@ -10248,14 +10900,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Nie_Monocular_3D_Human_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;0", - "aff_unique_norm": "University of California, Los Angeles;Xi'an Jiao Tong University", + "aff_unique_norm": "University of California, Los Angeles;Xi'an Jiaotong University", "aff_unique_dep": "Center for Vision, Cognition, Learning, and Autonomy;", "aff_unique_url": "https://www.ucla.edu;http://en.xjtu.edu.cn/", "aff_unique_abbr": "UCLA;XJTU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Nie_2017_ICCV,\n \n author = {\n 
Xiaohan Nie,\n Bruce and Wei,\n Ping and Zhu,\n Song-Chun\n},\n title = {\n Monocular 3D Human Pose Estimation by Predicting Depth on Joints\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Monocular Dense 3D Reconstruction of a Complex Dynamic Scene From Two Perspective Frames", @@ -10263,10 +10916,11 @@ "status": "Poster", "track": "main", "pid": "2343", + "author_site": "Suryansh Kumar; Yuchao Dai; Hongdong Li", "author": "Suryansh Kumar; Yuchao Dai; Hongdong Li", "abstract": "This paper proposes a new approach for monocular dense 3D reconstruction of a complex dynamic scene from two perspective frames. By applying superpixel oversegmentation to the image, we model a generically dynamic (hence non-rigid) scene with a piecewise planar and rigid approximation. In this way, we reduce the dynamic reconstruction problem to a \"3D jigsaw puzzle\" problem which takes pieces from an unorganized \"soup of superpixels\". We show that our method provides an effective solution to the inherent relative scale ambiguity in structure-from-motion. Since our method does not assume a template prior, or per-object segmentation, or knowledge about the rigidity of the dynamic scene, it is applicable to a wide range of scenarios. 
Extensive experiments on both synthetic and real monocular sequences demonstrate the superiority of our method compared with the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kumar_Monocular_Dense_3D_ICCV_2017_paper.pdf", - "aff": "Australian National University, Canberra, Australia; Australian National University, Canberra, Australia + Northwestern Polytechnical University, Xi\u2019an, China; Australian National University, Canberra, Australia + Australia Centre for Robotic Vision", + "aff": "Australian National University, Canberra, Australia; Australian National University, Canberra, Australia + Northwestern Polytechnical University, Xi’an, China; Australian National University, Canberra, Australia + Australia Centre for Robotic Vision", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Kumar_Monocular_Dense_3D_ICCV_2017_supplemental.pdf", @@ -10282,12 +10936,13 @@ "aff_unique_index": "0;0+1;0+2", "aff_unique_norm": "Australian National University;Northwestern Polytechnical University;Australia Centre for Robotic Vision", "aff_unique_dep": ";;", - "aff_unique_url": "https://www.anu.edu.au;http://www.nwpu.edu.cn;https://roboticvision.org/", - "aff_unique_abbr": "ANU;NPU;ACRV", + "aff_unique_url": "https://www.anu.edu.au;http://www.nwpu.edu.cn;", + "aff_unique_abbr": "ANU;NWPU;", "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "Canberra;Xi'an;", "aff_country_unique_index": "0;0+1;0+0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Kumar_2017_ICCV,\n \n author = {\n Kumar,\n Suryansh and Dai,\n Yuchao and Li,\n Hongdong\n},\n title = {\n Monocular Dense 3D Reconstruction of a Complex Dynamic Scene From Two Perspective Frames\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Monocular 
Free-Head 3D Gaze Tracking With Deep Learning and Geometry Constraints", @@ -10295,6 +10950,7 @@ "status": "Poster", "track": "main", "pid": "1487", + "author_site": "Wangjiang Zhu; Haoping Deng", "author": "Wangjiang Zhu; Haoping Deng", "abstract": "Free-head 3D gaze tracking outputs both the eye location and the gaze vector in 3D space, and it has wide applications in scenarios such as driver monitoring, advertisement analysis and surveillance. A reliable and low-cost monocular solution is critical for pervasive usage in these areas. Noticing that a gaze vector is a composition of head pose and eyeball movement in a geometrically deterministic way, we propose a novel gaze transform layer to connect separate head pose and eyeball movement models. The proposed decomposition does not suffer from head-gaze correlation overfitting and makes it possible to use datasets existing for other tasks. To add stronger supervision for better network training, we propose a two-step training strategy, which first trains sub-tasks with rough labels and then jointly trains with accurate gaze labels. To enable good cross-subject performance under various conditions, we collect a large dataset which has full coverage of head poses and eyeball movements, contains 200 subjects, and has diverse illumination conditions. 
Our deep solution achieves state-of-the-art gaze tracking accuracy, reaching 5.6 degrees cross-subject prediction error using a small network running at 1000 fps on a s ingle CPU (excluding face alignment time) and 4.3 degrees cross-subject error with a deeper network.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Monocular_Free-Head_3D_ICCV_2017_paper.pdf", @@ -10309,7 +10965,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Monocular_Free-Head_3D_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Monocular_Free-Head_3D_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Wangjiang and Deng,\n Haoping\n},\n title = {\n Monocular Free-Head 3D Gaze Tracking With Deep Learning and Geometry Constraints\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Monocular Video-Based Trailer Coupler Detection Using Multiplexer Convolutional Neural Network", @@ -10317,6 +10974,7 @@ "status": "Poster", "track": "main", "pid": "2921", + "author_site": "Yousef Atoum; Joseph Roth; Michael Bliss; Wende Zhang; Xiaoming Liu", "author": "Yousef Atoum; Joseph Roth; Michael Bliss; Wende Zhang; Xiaoming Liu", "abstract": "This paper presents an automated monocular-camera-based computer vision system for autonomous self-backing-up a vehicle towards a trailer, by continuously estimating the 3D trailer coupler position and feeding it to the vehicle control system, until the alignment of the tow hitch with the trailers coupler. This system is made possible through our proposed distance-driven Multiplexer-CNN method, which selects the most suitable CNN using the estimated coupler-to-vehicle distance. The input of the multiplexer is a group made of a CNN detector, trackers, and 3D localizer. 
In the CNN detector, we propose a novel algorithm to provide a presence confidence score with each detection. The score reflects the existence of the target object in a region, as well as how accurate is the 2D target detection. We demonstrate the accuracy and efficiency of the system on a large trailer database. Our system achieves an estimation error of 1.4 cm when the ball reaches the coupler, while running at 18.9 FPS on a regular PC.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Atoum_Monocular_Video-Based_Trailer_ICCV_2017_paper.pdf", @@ -10341,7 +10999,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Atoum_2017_ICCV,\n \n author = {\n Atoum,\n Yousef and Roth,\n Joseph and Bliss,\n Michael and Zhang,\n Wende and Liu,\n Xiaoming\n},\n title = {\n Monocular Video-Based Trailer Coupler Detection Using Multiplexer Convolutional Neural Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Moving Object Detection in Time-Lapse or Motion Trigger Image Sequences Using Low-Rank and Invariant Sparse Decomposition", @@ -10349,6 +11008,7 @@ "status": "Poster", "track": "main", "pid": "2079", + "author_site": "Moein Shakeri; Hong Zhang", "author": "Moein Shakeri; Hong Zhang", "abstract": "Low-rank and sparse representation based methods have attracted wide attention in background subtraction and moving object detection, where moving objects in the scene are modeled as pixel-wise sparse outliers. Since in real scenarios moving objects are also structurally sparse, recently researchers have attempted to extract moving objects using structured sparse outliers. 
Although existing methods with structured sparsity-inducing norms produce promising results, they are still vulnerable to various illumination changes that frequently occur in real environments, specifically for time-lapse image sequences where assumptions about sparsity between images such as group sparsity are not valid. In this paper, we first introduce a prior map obtained by illumination invariant representation of images. Next, we propose a low-rank and invariant sparse decomposition using the prior map to detect moving objects under significant illumination changes. Experiments on challenging benchmark datasets demonstrate the superior performance of our proposed method under complex illumination changes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shakeri_Moving_Object_Detection_ICCV_2017_paper.pdf", @@ -10364,7 +11024,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shakeri_Moving_Object_Detection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shakeri_Moving_Object_Detection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Shakeri_2017_ICCV,\n \n author = {\n Shakeri,\n Moein and Zhang,\n Hong\n},\n title = {\n Moving Object Detection in Time-Lapse or Motion Trigger Image Sequences Using Low-Rank and Invariant Sparse Decomposition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-Channel Weighted Nuclear Norm Minimization for Real Color Image Denoising", @@ -10372,10 +11033,11 @@ "status": "Poster", "track": "main", "pid": "572", + "author_site": "Jun Xu; Lei Zhang; David Zhang; Xiangchu Feng", "author": "Jun Xu; Lei Zhang; David Zhang; Xiangchu Feng", "abstract": "Most of the existing denoising algorithms are developed for grayscale images. 
It is not trivial to extend them for color image denoising since the noise statistics in R, G, and B channels can be very different for real noisy images. In this paper, we propose a multi-channel (MC) optimization model for real color image denoising under the weighted nuclear norm minimization (WNNM) framework. We concatenate the RGB patches to make use of the channel redundancy, and introduce a weight matrix to balance the data fidelity of the three channels in consideration of their different noise statistics. The proposed MC-WNNM model does not have an analytical solution. We reformulate it into a linear equality-constrained problem and solve it via alternating direction method of multipliers. Each alternative updating step has a closed-form solution and the convergence can be guaranteed. Experiments on both synthetic and real noisy image datasets demonstrate the superiority of the proposed MC-WNNM over state-of-the-art denoising methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xu_Multi-Channel_Weighted_Nuclear_ICCV_2017_paper.pdf", - "aff": "Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; School of Mathematics and Statistics, Xidian University, Xi\u2019an, China", + "aff": "Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; Dept. of Computing, The Hong Kong Polytechnic University, Hong Kong, China; Dept. 
of Computing, The Hong Kong Polytechnic University, Hong Kong, China; School of Mathematics and Statistics, Xidian University, Xi’an, China", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Xu_Multi-Channel_Weighted_Nuclear_ICCV_2017_supplemental.pdf", @@ -10389,14 +11051,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Xu_Multi-Channel_Weighted_Nuclear_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Hong Kong Polytechnic University;Xidian University", + "aff_unique_norm": "The Hong Kong Polytechnic University;Xidian University", "aff_unique_dep": "Dept. of Computing;School of Mathematics and Statistics", "aff_unique_url": "https://www.polyu.edu.hk;http://www.xidian.edu.cn/", "aff_unique_abbr": "PolyU;Xidian", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Hong Kong;Xi'an", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2017_ICCV,\n \n author = {\n Xu,\n Jun and Zhang,\n Lei and Zhang,\n David and Feng,\n Xiangchu\n},\n title = {\n Multi-Channel Weighted Nuclear Norm Minimization for Real Color Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-Label Image Recognition by Recurrently Discovering Attentional Regions", @@ -10404,6 +11067,7 @@ "status": "Poster", "track": "main", "pid": "1283", + "author_site": "Zhouxia Wang; Tianshui Chen; Guanbin Li; Ruijia Xu; Liang Lin", "author": "Zhouxia Wang; Tianshui Chen; Guanbin Li; Ruijia Xu; Liang Lin", "abstract": "This paper proposes a novel deep architecture to address multi-label image recognition, a fundamental and practical task towards general visual understanding. 
Current solutions for this task usually rely on an extra step of extracting hypothesis regions (i.e., region proposals), resulting in redundant computation and sub-optimal performance. In this work, we achieve the interpretable and contextualized multi-label image classification by developing a recurrent memorized-attention module. This module consists of two alternately performed components: i) a spatial transformer layer to locate attentional regions from the convolutional feature maps in a region-proposal-free way and ii) a LSTM (Long-Short Term Memory) sub-network to sequentially predict semantic labeling scores on the located regions while capturing the global dependencies of these regions. The LSTM also output the parameters for computing the spatial transformer. On large-scale benchmarks of multi-label image classification (e.g., MS-COCO and PASCAL VOC 07), our approach demonstrates superior performances over other existing state-of-the-arts in both accuracy and efficiency.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Multi-Label_Image_Recognition_ICCV_2017_paper.pdf", @@ -10428,7 +11092,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Zhouxia and Chen,\n Tianshui and Li,\n Guanbin and Xu,\n Ruijia and Lin,\n Liang\n},\n title = {\n Multi-Label Image Recognition by Recurrently Discovering Attentional Regions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-Label Learning of Part Detectors for Heavily Occluded Pedestrian Detection", @@ -10436,6 +11101,7 @@ "status": "Poster", "track": "main", "pid": "1515", + "author_site": "Chunluan Zhou; Junsong Yuan", "author": "Chunluan Zhou; Junsong Yuan", "abstract": "Detecting 
pedestrians that are partially occluded remains a challenging problem due to variations and uncertainties of partial occlusion patterns. Following a commonly used framework of handling partial occlusions by part detection, we propose a multi-label learning approach to jointly learn part detectors to capture partial occlusion patterns. The part detectors share a set of decision trees via boosting to exploit part correlations and also reduce the computational cost of applying these part detectors. The learned decision trees capture the overall distribution of all the parts. When used as a pedestrian detector individually, our part detectors learned jointly show better performance than their counterparts learned separately in different occlusion situations. The learned part detectors can be further integrated to better detect partially occluded pedestrians. Experiments on the Caltech dataset show state-of-the-art performance of our approach for detecting heavily occluded pedestrians.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhou_Multi-Label_Learning_of_ICCV_2017_paper.pdf", @@ -10460,7 +11126,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Singapore", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhou_2017_ICCV,\n \n author = {\n Zhou,\n Chunluan and Yuan,\n Junsong\n},\n title = {\n Multi-Label Learning of Part Detectors for Heavily Occluded Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-Modal Factorized Bilinear Pooling With Co-Attention Learning for Visual Question Answering", @@ -10468,6 +11135,7 @@ "status": "Poster", "track": "main", "pid": "675", + "author_site": "Zhou Yu; Jun Yu; Jianping Fan; Dacheng Tao", "author": "Zhou Yu; Jun Yu; Jianping Fan; Dacheng Tao", "abstract": "Visual question 
answering (VQA) is challenging because it requires a simultaneous understanding of both the visual content of images and the textual content of questions. The approaches used to represent the images and questions in a fine-grained manner and questions and to fuse these multi-modal features play key roles in performance. Bilinear pooling based models have been shown to outperform traditional linear models for VQA, but their high-dimensional representations and high computational complexity may seriously limit their applicability in practice. For multi-modal feature fusion, here we develop a Multi-modal Factorized Bilinear (MFB) pooling approach to efficiently and effectively combine multi-modal features, which results in superior performance for VQA compared with other bilinear pooling approaches. For fine-grained image and question representation, we develop a co-attention mechanism using an end-to-end deep network architecture to jointly learn both the image and question attentions. Combining the proposed MFB approach with co-attention learning in a new network architecture provides a unified model for VQA. Our experimental results demonstrate that the single MFB with co-attention model achieves new state-of-the-art performance on the real-world VQA dataset. 
Code available at https://github.com/yuzcccc/mfb", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_Multi-Modal_Factorized_Bilinear_ICCV_2017_paper.pdf", @@ -10485,14 +11153,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yu_Multi-Modal_Factorized_Bilinear_ICCV_2017_paper.html", "aff_unique_index": "0;0+1;2;1", - "aff_unique_norm": "Hangzhou Dianzi University;University of Sydney;University of North Carolina at Charlotte", + "aff_unique_norm": "Hangzhou Dianzi University;The University of Sydney;University of North Carolina at Charlotte", "aff_unique_dep": "School of Computer Science and Technology;School of IT, FEIT;Department of Computer Science", "aff_unique_url": "http://www.hdu.edu.cn/;https://www.sydney.edu.au;https://www.uncc.edu", "aff_unique_abbr": ";USYD;UNCC", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Sydney;Charlotte", "aff_country_unique_index": "0;0+1;2;1", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Zhou and Yu,\n Jun and Fan,\n Jianping and Tao,\n Dacheng\n},\n title = {\n Multi-Modal Factorized Bilinear Pooling With Co-Attention Learning for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-Scale Deep Learning Architectures for Person Re-Identification", @@ -10500,6 +11169,7 @@ "status": "Poster", "track": "main", "pid": "2788", + "author_site": "Xuelin Qian; Yanwei Fu; Yu-Gang Jiang; Tao Xiang; Xiangyang Xue", "author": "Xuelin Qian; Yanwei Fu; Yu-Gang Jiang; Tao Xiang; Xiangyang Xue", "abstract": "Person Re-identification (re-id) aims to match people across non-overlapping camera views in a public space. It is a challenging problem because many people captured in surveillance videos wear similar clothes. 
Consequently, the differences in their appearance are often subtle and only detectable at the right location and scales. Existing re-id models, particularly the recently proposed deep learning based ones match people at a single scale. In contrast, in this paper, a novel multi-scale deep learning model is proposed. Our model is able to learn deep discriminative feature representations at different scales and automatically determine the most suitable scales for matching. The importance of different spatial locations for extracting discriminative features is also learned explicitly. Experiments are carried out to demonstrate that the proposed model outperforms the state-of-the art on a number of benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Qian_Multi-Scale_Deep_Learning_ICCV_2017_paper.pdf", @@ -10524,7 +11194,8 @@ "aff_campus_unique_index": "0;;0;2;0", "aff_campus_unique": "Shanghai;;London", "aff_country_unique_index": "0;0+1;0+0;2;0+0", - "aff_country_unique": "China;Australia;United Kingdom" + "aff_country_unique": "China;Australia;United Kingdom", + "bibtex": "@InProceedings{Qian_2017_ICCV,\n \n author = {\n Qian,\n Xuelin and Fu,\n Yanwei and Jiang,\n Yu-Gang and Xiang,\n Tao and Xue,\n Xiangyang\n},\n title = {\n Multi-Scale Deep Learning Architectures for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-Stage Multi-Recursive-Input Fully Convolutional Networks for Neuronal Boundary Detection", @@ -10532,6 +11203,7 @@ "status": "Poster", "track": "main", "pid": "933", + "author_site": "Wei Shen; Bin Wang; Yuan Jiang; Yan Wang; Alan Yuille", "author": "Wei Shen; Bin Wang; Yuan Jiang; Yan Wang; Alan Yuille", "abstract": "In the field of connectomics, neuroscientists seek to identify cortical connectivity comprehensively. 
Neuronal boundary detection from the Electron Microscopy (EM) images is often done to assist the automatic reconstruction of neuronal circuit. But the segmentation of EM images is a challenging problem, as it requires the detector to be able to detect both filament-like thin and blob-like thick membrane, while suppressing the ambiguous intracellular structure. In this paper, we propose multi-stage multi-recursive-input fully convolutional networks to address this problem. The multiple recursive inputs for one stage, i.e., the multiple side outputs with different receptive field sizes learned from the lower stage, provide multi-scale contextual boundary information for the consecutive learning. This design is biologically-plausible, as it likes a human visual system to compare different possible segmentation solutions to address the ambiguous boundary issue. Our multi-stage networks are trained end-to-end. It achieves promising results on two public available EM segmentation datasets, the mouse piriform cortex dataset and the ISBI 2012 EM dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shen_Multi-Stage_Multi-Recursive-Input_Fully_ICCV_2017_paper.pdf", @@ -10556,7 +11228,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Shen_2017_ICCV,\n \n author = {\n Shen,\n Wei and Wang,\n Bin and Jiang,\n Yuan and Wang,\n Yan and Yuille,\n Alan\n},\n title = {\n Multi-Stage Multi-Recursive-Input Fully Convolutional Networks for Neuronal Boundary Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-Task Self-Supervised Visual Learning", @@ -10564,6 +11237,7 @@ "status": "Poster", "track": "main", "pid": "697", + "author_site": "Carl Doersch; Andrew Zisserman",
"author": "Carl Doersch; Andrew Zisserman", "abstract": "We investigate methods for combining multiple self-supervised tasks---i.e., supervised tasks where data can be collected without manual labeling---in order to train a single visual representation. First, we provide an apples-to-apples comparison of four different self-supervised tasks using the very deep ResNet-101 architecture. We then combine tasks to jointly train a network. We also explore lasso regularization to encourage the network to factorize the information in its representation, and methods for \"harmonizing\" network inputs in order to learn a more unified representation. We evaluate all methods on ImageNet classification, PASCAL VOC detection, and NYU depth prediction. Our results show that deeper networks work better, and that combining tasks---even via a naive multi-head architecture---always improves performance. Our best joint network nearly matches the PASCAL performance of a model pre-trained on ImageNet classification, and matches the ImageNet network on NYU depth prediction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Doersch_Multi-Task_Self-Supervised_Visual_ICCV_2017_paper.pdf", @@ -10588,7 +11262,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Doersch_2017_ICCV,\n \n author = {\n Doersch,\n Carl and Zisserman,\n Andrew\n},\n title = {\n Multi-Task Self-Supervised Visual Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-View Dynamic Shape Refinement Using Local Temporal Integration", @@ -10596,10 +11271,11 @@ "status": "Poster", "track": "main", "pid": "1361", + "author_site": "Vincent Leroy; Jean-Sebastien Franco; Edmond Boyer", "author": "Vincent Leroy; Jean-Sebastien 
Franco; Edmond Boyer", "abstract": "We consider 4D shape reconstructions in multi-view environments and investigate how to exploit temporal redundancy for precision refinement. In addition to being beneficial to many dynamic multi-view scenarios this also enables larger scenes where such increased precision can compensate for the reduced spatial resolution per image frame. With precision and scalability in mind, we propose a symmetric (non-causal) local time-window geometric integration scheme over temporal sequences, where shape reconstructions are refined framewise by warping local and reliable geometric regions of neighboring frames to them. This is in contrast to recent comparable approaches targeting a different context with more compact scenes and real-time applications. These usually use a single dense volumetric update space or geometric template, which they causally track and update globally frame by frame, with limitations in scalability for larger scenes and in topology and precision with a template-based strategy. Our template-less and local approach is a first step towards temporal shape super-resolution. We show that it improves reconstruction accuracy by considering multiple frames.
To this purpose, and in addition to real data examples, we introduce a multi-camera synthetic dataset that provides ground-truth data for mid-scale dynamic scenes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Leroy_Multi-View_Dynamic_Shape_ICCV_2017_paper.pdf", - "aff": "INRIA Grenoble Rh\u00f4ne-Alpes, LJK - Grenoble Universities, France; INRIA Grenoble Rh\u00f4ne-Alpes, LJK - Grenoble Universities, France; INRIA Grenoble Rh\u00f4ne-Alpes, LJK - Grenoble Universities, France", + "aff": "INRIA Grenoble Rhône-Alpes, LJK - Grenoble Universities, France; INRIA Grenoble Rhône-Alpes, LJK - Grenoble Universities, France; INRIA Grenoble Rhône-Alpes, LJK - Grenoble Universities, France", "project": "", "github": "", "supp": "", @@ -10613,14 +11289,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Leroy_Multi-View_Dynamic_Shape_ICCV_2017_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "INRIA Grenoble Rh\u00f4ne-Alpes", - "aff_unique_dep": "LJK", - "aff_unique_url": "https://www.inria.fr/centre/grenoble", + "aff_unique_norm": "INRIA Grenoble Rhône-Alpes", + "aff_unique_dep": "LJK - Grenoble Universities", + "aff_unique_url": "https://www.inria.fr/grenoble", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Grenoble", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Leroy_2017_ICCV,\n \n author = {\n Leroy,\n Vincent and Franco,\n Jean-Sebastien and Boyer,\n Edmond\n},\n title = {\n Multi-View Dynamic Shape Refinement Using Local Temporal Integration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multi-View Non-Rigid Refinement and Normal Selection for High Quality 3D Reconstruction", @@ -10628,6 +11305,7 @@ "status": "Poster", "track": "main", "pid": "1269", + "author_site": 
"Sk. Mohammadul Haque; Venu Madhav Govindu", "author": "Sk. Mohammadul Haque; Venu Madhav Govindu", "abstract": "In recent years, there have been a variety of proposals for high quality 3D reconstruction by fusion of depth and normal maps that contain good low and high frequency information respectively. Typically, these methods create an initial mesh representation of the complete object or scene being scanned. Subsequently, normal estimates are assigned to each mesh vertex and a mesh-normal fusion step is carried out. In this paper, we present a complete pipeline for such depth-normal fusion. The key innovations in our pipeline are twofold. Firstly, we introduce a global multi-view non-rigid refinement step that corrects for the non-rigid misalignment present in the depth and normal maps. We demonstrate that such a correction is crucial for preserving fine-scale 3D features in the final reconstruction. Secondly, despite adequate care, the averaging of multiple normals invariably results in blurring of 3D detail. To mitigate this problem, we propose an approach that selects one out of many available normals. Our global cost for normal selection incorporates a variety of desirable properties and can be efficiently solved using graph cuts. We demonstrate the efficacy of our approach in generating high quality 3D reconstructions of both synthetic and real 3D models and compare with existing methods in the literature.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Haque_Multi-View_Non-Rigid_Refinement_ICCV_2017_paper.pdf", @@ -10652,7 +11330,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Haque_2017_ICCV,\n \n author = {\n Mohammadul Haque,\n Sk. 
and Madhav Govindu,\n Venu\n},\n title = {\n Multi-View Non-Rigid Refinement and Normal Selection for High Quality 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Multimodal Gaussian Process Latent Variable Models With Harmonization", @@ -10660,6 +11339,7 @@ "status": "Poster", "track": "main", "pid": "2059", + "author_site": "Guoli Song; Shuhui Wang; Qingming Huang; Qi Tian", "author": "Guoli Song; Shuhui Wang; Qingming Huang; Qi Tian", "abstract": "In this work, we address multimodal learning problem with Gaussian process latent variable models (GPLVMs) and their application to cross-modal retrieval. Existing GPLVM based studies generally impose individual priors over the model parameters and ignore the intrinsic relations among these parameters. Considering the strong complementarity between modalities, we propose a novel joint prior over the parameters for multimodal GPLVMs to propagate multimodal information in both kernel hyperparameter spaces and latent space. The joint prior is formulated as a harmonization constraint on the model parameters, which enforces the agreement among the modality-specific GP kernels and the similarity in the latent space. We incorporate the harmonization mechanism into the learning process of multimodal GPLVMs. The proposed methods are evaluated on three widely used multimodal datasets for cross-modal retrieval. 
Experimental results show that the harmonization mechanism is beneficial to the GPLVM algorithms for learning non-linear correlation among heterogeneous modalities.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Song_Multimodal_Gaussian_Process_ICCV_2017_paper.pdf", @@ -10679,12 +11359,13 @@ "aff_unique_index": "0+1;1;0+1;2", "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;University of Texas at San Antonio", "aff_unique_dep": ";Key Lab of Intell. Info. Process., Inst. of Comput. Tech;Department of Computer Science", - "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.ac.cn;https://www.utsa.edu", + "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.cas.cn;https://www.utsa.edu", "aff_unique_abbr": "UCAS;CAS;UTSA", "aff_campus_unique_index": "0+0;0;0+0;1", "aff_campus_unique": "Beijing;San Antonio", "aff_country_unique_index": "0+0;0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Song_2017_ICCV,\n \n author = {\n Song,\n Guoli and Wang,\n Shuhui and Huang,\n Qingming and Tian,\n Qi\n},\n title = {\n Multimodal Gaussian Process Latent Variable Models With Harmonization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Mutual Enhancement for Detection of Multiple Logos in Sports Videos", @@ -10692,6 +11373,7 @@ "status": "Poster", "track": "main", "pid": "2039", + "author_site": "Yuan Liao; Xiaoqing Lu; Chengcui Zhang; Yongtao Wang; Zhi Tang", "author": "Yuan Liao; Xiaoqing Lu; Chengcui Zhang; Yongtao Wang; Zhi Tang", "abstract": "Detecting logo frequency and duration in sports videos provides sponsors an effective way to evaluate their advertising efforts. However, general-purposed object detection methods cannot address all the challenges in sports videos. 
In this paper, we propose a mutual-enhanced approach that can improve the detection of a logo through the information obtained from other simultaneously occurred logos. In a Fast-RCNN-based framework, we first introduce a homogeneity-enhanced re-ranking method by analyzing the characteristics of homogeneous logos in each frame, including type repetition, color consistency, and mutual exclusion. Different from conventional enhance mechanism that improves the weak proposals with the dominant proposals, our mutual method can also enhance the relatively significant proposals with weak proposals. Mutual enhancement is also included in our frame propagation mechanism that improves logo detection by utilizing the continuity of logos across frames. We use a tennis video dataset and an associated logo collection for detection evaluation. Experiments show that the proposed method outperforms existing methods with a higher accuracy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liao_Mutual_Enhancement_for_ICCV_2017_paper.pdf", @@ -10716,7 +11398,8 @@ "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Beijing;Birmingham", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liao_2017_ICCV,\n \n author = {\n Liao,\n Yuan and Lu,\n Xiaoqing and Zhang,\n Chengcui and Wang,\n Yongtao and Tang,\n Zhi\n},\n title = {\n Mutual Enhancement for Detection of Multiple Logos in Sports Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Need for Speed: A Benchmark for Higher Frame Rate Object Tracking", @@ -10724,6 +11407,7 @@ "status": "Poster", "track": "main", "pid": "435", + "author_site": "Hamed Kiani Galoogahi; Ashton Fagg; Chen Huang; Deva Ramanan; Simon Lucey", "author": "Hamed Kiani Galoogahi; Ashton Fagg; Chen Huang; Deva 
Ramanan; Simon Lucey", "abstract": "In this paper, we propose the first higher frame rate video dataset (called Need for Speed - NfS) and benchmark for visual object tracking. The dataset consists of 100 videos (380K frames) captured with now commonly available higher frame rate (240 FPS) cameras from real world scenarios. All frames are annotated with axis aligned bounding boxes and all sequences are manually labelled with nine visual attributes - such as occlusion, fast motion, background clutter, etc. Our benchmark provides an extensive evaluation of many recent and state-of-the-art trackers on higher frame rate sequences. We ranked each of these trackers according to their tracking accuracy and real-time performance. One of our surprising conclusions is that at higher frame rates, simple trackers such as correlation filters outperform complex methods based on deep networks. This suggests that for practical applications (such as in robotics or embedded vision), one needs to carefully tradeoff bandwidth constraints associated with higher frame rate acquisition, computational costs of real-time analysis, and the required application accuracy. 
Our dataset and benchmark allows for the first time (to our knowledge) systematic exploration of such issues, and will be made available to allow for further research in this space.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Galoogahi_Need_for_Speed_ICCV_2017_paper.pdf", @@ -10748,7 +11432,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Galoogahi_2017_ICCV,\n \n author = {\n Kiani Galoogahi,\n Hamed and Fagg,\n Ashton and Huang,\n Chen and Ramanan,\n Deva and Lucey,\n Simon\n},\n title = {\n Need for Speed: A Benchmark for Higher Frame Rate Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Neural Ctrl-F: Segmentation-Free Query-By-String Word Spotting in Handwritten Manuscript Collections", @@ -10756,7 +11441,7 @@ "status": "Poster", "track": "main", "pid": "1779", - "author_site": "Tomas Wilkinson; Jonas Lindstr\u00c3\u00b6m; Anders Brun", + "author_site": "Tomas Wilkinson; Jonas Lindström; Anders Brun", "author": "Tomas Wilkinson; Jonas Lindstrom; Anders Brun", "abstract": "In this paper, we approach the problem of segmentation-free query-by-string word spotting for handwritten documents. In other words, we use methods inspired from computer vision and machine learning to search for words in large collections of digitized manuscripts. In particular, we are interested in historical handwritten texts, which are often far more challenging than modern printed documents. This task is important, as it provides people with a way to quickly find what they are looking for in large collections that are tedious and difficult to read manually. 
To this end, we introduce an end-to-end trainable model based on deep neural networks that we call Ctrl-F-Net. Given a full manuscript page, the model simultaneously generates region proposals, and embeds these into a distributed word embedding space, where searches are performed. We evaluate the model on common benchmarks for handwritten word spotting, outperforming the previous state-of-the-art segmentation-free approaches by a large margin, and in some cases even segmentation-based approaches. One interesting real-life application of our approach is to help historians to find and count specific words in court records that are related to women's sustenance activities and division of labor. We provide promising preliminary experiments that validate our method on this task.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wilkinson_Neural_Ctrl-F_Segmentation-Free_ICCV_2017_paper.pdf", @@ -10781,7 +11466,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Wilkinson_2017_ICCV,\n \n author = {\n Wilkinson,\n Tomas and Lindstrom,\n Jonas and Brun,\n Anders\n},\n title = {\n Neural Ctrl-F: Segmentation-Free Query-By-String Word Spotting in Handwritten Manuscript Collections\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Neural EPI-Volume Networks for Shape From Light Field", @@ -10789,6 +11475,7 @@ "status": "Spotlight", "track": "main", "pid": "863", + "author_site": "Stefan Heber; Wei Yu; Thomas Pock", "author": "Stefan Heber; Wei Yu; Thomas Pock", "abstract": "This paper presents a novel deep regression network to extract geometric information from Light Field (LF) data. Our network builds upon u-shaped network architectures. 
Those networks involve two symmetric parts, an encoding and a decoding part. In the first part the network encodes relevant information from the given input into a set of high-level feature maps. In the second part the generated feature maps are then decoded to the desired output. To predict reliable and robust depth information the proposed network examines 3D subsets of the 4D LF called Epipolar Plane Image (EPI) volumes. An important aspect of our network is the use of 3D convolutional layers, that allow to propagate information from two spatial dimensions and one directional dimension of the LF. Compared to previous work this allows for an additional spatial regularization, which reduces depth artifacts and simultaneously maintains clear depth discontinuities. Experimental results show that our approach allows to create high-quality reconstruction results, which outperform current state-of-the-art Shape from Light Field (SfLF) techniques. The main advantage of the proposed approach is the ability to provide those high-quality reconstructions at a low computation time.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Heber_Neural_EPI-Volume_Networks_ICCV_2017_paper.pdf", @@ -10813,7 +11500,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Heber_2017_ICCV,\n \n author = {\n Heber,\n Stefan and Yu,\n Wei and Pock,\n Thomas\n},\n title = {\n Neural EPI-Volume Networks for Shape From Light Field\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Neural Person Search Machines", @@ -10821,6 +11509,7 @@ "status": "Poster", "track": "main", "pid": "216", + "author_site": "Hao Liu; Jiashi Feng; Zequn Jie; Karlekar Jayashree; Bo Zhao; Meibin Qi; Jianguo Jiang; Shuicheng Yan", "author": "Hao Liu; Jiashi 
Feng; Zequn Jie; Karlekar Jayashree; Bo Zhao; Meibin Qi; Jianguo Jiang; Shuicheng Yan", "abstract": "We investigate the problem of person search in the wild in this work. Instead of comparing the query against all candidate regions generated in a query-blind manner, we propose to recursively shrink the search area from the whole image till achieving precise localization of the target person, by fully exploiting information from the query and contextual cues in every recursive search step. We develop the Neural Person Search Machines (NPSM) to implement such recursive localization for person search. Benefiting from its neural search mechanism, NPSM is able to selectively shrink its focus from a loose region to a tighter one containing the target automatically. In this process, NPSM employs an internal primitive memory component to memorize the query representation which modulates the attention and augments its robustness to other distracting regions. Evaluations on two benchmark datasets, CUHK-SYSU Person Search dataset and PRW dataset, have demonstrated that our method can outperform current state-of-the-arts in both mAP and top-1 evaluation protocols.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Neural_Person_Search_ICCV_2017_paper.pdf", @@ -10836,7 +11525,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Neural_Person_Search_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Neural_Person_Search_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Hao and Feng,\n Jiashi and Jie,\n Zequn and Jayashree,\n Karlekar and Zhao,\n Bo and Qi,\n Meibin and Jiang,\n Jianguo and Yan,\n Shuicheng\n},\n title = {\n Neural Person Search Machines\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { 
"title": "No Fuss Distance Metric Learning Using Proxies", @@ -10844,6 +11534,7 @@ "status": "Poster", "track": "main", "pid": "48", + "author_site": "Yair Movshovitz-Attias; Alexander Toshev; Thomas K. Leung; Sergey Ioffe; Saurabh Singh", "author": "Yair Movshovitz-Attias; Alexander Toshev; Thomas K. Leung; Sergey Ioffe; Saurabh Singh", "abstract": "We address the problem of distance metric learning (DML), defined as learning a distance consistent with a notion of semantic similarity. Traditionally, for this problem supervision is expressed in the form of sets of points that follow an ordinal relationship -- an anchor point x is similar to a set of positive points Y, and dissimilar to a set of negative points Z, and a loss defined over these distances is minimized. While the specifics of the optimization differ, in this work we collectively call this type of supervision Triplets and all methods that follow this pattern Triplet-Based methods. These methods are challenging to optimize. A main issue is the need for finding informative triplets, which is usually achieved by a variety of tricks such as increasing the batch size, hard or semi-hard triplet mining, etc. Even with these tricks, the convergence rate of such methods is slow. In this paper we propose to optimize the triplet loss on a different space of triplets, consisting of an anchor data point and similar and dissimilar proxy points which are learned as well. These proxies approximate the original data points, so that a triplet loss over the proxies is a tight upper bound of the original loss. This proxy-based loss is empirically better behaved. 
As a result, the proxy-loss improves on state-of-art results for three standard zero-shot learning datasets, by up to 15% points, while converging three times as fast as other triplet-based losses.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Movshovitz-Attias_No_Fuss_Distance_ICCV_2017_paper.pdf", @@ -10868,7 +11559,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Movshovitz-Attias_2017_ICCV,\n \n author = {\n Movshovitz-Attias,\n Yair and Toshev,\n Alexander and Leung,\n Thomas K. and Ioffe,\n Sergey and Singh,\n Saurabh\n},\n title = {\n No Fuss Distance Metric Learning Using Proxies\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "No More Discrimination: Cross City Adaptation of Road Scene Segmenters", @@ -10876,6 +11568,7 @@ "status": "Poster", "track": "main", "pid": "694", + "author_site": "Yi-Hsin Chen; Wei-Yu Chen; Yu-Ting Chen; Bo-Cheng Tsai; Yu-Chiang Frank Wang; Min Sun", "author": "Yi-Hsin Chen; Wei-Yu Chen; Yu-Ting Chen; Bo-Cheng Tsai; Yu-Chiang Frank Wang; Min Sun", "abstract": "Despite the recent success of deep-learning based semantic segmentation, deploying a pre-trained road scene segmenter to a city whose images are not presented in the training set would not achieve satisfactory performance due to dataset biases. Instead of collecting a large number of annotated images of each city of interest to train or refine the segmenter, we propose an unsupervised learning approach to adapt road scene segmenters across different cities. By utilizing Google Street View and its time-machine feature, we can collect unannotated images for each road scene at different times, so that the associated static-object priors can be extracted accordingly. 
By advancing a joint global and class-specific domain adversarial learning framework, adaptation of pre-trained segmenters to that city can be achieved without the need of any user annotation or interaction. We show that our method improves the performance of semantic segmentation in multiple cities across continents, while it performs favorably against state-of-the-art approaches requiring annotated training data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_No_More_Discrimination_ICCV_2017_paper.pdf", @@ -10900,7 +11593,8 @@ "aff_campus_unique_index": "0;0+0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Yi-Hsin and Chen,\n Wei-Yu and Chen,\n Yu-Ting and Tsai,\n Bo-Cheng and Frank Wang,\n Yu-Chiang and Sun,\n Min\n},\n title = {\n No More Discrimination: Cross City Adaptation of Road Scene Segmenters\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Non-Convex Rank/Sparsity Regularization and Local Minima", @@ -10908,6 +11602,7 @@ "status": "Poster", "track": "main", "pid": "60", + "author_site": "Carl Olsson; Marcus Carlsson; Fredrik Andersson; Viktor Larsson", "author": "Carl Olsson; Marcus Carlsson; Fredrik Andersson; Viktor Larsson", "abstract": "This paper considers the problem of recovering either a low rank matrix or a sparse vector from observations of linear combinations of the vector or matrix elements. Recent methods replace the non-convex regularization with l1 or nuclear norm relaxations. It is well known that this approach recovers near optimal solutions if a so called restricted isometry property (RIP) holds. On the other hand it also has a shrinking bias which can degrade the solution. 
In this paper we study an alternative non-convex regularization term that does not suffer from this bias. Our main theoretical results show that if a RIP holds then the stationary points are often well separated, in the sense that their differences must be of high cardinality/rank. Thus, with a suitable initial solution the approach is unlikely to fall into a bad local minimum. Our numerical tests show that the approach is likely to converge to a better solution than standard l1/nuclear-norm relaxation even when starting from trivial initializations. In many cases our results can also be used to verify global optimality of our method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Olsson_Non-Convex_RankSparsity_Regularization_ICCV_2017_paper.pdf", @@ -10932,7 +11627,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Lund", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Olsson_2017_ICCV,\n \n author = {\n Olsson,\n Carl and Carlsson,\n Marcus and Andersson,\n Fredrik and Larsson,\n Viktor\n},\n title = {\n Non-Convex Rank/Sparsity Regularization and Local Minima\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Non-Linear Convolution Filters for CNN-Based Learning", @@ -10940,6 +11636,7 @@ "status": "Poster", "track": "main", "pid": "2256", + "author_site": "Georgios Zoumpourlis; Alexandros Doumanoglou; Nicholas Vretos; Petros Daras", "author": "Georgios Zoumpourlis; Alexandros Doumanoglou; Nicholas Vretos; Petros Daras", "abstract": "During the last years, Convolutional Neural Networks (CNNs) have achieved state-of-the-art performance in image classification. Their architectures have largely drawn inspiration by models of the primate visual system. 
However, while recent research results of neuroscience prove the existence of non-linear operations in the response of complex visual cells, little effort has been devoted to extend the convolution technique to non-linear forms. Typical convolutional layers are linear systems, hence their expressiveness is limited. To overcome this, various non-linearities have been used as activation functions inside CNNs, while also many pooling strategies have been applied. We address the issue of developing a convolution method in the context of a computational model of the visual cortex, exploring quadratic forms through the Volterra kernels. Such forms, constituting a more rich function space, are used as approximations of the response profile of visual cells. Our proposed second-order convolution is tested on CIFAR-10 and CIFAR-100. We show that a network which combines linear and non-linear filters in its convolutional layers, can outperform networks that use standard linear filters with the same architecture, yielding results competitive with the state-of-the-art on these datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zoumpourlis_Non-Linear_Convolution_Filters_ICCV_2017_paper.pdf", @@ -10955,7 +11652,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zoumpourlis_Non-Linear_Convolution_Filters_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zoumpourlis_Non-Linear_Convolution_Filters_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zoumpourlis_2017_ICCV,\n \n author = {\n Zoumpourlis,\n Georgios and Doumanoglou,\n Alexandros and Vretos,\n Nicholas and Daras,\n Petros\n},\n title = {\n Non-Linear Convolution Filters for CNN-Based Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Non-Markovian Globally Consistent Multi-Object 
Tracking", @@ -10963,7 +11661,7 @@ "status": "Poster", "track": "main", "pid": "1098", - "author_site": "Andrii Maksai; Xinchao Wang; Fran\u00c3\u00a7ois Fleuret; Pascal Fua", + "author_site": "Andrii Maksai; Xinchao Wang; François Fleuret; Pascal Fua", "author": "Andrii Maksai; Xinchao Wang; Francois Fleuret; Pascal Fua", "abstract": "Many state-of-the-art approaches to multi-object tracking rely on detecting them in each frame independently, grouping detections into short but reliable trajectory segments, and then further grouping them into full trajectories. This grouping typically relies on imposing local smoothness constraints but almost never on enforcing more global ones on the trajectories. In this paper, we propose a non-Markovian approach to imposing global consistency by using behavioral patterns to guide the tracking algorithm. When used in conjunction with state-of-the-art tracking algorithms, this further increases their already good performance on multiple challenging datasets. 
We show significant improvements both in supervised settings where ground truth is available and behavioral patterns can be learned from it, and in completely unsupervised settings.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Maksai_Non-Markovian_Globally_Consistent_ICCV_2017_paper.pdf", @@ -10981,14 +11679,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Maksai_Non-Markovian_Globally_Consistent_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "EPFL;University of Illinois Urbana-Champaign;Idiap Research Institute", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;University of Illinois at Urbana-Champaign;IDIAP Research Institute", "aff_unique_dep": "Computer Vision Laboratory;Beckman Institute;", "aff_unique_url": "https://www.epfl.ch;https://www.beckman.illinois.edu;https://www.idiap.ch", "aff_unique_abbr": "EPFL;UIUC;", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Lausanne;Urbana-Champaign;Martigny", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Maksai_2017_ICCV,\n \n author = {\n Maksai,\n Andrii and Wang,\n Xinchao and Fleuret,\n Francois and Fua,\n Pascal\n},\n title = {\n Non-Markovian Globally Consistent Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Non-Rigid Object Tracking via Deformable Patches Using Shape-Preserved KCF and Level Sets", @@ -10996,10 +11695,11 @@ "status": "Poster", "track": "main", "pid": "3128", + "author_site": "Xin Sun; Ngai-Man Cheung; Hongxun Yao; Yiluan Guo", "author": "Xin Sun; Ngai-Man Cheung; Hongxun Yao; Yiluan Guo", "abstract": "Part-based trackers are effective in exploiting local details of the target object for robust tracking. 
In contrast to most existing part-based methods that divide all kinds of target objects into a number of fixed rectangular patches, in this paper, we propose a novel framework in which a set of deformable patches dynamically collaborate on tracking of non-rigid objects. In particular, we proposed a shape-preserved kernelized correlation filter (SP-KCF) which can accommodate target shape information for robust tracking. The SP-KCF is introduced into the level set framework for dynamic tracking of individual patches. In this manner, our proposed deformable patches are target-dependent, have the capability to assume complex topology, and are deformable to adapt to target variations. As these deformable patches properly capture individual target subregions, we exploit their photometric discrimination and shape variation to reveal the trackability of individual target subregions, which enables the proposed tracker to dynamically take advantage of those subregions with good trackability for target likelihood estimation. Finally the shape information of these deformable patches enables accurate object contours to be computed as the tracking output. 
Experimental results on the latest public sets of challenging sequences demonstrate the effectiveness of the proposed method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sun_Non-Rigid_Object_Tracking_ICCV_2017_paper.pdf", - "aff": "Singapore University of Technology and Design\u00a7; Singapore University of Technology and Design\u00a7; Harbin Institute of Technology\u2020; Singapore University of Technology and Design\u00a7", + "aff": "Singapore University of Technology and Design§; Singapore University of Technology and Design§; Harbin Institute of Technology†; Singapore University of Technology and Design§", "project": "", "github": "", "supp": "", @@ -11020,7 +11720,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Sun_2017_ICCV,\n \n author = {\n Sun,\n Xin and Cheung,\n Ngai-Man and Yao,\n Hongxun and Guo,\n Yiluan\n},\n title = {\n Non-Rigid Object Tracking via Deformable Patches Using Shape-Preserved KCF and Level Sets\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Non-Uniform Blind Deblurring by Reblurring", @@ -11028,6 +11729,7 @@ "status": "Poster", "track": "main", "pid": "1419", + "author_site": "Yuval Bahat; Netalee Efrat; Michal Irani", "author": "Yuval Bahat; Netalee Efrat; Michal Irani", "abstract": "We present an approach for blind image deblurring, which handles non-uniform blurs. Our algorithm has two main components: (i) A new method for recovering the unknown blur-field directly from the blurry image, and (ii) A method for deblurring the image given the recovered nonuniform blur-field. Our blur-field estimation is based on analyzing the spectral content of blurry image patches by Re-blurring them. 
Being unrestricted by any training data, it can handle a large variety of blur sizes, yielding superior blur-field estimation results compared to training based deep-learning methods. Our non-uniform deblurring algorithm is based on the internal image-specific patch recurrence prior. It attempts to recover a sharp image which, on one hand - results in the blurry image under our estimated blur-field, and on the other hand - maximizes the internal recurrence of patches within and across scales of the recovered sharp image. The combination of these two components gives rise to a blind-deblurring algorithm, which exceeds the performance of state-of-the-art CNN-based blind-deblurring by a significant margin, without the need for any training data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bahat_Non-Uniform_Blind_Deblurring_ICCV_2017_paper.pdf", @@ -11043,7 +11745,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Bahat_Non-Uniform_Blind_Deblurring_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Bahat_Non-Uniform_Blind_Deblurring_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Bahat_2017_ICCV,\n \n author = {\n Bahat,\n Yuval and Efrat,\n Netalee and Irani,\n Michal\n},\n title = {\n Non-Uniform Blind Deblurring by Reblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Nonparametric Variational Auto-Encoders for Hierarchical Representation Learning", @@ -11051,6 +11754,7 @@ "status": "Poster", "track": "main", "pid": "2467", + "author_site": "Prasoon Goyal; Zhiting Hu; Xiaodan Liang; Chenyu Wang; Eric P. Xing", "author": "Prasoon Goyal; Zhiting Hu; Xiaodan Liang; Chenyu Wang; Eric P. 
Xing", "abstract": "The recently developed variational autoencoders (VAEs) have proved to be an effective confluence of the rich representational power of neural networks with Bayesian methods. However, most work on VAEs use a rather simple prior over the latent variables such as standard normal distribution, thereby restricting its applications to relatively simple phenomena. In this work, we propose hierarchical nonparametric variational autoencoders, which combines tree-structured Bayesian nonparametric priors with VAEs, to enable infinite flexibility of the latent representation space. Both the neural parameters and Bayesian priors are learned jointly using tailored variational inference. The resulting model induces a hierarchical structure of latent semantic concepts underlying the data corpus, and infers accurate representations of data instances. We apply our model in video representation learning. Our method is able to discover highly interpretable activity hierarchies, and obtain improved clustering accuracy and generalization capacity based on the learned rich representations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Goyal_Nonparametric_Variational_Auto-Encoders_ICCV_2017_paper.pdf", @@ -11075,7 +11779,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Goyal_2017_ICCV,\n \n author = {\n Goyal,\n Prasoon and Hu,\n Zhiting and Liang,\n Xiaodan and Wang,\n Chenyu and Xing,\n Eric P.\n},\n title = {\n Nonparametric Variational Auto-Encoders for Hierarchical Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Object-Level Proposals", @@ -11083,6 +11788,7 @@ "status": "Poster", "track": "main", "pid": "2344", + "author_site": "Jianxiang Ma; 
Anlong Ming; Zilong Huang; Xinggang Wang; Yu Zhou", "author": "Jianxiang Ma; Anlong Ming; Zilong Huang; Xinggang Wang; Yu Zhou", "abstract": "Edge and surface are two fundamental visual elements of an object. The majority of existing object proposal approaches utilize edge or edge-like cues to rank candidates, while we consider that the surface cue containing the 3D characteristic of objects should be captured effectively for proposals, which has been rarely discussed before. In this paper, an object-level proposal model is presented, which constructs an occlusion-based objectness taking the surface cue into account. Specifically, the better detection of occlusion edges is focused on to enrich the surface cue into proposals, namely, the occlusion-dominated fusion and normalization criterion are designed to obtain the approximately overall contour information, to enhance the occlusion edge map at utmost and thus boost proposals. Experimental results on the PASCAL VOC 2007 and MS COCO 2014 dataset demonstrate the effectiveness of our approach, which achieves around 6% improvement on the average recall than Edge Boxes at 1000 proposals and also leads to a modest gain on the performance of object detection.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ma_Object-Level_Proposals_ICCV_2017_paper.pdf", @@ -11107,7 +11813,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2017_ICCV,\n \n author = {\n Ma,\n Jianxiang and Ming,\n Anlong and Huang,\n Zilong and Wang,\n Xinggang and Zhou,\n Yu\n},\n title = {\n Object-Level Proposals\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Octree Generating Networks: Efficient Convolutional Architectures for High-Resolution 3D Outputs", @@ 
-11115,6 +11822,7 @@ "status": "Poster", "track": "main", "pid": "828", + "author_site": "Maxim Tatarchenko; Alexey Dosovitskiy; Thomas Brox", "author": "Maxim Tatarchenko; Alexey Dosovitskiy; Thomas Brox", "abstract": "We present a deep convolutional decoder architecture that can generate volumetric 3D outputs in a compute- and memory-efficient manner by using an octree representation. The network learns to predict both the structure of the octree, and the occupancy values of individual cells. This makes it a particularly valuable technique for generating 3D shapes. In contrast to standard decoders acting on regular voxel grids, the architecture does not have cubic complexity. This allows representing much higher resolution outputs with a limited memory budget. We demonstrate this in several application domains, including 3D convolutional autoencoders, generation of objects and whole scenes from high-level representations, and shape from a single image.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tatarchenko_Octree_Generating_Networks_ICCV_2017_paper.pdf", @@ -11132,14 +11840,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tatarchenko_Octree_Generating_Networks_ICCV_2017_paper.html", "aff_unique_index": "0;0+1;0", - "aff_unique_norm": "University of Freiburg;Intel", + "aff_unique_norm": "University of Freiburg;Intel Corporation", "aff_unique_dep": ";Intel Labs", "aff_unique_url": "https://www.uni-freiburg.de;https://www.intel.com", "aff_unique_abbr": "UoF;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Tatarchenko_2017_ICCV,\n \n author = {\n Tatarchenko,\n Maxim and Dosovitskiy,\n Alexey and Brox,\n Thomas\n},\n title = {\n Octree Generating Networks: Efficient Convolutional Architectures for High-Resolution 3D Outputs\n},\n 
booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Offline Handwritten Signature Modeling and Verification Based on Archetypal Analysis", @@ -11147,6 +11856,7 @@ "status": "Poster", "track": "main", "pid": "2529", + "author_site": "Elias N. Zois; Ilias Theodorakopoulos; George Economou", "author": "Elias N. Zois; Ilias Theodorakopoulos; George Economou", "abstract": "The handwritten signature is perhaps the most accustomed way for the acknowledgement of the consent of an individual or the authentication of the identity of a person in numerous transactions. In addition, the authenticity of a questioned offline or static handwritten signature still poses a case of interest, especially in forensic related applications. A common approach in offline signature verification system is to apply several predetermined image analysis models. Consequently, any offline signature sample which originates from either authentic persons or forgers, utilizes a fixed feature extraction base. In this proposed study, the feature space and the corresponding projection values depend on the training samples only; thus the proposed method can be found useful in forensic cases. In order to do so, we reenter a groundbreaking unsupervised learning method named archetypal analysis, which is connected to effective data analysis approaches such as sparse coding. Due to the fact that until recently there was no efficient implementation publicly available, archetypal analysis had only few cases of use. However, a fast optimization scheme using an active set strategy is now available. The main goal of this work is to introduce archetypal analysis for offline signature verification. The output of the archetypal analysis of few reference samples is a set of archetypes which are used to form the base of the feature space. 
Then, given a set of archetypes and a signature sample under examination archetypal analysis and average pooling provides the corresponding features. The promising performance of the proposed approach is demonstrated with the use of an evaluation method which employs the popular CEDAR and MCYT75 signature datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zois_Offline_Handwritten_Signature_ICCV_2017_paper.pdf", @@ -11171,7 +11881,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Rio", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Greece" + "aff_country_unique": "Greece", + "bibtex": "@InProceedings{Zois_2017_ICCV,\n \n author = {\n Zois,\n Elias N. and Theodorakopoulos,\n Ilias and Economou,\n George\n},\n title = {\n Offline Handwritten Signature Modeling and Verification Based on Archetypal Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "On-Demand Learning for Deep Image Restoration", @@ -11179,6 +11890,7 @@ "status": "Poster", "track": "main", "pid": "555", + "author_site": "Ruohan Gao; Kristen Grauman", "author": "Ruohan Gao; Kristen Grauman", "abstract": "While machine learning approaches to image restoration offer great promise, current methods risk training models fixated on performing well only for image corruption of a particular level of difficulty--such as a certain level of noise or blur. First, we examine the weakness of conventional \"fixated\" models and demonstrate that training general models to handle arbitrary levels of corruption is indeed non-trivial. Then, we propose an on-demand learning algorithm for training image restoration models with deep convolutional neural networks. The main idea is to exploit a feedback mechanism to self-generate training instances where they are needed most, thereby learning models that can generalize across difficulty levels. 
On four restoration tasks--image inpainting, pixel interpolation, image deblurring, and image denoising--and three diverse datasets, our approach consistently outperforms both the status quo training procedure and curriculum learning alternatives.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gao_On-Demand_Learning_for_ICCV_2017_paper.pdf", @@ -11203,7 +11915,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2017_ICCV,\n \n author = {\n Gao,\n Ruohan and Grauman,\n Kristen\n},\n title = {\n On-Demand Learning for Deep Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "One Network to Solve Them All -- Solving Linear Inverse Problems Using Deep Projection Models", @@ -11211,7 +11924,7 @@ "status": "Oral", "track": "main", "pid": "771", - "author_site": "J. H. Rick Chang; Chun-Liang Li; Barnab\u00c3\u00a1s P\u00c3\u00b3czos; B. V. K. Vijaya Kumar; Aswin C. Sankaranarayanan", + "author_site": "J. H. Rick Chang; Chun-Liang Li; Barnabás Póczos; B. V. K. Vijaya Kumar; Aswin C. Sankaranarayanan", "author": "J. H. Rick Chang; Chun-Liang Li; Barnabas Poczos; B. V. K. Vijaya Kumar; Aswin C. Sankaranarayanan", "abstract": "While deep learning methods have achieved state-of-the-art performance in many challenging inverse problems like image inpainting and super-resolution, they invariably involve problem-specific training of the networks. Under this approach, each inverse problem requires its own dedicated network. In scenarios where we need to solve a wide variety of problems, e.g., on a mobile camera, it is inefficient and expensive to use these problem-specific networks. 
On the other hand, traditional methods using analytic signal priors can be used to solve any linear inverse problem; this often comes with a performance that is worse than learning-based methods. In this work, we provide a middle ground between the two kinds of methods -- we propose a general framework to train a single deep neural network that solves arbitrary linear inverse problems. We achieve this by training a network that acts as a quasi-projection operator for the set of natural images and show that any linear inverse problem involving natural images can be solved using iterative methods. We empirically show that the proposed framework demonstrates superior performance over traditional methods using wavelet sparsity prior while achieving performance comparable to specially-trained networks on tasks including compressive sensing and pixel-wise inpainting.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chang_One_Network_to_ICCV_2017_paper.pdf", @@ -11236,7 +11949,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chang_2017_ICCV,\n \n author = {\n Rick Chang,\n J. H. and Li,\n Chun-Liang and Poczos,\n Barnabas and Vijaya Kumar,\n B. V. K. 
and Sankaranarayanan,\n Aswin C.\n},\n title = {\n One Network to Solve Them All -- Solving Linear Inverse Problems Using Deep Projection Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Online Multi-Object Tracking Using CNN-Based Single Object Tracker With Spatial-Temporal Attention Mechanism", @@ -11244,6 +11958,7 @@ "status": "Poster", "track": "main", "pid": "2434", + "author_site": "Qi Chu; Wanli Ouyang; Hongsheng Li; Xiaogang Wang; Bin Liu; Nenghai Yu", "author": "Qi Chu; Wanli Ouyang; Hongsheng Li; Xiaogang Wang; Bin Liu; Nenghai Yu", "abstract": "In this paper, we propose a CNN-based framework for online MOT. This framework utilizes the merits of single object trackers in adapting appearance models and searching for target in the next frame. Simply applying single object tracker for MOT will encounter the problem in computational efficiency and drifted results caused by occlusion. Our framework achieves computational efficiency by sharing features and using ROI-Pooling to obtain individual features for each target. Some online learned target-specific CNN layers are used for adapting the appearance model for each target. In the framework, we introduce spatial-temporal attention mechanism (STAM) to handle the drift caused by occlusion and interaction among targets. The visibility map of the target is learned and used for inferring the spatial attention map. The spatial attention map is then applied to weight the features. Besides, the occlusion status can be estimated from the visibility map, which controls the online updating process via weighted loss on training samples with different occlusion statuses in different frames. It can be considered as temporal attention mechanism. 
The proposed algorithm achieves 34.3% and 46.0% in MOTA on challenging MOT15 and MOT16 benchmark dataset respectively.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chu_Online_Multi-Object_Tracking_ICCV_2017_paper.pdf", @@ -11261,14 +11976,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chu_Online_Multi-Object_Tracking_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;2;0;0", - "aff_unique_norm": "University of Science and Technology of China;University of Sydney;Chinese University of Hong Kong", + "aff_unique_norm": "University of Science and Technology of China;University of Sydney;The Chinese University of Hong Kong", "aff_unique_dep": ";;Department of Electronic Engineering", "aff_unique_url": "http://www.ustc.edu.cn;https://www.sydney.edu.au;https://www.cuhk.edu.hk", "aff_unique_abbr": "USTC;USYD;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Chu_2017_ICCV,\n \n author = {\n Chu,\n Qi and Ouyang,\n Wanli and Li,\n Hongsheng and Wang,\n Xiaogang and Liu,\n Bin and Yu,\n Nenghai\n},\n title = {\n Online Multi-Object Tracking Using CNN-Based Single Object Tracker With Spatial-Temporal Attention Mechanism\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Online Real-Time Multiple Spatiotemporal Action Localisation and Prediction", @@ -11276,6 +11992,7 @@ "status": "Poster", "track": "main", "pid": "1431", + "author_site": "Gurkirt Singh; Suman Saha; Michael Sapienza; Philip H. S. Torr; Fabio Cuzzolin", "author": "Gurkirt Singh; Suman Saha; Michael Sapienza; Philip H. S. 
Torr; Fabio Cuzzolin", "abstract": "We present a deep-learning framework for real-time multiple spatio-temporal (S/T) action localisation and classification. Current state-of-the-art approaches work offline, and are too slow to be useful in real-world settings. To overcome their limitations we introduce two major developments. Firstly, we adopt real-time SSD (Single Shot MultiBox Detector) CNNs to regress and classify detection boxes in each video frame potentially containing an action of interest. Secondly, we design an original and efficient online algorithm to incrementally construct and label \"action tubes\" from the SSD frame level detections. As a result, our system is not only capable of performing S/T detection in real time, but can also perform early action prediction in an online fashion. We achieve new state-of-the-art results in both S/T action localisation and early action prediction on the challenging UCF101-24 and J-HMDB-21 benchmarks, even when compared to the top offline competitors. To the best of our knowledge, ours is the first real-time (up to 40fps) system able to perform online S/T action localisation on the untrimmed videos of UCF101-24.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Singh_Online_Real-Time_Multiple_ICCV_2017_paper.pdf", @@ -11300,7 +12017,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Singh_2017_ICCV,\n \n author = {\n Singh,\n Gurkirt and Saha,\n Suman and Sapienza,\n Michael and Torr,\n Philip H. S. 
and Cuzzolin,\n Fabio\n},\n title = {\n Online Real-Time Multiple Spatiotemporal Action Localisation and Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Online Robust Image Alignment via Subspace Learning From Gradient Orientations", @@ -11308,6 +12026,7 @@ "status": "Poster", "track": "main", "pid": "676", + "author_site": "Qingqing Zheng; Yi Wang; Pheng-Ann Heng", "author": "Qingqing Zheng; Yi Wang; Pheng-Ann Heng", "abstract": "Robust and efficient image alignment remains a challenging task, due to the massiveness of images, great illumination variations between images, partial occlusion and corruption. To address these challenges, we propose an online image alignment method via subspace learning from image gradient orientations (IGO). The proposed method integrates the subspace learning, transformed IGO reconstruction and image alignment into a unified online framework, which is robust for aligning images with severe intensity distortions. Our method is motivated by principal component analysis (PCA) from gradient orientations provides more reliable low-dimensional subspace than that from pixel intensities. Instead of processing in the intensity domain like conventional methods, we seek alignment in the IGO domain such that the aligned IGO of the newly arrived image can be decomposed as the sum of a sparse error and a linear composition of the IGO-PCA basis learned from previously well-aligned ones. The optimization problem is accomplished by an iterative linearization that minimizes the l1-norm of the sparse error. Furthermore, the IGO-PCA basis is adaptively updated based on incremental thin singular value decomposition (SVD) which takes the shift of IGO mean into consideration. The efficacy of the proposed method is validated on extensive challenging datasets through image alignment and face recognition. 
Experimental results demonstrate that our algorithm provides more illumination- and occlusion-robust image alignment than state-of-the-art methods do.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zheng_Online_Robust_Image_ICCV_2017_paper.pdf", @@ -11325,14 +12044,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zheng_Online_Robust_Image_ICCV_2017_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen University", + "aff_unique_norm": "The Chinese University of Hong Kong;Shenzhen University", "aff_unique_dep": "Department of Computer Science and Engineering;School of Biomedical Engineering", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.szu.edu.cn", "aff_unique_abbr": "CUHK;SZU", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2017_ICCV,\n \n author = {\n Zheng,\n Qingqing and Wang,\n Yi and Heng,\n Pheng-Ann\n},\n title = {\n Online Robust Image Alignment via Subspace Learning From Gradient Orientations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Online Video Deblurring via Dynamic Temporal Blending Network", @@ -11340,7 +12060,7 @@ "status": "Poster", "track": "main", "pid": "1773", - "author_site": "Tae Hyun Kim; Kyoung Mu Lee; Bernhard Sch\u00c3\u00b6lkopf; Michael Hirsch", + "author_site": "Tae Hyun Kim; Kyoung Mu Lee; Bernhard Schölkopf; Michael Hirsch", "author": "Tae Hyun Kim; Kyoung Mu Lee; Bernhard Scholkopf; Michael Hirsch", "abstract": "State-of-the-art video deblurring methods are capable of removing non-uniform blur caused by unwanted camera shake and/or object motion in dynamic scenes. 
However, most existing methods are based on batch processing and thus need access to all recorded frames, rendering them computationally demanding and time-consuming and thus limiting their practical use. In contrast, we propose an online (sequential) video deblurring method based on a spatio-temporal recurrent network that allows for real-time performance. In particular, we introduce a novel architecture which extends the receptive field while keeping the overall size of the network small to enable fast execution. In doing so, our network is able to remove even large blur caused by strong camera shake and/or fast moving objects. Furthermore, we propose a novel network layer that enforces temporal consistency between consecutive frames by dynamic temporal blending which compares and adaptively (at test time) shares features obtained at different time steps. We show the superiority of the proposed method in an extensive experimental evaluation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kim_Online_Video_Deblurring_ICCV_2017_paper.pdf", @@ -11355,7 +12075,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_Online_Video_Deblurring_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_Online_Video_Deblurring_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kim_2017_ICCV,\n \n author = {\n Hyun Kim,\n Tae and Mu Lee,\n Kyoung and Scholkopf,\n Bernhard and Hirsch,\n Michael\n},\n title = {\n Online Video Deblurring via Dynamic Temporal Blending Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Online Video Object Detection Using Association LSTM", @@ -11363,6 +12084,7 @@ "status": "Poster", "track": "main", "pid": "1061", + "author_site": "Yongyi Lu; Cewu Lu; Chi-Keung Tang", "author": "Yongyi Lu; Cewu Lu; Chi-Keung Tang", 
"abstract": "Video object detection is a fundamental tool for many applications. Since direct application of image-based object detection cannot leverage the rich temporal information inherent in video data, we advocate to the detection of long-range video object pattern. While the Long Short-Term Memory (LSTM) has been the de facto choice for such detection, currently LSTM cannot fundamentally model object association between consecutive frames. In this paper, we propose the association LSTM to address this fundamental association problem. Association LSTM not only regresses and classifiy directly on object locations and categories but also associates features to represent each output object. By minimizing the matching error between these features, we learn how to associate objects in two consecutive frames. Additionally, our method works in an online manner, which is important for most video tasks. Compared to the traditional video object detection methods, our approach outperforms them on standard video datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lu__Online_Video_ICCV_2017_paper.pdf", @@ -11378,7 +12100,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lu__Online_Video_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lu__Online_Video_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Lu_2017_ICCV,\n \n author = {\n Lu,\n Yongyi and Lu,\n Cewu and Tang,\n Chi-Keung\n},\n title = {\n Online Video Object Detection Using Association LSTM\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Open Set Domain Adaptation", @@ -11386,6 +12109,7 @@ "status": "Oral", "track": "main", "pid": "878", + "author_site": "Pau Panareda Busto; Juergen Gall", "author": "Pau Panareda Busto; Juergen Gall", "abstract": "When the training and the 
test data belong to different domains, the accuracy of an object classifier is significantly reduced. Therefore, several algorithms have been proposed in the last years to diminish the so called domain shift between datasets. However, all available evaluation protocols for domain adaptation describe a closed set recognition task, where both domains, namely source and target, contain exactly the same object classes. In this work, we also explore the field of domain adaptation in open sets, which is a more realistic scenario where only a few categories of interest are shared between source and target data. Therefore, we propose a method that fits in both closed and open set scenarios. The approach learns a mapping from the source to the target domain by jointly solving an assignment problem that labels those target instances that potentially belong to the categories of interest present in the source dataset. A thorough evaluation shows that our approach outperforms the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Busto_Open_Set_Domain_ICCV_2017_paper.pdf", @@ -11410,7 +12134,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", - "aff_country_unique": "Unknown;" + "aff_country_unique": "Unknown;", + "bibtex": "@InProceedings{Busto_2017_ICCV,\n \n author = {\n Panareda Busto,\n Pau and Gall,\n Juergen\n},\n title = {\n Open Set Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Open Vocabulary Scene Parsing", @@ -11418,6 +12143,7 @@ "status": "Poster", "track": "main", "pid": "710", + "author_site": "Hang Zhao; Xavier Puig; Bolei Zhou; Sanja Fidler; Antonio Torralba", "author": "Hang Zhao; Xavier Puig; Bolei Zhou; Sanja Fidler; Antonio Torralba", "abstract": "Recognizing arbitrary objects in the wild has been a challenging problem due to the limitations of 
existing classification models and datasets. In this paper, we propose a new task that aims at parsing scenes with a large and open vocabulary, and several evaluation metrics are explored for this problem. Our approach is a joint image pixel and word concept embeddings framework, where word concepts are connected by semantic relations. We validate the open vocabulary prediction ability of our framework on ADE20K dataset which covers a wide variety of scenes and objects. We further explore the trained joint embedding space to show its interpretability.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhao_Open_Vocabulary_Scene_ICCV_2017_paper.pdf", @@ -11442,7 +12168,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Zhao_2017_ICCV,\n \n author = {\n Zhao,\n Hang and Puig,\n Xavier and Zhou,\n Bolei and Fidler,\n Sanja and Torralba,\n Antonio\n},\n title = {\n Open Vocabulary Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Optimal Transformation Estimation With Semantic Cues", @@ -11450,6 +12177,7 @@ "status": "Poster", "track": "main", "pid": "2359", + "author_site": "Danda Pani Paudel; Adlane Habed; Luc Van Gool", "author": "Danda Pani Paudel; Adlane Habed; Luc Van Gool", "abstract": "This paper addresses the problem of estimating the geometric transformation relating two distinct visual modalities (e.g. an image and a map, or a projective structure and a Euclidean 3D model) while relying only on semantic cues, such as semantically segmented regions or object bounding boxes. 
The proposed approach differs from the traditional feature-to-feature correspondence reasoning: starting from semantic regions on one side, we seek their possible corresponding regions on the other, thus constraining the sought geometric transformation. This entails a simultaneous search for the transformation and for the region-to-region correspondences.This paper is the first to derive the conditions that must be satisfied for a convex region, defined by control points, to be transformed inside an ellipsoid. These conditions are formulated as Linear Matrix Inequalities and used within a Branch-and-Prune search to obtain the globally optimal transformation. We tested our approach, under mild initial bound conditions, on two challenging registration problems for aligning: (i) a semantically segmented image and a map via a 2D homography; (ii) a projective 3D structure and its Euclidean counterpart.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Paudel_Optimal_Transformation_Estimation_ICCV_2017_paper.pdf", @@ -11474,7 +12202,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Switzerland;France" + "aff_country_unique": "Switzerland;France", + "bibtex": "@InProceedings{Paudel_2017_ICCV,\n \n author = {\n Pani Paudel,\n Danda and Habed,\n Adlane and Van Gool,\n Luc\n},\n title = {\n Optimal Transformation Estimation With Semantic Cues\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Orientation Invariant Feature Embedding and Spatial Temporal Regularization for Vehicle Re-Identification", @@ -11482,6 +12211,7 @@ "status": "Poster", "track": "main", "pid": "96", + "author_site": "Zhongdao Wang; Luming Tang; Xihui Liu; Zhuliang Yao; Shuai Yi; Jing Shao; Junjie Yan; Shengjin Wang; Hongsheng Li; Xiaogang Wang", "author": "Zhongdao Wang; Luming Tang; Xihui Liu; Zhuliang 
Yao; Shuai Yi; Jing Shao; Junjie Yan; Shengjin Wang; Hongsheng Li; Xiaogang Wang", "abstract": "In this paper, we tackle the vehicle Re-identification (ReID) problem which is of great importance in urban surveillance and can be used for multiple applications. In our vehicle ReID framework, an orientation invariant feature embedding module and a spatial-temporal regularization module are proposed. With orientation invariant feature embedding, local region features of different orientations can be extracted based on 20 key point locations and can be well aligned and combined. With spatial-temporal regularization, the log-normal distribution is adopted to model the spatial-temporal constraints and the retrieval results can be refined. Experiments are conducted on public vehicle ReID datasets and our proposed method achieves state-of-the-art performance. Investigations of the proposed framework is conducted, including the landmark regressor and comparisons with attention mechanism. Both the orientation invariant feature embedding and the spatio-temporal regularization achieve considerable improvements.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Orientation_Invariant_Feature_ICCV_2017_paper.pdf", @@ -11499,14 +12229,15 @@ "author_num": 10, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_Orientation_Invariant_Feature_ICCV_2017_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1;0;0;0;1;2;2", - "aff_unique_norm": "SenseTime Group Limited;Tsinghua University;Chinese University of Hong Kong", + "aff_unique_norm": "SenseTime Group Limited;Tsinghua University;The Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sensetime.com;https://www.tsinghua.edu.cn;https://www.cuhk.edu.hk", "aff_unique_abbr": "SenseTime;THU;CUHK", "aff_campus_unique_index": ";;;;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Zhongdao and Tang,\n Luming and Liu,\n Xihui and Yao,\n Zhuliang and Yi,\n Shuai and Shao,\n Jing and Yan,\n Junjie and Wang,\n Shengjin and Li,\n Hongsheng and Wang,\n Xiaogang\n},\n title = {\n Orientation Invariant Feature Embedding and Spatial Temporal Regularization for Vehicle Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "PPR-FCN: Weakly Supervised Visual Relation Detection via Parallel Pairwise R-FCN", @@ -11514,6 +12245,7 @@ "status": "Poster", "track": "main", "pid": "2006", + "author_site": "Hanwang Zhang; Zawlin Kyaw; Jinyang Yu; Shih-Fu Chang", "author": "Hanwang Zhang; Zawlin Kyaw; Jinyang Yu; Shih-Fu Chang", "abstract": "We aim to tackle a novel vision task called Weakly Supervised Visual Relation Detection (WSVRD) to detect \"subject-predicate-object\" relations in an image with object relation groundtruths available only at the image level. This is motivated by the fact that it is extremely expensive to label the combinatorial relations between objects at the instance level. Compared to the extensively studied problem, Weakly Supervised Object Detection (WSOD), WSVRD is more challenging as it needs to examine a large set of regions pairs, which is computationally prohibitive and more likely stuck in a local optimal solution such as those involving wrong spatial context. To this end, we present a Parallel, Pairwise Region-based, Fully Convolutional Network (PPR-FCN) for WSVRD. It uses a parallel FCN architecture that simultaneously performs pair selection and classification of single regions and region pairs for object and relation detection, while sharing almost all computation shared over the entire image. 
In particular, we propose a novel position-role-sensitive score map with pairwise RoI pooling to efficiently capture the crucial context associated with a pair of objects. We demonstrate the superiority of PPR-FCN over all baselines in solving the WSVRD challenge by using results of extensive experiments over two visual relation benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_PPR-FCN_Weakly_Supervised_ICCV_2017_paper.pdf", @@ -11538,7 +12270,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United States;Singapore" + "aff_country_unique": "United States;Singapore", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Hanwang and Kyaw,\n Zawlin and Yu,\n Jinyang and Chang,\n Shih-Fu\n},\n title = {\n PPR-FCN: Weakly Supervised Visual Relation Detection via Parallel Pairwise R-FCN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "PUnDA: Probabilistic Unsupervised Domain Adaptation for Knowledge Transfer Across Visual Categories", @@ -11546,6 +12279,7 @@ "status": "Poster", "track": "main", "pid": "1510", + "author_site": "Behnam Gholami; Ognjen (Oggi) Rudovic; Vladimir Pavlovic", "author": "Behnam Gholami; Ognjen (Oggi) Rudovic; Vladimir Pavlovic", "abstract": "This paper introduces a probabilistic latent variable model to address unsupervised domain adaptation problems. This is achieved by learning projections from each domain to a latent space along the classifier in the latent space to simultaneously minimizing a notion of domain disparity while maximizing a measure of discriminatory power. The non-parametric nature of our Latent variable model makes it possible to infer the latent space dimension automatically from data. We also develop a Variational Bayes (VB) algorithm for parameter estimation. 
We evaluate and contrast our proposed model against state-of-the-art methods for the task of visual domain adaptation using both handcrafted and deep net features. Our experiments show that even with a simple softmax classifier, our model can outperform several state-of-the-art methods taking advantage of more sophisticated classification schemes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gholami_PUnDA_Probabilistic_Unsupervised_ICCV_2017_paper.pdf", @@ -11560,7 +12294,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gholami_PUnDA_Probabilistic_Unsupervised_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gholami_PUnDA_Probabilistic_Unsupervised_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Gholami_2017_ICCV,\n \n author = {\n Gholami,\n Behnam and (Oggi) Rudovic,\n Ognjen and Pavlovic,\n Vladimir\n},\n title = {\n PUnDA: Probabilistic Unsupervised Domain Adaptation for Knowledge Transfer Across Visual Categories\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "PanNet: A Deep Network Architecture for Pan-Sharpening", @@ -11568,6 +12303,7 @@ "status": "Poster", "track": "main", "pid": "2864", + "author_site": "Junfeng Yang; Xueyang Fu; Yuwen Hu; Yue Huang; Xinghao Ding; John Paisley", "author": "Junfeng Yang; Xueyang Fu; Yuwen Hu; Yue Huang; Xinghao Ding; John Paisley", "abstract": "We propose a deep network architecture for the pan-sharpening problem called PanNet. We incorporate domain-specific knowledge to design our PanNet architecture by focusing on the two aims of the pan-sharpening problem: spectral and spatial preservation. For spectral preservation, we add up-sampled multispectral images to the network output, which directly propagates the spectral information to the reconstructed image. 
To preserve spatial structure, we train our network parameters in the high-pass filtering domain rather than the image domain. We show that the trained network generalizes well to images from different satellites without needing retraining. Experiments show significant improvement over state-of-the-art methods visually and in terms of standard quality metrics.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yang_PanNet_A_Deep_ICCV_2017_paper.pdf", @@ -11592,7 +12328,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2017_ICCV,\n \n author = {\n Yang,\n Junfeng and Fu,\n Xueyang and Hu,\n Yuwen and Huang,\n Yue and Ding,\n Xinghao and Paisley,\n John\n},\n title = {\n PanNet: A Deep Network Architecture for Pan-Sharpening\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Parallel Tracking and Verifying: A Framework for Real-Time and High Accuracy Visual Tracking", @@ -11600,6 +12337,7 @@ "status": "Poster", "track": "main", "pid": "3120", + "author_site": "Heng Fan; Haibin Ling", "author": "Heng Fan; Haibin Ling", "abstract": "Being intensively studied, visual tracking has seen great recent advances in either speed (e.g., with correlation filters) or accuracy (e.g., with deep features). Real-time and high accuracy tracking algorithms, however, remain scarce. In this paper we study the problem from a new perspective and present a novel parallel tracking and verifying (PTAV) framework, by taking advantage of the ubiquity of multi-thread techniques and borrowing from the success of parallel tracking and mapping in visual SLAM. Our PTAV framework typically consists of two components, a tracker T and a verifier V, working in parallel on two separate threads. 
The tracker T aims to provide a super real-time tracking inference and is expected to perform well most of the time; by contrast, the verifier V checks the tracking results and corrects T when needed. The key innovation is that, V does not work on every frame but only upon the requests from T; on the other end, T may adjust the tracking according to the feedback from V. With such collaboration, PTAV enjoys both the high efficiency provided by T and the strong discriminative power by V. In our extensive experiments on popular benchmarks including OTB2013, OTB2015, TC128 and UAV20L, PTAV achieves the best tracking accuracy among all real-time trackers, and in fact performs even better than many deep learning based solutions. Moreover, as a general framework, PTAV is very flexible and has great rooms for improvement and generalization.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Fan_Parallel_Tracking_and_ICCV_2017_paper.pdf", @@ -11624,7 +12362,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Philadelphia", "aff_country_unique_index": "0+1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fan_2017_ICCV,\n \n author = {\n Fan,\n Heng and Ling,\n Haibin\n},\n title = {\n Parallel Tracking and Verifying: A Framework for Real-Time and High Accuracy Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Parameter-Free Lens Distortion Calibration of Central Cameras", @@ -11632,10 +12371,11 @@ "status": "Poster", "track": "main", "pid": "1777", + "author_site": "Filippo Bergamasco; Luca Cosmo; Andrea Gasparetto; Andrea Albarelli; Andrea Torsello", "author": "Filippo Bergamasco; Luca Cosmo; Andrea Gasparetto; Andrea Albarelli; Andrea Torsello", "abstract": "At the core of many Computer Vision applications stands the need to define a mathematical 
model describing the imaging process. To this end, the pinhole model with radial distortion is probably the most commonly used, as it balances low complexity with a precision that is sufficient for most applications. On the other hand, unconstrained non-parametric models, despite being originally proposed to handle specialty cameras, have been shown to outperform the pinhole model, even with the simpler setups. Still, notwithstanding the higher accuracy, the inability of describing the imaging model by simple linear projective operators severely limits the use of standard algorithms with unconstrained models. In this paper we propose a parameter-free camera model where each imaging ray is constrained to a common optical center, forcing the camera to be central. Such model can be easily calibrated with a practical procedure which provides a convenient undistortion map that can be used to obtain a virtual pinhole camera. The proposed method can also be used to calibrate a stereo rig with a displacement map that simultaneously provides stereo rectification and corrects lens distortion.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bergamasco_Parameter-Free_Lens_Distortion_ICCV_2017_paper.pdf", - "aff": "Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca\u2019 Foscari - Venice, Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca\u2019 Foscari - Venice, Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca\u2019 Foscari - Venice, Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca\u2019 Foscari - Venice, Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca\u2019 Foscari - Venice, Italy", + "aff": "Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca’ Foscari - Venice, Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca’ Foscari - Venice, 
Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca’ Foscari - Venice, Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca’ Foscari - Venice, Italy; Dipartimento di Scienze Ambientali, Informatica e Statistica, Universit `a Ca’ Foscari - Venice, Italy", "project": "", "github": "", "supp": "", @@ -11649,14 +12389,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Bergamasco_Parameter-Free_Lens_Distortion_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Universit \u00e0 Ca\u2019 Foscari - Venice", + "aff_unique_norm": "Universit à Ca’ Foscari - Venice", "aff_unique_dep": "Dipartimento di Scienze Ambientali, Informatica e Statistica", "aff_unique_url": "https://www.unive.it", - "aff_unique_abbr": "Ca\u2019 Foscari", + "aff_unique_abbr": "Ca’ Foscari", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Venice", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Bergamasco_2017_ICCV,\n \n author = {\n Bergamasco,\n Filippo and Cosmo,\n Luca and Gasparetto,\n Andrea and Albarelli,\n Andrea and Torsello,\n Andrea\n},\n title = {\n Parameter-Free Lens Distortion Calibration of Central Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "PathTrack: Fast Trajectory Annotation With Path Supervision", @@ -11664,6 +12405,7 @@ "status": "Poster", "track": "main", "pid": "226", + "author_site": "Santiago Manen; Michael Gygli; Dengxin Dai; Luc Van Gool", "author": "Santiago Manen; Michael Gygli; Dengxin Dai; Luc Van Gool", "abstract": "Progress in Multiple Object Tracking (MOT) has been limited by the size of the available datasets. 
We present an efficient framework to annotate trajectories and use it to produce a MOT dataset of unprecedented size. A novel path supervision paradigm lets the annotator loosely track the object with a cursor while watching the video. This results in a path annotation for each object in the sequence. These path annotations, together with object detections, are fed into a two-step optimization to produce full bounding-box trajectories. Our experiments on existing datasets prove that our framework produces more accurate annotations than the state of the art and this in a fraction of the time. We further validate our approach by generating the PathTrack dataset, with more than 15,000 person trajectories in 720 sequences. We believe tracking approaches can benefit from a larger dataset like this one, just as was the case in object recognition. We show its potential by using it to re-train an off-the-shelf person matching network, originally trained on the MOT15 dataset, almost halving the misclassification rate. Additionally, training on our data consistently improves tracking results, both on our dataset and on MOT15. 
In the latter, where we improve the top-performing tracker (NOMT) dropping the number of ID Switches by 18% and fragments by 5%.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Manen_PathTrack_Fast_Trajectory_ICCV_2017_paper.pdf", @@ -11688,7 +12430,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Manen_2017_ICCV,\n \n author = {\n Manen,\n Santiago and Gygli,\n Michael and Dai,\n Dengxin and Van Gool,\n Luc\n},\n title = {\n PathTrack: Fast Trajectory Annotation With Path Supervision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Paying Attention to Descriptions Generated by Image Captioning Models", @@ -11696,6 +12439,7 @@ "status": "Poster", "track": "main", "pid": "1075", + "author_site": "Hamed R. Tavakoli; Rakshith Shetty; Ali Borji; Jorma Laaksonen", "author": "Hamed R. Tavakoli; Rakshith Shetty; Ali Borji; Jorma Laaksonen", "abstract": "To bridge the gap between humans and machines in image understanding and describing, we need further insight into how people describe a perceived scene. In this paper, we study the agreement between bottom-up saliency-based visual attention and object referrals in scene description constructs. We investigate the properties of human-written descriptions and machine-generated ones. We then propose a saliency-boosted image captioning model in order to investigate benefits from low-level cues in language models. 
We learn that (1) humans mention more salient objects earlier than less salient ones in their descriptions, (2) the better a captioning model performs, the better attention agreement it has with human descriptions, (3) the proposed saliency-boosted model, compared to its baseline form, does not improve significantly on the MS COCO database, indicating explicit bottom-up boosting does not help when the task is well learnt and tuned on a data, (4) a better generalization is, however, observed for the saliency-boosted model on unseen data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tavakoli_Paying_Attention_to_ICCV_2017_paper.pdf", @@ -11711,7 +12455,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tavakoli_Paying_Attention_to_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tavakoli_Paying_Attention_to_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Tavakoli_2017_ICCV,\n \n author = {\n Tavakoli,\n Hamed R. and Shetty,\n Rakshith and Borji,\n Ali and Laaksonen,\n Jorma\n},\n title = {\n Paying Attention to Descriptions Generated by Image Captioning Models\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Performance Guaranteed Network Acceleration via High-Order Residual Quantization", @@ -11719,6 +12464,7 @@ "status": "Poster", "track": "main", "pid": "1020", + "author_site": "Zefan Li; Bingbing Ni; Wenjun Zhang; Xiaokang Yang; Wen Gao", "author": "Zefan Li; Bingbing Ni; Wenjun Zhang; Xiaokang Yang; Wen Gao", "abstract": "Input binarization has shown to be an effective way for network acceleration. However, previous binarization scheme could be regarded as simple pixel-wise thresholding operations (i.e., order-one approximation) and suffers a big accuracy loss. 
In this paper, we propose a high-order binarization scheme, which achieves more accurate approximation while still possesses the advantage of binary operation. In particular, the proposed scheme recursively performs residual quantization and yields a series of binary input images with decreasing magnitude scales. Accordingly, we propose high-order binary filtering and gradient propagation operations for both forward and backward computations. Theoretical analysis shows approximation error guarantee property of proposed method. Extensive experimental results demonstrate that the proposed scheme yields great recognition accuracy while being accelerated.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Performance_Guaranteed_Network_ICCV_2017_paper.pdf", @@ -11743,7 +12489,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Zefan and Ni,\n Bingbing and Zhang,\n Wenjun and Yang,\n Xiaokang and Gao,\n Wen\n},\n title = {\n Performance Guaranteed Network Acceleration via High-Order Residual Quantization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Personalized Cinemagraphs Using Semantic Understanding and Collaborative Learning", @@ -11751,6 +12498,7 @@ "status": "Poster", "track": "main", "pid": "2176", + "author_site": "Tae-Hyun Oh; Kyungdon Joo; Neel Joshi; Baoyuan Wang; In So Kweon; Sing Bing Kang", "author": "Tae-Hyun Oh; Kyungdon Joo; Neel Joshi; Baoyuan Wang; In So Kweon; Sing Bing Kang", "abstract": "Cinemagraphs are a compelling way to convey dynamic aspects of a scene. In these media, dynamic and still elements are juxtaposed to create an artistic and narrative experience. 
Creating a high-quality, aesthetically pleasing cinemagraph requires isolating objects in a semantically meaningful way and then selecting good start times and looping periods for those objects to minimize visual artifacts (such a tearing). To achieve this, we present a new technique that uses object recognition and semantic segmentation as part of an optimization method to automatically create cinemagraphs from videos that are both visually appealing and semantically meaningful. Given a scene with multiple objects, there are many cinemagraphs one could create. Our method evaluates these multiple candidates and presents the best one, as determined by a model trained to predict human preferences in a collaborative way. We demonstrate the effectiveness of our approach with multiple results and a user study.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Oh_Personalized_Cinemagraphs_Using_ICCV_2017_paper.pdf", @@ -11766,7 +12514,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Oh_Personalized_Cinemagraphs_Using_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Oh_Personalized_Cinemagraphs_Using_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Oh_2017_ICCV,\n \n author = {\n Oh,\n Tae-Hyun and Joo,\n Kyungdon and Joshi,\n Neel and Wang,\n Baoyuan and So Kweon,\n In and Bing Kang,\n Sing\n},\n title = {\n Personalized Cinemagraphs Using Semantic Understanding and Collaborative Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Personalized Image Aesthetics", @@ -11774,7 +12523,7 @@ "status": "Poster", "track": "main", "pid": "244", - "author_site": "Jian Ren; Xiaohui Shen; Zhe Lin; Radom\u00c3\u00adr M\u00c4\u009bch; David J. Foran", + "author_site": "Jian Ren; Xiaohui Shen; Zhe Lin; Radomír Měch; David J. 
Foran", "author": "Jian Ren; Xiaohui Shen; Zhe Lin; Radomir Mech; David J. Foran", "abstract": "Automatic image aesthetics rating has received a growing interest with the recent breakthrough in deep learning. Although many studies exist for learning a generic or universal aesthetics model, investigation of aesthetics models incorporating individual user's preference is quite limited. We address this personalized aesthetics problem by showing that individual's aesthetic preferences exhibit strong correlations with content and aesthetic attributes, and hence the deviation of individual's perception from generic image aesthetics is predictable. To accommodate our study, we first collect two distinct datasets, a large image dataset from Flickr and annotated by Amazon Mechanical Turk, and a small dataset of real personal albums rated by owners. We then propose a new approach to personalized aesthetics learning that can be trained even with a small set of annotated images from a user. The approach is based on a residual-based model adaptation scheme which learns an offset to compensate for the generic aesthetics score. Finally, we introduce an active learning algorithm to optimize personalized aesthetics prediction for real-world application scenarios. 
Experiments demonstrate that our approach can effectively learn personalized aesthetics preferences, and outperforms existing methods on quantitative comparisons.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ren_Personalized_Image_Aesthetics_ICCV_2017_paper.pdf", @@ -11799,7 +12548,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ren_2017_ICCV,\n \n author = {\n Ren,\n Jian and Shen,\n Xiaohui and Lin,\n Zhe and Mech,\n Radomir and Foran,\n David J.\n},\n title = {\n Personalized Image Aesthetics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Photographic Image Synthesis With Cascaded Refinement Networks", @@ -11807,6 +12557,7 @@ "status": "Oral", "track": "main", "pid": "253", + "author_site": "Qifeng Chen; Vladlen Koltun", "author": "Qifeng Chen; Vladlen Koltun", "abstract": "We present an approach to synthesizing photographic images conditioned on semantic layouts. Given a semantic label map, our approach produces an image with photographic appearance that conforms to the input layout. The approach thus functions as a rendering engine that takes a two-dimensional semantic specification of the scene and produces a corresponding photographic image. Unlike recent and contemporaneous work, our approach does not rely on adversarial training. We show that photographic images can be synthesized from semantic layouts by a single feedforward network with appropriate structure, trained end-to-end with a direct regression objective. The presented approach scales seamlessly to high resolutions; we demonstrate this by synthesizing photographic images at 2-megapixel resolution, the full resolution of our training data. 
Extensive perceptual experiments on datasets of outdoor and indoor scenes demonstrate that images synthesized by the presented approach are considerably more realistic than alternative approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Photographic_Image_Synthesis_ICCV_2017_paper.pdf", @@ -11824,14 +12575,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Photographic_Image_Synthesis_ICCV_2017_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Intel;Stanford University", + "aff_unique_norm": "Intel Corporation;Stanford University", "aff_unique_dep": "Intel Labs;", "aff_unique_url": "https://www.intel.com;https://www.stanford.edu", "aff_unique_abbr": "Intel;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Qifeng and Koltun,\n Vladlen\n},\n title = {\n Photographic Image Synthesis With Cascaded Refinement Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Phrase Localization and Visual Relationship Detection With Comprehensive Image-Language Cues", @@ -11839,6 +12591,7 @@ "status": "Poster", "track": "main", "pid": "784", + "author_site": "Bryan A. Plummer; Arun Mallya; Christopher M. Cervantes; Julia Hockenmaier; Svetlana Lazebnik", "author": "Bryan A. Plummer; Arun Mallya; Christopher M. Cervantes; Julia Hockenmaier; Svetlana Lazebnik", "abstract": "This paper presents a framework for localization or grounding of phrases in images using a large collection of linguistic and visual cues. 
We model the appearance, size, and position of entity bounding boxes, adjectives that contain attribute information, and spatial relationships between pairs of entities connected by verbs or prepositions. Special attention is given to relationships between people and clothing or body part mentions, as they are useful for distinguishing individuals. We automatically learn weights for combining these cues and at test time, perform joint inference over all phrases in a caption. The resulting system produces state of the art performance on phrase localization on the Flickr30k Entities dataset and visual relationship detection on the Stanford VRD dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Plummer_Phrase_Localization_and_ICCV_2017_paper.pdf", @@ -11856,14 +12609,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Plummer_Phrase_Localization_and_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Plummer_2017_ICCV,\n \n author = {\n Plummer,\n Bryan A. and Mallya,\n Arun and Cervantes,\n Christopher M. 
and Hockenmaier,\n Julia and Lazebnik,\n Svetlana\n},\n title = {\n Phrase Localization and Visual Relationship Detection With Comprehensive Image-Language Cues\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Pixel Recursive Super Resolution", @@ -11871,6 +12625,7 @@ "status": "Poster", "track": "main", "pid": "2837", + "author_site": "Ryan Dahl; Mohammad Norouzi; Jonathon Shlens", "author": "Ryan Dahl; Mohammad Norouzi; Jonathon Shlens", "abstract": "Super resolution is the problem of artificially enlarging a low resolution photograph to recover a plausible high resolution version. In the regime of high magnification factors, the problem is dramatically underspecified and many plausible, high resolution images may match a given low resolution image. In particular, traditional super resolution techniques fail in this regime due to the multimodality of the problem and strong prior information that must be imposed on image synthesis to produce plausible high resolution images. In this work we propose a new probabilistic deep network architecture, a pixel recursive super resolution model, that is an extension of PixelCNNs to address this problem. We demonstrate that this model produces a diversity of plausible high resolution images at large magnification factors. Furthermore, in human evaluation studies we demonstrate how previous methods fail to fool human observers. 
However, high resolution images sampled from this probabilistic deep network do fool a naive human observer a significant fraction of the time.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dahl_Pixel_Recursive_Super_ICCV_2017_paper.pdf", @@ -11895,7 +12650,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dahl_2017_ICCV,\n \n author = {\n Dahl,\n Ryan and Norouzi,\n Mohammad and Shlens,\n Jonathon\n},\n title = {\n Pixel Recursive Super Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Pixel-Level Matching for Video Object Segmentation Using Convolutional Neural Networks", @@ -11903,6 +12659,7 @@ "status": "Poster", "track": "main", "pid": "760", + "author_site": "Jae Shin Yoon; Francois Rameau; Junsik Kim; Seokju Lee; Seunghak Shin; In So Kweon", "author": "Jae Shin Yoon; Francois Rameau; Junsik Kim; Seokju Lee; Seunghak Shin; In So Kweon", "abstract": "We propose a novel video object segmentation algorithm based on pixel-level matching using Convolutional Neural Networks (CNN). Our network aims to distinguish the target area from the background on the basis of the pixel-level similarity between two object units. The proposed network represents a target object using features from different depth layers in order to take advantage of both the spatial details and the category-level semantic information. Furthermore, we propose a feature compression technique that drastically reduces the memory requirements while maintaining the capability of feature representation. 
Two-stage training (pre-training and fine-tuning) allows our network to handle any target object regardless of its category (even if the object's type does not belong to the pre-training data) or of variations in its appearance through a video sequence. Experiments on large datasets demonstrate the effectiveness of our model - against related methods - in terms of accuracy, speed, and stability. Finally, we introduce the transferability of our network to different domains, such as the infrared data domain.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yoon_Pixel-Level_Matching_for_ICCV_2017_paper.pdf", @@ -11917,7 +12674,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yoon_Pixel-Level_Matching_for_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yoon_Pixel-Level_Matching_for_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Yoon_2017_ICCV,\n \n author = {\n Shin Yoon,\n Jae and Rameau,\n Francois and Kim,\n Junsik and Lee,\n Seokju and Shin,\n Seunghak and So Kweon,\n In\n},\n title = {\n Pixel-Level Matching for Video Object Segmentation Using Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Playing for Benchmarks", @@ -11925,6 +12683,7 @@ "status": "Spotlight", "track": "main", "pid": "257", + "author_site": "Stephan R. Richter; Zeeshan Hayder; Vladlen Koltun", "author": "Stephan R. Richter; Zeeshan Hayder; Vladlen Koltun", "abstract": "We present a benchmark suite for visual perception. The benchmark is based on more than 250K high-resolution video frames, all annotated with ground-truth data for both low-level and high-level vision tasks, including optical flow, semantic instance segmentation, object detection and tracking, object-level 3D scene layout, and visual odometry. 
Ground-truth data for all tasks is available for every frame. The data was collected while driving, riding, and walking a total of 184 kilometers in diverse ambient conditions in a realistic virtual world. To create the benchmark, we have developed a new approach to collecting ground-truth data from simulated worlds without access to their source code or content. We conduct statistical analyses that show that the composition of the scenes in the benchmark closely matches the composition of corresponding physical environments. The realism of the collected data is further validated via perceptual experiments. We analyze the performance of state-of-the-art methods for multiple tasks, providing reference baselines and highlighting challenges for future research.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Richter_Playing_for_Benchmarks_ICCV_2017_paper.pdf", @@ -11940,7 +12699,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Richter_Playing_for_Benchmarks_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Richter_Playing_for_Benchmarks_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Richter_2017_ICCV,\n \n author = {\n Richter,\n Stephan R. and Hayder,\n Zeeshan and Koltun,\n Vladlen\n},\n title = {\n Playing for Benchmarks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Point Set Registration With Global-Local Correspondence and Transformation Estimation", @@ -11948,10 +12708,11 @@ "status": "Poster", "track": "main", "pid": "1273", + "author_site": "Su Zhang; Yang Yang; Kun Yang; Yi Luo; Sim-Heng Ong", "author": "Su Zhang; Yang Yang; Kun Yang; Yi Luo; Sim-Heng Ong", "abstract": "We present a new point set registration method with global-local correspondence and transformation estimation (GL-CATE). 
The geometric structures of point sets are exploited by combining the global feature, the point-to-point Euclidean distance, with the local feature, the shape distance (SD) which is based on the histograms generated by an elliptical Gaussian soft count strategy. By using a bi-directional deterministic annealing scheme to directly control the searching ranges of the two features, the mixture-feature Gaussian mixture model (MGMM) is constructed to recover the correspondences of point sets. A new vector based structure constraint term is formulated to regularize the transformation. The accuracy of transformation updating is improved by constraining spatial structure at both global and local scales. An annealing scheme is applied to progressively decrease the strength of the regularization and to achieve the maximum overlap. Both of the aforementioned processes are incorporated in the EM algorithm, an unified optimization framework. We test the performances of our GL-CATE in contour registration, sequence images, real images, medical images, fingerprint images and remote sensing images, and compare with eight state-of-the-art methods where our method shows favorable performances in most scenarios.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Point_Set_Registration_ICCV_2017_paper.pdf", - "aff": "School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in 
Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; Department of Electrical and Computer Engineering, National University of Singapore", + "aff": "School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + The Engineering Research Center of GIS Technology in Western China, Ministry of Education, China + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; Department of Electrical and Computer Engineering, National University of Singapore", "project": "", "github": "", "supp": "", @@ -11964,15 +12725,16 @@ "email": "163.com;163.com; ; ; ", "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Point_Set_Registration_ICCV_2017_paper.html", - "aff_unique_index": "0+1+0;0+1+0;0+1+0;0+1+0;2", - "aff_unique_norm": "Yunnan Normal University;Engineering Research Center of GIS 
Technology in Western China;National University of Singapore", - "aff_unique_dep": "School of Information Science and Technology;Ministry of Education;Department of Electrical and Computer Engineering", - "aff_unique_url": "http://www.ynnu.edu.cn;;https://www.nus.edu.sg", - "aff_unique_abbr": ";;NUS", + "aff_unique_index": "1+2;1+2;1+2;1+2;3", + "aff_unique_norm": ";The Engineering Research Center of GIS Technology in Western China;Yunnan Normal University;National University of Singapore", + "aff_unique_dep": ";Ministry of Education;Laboratory of Pattern Recognition and Artificial Intelligence;Department of Electrical and Computer Engineering", + "aff_unique_url": ";;http://www.ynnu.edu.cn;https://www.nus.edu.sg", + "aff_unique_abbr": ";;;NUS", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", - "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique_index": "1+1;1+1;1+1;1+1;2", + "aff_country_unique": ";China;Singapore", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Su and Yang,\n Yang and Yang,\n Kun and Luo,\n Yi and Ong,\n Sim-Heng\n},\n title = {\n Point Set Registration With Global-Local Correspondence and Transformation Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "PolyFit: Polygonal Surface Reconstruction From Point Clouds", @@ -11980,6 +12742,7 @@ "status": "Poster", "track": "main", "pid": "1127", + "author_site": "Liangliang Nan; Peter Wonka", "author": "Liangliang Nan; Peter Wonka", "abstract": "We propose a novel framework for reconstructing lightweight polygonal surfaces from point clouds. 
Unlike traditional methods that focus on either extracting good geometric primitives or obtaining proper arrangements of primitives, the emphasis of this work lies in intersecting the primitives (planes only) and seeking for an appropriate combination of them to obtain a manifold polygonal surface model without boundary. We show that reconstruction from point clouds can be cast as a binary labeling problem. Our method is based on a hypothesizing and selection strategy. We first generate a reasonably large set of face candidates by intersecting the extracted planar primitives. Then an optimal subset of the candidate faces is selected through optimization. Our optimization is based on a binary linear programming formulation under hard constraints that enforce the final polygonal surface model to be manifold and watertight. Experiments on point clouds from various sources demonstrate that our method can generate lightweight polygonal surface models of arbitrary piecewise planar objects. Besides, our method is capable of recovering sharp features and is robust to noise, outliers, and missing data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nan_PolyFit_Polygonal_Surface_ICCV_2017_paper.pdf", @@ -12004,7 +12767,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Nan_2017_ICCV,\n \n author = {\n Nan,\n Liangliang and Wonka,\n Peter\n},\n title = {\n PolyFit: Polygonal Surface Reconstruction From Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Polynomial Solvers for Saturated Ideals", @@ -12012,7 +12776,7 @@ "status": "Poster", "track": "main", "pid": "935", - "author_site": "Viktor Larsson; Kalle \u00c3\u0085str\u00c3\u00b6m; Magnus Oskarsson", + "author_site": "Viktor 
Larsson; Kalle Åström; Magnus Oskarsson", "author": "Viktor Larsson; Kalle Astrom; Magnus Oskarsson", "abstract": "In this paper we present a new method for creating polynomial solvers for problems where a (possibly infinite) subset of the solutions are undesirable or uninteresting. These solutions typically arise from simplifications made during modeling, but can also come from degeneracies which are inherent to the geometry of the original problem. The proposed approach extends the standard action matrix method to saturated ideals. This allows us to add constraints that some polynomials should be non-zero on the solutions. This does not only offer the possibility of improved performance by removing superfluous solutions, but makes a larger class of problems tractable. Previously, problems with infinitely many solutions could not be solved directly using the action matrix method as it requires a zero-dimensional ideal. In contrast we only require that after removing the unwanted solutions only finitely many remain. 
We evaluate our method on three applications, optimal triangulation, time-of-arrival self-calibration and optimal vanishing point estimation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Larsson_Polynomial_Solvers_for_ICCV_2017_paper.pdf", @@ -12037,7 +12801,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lund", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Larsson_2017_ICCV,\n \n author = {\n Larsson,\n Viktor and Astrom,\n Kalle and Oskarsson,\n Magnus\n},\n title = {\n Polynomial Solvers for Saturated Ideals\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Pose Guided RGBD Feature Learning for 3D Object Pose Estimation", @@ -12045,6 +12810,7 @@ "status": "Poster", "track": "main", "pid": "1802", + "author_site": "Vassileios Balntas; Andreas Doumanoglou; Caner Sahin; Juil Sock; Rigas Kouskouridas; Tae-Kyun Kim", "author": "Vassileios Balntas; Andreas Doumanoglou; Caner Sahin; Juil Sock; Rigas Kouskouridas; Tae-Kyun Kim", "abstract": "In this paper we examine the effects of using object poses as guidance to learning robust features for 3D object pose estimation. Previous works have focused on learning feature embeddings based on metric learning with triplet comparisons and rely only on the qualitative distinction of similar and dissimilar pose labels. In contrast, we consider the exact pose differences between the training samples, and aim to learn embeddings such that the distances in the pose label space are proportional to the distances in the feature space. However, since it is less desirable to force the pose-feature correlation when objects are symmetric, we propose the data-driven weights that reflect object symmetry when measuring the pose distances. 
Furthermore, end-to-end pose regression is investigated and is shown to further boost the discriminative power of feature learning, improving pose recognition accuracies in NN, and thus can be used as another pose guidance to feature learning. Experimental results show that the features guided by poses, are significantly more discriminative than the ones learned in the traditional way, outperforming state-of-the-art works. Finally, we measure the generalisation capacities of pose guided feature learning in previously unseen scenes containing objects under different occlusion levels, and we show that it adapts well to novel tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Balntas_Pose_Guided_RGBD_ICCV_2017_paper.pdf", @@ -12069,7 +12835,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Balntas_2017_ICCV,\n \n author = {\n Balntas,\n Vassileios and Doumanoglou,\n Andreas and Sahin,\n Caner and Sock,\n Juil and Kouskouridas,\n Rigas and Kim,\n Tae-Kyun\n},\n title = {\n Pose Guided RGBD Feature Learning for 3D Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Pose-Driven Deep Convolutional Model for Person Re-Identification", @@ -12077,6 +12844,7 @@ "status": "Poster", "track": "main", "pid": "1735", + "author_site": "Chi Su; Jianing Li; Shiliang Zhang; Junliang Xing; Wen Gao; Qi Tian", "author": "Chi Su; Jianing Li; Shiliang Zhang; Junliang Xing; Wen Gao; Qi Tian", "abstract": "Feature extraction and matching are two crucial components in person Re-Identification (ReID). 
The large pose deformations and the complex view variations exhibited by the captured person images significantly increase the difficulty of learning and matching of the features from person images. To overcome these difficulties, in this work we propose a Pose-driven Deep Convolutional (PDC) model to learn improved feature extraction and matching models from end to end. Our deep architecture explicitly leverages the human part cues to alleviate the pose variations and learn robust feature representations from both the global image and different local parts. To match the features from global human body and local body parts, a pose driven feature weighting sub-network is further designed to learn adaptive feature fusions. Extensive experimental analyses and results on three popular datasets demonstrate significant performance improvements of our model over all published stateof- the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Su_Pose-Driven_Deep_Convolutional_ICCV_2017_paper.pdf", @@ -12091,7 +12859,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Su_Pose-Driven_Deep_Convolutional_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Su_Pose-Driven_Deep_Convolutional_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Su_2017_ICCV,\n \n author = {\n Su,\n Chi and Li,\n Jianing and Zhang,\n Shiliang and Xing,\n Junliang and Gao,\n Wen and Tian,\n Qi\n},\n title = {\n Pose-Driven Deep Convolutional Model for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Pose-Invariant Face Alignment With a Single CNN", @@ -12099,6 +12868,7 @@ "status": "Poster", "track": "main", "pid": "1488", + "author_site": "Amin Jourabloo; Mao Ye; Xiaoming Liu; Liu Ren", "author": "Amin Jourabloo; Mao Ye; Xiaoming Liu; Liu 
Ren", "abstract": "Face alignment has witnessed substantial progress in the last decade. One of the recent focuses has been aligning a dense 3D face shape to face images with large head poses. The dominant technology used is based on the cascade of regressors, e.g., CNNs, which has shown promising results. Nonetheless, the cascade of CNNs suffers from several drawbacks, e.g., lack of end-to-end training, hand-crafted features and slow training speed. To address these issues, we propose a new layer, named visualization layer, which can be integrated into the CNN architecture and enables joint optimization with different loss functions. Extensive evaluation of the proposed method on multiple datasets demonstrates state-of-the-art accuracy, while reducing the training time by more than half compared to the typical cascade of CNNs. In addition, we compare across multiple CNN architectures, all with the visualization layer, to further demonstrate the advantage of its utilization.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jourabloo_Pose-Invariant_Face_Alignment_ICCV_2017_paper.pdf", @@ -12113,7 +12883,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jourabloo_Pose-Invariant_Face_Alignment_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jourabloo_Pose-Invariant_Face_Alignment_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Jourabloo_2017_ICCV,\n \n author = {\n Jourabloo,\n Amin and Ye,\n Mao and Liu,\n Xiaoming and Ren,\n Liu\n},\n title = {\n Pose-Invariant Face Alignment With a Single CNN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Practical Projective Structure From Motion (P2SfM)", @@ -12121,6 +12892,7 @@ "status": "Oral", "track": "main", "pid": "532", + "author_site": "Ludovic Magerand; Alessio Del Bue", "author": 
"Ludovic Magerand; Alessio Del Bue", "abstract": "This paper presents a solution to the Projective Structure from Motion (PSfM) problem able to deal efficiently with missing data, outliers and, for the first time, large scale 3D reconstruction scenarios. By embedding the projective depths into the projective parameters of the points and views, we decrease the number of unknowns to estimate and improve computational speed by optimizing standard linear Least Squares systems instead of homogeneous ones. In order to do so, we show that an extension of the linear constraints from the Generalized Projective Reconstruction Theorem can be transferred to the projective parameters, ensuring also a valid projective reconstruction in the process. We use an incremental approach that, starting from a solvable sub-problem, incrementally adds views and points until completion with a robust, outliers free, procedure. Experiments with simulated data shows that our approach is performing well, both in term of the quality of the reconstruction and the capacity to handle missing data and outliers with a reduced computational time. 
Finally, results on real datasets shows the ability of the method to be used in medium and large scale 3D reconstruction scenarios with high ratios of missing data (up to 98%).", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Magerand_Practical_Projective_Structure_ICCV_2017_paper.pdf", @@ -12136,7 +12908,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Magerand_Practical_Projective_Structure_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Magerand_Practical_Projective_Structure_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Magerand_2017_ICCV,\n \n author = {\n Magerand,\n Ludovic and Del Bue,\n Alessio\n},\n title = {\n Practical Projective Structure From Motion (P2SfM)\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Practical and Efficient Multi-View Matching", @@ -12144,6 +12917,7 @@ "status": "Spotlight", "track": "main", "pid": "1260", + "author_site": "Eleonora Maset; Federica Arrigoni; Andrea Fusiello", "author": "Eleonora Maset; Federica Arrigoni; Andrea Fusiello", "abstract": "In this paper we propose a novel solution to the multi-view matching problem that, given a set of noisy pairwise correspondences, jointly updates them so as to maximize their consistency. Our method is based on a spectral decomposition, resulting in a closed-form efficient algorithm, in contrast to other iterative techniques that can be found in the literature. Experiments on both synthetic and real datasets show that our method achieves comparable or superior accuracy to state-of-the-art algorithms in significantly less time. 
We also demonstrate that our solution can efficiently handle datasets of hundreds of images, which is unprecedented in the literature.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Maset_Practical_and_Efficient_ICCV_2017_paper.pdf", @@ -12168,7 +12942,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Maset_2017_ICCV,\n \n author = {\n Maset,\n Eleonora and Arrigoni,\n Federica and Fusiello,\n Andrea\n},\n title = {\n Practical and Efficient Multi-View Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Predicting Deeper Into the Future of Semantic Segmentation", @@ -12176,10 +12951,11 @@ "status": "Poster", "track": "main", "pid": "270", + "author_site": "Pauline Luc; Natalia Neverova; Camille Couprie; Jakob Verbeek; Yann LeCun", "author": "Pauline Luc; Natalia Neverova; Camille Couprie; Jakob Verbeek; Yann LeCun", "abstract": "The ability to predict and therefore to anticipate the future is an important attribute of intelligence. It is also of utmost importance in real-time systems, e.g . in robotics or autonomous driving, which depend on visual scene understanding for decision making. While prediction of the raw RGB pixel values in future video frames has been studied in previous work, here we introduce the novel task of predicting semantic segmentations of future frames. Given a sequence of video frames, our goal is to predict segmentation maps of not yet observed video frames that lie up to a second or further in the future. We develop an autoregressive convolutional neural network that learns to iteratively generate multiple frames. 
Our results on the Cityscapes dataset show that directly predicting future segmentations is substantially better than predicting and then segmenting future RGB frames. Prediction results up to half a second in the future are visually convincing and are much more accurate than those of a baseline based on warping semantic segmentations using optical flow.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Luc_Predicting_Deeper_Into_ICCV_2017_paper.pdf", - "aff": "Facebook AI Research+Inria Grenoble, Laboratoire Jean Kuntzmann, Universit \u00b4e Grenoble Alpes; Facebook AI Research; Facebook AI Research; Inria Grenoble, Laboratoire Jean Kuntzmann, Universit \u00b4e Grenoble Alpes; Facebook AI Research+New York University", + "aff": "Facebook AI Research+Inria Grenoble, Laboratoire Jean Kuntzmann, Universit ´e Grenoble Alpes; Facebook AI Research; Facebook AI Research; Inria Grenoble, Laboratoire Jean Kuntzmann, Universit ´e Grenoble Alpes; Facebook AI Research+New York University", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Luc_Predicting_Deeper_Into_ICCV_2017_supplemental.pdf", @@ -12193,14 +12969,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Luc_Predicting_Deeper_Into_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;0;1;0+2", - "aff_unique_norm": "Meta;INRIA Grenoble;New York University", + "aff_unique_norm": "Facebook;Inria Grenoble;New York University", "aff_unique_dep": "Facebook AI Research;Laboratoire Jean Kuntzmann;", "aff_unique_url": "https://research.facebook.com;https://www.inria.fr/grenoble;https://www.nyu.edu", "aff_unique_abbr": "FAIR;Inria;NYU", "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Grenoble", "aff_country_unique_index": "0+1;0;0;1;0+0", - "aff_country_unique": "United States;France" + "aff_country_unique": "United States;France", + "bibtex": "@InProceedings{Luc_2017_ICCV,\n \n author = {\n Luc,\n Pauline and Neverova,\n Natalia 
and Couprie,\n Camille and Verbeek,\n Jakob and LeCun,\n Yann\n},\n title = {\n Predicting Deeper Into the Future of Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Predicting Human Activities Using Stochastic Grammar", @@ -12208,10 +12985,11 @@ "status": "Poster", "track": "main", "pid": "566", + "author_site": "Siyuan Qi; Siyuan Huang; Ping Wei; Song-Chun Zhu", "author": "Siyuan Qi; Siyuan Huang; Ping Wei; Song-Chun Zhu", "abstract": "This paper presents a novel method to predict future human activities from partially observed RGB-D videos. Human activity prediction is generally difficult due to its non-Markovian property and the rich context between human and environments. We use a stochastic grammar model to capture the compositional structure of events, integrating human actions, objects, and their affordances. We represent the event by a spatial-temporal And-Or graph (ST-AOG). The ST-AOG is composed of a temporal stochastic grammar defined on sub-activities, and spatial graphs representing sub-activities that consist of human actions, objects, and their affordances. Future sub-activities are predicted using the temporal grammar and Earley parsing algorithm. The corresponding action, object, and affordance labels are then inferred accordingly. 
Extensive experiments are conducted to show the effectiveness of our model on both semantic event parsing and future activity prediction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Qi_Predicting_Human_Activities_ICCV_2017_paper.pdf", - "aff": "University of California, Los Angeles, USA; University of California, Los Angeles, USA; Xi\u2019an Jiaotong University, Xi\u2019an, China + University of California, Los Angeles, USA; University of California, Los Angeles, USA", + "aff": "University of California, Los Angeles, USA; University of California, Los Angeles, USA; Xi’an Jiaotong University, Xi’an, China + University of California, Los Angeles, USA; University of California, Los Angeles, USA", "project": "", "github": "", "supp": "", @@ -12225,14 +13003,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Qi_Predicting_Human_Activities_ICCV_2017_paper.html", "aff_unique_index": "0;0;1+0;0", - "aff_unique_norm": "University of California, Los Angeles;Xi'an Jiao Tong University", + "aff_unique_norm": "University of California, Los Angeles;Xi'an Jiaotong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.xjtu.edu.cn", "aff_unique_abbr": "UCLA;XJTU", "aff_campus_unique_index": "0;0;1+0;0", "aff_campus_unique": "Los Angeles;Xi'an", "aff_country_unique_index": "0;0;1+0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Qi_2017_ICCV,\n \n author = {\n Qi,\n Siyuan and Huang,\n Siyuan and Wei,\n Ping and Zhu,\n Song-Chun\n},\n title = {\n Predicting Human Activities Using Stochastic Grammar\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Predicting Visual Exemplars of Unseen Classes for Zero-Shot Learning", @@ -12240,6 +13019,7 @@ "status": "Poster", "track": "main", "pid": "1472", + "author_site": 
"Soravit Changpinyo; Wei-Lun Chao; Fei Sha", "author": "Soravit Changpinyo; Wei-Lun Chao; Fei Sha", "abstract": "Leveraging class semantic descriptions and examples of known objects, zero-shot learning makes it possible to train a recognition model for an object class whose examples are not available. In this paper, we propose a novel zero-shot learning model that takes advantage of clustering structures in the semantic embedding space. The key idea is to impose the structural constraint that semantic representations must be predictive of the locations of their corresponding visual exemplars. To this end, this reduces to training multiple kernel-based regressors from semantic representation-exemplar pairs from labeled data of the seen object categories. Despite its simplicity, our approach significantly outperforms existing zero-shot learning methods in three out of four benchmark datasets, including the ImageNet dataset with more than 20,000 unseen categories.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Changpinyo_Predicting_Visual_Exemplars_ICCV_2017_paper.pdf", @@ -12264,7 +13044,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Changpinyo_2017_ICCV,\n \n author = {\n Changpinyo,\n Soravit and Chao,\n Wei-Lun and Sha,\n Fei\n},\n title = {\n Predicting Visual Exemplars of Unseen Classes for Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Predictor Combination at Test Time", @@ -12272,6 +13053,7 @@ "status": "Poster", "track": "main", "pid": "1370", + "author_site": "Kwang In Kim; James Tompkin; Christian Richardt", "author": "Kwang In Kim; James Tompkin; Christian Richardt", "abstract": "We present an algorithm for test-time combination of 
a set of reference predictors with unknown parametric forms. Existing multi-task and transfer learning algorithms focus on training-time transfer and combination, where the parametric forms of predictors are known and shared. However, when the parametric form of a predictor is unknown, e.g., for a human predictor or a predictor in a precompiled library, existing algorithms are not applicable. Instead, we empirically evaluate predictors on sampled data points to measure distances between different predictors. This embeds the set of reference predictors into a Riemannian manifold, upon which we perform manifold denoising to obtain the refined predictor. This allows our approach to make no assumptions about the underlying predictor forms. Our test-time combination algorithm equals or outperforms existing multi-task and transfer learning algorithms on challenging real-world datasets, without introducing specific model assumptions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kim_Predictor_Combination_at_ICCV_2017_paper.pdf", @@ -12296,7 +13078,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Kim_2017_ICCV,\n \n author = {\n In Kim,\n Kwang and Tompkin,\n James and Richardt,\n Christian\n},\n title = {\n Predictor Combination at Test Time\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Primary Video Object Segmentation via Complementary CNNs and Neighborhood Reversible Flow", @@ -12304,6 +13087,7 @@ "status": "Poster", "track": "main", "pid": "359", + "author_site": "Jia Li; Anlin Zheng; Xiaowu Chen; Bin Zhou", "author": "Jia Li; Anlin Zheng; Xiaowu Chen; Bin Zhou", "abstract": "This paper proposes a novel approach for segmenting primary video objects 
by using Complementary Convolutional Neural Networks (CCNN) and neighborhood reversible flow. The proposed approach first pre-trains CCNN on massive images with manually annotated salient objects in an end-to-end manner, and the trained CCNN has two separate branches that simultaneously handle two complementary tasks, i.e., foregroundness and backgroundness estimation. By applying CCNN on each video frame, the spatial foregroundness and backgroundness maps can be initialized, which are then propagated between various frames so as to segment primary video objects and suppress distractors. To enforce efficient temporal propagation, we divide each frame into superpixels and construct neighborhood reversible flow that reflects the most reliable temporal correspondences between superpixels in far-away frames. Within such flow, the initialized foregroundness and backgroundness can be efficiently and accurately propagated along the temporal axis so that primary video objects gradually pop-out and distractors are well suppressed. 
Extensive experimental results on three video datasets show that the proposed approach achieves impressive performance in comparisons with 18 state-of-the-art models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Primary_Video_Object_ICCV_2017_paper.pdf", @@ -12328,7 +13112,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Jia and Zheng,\n Anlin and Chen,\n Xiaowu and Zhou,\n Bin\n},\n title = {\n Primary Video Object Segmentation via Complementary CNNs and Neighborhood Reversible Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Privacy-Preserving Visual Learning Using Doubly Permuted Homomorphic Encryption", @@ -12336,6 +13121,7 @@ "status": "Poster", "track": "main", "pid": "641", + "author_site": "Ryo Yonetani; Vishnu Naresh Boddeti; Kris M. Kitani; Yoichi Sato", "author": "Ryo Yonetani; Vishnu Naresh Boddeti; Kris M. Kitani; Yoichi Sato", "abstract": "We propose a privacy-preserving framework for learning visual classifiers by leveraging distributed private image data. This framework is designed to aggregate multiple classifiers updated locally using private data and to ensure that no private information about the data is exposed during and after its learning procedure. We utilize a homomorphic cryptosystem that can aggregate the local classifiers while they are encrypted and thus kept secret. To overcome the high computational cost of homomorphic encryption of high-dimensional classifiers, we (1) impose sparsity constraints on local classifier updates and (2) propose a novel efficient encryption scheme named doubly-permuted homomorphic encryption (DPHE) which is tailored to sparse high-dimensional data. 
DPHE (i) decomposes sparse data into its constituent non-zero values and their corresponding support indices, (ii) applies homomorphic encryption only to the non-zero values, and (iii) employs double permutations on the support indices to make them secret. Our experimental evaluation on several public datasets shows that the proposed approach achieves comparable performance against state-of-the-art visual recognition methods while preserving privacy and significantly outperforms other privacy-preserving methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yonetani_Privacy-Preserving_Visual_Learning_ICCV_2017_paper.pdf", @@ -12350,7 +13136,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yonetani_Privacy-Preserving_Visual_Learning_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yonetani_Privacy-Preserving_Visual_Learning_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Yonetani_2017_ICCV,\n \n author = {\n Yonetani,\n Ryo and Naresh Boddeti,\n Vishnu and Kitani,\n Kris M. and Sato,\n Yoichi\n},\n title = {\n Privacy-Preserving Visual Learning Using Doubly Permuted Homomorphic Encryption\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "ProbFlow: Joint Optical Flow and Uncertainty Estimation", @@ -12358,6 +13145,7 @@ "status": "Poster", "track": "main", "pid": "619", + "author_site": "Anne S. Wannenwetsch; Margret Keuper; Stefan Roth", "author": "Anne S. Wannenwetsch; Margret Keuper; Stefan Roth", "abstract": "Optical flow estimation remains challenging due to untextured areas, motion boundaries, occlusions, and more. Thus, the estimated flow is not equally reliable across the image. To that end, post-hoc confidence measures have been introduced to assess the per-pixel reliability of the flow. 
We overcome the artificial separation of optical flow and confidence estimation by introducing a method that jointly predicts optical flow and its underlying uncertainty. Starting from common energy-based formulations, we rely on the corresponding posterior distribution of the flow given the images. We derive a variational inference scheme based on mean field, which incorporates best practices from energy minimization. An uncertainty measure is obtained along the flow at every pixel as the (marginal) entropy of the variational distribution. We demonstrate the flexibility of our probabilistic approach by applying it to two different energies and on two benchmarks. We not only obtain flow results that are competitive with the underlying energy minimization approach, but also a reliable uncertainty measure that significantly outperforms existing post-hoc approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wannenwetsch_ProbFlow_Joint_Optical_ICCV_2017_paper.pdf", @@ -12373,7 +13161,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wannenwetsch_ProbFlow_Joint_Optical_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wannenwetsch_ProbFlow_Joint_Optical_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wannenwetsch_2017_ICCV,\n \n author = {\n Wannenwetsch,\n Anne S. 
and Keuper,\n Margret and Roth,\n Stefan\n},\n title = {\n ProbFlow: Joint Optical Flow and Uncertainty Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Probabilistic Structure From Motion With Objects (PSfMO)", @@ -12381,6 +13170,7 @@ "status": "Poster", "track": "main", "pid": "1287", + "author_site": "Paul Gay; Cosimo Rubino; Vaibhav Bansal; Alessio Del Bue", "author": "Paul Gay; Cosimo Rubino; Vaibhav Bansal; Alessio Del Bue", "abstract": "In this paper we deal with the problem of recovering affine camera calibration and objects position/occupancy from multi-view images using the information from image detections. We show that remarkable object localisation and volumetric occupancy can be recovered by including both geometrical constraints and prior information given by objects CAD models from the ShapeNet dataset. This can be done by recasting the problem in the context of a probabilistic framework based on Probabilistic PCA that includes both the object semantic priors together with the multi-view geometrical constraints. We present results on synthetic and real datasets to show the validity of our approach and improvements with respect to previous approaches. 
In particular, the statistical priors are key to obtain reliable 3D reconstruction especially when the input detections are noisy, a likely case in real scenarios.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gay_Probabilistic_Structure_From_ICCV_2017_paper.pdf", @@ -12405,7 +13195,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Gay_2017_ICCV,\n \n author = {\n Gay,\n Paul and Rubino,\n Cosimo and Bansal,\n Vaibhav and Del Bue,\n Alessio\n},\n title = {\n Probabilistic Structure From Motion With Objects (PSfMO)\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Progressive Large Scale-Invariant Image Matching in Scale Space", @@ -12413,6 +13204,7 @@ "status": "Poster", "track": "main", "pid": "1190", + "author_site": "Lei Zhou; Siyu Zhu; Tianwei Shen; Jinglu Wang; Tian Fang; Long Quan", "author": "Lei Zhou; Siyu Zhu; Tianwei Shen; Jinglu Wang; Tian Fang; Long Quan", "abstract": "The power of modern image matching approaches is still fundamentally limited by the abrupt scale changes in images. In this paper, we propose a scale-invariant image matching approach to tackling the very large scale variation of views. Drawing inspiration from the scale space theory, we start with encoding the image's scale space into a compact multi-scale representation. Then, rather than trying to find the exact feature matches all in one step, we propose a progressive two-stage approach. First, we determine the related scale levels in scale space, enclosing the inlier feature correspondences, based on an optimal and exhaustive matching in a limited scale space. 
Second, we produce both the image similarity measurement and feature correspondences simultaneously after restricting matching between the related scale levels in a robust way. The matching performance has been intensively evaluated on vision tasks including image retrieval, feature matching and Structure-from-Motion (SfM). The successful integration of the challenging fusion of high aerial and low ground-level views with significant scale differences manifests the superiority of the proposed approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhou_Progressive_Large_Scale-Invariant_ICCV_2017_paper.pdf", @@ -12430,14 +13222,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhou_Progressive_Large_Scale-Invariant_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;1+0;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Shenzhen Zhuke Innovation Technology", + "aff_unique_norm": "The Hong Kong University of Science and Technology;Shenzhen Zhuke Innovation Technology", "aff_unique_dep": "Department of Computer Science and Engineering;", "aff_unique_url": "https://www.ust.hk;", "aff_unique_abbr": "HKUST;", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2017_ICCV,\n \n author = {\n Zhou,\n Lei and Zhu,\n Siyu and Shen,\n Tianwei and Wang,\n Jinglu and Fang,\n Tian and Quan,\n Long\n},\n title = {\n Progressive Large Scale-Invariant Image Matching in Scale Space\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Quantitative Evaluation of Confidence Measures in a Machine Learning World", @@ -12445,6 +13238,7 @@ "status": "Spotlight", "track": "main", "pid": "15", + "author_site": "Matteo Poggi; Fabio Tosi; Stefano 
Mattoccia", "author": "Matteo Poggi; Fabio Tosi; Stefano Mattoccia", "abstract": "Confidence measures aim at detecting unreliable depth measurements and play an important role for many purposes and in particular, as recently shown, to improve stereo accuracy. This topic has been thoroughly investigated by Hu and Mordohai in 2010 (and 2012) considering 17 confidence measures and two local algorithms on the two datasets available at that time. However, since then major breakthroughs happened in this field: the availability of much larger and challenging datasets, novel and more effective stereo algorithms including ones based on deep-learning and confidence measures leveraging on machine learning techniques. Therefore, this paper aims at providing an exhaustive and updated review and quantitative evaluation of 52 (actually, 76 considering variants) state-of-the-art confidence measures - focusing on recent ones mostly based on random-forests and deep-learning - with three algorithms on the challenging datasets available today. Moreover we deal with problems inherently induced by learning-based confidence measures. How are these methods able to generalize to new data? How a specific training improves their effectiveness? 
How more effective confidence measures can actually improve the overall stereo accuracy?", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Poggi_Quantitative_Evaluation_of_ICCV_2017_paper.pdf", @@ -12469,7 +13263,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Poggi_2017_ICCV,\n \n author = {\n Poggi,\n Matteo and Tosi,\n Fabio and Mattoccia,\n Stefano\n},\n title = {\n Quantitative Evaluation of Confidence Measures in a Machine Learning World\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Quasiconvex Plane Sweep for Triangulation With Outliers", @@ -12477,6 +13272,7 @@ "status": "Poster", "track": "main", "pid": "472", + "author_site": "Qianggong Zhang; Tat-Jun Chin; David Suter", "author": "Qianggong Zhang; Tat-Jun Chin; David Suter", "abstract": "Triangulation is a fundamental task in 3D computer vision. Unsurprisingly, it is a well-investigated problem with many mature algorithms. However, algorithms for robust triangulation, which are necessary to produce correct results in the presence of egregiously incorrect measurements (i.e., outliers), have received much less attention. The default approach to deal with outliers in triangulation is by random sampling. The randomized heuristic is not only suboptimal, it could, in fact, be computationally inefficient on large-scale datasets. In this paper, we propose a novel locally optimal algorithm for robust triangulation. A key feature of our method is to efficiently derive the local update step by plane sweeping a set of quasiconvex functions. Underpinning our method is a new theory behind quasiconvex plane sweep, which has not been examined previously in computational geometry. 
Relative to the random sampling heuristic, our algorithm not only guarantees deterministic convergence to a local minimum, it typically achieves higher quality solutions in similar runtimes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Quasiconvex_Plane_Sweep_ICCV_2017_paper.pdf", @@ -12501,7 +13297,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Qianggong and Chin,\n Tat-Jun and Suter,\n David\n},\n title = {\n Quasiconvex Plane Sweep for Triangulation With Outliers\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Query-Guided Regression Network With Context Policy for Phrase Grounding", @@ -12509,6 +13306,7 @@ "status": "Spotlight", "track": "main", "pid": "1201", + "author_site": "Kan Chen; Rama Kovvuri; Ram Nevatia", "author": "Kan Chen; Rama Kovvuri; Ram Nevatia", "abstract": "Given a textual description of an image, phrase grounding localizes objects in the image referred by query phrases in the description. State-of-the-art methods address the problem by ranking a set of proposals based on the relevance to each query, which are limited by the performance of independent proposal generation systems and ignore useful cues from context in the description. In this paper, we adopt a spatial regression method to break the performance limit, and introduce reinforcement learning techniques to further leverage semantic context information. We propose a novel Query-guided Regression network with Context policy (QRC Net) which jointly learns a Proposal Generation Network (PGN), a Query-guided Regression Network (QRN) and a Context Policy Network (CPN). 
Experiments show QRC Net provides a significant improvement in accuracy on two popular datasets: Flickr30K Entities and Referit Game, with 14.25% and 17.14% increase over the state-of-the-arts respectively.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Query-Guided_Regression_Network_ICCV_2017_paper.pdf", @@ -12524,7 +13322,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Query-Guided_Regression_Network_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Query-Guided_Regression_Network_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Kan and Kovvuri,\n Rama and Nevatia,\n Ram\n},\n title = {\n Query-Guided Regression Network With Context Policy for Phrase Grounding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "R-C3D: Region Convolutional 3D Network for Temporal Activity Detection", @@ -12532,6 +13331,7 @@ "status": "Poster", "track": "main", "pid": "2582", + "author_site": "Huijuan Xu; Abir Das; Kate Saenko", "author": "Huijuan Xu; Abir Das; Kate Saenko", "abstract": "We address the problem of activity detection in continuous, untrimmed video streams. This is a difficult task that requires extracting meaningful spatio-temporal features to capture activities, accurately localizing the start and end times of each activity. We introduce a new model, Region Convolutional 3D Network (R-C3D), which encodes the video streams using a three-dimensional fully convolutional network, then generates candidate temporal regions containing activities, and finally classifies selected regions into specific activities. Computation is saved due to the sharing of convolutional features between the proposal and the classification pipelines. 
The entire model is trained end-to-end with jointly optimized localization and classification losses. R-C3D is faster than existing methods (569 frames per second on a single Titan X Maxwell GPU) and achieves state-of-the-art results on THUMOS'14. We further demonstrate that our model is a general activity detection framework that does not rely on assumptions about particular dataset properties by evaluating our approach on ActivityNet and Charades. Our code is available at http://ai.bu.edu/r-c3d/.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xu_R-C3D_Region_Convolutional_ICCV_2017_paper.pdf", @@ -12556,7 +13356,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2017_ICCV,\n \n author = {\n Xu,\n Huijuan and Das,\n Abir and Saenko,\n Kate\n},\n title = {\n R-C3D: Region Convolutional 3D Network for Temporal Activity Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "RDFNet: RGB-D Multi-Level Residual Feature Fusion for Indoor Semantic Segmentation", @@ -12564,6 +13365,7 @@ "status": "Poster", "track": "main", "pid": "2150", + "author_site": "Seong-Jin Park; Ki-Sang Hong; Seungyong Lee", "author": "Seong-Jin Park; Ki-Sang Hong; Seungyong Lee", "abstract": "In multi-class indoor semantic segmentation using RGB-D data, it has been shown that incorporating depth feature into RGB feature is helpful to improve segmentation accuracy. However, previous studies have not fully exploited the potentials of multi-modal feature fusion, e.g., simply concatenating RGB and depth features or averaging RGB and depth score maps. 
To learn the optimal fusion of multi-modal features, this paper presents a novel network that extends the core idea of residual learning to RGB-D semantic segmentation. Our network effectively captures multi-level RGB-D CNN features by including multi-modal feature fusion blocks and multi-level feature refinement blocks. Feature fusion blocks learn residual RGB and depth features and their combinations to fully exploit the complementary characteristics of RGB and depth data. Feature refinement blocks learn the combination of fused features from multiple levels to enable high-resolution prediction. Our network can efficiently train discriminative multi-level features from each modality end-to-end by taking full advantage of skip-connections. Our comprehensive experiments demonstrate that the proposed architecture achieves the state-of-the-art accuracy on two challenging RGB-D indoor datasets, NYUDv2 and SUN RGB-D.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Park_RDFNet_RGB-D_Multi-Level_ICCV_2017_paper.pdf", @@ -12588,7 +13390,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2017_ICCV,\n \n author = {\n Park,\n Seong-Jin and Hong,\n Ki-Sang and Lee,\n Seungyong\n},\n title = {\n RDFNet: RGB-D Multi-Level Residual Feature Fusion for Indoor Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "RGB-Infrared Cross-Modality Person Re-Identification", @@ -12596,6 +13399,7 @@ "status": "Poster", "track": "main", "pid": "2530", + "author_site": "Ancong Wu; Wei-Shi Zheng; Hong-Xing Yu; Shaogang Gong; Jianhuang Lai", "author": "Ancong Wu; Wei-Shi Zheng; Hong-Xing Yu; Shaogang Gong; Jianhuang Lai", "abstract": "Person re-identification (Re-ID) is an important 
problem in video surveillance, aiming to match pedestrian images across camera views. Currently, most works focus on RGB-based Re-ID. However, in some applications, RGB images are not suitable, e.g. in a dark environment or at night. Infrared (IR) imaging becomes necessary in many visual systems. To that end, matching RGB images with infrared images is required, which are heterogeneous with very different visual characteristics. For person Re-ID, this is a very challenging cross-modality problem that has not been studied so far. In this work, we address the RGB-IR cross-modality Re-ID problem and contribute a new multiple modality Re-ID dataset named SYSU-MM01, including RGB and IR images of 491 identities from 6 cameras, giving in total 287,628 RGB images and 15,792 IR images. To explore the RGB-IR Re-ID problem, we evaluate existing popular cross-domain models, including three commonly used neural network structures (one-stream, two-stream and asymmetric FC layer) and analyse the relation between them. We further propose deep zero-padding for training one-stream network towards automatically evolving domain-specific nodes in the network for cross-modality matching. Our experiments show that RGB-IR cross-modality matching is very challenging but still feasible using the proposed model with deep zero-padding, giving the best performance. 
Our dataset is available at http://isee.sysu.edu.cn/project/RGBIRReID.htm.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wu_RGB-Infrared_Cross-Modality_Person_ICCV_2017_paper.pdf", @@ -12620,7 +13424,8 @@ "aff_campus_unique_index": ";1;", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0+0+0;0;1;0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Wu_2017_ICCV,\n \n author = {\n Wu,\n Ancong and Zheng,\n Wei-Shi and Yu,\n Hong-Xing and Gong,\n Shaogang and Lai,\n Jianhuang\n},\n title = {\n RGB-Infrared Cross-Modality Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "RMPE: Regional Multi-Person Pose Estimation", @@ -12628,6 +13433,7 @@ "status": "Poster", "track": "main", "pid": "1060", + "author_site": "Hao-Shu Fang; Shuqin Xie; Yu-Wing Tai; Cewu Lu", "author": "Hao-Shu Fang; Shuqin Xie; Yu-Wing Tai; Cewu Lu", "abstract": "Multi-person pose estimation in the wild is challenging. Although state-of-the-art human detectors have demonstrated good performance, small errors in localization and recognition are inevitable. These errors can cause failures for a single-person pose estimator (SPPE), especially for methods that solely depend on human detection results. In this paper, we propose a novel regional multi-person pose estimation (RMPE) framework to facilitate pose estimation in the presence of inaccurate human bounding boxes. Our framework consists of three components: Symmetric Spatial Transformer Network (SSTN), Parametric Pose Non-Maximum-Suppression (NMS), and Pose-Guided Proposals Generator (PGPG). Our method is able to handle inaccurate bounding boxes and redundant detections, allowing it to achieve 76.7 mAP on the MPII (multi person) dataset. 
Our model and source codes are made publicly available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Fang_RMPE_Regional_Multi-Person_ICCV_2017_paper.pdf", @@ -12652,7 +13458,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fang_2017_ICCV,\n \n author = {\n Fang,\n Hao-Shu and Xie,\n Shuqin and Tai,\n Yu-Wing and Lu,\n Cewu\n},\n title = {\n RMPE: Regional Multi-Person Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "RPAN: An End-To-End Recurrent Pose-Attention Network for Action Recognition in Videos", @@ -12660,6 +13467,7 @@ "status": "Oral", "track": "main", "pid": "1194", + "author_site": "Wenbin Du; Yali Wang; Yu Qiao", "author": "Wenbin Du; Yali Wang; Yu Qiao", "abstract": "Recent studies demonstrate the effectiveness of Recurrent Neural Networks (RNNs) for action recognition in videos. However, previous works mainly utilize video-level category as supervision to train RNNs, which may prohibit RNNs to learn complex motion structures along time. In this paper, we propose a recurrent pose-attention network (RPAN) to address this challenge, where we introduce a novel pose-attention mechanism to adaptively learn pose-related features at every time-step action prediction of RNNs. More specifically, we make three main contributions in this paper. Firstly, unlike previous works on pose-related action recognition, our RPAN is an end-to-end recurrent network which can exploit important spatial-temporal evolutions of human pose to assist action recognition in a unified framework. 
Secondly, instead of learning individual human-joint features separately, our pose-attention mechanism learns robust human-part features by sharing attention parameters partially on the semantically-related human joints. These human-part features are then fed into the human-part pooling layer to construct a highly-discriminative pose-related representation for temporal action modeling. Thirdly, one important byproduct of our RPAN is pose estimation in videos, which can be used for coarse pose annotation in action videos. We evaluate the proposed RPAN quantitatively and qualitatively on two popular benchmarks, i.e., Sub-JHMDB and PennAction. Experimental results show that RPAN outperforms the recent state-of-the-art methods on these challenging datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Du_RPAN_An_End-To-End_ICCV_2017_paper.pdf", @@ -12677,14 +13485,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Du_RPAN_An_End-To-End_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;2", - "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;Chinese University of Hong Kong", + "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;The Chinese University of Hong Kong", "aff_unique_dep": "Shenzhen College of Advanced Technology;Guangdong Provincial Key Laboratory of Computer Vision and Virtual Reality Technology;", "aff_unique_url": "http://www.siat.ac.cn;http://www.cas.cn;https://www.cuhk.edu.hk", "aff_unique_abbr": "UCAS;CAS;CUHK", "aff_campus_unique_index": "0+0;0;1", "aff_campus_unique": "Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Du_2017_ICCV,\n \n author = {\n Du,\n Wenbin and Wang,\n Yali and Qiao,\n Yu\n},\n title = {\n RPAN: An End-To-End Recurrent Pose-Attention Network for Action Recognition in Videos\n},\n booktitle = {\n 
Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Range Loss for Deep Face Recognition With Long-Tailed Training Data", @@ -12692,6 +13501,7 @@ "status": "Poster", "track": "main", "pid": "2968", + "author_site": "Xiao Zhang; Zhiyuan Fang; Yandong Wen; Zhifeng Li; Yu Qiao", "author": "Xiao Zhang; Zhiyuan Fang; Yandong Wen; Zhifeng Li; Yu Qiao", "abstract": "Deep convolutional neural networks have achieved significant improvements on face recognition task due to their ability to learn highly discriminative features from tremendous amounts of face images. Many large scale face datasets exhibit long-tail distribution where a small number of entities (persons) have large number of face images while a large number of persons only have very few face samples (long tail). Most of the existing works alleviate this problem by simply cutting the tailed data and only keep identities with enough number of examples. Unlike these work, this paper investigated how long-tailed data impact the training of face CNNs and develop a novel loss function, called range loss, to effectively utilize the tailed data in training process. More specifically, range loss is designed to reduce overall intrapersonal variations while enlarge inter-personal differences simultaneously. 
Extensive experiments on two face recognition benchmarks, Labeled Faces in the Wild (LFW) and YouTube Faces (YTF), demonstrate the effectiveness of the proposed range loss in overcoming the long tail effect, and show the good generalization ability of the proposed methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Range_Loss_for_ICCV_2017_paper.pdf", @@ -12708,15 +13518,16 @@ "email": "gmail.com;mail.sustc.edu.cn;andrew.cmu.edu;tencent.com;siat.ac.cn", "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Range_Loss_for_ICCV_2017_paper.html", - "aff_unique_index": "0+1;0+2;3;4;5+0", - "aff_unique_norm": "Chinese Academy of Sciences;Tianjin University;Southern University of Science and Technology;Carnegie Mellon University;Tencent;Chinese University of Hong Kong", - "aff_unique_dep": "Guangdong Provincial Key Laboratory of Computer Vision and Virtual Reality Technology;;;;Tencent AI Lab;", - "aff_unique_url": "http://www.cas.cn;http://www.tju.edu.cn;https://www.sustech.edu.cn;https://www.cmu.edu;https://ai.tencent.com;https://www.cuhk.edu.hk", - "aff_unique_abbr": "CAS;TJU;SUSTech;CMU;Tencent AI Lab;CUHK", + "aff_unique_index": "0+1;0+2;3;5+0", + "aff_unique_norm": "Chinese Academy of Sciences;Tianjin University;Southern University of Science and Technology;Carnegie Mellon University;;The Chinese University of Hong Kong", + "aff_unique_dep": "Guangdong Provincial Key Laboratory of Computer Vision and Virtual Reality Technology;;;;;", + "aff_unique_url": "http://www.cas.cn/;http://www.tju.edu.cn;https://www.sustech.edu.cn;https://www.cmu.edu;;https://www.cuhk.edu.hk", + "aff_unique_abbr": "CAS;TJU;SUSTech;CMU;;CUHK", "aff_campus_unique_index": "0;0;2+0", "aff_campus_unique": "Shenzhen;;Hong Kong SAR", - "aff_country_unique_index": "0+0;0+0;1;0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique_index": "0+0;0+0;1;0+0", + "aff_country_unique": "China;United States;", + "bibtex": 
"@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Xiao and Fang,\n Zhiyuan and Wen,\n Yandong and Li,\n Zhifeng and Qiao,\n Yu\n},\n title = {\n Range Loss for Deep Face Recognition With Long-Tailed Training Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "RankIQA: Learning From Rankings for No-Reference Image Quality Assessment", @@ -12724,6 +13535,7 @@ "status": "Poster", "track": "main", "pid": "340", + "author_site": "Xialei Liu; Joost van de Weijer; Andrew D. Bagdanov", "author": "Xialei Liu; Joost van de Weijer; Andrew D. Bagdanov", "abstract": "We propose a no-reference image quality assessment (NR-IQA) approach that learns from rankings (RankIQA). To address the problem of limited IQA dataset size, we train a Siamese Network to rank images in terms of image quality by using synthetically generated distortions for which relative image quality is known. These ranked image sets can be automatically generated without laborious human labeling. We then use fine-tuning to transfer the knowledge represented in the trained Siamese Network to a traditional CNN that estimates absolute image quality from single images. We demonstrate how our approach can be made significantly more efficient than traditional Siamese Networks by forward propagating a batch of images through a single network and backpropagating gradients derived from all pairs of images in the batch. Experiments on the TID2013 benchmark show that we improve the state-of-the-art by over 5%. 
Furthermore, on the LIVE benchmark we show that our approach is superior to existing NR-IQA techniques and that we even outperform the state-of-the-art in full-reference IQA (FR-IQA) methods without having to resort to high-quality reference images to infer IQA.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_RankIQA_Learning_From_ICCV_2017_paper.pdf", @@ -12748,7 +13560,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Barcelona;Florence", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Spain;Italy" + "aff_country_unique": "Spain;Italy", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Xialei and van de Weijer,\n Joost and Bagdanov,\n Andrew D.\n},\n title = {\n RankIQA: Learning From Rankings for No-Reference Image Quality Assessment\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Raster-To-Vector: Revisiting Floorplan Transformation", @@ -12756,6 +13569,7 @@ "status": "Poster", "track": "main", "pid": "724", + "author_site": "Chen Liu; Jiajun Wu; Pushmeet Kohli; Yasutaka Furukawa", "author": "Chen Liu; Jiajun Wu; Pushmeet Kohli; Yasutaka Furukawa", "abstract": "This paper addresses the problem of converting a rasterized floorplan image into a vector-graphics representation. Unlike existing approaches that rely on a sequence of low-level image processing heuristics, we adopt a learning-based approach. A neural architecture first transforms a rasterized image to a set of junctions that represent low-level geometric and semantic information (e.g., wall corners or door end-points). Integer programming is then formulated to aggregate junctions into a set of simple primitives (e.g., wall lines, door lines, or icon boxes) to produce a vectorized floorplan, while ensuring a topologically and geometrically consistent result. 
Our algorithm significantly outperforms existing methods and achieves around 90% precision and recall, getting to the range of production-ready performance. The vector representation allows 3D model popup for better indoor scene visualization, direct model manipulation for architectural remodeling, and further computational applications such as data analysis. Our system is efficient: we have converted hundred thousand production-level floorplan images into the vector representation and generated 3D popup models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Raster-To-Vector_Revisiting_Floorplan_ICCV_2017_paper.pdf", @@ -12773,14 +13587,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Raster-To-Vector_Revisiting_Floorplan_ICCV_2017_paper.html", "aff_unique_index": "0;1;2+3;4+0", - "aff_unique_norm": "Washington University in St. Louis;Massachusetts Institute of Technology;DeepMind;Microsoft;Simon Fraser University", + "aff_unique_norm": "Washington University in St. Louis;Massachusetts Institute of Technology;DeepMind;Microsoft Research;Simon Fraser University", "aff_unique_dep": ";;;Microsoft Research;", "aff_unique_url": "https://wustl.edu;https://web.mit.edu;https://deepmind.com;https://www.microsoft.com/en-us/research;https://www.sfu.ca", "aff_unique_abbr": "WashU;MIT;DeepMind;MSR;SFU", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "St. 
Louis;;Redmond", "aff_country_unique_index": "0;0;1+0;2+0", - "aff_country_unique": "United States;United Kingdom;Canada" + "aff_country_unique": "United States;United Kingdom;Canada", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Chen and Wu,\n Jiajun and Kohli,\n Pushmeet and Furukawa,\n Yasutaka\n},\n title = {\n Raster-To-Vector: Revisiting Floorplan Transformation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Ray Space Features for Plenoptic Structure-From-Motion", @@ -12788,6 +13603,7 @@ "status": "Poster", "track": "main", "pid": "2233", + "author_site": "Yingliang Zhang; Peihong Yu; Wei Yang; Yuanxi Ma; Jingyi Yu", "author": "Yingliang Zhang; Peihong Yu; Wei Yang; Yuanxi Ma; Jingyi Yu", "abstract": "Traditional Structure-from-Motion (SfM) uses images captured by cameras as inputs. In this paper, we explore using light fields captured by plenoptic cameras or camera arrays as inputs. We call this solution plenoptic SfM or P-SfM solution. We first present a comprehensive theory on ray geometry transforms under light field pose variations. We derive the transforms of three typical ray manifolds: rays passing through a point or point-ray manifold, rays passing through a 3D line or ray-line manifold, and rays lying on a common 3D plane or ray-plane manifold. We show that by matching these manifolds across LFs, we can recover light field poses and conduct bundle adjustment in ray space. We validate our theory and framework on synthetic and real data on light fields of different scales: small scale LFs acquired using a LF camera and large scale LFs by a camera array. 
We show that our P-SfM technique can significantly improve the accuracy and reliability over regular SfM and PnP especially on traditionally challenging scenes where reliable feature point correspondences are difficult to obtain but line or plane correspondences are readily accessible.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Ray_Space_Features_ICCV_2017_paper.pdf", @@ -12812,7 +13628,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+1;0;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Yingliang and Yu,\n Peihong and Yang,\n Wei and Ma,\n Yuanxi and Yu,\n Jingyi\n},\n title = {\n Ray Space Features for Plenoptic Structure-From-Motion\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Real Time Eye Gaze Tracking With 3D Deformable Eye-Face Model", @@ -12820,6 +13637,7 @@ "status": "Poster", "track": "main", "pid": "420", + "author_site": "Kang Wang; Qiang Ji", "author": "Kang Wang; Qiang Ji", "abstract": "3D model-based gaze estimation methods are widely explored because of their good accuracy and ability to handle free head movement. Traditional methods with complex hardware systems (Eg. infrared lights, 3D sensors, etc.) are restricted to controlled environments, which significantly limit their practical utilities. In this paper, we propose a 3D model-based gaze estimation method with a single web-camera, which enables instant and portable eye gaze tracking. The key idea is to leverage on the proposed 3D eye-face model, from which we can estimate 3D eye gaze from observed 2D facial landmarks. The proposed system includes a 3D deformable eye-face model that is learned offline from multiple training subjects. 
Given the deformable model, individual 3D eye-face models and personal eye parameters can be recovered through the unified calibration algorithm. Experimental results show that the proposed method outperforms state-of-the-art methods while allowing convenient system setup and free head movement. A real time eye tracking system running at 30 FPS also validates the effectiveness and efficiency of the proposed method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Real_Time_Eye_ICCV_2017_paper.pdf", @@ -12834,7 +13652,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_Real_Time_Eye_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_Real_Time_Eye_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Kang and Ji,\n Qiang\n},\n title = {\n Real Time Eye Gaze Tracking With 3D Deformable Eye-Face Model\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Real-Time Hand Tracking Under Occlusion From an Egocentric RGB-D Sensor", @@ -12842,6 +13661,7 @@ "status": "Poster", "track": "main", "pid": "514", + "author_site": "Franziska Mueller; Dushyant Mehta; Oleksandr Sotnychenko; Srinath Sridhar; Dan Casas; Christian Theobalt", "author": "Franziska Mueller; Dushyant Mehta; Oleksandr Sotnychenko; Srinath Sridhar; Dan Casas; Christian Theobalt", "abstract": "We present an approach for real-time, robust, and accurate hand pose estimation from moving egocentric RGB-D cameras in cluttered real environments. Existing methods typically fail for hand-object interactions in cluttered scenes imaged from egocentric viewpoints, common for virtual or augmented reality applications. Our approach uses two subsequently applied Convolutional Neural Networks (CNNs) to localize the hand and regress 3D joint locations. 
Hand localization is achieved by using a CNN to estimate the 2D position of the hand center in the input, even in the presence of clutter and occlusions. The localized hand position, together with the corresponding input depth value, is used to generate a normalized cropped image that is fed into a second CNN to regress relative 3D hand joint locations in real-time. For added accuracy, robustness, and temporal stability, we refine the pose estimates using a kinematic pose tracking energy. To train the CNNs, we introduce a new photorealistic dataset that uses a merged reality approach to capture and synthesize large amounts of annotated data of natural hand interaction in cluttered scenes. Through quantitative and qualitative evaluation, we show that our method is robust to self-occlusion and occlusions by objects, specifically in moving egocentric perspectives.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mueller_Real-Time_Hand_Tracking_ICCV_2017_paper.pdf", @@ -12866,7 +13686,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "Germany;Spain" + "aff_country_unique": "Germany;Spain", + "bibtex": "@InProceedings{Mueller_2017_ICCV,\n \n author = {\n Mueller,\n Franziska and Mehta,\n Dushyant and Sotnychenko,\n Oleksandr and Sridhar,\n Srinath and Casas,\n Dan and Theobalt,\n Christian\n},\n title = {\n Real-Time Hand Tracking Under Occlusion From an Egocentric RGB-D Sensor\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Real-Time Monocular Pose Estimation of 3D Objects Using Temporally Consistent Local Color Histograms", @@ -12874,7 +13695,7 @@ "status": "Poster", "track": "main", "pid": "6", - "author_site": "Henning Tjaden; Ulrich Schwanecke; Elmar Sch\u00c3\u00b6mer", + "author_site": "Henning Tjaden; Ulrich Schwanecke; Elmar Schömer", "author": "Henning 
Tjaden; Ulrich Schwanecke; Elmar Schomer", "abstract": "We present a novel approach to 6DOF pose estimation and segmentation of rigid 3D objects using a single monocular RGB camera based on temporally consistent, local color histograms. We show that this approach outperforms previous methods in cases of cluttered backgrounds, heterogenous objects, and occlusions. The proposed histograms can be used as statistical object descriptors within a template matching strategy for pose recovery after temporary tracking loss e. g. caused by massive occlusion or if the object leaves the camera's field of view. The descriptors can be trained online within a couple of seconds moving a handheld object in front of a camera. During the training stage, our approach is already capable to recover from accidental tracking loss. We demonstrate the performance of our method in comparison to the state of the art in different challenging experiments including a popular public data set.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tjaden_Real-Time_Monocular_Pose_ICCV_2017_paper.pdf", @@ -12899,7 +13720,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Mainz", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Tjaden_2017_ICCV,\n \n author = {\n Tjaden,\n Henning and Schwanecke,\n Ulrich and Schomer,\n Elmar\n},\n title = {\n Real-Time Monocular Pose Estimation of 3D Objects Using Temporally Consistent Local Color Histograms\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Realistic Dynamic Facial Textures From a Single Image Using GANs", @@ -12907,6 +13729,7 @@ "status": "Poster", "track": "main", "pid": "2685", + "author_site": "Kyle Olszewski; Zimo Li; Chao Yang; Yi Zhou; Ronald Yu; Zeng Huang; Sitao Xiang; Shunsuke Saito; Pushmeet Kohli; Hao Li", "author": "Kyle 
Olszewski; Zimo Li; Chao Yang; Yi Zhou; Ronald Yu; Zeng Huang; Sitao Xiang; Shunsuke Saito; Pushmeet Kohli; Hao Li", "abstract": "We present a novel method to realistically puppeteer and animate a face from a single RGB image using a source video sequence. We begin by fitting a multilinear PCA model to obtain the 3D geometry and a single texture of the target face. In order for the animation to be realistic, however, we need dynamic per-frame textures that capture subtle wrinkles and deformations corresponding to the animated facial expressions. This problem is highly underconstrained, as dynamic textures cannot be obtained directly from a single image. Furthermore, if the target face has a closed mouth, it is not possible to obtain actual images of the mouth interior. To address this issue, we train a Deep Generative Network that can infer realistic per-frame texture deformations, including the mouth interior, of the target identity using the per-frame source textures and the single target texture. 
By retargeting the PCA expression geometry from the source, as well as using the newly inferred texture, we can both animate the face and perform video face replacement on the source video using the target appearance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Olszewski_Realistic_Dynamic_Facial_ICCV_2017_paper.pdf", @@ -12931,7 +13754,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;1;0+2+0", - "aff_country_unique": "United States;United Kingdom;Israel" + "aff_country_unique": "United States;United Kingdom;Israel", + "bibtex": "@InProceedings{Olszewski_2017_ICCV,\n \n author = {\n Olszewski,\n Kyle and Li,\n Zimo and Yang,\n Chao and Zhou,\n Yi and Yu,\n Ronald and Huang,\n Zeng and Xiang,\n Sitao and Saito,\n Shunsuke and Kohli,\n Pushmeet and Li,\n Hao\n},\n title = {\n Realistic Dynamic Facial Textures From a Single Image Using GANs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Reasoning About Fine-Grained Attribute Phrases Using Reference Games", @@ -12939,6 +13763,7 @@ "status": "Poster", "track": "main", "pid": "139", + "author_site": "Jong-Chyi Su; Chenyun Wu; Huaizu Jiang; Subhransu Maji", "author": "Jong-Chyi Su; Chenyun Wu; Huaizu Jiang; Subhransu Maji", "abstract": "We present a framework for learning to describe fine-grained visual differences between instances using attribute phrases. Attribute phrases capture distinguishing aspects of an object (e.g., \"propeller on the nose\" or \"door near the wing\" for airplanes) in a compositional manner. Instances within a category can be described by a set of these phrases and collectively they span the space of semantic attributes for a category. 
We collect a large dataset of such phrases by asking annotators to describe several visual differences between a pair of instances within a category. We then learn to describe and ground these phrases to images in the context of a *reference game* between a speaker and a listener. The goal of a speaker is to describe attributes of an image that allows the listener to correctly identify it within a pair. Data collected in a pairwise manner improves the ability of the speaker to generate, and the ability of the listener to interpret visual descriptions. Moreover, due to the compositionality of attribute phrases, the trained listeners can interpret descriptions not seen during training for image retrieval, and the speakers can generate attribute-based explanations for differences between previously unseen categories. We also show that embedding an image into the semantic space of attribute phrases derived from listeners offers 20% improvement in accuracy over existing attribute-based representations on the FGVC-aircraft dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Su_Reasoning_About_Fine-Grained_ICCV_2017_paper.pdf", @@ -12963,7 +13788,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Su_2017_ICCV,\n \n author = {\n Su,\n Jong-Chyi and Wu,\n Chenyun and Jiang,\n Huaizu and Maji,\n Subhransu\n},\n title = {\n Reasoning About Fine-Grained Attribute Phrases Using Reference Games\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recognition of Action Units in the Wild With Deep Nets and a New Global-Local Loss", @@ -12971,6 +13797,7 @@ "status": "Poster", "track": "main", "pid": "1883", + "author_site": "C. Fabian Benitez-Quiroz; Yan Wang; Aleix M. 
Martinez", "author": "C. Fabian Benitez-Quiroz; Yan Wang; Aleix M. Martinez", "abstract": "Most previous algorithms for the recognition of Action Units (AUs) were trained on a small number of sample images. This was due to the limited amount of labeled data available at the time. This meant that data-hungry deep neural networks, which have shown their potential in other computer vision problems, could not be successfully trained to detect AUs. A recent publicly available database with close to a million labeled images has made this training possible. Image and individual variability (e.g., pose, scale, illumination, ethnicity) in this set is very large. Unfortunately, the labels in this dataset are not perfect (i.e., they are noisy), making convergence of deep nets difficult. To harness the richness of this dataset while being robust to the inaccuracies of the labels, we derive a novel global-local loss. This new loss function is shown to yield fast globally meaningful convergences and locally accurate results. Comparative results with those of the EmotioNet challenge demonstrate that our newly derived loss yields superior recognition of AUs than state-of-the-art algorithms.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Benitez-Quiroz_Recognition_of_Action_ICCV_2017_paper.pdf", @@ -12988,14 +13815,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Benitez-Quiroz_Recognition_of_Action_ICCV_2017_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Ohio State University", + "aff_unique_norm": "The Ohio State University", "aff_unique_dep": "Dept. 
of Electrical and Computer Engineering", "aff_unique_url": "https://www.osu.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Benitez-Quiroz_2017_ICCV,\n \n author = {\n Fabian Benitez-Quiroz,\n C. and Wang,\n Yan and Martinez,\n Aleix M.\n},\n title = {\n Recognition of Action Units in the Wild With Deep Nets and a New Global-Local Loss\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Reconfiguring the Imaging Pipeline for Computer Vision", @@ -13003,6 +13831,7 @@ "status": "Poster", "track": "main", "pid": "466", + "author_site": "Mark Buckler; Suren Jayasuriya; Adrian Sampson", "author": "Mark Buckler; Suren Jayasuriya; Adrian Sampson", "abstract": "Advancements in deep learning have ignited an explosion of research on efficient hardware for embedded computer vision. Hardware vision acceleration, however, does not address the cost of capturing and processing the image data that feeds these algorithms. We examine the role of the image signal processing (ISP) pipeline in computer vision to identify opportunities to reduce computation and save energy. The key insight is that imaging pipelines should be designed to be configurable: to switch between a traditional photography mode and a low-power vision mode that produces lower-quality image data suitable only for computer vision. We use eight computer vision algorithms and a reversible pipeline simulation tool to study the imaging system's impact on vision performance. For both CNN-based and classical vision algorithms, we observe that only two ISP stages, demosaicing and gamma compression, are critical for task performance. We propose a new image sensor design that can compensate for skipping these stages. 
The sensor design features an adjustable resolution and tunable analog-to-digital converters (ADCs). Our proposed imaging system's vision mode disables the ISP entirely and configures the sensor to produce subsampled, lower-precision image data. This vision mode can save 75% of the average energy of a baseline photography mode while having only a small impact on vision task accuracy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Buckler_Reconfiguring_the_Imaging_ICCV_2017_paper.pdf", @@ -13018,7 +13847,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Buckler_Reconfiguring_the_Imaging_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Buckler_Reconfiguring_the_Imaging_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Buckler_2017_ICCV,\n \n author = {\n Buckler,\n Mark and Jayasuriya,\n Suren and Sampson,\n Adrian\n},\n title = {\n Reconfiguring the Imaging Pipeline for Computer Vision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Reconstruction-Based Disentanglement for Pose-Invariant Face Recognition", @@ -13026,6 +13856,7 @@ "status": "Poster", "track": "main", "pid": "647", + "author_site": "Xi Peng; Xiang Yu; Kihyuk Sohn; Dimitris N. Metaxas; Manmohan Chandraker", "author": "Xi Peng; Xiang Yu; Kihyuk Sohn; Dimitris N. Metaxas; Manmohan Chandraker", "abstract": "Deep neural networks (DNNs) trained on large-scale datasets have recently achieved impressive improvements in face recognition. But a persistent challenge remains to develop methods capable of handling large pose variations that are relatively under-represented in training data. This paper presents a method for learning a feature representation that is invariant to pose, without requiring extensive pose coverage in training data. 
We first propose to generate non-frontal views from a single frontal face, in order to increase the diversity of training data while preserving accurate facial details that are critical for identity discrimination. Our next contribution is to seek a rich embedding that encodes identity features, as well as non-identity ones such as pose and landmark locations. Finally, we propose a new feature reconstruction metric learning to explicitly disentangle identity and pose, by demanding alignment between the feature reconstructions through various combinations of identity and pose features, which is obtained from two images of the same subject. Experiments on both controlled and in-the-wild face datasets, such as MultiPIE, 300WLP and the profile view database CFP, show that our method consistently outperforms the state-of-the-art, especially on images with large head pose variations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Peng_Reconstruction-Based_Disentanglement_for_ICCV_2017_paper.pdf", @@ -13050,7 +13881,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Peng_2017_ICCV,\n \n author = {\n Peng,\n Xi and Yu,\n Xiang and Sohn,\n Kihyuk and Metaxas,\n Dimitris N. 
and Chandraker,\n Manmohan\n},\n title = {\n Reconstruction-Based Disentanglement for Pose-Invariant Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recurrent 3D-2D Dual Learning for Large-Pose Facial Landmark Detection", @@ -13058,6 +13890,7 @@ "status": "Poster", "track": "main", "pid": "680", + "author_site": "Shengtao Xiao; Jiashi Feng; Luoqi Liu; Xuecheng Nie; Wei Wang; Shuicheng Yan; Ashraf Kassim", "author": "Shengtao Xiao; Jiashi Feng; Luoqi Liu; Xuecheng Nie; Wei Wang; Shuicheng Yan; Ashraf Kassim", "abstract": "Despite remarkable progress of face analysis techniques, detecting landmarks on large-pose faces is still difficult due to self-occlusion, subtle landmark difference and incomplete information. To address these challenging issues, we introduce a novel recurrent 3D-2D dual learning model that alternatively performs 2D-based 3D face model refinement and 3D-to-2D projection based 2D landmark refinement to reliably reason about self-occluded landmarks, precisely capture the subtle landmark displacement and accurately detect landmarks even in presence of extremely large poses. The proposed model presents the first loop-closed learning framework that effectively exploits the informative feedback from the 3D-2D learning and its dual 2D-3D refinement tasks in a recurrent manner. Benefiting from these two mutual-boosting steps, our proposed model demonstrates appealing robustness to large poses (up to profile pose) and outstanding ability to capture fine-scale landmark displacement compared with existing 3D models. It achieves new state-of-the-art on the challenging AFLW benchmark. 
Moreover, our proposed model introduces a new architectural design that economically utilizes intermediate features and achieves 4x faster speed than its deep learning based counterparts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xiao_Recurrent_3D-2D_Dual_ICCV_2017_paper.pdf", @@ -13082,7 +13915,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2;1+0;0", - "aff_country_unique": "Singapore;China;Italy" + "aff_country_unique": "Singapore;China;Italy", + "bibtex": "@InProceedings{Xiao_2017_ICCV,\n \n author = {\n Xiao,\n Shengtao and Feng,\n Jiashi and Liu,\n Luoqi and Nie,\n Xuecheng and Wang,\n Wei and Yan,\n Shuicheng and Kassim,\n Ashraf\n},\n title = {\n Recurrent 3D-2D Dual Learning for Large-Pose Facial Landmark Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recurrent Color Constancy", @@ -13090,7 +13924,7 @@ "status": "Poster", "track": "main", "pid": "2980", - "author_site": "Yanlin Qian; Ke Chen; Jarno Nikkanen; Joni-Kristian K\u00c3\u00a4m\u00c3\u00a4r\u00c3\u00a4inen; Ji\u00c5\u0099\u00c3\u00ad Matas", + "author_site": "Yanlin Qian; Ke Chen; Jarno Nikkanen; Joni-Kristian Kämäräinen; Jiří Matas", "author": "Yanlin Qian; Ke Chen; Jarno Nikkanen; Joni-Kristian Kamarainen; Jiri Matas", "abstract": "We introduce a novel formulation of temporal color constancy which considers multiple frames preceding the frame for which illumination is estimated. We propose an end-to-end trainable recurrent color constancy network -- the RCC-Net -- which exploits convolutional LSTMs and a simulated sequence to learn compositional representations in space and time. We use a standard single frame color constancy benchmark, the SFU Gray Ball Dataset, which can be adapted to a temporal setting. 
Extensive experiments show that the proposed method consistently outperforms single-frame state-of-the-art methods and their temporal variants.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Qian_Recurrent_Color_Constancy_ICCV_2017_paper.pdf", @@ -13108,14 +13942,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Qian_Recurrent_Color_Constancy_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0+2;0+2", - "aff_unique_norm": "Tampere University of Technology;Intel;Czech Technical University in Prague", - "aff_unique_dep": "Laboratory of Signal Processing;Intel Corporation;Center for Machine Perception", + "aff_unique_norm": "Tampere University of Technology;Intel Corporation;Czech Technical University in Prague", + "aff_unique_dep": "Laboratory of Signal Processing;;Center for Machine Perception", "aff_unique_url": "https://www.tut.fi;https://www.intel.com;https://www.cvut.cz", "aff_unique_abbr": "TUT;Intel;CTU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;0;0;0+1;0+1", - "aff_country_unique": "Finland;Czech Republic" + "aff_country_unique": "Finland;Czech Republic", + "bibtex": "@InProceedings{Qian_2017_ICCV,\n \n author = {\n Qian,\n Yanlin and Chen,\n Ke and Nikkanen,\n Jarno and Kamarainen,\n Joni-Kristian and Matas,\n Jiri\n},\n title = {\n Recurrent Color Constancy\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recurrent Models for Situation Recognition", @@ -13123,6 +13958,7 @@ "status": "Poster", "track": "main", "pid": "195", + "author_site": "Arun Mallya; Svetlana Lazebnik", "author": "Arun Mallya; Svetlana Lazebnik", "abstract": "This work proposes Recurrent Neural Network (RNN) models to predict structured 'image situations' -- actions and noun entities fulfilling semantic roles related to the action. 
In contrast to prior work relying on Conditional Random Fields (CRFs), we use a specialized action prediction network followed by an RNN for noun prediction. Our system obtains state-of-the-art accuracy on the challenging recent imSitu dataset, beating CRF-based models, including ones trained with additional data. Further, we show that specialized features learned from situation prediction can be transferred to the task of image captioning to more accurately describe human-object interactions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mallya_Recurrent_Models_for_ICCV_2017_paper.pdf", @@ -13140,14 +13976,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Mallya_Recurrent_Models_for_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mallya_2017_ICCV,\n \n author = {\n Mallya,\n Arun and Lazebnik,\n Svetlana\n},\n title = {\n Recurrent Models for Situation Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recurrent Multimodal Interaction for Referring Image Segmentation", @@ -13155,6 +13992,7 @@ "status": "Poster", "track": "main", "pid": "459", + "author_site": "Chenxi Liu; Zhe Lin; Xiaohui Shen; Jimei Yang; Xin Lu; Alan Yuille", "author": "Chenxi Liu; Zhe Lin; Xiaohui Shen; Jimei Yang; Xin Lu; Alan Yuille", "abstract": "In this paper we are interested in the problem of image segmentation given natural language descriptions, i.e. referring expressions. 
Existing works tackle this problem by first modeling images and sentences independently and then segment images by combining these two types of representations. We argue that learning word-to-image interaction is more native in the sense of jointly modeling two modalities for the image segmentation task, and we propose convolutional multimodal LSTM to encode the sequential interactions between individual words, visual information, and spatial information. We show that our proposed model outperforms the baseline model on benchmark datasets. In addition, we analyze the intermediate output of the proposed multimodal LSTM approach and empirically explain how this approach enforces a more effective word-to-image interaction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Recurrent_Multimodal_Interaction_ICCV_2017_paper.pdf", @@ -13179,7 +14017,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Chenxi and Lin,\n Zhe and Shen,\n Xiaohui and Yang,\n Jimei and Lu,\n Xin and Yuille,\n Alan\n},\n title = {\n Recurrent Multimodal Interaction for Referring Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recurrent Scale Approximation for Object Detection in CNN", @@ -13187,6 +14026,7 @@ "status": "Poster", "track": "main", "pid": "331", + "author_site": "Yu Liu; Hongyang Li; Junjie Yan; Fangyin Wei; Xiaogang Wang; Xiaoou Tang", "author": "Yu Liu; Hongyang Li; Junjie Yan; Fangyin Wei; Xiaogang Wang; Xiaoou Tang", "abstract": "Since convolutional neural network (CNN) lacks an inherent mechanism to handle large scale variations, we always need to compute feature maps multiple times for multi-scale object detection, which has the 
bottleneck of computational cost in practice. To address this, we devise a recurrent scale approximation (RSA) to compute feature map once only, and only through this map can we approximate the rest maps on other levels. At the core of RSA is the recursive rolling out mechanism: given an initial map on a particular scale, it generates the prediction on a smaller scale that is half the size of input. To further increase efficiency and accuracy, we (a): design a scale-forecast network to globally predict potential scales in the image since there is no need to compute maps on all levels of the pyramid. (b): propose a landmark retracing network (LRN) to retrace back locations of the regressed landmarks and generate a confidence score for each landmark; LRN can effectively alleviate false positives due to the accumulated error in RSA. The whole system could be trained end-to-end in a unified CNN framework. Experiments demonstrate that our proposed algorithm is superior against state-of-the-arts on face detection benchmarks and achieves comparable results for generic proposal generation. 
The source code of RSA is available at github.com/sciencefans/RSA-for-object-detection.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Recurrent_Scale_Approximation_ICCV_2017_paper.pdf", @@ -13204,14 +14044,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Recurrent_Scale_Approximation_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;0;0;1;1", - "aff_unique_norm": "SenseTime Group Limited;Chinese University of Hong Kong", + "aff_unique_norm": "SenseTime Group Limited;The Chinese University of Hong Kong", "aff_unique_dep": ";Multimedia Laboratory", "aff_unique_url": "https://www.sensetime.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "SenseTime;CUHK", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Yu and Li,\n Hongyang and Yan,\n Junjie and Wei,\n Fangyin and Wang,\n Xiaogang and Tang,\n Xiaoou\n},\n title = {\n Recurrent Scale Approximation for Object Detection in CNN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recurrent Topic-Transition GAN for Visual Paragraph Generation", @@ -13219,6 +14060,7 @@ "status": "Poster", "track": "main", "pid": "198", + "author_site": "Xiaodan Liang; Zhiting Hu; Hao Zhang; Chuang Gan; Eric P. Xing", "author": "Xiaodan Liang; Zhiting Hu; Hao Zhang; Chuang Gan; Eric P. Xing", "abstract": "A natural image usually conveys rich semantic content and can be viewed from different angles. Existing image description methods are largely restricted by small sets of biased visual paragraph annotations, and fail to cover rich underlying semantics. 
In this paper, we investigate a semi-supervised paragraph generative framework that is able to synthesize diverse and semantically coherent paragraph descriptions by reasoning over local semantic regions and exploiting linguistic knowledge. The proposed Recurrent Topic-Transition Generative Adversarial Network (RTT-GAN) builds an adversarial framework between a structured paragraph generator and multi-level paragraph discriminators. The paragraph generator generates sentences recurrently by incorporating region-based visual and language attention mechanisms at each step. The quality of generated paragraph sentences is assessed by multi-level adversarial discriminators from two aspects, namely, plausibility at sentence level and topic-transition coherence at paragraph level. The joint adversarial training of RTT-GAN drives the model to generate realistic paragraphs with smooth logical transition between sentence topics. Extensive quantitative experiments on image and video paragraph datasets demonstrate the effectiveness of our RTT-GAN in both supervised and semi-supervised settings. 
Qualitative results on telling diverse stories for an image verify the interpretability of RTT-GAN.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liang_Recurrent_Topic-Transition_GAN_ICCV_2017_paper.pdf", @@ -13233,7 +14075,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liang_Recurrent_Topic-Transition_GAN_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liang_Recurrent_Topic-Transition_GAN_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Liang_2017_ICCV,\n \n author = {\n Liang,\n Xiaodan and Hu,\n Zhiting and Zhang,\n Hao and Gan,\n Chuang and Xing,\n Eric P.\n},\n title = {\n Recurrent Topic-Transition GAN for Visual Paragraph Generation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Recursive Spatial Transformer (ReST) for Alignment-Free Face Recognition", @@ -13241,6 +14084,7 @@ "status": "Spotlight", "track": "main", "pid": "1189", + "author_site": "Wanglong Wu; Meina Kan; Xin Liu; Yi Yang; Shiguang Shan; Xilin Chen", "author": "Wanglong Wu; Meina Kan; Xin Liu; Yi Yang; Shiguang Shan; Xilin Chen", "abstract": "Convolutional Neural Network (CNN) has led to significant progress in face recognition. Currently most CNN-based face recognition methods follow a two-step pipeline, i.e. a detected face is first aligned to a canonical one pre-defined by a mean face shape, and then it is fed into a CNN to extract features for recognition. The alignment step transforms all faces to the same shape, which can cause loss of geometrical information which is helpful in distinguishing different subjects. Moreover, it is hard to define a single optimal shape for the following recognition, since faces have large diversity in facial features, e.g. poses, illumination, etc. 
To be free from the above problems with an independent alignment step, we introduce a Recursive Spatial Transformer (ReST) module into CNN, allowing face alignment to be jointly learned with face recognition in an end-to-end fashion. The designed ReST has an intrinsic recursive structure and is capable of progressively aligning faces to a canonical one, even those with large variations. To model non-rigid transformation, multiple ReST modules are organized in a hierarchical structure to account for different parts of faces. Overall, the proposed ReST can handle large face variations and non-rigid transformation, and is end-to-end learnable and adaptive to input, making it an effective alignment-free face recognition solution. Extensive experiments are performed on LFW and YTF datasets, and the proposed ReST outperforms those two-step methods, demonstrating its effectiveness.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wu_Recursive_Spatial_Transformer_ICCV_2017_paper.pdf", @@ -13256,7 +14100,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wu_Recursive_Spatial_Transformer_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wu_Recursive_Spatial_Transformer_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wu_2017_ICCV,\n \n author = {\n Wu,\n Wanglong and Kan,\n Meina and Liu,\n Xin and Yang,\n Yi and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n Recursive Spatial Transformer (ReST) for Alignment-Free Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Referring Expression Generation and Comprehension via Attributes", @@ -13264,6 +14109,7 @@ "status": "Poster", "track": "main", "pid": "2105", + "author_site": "Jingyu Liu; Liang Wang; Ming-Hsuan Yang", "author": "Jingyu Liu; Liang Wang; Ming-Hsuan Yang", 
"abstract": "Referring expression is a kind of language expression that used for referring to particular objects.To make the expression without ambiguation, people often use attributes to describe the particular object. In this paper, we explore the role of attributes by incorporating them into both referring expression generation and comprehension. We first train an attribute learning model from visual objects and their paired descriptions. Then in the generation task, we take the learned attributes as the input into the generation model, thus the expressions are generated driven by both attributes and the previous words. For comprehension, we embed the learned attributes with visual features and semantics into the common space model, then the target object is retrieved based on its ranking distance in the common space. Experimental results on the three standard datasets, RefCOCO, RefCOCO+, and RefCOCOg show significant improvements over the baseline model, demonstrating that our methods are effective for both tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Referring_Expression_Generation_ICCV_2017_paper.pdf", @@ -13288,7 +14134,8 @@ "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0+0+0;0+0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Jingyu and Wang,\n Liang and Yang,\n Ming-Hsuan\n},\n title = {\n Referring Expression Generation and Comprehension via Attributes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Reflectance Capture Using Univariate Sampling of BRDFs", @@ -13296,6 +14143,7 @@ "status": "Poster", "track": "main", "pid": "3015", + "author_site": "Zhuo Hui; Kalyan Sunkavalli; Joon-Young Lee; Sunil Hadap; Jian Wang; Aswin C. 
Sankaranarayanan", "author": "Zhuo Hui; Kalyan Sunkavalli; Joon-Young Lee; Sunil Hadap; Jian Wang; Aswin C. Sankaranarayanan", "abstract": "We propose the use of a light-weight setup consisting of a collocated camera and light source --- commonly found on mobile devices --- to reconstruct surface normals and spatially-varying BRDFs of near-planar material samples. A collocated setup provides only a 1-D \"univariate\" sampling of the 4-D BRDF. We show that a univariate sampling is sufficient to estimate parameters of commonly used analytical BRDF models. Subsequently, we use a dictionary-based reflectance prior to derive a robust technique for per-pixel normal and BRDF estimation. We demonstrate real-world shape and capture, and its application to material editing and classification, using real data acquired using a mobile phone.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hui_Reflectance_Capture_Using_ICCV_2017_paper.pdf", @@ -13320,7 +14168,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hui_2017_ICCV,\n \n author = {\n Hui,\n Zhuo and Sunkavalli,\n Kalyan and Lee,\n Joon-Young and Hadap,\n Sunil and Wang,\n Jian and Sankaranarayanan,\n Aswin C.\n},\n title = {\n Reflectance Capture Using Univariate Sampling of BRDFs\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Refractive Structure-From-Motion Through a Flat Refractive Interface", @@ -13328,7 +14177,7 @@ "status": "Poster", "track": "main", "pid": "2592", - "author_site": "Fran\u00c3\u00a7ois Chadebecq; Francisco Vasconcelos; George Dwyer; Ren\u00c3\u00a9 Lacher; S\u00c3\u00a9bastien Ourselin; Tom Vercauteren; Danail Stoyanov", + "author_site": "François Chadebecq; Francisco Vasconcelos; George Dwyer; René Lacher; 
Sébastien Ourselin; Tom Vercauteren; Danail Stoyanov", "author": "Francois Chadebecq; Francisco Vasconcelos; George Dwyer; Rene Lacher; Sebastien Ourselin; Tom Vercauteren; Danail Stoyanov", "abstract": "Recovering 3D scene geometry from underwater images involves the Refractive Structure-from-Motion (RSfM) problem, where the image distortions caused by light refraction at the interface between different propagation media invalidates the single view point assumption. Direct use of the pinhole camera model in RSfM leads to inaccurate camera pose estimation and consequently drift. RSfM methods have been thoroughly studied for the case of a thick glass interface that assumes two refractive interfaces between the camera and the viewed scene. On the other hand, when the camera lens is in direct contact with the water, there is only one refractive interface. By explicitly considering a refractive interface, we develop a succinct derivation of the refractive fundamental matrix in the form of the generalised epipolar constraint for an axial camera. We use the refractive fundamental matrix to refine initial pose estimates obtained by assuming the pinhole model. This strategy allows us to robustly estimate underwater camera poses, where other methods suffer from poor noise-sensitivity. We also formulate a new four view constraint enforcing camera pose consistency along a video which leads us to a novel RSfM framework. 
For validation we use synthetic data to show the numerical properties of our method and we provide results on real data to demonstrate performance within laboratory settings and for applications in endoscopy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chadebecq_Refractive_Structure-From-Motion_Through_ICCV_2017_paper.pdf", @@ -13353,7 +14202,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0;0;0;0+0", "aff_campus_unique": "London", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Chadebecq_2017_ICCV,\n \n author = {\n Chadebecq,\n Francois and Vasconcelos,\n Francisco and Dwyer,\n George and Lacher,\n Rene and Ourselin,\n Sebastien and Vercauteren,\n Tom and Stoyanov,\n Danail\n},\n title = {\n Refractive Structure-From-Motion Through a Flat Refractive Interface\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Region-Based Correspondence Between 3D Shapes via Spatially Smooth Biclustering", @@ -13361,7 +14211,7 @@ "status": "Poster", "track": "main", "pid": "1811", - "author_site": "Matteo Denitto; Simone Melzi; Manuele Bicego; Umberto Castellani; Alessandro Farinelli; M\u00c3\u00a1rio A. T. Figueiredo; Yanir Kleiman; Maks Ovsjanikov", + "author_site": "Matteo Denitto; Simone Melzi; Manuele Bicego; Umberto Castellani; Alessandro Farinelli; Mário A. T. Figueiredo; Yanir Kleiman; Maks Ovsjanikov", "author": "Matteo Denitto; Simone Melzi; Manuele Bicego; Umberto Castellani; Alessandro Farinelli; Mario A. T. Figueiredo; Yanir Kleiman; Maks Ovsjanikov", "abstract": "Region-based correspondence (RBC) is a highly relevant and non-trivial computer vision problem. Given two 3D shapes, RBC seeks segments/regions on these shapes that can be reliably put in correspondence. 
The problem thus consists both in finding the regions and determining the correspondences between them. This problem statement is similar to that of \"biclustering\", implying that RBC can be cast as a biclustering problem. Here, we exploit this implication by tackling RBC via a novel biclustering approach, called S4B (spatially smooth spike and slab biclustering), which: (i) casts the problem in a probabilistic low-rank matrix factorization perspective; (ii) uses a spike and slab prior to induce sparsity; (iii) is enriched with a spatial smoothness prior, based on geodesic distances, encouraging nearby vertices to belong to the same bicluster. This type of spatial prior cannot be used in classical biclustering techniques. We test the proposed approach on the FAUST dataset, outperforming both state-of-the-art RBC techniques and classical biclustering methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Denitto_Region-Based_Correspondence_Between_ICCV_2017_paper.pdf", @@ -13382,11 +14232,12 @@ "aff_unique_norm": "University of Verona;Universidade de Lisboa;Ecole Polytechnique", "aff_unique_dep": ";;", "aff_unique_url": "https://www.univr.it;https://www.ulisboa.pt;https://www.polytechnique.edu", - "aff_unique_abbr": "UniVR;ULisboa;X", + "aff_unique_abbr": "UniVR;ULisboa;Polytechnique", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;2;2", - "aff_country_unique": "Italy;Portugal;France" + "aff_country_unique": "Italy;Portugal;France", + "bibtex": "@InProceedings{Denitto_2017_ICCV,\n \n author = {\n Denitto,\n Matteo and Melzi,\n Simone and Bicego,\n Manuele and Castellani,\n Umberto and Farinelli,\n Alessandro and Figueiredo,\n Mario A. T. 
and Kleiman,\n Yanir and Ovsjanikov,\n Maks\n},\n title = {\n Region-Based Correspondence Between 3D Shapes via Spatially Smooth Biclustering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Regional Interactive Image Segmentation Networks", @@ -13394,6 +14245,7 @@ "status": "Poster", "track": "main", "pid": "1251", + "author_site": "Jun Hao Liew; Yunchao Wei; Wei Xiong; Sim-Heng Ong; Jiashi Feng", "author": "Jun Hao Liew; Yunchao Wei; Wei Xiong; Sim-Heng Ong; Jiashi Feng", "abstract": "The interactive image segmentation model allows users to iteratively add new inputs for refinement until a satisfactory result is finally obtained. Therefore, an ideal interactive segmentation model should learn to capture the user's intention with minimal interaction. However, existing models fail to fully utilize the valuable user input information in the segmentation refinement process and thus offer an unsatisfactory user experience. In order to fully exploit the user-provided information, we propose a new deep framework, called Regional Interactive Segmentation Network (RIS-Net), to expand the field-of-view of the given inputs to capture the local regional information surrounding them for local refinement. Additionally, RIS-Net adopts multiscale global contextual information to augment each local region for improving feature representation. We also introduce click discount factors to develop a novel optimization strategy for more effective end-to-end training. 
Comprehensive evaluations on four challenging datasets well demonstrate the superiority of the proposed RIS-Net over other state-of-the-art approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liew_Regional_Interactive_Image_ICCV_2017_paper.pdf", @@ -13418,7 +14270,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Liew_2017_ICCV,\n \n author = {\n Hao Liew,\n Jun and Wei,\n Yunchao and Xiong,\n Wei and Ong,\n Sim-Heng and Feng,\n Jiashi\n},\n title = {\n Regional Interactive Image Segmentation Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Representation Learning by Learning to Count", @@ -13426,6 +14279,7 @@ "status": "Oral", "track": "main", "pid": "1044", + "author_site": "Mehdi Noroozi; Hamed Pirsiavash; Paolo Favaro", "author": "Mehdi Noroozi; Hamed Pirsiavash; Paolo Favaro", "abstract": "We introduce a novel method for representation learning that uses an artificial supervision signal based on counting visual primitives. This supervision signal is obtained from an equivariance relation, which does not require any manual annotation. We relate transformations of images to transformations of the representations. More specifically, we look for the representation that satisfies such relation rather than the transformations that match a given representation. In this paper, we use two image transformations in the context of counting: scaling and tiling. The first transformation exploits the fact that the number of visual primitives should be invariant to scale. The second transformation allows us to equate the total number of visual primitives in each tile to that in the whole image. 
These two transformations are combined in one constraint and used to train a neural network with a contrastive loss. The proposed task produces representations that perform on par or exceed the state of the art in transfer learning benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Noroozi_Representation_Learning_by_ICCV_2017_paper.pdf", @@ -13450,7 +14304,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Baltimore County", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Noroozi_2017_ICCV,\n \n author = {\n Noroozi,\n Mehdi and Pirsiavash,\n Hamed and Favaro,\n Paolo\n},\n title = {\n Representation Learning by Learning to Count\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Rethinking Reprojection: Closing the Loop for Pose-Aware Shape Reconstruction From a Single Image", @@ -13458,6 +14313,7 @@ "status": "Spotlight", "track": "main", "pid": "785", + "author_site": "Rui Zhu; Hamed Kiani Galoogahi; Chaoyang Wang; Simon Lucey", "author": "Rui Zhu; Hamed Kiani Galoogahi; Chaoyang Wang; Simon Lucey", "abstract": "An emerging problem in computer vision is the reconstruction of 3D shape and pose of an object from a single image. Hitherto, the problem has been addressed through the application of canonical deep learning methods to regress from the image directly to the 3D shape and pose labels. These approaches, however, are problematic from two perspectives. First, they are minimizing the error between 3D shapes and pose labels - with little thought about the nature of this \"label error\" when reprojecting the shape back onto the image. Second, they rely on the onerous and ill-posed task of hand labeling natural images with respect to 3D shape and pose. 
In this paper we define the new task of pose-aware shape reconstruction from a single image, and we advocate that cheaper 2D annotations of objects silhouettes in natural images can be utilized. We design architectures of pose-aware shape reconstruction which reproject the predicted shape back on to the image using the predicted pose. Our evaluation on several object categories demonstrates the superiority of our method for predicting pose-aware 3D shapes from natural images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Rethinking_Reprojection_Closing_ICCV_2017_paper.pdf", @@ -13482,7 +14338,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Rui and Kiani Galoogahi,\n Hamed and Wang,\n Chaoyang and Lucey,\n Simon\n},\n title = {\n Rethinking Reprojection: Closing the Loop for Pose-Aware Shape Reconstruction From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Revisiting Cross-Channel Information Transfer for Chromatic Aberration Correction", @@ -13490,6 +14347,7 @@ "status": "Poster", "track": "main", "pid": "1326", + "author_site": "Tiancheng Sun; Yifan Peng; Wolfgang Heidrich", "author": "Tiancheng Sun; Yifan Peng; Wolfgang Heidrich", "abstract": "Image aberrations can cause severe degradation in image quality for consumer-level cameras, especially under the current tendency to reduce the complexity of lens designs in order to shrink the overall size of modules. In simplified optical designs, chromatic aberration can be one of the most significant causes for degraded image quality, and it can be quite difficult to remove in post-processing, since it results in strong blurs in at least some of the color channels. 
In this work, we revisit the pixel-wise similarity between different color channels of the image and accordingly propose a novel algorithm for correcting chromatic aberration based on this cross-channel correlation. In contrast to recent weak prior-based models, ours uses strong pixel-wise fitting and transfer, which lead to significant quality improvements for large chromatic aberrations. Experimental results on both synthetic and real world images captured by different optical systems demonstrate that the chromatic aberration can be significantly reduced using our approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sun_Revisiting_Cross-Channel_Information_ICCV_2017_paper.pdf", @@ -13514,7 +14372,8 @@ "aff_campus_unique_index": "0+1;2+1;1+2", "aff_campus_unique": "Beijing;Thuwal;Vancouver", "aff_country_unique_index": "0+1;2+1;1+2", - "aff_country_unique": "China;Saudi Arabia;Canada" + "aff_country_unique": "China;Saudi Arabia;Canada", + "bibtex": "@InProceedings{Sun_2017_ICCV,\n \n author = {\n Sun,\n Tiancheng and Peng,\n Yifan and Heidrich,\n Wolfgang\n},\n title = {\n Revisiting Cross-Channel Information Transfer for Chromatic Aberration Correction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Revisiting IM2GPS in the Deep Learning Era", @@ -13522,6 +14381,7 @@ "status": "Poster", "track": "main", "pid": "1126", + "author_site": "Nam Vo; Nathan Jacobs; James Hays", "author": "Nam Vo; Nathan Jacobs; James Hays", "abstract": "Image geolocalization, inferring the geographic location of an image, is a challenging computer vision problem with many potential applications. The recent state-of-the-art approach to this problem is a deep image classification approach in which the world is spatially divided into bins and a deep network is trained to predict the correct bin for a given image. 
We propose to combine this approach with the original Im2GPS approach in which a query image is matched against a database of geotagged images and the location is inferred from the retrieved set. We estimate the geographic location of a query image by applying kernel density estimation to the locations of its nearest neighbors in the reference database. Interestingly, we find that the best features for our retrieval task are derived from networks trained with classification loss even though we do not use a classification approach at test time. Training with classification loss outperforms several deep feature learning methods (e.g. Siamese networks with contrastive of triplet loss) more typical for retrieval applications. Our simple approach achieves state-of-the-art geolocalization accuracy while also requiring significantly less training data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Vo_Revisiting_IM2GPS_in_ICCV_2017_paper.pdf", @@ -13546,7 +14406,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Vo_2017_ICCV,\n \n author = {\n Vo,\n Nam and Jacobs,\n Nathan and Hays,\n James\n},\n title = {\n Revisiting IM2GPS in the Deep Learning Era\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Revisiting Unreasonable Effectiveness of Data in Deep Learning Era", @@ -13554,6 +14415,7 @@ "status": "Spotlight", "track": "main", "pid": "1498", + "author_site": "Chen Sun; Abhinav Shrivastava; Saurabh Singh; Abhinav Gupta", "author": "Chen Sun; Abhinav Shrivastava; Saurabh Singh; Abhinav Gupta", "abstract": "The success of deep learning in vision can be attributed to: (a) models with high capacity; (b) increased computational power; and (c) availability of large-scale labeled data. 
Since 2012, there have been significant advances in representation capabilities of the models and computational capabilities of GPUs. But the size of the biggest dataset has surprisingly remained constant. What will happen if we increase the dataset size by 10x or 100x? This paper takes a step towards clearing the clouds of mystery surrounding the relationship between `enormous data' and visual deep learning. By exploiting the JFT-300M dataset which has more than 375M noisy labels for 300M images, we investigate how the performance of current vision tasks would change if this data was used for representation learning. Our paper delivers some surprising (and some expected) findings. First, we find that the performance on vision tasks increases logarithmically based on volume of training data size. Second, we show that representation learning (or pre-training) still holds a lot of promise. One can improve performance on many vision tasks by just training a better base model. Finally, as expected, we present new state-of-the-art results for different vision tasks including image classification, object detection, semantic segmentation and human pose estimation. 
Our sincere hope is that this inspires vision community to not undervalue the data and develop collective efforts in building larger datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sun_Revisiting_Unreasonable_Effectiveness_ICCV_2017_paper.pdf", @@ -13569,7 +14431,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sun_Revisiting_Unreasonable_Effectiveness_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sun_Revisiting_Unreasonable_Effectiveness_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Sun_2017_ICCV,\n \n author = {\n Sun,\n Chen and Shrivastava,\n Abhinav and Singh,\n Saurabh and Gupta,\n Abhinav\n},\n title = {\n Revisiting Unreasonable Effectiveness of Data in Deep Learning Era\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Robust Hand Pose Estimation During the Interaction With an Unknown Object", @@ -13577,6 +14440,7 @@ "status": "Poster", "track": "main", "pid": "1396", + "author_site": "Chiho Choi; Sang Ho Yoon; Chin-Ning Chen; Karthik Ramani", "author": "Chiho Choi; Sang Ho Yoon; Chin-Ning Chen; Karthik Ramani", "abstract": "This paper proposes a robust solution for accurate 3D hand pose estimation in the presence of an external object interacting with hands. Our main insight is that the shape of an object causes a configuration of the hand in the form of a hand grasp. Along this line, we simultaneously train deep neural networks using paired depth images. The object-oriented network learns functional grasps from an object perspective, whereas the hand-oriented network explores the details of hand configurations from a hand perspective. The two networks share intermediate observations produced from different perspectives to create a more informed representation. 
Our system then collaboratively classifies the grasp types and orientation of the hand and further constrains a pose space using these estimates. Finally, we collectively refine the unknown pose parameters to reconstruct the final hand pose. To this end, we conduct extensive evaluations to validate the efficacy of the proposed collaborative learning approach by comparing it with self-generated baselines and the state-of-the-art method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Choi_Robust_Hand_Pose_ICCV_2017_paper.pdf", @@ -13601,7 +14465,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Choi_2017_ICCV,\n \n author = {\n Choi,\n Chiho and Ho Yoon,\n Sang and Chen,\n Chin-Ning and Ramani,\n Karthik\n},\n title = {\n Robust Hand Pose Estimation During the Interaction With an Unknown Object\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Robust Kronecker-Decomposable Component Analysis for Low-Rank Modeling", @@ -13609,6 +14474,7 @@ "status": "Poster", "track": "main", "pid": "1329", + "author_site": "Mehdi Bahri; Yannis Panagakis; Stefanos Zafeiriou", "author": "Mehdi Bahri; Yannis Panagakis; Stefanos Zafeiriou", "abstract": "Dictionary learning and component analysis are part of one of the most well-studied and active research fields, at the intersection of signal and image processing, computer vision, and statistical machine learning. In dictionary learning, the current methods of choice are arguably K-SVD and its variants, which learn a dictionary (i.e., a decomposition) for sparse coding via Singular Value Decomposition. 
In robust component analysis, leading methods derive from Principal Component Pursuit (PCP), which recovers a low-rank matrix from sparse corruptions of unknown magnitude and support. However, K-SVD is sensitive to the presence of noise and outliers in the training set. Additionally, PCP does not provide a dictionary that respects the structure of the data (e.g., images), and requires expensive SVD computations when solved by convex relaxation. In this paper, we introduce a new robust decomposition of images by combining ideas from sparse dictionary learning and PCP. We propose a novel Kronecker-decomposable component analysis which is robust to gross corruption, can be used for low-rank modeling, and leverages separability to solve significantly smaller problems. We design an efficient learning algorithm by drawing links with a restricted form of tensor factorization. The effectiveness of the proposed approach is demonstrated on real-world applications, namely background subtraction and image denoising, by performing a thorough comparison with the current state of the art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bahri_Robust_Kronecker-Decomposable_Component_ICCV_2017_paper.pdf", @@ -13633,7 +14499,8 @@ "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";London", "aff_country_unique_index": "0+0+1;0+0;0+1", - "aff_country_unique": "United Kingdom;Finland" + "aff_country_unique": "United Kingdom;Finland", + "bibtex": "@InProceedings{Bahri_2017_ICCV,\n \n author = {\n Bahri,\n Mehdi and Panagakis,\n Yannis and Zafeiriou,\n Stefanos\n},\n title = {\n Robust Kronecker-Decomposable Component Analysis for Low-Rank Modeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Robust Object Tracking Based on Temporal and Spatial Deep Networks", @@ -13641,6 +14508,7 @@ "status": "Poster", "track": "main", "pid": "456", + "author_site": 
"Zhu Teng; Junliang Xing; Qiang Wang; Congyan Lang; Songhe Feng; Yi Jin", "author": "Zhu Teng; Junliang Xing; Qiang Wang; Congyan Lang; Songhe Feng; Yi Jin", "abstract": "Recently deep neural networks have been widely employed to deal with the visual tracking problem. In this work, we present a new deep architecture which incorporates the temporal and spatial information to boost the tracking performance. Our deep architecture contains three networks, a Feature Net, a Temporal Net, and a Spatial Net. The Feature Net extracts general feature representations of the target. With these feature representations, the Temporal Net encodes the trajectory of the target and directly learns temporal correspondences to estimate the object state from a global perspective. Based on the learning results of the Temporal Net, the Spatial Net further refines the object tracking state using local spatial object information. Extensive experiments on four of the largest tracking benchmarks, including VOT2014, VOT2016, OTB50, and OTB100, demonstrate competing performance of the proposed tracker over a number of state-of-the-art algorithms.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Teng_Robust_Object_Tracking_ICCV_2017_paper.pdf", @@ -13658,14 +14526,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Teng_Robust_Object_Tracking_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;0;0;0", - "aff_unique_norm": "Beijing Jiao Tong University;Chinese Academy of Sciences", + "aff_unique_norm": "Beijing Jiaotong University;Chinese Academy of Sciences", "aff_unique_dep": "School of Computer and Information Technology;Institute of Automation", "aff_unique_url": "http://www.bjtu.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "BJTU;CAS", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Teng_2017_ICCV,\n \n author = {\n Teng,\n Zhu and Xing,\n Junliang and Wang,\n Qiang and Lang,\n Congyan and Feng,\n Songhe and Jin,\n Yi\n},\n title = {\n Robust Object Tracking Based on Temporal and Spatial Deep Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Robust Pseudo Random Fields for Light-Field Stereo Matching", @@ -13673,6 +14542,7 @@ "status": "Oral", "track": "main", "pid": "212", + "author_site": "Chao-Tsung Huang", "author": "Chao-Tsung Huang", "abstract": "Markov Random Fields are widely used to model light-field stereo matching problems. However, most previous approaches used fixed parameters and did not adapt to light-field statistics. Instead, they explored explicit vision cues to provide local adaptability and thus enhanced depth quality. But such additional assumptions could end up confining their applicability, e.g. algorithms designed for dense light fields are not suitable for sparse ones. In this paper, we develop an empirical Bayesian framework--Robust Pseudo Random Field--to explore intrinsic statistical cues for broad applicability. Based on pseudo-likelihood, it applies soft expectation-maximization (EM) for good model fitting and hard EM for robust depth estimation. We introduce novel pixel difference models to enable such adaptability and robustness simultaneously. We also devise an algorithm to employ this framework on dense, sparse, and even denoised light fields. Experimental results show that it estimates scene-dependent parameters robustly and converges quickly. 
In terms of depth accuracy and computation speed, it also outperforms state-of-the-art algorithms constantly.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Robust_Pseudo_Random_ICCV_2017_paper.pdf", @@ -13697,7 +14567,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2017_ICCV,\n \n author = {\n Huang,\n Chao-Tsung\n},\n title = {\n Robust Pseudo Random Fields for Light-Field Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Robust Video Super-Resolution With Learned Temporal Dynamics", @@ -13705,6 +14576,7 @@ "status": "Poster", "track": "main", "pid": "1152", + "author_site": "Ding Liu; Zhaowen Wang; Yuchen Fan; Xianming Liu; Zhangyang Wang; Shiyu Chang; Thomas Huang", "author": "Ding Liu; Zhaowen Wang; Yuchen Fan; Xianming Liu; Zhangyang Wang; Shiyu Chang; Thomas Huang", "abstract": "Video super-resolution (SR) aims to generate a high-resolution (HR) frame from multiple low-resolution (LR) frames. The inter-frame temporal relation is as crucial as the intra-frame spatial relation for tackling this problem. However, how to utilize temporal information efficiently and effectively remains challenging since complex motion is difficult to model and can introduce adverse effects if not handled properly. We address this problem from two aspects. First, we propose a temporal adaptive neural network that can adaptively determine the optimal scale of temporal dependency. Filters on various temporal scales are applied to the input LR sequence before their responses are adaptively aggregated. 
Second, we reduce the complexity of motion between neighboring frames using a spatial alignment network that is much more robust and efficient than competing alignment methods and can be jointly trained with the temporal adaptive network in an end-to-end manner. Our proposed models with learned temporal dynamics are systematically evaluated on public video datasets and achieve state-of-the-art SR results compared with other recent video SR approaches. Both of the temporal adaptation and the spatial alignment modules are demonstrated to considerably improve SR quality over their plain counterparts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Robust_Video_Super-Resolution_ICCV_2017_paper.pdf", @@ -13720,7 +14592,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Robust_Video_Super-Resolution_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Robust_Video_Super-Resolution_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Ding and Wang,\n Zhaowen and Fan,\n Yuchen and Liu,\n Xianming and Wang,\n Zhangyang and Chang,\n Shiyu and Huang,\n Thomas\n},\n title = {\n Robust Video Super-Resolution With Learned Temporal Dynamics\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Rolling Shutter Correction in Manhattan World", @@ -13728,7 +14601,7 @@ "status": "Poster", "track": "main", "pid": "341", - "author_site": "Pulak Purkait; Christopher Zach; Ale\u00c5\u00a1 Leonardis", + "author_site": "Pulak Purkait; Christopher Zach; Aleš Leonardis", "author": "Pulak Purkait; Christopher Zach; Ales Leonardis", "abstract": "A vast majority of consumer cameras operate the rolling shutter mechanism, which often produces distorted images due to inter-row delay while capturing an image. 
Recent methods for monocular rolling shutter compensation utilize blur kernel, straightness of line segments, as well as angle and length preservation. However, they do not incorporate scene geometry explicitly for rolling shutter correction, therefore, information about the 3D scene geometry is often distorted by the correction process. In this paper we propose a novel method which leverages geometric properties of the scene---in particular vanishing directions---to estimate the camera motion during rolling shutter exposure from a single distorted image. The proposed method jointly estimates the orthogonal vanishing directions and the rolling shutter camera motion. We performed extensive experiments on synthetic and real datasets which demonstrate the benefits of our approach both in terms of qualitative and quantitative results (in terms of a geometric structure fitting) as well as with respect to computation time.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Purkait_Rolling_Shutter_Correction_ICCV_2017_paper.pdf", @@ -13753,7 +14626,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Cambridge;Birmingham", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Purkait_2017_ICCV,\n \n author = {\n Purkait,\n Pulak and Zach,\n Christopher and Leonardis,\n Ales\n},\n title = {\n Rolling Shutter Correction in Manhattan World\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Rolling-Shutter-Aware Differential SfM and Image Rectification", @@ -13761,6 +14635,7 @@ "status": "Poster", "track": "main", "pid": "596", + "author_site": "Bingbing Zhuang; Loong-Fah Cheong; Gim Hee Lee", "author": "Bingbing Zhuang; Loong-Fah Cheong; Gim Hee Lee", "abstract": "In this paper, we develop a modified differential Structure from Motion (SfM) 
algorithm that can estimate relative pose from two frames despite of Rolling Shutter (RS) artifacts. In particular, we show that under constant velocity assumption, the errors induced by the rolling shutter effect can be easily rectified by a linear scaling operation on each optical flow. We further propose a 9-point algorithm to recover the relative pose of a rolling shutter camera that undergoes constant acceleration motion. We demonstrate that the dense depth maps recovered from the relative pose of the RS camera can be used in a RS-aware warping for image rectification to recover high-quality Global Shutter (GS) images. Experiments on both synthetic and real RS images show that our RS-aware differential SfM algorithm produces more accurate results on relative pose estimation and 3D reconstruction from images distorted by RS effect compared to standard SfM algorithms that assume a GS camera model. We also demonstrate that our RS-aware warping for image rectification method outperforms state-of-the-art commercial software products, i.e. 
Adobe After Effects and Apple Imovie, at removing RS artifacts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhuang_Rolling-Shutter-Aware_Differential_SfM_ICCV_2017_paper.pdf", @@ -13775,7 +14650,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhuang_Rolling-Shutter-Aware_Differential_SfM_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhuang_Rolling-Shutter-Aware_Differential_SfM_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhuang_2017_ICCV,\n \n author = {\n Zhuang,\n Bingbing and Cheong,\n Loong-Fah and Hee Lee,\n Gim\n},\n title = {\n Rolling-Shutter-Aware Differential SfM and Image Rectification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "RoomNet: End-To-End Room Layout Estimation", @@ -13783,6 +14659,7 @@ "status": "Poster", "track": "main", "pid": "2122", + "author_site": "Chen-Yu Lee; Vijay Badrinarayanan; Tomasz Malisiewicz; Andrew Rabinovich", "author": "Chen-Yu Lee; Vijay Badrinarayanan; Tomasz Malisiewicz; Andrew Rabinovich", "abstract": "This paper focuses on the task of room layout estimation from a monocular RGB image. Prior works break the problem into two sub-tasks: semantic segmentation of floor, walls, ceiling to produce layout hypotheses, followed by an iterative optimization step to rank these hypotheses. In contrast, we adopt a more direct formulation of this problem as one of estimating an ordered set of room layout keypoints. The room layout and the corresponding segmentation is completely specified given the locations of these ordered keypoints. We predict the locations of the room layout keypoints using RoomNet, an end-to-end trainable encoder-decoder network. 
On the challenging benchmark datasets Hedau and LSUN, we achieve state-of-the-art performance along with 200x to 600x speedup compared to the most recent work. Additionally, we present optional extensions to the RoomNet architecture such as including recurrent computations and memory units to refine the keypoint locations under the same parametric capacity.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_RoomNet_End-To-End_Room_ICCV_2017_paper.pdf", @@ -13807,7 +14684,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Chen-Yu and Badrinarayanan,\n Vijay and Malisiewicz,\n Tomasz and Rabinovich,\n Andrew\n},\n title = {\n RoomNet: End-To-End Room Layout Estimation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Rotation Equivariant Vector Field Networks", @@ -13815,6 +14693,7 @@ "status": "Poster", "track": "main", "pid": "2249", + "author_site": "Diego Marcos; Michele Volpi; Nikos Komodakis; Devis Tuia", "author": "Diego Marcos; Michele Volpi; Nikos Komodakis; Devis Tuia", "abstract": "In many computer vision tasks, we expect a particular behavior of the output with respect to rotations of the input image. If this relationship is explicitly encoded, instead of treated as any other variation, the complexity of the problem is decreased, leading to a reduction in the size of the required model. We propose Rotation Equivariant vector field Networks (RotEqNet) to encode rotation equivariance and invariance into Convolutional Neural Networks (CNNs). Each convolutional filter is applied at multiple orientations and returns a vector field that represents the magnitude and angle of the highest scoring orientation at every spatial location. 
A modified convolution operator using vector fields as inputs and filters can then be applied to obtain deep architectures. We test RotEqNet on several problems requiring different responses with respect to the inputs' rotation: image classification, biomedical image segmentation, orientation estimation and patch matching. In all cases, we show that RotEqNet offers very compact models in terms of number of parameters and provides results in line to those of networks orders of magnitude larger.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Marcos_Rotation_Equivariant_Vector_ICCV_2017_paper.pdf", @@ -13834,12 +14713,13 @@ "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Zurich;Ecole des Ponts ParisTech", "aff_unique_dep": ";", - "aff_unique_url": "https://www.unizh.ch;https://www.ponts.org", + "aff_unique_url": "https://www.uzh.ch;https://www.ponts.org", "aff_unique_abbr": "UZH;ENPC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Switzerland;France" + "aff_country_unique": "Switzerland;France", + "bibtex": "@InProceedings{Marcos_2017_ICCV,\n \n author = {\n Marcos,\n Diego and Volpi,\n Michele and Komodakis,\n Nikos and Tuia,\n Devis\n},\n title = {\n Rotation Equivariant Vector Field Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Rotational Subgroup Voting and Pose Clustering for Robust 3D Object Recognition", @@ -13847,6 +14727,7 @@ "status": "Poster", "track": "main", "pid": "1720", + "author_site": "Anders Glent Buch; Lilita Kiforenko; Dirk Kraft", "author": "Anders Glent Buch; Lilita Kiforenko; Dirk Kraft", "abstract": "It is possible to associate a highly constrained subset of relative 6 DoF poses between two 3D shapes, as long as the local surface orientation, the normal vector, is available at every surface point. 
Local shape features can be used to find putative point correspondences between the models due to their ability to handle noisy and incomplete data. However, this correspondence set is usually contaminated by outliers in practical scenarios, which has led to many past contributions based on robust detectors such as the Hough transform or RANSAC. The key insight of our work is that a single correspondence between oriented points on the two models is constrained to cast votes in a 1 DoF rotational subgroup of the full group of poses, SE(3). Kernel density estimation allows combining the set of votes efficiently to determine a full 6 DoF candidate pose between the models. This modal pose with the highest density is stable under challenging conditions, such as noise, clutter, and occlusions, and provides the output estimate of our method. We first analyze the robustness of our method in relation to noise and show that it handles high outlier rates much better than RANSAC for the task of 6 DoF pose estimation. We then apply our method to four state of the art data sets for 3D object recognition that contain occluded and cluttered scenes. 
Our method achieves perfect recall on two LIDAR data sets and outperforms competing methods on two RGB-D data sets, thus setting a new standard for general 3D object recognition using point cloud data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Buch_Rotational_Subgroup_Voting_ICCV_2017_paper.pdf", @@ -13871,7 +14752,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Denmark" + "aff_country_unique": "Denmark", + "bibtex": "@InProceedings{Buch_2017_ICCV,\n \n author = {\n Glent Buch,\n Anders and Kiforenko,\n Lilita and Kraft,\n Dirk\n},\n title = {\n Rotational Subgroup Voting and Pose Clustering for Robust 3D Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "S3FD: Single Shot Scale-Invariant Face Detector", @@ -13879,6 +14761,7 @@ "status": "Poster", "track": "main", "pid": "22", + "author_site": "Shifeng Zhang; Xiangyu Zhu; Zhen Lei; Hailin Shi; Xiaobo Wang; Stan Z. Li", "author": "Shifeng Zhang; Xiangyu Zhu; Zhen Lei; Hailin Shi; Xiaobo Wang; Stan Z. Li", "abstract": "This paper presents a real-time face detector, named Single Shot Scale-invariant Face Detector (S3FD), which performs superiorly on various scales of faces with a single deep neural network, especially for small faces. Specifically, we try to solve the common problem that anchor-based detectors deteriorate dramatically as the objects become smaller. We make contributions in the following three aspects: 1) proposing a scale-equitable face detection framework to handle different scales of faces well. We tile anchors on a wide range of layers to ensure that all scales of faces have enough features for detection. 
Besides, we design anchor scales based on the effective receptive field and a proposed equal proportion interval principle; 2) improving the recall rate of small faces by a scale compensation anchor matching strategy; 3) reducing the false positive rate of small faces via a max-out background label. As a consequence, our method achieves state-of-the-art detection performance on all the common face detection benchmarks, including the AFW, PASCAL face, FDDB and WIDER FACE datasets, and can run at 36 FPS on a Nvidia Titan X (Pascal) for VGA-resolution images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_S3FD_Single_Shot_ICCV_2017_paper.pdf", @@ -13903,7 +14786,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Shifeng and Zhu,\n Xiangyu and Lei,\n Zhen and Shi,\n Hailin and Wang,\n Xiaobo and Li,\n Stan Z.\n},\n title = {\n S3FD: Single Shot Scale-Invariant Face Detector\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SBGAR: Semantics Based Group Activity Recognition", @@ -13911,6 +14795,7 @@ "status": "Poster", "track": "main", "pid": "1071", + "author_site": "Xin Li; Mooi Choo Chuah", "author": "Xin Li; Mooi Choo Chuah", "abstract": "Activity recognition has become an important function in many emerging computer vision applications e.g. automatic video surveillance system, human-computer interaction application, and video recommendation system, etc. In this paper, we propose a novel semantics based group activity recognition scheme, namely SBGAR, which achieves higher accuracy and efficiency than existing group activity recognition methods. 
SBGAR consists of two stages: in stage I, we use a LSTM model to generate a caption for each video frame; in stage II, another LSTM model is trained to predict the final activity categories based on these generated captions. We evaluate SBGAR using two well-known datasets: the Collective Activity Dataset and the Volleyball Dataset. Our experimental results show that SBGAR improves the group activity recognition accuracy with shorter computation time compared to the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_SBGAR_Semantics_Based_ICCV_2017_paper.pdf", @@ -13935,7 +14820,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Xin and Choo Chuah,\n Mooi\n},\n title = {\n SBGAR: Semantics Based Group Activity Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SCNet: Learning Semantic Correspondence", @@ -13943,6 +14829,7 @@ "status": "Poster", "track": "main", "pid": "682", + "author_site": "Kai Han; Rafael S. Rezende; Bumsub Ham; Kwan-Yee K. Wong; Minsu Cho; Cordelia Schmid; Jean Ponce", "author": "Kai Han; Rafael S. Rezende; Bumsub Ham; Kwan-Yee K. Wong; Minsu Cho; Cordelia Schmid; Jean Ponce", "abstract": "This paper addresses the problem of establishing semantic correspondences between images depicting different instances of the same object or scene category. Previous approaches focus on either combining a spatial regularizer with hand-crafted features, or learning a correspondence model for appearance only. We propose instead a convolutional neural network architecture, called SCNet, for learning a geometrically plausible model for semantic correspondence. 
SCNet uses region proposals as matching primitives, and explicitly incorporates geometric consistency in its loss function. It is trained on image pairs obtained from the PASCAL VOC 2007 keypoint dataset, and a comparative evaluation on several standard benchmarks demonstrates that the proposed approach substantially outperforms both recent deep learning architectures and previous methods based on hand-crafted features.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Han_SCNet_Learning_Semantic_ICCV_2017_paper.pdf", @@ -13960,14 +14847,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Han_SCNet_Learning_Semantic_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;0;3;1+4+5+6+7;1+8", - "aff_unique_norm": "University of Hong Kong;INRIA;Yonsei University;Pohang University of Science and Technology;Universit\u00e9 Grenoble Alpes;Centre National de la Recherche Scientifique;Grenoble INP;Laboratoire Jean Kuntzmann;\u00c9cole Normale Sup\u00e9rieure (ENS)", + "aff_unique_norm": "The University of Hong Kong;Inria;Yonsei University;Pohang University of Science and Technology;Université Grenoble Alpes;Centre National de la Recherche Scientifique;Grenoble INP;Laboratoire Jean Kuntzmann;École Normale Supérieure (ENS)", "aff_unique_dep": ";;;;;;;;Department of Computer Science", "aff_unique_url": "https://www.hku.hk;https://www.inria.fr;https://www.yonsei.ac.kr;https://www.postech.ac.kr;https://www.univ-grenoble-alpes.fr;https://www.cnrs.fr;https://www.grenoble-inp.fr;https://ljk.ensimag.fr;https://www.ens.fr", "aff_unique_abbr": "HKU;Inria;Yonsei;POSTECH;UGA;CNRS;Grenoble INP;LJK;ENS", "aff_campus_unique_index": "0;0;2;;", "aff_campus_unique": "Hong Kong SAR;;Pohang", "aff_country_unique_index": "0;1;2;0;2;1+1+1+1+1;1+1", - "aff_country_unique": "China;France;South Korea" + "aff_country_unique": "China;France;South Korea", + "bibtex": "@InProceedings{Han_2017_ICCV,\n \n author = {\n Han,\n Kai and Rezende,\n Rafael S. 
and Ham,\n Bumsub and Wong,\n Kwan-Yee K. and Cho,\n Minsu and Schmid,\n Cordelia and Ponce,\n Jean\n},\n title = {\n SCNet: Learning Semantic Correspondence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SGN: Sequential Grouping Networks for Instance Segmentation", @@ -13975,6 +14863,7 @@ "status": "Poster", "track": "main", "pid": "1542", + "author_site": "Shu Liu; Jiaya Jia; Sanja Fidler; Raquel Urtasun", "author": "Shu Liu; Jiaya Jia; Sanja Fidler; Raquel Urtasun", "abstract": "In this paper, we propose Sequential Grouping Networks (SGN) to tackle the problem of object instance segmentation. SGNs employ a sequence of neural networks, each solving a sub-grouping problem of increasing semantic complexity in order to gradually compose objects out of pixels. In particular, the first network aims to group pixels along each image row and column by predicting horizontal and vertical object breakpoints. These breakpoints are then used to create line segments. By exploiting two-directional information, the second network groups horizontal and vertical lines into connected components. Finally, the third network groups the connected components into object instances. 
Our experiments show that our SGN significantly outperforms state-of-the-art approaches in both, the Cityscapes dataset as well as PASCAL VOC.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_SGN_Sequential_Grouping_ICCV_2017_paper.pdf", @@ -13992,14 +14881,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_SGN_Sequential_Grouping_ICCV_2017_paper.html", "aff_unique_index": "0+1;0+1;2+3;2+3", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent;University of Toronto;Uber", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent;University of Toronto;Uber", "aff_unique_dep": ";Youtu Lab;;Advanced Technologies Group", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com;https://www.utoronto.ca;https://www.uber.com", "aff_unique_abbr": "CUHK;Tencent;U of T;Uber ATG", "aff_campus_unique_index": "0;0;;", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0+0;1+2;1+2", - "aff_country_unique": "China;Canada;United States" + "aff_country_unique": "China;Canada;United States", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Shu and Jia,\n Jiaya and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n SGN: Sequential Grouping Networks for Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SHaPE: A Novel Graph Theoretic Algorithm for Making Consensus-Based Decisions in Person Re-Identification Systems", @@ -14007,6 +14897,7 @@ "status": "Poster", "track": "main", "pid": "351", + "author_site": "Arko Barman; Shishir K. Shah", "author": "Arko Barman; Shishir K. Shah", "abstract": "Person re-identification is a challenge in video-based surveillance where the goal is to identify the same person in different camera views. 
In recent years, many algorithms have been proposed that approach this problem by designing suitable feature representations for images of persons or by training appropriate distance metrics that learn to distinguish between images of different persons. Aggregating the results from multiple algorithms for person re-identification is a relatively less-explored area of research. In this paper, we formulate an algorithm that maps the ranking process in a person re-identification algorithm to a problem in graph theory. We then extend this formulation to allow for the use of results from multiple algorithms to make a consensus-based decision for the person re-identification problem. The algorithm is unsupervised and takes into account only the matching scores generated by multiple algorithms for creating a consensus of results. Further, we show how the graph theoretic problem can be solved by a two-step process. First, we obtain a rough estimate of the solution using a greedy algorithm. Then, we extend the construction of the proposed graph so that the problem can be efficiently solved by means of Ant Colony Optimization, a heuristic path-searching algorithm for complex graphs. 
While we present the algorithm in the context of person re-identification, it can potentially be applied to the general problem of ranking items based on a consensus of multiple sets of scores or metric values.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Barman_SHaPE_A_Novel_ICCV_2017_paper.pdf", @@ -14031,7 +14922,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Barman_2017_ICCV,\n \n author = {\n Barman,\n Arko and Shah,\n Shishir K.\n},\n title = {\n SHaPE: A Novel Graph Theoretic Algorithm for Making Consensus-Based Decisions in Person Re-Identification Systems\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SORT: Second-Order Response Transform for Visual Recognition", @@ -14039,6 +14931,7 @@ "status": "Poster", "track": "main", "pid": "567", + "author_site": "Yan Wang; Lingxi Xie; Chenxi Liu; Siyuan Qiao; Ya Zhang; Wenjun Zhang; Qi Tian; Alan Yuille", "author": "Yan Wang; Lingxi Xie; Chenxi Liu; Siyuan Qiao; Ya Zhang; Wenjun Zhang; Qi Tian; Alan Yuille", "abstract": "In this paper, we reveal the importance and benefits of introducing second-order operations into deep neural networks. We propose a novel approach named Second-Order Response Transform (SORT), which appends element-wise product transform to the linear sum of a two-branch network module. A direct advantage of SORT is to facilitate cross-branch response propagation, so that each branch can update its weights based on the current status of the other branch. Moreover, SORT augments the family of transform operations and increases the nonlinearity of the network, making it possible to learn flexible functions to fit the complicated distribution of feature space. 
SORT can be applied to a wide range of network architectures, including a branched variant of a chain-styled network and a residual network, with very light-weighted modifications. We observe consistent accuracy gain on both small (CIFAR10, CIFAR100 and SVHN) and big (ILSVRC2012) datasets. In addition, SORT is very efficient, as the extra computation overhead is less than 5%.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_SORT_Second-Order_Response_ICCV_2017_paper.pdf", @@ -14053,7 +14946,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_SORT_Second-Order_Response_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_SORT_Second-Order_Response_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Yan and Xie,\n Lingxi and Liu,\n Chenxi and Qiao,\n Siyuan and Zhang,\n Ya and Zhang,\n Wenjun and Tian,\n Qi and Yuille,\n Alan\n},\n title = {\n SORT: Second-Order Response Transform for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SSD-6D: Making RGB-Based 3D Detection and 6D Pose Estimation Great Again", @@ -14061,6 +14955,7 @@ "status": "Oral", "track": "main", "pid": "894", + "author_site": "Wadim Kehl; Fabian Manhardt; Federico Tombari; Slobodan Ilic; Nassir Navab", "author": "Wadim Kehl; Fabian Manhardt; Federico Tombari; Slobodan Ilic; Nassir Navab", "abstract": "We present a novel method for detecting 3D model instances and estimating their 6D poses from RGB data in a single shot. To this end, we extend the popular SSD paradigm to cover the full 6D pose space and train on synthetic model data only. Our approach competes or surpasses current state-of-the-art methods that leverage RGB-D data on multiple challenging datasets. 
Furthermore, our method produces these results at around 10Hz, which is many times faster than the related methods. For the sake of reproducibility, we make our trained networks and detection code publicly available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kehl_SSD-6D_Making_RGB-Based_ICCV_2017_paper.pdf", @@ -14085,7 +14980,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "Los Altos;;Munich", "aff_country_unique_index": "0+1;1;1;1+1;1", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Kehl_2017_ICCV,\n \n author = {\n Kehl,\n Wadim and Manhardt,\n Fabian and Tombari,\n Federico and Ilic,\n Slobodan and Navab,\n Nassir\n},\n title = {\n SSD-6D: Making RGB-Based 3D Detection and 6D Pose Estimation Great Again\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SSH: Single Stage Headless Face Detector", @@ -14093,6 +14989,7 @@ "status": "Poster", "track": "main", "pid": "2174", + "author_site": "Mahyar Najibi; Pouya Samangouei; Rama Chellappa; Larry S. Davis", "author": "Mahyar Najibi; Pouya Samangouei; Rama Chellappa; Larry S. Davis", "abstract": "We introduce the Single Stage Headless (SSH) face detector. Unlike two stage proposal-classification detectors, SSH detects faces in a single stage directly from the early convolutional layers in a classification network. SSH is headless. That is, it is able to achieve state-of-the-art results while removing the \"head\" of its underlying classification network -- i.e. all fully connected layers in the VGG-16 which contains a large number of parameters. Additionally, instead of relying on an image pyramid to detect faces with various scales, SSH is scale-invariant by design. We simultaneously detect faces with different scales in a single forward pass of the network, but from different layers. 
These properties make SSH fast and light-weight. Surprisingly, with a headless VGG-16, SSH beats the ResNet-101-based state-of-the-art on the WIDER dataset. Even though, unlike the current state-of-the-art, SSH does not use an image pyramid and is 5X faster. Moreover, if an image pyramid is deployed, our light-weight network achieves state-of-the-art on all subsets of the WIDER dataset, improving the AP by 2.5%. SSH also reaches state-of-the-art results on the FDDB and Pascal-Faces datasets while using a small input size, leading to a speed of 50 frames/second on a GPU.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Najibi_SSH_Single_Stage_ICCV_2017_paper.pdf", @@ -14107,7 +15004,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Najibi_SSH_Single_Stage_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Najibi_SSH_Single_Stage_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Najibi_2017_ICCV,\n \n author = {\n Najibi,\n Mahyar and Samangouei,\n Pouya and Chellappa,\n Rama and Davis,\n Larry S.\n},\n title = {\n SSH: Single Stage Headless Face Detector\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SUBIC: A Supervised, Structured Binary Code for Image Search", @@ -14115,7 +15013,7 @@ "status": "Spotlight", "track": "main", "pid": "1413", - "author_site": "Himalaya Jain; Joaquin Zepeda; Patrick P\u00c3\u00a9rez; R\u00c3\u00a9mi Gribonval", + "author_site": "Himalaya Jain; Joaquin Zepeda; Patrick Pérez; Rémi Gribonval", "author": "Himalaya Jain; Joaquin Zepeda; Patrick Perez; Remi Gribonval", "abstract": "For large-scale visual search, highly compressed yet meaningful representations of images are essential. 
Structured vector quantizers based on product quantization and its variants are usually employed to achieve such compression while minimizing the loss of accuracy. Yet, unlike binary hashing schemes, these unsupervised methods have not yet benefited from the supervision, end-to-end learning and novel architectures ushered in by the deep learning revolution. We hence propose herein a novel method to make deep convolutional neural networks produce supervised, compact, structured binary codes for visual search. Our method makes use of a novel block-softmax non-linearity and of batch-based entropy losses that together induce structure in the learned encodings. We show that our method outperforms state-of-the-art compact representations based on deep hashing or structured quantization in single and cross-domain category retrieval, instance retrieval and classification. We make our code and models publicly available online.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jain_SUBIC_A_Supervised_ICCV_2017_paper.pdf", @@ -14131,7 +15029,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jain_SUBIC_A_Supervised_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jain_SUBIC_A_Supervised_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Jain_2017_ICCV,\n \n author = {\n Jain,\n Himalaya and Zepeda,\n Joaquin and Perez,\n Patrick and Gribonval,\n Remi\n},\n title = {\n SUBIC: A Supervised,\n Structured Binary Code for Image Search\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SVDNet for Pedestrian Retrieval", @@ -14139,6 +15038,7 @@ "status": "Spotlight", "track": "main", "pid": "2101", + "author_site": "Yifan Sun; Liang Zheng; Weijian Deng; Shengjin Wang", "author": "Yifan Sun; Liang Zheng; Weijian Deng; Shengjin Wang", "abstract": "This paper 
proposes the SVDNet for retrieval problems, with focus on the application of person re-identification (re-ID). We view each weight vector within a fully connected (FC) layer in a convolutional neuron network (CNN) as a projection basis. It is observed that the weight vectors are usually highly correlated. This problem leads to correlations among entries of the FC descriptor, and compromises the retrieval performance based on the Euclidean distance. To address the problem, this paper proposes to optimize the deep representation learning process with Singular Vector Decomposition (SVD). Specifically, with the restraint and relaxation iteration (RRI) training scheme, we are able to iteratively integrate the orthogonality constraint in CNN training, yielding the so-called SVDNet. We conduct experiments on the Market-1501, CUHK03, and Duke datasets, and show that RRI effectively reduces the correlation among the projection vectors, produces more discriminative FC descriptors, and significantly improves the re-ID accuracy. 
On the Market-1501 dataset, for instance, rank-1 accuracy is improved from 55.3% to 80.5% for CaffeNet, and from 73.8% to 82.3% for ResNet-50.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sun_SVDNet_for_Pedestrian_ICCV_2017_paper.pdf", @@ -14163,7 +15063,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Sun_2017_ICCV,\n \n author = {\n Sun,\n Yifan and Zheng,\n Liang and Deng,\n Weijian and Wang,\n Shengjin\n},\n title = {\n SVDNet for Pedestrian Retrieval\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SafetyNet: Detecting and Rejecting Adversarial Examples Robustly", @@ -14171,6 +15072,7 @@ "status": "Poster", "track": "main", "pid": "166", + "author_site": "Jiajun Lu; Theerasit Issaranon; David Forsyth", "author": "Jiajun Lu; Theerasit Issaranon; David Forsyth", "abstract": "We describe a method to produce a network where current methods such as DeepFool have great difficulty producing adversarial samples. Our construction suggests some insights into how deep networks work. We provide a reasonable analyses that our construction is difficult to defeat, and show experimentally that our method is hard to defeat with both Type I and Type II attacks using several standard networks and datasets. This SafetyNet architecture is used to an important and novel application SceneProof, which can reliably detect whether an image is a picture of a real scene or not. SceneProof applies to images captured with depth maps (RGBD images) and checks if a pair of image and depth map is consistent. It relies on the relative difficulty of producing naturalistic depth maps for images in post processing. 
We demonstrate that our SafetyNet is robust to adversarial examples built from currently known attacking approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lu_SafetyNet_Detecting_and_ICCV_2017_paper.pdf", @@ -14188,14 +15090,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lu_SafetyNet_Detecting_and_ICCV_2017_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lu_2017_ICCV,\n \n author = {\n Lu,\n Jiajun and Issaranon,\n Theerasit and Forsyth,\n David\n},\n title = {\n SafetyNet: Detecting and Rejecting Adversarial Examples Robustly\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Saliency Pattern Detection by Ranking Structured Trees", @@ -14203,6 +15106,7 @@ "status": "Poster", "track": "main", "pid": "2987", + "author_site": "Lei Zhu; Haibin Ling; Jin Wu; Huiping Deng; Jin Liu", "author": "Lei Zhu; Haibin Ling; Jin Wu; Huiping Deng; Jin Liu", "abstract": "In this paper we propose a new salient object detection method via structured label prediction. By learning appearance features in rectangular regions, our structural region representation encodes the local saliency distribution with a matrix of binary labels. We show that the linear combination of structured labels can well model the saliency distribution in local regions. 
Representing region saliency with structured labels has two advantages: 1) it connects the label assignment of all enclosed pixels, which produces a smooth saliency prediction; and 2) regular-shaped nature of structured labels enables well definition of traditional cues such as regional properties and center surround contrast, and these cues help to build meaningful and informative saliency measures. To measure the consistency between a structured label and the corresponding saliency distribution, we further propose an adaptive label ranking algorithm using proposals that are generated by a CNN model. Finally, we introduce a K-NN enhanced graph representation for saliency propagation, which is more favorable for our task than the widely-used adjacent-graph-based ones. Experimental results demonstrate the effectiveness of our proposed method on six popular benchmarks compared with state-of-the-art approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Saliency_Pattern_Detection_ICCV_2017_paper.pdf", @@ -14227,7 +15131,8 @@ "aff_campus_unique_index": "0+1;1;0;0;0", "aff_campus_unique": "Wuhan;Philadelphia;", "aff_country_unique_index": "0+1;1+0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Lei and Ling,\n Haibin and Wu,\n Jin and Deng,\n Huiping and Liu,\n Jin\n},\n title = {\n Saliency Pattern Detection by Ranking Structured Trees\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Sampling Matters in Deep Embedding Learning", @@ -14235,7 +15140,7 @@ "status": "Poster", "track": "main", "pid": "1246", - "author_site": "Chao-Yuan Wu; R. Manmatha; Alexander J. Smola; Philipp Kr\u00c3\u00a4henb\u00c3\u00bchl", + "author_site": "Chao-Yuan Wu; R. Manmatha; Alexander J. 
Smola; Philipp Krähenbühl", "author": "Chao-Yuan Wu; R. Manmatha; Alexander J. Smola; Philipp Krahenbuhl", "abstract": "Deep embeddings answer one simple question: How similar are two images? Learning these embeddings is the bedrock of verification, zero-shot learning, and visual search. The most prominent approaches optimize a deep convolutional network with a suitable loss function, such as contrastive loss or triplet loss. While a rich line of work focuses solely on the loss functions, we show in this paper that selecting training examples plays an equally important role. We propose distance weighted sampling, which selects more informative and stable examples than traditional approaches. In addition, we show that a simple margin based loss is sufficient to outperform all other loss functions. We evaluate our approach on the CUB200-2011, CAR196, and the Stanford Online Products datasets for image retrieval and clustering, and on the LFW dataset for face verification. Our method achieves state-of-the-art performance on all of them.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wu_Sampling_Matters_in_ICCV_2017_paper.pdf", @@ -14252,15 +15157,16 @@ "email": "cs.utexas.edu;a9.com;amazon.com;cs.utexas.edu", "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wu_Sampling_Matters_in_ICCV_2017_paper.html", - "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "University of Texas at Austin;Amazon", - "aff_unique_dep": ";Amazon", - "aff_unique_url": "https://www.utexas.edu;https://www.amazon.com", - "aff_unique_abbr": "UT Austin;Amazon", + "aff_unique_index": "0;1;2;0", + "aff_unique_norm": "University of Texas at Austin;Amazon;Amazon.com, Inc.", + "aff_unique_dep": ";;", + "aff_unique_url": "https://www.utexas.edu;https://www.amazon.com;https://www.amazon.com", + "aff_unique_abbr": "UT Austin;Amazon;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2017_ICCV,\n \n author = {\n Wu,\n Chao-Yuan and Manmatha,\n R. and Smola,\n Alexander J. and Krahenbuhl,\n Philipp\n},\n title = {\n Sampling Matters in Deep Embedding Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Scale Recovery for Monocular Visual Odometry Using Depth Estimated With Deep Convolutional Neural Fields", @@ -14268,6 +15174,7 @@ "status": "Poster", "track": "main", "pid": "3098", + "author_site": "Xiaochuan Yin; Xiangwei Wang; Xiaoguo Du; Qijun Chen", "author": "Xiaochuan Yin; Xiangwei Wang; Xiaoguo Du; Qijun Chen", "abstract": "Scale recovery is one of the central problems for monocular visual odometry. Normally, road plane and camera height are specified as reference to recover the scale. The performances of these methods depend on the plane recognition and height measurement of camera. In this work, we propose a novel method to recover the scale by incorporating the depths estimated from images using deep convolutional neural fields. Our method considers the whole environmental structure as reference rather than a specified plane. The accuracy of depth estimation contributes to the scale recovery. We improve the performance of depth estimation by considering two consecutive frames and egomotion of camera into our networks. The depth refinement and scale recovery are obtained iteratively. In this way, our method can eliminate the scale drift and improve the depth estimation simultaneously. 
The effectiveness of our method is verified on the KITTI dataset for both visual odometry and depth estimation tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yin_Scale_Recovery_for_ICCV_2017_paper.pdf", @@ -14292,7 +15199,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yin_2017_ICCV,\n \n author = {\n Yin,\n Xiaochuan and Wang,\n Xiangwei and Du,\n Xiaoguo and Chen,\n Qijun\n},\n title = {\n Scale Recovery for Monocular Visual Odometry Using Depth Estimated With Deep Convolutional Neural Fields\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Scale-Adaptive Convolutions for Scene Parsing", @@ -14300,6 +15208,7 @@ "status": "Poster", "track": "main", "pid": "729", + "author_site": "Rui Zhang; Sheng Tang; Yongdong Zhang; Jintao Li; Shuicheng Yan", "author": "Rui Zhang; Sheng Tang; Yongdong Zhang; Jintao Li; Shuicheng Yan", "abstract": "Many existing scene parsing methods adopt Convolutional Neural Networks with fixed-size receptive fields, which frequently result in inconsistent predictions of large objects and invisibility of small objects. To tackle this issue, we propose a scale-adaptive convolution to acquire flexible-size receptive fields during scene parsing. Through adding a new scale regression layer, we can dynamically infer the position-adaptive scale coefficients which are adopted to resize the convolutional patches. Consequently, the receptive fields can be adjusted automatically according to the various sizes of the objects in scene images. Thus, the problems of invisible small objects and inconsistent large-object predictions can be alleviated. 
Furthermore, our proposed scale-adaptive convolutions are not only differentiable to learn the convolutional parameters and scale coefficients in an end-to-end way, but also of high parallelizability for the convenience of GPU implementation. Additionally, since the new scale regression layers are learned implicitly, any extra training supervision of object sizes is unnecessary. Extensive experiments on Cityscapes and ADE20K datasets well demonstrate the effectiveness of the proposed scale-adaptive convolutions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Scale-Adaptive_Convolutions_for_ICCV_2017_paper.pdf", @@ -14324,7 +15233,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Rui and Tang,\n Sheng and Zhang,\n Yongdong and Li,\n Jintao and Yan,\n Shuicheng\n},\n title = {\n Scale-Adaptive Convolutions for Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "ScaleNet: Guiding Object Proposal Generation in Supermarkets and Beyond", @@ -14332,6 +15242,7 @@ "status": "Poster", "track": "main", "pid": "649", + "author_site": "Siyuan Qiao; Wei Shen; Weichao Qiu; Chenxi Liu; Alan Yuille", "author": "Siyuan Qiao; Wei Shen; Weichao Qiu; Chenxi Liu; Alan Yuille", "abstract": "Motivated by product detection in supermarkets, this paper studies the problem of object proposal generation in supermarket images and other natural images. We argue that estimation of object scales in images is helpful for generating object proposals, especially for supermarket images where object scales are usually within a small range. Therefore, we propose to estimate object scales of images before generating object proposals. 
The proposed method for predicting object scales is called ScaleNet. To validate the effectiveness of ScaleNet, we build three supermarket datasets, two of which are real-world datasets used for testing and the other one is a synthetic dataset used for training. In short, we extend the previous state-of-the-art object proposal methods by adding a scale prediction phase. The resulted method outperforms the previous state-of-the-art on the supermarket datasets by a large margin. We also show that the approach works for object proposal on other natural images and it outperforms the previous state-of-the-art object proposal methods on the MS COCO dataset. The supermarket datasets, the virtual supermarkets, and the tools for creating more synthetic datasets will be made public.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Qiao_ScaleNet_Guiding_Object_ICCV_2017_paper.pdf", @@ -14356,7 +15267,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Qiao_2017_ICCV,\n \n author = {\n Qiao,\n Siyuan and Shen,\n Wei and Qiu,\n Weichao and Liu,\n Chenxi and Yuille,\n Alan\n},\n title = {\n ScaleNet: Guiding Object Proposal Generation in Supermarkets and Beyond\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Scaling the Scattering Transform: Deep Hybrid Networks", @@ -14364,10 +15276,11 @@ "status": "Poster", "track": "main", "pid": "3037", + "author_site": "Edouard Oyallon; Eugene Belilovsky; Sergey Zagoruyko", "author": "Edouard Oyallon; Eugene Belilovsky; Sergey Zagoruyko", "abstract": "We use the scattering network as a generic and fixed initialization of the first layers of a supervised hybrid deep network. 
We show that early layers do not necessarily need to be learned, providing the best results to-date with pre-defined representations while being competitive with Deep CNNs. Using a shallow cascade of 1x1 convolutions, which encodes scattering coefficients that correspond to spatial windows of very small sizes, permits to obtain AlexNet accuracy on the imagenet ILSVRC2012. We demonstrate that this local encoding explicitly learns invariance w.r.t. rotations. Combining scattering networks with a modern ResNet, we achieve a single-crop top 5 error of 11.4% on imagenet ILSVRC2012, comparable to the Resnet-18 architecture, while utilizing only 10 layers. We also find that hybrid architectures can yield excellent performance in the small sample regime, exceeding their end-to-end counterparts, through their ability to incorporate geometrical priors. We demonstrate this on subsets of the CIFAR-10 dataset and on the STL-10 dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Oyallon_Scaling_the_Scattering_ICCV_2017_paper.pdf", - "aff": "ENS/PSL Research University, Paris, France; KU Leuven and INRIA, University of Paris-Saclay; Universit \u00b4e Paris-Est, \u00b4Ecole des Ponts ParisTech, Paris, France", + "aff": "ENS/PSL Research University, Paris, France; KU Leuven and INRIA, University of Paris-Saclay; Universit ´e Paris-Est, ´Ecole des Ponts ParisTech, Paris, France", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Oyallon_Scaling_the_Scattering_ICCV_2017_supplemental.pdf", @@ -14381,14 +15294,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Oyallon_Scaling_the_Scattering_ICCV_2017_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "\u00c9cole Normale Sup\u00e9rieure/PSL Research University;KU Leuven;Universit\u00e9 Paris-Est", - "aff_unique_dep": ";;\u00c9cole des Ponts ParisTech", + "aff_unique_norm": "École Normale Supérieure/PSL Research University;KU 
Leuven;Université Paris-Est", + "aff_unique_dep": ";;École des Ponts ParisTech", "aff_unique_url": "https://www.ens.psl.eu;https://www.kuleuven.be;https://www.ponts.org", "aff_unique_abbr": "ENS/PSL;KU Leuven;UPE", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;Belgium" + "aff_country_unique": "France;Belgium", + "bibtex": "@InProceedings{Oyallon_2017_ICCV,\n \n author = {\n Oyallon,\n Edouard and Belilovsky,\n Eugene and Zagoruyko,\n Sergey\n},\n title = {\n Scaling the Scattering Transform: Deep Hybrid Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Scene Categorization With Spectral Features", @@ -14396,6 +15310,7 @@ "status": "Poster", "track": "main", "pid": "3088", + "author_site": "Salman H. Khan; Munawar Hayat; Fatih Porikli", "author": "Salman H. Khan; Munawar Hayat; Fatih Porikli", "abstract": "Spectral signatures of natural scenes were earlier found to be distinctive for different scene types with varying spatial envelope properties such as openness, naturalness, ruggedness, and symmetry. Recently, such handcrafted features have been outclassed by deep learning based representations. This paper proposes a novel spectral description of convolution features, implemented efficiently as a unitary transformation within deep network architectures. To the best of our knowledge, this is the first attempt to use deep learning based spectral features explicitly for image classification task. We show that the spectral transformation decorrelates convolutional activations, which reduces co-adaptation between feature detections, thus acts as an effective regularizer. Our approach achieves significant improvements on three large-scale scene-centric datasets (MIT-67, SUN-397, and Places-205). 
Furthermore, we evaluated the proposed approach on the attribute detection task where its superior performance manifests its relevance to semantically meaningful characteristics of natural scenes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Khan_Scene_Categorization_With_ICCV_2017_paper.pdf", @@ -14410,7 +15325,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Khan_Scene_Categorization_With_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Khan_Scene_Categorization_With_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Khan_2017_ICCV,\n \n author = {\n Khan,\n Salman H. and Hayat,\n Munawar and Porikli,\n Fatih\n},\n title = {\n Scene Categorization With Spectral Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Scene Graph Generation From Objects, Phrases and Region Captions", @@ -14418,6 +15334,7 @@ "status": "Poster", "track": "main", "pid": "428", + "author_site": "Yikang Li; Wanli Ouyang; Bolei Zhou; Kun Wang; Xiaogang Wang", "author": "Yikang Li; Wanli Ouyang; Bolei Zhou; Kun Wang; Xiaogang Wang", "abstract": "Object detection, scene graph generation and region captioning, which are three scene understanding tasks at different semantic levels, are tied together: scene graphs are generated on top of objects detected in an image with their pairwise relationship predicted, while region captioning gives a language description of the objects, their attributes, relations and other context information. In this work, to leverage the mutual connections across semantic levels, we propose a novel neural network model, termed as Multi-level Scene Description Network (denoted as MSDN), to solve the three vision tasks jointly in an end-to-end manner. 
Object, phrase, and caption regions are first aligned with a dynamic graph based on their spatial and semantic connections. Then a feature refining structure is used to pass messages across the three levels of semantic tasks through the graph. We benchmark the learned model on three tasks, and show the joint learning across three tasks with our proposed method can bring mutual improvements over previous models. Particularly, on the scene graph generation task, our proposed method outperforms the state-of-art method with more than 3% margin.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Scene_Graph_Generation_ICCV_2017_paper.pdf", @@ -14435,14 +15352,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Scene_Graph_Generation_ICCV_2017_paper.html", "aff_unique_index": "0;0+1;2;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Sydney;Massachusetts Institute of Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Sydney;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sydney.edu.au;https://web.mit.edu", "aff_unique_abbr": "CUHK;USYD;MIT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0+1;2;0;0", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Yikang and Ouyang,\n Wanli and Zhou,\n Bolei and Wang,\n Kun and Wang,\n Xiaogang\n},\n title = {\n Scene Graph Generation From Objects,\n Phrases and Region Captions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Scene Parsing With Global Context Embedding", @@ -14450,6 +15368,7 @@ "status": "Poster", "track": "main", "pid": "1143", + "author_site": 
"Wei-Chih Hung; Yi-Hsuan Tsai; Xiaohui Shen; Zhe Lin; Kalyan Sunkavalli; Xin Lu; Ming-Hsuan Yang", "author": "Wei-Chih Hung; Yi-Hsuan Tsai; Xiaohui Shen; Zhe Lin; Kalyan Sunkavalli; Xin Lu; Ming-Hsuan Yang", "abstract": "We present a scene parsing method that utilizes global context information based on both the parametric and non-parametric models. Compared to previous methods that only exploit the local relationship between objects, we train a context network based on scene similarities to generate feature representations for global contexts. In addition, these learned features are utilized to generate global and spatial priors for explicit classes inference. We then design modules to embed the feature representations and the priors into the segmentation network as additional global context cues. We show that the proposed method can eliminate false positives that are not compatible with the global context representations. Experiments on both the MIT ADE20K and PASCAL Context datasets show that the proposed method performs favorably against existing methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hung_Scene_Parsing_With_ICCV_2017_paper.pdf", @@ -14465,7 +15384,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hung_Scene_Parsing_With_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hung_Scene_Parsing_With_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Hung_2017_ICCV,\n \n author = {\n Hung,\n Wei-Chih and Tsai,\n Yi-Hsuan and Shen,\n Xiaohui and Lin,\n Zhe and Sunkavalli,\n Kalyan and Lu,\n Xin and Yang,\n Ming-Hsuan\n},\n title = {\n Scene Parsing With Global Context Embedding\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SceneNet RGB-D: Can 5M Synthetic Images Beat Generic ImageNet Pre-Training on Indoor 
Segmentation?", @@ -14473,6 +15393,7 @@ "status": "Poster", "track": "main", "pid": "1068", + "author_site": "John McCormac; Ankur Handa; Stefan Leutenegger; Andrew J. Davison", "author": "John McCormac; Ankur Handa; Stefan Leutenegger; Andrew J. Davison", "abstract": "We introduce SceneNet RGB-D, a dataset providing pixel-perfect ground truth for scene understanding problems such as semantic segmentation, instance segmentation, and object detection. It also provides perfect camera poses and depth data, allowing investigation into geometric computer vision problems such as optical flow, camera pose estimation, and 3D scene labelling tasks. Random sampling permits virtually unlimited scene configurations, and here we provide 5M rendered RGB-D images from 16K randomly generated 3D trajectories in synthetic layouts, with random but physically simulated object configurations. We compare the semantic segmentation performance of network weights produced from pre-training on RGB images from our dataset against generic VGG-16 ImageNet weights. After fine-tuning on the SUN RGB-D and NYUv2 real-world datasets we find in both cases that the synthetically pre-trained network outperforms the VGG-16 weights. When synthetic pre-training includes a depth channel (something ImageNet cannot natively provide) the performance is greater still. This suggests that large-scale high-quality synthetic RGB datasets with task-specific labels can be more useful for pre-training than real-world generic pre-training such as ImageNet. 
We host the dataset at http://robotvault.bitbucket.io/scenenet-rgbd.html", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/McCormac_SceneNet_RGB-D_Can_ICCV_2017_paper.pdf", @@ -14497,7 +15418,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{McCormac_2017_ICCV,\n \n author = {\n McCormac,\n John and Handa,\n Ankur and Leutenegger,\n Stefan and Davison,\n Andrew J.\n},\n title = {\n SceneNet RGB-D: Can 5M Synthetic Images Beat Generic ImageNet Pre-Training on Indoor Segmentation?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "See the Glass Half Full: Reasoning About Liquid Containers, Their Volume and Content", @@ -14505,6 +15427,7 @@ "status": "Poster", "track": "main", "pid": "719", + "author_site": "Roozbeh Mottaghi; Connor Schenck; Dieter Fox; Ali Farhadi", "author": "Roozbeh Mottaghi; Connor Schenck; Dieter Fox; Ali Farhadi", "abstract": "Humans have rich understanding of liquid containers and their contents; for example, we can effortlessly pour water from a pitcher to a cup. Doing so requires estimating the volume of the cup, approximating the amount of water in the pitcher, and predicting the behavior of water when we tilt the pitcher. Very little attention in computer vision has been made to liquids and their containers. In this paper, we study liquid containers and their contents, and propose methods to estimate the volume of containers, approximate the amount of liquid in them, and perform comparative volume estimations all from a single RGB image. Furthermore, we show the results of the proposed model for predicting the behavior of liquids inside containers when one tilts the containers. 
We also introduce a new dataset of Containers Of liQuid contEnt (COQE) that contains more than 5,000 images of 10,000 liquid containers in context labelled with volume, amount of content, bounding box annotation, and corresponding similar 3D CAD models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mottaghi_See_the_Glass_ICCV_2017_paper.pdf", @@ -14520,7 +15443,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Mottaghi_See_the_Glass_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Mottaghi_See_the_Glass_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Mottaghi_2017_ICCV,\n \n author = {\n Mottaghi,\n Roozbeh and Schenck,\n Connor and Fox,\n Dieter and Farhadi,\n Ali\n},\n title = {\n See the Glass Half Full: Reasoning About Liquid Containers,\n Their Volume and Content\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SegFlow: Joint Learning for Video Object Segmentation and Optical Flow", @@ -14528,6 +15452,7 @@ "status": "Poster", "track": "main", "pid": "211", + "author_site": "Jingchun Cheng; Yi-Hsuan Tsai; Shengjin Wang; Ming-Hsuan Yang", "author": "Jingchun Cheng; Yi-Hsuan Tsai; Shengjin Wang; Ming-Hsuan Yang", "abstract": "This paper proposes an end-to-end trainable network, SegFlow, for simultaneously predicting pixel-wise object segmentation and optical flow in videos. The proposed SegFlow has two branches where useful information of object segmentation and optical flow is propagated bidirectionally in a unified framework. The segmentation branch is based on a fully convolutional network, which has been proved effective in image segmentation task, and the optical flow branch takes advantage of the FlowNet model. 
The unified framework is trained iteratively offline to learn a generic notion, and fine tuned online for specific objects. Extensive experiments on both the video object segmentation and optical flow datasets demonstrate that introducing optical flow improves the performance of segmentation and vice versa, against the state-of-the-art algorithms.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cheng_SegFlow_Joint_Learning_ICCV_2017_paper.pdf", @@ -14545,14 +15470,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Cheng_SegFlow_Joint_Learning_ICCV_2017_paper.html", "aff_unique_index": "0+1;1+2;0;1+3", - "aff_unique_norm": "Tsinghua University;University of California, Merced;NEC Laboratories America;NVIDIA", + "aff_unique_norm": "Tsinghua University;University of California, Merced;NEC Laboratories America;NVIDIA Corporation", "aff_unique_dep": ";;;NVIDIA Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucmerced.edu;https://www.nec-labs.com;https://www.nvidia.com/research", "aff_unique_abbr": "THU;UC Merced;NEC Labs America;NVIDIA", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0+1;1+1;0;1+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Cheng_2017_ICCV,\n \n author = {\n Cheng,\n Jingchun and Tsai,\n Yi-Hsuan and Wang,\n Shengjin and Yang,\n Ming-Hsuan\n},\n title = {\n SegFlow: Joint Learning for Video Object Segmentation and Optical Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Segmentation-Aware Convolutional Networks Using Local Attention Masks", @@ -14560,6 +15486,7 @@ "status": "Poster", "track": "main", "pid": "2081", + "author_site": "Adam W. Harley; Konstantinos G. Derpanis; Iasonas Kokkinos", "author": "Adam W. Harley; Konstantinos G. 
Derpanis; Iasonas Kokkinos", "abstract": "We introduce an approach to integrate segmentation information within a convolutional neural network (CNN). This counter-acts the tendency of CNNs to smooth information across regions and increases their spatial precision. To obtain segmentation information, we set up a CNN to provide an embedding space where region co-membership can be estimated based on Euclidean distance. We use these embeddings to compute a local attention mask relative to every neuron position. We incorporate such masks in CNNs and replace the convolution operation with a \"segmentation-aware\" variant that allows a neuron to selectively attend to inputs coming from its own region. We call the resulting network a segmentation-aware CNN because it adapts its filters at each image point according to local segmentation cues, while at the same time remaining fully-convolutional. We demonstrate the merit of our method on two widely different dense prediction tasks, that involve classification (semantic segmentation) and regression (optical flow). Our results show that in semantic segmentation we can replace DenseCRF inference with a cascade of segmentation-aware filters, and in optical flow we obtain clearly sharper responses than the ones obtained with comparable networks that do not use segmentation. 
In both cases segmentation-aware convolution yields systematic improvements over strong baselines.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Harley_Segmentation-Aware_Convolutional_Networks_ICCV_2017_paper.pdf", @@ -14577,14 +15504,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Harley_Segmentation-Aware_Convolutional_Networks_ICCV_2017_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Carnegie Mellon University;Ryerson University;Meta", + "aff_unique_norm": "Carnegie Mellon University;Ryerson University;Facebook", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.cmu.edu;https://www.ryerson.ca;https://research.facebook.com", "aff_unique_abbr": "CMU;Ryerson;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Harley_2017_ICCV,\n \n author = {\n Harley,\n Adam W. and Derpanis,\n Konstantinos G. and Kokkinos,\n Iasonas\n},\n title = {\n Segmentation-Aware Convolutional Networks Using Local Attention Masks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Self-Organized Text Detection With Minimal Post-Processing via Border Learning", @@ -14592,6 +15520,7 @@ "status": "Poster", "track": "main", "pid": "2306", + "author_site": "Yue Wu; Prem Natarajan", "author": "Yue Wu; Prem Natarajan", "abstract": "In this paper we propose a new solution to the text detection problem via border learning. Specifically, we make four major contributions: 1) We analyze the insufficiencies of the classic non-text and text settings for text detection. 
2) We introduce the border class to the text detection problem for the first time, and validate that the decoding process is largely simplified with the help of text border. 3) We collect and release a new text detection ppt dataset containing 10,692 images with non-text, border, and text annotations. 4) We develop a lightweight (only 0.28M parameters), fully convolutional network (FCN) to effectively learn borders in text images. The results of our extensive experiments show that the proposed solution achieves comparable performance, and often outperforms state-of-the-art approaches on standard benchmarks--even though our solution only requires minimal post-processing to parse a bounding box from a detected text map, while others often require heavy post-processing.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wu_Self-Organized_Text_Detection_ICCV_2017_paper.pdf", @@ -14607,7 +15536,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wu_Self-Organized_Text_Detection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wu_Self-Organized_Text_Detection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wu_2017_ICCV,\n \n author = {\n Wu,\n Yue and Natarajan,\n Prem\n},\n title = {\n Self-Organized Text Detection With Minimal Post-Processing via Border Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Self-Paced Kernel Estimation for Robust Blind Image Deblurring", @@ -14615,6 +15545,7 @@ "status": "Poster", "track": "main", "pid": "670", + "author_site": "Dong Gong; Mingkui Tan; Yanning Zhang; Anton van den Hengel; Qinfeng Shi", "author": "Dong Gong; Mingkui Tan; Yanning Zhang; Anton van den Hengel; Qinfeng Shi", "abstract": "The challenge in blind image deblurring is to remove the effects of blur with limited prior information about 
the nature of the blur process. Existing methods often assume that the blur image is produced by linear convolution with additive Gaussian noise. However, including even a small number of outliers to this model in the kernel estimation process can significantly reduce the resulting image quality. Previous methods mainly rely on some simple but unreliable heuristics to identify outliers for kernel estimation. Rather than attempt to identify outliers to the model a priori, we instead propose to sequentially identify inliers, and gradually incorporate them into the estimation process. The self-paced kernel estimation scheme we propose represents a generalization of existing self-paced learning approaches, in which we gradually detect and include reliable inlier pixel sets in a blurred image for kernel estimation. Moreover, we automatically activate a subset of significant gradients w.r.t. the reliable inlier pixels, and then update the intermediate sharp image and the kernel accordingly. Experiments on both synthetic data and real-world images with various kinds of outliers demonstrate the effectiveness and robustness of the proposed method compared to the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gong_Self-Paced_Kernel_Estimation_ICCV_2017_paper.pdf", @@ -14630,7 +15561,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gong_Self-Paced_Kernel_Estimation_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gong_Self-Paced_Kernel_Estimation_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Gong_2017_ICCV,\n \n author = {\n Gong,\n Dong and Tan,\n Mingkui and Zhang,\n Yanning and van den Hengel,\n Anton and Shi,\n Qinfeng\n},\n title = {\n Self-Paced Kernel Estimation for Robust Blind Image Deblurring\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n 
Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Self-Supervised Learning of Pose Embeddings From Spatiotemporal Relations in Videos", @@ -14638,7 +15570,7 @@ "status": "Poster", "track": "main", "pid": "1817", - "author_site": "\u00c3\u0096mer S\u00c3\u00bcmer; Tobias Dencker; Bj\u00c3\u00b6rn Ommer", + "author_site": "Ömer Sümer; Tobias Dencker; Björn Ommer", "author": "Omer Sumer; Tobias Dencker; Bjorn Ommer", "abstract": "Human pose analysis is presently dominated by deep convolutional networks trained with extensive manual annotations of joint locations and beyond. To avoid the need for expensive labeling, we exploit spatiotemporal relations in training videos for self-supervised learning of pose embeddings. The key idea is to combine temporal ordering and spatial placement estimation as auxiliary tasks for learning pose similarities in a Siamese convolutional network. Since the self-supervised sampling of both tasks from natural videos can result in ambiguous and incorrect training labels, our method employs a curriculum learning idea that starts training with the most reliable data samples and gradually increases the difficulty. To further refine the training process we mine repetitive poses in individual videos which provide reliable labels while removing inconsistencies. Our pose embeddings capture visual characteristics of human pose that can boost existing supervised representations in human pose estimation and retrieval. 
We report quantitative and qualitative results on these tasks in Olympic Sports, Leeds Pose Sports and MPII Human Pose datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sumer_Self-Supervised_Learning_of_ICCV_2017_paper.pdf", @@ -14663,7 +15595,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Sumer_2017_ICCV,\n \n author = {\n Sumer,\n Omer and Dencker,\n Tobias and Ommer,\n Bjorn\n},\n title = {\n Self-Supervised Learning of Pose Embeddings From Spatiotemporal Relations in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Semantic Image Synthesis via Adversarial Learning", @@ -14671,6 +15604,7 @@ "status": "Poster", "track": "main", "pid": "2593", + "author_site": "Hao Dong; Simiao Yu; Chao Wu; Yike Guo", "author": "Hao Dong; Simiao Yu; Chao Wu; Yike Guo", "abstract": "In this paper, we propose a way of synthesizing realistic images directly with natural language description, which has many useful applications, e.g.intelligent image manipulation. We attempt to accomplish such synthesis: given a source image and a target text description, our model synthesizes images to meet two requirements: 1) being realistic while matching the target text description; 2) maintaining other image features that are irrelevant to the text description. The model should be able to disentangle the semantic information from the two modalities (image and text), and generate new images from the combined semantics. To achieve this, we proposed an end-to-end neural architecture that leverages adversarial learning to automatically learn implicit loss functions, which are optimized to fulfill the aforementioned two requirements. 
We have evaluated our model by conducting experiments on Caltech-200 bird dataset and Oxford-102 flower dataset, and have demonstrated that our model is capable of synthesizing realistic images that match the given descriptions, while still maintain other features of original images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dong_Semantic_Image_Synthesis_ICCV_2017_paper.pdf", @@ -14685,7 +15619,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dong_Semantic_Image_Synthesis_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dong_Semantic_Image_Synthesis_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Dong_2017_ICCV,\n \n author = {\n Dong,\n Hao and Yu,\n Simiao and Wu,\n Chao and Guo,\n Yike\n},\n title = {\n Semantic Image Synthesis via Adversarial Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Semantic Jitter: Dense Supervision for Visual Comparisons via Synthetic Images", @@ -14693,6 +15628,7 @@ "status": "Poster", "track": "main", "pid": "2720", + "author_site": "Aron Yu; Kristen Grauman", "author": "Aron Yu; Kristen Grauman", "abstract": "Distinguishing subtle differences in attributes is valuable, yet learning to make visual comparisons remains nontrivial. Not only is the number of possible comparisons quadratic in the number of training images, but also access to images adequately spanning the space of fine-grained visual differences is limited. We propose to overcome the sparsity of supervision problem via synthetically generated images. Building on a state-of-the-art image generation engine, we sample pairs of training images exhibiting slight modifications of individual attributes. 
Augmenting real training image pairs with these examples, we then train attribute ranking models to predict the relative strength of an attribute in novel pairs of real images. Our results on datasets of faces and fashion images show the great promise of bootstrapping imperfect image generators to counteract sample sparsity for learning to rank.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_Semantic_Jitter_Dense_ICCV_2017_paper.pdf", @@ -14717,7 +15653,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Aron and Grauman,\n Kristen\n},\n title = {\n Semantic Jitter: Dense Supervision for Visual Comparisons via Synthetic Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Semantic Line Detection and Its Applications", @@ -14725,6 +15662,7 @@ "status": "Poster", "track": "main", "pid": "1298", + "author_site": "Jun-Tae Lee; Han-Ul Kim; Chul Lee; Chang-Su Kim", "author": "Jun-Tae Lee; Han-Ul Kim; Chul Lee; Chang-Su Kim", "abstract": "Semantic lines characterize the layout of an image. Despite their importance in image analysis and scene understanding, there is no reliable research for semantic line detection. In this paper, we propose a semantic line detector using a convolutional neural network with multi-task learning, by regarding the line detection as a combination of classification and regression tasks. We use convolution and max-pooling layers to obtain multi-scale feature maps for an input image. Then, we develop the line pooling layer to extract a feature vector for each candidate line from the feature maps. Next, we feed the feature vector into the parallel classification and regression layers. 
The classification layer decides whether the line candidate is semantic or not. In case of a semantic line, the regression layer determines the offset for refining the line location. Experimental results show that the proposed detector extracts semantic lines accurately and reliably. Moreover, we demonstrate that the proposed detector can be used successfully in three applications: horizon estimation, composition enhancement, and image simplification.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_Semantic_Line_Detection_ICCV_2017_paper.pdf", @@ -14739,7 +15677,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Semantic_Line_Detection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Semantic_Line_Detection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Jun-Tae and Kim,\n Han-Ul and Lee,\n Chul and Kim,\n Chang-Su\n},\n title = {\n Semantic Line Detection and Its Applications\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Semantic Video CNNs Through Representation Warping", @@ -14747,10 +15686,11 @@ "status": "Oral", "track": "main", "pid": "408", + "author_site": "Raghudeep Gadde; Varun Jampani; Peter V. Gehler", "author": "Raghudeep Gadde; Varun Jampani; Peter V. Gehler", "abstract": "In this work, we propose a technique to convert CNN models for semantic segmentation of static images into CNNs for video data. We describe a warping method that can be used to augment existing architectures with very little extra computational cost. This module is called NetWarp and we demonstrate its use for a range of network architectures. The main design principle is to use optical flow of adjacent frames for warping internal network representations across time. 
A key insight of this work is that fast optical flow methods can be combined with many different CNN architectures for improved performance and end-to-end training. Experiments validate that the proposed approach incurs only little extra computational cost, while improving performance, when video streams are available. We achieve new state-of-the-art results on the CamVid and Cityscapes benchmark datasets and show consistent improvements over different baseline networks. Our code and models are available at http://segmentation.is.tue.mpg.de", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gadde_Semantic_Video_CNNs_ICCV_2017_paper.pdf", - "aff": "MPI for Intelligent Systems+Bernstein Center for Computational Neuroscience+NVIDIA; MPI for Intelligent Systems+NVIDIA; MPI for Intelligent Systems+University of W\u00fcrzburg+Bernstein Center for Computational Neuroscience", + "aff": "MPI for Intelligent Systems+Bernstein Center for Computational Neuroscience+NVIDIA; MPI for Intelligent Systems+NVIDIA; MPI for Intelligent Systems+University of Würzburg+Bernstein Center for Computational Neuroscience", "project": "http://segmentation.is.tue.mpg.de", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Gadde_Semantic_Video_CNNs_ICCV_2017_supplemental.pdf", @@ -14764,14 +15704,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gadde_Semantic_Video_CNNs_ICCV_2017_paper.html", "aff_unique_index": "0+1+2;0+2;0+3+1", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Bernstein Center for Computational Neuroscience;NVIDIA;University of W\u00fcrzburg", - "aff_unique_dep": ";Computational Neuroscience;NVIDIA Corporation;", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Bernstein Center for Computational Neuroscience;NVIDIA Corporation;University of Würzburg", + "aff_unique_dep": ";Computational Neuroscience;;", "aff_unique_url": 
"https://www.mpi-is.mpg.de;https://www.bccn-berlin.de;https://www.nvidia.com;https://www.uni-wuerzburg.de", "aff_unique_abbr": "MPI-IS;BCCN;NVIDIA;UWue", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;0+1;0+0+0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Gadde_2017_ICCV,\n \n author = {\n Gadde,\n Raghudeep and Jampani,\n Varun and Gehler,\n Peter V.\n},\n title = {\n Semantic Video CNNs Through Representation Warping\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Semantically Informed Multiview Surface Refinement", @@ -14779,7 +15720,7 @@ "status": "Poster", "track": "main", "pid": "1678", - "author_site": "Maro\u00c5\u00a1 Bl\u00c3\u00a1ha; Mathias Rothermel; Martin R. Oswald; Torsten Sattler; Audrey Richard; Jan D. Wegner; Marc Pollefeys; Konrad Schindler", + "author_site": "Maroš Bláha; Mathias Rothermel; Martin R. Oswald; Torsten Sattler; Audrey Richard; Jan D. Wegner; Marc Pollefeys; Konrad Schindler", "author": "Maros Blaha; Mathias Rothermel; Martin R. Oswald; Torsten Sattler; Audrey Richard; Jan D. Wegner; Marc Pollefeys; Konrad Schindler", "abstract": "We present a method to jointly refine the geometry and semantic segmentation of 3D surface meshes. Our method alternates between updating the shape and the semantic labels. In the geometry refinement step, the mesh is deformed with variational energy minimization, such that it simultaneously maximizes photo-consistency and the compatibility of the semantic segmentations across a set of calibrated images. Label-specific shape priors account for interactions between the geometry and the semantic labels in 3D. 
In the semantic segmentation step, the labels on the mesh are updated with MRF inference, such that they are compatible with the semantic segmentations in the input images. Also, this step includes prior assumptions about the surface shape of different semantic classes. The priors induce a tight coupling, where semantic information influences the shape update and vice versa. Specifically, we introduce priors that favor (i) adaptive smoothing, depending on the class label; (ii) straightness of class boundaries; and (iii) semantic labels that are consistent with the surface orientation. The novel mesh-based reconstruction is evaluated in a series of experiments with real and synthetic data. We compare both to state-of-the-art, voxel-based semantic 3D reconstruction, and to purely geometric mesh refinement, and demonstrate that the proposed scheme yields improved 3D geometry as well as an improved semantic segmentation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Blaha_Semantically_Informed_Multiview_ICCV_2017_paper.pdf", @@ -14795,7 +15736,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Blaha_Semantically_Informed_Multiview_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Blaha_Semantically_Informed_Multiview_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Blaha_2017_ICCV,\n \n author = {\n Blaha,\n Maros and Rothermel,\n Mathias and Oswald,\n Martin R. and Sattler,\n Torsten and Richard,\n Audrey and Wegner,\n Jan D. 
and Pollefeys,\n Marc and Schindler,\n Konrad\n},\n title = {\n Semantically Informed Multiview Surface Refinement\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Semi Supervised Semantic Segmentation Using Generative Adversarial Network", @@ -14803,6 +15745,7 @@ "status": "Poster", "track": "main", "pid": "2962", + "author_site": "Nasim Souly; Concetto Spampinato; Mubarak Shah", "author": "Nasim Souly; Concetto Spampinato; Mubarak Shah", "abstract": "Semantic segmentation has been a long standing challenging task in computer vision. It aims at assigning a label to each image pixel and needs a significant number of pixel-level annotated data, which is often unavailable. To address this lack of annotations, in this paper, we leverage, on one hand, a massive amount of available unlabeled or weakly labeled data, and on the other hand, non-realimages created through Generative Adversarial Networks. In particular, we propose a semi-supervised framework -based on Generative Adversarial Networks (GANs) - which consists of a generator network to provide extra training examples to a multi-class classifier, acting as discriminator in the GAN framework, that assigns sample a label y from the K possible classes or marks it as a fake sample (extra class). The underlying idea is that adding large fake visual data forces real samples to be close in the feature space, which, in turn, improves multiclass pixel classification. To ensure a higher quality of generated images by GANs with consequently improved pixel classification, we extend the above framework by adding weakly annotated data, i.e., we provide class level information to the generator. We test our approaches on several challenging benchmarking visual datasets, i.e. 
PASCAL, SiftFLow, Stanford and CamVid, achieving competitive performance compared to state-of-the-art semantic segmentation methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Souly__Semi_Supervised_ICCV_2017_paper.pdf", @@ -14818,7 +15761,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Souly__Semi_Supervised_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Souly__Semi_Supervised_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Souly_2017_ICCV,\n \n author = {\n Souly,\n Nasim and Spampinato,\n Concetto and Shah,\n Mubarak\n},\n title = {\n Semi Supervised Semantic Segmentation Using Generative Adversarial Network\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Semi-Global Weighted Least Squares in Image Filtering", @@ -14826,6 +15770,7 @@ "status": "Poster", "track": "main", "pid": "2975", + "author_site": "Wei Liu; Xiaogang Chen; Chuanhua Shen; Zhi Liu; Jie Yang", "author": "Wei Liu; Xiaogang Chen; Chuanhua Shen; Zhi Liu; Jie Yang", "abstract": "Solving the global method of Weighted Least Squares (WLS) model in image filtering is both time- and memory-consuming. In this paper, we present an alternative approximation in a time- and memory- efficient manner which is denoted as Semi-Global Weighed Least Squares (SG-WLS). Instead of solving a large linear system, we propose to iteratively solve a sequence of subsystems which are one-dimensional WLS models. Although each subsystem is one-dimensional, it can take two-dimensional neighborhood information into account due to the proposed special neighborhood construction. We show such a desirable property makes our SG-WLS achieve close performance to the original two-dimensional WLS model but with much less time and memory cost. 
While previous related methods mainly focus on the 4-connected/8-connected neighborhood system, our SG-WLS can handle a more general and larger neighborhood system thanks to the proposed fast solution. We show such a generalization can achieve better performance than the 4-connected/8-connected neighborhood system in some applications. Our SG-WLS is ~20 times faster than the WLS model. For an image of MxN, the memory cost of SG-WLS is at most at the magnitude of max\\ 1 / M, 1 / N\\ of that of the WLS model. We show the effectiveness and efficiency of our SG-WLS in a range of applications.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Semi-Global_Weighted_Least_ICCV_2017_paper.pdf", @@ -14843,14 +15788,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Semi-Global_Weighted_Least_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;3;0", - "aff_unique_norm": "Shanghai Jiao Tong University;University of Shanghai for Science and Technology;University of Adelaide;Shanghai University", + "aff_unique_norm": "Shanghai Jiao Tong University;University of Shanghai for Science and Technology;The University of Adelaide;Shanghai University", "aff_unique_dep": ";;;", - "aff_unique_url": "https://www.sjtu.edu.cn;https://www.usst.edu.cn;https://www.adelaide.edu.au;https://www.shu.edu.cn", + "aff_unique_url": "https://www.sjtu.edu.cn;http://www.usst.edu.cn;https://www.adelaide.edu.au;https://www.shu.edu.cn", "aff_unique_abbr": "SJTU;USST;Adelaide;SHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Wei and Chen,\n Xiaogang and Shen,\n Chuanhua and Liu,\n Zhi and Yang,\n Jie\n},\n title = {\n Semi-Global Weighted Least Squares in Image Filtering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer 
Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Shadow Detection With Conditional Generative Adversarial Networks", @@ -14858,6 +15804,7 @@ "status": "Oral", "track": "main", "pid": "1866", + "author_site": "Vu Nguyen; Tomas F. Yago Vicente; Maozheng Zhao; Minh Hoai; Dimitris Samaras", "author": "Vu Nguyen; Tomas F. Yago Vicente; Maozheng Zhao; Minh Hoai; Dimitris Samaras", "abstract": "We introduce scGAN, a novel extension of conditional Generative Adversarial Networks (GAN) tailored for the challenging problem of shadow detection in images. Previous methods for shadow detection focus on learning the local appearance of shadow regions, while using limited local context reasoning in the form of pairwise potentials in a Conditional Random Field. In contrast, the proposed adversarial approach is able to model higher level relationships and global scene characteristics. We train a shadow detector that corresponds to the generator of a conditional GAN, and augment its shadow accuracy by combining the typical GAN loss with a data loss term. Due to the unbalanced distribution of the shadow labels, we use weighted cross entropy. With the standard GAN architecture, properly setting the weight for the cross entropy would require training multiple GANs, a computationally expensive grid procedure. In scGAN, we introduce an additional sensitivity parameter w to the generator. The proposed approach effectively parameterizes the loss of the trained detector. The resulting shadow detector is a single network that can generate shadow maps corresponding to different sensitivity levels, obviating the need for multiple models and a costly training procedure. 
We evaluate our method on the large-scale SBU and UCF shadow datasets, and observe up to 17% error reduction with respect to the previous state-of-the-art method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nguyen_Shadow_Detection_With_ICCV_2017_paper.pdf", @@ -14882,7 +15829,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nguyen_2017_ICCV,\n \n author = {\n Nguyen,\n Vu and Yago Vicente,\n Tomas F. and Zhao,\n Maozheng and Hoai,\n Minh and Samaras,\n Dimitris\n},\n title = {\n Shadow Detection With Conditional Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Shape Inpainting Using 3D Generative Adversarial Network and Recurrent Convolutional Networks", @@ -14890,6 +15838,7 @@ "status": "Poster", "track": "main", "pid": "946", + "author_site": "Weiyue Wang; Qiangui Huang; Suya You; Chao Yang; Ulrich Neumann", "author": "Weiyue Wang; Qiangui Huang; Suya You; Chao Yang; Ulrich Neumann", "abstract": "Recent advances in convolutional neural networks have shown promising results in 3D shape completion. But due to GPU memory limitations, these methods can only produce low-resolution outputs. To inpaint 3D models with semantic plausibility and contextual details, we introduce a hybrid framework that combines a 3D Encoder-Decoder Generative Adversarial Network (3D-ED-GAN) and a Long-term Recurrent Convolutional Network (LRCN). The 3D-ED-GAN is a 3D convolutional neural network trained with a generative adversarial paradigm to fill missing 3D data in low-resolution. LRCN adopts a recurrent neural network architecture to minimize GPU memory usage and incorporates an Encoder-Decoder pair into a Long Short-term Memory Network. 
By handling the 3D model as a sequence of 2D slices, LRCN transforms a coarse 3D shape into a more complete and higher resolution volume. While 3D-ED-GAN captures global contextual structure of the 3D shape, LRCN localizes the fine-grained details. Experimental results on both real-world and synthetic data show reconstructions from corrupted models result in complete and high-resolution 3D objects.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Shape_Inpainting_Using_ICCV_2017_paper.pdf", @@ -14914,7 +15863,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Weiyue and Huang,\n Qiangui and You,\n Suya and Yang,\n Chao and Neumann,\n Ulrich\n},\n title = {\n Shape Inpainting Using 3D Generative Adversarial Network and Recurrent Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Should We Encode Rain Streaks in Video as Deterministic or Stochastic?", @@ -14922,10 +15872,11 @@ "status": "Poster", "track": "main", "pid": "1167", + "author_site": "Wei Wei; Lixuan Yi; Qi Xie; Qian Zhao; Deyu Meng; Zongben Xu", "author": "Wei Wei; Lixuan Yi; Qi Xie; Qian Zhao; Deyu Meng; Zongben Xu", "abstract": "Videos taken in the wild sometimes contain unexpected rain streaks, which brings difficulty in subsequent video processing tasks. Rain streak removal in a video (RSRV) is thus an important issue and has been attracting much attention in computer vision. Different from previous RSRV methods formulating rain streaks as a deterministic message, this work first encodes the rains in a stochastic manner, i.e., a patch-based mixture of Gaussians. 
Such modification makes the proposed model capable of finely adapting a wider range of rain variations instead of certain types of rain configurations as traditional. By integrating with the spatiotemporal smoothness configuration of moving objects and low-rank structure of background scene, we propose a concise model for RSRV, containing one likelihood term imposed on the rain streak layer and two prior terms on the moving object and background scene layers of the video. Experiments implemented on videos with synthetic and real rains verify the superiority of the proposed method, as com- pared with the state-of-the-art methods, both visually and quantitatively in various performance metrics.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wei_Should_We_Encode_ICCV_2017_paper.pdf", - "aff": "School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi\u2019an Jiaotong University", + "aff": "School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong 
University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University + Ministry of Education Key Lab of Intelligent Networks and Network Security, Xi’an Jiaotong University", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Wei_Should_We_Encode_ICCV_2017_supplemental.zip", @@ -14939,14 +15890,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wei_Should_We_Encode_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0+0;0+0;0+0", - "aff_unique_norm": "Xi'an Jiao Tong University", + "aff_unique_norm": "Xi'an Jiaotong University", "aff_unique_dep": "School of Mathematics and Statistics", "aff_unique_url": "http://en.xjtu.edu.cn/", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "0;0;0;0+0;0+0;0+0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2017_ICCV,\n \n author = {\n Wei,\n Wei and Yi,\n Lixuan and Xie,\n Qi and Zhao,\n Qian and Meng,\n Deyu and Xu,\n Zongben\n},\n title = {\n Should We Encode Rain Streaks in Video as Deterministic or Stochastic?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Show, Adapt and Tell: Adversarial Training of Cross-Domain Image Captioner", @@ -14954,6 +15906,7 @@ "status": "Poster", "track": "main", "pid": "248", + "author_site": "Tseng-Hung Chen; Yuan-Hong Liao; Ching-Yao Chuang; Wan-Ting Hsu; Jianlong Fu; Min Sun", "author": "Tseng-Hung Chen; Yuan-Hong Liao; Ching-Yao Chuang; Wan-Ting Hsu; Jianlong Fu; Min Sun", "abstract": "Impressive image captioning results are achieved in domains with plenty of training image and sentence pairs (e.g., MSCOCO). 
However, transferring to a target domain with significant domain shifts but no paired training data (referred to as cross-domain image captioning) remains largely unexplored. We propose a novel adversarial training procedure to leverage unpaired data in the target domain. Two critic networks are introduced to guide the captioner, namely domain critic and multi-modal critic. The domain critic assesses whether the generated sentences are indistinguishable from sentences in the target domain. The multi-modal critic assesses whether an image and its generated sentence are a valid pair. During training, the critics and captioner act as adversaries -- captioner aims to generate indistinguishable sentences, whereas critics aim at distinguishing them. The assessment improves the captioner through policy gradient updates. During inference, we further propose a novel critic-based planning method to select high-quality sentences without additional supervision (e.g., tags). To evaluate, we use MSCOCO as the source domain and four other datasets (CUB-200-2011, Oxford-102, TGIF, and Flickr30k) as the target domains. Our method consistently performs well on all datasets. In particular, on CUB-200-2011, we achieve 21.8% CIDEr-D improvement after adaptation. 
Utilizing critics during inference further gives another 4.5% boost.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Show_Adapt_and_ICCV_2017_paper.pdf", @@ -14968,7 +15921,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Show_Adapt_and_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Show_Adapt_and_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Tseng-Hung and Liao,\n Yuan-Hong and Chuang,\n Ching-Yao and Hsu,\n Wan-Ting and Fu,\n Jianlong and Sun,\n Min\n},\n title = {\n Show,\n Adapt and Tell: Adversarial Training of Cross-Domain Image Captioner\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Side Information in Robust Principal Component Analysis: Algorithms and Applications", @@ -14976,6 +15930,7 @@ "status": "Poster", "track": "main", "pid": "1884", + "author_site": "Niannan Xue; Yannis Panagakis; Stefanos Zafeiriou", "author": "Niannan Xue; Yannis Panagakis; Stefanos Zafeiriou", "abstract": "Robust Principal Component Analysis (RPCA) aims at recovering a low-rank subspace from grossly corrupted high-dimensional (often visual) data and is a cornerstone in many machine learning and computer vision applications. Even though RPCA has been shown to be very successful in solving many rank minimisation problems, there are still cases where degenerate or suboptimal solutions are obtained. This is likely to be remedied by taking into account of domain-dependent prior knowledge. In this paper, we propose two models for the RPCA problem with the aid of side information on the low-rank structure of the data. 
The versatility of the proposed methods is demonstrated by applying them to four applications, namely background subtraction, facial image denoising, face and facial expression recognition. Experimental results on synthetic and five real world datasets indicate the robustness and effectiveness of the proposed methods on these application domains, largely outperforming six previous approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xue_Side_Information_in_ICCV_2017_paper.pdf", @@ -15000,7 +15955,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Xue_2017_ICCV,\n \n author = {\n Xue,\n Niannan and Panagakis,\n Yannis and Zafeiriou,\n Stefanos\n},\n title = {\n Side Information in Robust Principal Component Analysis: Algorithms and Applications\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Simultaneous Detection and Removal of High Altitude Clouds From an Image", @@ -15008,6 +15964,7 @@ "status": "Poster", "track": "main", "pid": "2387", + "author_site": "Tushar Sandhan; Jin Young Choi", "author": "Tushar Sandhan; Jin Young Choi", "abstract": "Interestingly, shape of the high-altitude clouds serves as a beacon for weather forecasting, so its detection is of vital importance. Besides these clouds often cause hindrance in an endeavor of satellites to inspect our world. Even thin clouds produce the undesired superposition of visual information, whose decomposition into the clear background and cloudy layer using a single satellite image is a highly ill-posed problem. 
In this work, we derive sophisticated image priors by thoroughly analyzing the properties of high-altitude clouds and geological images; and formulate a non-convex optimization scheme, which simultaneously detects and removes the clouds within a few seconds. Experimental results on real world RGB images demonstrate that the proposed method outperforms the other competitive methods by retaining the comprehensive background details and producing the precise shape of the cloudy layer.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sandhan_Simultaneous_Detection_and_ICCV_2017_paper.pdf", @@ -15022,7 +15979,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sandhan_Simultaneous_Detection_and_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sandhan_Simultaneous_Detection_and_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Sandhan_2017_ICCV,\n \n author = {\n Sandhan,\n Tushar and Young Choi,\n Jin\n},\n title = {\n Simultaneous Detection and Removal of High Altitude Clouds From an Image\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Single Image Action Recognition Using Semantic Body Part Actions", @@ -15030,6 +15988,7 @@ "status": "Poster", "track": "main", "pid": "1367", + "author_site": "Zhichen Zhao; Huimin Ma; Shaodi You", "author": "Zhichen Zhao; Huimin Ma; Shaodi You", "abstract": "In this paper, we propose a novel single image action recognition algorithm based on the idea of semantic part actions. Unlike existing part-based methods, we argue that there exists a mid-level semantic, the semantic part action; and human action is a combination of semantic part actions and context cues. In detail, we divide human body into seven parts: head, torso, arms, hands and lower body. 
For each of them, we define a few semantic part actions (e.g.head: laughing). Finally, we exploit these part actions to infer the entire body action (e.g. applauding). To make the proposed idea practical, we propose a deep network-based framework which consists of two subnetworks, one for part localization and the other for action prediction. The action prediction network jointly learns part-level and body-level action semantics and combines them for the final decision. Extensive experiments demonstrate our proposal on semantic part actions as elements for entire body action. Our method reaches mAP of 93.9% and 91.2% on PASCAL VOC 2012 and Stanford-40, which outperforms the state-of-the-art by 2.3% and 8.6%.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhao_Single_Image_Action_ICCV_2017_paper.pdf", @@ -15054,7 +16013,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhao_2017_ICCV,\n \n author = {\n Zhao,\n Zhichen and Ma,\n Huimin and You,\n Shaodi\n},\n title = {\n Single Image Action Recognition Using Semantic Body Part Actions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Single Shot Text Detector With Regional Attention", @@ -15062,6 +16022,7 @@ "status": "Spotlight", "track": "main", "pid": "1946", + "author_site": "Pan He; Weilin Huang; Tong He; Qile Zhu; Yu Qiao; Xiaolin Li", "author": "Pan He; Weilin Huang; Tong He; Qile Zhu; Yu Qiao; Xiaolin Li", "abstract": "We present a novel single-shot text detector that directly outputs word-level bounding boxes in a natural image. We propose an attention mechanism which roughly identifies text regions via an automatically learned attentional map. 
This substantially suppresses background interference in the convolutional features, which is the key to producing accurate inference of words, particularly at extremely small sizes. This results in a single model that essentially works in a coarse-to-fine manner. It departs from recent FCN-based text detectors which cascade multiple FCN models to achieve an accurate prediction. Furthermore, we develop a hierarchical inception module which efficiently aggregates multi-scale inception features. This enhances local details, and also encodes strong context information, allowing the detector to work reliably on multi-scale and multi-orientation text with single-scale images. Our text detector achieves an F-measure of 77% on the ICDAR 2015 benchmark, advancing the state-of-the-art results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/He_Single_Shot_Text_ICCV_2017_paper.pdf", @@ -15077,7 +16038,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/He_Single_Shot_Text_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/He_Single_Shot_Text_ICCV_2017_paper.html", + "bibtex": "@InProceedings{He_2017_ICCV,\n \n author = {\n He,\n Pan and Huang,\n Weilin and He,\n Tong and Zhu,\n Qile and Qiao,\n Yu and Li,\n Xiaolin\n},\n title = {\n Single Shot Text Detector With Regional Attention\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Situation Recognition With Graph Neural Networks", @@ -15085,6 +16047,7 @@ "status": "Poster", "track": "main", "pid": "1857", + "author_site": "Ruiyu Li; Makarand Tapaswi; Renjie Liao; Jiaya Jia; Raquel Urtasun; Sanja Fidler", "author": "Ruiyu Li; Makarand Tapaswi; Renjie Liao; Jiaya Jia; Raquel Urtasun; Sanja Fidler", "abstract": "We address the problem of recognizing situations in images. 
Given an image, the task is to predict the most salient verb (action), and fill its semantic roles such as who is performing the action, what is the source and target of the action, etc. Different verbs have different roles (e.g. attacking has weapon), and each role can take on many possible values (nouns). We propose a model based on Graph Neural Networks that allows us to efficiently capture joint dependencies between roles using neural networks defined on a graph. Experiments with different graph connectivities show that our approach that propagates information between roles significantly outperforms existing work, as well as multiple baselines. We obtain roughly 3-5% improvement over previous work in predicting the full situation. We also provide a thorough qualitative analysis of our model and influence of different roles in the verbs.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Situation_Recognition_With_ICCV_2017_paper.pdf", @@ -15102,14 +16065,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Situation_Recognition_With_ICCV_2017_paper.html", "aff_unique_index": "0;1;1;0+2;1+3+4;1+4", - "aff_unique_norm": "Chinese University of Hong Kong;University of Toronto;Tencent;Uber;Vector Institute", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Toronto;Tencent;Uber;Vector Institute", "aff_unique_dep": ";;Youtu Lab;Advanced Technologies Group;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.utoronto.ca;https://www.tencent.com;https://www.uber.com;https://vectorinstitute.ai/", "aff_unique_abbr": "CUHK;U of T;Tencent;Uber ATG;Vector Institute", "aff_campus_unique_index": "0;0;;", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;0+0;1+2+1;1+1", - "aff_country_unique": "China;Canada;United States" + "aff_country_unique": "China;Canada;United States", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Ruiyu and Tapaswi,\n Makarand and Liao,\n 
Renjie and Jia,\n Jiaya and Urtasun,\n Raquel and Fidler,\n Sanja\n},\n title = {\n Situation Recognition With Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Sketching With Style: Visual Search With Sketches and Aesthetic Context", @@ -15117,6 +16081,7 @@ "status": "Poster", "track": "main", "pid": "1166", + "author_site": "John Collomosse; Tu Bui; Michael J. Wilber; Chen Fang; Hailin Jin", "author": "John Collomosse; Tu Bui; Michael J. Wilber; Chen Fang; Hailin Jin", "abstract": "We propose a novel measure of visual similarity for image retrieval that incorporates both structural and aesthetic (style) constraints. Our algorithm accepts a query as sketched shape, and a set of one or more contextual images specifying the desired visual aesthetic. A triplet network is used to learn a feature embedding capable of measuring style similarity independent of structure, delivering significant gains over previous networks for style discrimination. We incorporate this model within a hierarchical triplet network to unify and learn a joint space from two discriminatively trained streams for style and structure. We demonstrate that this space enables, for the first time, style-constrained sketch search over a diverse domain of digital artwork comprising graphics, paintings and drawings. We also briefly explore alternative query modalities.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Collomosse_Sketching_With_Style_ICCV_2017_paper.pdf", @@ -15141,7 +16106,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";New York City", "aff_country_unique_index": "0+1;0;1;1;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Collomosse_2017_ICCV,\n \n author = {\n Collomosse,\n John and Bui,\n Tu and Wilber,\n Michael J. 
and Fang,\n Chen and Jin,\n Hailin\n},\n title = {\n Sketching With Style: Visual Search With Sketches and Aesthetic Context\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Smart Mining for Deep Metric Learning", @@ -15149,6 +16115,7 @@ "status": "Poster", "track": "main", "pid": "1205", + "author_site": "Ben Harwood; Vijay Kumar B G; Gustavo Carneiro; Ian Reid; Tom Drummond", "author": "Ben Harwood; Vijay Kumar B G; Gustavo Carneiro; Ian Reid; Tom Drummond", "abstract": "To solve deep metric learning problems and produce feature embeddings, current methodologies will commonly use a triplet model to minimise the relative distance between samples from the same class and maximise the relative distance between samples from different classes. Though successful, the training convergence of this triplet model can be compromised by the fact that the vast majority of the training samples will produce gradients with magnitudes that are close to zero. This issue has motivated the development of methods that explore the global structure of the embedding and other methods that explore hard negative/positive mining. The effectiveness of such mining methods is often associated with intractable computational requirements. In this paper, we propose a novel deep metric learning method that combines the triplet model and the global structure of the embedding space. We rely on a smart mining procedure that produces effective training samples for a low computational cost. In addition, we propose an adaptive controller that automatically adjusts the smart mining hyper-parameters and speeds up the convergence of the training process. We show empirically that our proposed method allows for fast and more accurate training of triplet ConvNets than other competing mining methods. 
Additionally, we show that our method achieves new state-of-the-art embedding results for CUB-200-2011 and Cars196 datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Harwood_Smart_Mining_for_ICCV_2017_paper.pdf", @@ -15173,7 +16140,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Harwood_2017_ICCV,\n \n author = {\n Harwood,\n Ben and Kumar B G,\n Vijay and Carneiro,\n Gustavo and Reid,\n Ian and Drummond,\n Tom\n},\n title = {\n Smart Mining for Deep Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Soft Proposal Networks for Weakly Supervised Object Localization", @@ -15181,6 +16149,7 @@ "status": "Poster", "track": "main", "pid": "690", + "author_site": "Yi Zhu; Yanzhao Zhou; Qixiang Ye; Qiang Qiu; Jianbin Jiao", "author": "Yi Zhu; Yanzhao Zhou; Qixiang Ye; Qiang Qiu; Jianbin Jiao", "abstract": "Weakly supervised object localization remains challenging, where only image labels instead of bounding boxes are available during training. Object proposal is an effective component in localization, but often computationally expensive and incapable of joint optimization with some of the remaining modules. In this paper, to the best of our knowledge, we for the first time integrate weakly supervised object proposal into convolutional neural networks (CNNs) in an end-to-end learning manner. We design a network component, Soft Proposal (SP), to be plugged into any standard convolutional architecture to introduce the nearly cost-free object proposal, orders of magnitude faster than state-of-the-art methods. 
In the SP-augmented CNNs, referred to as Soft Proposal Networks (SPNs), iteratively evolved object proposals are generated based on the deep feature maps then projected back, and further jointly optimized with network parameters, with image-level supervision only. Through the unified learning process, SPNs learn better object-centric filters, discover more discriminative visual evidence, and suppress background interference, significantly boosting both weakly supervised object localization and classification performance. We report the best results on popular benchmarks, including PASCAL VOC, MS COCO, and ImageNet.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Soft_Proposal_Networks_ICCV_2017_paper.pdf", @@ -15195,7 +16164,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Soft_Proposal_Networks_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Soft_Proposal_Networks_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Yi and Zhou,\n Yanzhao and Ye,\n Qixiang and Qiu,\n Qiang and Jiao,\n Jianbin\n},\n title = {\n Soft Proposal Networks for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Soft-NMS -- Improving Object Detection With One Line of Code", @@ -15203,6 +16173,7 @@ "status": "Poster", "track": "main", "pid": "2709", + "author_site": "Navaneeth Bodla; Bharat Singh; Rama Chellappa; Larry S. Davis", "author": "Navaneeth Bodla; Bharat Singh; Rama Chellappa; Larry S. Davis", "abstract": "Non-maximum suppression is an integral part of the object detection pipeline. First, it sorts all detection boxes on the basis of their scores. 
The detection box M with the maximum score is selected and all other detection boxes with a significant overlap (using a pre-defined threshold) with M are suppressed. This process is recursively applied on the remaining boxes. As per the design of the algorithm, if an object lies within the predefined overlap threshold, it leads to a miss. To this end, we propose Soft-NMS, an algorithm which decays the detection scores of all other objects as a continuous function of their overlap with M. Hence, no object is eliminated in this process. Soft-NMS obtains consistent improvements for the coco-style mAP metric on standard datasets like PASCAL VOC 2007 (1.7% for both R-FCN and Faster-RCNN) and MS-COCO (1.3% for R-FCN and 1.1% for Faster-RCNN) by just changing the NMS algorithm without any additional hyper-parameters. Using Deformable-RFCN, Soft-NMS improves state-of-the-art in object detection from 39.8% to 40.9% with a single model. Further, the computational complexity of Soft-NMS is the same as traditional NMS and hence it can be efficiently implemented. Since Soft-NMS does not require any extra training and is simple to implement, it can be easily integrated into any object detection pipeline. 
Code for Soft-NMS is publicly available on GitHub", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bodla_Soft-NMS_--_Improving_ICCV_2017_paper.pdf", @@ -15227,7 +16198,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bodla_2017_ICCV,\n \n author = {\n Bodla,\n Navaneeth and Singh,\n Bharat and Chellappa,\n Rama and Davis,\n Larry S.\n},\n title = {\n Soft-NMS -- Improving Object Detection With One Line of Code\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Space-Time Localization and Mapping", @@ -15235,6 +16207,7 @@ "status": "Poster", "track": "main", "pid": "1953", + "author_site": "Minhaeng Lee; Charless C. Fowlkes", "author": "Minhaeng Lee; Charless C. Fowlkes", "abstract": "This paper addresses the problem of building a spatio-temporal model of the world from a stream of time-stamped data. Unlike traditional models for simultaneous localization and mapping (SLAM) and structure-from-motion (SfM) which focus on recovering a single rigid 3D model, we tackle the problem of mapping scenes in which dynamic components appear, move and disappear independently of each other over time. We introduce a simple generative probabilistic model of 4D structure which specifies location, spatial and temporal extent of rigid surface patches by local Gaussian mixtures. We fit this model to a time-stamped stream of input data using expectation-maximization to estimate the model structure parameters (mapping) and the alignment of the input data to the model (localization). By explicitly representing the temporal extent and observability of surfaces in a scene, our method yields superior localization and reconstruction relative to baselines that assume a static 3D scene. 
We carry out experiments on both synthetic RGB-D data streams as well as challenging real-world datasets, tracking scene dynamics in a human workspace over the course of several weeks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_Space-Time_Localization_and_ICCV_2017_paper.pdf", @@ -15253,13 +16226,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Space-Time_Localization_and_ICCV_2017_paper.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Irvine", - "aff_unique_dep": "Dept. of Computer Science", + "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.uci.edu", "aff_unique_abbr": "UCI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Minhaeng and Fowlkes,\n Charless C.\n},\n title = {\n Space-Time Localization and Mapping\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Sparse Exact PGA on Riemannian Manifolds", @@ -15267,6 +16241,7 @@ "status": "Poster", "track": "main", "pid": "2035", + "author_site": "Monami Banerjee; Rudrasis Chakraborty; Baba C. Vemuri", "author": "Monami Banerjee; Rudrasis Chakraborty; Baba C. Vemuri", "abstract": "Principal Component Analysis (PCA) is a widely popular dimensionality reduction technique for vector-valued inputs. In the past decade, a nonlinear generalization of PCA, called the Principal Geodesic Analysis (PGA) was developed to tackle data that lie on a smooth manifold. PGA suffers from the same problem as PCA in that, in both the methods, each Principal Component (PC) is a linear combination of the original variables. This makes it very difficult to interpret the PCs especially in high dimensions. 
This lead to the introduction of sparse PCA (SPCA) in the vector space input case. In this paper, we present a novel generalization of SPCA, called sparse exact PGA (SEPGA) that can cope with manifold-valued input data and respect the intrinsic geometry of the underlying manifold. Sparsity has the advantage of not only easy interpretability but also computational efficiency. We achieve this by formulating the PGA problem as a minimization of the projection error in conjunction with sparsity constraints enforced on the principal vectors post isomorphic mapping to Rm, where m is the dimension of the manifold on which the data reside. Further, for constant curvature smooth manifolds, we use analytic formulae for the projection error leading to an efficient solution to the SEPGA problem. We present extensive experimental results demonstrating the performance of SEPGA to achieve very good sparse principal components without sacrificing the accuracy of reconstruction. This makes the representation of manifold-valued data using SEPGA accurate and efficient.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Banerjee_Sparse_Exact_PGA_ICCV_2017_paper.pdf", @@ -15291,7 +16266,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Gainesville", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Banerjee_2017_ICCV,\n \n author = {\n Banerjee,\n Monami and Chakraborty,\n Rudrasis and Vemuri,\n Baba C.\n},\n title = {\n Sparse Exact PGA on Riemannian Manifolds\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Spatial Memory for Context Reasoning in Object Detection", @@ -15299,6 +16275,7 @@ "status": "Poster", "track": "main", "pid": "1616", + "author_site": "Xinlei Chen; Abhinav Gupta", "author": "Xinlei Chen; Abhinav Gupta", "abstract": "Modeling 
instance-level context and object-object relationships is extremely challenging. It requires reasoning about bounding boxes of different locations, scales, aspect ratios etc.. Above all, instance-level spatial reasoning inherently requires modeling conditional distributions on previous detections. But our current object detection systems do not have any memory to remember what to condition on! The state-of-the-art object detectors still detect all object in parallel followed by non-maximal suppression (NMS). While memory has been used for tasks such as captioning and VQA, they use image-level memory cells without capturing the spatial layout. On the other hand, modeling object-object relationships requires spatial reasoning -- not only do we need a memory to store the spatial layout, but also a effective reasoning module to extract spatial patterns. This paper presents a conceptually simple yet powerful solution -- Spatial Memory Network (SMN), to model the instance-level context efficiently and effectively. Our spatial memory essentially assembles object instances back into a pseudo \"image\" representation that is easy to be fed into another ConvNet for object-object context reasoning. This leads to a new sequential reasoning architecture where image and memory are processed in parallel to obtain detections which update the memory again. 
We show our SMN architecture is effective as it provides 2.2% improvement over baseline Faster RCNN on the COCO dataset with VGG16.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Spatial_Memory_for_ICCV_2017_paper.pdf", @@ -15313,7 +16290,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Spatial_Memory_for_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Chen_Spatial_Memory_for_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Xinlei and Gupta,\n Abhinav\n},\n title = {\n Spatial Memory for Context Reasoning in Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Spatial-Aware Object Embeddings for Zero-Shot Localization and Classification of Actions", @@ -15321,6 +16299,7 @@ "status": "Oral", "track": "main", "pid": "851", + "author_site": "Pascal Mettes; Cees G. M. Snoek", "author": "Pascal Mettes; Cees G. M. Snoek", "abstract": "We aim for zero-shot localization and classification of human actions in video. Where traditional approaches rely on global attribute or object classification scores for their zero-shot knowledge transfer, our main contribution is a spatial-aware object embedding. To arrive at spatial awareness, we build our embedding on top of freely available actor and object detectors. Relevance of objects is determined in a word embedding space and further enforced with estimated spatial preferences. Besides local object awareness, we also embed global object awareness into our embedding to maximize actor and object interaction. Finally, we exploit the object positions and sizes in the spatial-aware embedding to demonstrate a new spatio-temporal action retrieval scenario with composite queries. 
Action localization and classification experiments on four contemporary action video datasets support our proposal. Apart from state-of-the-art results in the zero-shot localization and classification settings, our spatial-aware embedding is even competitive with recent supervised action localization alternatives.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mettes_Spatial-Aware_Object_Embeddings_ICCV_2017_paper.pdf", @@ -15345,7 +16324,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Mettes_2017_ICCV,\n \n author = {\n Mettes,\n Pascal and Snoek,\n Cees G. M.\n},\n title = {\n Spatial-Aware Object Embeddings for Zero-Shot Localization and Classification of Actions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Spatio-Temporal Person Retrieval via Natural Language Queries", @@ -15353,6 +16333,7 @@ "status": "Poster", "track": "main", "pid": "591", + "author_site": "Masataka Yamaguchi; Kuniaki Saito; Yoshitaka Ushiku; Tatsuya Harada", "author": "Masataka Yamaguchi; Kuniaki Saito; Yoshitaka Ushiku; Tatsuya Harada", "abstract": "In this paper, we address the problem of spatio-temporal person retrieval from videos using a natural language query, in which we output a tube (i.e., a sequence of bounding boxes) which encloses the person described by the query. For this problem, we introduce a novel dataset consisting of videos containing people annotated with bounding boxes for each second and with five natural language descriptions. To retrieve the tube of the person described by a given natural language query, we design a model that combines methods for spatio-temporal human detection and multimodal retrieval. 
We conduct comprehensive experiments to compare a variety of tube and text representations and multimodal retrieval methods, and present a strong baseline in this task as well as demonstrate the efficacy of our tube representation and multimodal feature embedding technique. Finally, we demonstrate the versatility of our model by applying it to two other important tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yamaguchi_Spatio-Temporal_Person_Retrieval_ICCV_2017_paper.pdf", @@ -15370,14 +16351,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yamaguchi_Spatio-Temporal_Person_Retrieval_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "University of Tokyo;RIKEN", + "aff_unique_norm": "The University of Tokyo;RIKEN", "aff_unique_dep": "Graduate School of Information Science and Technology;", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.riken.jp", "aff_unique_abbr": "UTokyo;RIKEN", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Yamaguchi_2017_ICCV,\n \n author = {\n Yamaguchi,\n Masataka and Saito,\n Kuniaki and Ushiku,\n Yoshitaka and Harada,\n Tatsuya\n},\n title = {\n Spatio-Temporal Person Retrieval via Natural Language Queries\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Spatiotemporal Modeling for Crowd Counting in Videos", @@ -15385,6 +16367,7 @@ "status": "Poster", "track": "main", "pid": "2394", + "author_site": "Feng Xiong; Xingjian Shi; Dit-Yan Yeung", "author": "Feng Xiong; Xingjian Shi; Dit-Yan Yeung", "abstract": "Region of Interest (ROI) crowd counting can be formulated as a regression problem of learning a mapping from an image or a video frame to a crowd density map. 
Recently, convolutional neural network (CNN) models have achieved promising results for crowd counting. However, even when dealing with video data, CNN-based methods still consider each video frame independently, ignoring the strong temporal correlation between neighboring frames. To exploit the otherwise very useful temporal information in video sequences, we propose a variant of a recent deep learning model called convolutional LSTM (ConvLSTM) for crowd counting. Unlike the previous CNN-based methods, our method fully captures both spatial and temporal dependencies. Furthermore, we extend the ConvLSTM model to a bidirectional ConvLSTM model which can access long-range information in both directions. Extensive experiments using four publicly available datasets demonstrate the reliability of our approach and the effectiveness of incorporating temporal information to boost the accuracy of crowd counting. In addition, we also conduct some transfer learning experiments to show that once our model is trained on one dataset, its learning experience can be transferred easily to a new dataset which consists of only very few video frames for model adaptation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Xiong_Spatiotemporal_Modeling_for_ICCV_2017_paper.pdf", @@ -15409,7 +16392,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiong_2017_ICCV,\n \n author = {\n Xiong,\n Feng and Shi,\n Xingjian and Yeung,\n Dit-Yan\n},\n title = {\n Spatiotemporal Modeling for Crowd Counting in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Speaking the Same Language: Matching Machine to Human Captions by Adversarial Training", @@ -15417,10 +16401,11 @@ "status": "Poster", "track": "main", 
"pid": "1731", + "author_site": "Rakshith Shetty; Marcus Rohrbach; Lisa Anne Hendricks; Mario Fritz; Bernt Schiele", "author": "Rakshith Shetty; Marcus Rohrbach; Lisa Anne Hendricks; Mario Fritz; Bernt Schiele", "abstract": "While strong progress has been made in image captioning recently, machine and human captions are still quite distinct. This is primarily due to the deficiencies in the generated word distribution, vocabulary size, and strong bias in the generators towards frequent captions. Furthermore, humans -- rightfully so -- generate multiple, diverse captions, due to the inherent ambiguity in the captioning task which is not explicitly considered in today's systems. To address these challenges, we change the training objective of the caption generator from reproducing ground-truth captions to generating a set of captions that is indistinguishable from human written captions. Instead of handcrafting such a learning target, we employ adversarial training in combination with an approximate Gumbel sampler to implicitly match the generated distribution to the human one. 
While our method achieves comparable performance to the state-of-the-art in terms of the correctness of the captions, we generate a set of diverse captions that are significantly less biased and better match the global uni-, bi- and tri-gram distributions of the human captions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shetty_Speaking_the_Same_ICCV_2017_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany; UC Berkeley EECS, CA, United States+Facebook AI Research; UC Berkeley EECS, CA, United States; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany", + "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany; UC Berkeley EECS, CA, United States+Facebook AI Research; UC Berkeley EECS, CA, United States; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Shetty_Speaking_the_Same_ICCV_2017_supplemental.pdf", @@ -15434,14 +16419,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Shetty_Speaking_the_Same_ICCV_2017_paper.html", "aff_unique_index": "0;1+2;1;0;0", - "aff_unique_norm": "Max Planck Institute for Informatics;University of California, Berkeley;Meta", + "aff_unique_norm": "Max Planck Institute for Informatics;University of California, Berkeley;Facebook", "aff_unique_dep": ";Electrical Engineering and Computer Sciences;Facebook AI Research", "aff_unique_url": "https://mpi-inf.mpg.de;https://www.berkeley.edu;https://research.facebook.com", "aff_unique_abbr": "MPII;UC Berkeley;FAIR", "aff_campus_unique_index": "0;1;1;0;0", - "aff_campus_unique": 
"Saarbr\u00fccken;Berkeley;", + "aff_campus_unique": "Saarbrücken;Berkeley;", "aff_country_unique_index": "0;1+1;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Shetty_2017_ICCV,\n \n author = {\n Shetty,\n Rakshith and Rohrbach,\n Marcus and Anne Hendricks,\n Lisa and Fritz,\n Mario and Schiele,\n Bernt\n},\n title = {\n Speaking the Same Language: Matching Machine to Human Captions by Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "StackGAN: Text to Photo-Realistic Image Synthesis With Stacked Generative Adversarial Networks", @@ -15449,6 +16435,7 @@ "status": "Oral", "track": "main", "pid": "1208", + "author_site": "Han Zhang; Tao Xu; Hongsheng Li; Shaoting Zhang; Xiaogang Wang; Xiaolei Huang; Dimitris N. Metaxas", "author": "Han Zhang; Tao Xu; Hongsheng Li; Shaoting Zhang; Xiaogang Wang; Xiaolei Huang; Dimitris N. Metaxas", "abstract": "Synthesizing high-quality images from text descriptions is a challenging problem in computer vision and has many practical applications. Samples generated by existing text-to-image approaches can roughly reflect the meaning of the given descriptions, but they fail to contain necessary details and vivid object parts. In this paper, we propose Stacked Generative Adversarial Networks (StackGAN) to generate 256x256 photo-realistic images conditioned on text descriptions. We decompose the hard problem into more manageable sub-problems through a sketch-refinement process. The Stage-I GAN sketches the primitive shape and colors of the object based on the given text description, yielding Stage-I low-resolution images. The Stage-II GAN takes Stage-I results and text descriptions as inputs, and generates high-resolution images with photo-realistic details. 
It is able to rectify defects in Stage-I results and add compelling details with the refinement process. To improve the diversity of the synthesized images and stabilize the training of the conditional-GAN, we introduce a novel Conditioning Augmentation technique that encourages smoothness in the latent conditioning manifold. Extensive experiments and comparisons with state-of-the-arts on benchmark datasets demonstrate that the proposed method achieves significant improvements on generating photo-realistic images conditioned on text descriptions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_StackGAN_Text_to_ICCV_2017_paper.pdf", @@ -15466,14 +16453,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_StackGAN_Text_to_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;3;2;1;0", - "aff_unique_norm": "Rutgers University;Lehigh University;Chinese University of Hong Kong;Baidu", + "aff_unique_norm": "Rutgers University;Lehigh University;The Chinese University of Hong Kong;Baidu", "aff_unique_dep": ";;;Baidu Research", "aff_unique_url": "https://www.rutgers.edu;https://www.lehigh.edu;https://www.cuhk.edu.hk;https://research.baidu.com", "aff_unique_abbr": "Rutgers;Lehigh;CUHK;Baidu", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;1;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Han and Xu,\n Tao and Li,\n Hongsheng and Zhang,\n Shaoting and Wang,\n Xiaogang and Huang,\n Xiaolei and Metaxas,\n Dimitris N.\n},\n title = {\n StackGAN: Text to Photo-Realistic Image Synthesis With Stacked Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Stepwise Metric Promotion for Unsupervised Video Person 
Re-Identification", @@ -15481,6 +16469,7 @@ "status": "Poster", "track": "main", "pid": "983", + "author_site": "Zimo Liu; Dong Wang; Huchuan Lu", "author": "Zimo Liu; Dong Wang; Huchuan Lu", "abstract": "The intensive annotation cost and the rich but unlabeled data contained in videos motivate us to propose an unsupervised video-based person re-identification (re-ID) method. We start from two assumptions: 1) different video tracklets typically contain different persons, given that the tracklets are taken at distinct places or with long intervals; 2) within each tracklet, the frames are mostly of the same person. Based on these assumptions, this paper propose a stepwise metric promotion approach to estimate the identities of training tracklets, which iterates between cross-camera tracklet association and feature learning. Specifically, We use each training tracklet as a query, and perform retrieval in the cross camera training set. Our method is built on reciprocal nearest neighbor search and can eliminate the hard negative label matches, i.e., the cross-camera nearest neighbors of the false matches in the initial rank list. The tracklet that passes the reciprocal nearest neighbor check is considered to have the same ID with the query. 
Experimental results on the PRID 2011, ILIDS-VID, and MARS datasets show that the proposed method achieves very competitive re-ID accuracy compared with its supervised counterparts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Stepwise_Metric_Promotion_ICCV_2017_paper.pdf", @@ -15505,7 +16494,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Zimo and Wang,\n Dong and Lu,\n Huchuan\n},\n title = {\n Stepwise Metric Promotion for Unsupervised Video Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Stereo DSO: Large-Scale Direct Sparse Visual Odometry With Stereo Cameras", @@ -15513,7 +16503,7 @@ "status": "Poster", "track": "main", "pid": "1921", - "author_site": "Rui Wang; Martin Schw\u00c3\u00b6rer; Daniel Cremers", + "author_site": "Rui Wang; Martin Schwörer; Daniel Cremers", "author": "Rui Wang; Martin Schworer; Daniel Cremers", "abstract": "We propose Stereo Direct Sparse Odometry (Stereo DSO) as a novel method for highly accurate real-time visual odometry estimation of large-scale environments from stereo cameras. It jointly optimizes for all the model parameters within the active window, including the intrinsic/extrinsic camera parameters of all keyframes and the depth values of all selected pixels. In particular, we propose a novel approach to integrate constraints from static stereo into the bundle adjustment pipeline of temporal multi-view stereo. Real-time optimization is realized by sampling pixels uniformly from image regions with sufficient intensity gradient. Fixed-baseline stereo resolves scale drift. 
It also reduces the sensitivities to large optical flow and to rolling shutter effect which are known shortcomings of direct image alignment methods. Quantitative evaluation demonstrates that the proposed Stereo DSO outperforms existing state-of-the-art visual odometry methods both in terms of tracking accuracy and robustness. Moreover, our method delivers a more precise metric 3D reconstruction than previous dense/semi-dense direct approaches while providing a higher reconstruction density than feature-based methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Stereo_DSO_Large-Scale_ICCV_2017_paper.pdf", @@ -15538,7 +16528,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Rui and Schworer,\n Martin and Cremers,\n Daniel\n},\n title = {\n Stereo DSO: Large-Scale Direct Sparse Visual Odometry With Stereo Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Structure-Measure: A New Way to Evaluate Foreground Maps", @@ -15546,6 +16537,7 @@ "status": "Spotlight", "track": "main", "pid": "1164", + "author_site": "Deng-Ping Fan; Ming-Ming Cheng; Yun Liu; Tao Li; Ali Borji", "author": "Deng-Ping Fan; Ming-Ming Cheng; Yun Liu; Tao Li; Ali Borji", "abstract": "Foreground map evaluation is crucial for gauging the progress of object segmentation algorithms, in particular in the filed of salient object detection where the purpose is to accurately detect and segment the most salient object in a scene. Several widely-used measures such as Area Under the Curve (AUC), Average Precision (AP) and the recently proposed Fbw have been utilized to evaluate the similarity between a non-binary saliency map (SM) and a ground-truth (GT) map. 
These measures are based on pixel-wise errors and often ignore the structural similarities. Behavioral vision studies, however, have shown that the human visual system is highly sensitive to structures in scenes. Here, we propose a novel, efficient, and easy to calculate measure known an structural similarity measure (Structure-measure) to evaluate non-binary foreground maps. Our new measure simultaneously evaluates region-aware and object-aware structural similarity between a SM and a GT map. We demonstrate superiority of our measure over existing ones using 5 meta-measures on 5 benchmark datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Fan_Structure-Measure_A_New_ICCV_2017_paper.pdf", @@ -15570,7 +16562,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fan_2017_ICCV,\n \n author = {\n Fan,\n Deng-Ping and Cheng,\n Ming-Ming and Liu,\n Yun and Li,\n Tao and Borji,\n Ali\n},\n title = {\n Structure-Measure: A New Way to Evaluate Foreground Maps\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Structured Attentions for Visual Question Answering", @@ -15578,6 +16571,7 @@ "status": "Poster", "track": "main", "pid": "526", + "author_site": "Chen Zhu; Yanpeng Zhao; Shuaiyi Huang; Kewei Tu; Yi Ma", "author": "Chen Zhu; Yanpeng Zhao; Shuaiyi Huang; Kewei Tu; Yi Ma", "abstract": "Visual attention, which assigns weights to image regions according to their relevance to a question, is considered as an indispensable part by most Visual Question Answering models. Although the questions may involve complex relations among multiple regions, few attention models can effectively encode such cross-region relations. 
In this paper,we emonstrate the importance of encoding such relations by showing the limited effective receptive field of ResNet on two datasets, and propose to model the visual attention as a multivariate distribution over a grid-structured Conditional Random Field on image regions. We demonstrate how to convert the iterative inference algorithms, Mean Field and Loopy Belief Propagation, as recurrent layers of an end-to-end neural network. We empirically evaluated our model on 3 datasets, in which it surpasses the best baseline model of the newly released CLEVR dataset by 9.5%, and the best published model on the VQA dataset by 1.25%. Source code is available at https://github.com/zhuchen03/vqa-sva.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Structured_Attentions_for_ICCV_2017_paper.pdf", @@ -15602,7 +16596,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Chen and Zhao,\n Yanpeng and Huang,\n Shuaiyi and Tu,\n Kewei and Ma,\n Yi\n},\n title = {\n Structured Attentions for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SubUNets: End-To-End Hand Shape and Continuous Sign Language Recognition", @@ -15610,6 +16605,7 @@ "status": "Spotlight", "track": "main", "pid": "2402", + "author_site": "Necati Cihan Camgoz; Simon Hadfield; Oscar Koller; Richard Bowden", "author": "Necati Cihan Camgoz; Simon Hadfield; Oscar Koller; Richard Bowden", "abstract": "We propose a novel deep learning approach to solve simultaneous alignment and recognition problems (referred to as \"Sequence-to-sequence\" learning). We decompose the problem into a series of specialised expert systems referred to as SubUNets. 
The spatio-temporal relationships between these SubUNets are then modelled to solve the task, while remaining trainable end-to-end. The approach mimics human learning and educational techniques, and has a number of significant advantages. SubUNets allow us to inject domain-specific expert knowledge into the system regarding suitable intermediate representations. They also allow us to implicitly perform transfer learning between different interrelated tasks, which also allows us to exploit a wider range of more varied data sources. In our experiments we demonstrate that each of these properties serves to significantly improve the performance of the overarching recognition system, by better constraining the learning problem. The proposed techniques are demonstrated in the challenging domain of sign language recognition. We demonstrate state-of-the-art performance on hand-shape recognition outperforming previous techniques by more than 30%). Furthermore, we are able to obtain comparable sign recognition rates to previous research, without the need for an alignment step to segment out the signs for recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Camgoz_SubUNets_End-To-End_Hand_ICCV_2017_paper.pdf", @@ -15634,7 +16630,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Aachen", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United Kingdom;Germany" + "aff_country_unique": "United Kingdom;Germany", + "bibtex": "@InProceedings{Camgoz_2017_ICCV,\n \n author = {\n Cihan Camgoz,\n Necati and Hadfield,\n Simon and Koller,\n Oscar and Bowden,\n Richard\n},\n title = {\n SubUNets: End-To-End Hand Shape and Continuous Sign Language Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Sublabel-Accurate Discretization of Nonconvex Free-Discontinuity Problems", @@ -15642,7 +16639,7 @@ "status": "Poster", 
"track": "main", "pid": "502", - "author_site": "Thomas M\u00c3\u00b6llenhoff; Daniel Cremers", + "author_site": "Thomas Möllenhoff; Daniel Cremers", "author": "Thomas Mollenhoff; Daniel Cremers", "abstract": "In this work we show how sublabel-accurate multilabeling approaches can be derived by approximating a classical label-continuous convex relaxation of nonconvex free-discontinuity problems. This insight allows to extend these sublabel-accurate approaches from total variation to general convex and nonconvex regularizations. Furthermore, it leads to a systematic approach to the discretization of continuous convex relaxations. We study the relationship to existing discretizations and to discrete-continuous MRFs. Finally, we apply the proposed approach to obtain a sublabel-accurate and convex solution to the vectorial Mumford-Shah functional and show in several experiments that it leads to more precise solutions using fewer labels.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mollenhoff_Sublabel-Accurate_Discretization_of_ICCV_2017_paper.pdf", @@ -15667,7 +16664,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Mollenhoff_2017_ICCV,\n \n author = {\n Mollenhoff,\n Thomas and Cremers,\n Daniel\n},\n title = {\n Sublabel-Accurate Discretization of Nonconvex Free-Discontinuity Problems\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Submodular Trajectory Optimization for Aerial 3D Scanning", @@ -15675,6 +16673,7 @@ "status": "Poster", "track": "main", "pid": "2616", + "author_site": "Mike Roberts; Debadeepta Dey; Anh Truong; Sudipta Sinha; Shital Shah; Ashish Kapoor; Pat Hanrahan; Neel Joshi", "author": "Mike Roberts; Debadeepta Dey; Anh Truong; Sudipta Sinha; Shital Shah; Ashish Kapoor; Pat 
Hanrahan; Neel Joshi", "abstract": "Drones equipped with cameras are emerging as a powerful tool for large-scale aerial 3D scanning, but existing automatic flight planners do not exploit all available information about the scene, and can therefore produce inaccurate and incomplete 3D models. We present an automatic method to generate drone trajectories, such that the imagery acquired during the flight will later produce a high-fidelity 3D model. Our method uses a coarse estimate of the scene geometry to plan camera trajectories that: (1) cover the scene as thoroughly as possible; (2) encourage observations of scene geometry from a diverse set of viewing angles; (3) avoid obstacles; and (4) respect a user-specified flight time budget. Our method relies on a mathematical model of scene coverage that exhibits an intuitive diminishing returns property known as submodularity. We leverage this property extensively to design a trajectory planning algorithm that reasons globally about the non-additive coverage reward obtained across a trajectory, jointly with the cost of traveling between views. 
We evaluate our method by using it to scan three large outdoor scenes, and we perform a quantitative evaluation using a photorealistic video game simulator.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Roberts_Submodular_Trajectory_Optimization_ICCV_2017_paper.pdf", @@ -15690,7 +16689,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Roberts_Submodular_Trajectory_Optimization_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Roberts_Submodular_Trajectory_Optimization_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Roberts_2017_ICCV,\n \n author = {\n Roberts,\n Mike and Dey,\n Debadeepta and Truong,\n Anh and Sinha,\n Sudipta and Shah,\n Shital and Kapoor,\n Ashish and Hanrahan,\n Pat and Joshi,\n Neel\n},\n title = {\n Submodular Trajectory Optimization for Aerial 3D Scanning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Summarization and Classification of Wearable Camera Streams by Learning the Distributions Over Deep Features of Out-Of-Sample Image Sequences", @@ -15698,6 +16698,7 @@ "status": "Poster", "track": "main", "pid": "1890", + "author_site": "Alessandro Perina; Sadegh Mohammadi; Nebojsa Jojic; Vittorio Murino", "author": "Alessandro Perina; Sadegh Mohammadi; Nebojsa Jojic; Vittorio Murino", "abstract": "A popular approach to training classifiers of new image classes is to use lower levels of a pre-trained feed-forward neural network and retrain only the top. Thus, most layers simply serve as highly nonlinear feature extractors. While these features were found useful for classifying a variety of scenes and objects, previous work also demonstrated unusual levels of sensitivity to the input especially for images which are veering too far away from the training distribution. 
This can lead to surprising results as an imperceptible change in an image can be enough to completely change the predicted class. This occurs in particular in applications involving personaldata, typically acquired with wearable cameras (e.g., visual lifelogs), where the problem is also made more complex by the dearth of new labeled training data that make supervised learning with deep models difficult. To alleviate these problems, in this paper we propose a new generative model that captures the feature distribution in new data. Its latent space then becomes more representative of the new data, while still retaining the generalization properties. In particular, we use constrained Markov walks over a counting grid for modeling image sequences, which not only yield good latent representations, but allow for excellent classification with only a handful of labeled training examples of the new scenes or objects, a scenario typical in lifelogging applications.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Perina_Summarization_and_Classification_ICCV_2017_paper.pdf", @@ -15715,14 +16716,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Perina_Summarization_and_Classification_ICCV_2017_paper.html", "aff_unique_index": "0;1;0;1+2", - "aff_unique_norm": "Microsoft;Italian Institute of Technology;University of Verona", + "aff_unique_norm": "Microsoft Corporation;Italian Institute of Technology;University of Verona", "aff_unique_dep": "WDG Core Data;PA VIS Dept.;", "aff_unique_url": "https://www.microsoft.com;https://www.iit.it;https://www.univr.it", "aff_unique_abbr": "Microsoft;IIT;UniVR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1+1", - "aff_country_unique": "United States;Italy" + "aff_country_unique": "United States;Italy", + "bibtex": "@InProceedings{Perina_2017_ICCV,\n \n author = {\n Perina,\n Alessandro and Mohammadi,\n Sadegh and Jojic,\n Nebojsa and Murino,\n 
Vittorio\n},\n title = {\n Summarization and Classification of Wearable Camera Streams by Learning the Distributions Over Deep Features of Out-Of-Sample Image Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Super-Trajectory for Video Segmentation", @@ -15730,6 +16732,7 @@ "status": "Poster", "track": "main", "pid": "711", + "author_site": "Wenguan Wang; Jianbing Shen; Jianwen Xie; Fatih Porikli", "author": "Wenguan Wang; Jianbing Shen; Jianwen Xie; Fatih Porikli", "abstract": "We introduce a novel semi-supervised video segmentation approach based on an efficient video representation, called as \"super-trajectory\". Each super-trajectory corresponds to a group of compact trajectories that exhibit consistent motion patterns, similar appearance and close spatiotemporal relationships. We generate trajectories using a probabilistic model, which handles occlusions and drifts in a robust and natural way. To reliably group trajectories, we adopt a modified version of the density peaks based clustering algorithm that allows capturing rich spatiotemporal relations among trajectories in the clustering process. The presented video representation is discriminative enough to accurately propagate the initial annotations in the first frame onto the remaining video frames. 
Extensive experimental analysis on challenging benchmarks demonstrate our method is capable of distinguishing the target objects from complex backgrounds and even reidentifying them after occlusions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Super-Trajectory_for_Video_ICCV_2017_paper.pdf", @@ -15754,7 +16757,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Beijing;Los Angeles;", "aff_country_unique_index": "0;0;1;2", - "aff_country_unique": "China;United States;Australia" + "aff_country_unique": "China;United States;Australia", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Wenguan and Shen,\n Jianbing and Xie,\n Jianwen and Porikli,\n Fatih\n},\n title = {\n Super-Trajectory for Video Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Supervision by Fusion: Towards Unsupervised Learning of Deep Salient Object Detector", @@ -15762,6 +16766,7 @@ "status": "Poster", "track": "main", "pid": "1787", + "author_site": "Dingwen Zhang; Junwei Han; Yu Zhang", "author": "Dingwen Zhang; Junwei Han; Yu Zhang", "abstract": "In light of the powerful learning capability of deep neural networks (DNNs), deep (convolutional) models have been built in recent years to address the task of salient object detection. Although training such deep saliency models can significantly improve the detection performance, it requires large-scale manual supervision in the form of pixel-level human annotation, which is highly labor-intensive and time-consuming. To address this problem, this paper makes the earliest effort to train a deep salient object detector without using any human annotation. The key insight is \"supervision by fusion\", i.e., generating useful supervisory signals from the fusion process of weak but fast unsupervised saliency models. 
Based on this insight, we combine an intra-image fusion stream and a inter-image fusion stream in the proposed framework to generate the learning curriculum and pseudo ground-truth for supervising the training of the deep salient object detector. Comprehensive experiments on four benchmark datasets demonstrate that our method can approach the same network trained with full supervision (within 2-5% performance gap) and, more encouragingly, even outperform a number of fully supervised state-of-the-art approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Supervision_by_Fusion_ICCV_2017_paper.pdf", @@ -15786,7 +16791,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Dingwen and Han,\n Junwei and Zhang,\n Yu\n},\n title = {\n Supervision by Fusion: Towards Unsupervised Learning of Deep Salient Object Detector\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Supplementary Meta-Learning: Towards a Dynamic Model for Deep Neural Networks", @@ -15794,6 +16800,7 @@ "status": "Poster", "track": "main", "pid": "1927", + "author_site": "Feihu Zhang; Benjamin W. Wah", "author": "Feihu Zhang; Benjamin W. Wah", "abstract": "Data diversity in terms of types, styles, as well as radiometric, exposure and texture conditions widely exists in training and test data of vision applications. However, learning in traditional neural networks (NNs) only tries to find a model with fixed parameters that optimize the average behavior over all inputs, without using data-specific properties. 
In this paper, we develop a meta-level NN (MLNN) model that learns meta-knowledge on data-specific properties of images during learning and that dynamically adapts its weights during application according to the properties of the images input. MLNN consists of two parts: the dynamic supplementary NN (SNN) that learns meta-information on each type of inputs, and the fixed base-level NN (BLNN) that incorporates the meta-information from SNN into its weights at run time to realize the generalization for each type of inputs. We verify our approach using over ten network architectures under various application scenarios and loss functions. In low-level vision applications on image super-resolution and denoising, MLNN has 0.1 0.3 dB improvements on PSNR, whereas for high-level image classification, MLNN has accuracy improvement of 0.4 0.6% for Cifar10 and 1.2 2.1% for ImageNet when compared to convolutional NNs (CNNs). Improvements are more pronounced as the scale or diversity of data is increased.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Supplementary_Meta-Learning_Towards_ICCV_2017_paper.pdf", @@ -15811,14 +16818,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Supplementary_Meta-Learning_Towards_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Feihu and Wah,\n Benjamin W.\n},\n title = {\n Supplementary Meta-Learning: Towards a Dynamic Model for Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision 
(ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Surface Normals in the Wild", @@ -15826,6 +16834,7 @@ "status": "Poster", "track": "main", "pid": "788", + "author_site": "Weifeng Chen; Donglai Xiang; Jia Deng", "author": "Weifeng Chen; Donglai Xiang; Jia Deng", "abstract": "We study the problem of single-image depth estimation for images in the wild. We collect human annotated surface normals and use them to help train a neural network that directly predicts pixel-wise depth. We propose two novel loss functions for training with surface normal annotations. Experiments on NYU Depth, KITTI, and our own dataset demonstrate that our approach can significantly improve the quality of depth estimation in the wild.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Surface_Normals_in_ICCV_2017_paper.pdf", @@ -15850,7 +16859,8 @@ "aff_campus_unique_index": "0+1;1;0", "aff_campus_unique": "Ann Arbor;Beijing", "aff_country_unique_index": "0+1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chen_2017_ICCV,\n \n author = {\n Chen,\n Weifeng and Xiang,\n Donglai and Deng,\n Jia\n},\n title = {\n Surface Normals in the Wild\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Surface Registration via Foliation", @@ -15858,6 +16868,7 @@ "status": "Poster", "track": "main", "pid": "534", + "author_site": "Xiaopeng Zheng; Chengfeng Wen; Na Lei; Ming Ma; Xianfeng Gu", "author": "Xiaopeng Zheng; Chengfeng Wen; Na Lei; Ming Ma; Xianfeng Gu", "abstract": "This work introduces a novel surface registration method based on foliation. A foliation decomposes the surface into a family of closed loops, such that the decomposition has local tensor product structure. By projecting each loop to a point, the surface is collapsed into a graph. 
Two homeomorphic surfaces with consistent foliations can be registered by first matching their foliation graphs, then matching the corresponding leaves. This foliation based method is capable of handling surfaces with complicated topologies and large non-isometric deformations, rigorous with solid theoretic foundation, easy to implement, robust to compute. The result mapping is diffeomorphic. Our experimental results show the efficiency and efficacy of the proposed method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zheng_Surface_Registration_via_ICCV_2017_paper.pdf", @@ -15882,7 +16893,8 @@ "aff_campus_unique_index": "0+0+1;2;0+0+1;2;2+1", "aff_campus_unique": "Dalian;Beijing;Stony Brook", "aff_country_unique_index": "0+0+0;1;0+0+0;1;1+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zheng_2017_ICCV,\n \n author = {\n Zheng,\n Xiaopeng and Wen,\n Chengfeng and Lei,\n Na and Ma,\n Ming and Gu,\n Xianfeng\n},\n title = {\n Surface Registration via Foliation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "SurfaceNet: An End-To-End 3D Neural Network for Multiview Stereopsis", @@ -15890,6 +16902,7 @@ "status": "Poster", "track": "main", "pid": "948", + "author_site": "Mengqi Ji; Juergen Gall; Haitian Zheng; Yebin Liu; Lu Fang", "author": "Mengqi Ji; Juergen Gall; Haitian Zheng; Yebin Liu; Lu Fang", "abstract": "This paper proposes an end-to-end learning framework for multiview stereopsis. We term the network SurfaceNet. It takes a set of images and their corresponding camera parameters as input and directly infers the 3D model. The key advantage of the framework is that both photo-consistency as well geometric relations of the surface structure can be directly learned for the purpose of multiview stereopsis in an end-to-end fashion. 
SurfaceNet is a fully 3D convolutional network which is achieved by encoding the camera parameters together with the images in a 3D voxel representation. We evaluate SurfaceNet on the large-scale DTU benchmark.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ji_SurfaceNet_An_End-To-End_ICCV_2017_paper.pdf", @@ -15914,7 +16927,8 @@ "aff_campus_unique_index": "0;", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0+0;0;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Ji_2017_ICCV,\n \n author = {\n Ji,\n Mengqi and Gall,\n Juergen and Zheng,\n Haitian and Liu,\n Yebin and Fang,\n Lu\n},\n title = {\n SurfaceNet: An End-To-End 3D Neural Network for Multiview Stereopsis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Synergy Between Face Alignment and Tracking via Discriminative Global Consensus Optimization", @@ -15922,6 +16936,7 @@ "status": "Spotlight", "track": "main", "pid": "1859", + "author_site": "Muhammad Haris Khan; John McDonagh; Georgios Tzimiropoulos", "author": "Muhammad Haris Khan; John McDonagh; Georgios Tzimiropoulos", "abstract": "An open question in facial landmark localization in video is whether one should perform tracking or tracking-by-detection (i.e. face alignment). Tracking produces fittings of high accuracy but is prone to drifting. Tracking-by-detection is drift-free but results in low accuracy fittings. To provide a solution to this problem, we describe the very first, to the best of our knowledge, synergistic approach between detection (face alignment) and tracking which completely eliminates drifting from face tracking, and does not merely perform tracking-by-detection. 
Our first main contribution is to show that one can achieve this synergy between detection and tracking using a principled optimization framework based on the theory of Global Variable Consensus Optimization using ADMM; Our second contribution is to show how the proposed analytic framework can be integrated within state-of-the-art discriminative methods for face alignment and tracking based on cascaded regression and deeply learned features. Overall, we call our method Discriminative Global Consensus Model (DGCM). Our third contribution is to show that DGCM achieves large performance improvement over the currently best performing face tracking methods on the most challenging category of the 300-VW dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Khan_Synergy_Between_Face_ICCV_2017_paper.pdf", @@ -15946,7 +16961,8 @@ "aff_campus_unique_index": "0+1;0;0", "aff_campus_unique": "Nottingham;Lahore", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "United Kingdom;Pakistan" + "aff_country_unique": "United Kingdom;Pakistan", + "bibtex": "@InProceedings{Khan_2017_ICCV,\n \n author = {\n Haris Khan,\n Muhammad and McDonagh,\n John and Tzimiropoulos,\n Georgios\n},\n title = {\n Synergy Between Face Alignment and Tracking via Discriminative Global Consensus Optimization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "TALL: Temporal Activity Localization via Language Query", @@ -15954,6 +16970,7 @@ "status": "Spotlight", "track": "main", "pid": "1406", + "author_site": "Jiyang Gao; Chen Sun; Zhenheng Yang; Ram Nevatia", "author": "Jiyang Gao; Chen Sun; Zhenheng Yang; Ram Nevatia", "abstract": "This paper focuses on temporal localization of actions from untrimmed videos. Existing methods typically involve training classifiers for a pre-defined list of actions and applying the classifiers in a sliding window fashion. 
However, activities in the wild consist of a wide combination of actors, actions and objects; it is difficult to design a proper activity list that meets users' needs. We propose to localize activities by natural language queries. Temporal Activity Localization via Language (TALL) is challenging as it requires: (1) suitable design of text and video representations to allow cross-modal matching of actions and language queries; (2) ability to locate actions accurately given features from sliding windows of limited granularity. We propose a novel Cross-modal Temporal Regression Localizer (CTRL) to jointly model text query and video clips, output alignment scores and location regression results for candidate clips. For evaluation, we adopt TaCoS dataset, and build a new dataset for this task on top of Charades by adding sentence temporal annotations, called Charades-STA. Experimental results show that CTRL outperforms previous methods significantly on both datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gao_TALL_Temporal_Activity_ICCV_2017_paper.pdf", @@ -15968,7 +16985,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gao_TALL_Temporal_Activity_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gao_TALL_Temporal_Activity_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Gao_2017_ICCV,\n \n author = {\n Gao,\n Jiyang and Sun,\n Chen and Yang,\n Zhenheng and Nevatia,\n Ram\n},\n title = {\n TALL: Temporal Activity Localization via Language Query\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal", @@ -15976,10 +16994,11 @@ "status": "Poster", "track": "main", "pid": "2892", + "author_site": "Hongyuan Zhu; Romain Vial; Shijian Lu", "author": 
"Hongyuan Zhu; Romain Vial; Shijian Lu", "abstract": "Given a video clip, action proposal aims to quickly generate a number of spatio-temporal tubes that enclose candidate human activities. Recently, the regression-based object detectors and long-term recurrent convolutional network (LRCN) have demonstrated superior performance in human action detection and recognition. However, the regression-based detectors performs inference without considering the temporal context among neighboring frames, and the LRCN using global visual percepts lacks the capability to capture local temporal dynamics. In this paper, we present a novel framework called TORNADO for human action proposal detection in un-trimmed video clips. Specifically, we propose a spatial-temporal convolutional network that combines the advantages of regression-based detector and LRCN by empowering Convolutional LSTM with regression capability. Our approach consists of a temporal convolutional regression network (T-CRN) and a spatial regression network (S-CRN) which are trained end-to-end on both RGB and OpticalFlow streams. They fuse appearance, motion and temporal contexts to regress the bounding boxes of candidate human actions simultaneously in 28 FPS. The action proposals are constructed by solving dynamic programming with peak trimming of the generated action boxes. 
Extensive experiments on the challenging UCF-101 and UCF-Sports datasets show that our method achieves superior performance as compared with the state-of-the-arts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_TORNADO_A_Spatio-Temporal_ICCV_2017_paper.pdf", - "aff": "I2R, A\u2217Star, Singapore; MINES ParisTech, France; NTU, Singapore", + "aff": "I2R, A∗Star, Singapore; MINES ParisTech, France; NTU, Singapore", "project": "", "github": "", "supp": "", @@ -16000,7 +17019,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Singapore;France" + "aff_country_unique": "Singapore;France", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Hongyuan and Vial,\n Romain and Lu,\n Shijian\n},\n title = {\n TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "TURN TAP: Temporal Unit Regression Network for Temporal Action Proposals", @@ -16008,6 +17028,7 @@ "status": "Poster", "track": "main", "pid": "1407", + "author_site": "Jiyang Gao; Zhenheng Yang; Kan Chen; Chen Sun; Ram Nevatia", "author": "Jiyang Gao; Zhenheng Yang; Kan Chen; Chen Sun; Ram Nevatia", "abstract": "We address the problem of Temporal Action Proposal (TAP) generation. This is an important problem, as fast extraction of semantically important (e.g. human actions) segments from untrimmed videos is an important step for large-scale video analysis. To tackle this problem, we propose a novel Temporal Unit Regression Network (TURN) model. 
There are two salient aspects of TURN: (1) TURN jointly predicts action proposals and refines the temporal boundaries by temporal coordinate regression with contextual information; (2) Fast computation is enabled by unit feature reuse: a long untrimmed video is decomposed into video units, which are reused as basic building blocks of temporal proposals. TURN outperforms the state-of-the-art methods under average recall (AR) by a large margin on THUMOS-14 and ActivityNet datasets, and runs over 900 frames per second (FPS) on a TITAN X GPU. We further apply TURN as a proposal generation stage for existing temporal action localization pipelines, and outperforms state-of-the-art performance on THUMOS-14 and ActivityNet.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gao_TURN_TAP_Temporal_ICCV_2017_paper.pdf", @@ -16032,7 +17053,8 @@ "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Los Angeles;Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2017_ICCV,\n \n author = {\n Gao,\n Jiyang and Yang,\n Zhenheng and Chen,\n Kan and Sun,\n Chen and Nevatia,\n Ram\n},\n title = {\n TURN TAP: Temporal Unit Regression Network for Temporal Action Proposals\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Taking the Scenic Route to 3D: Optimising Reconstruction From Moving Cameras", @@ -16040,6 +17062,7 @@ "status": "Poster", "track": "main", "pid": "2453", + "author_site": "Oscar Mendez; Simon Hadfield; Nicolas Pugeault; Richard Bowden", "author": "Oscar Mendez; Simon Hadfield; Nicolas Pugeault; Richard Bowden", "abstract": "Reconstruction of 3D environments is a problem that has been widely addressed in the literature. 
While many approaches exist to perform reconstruction, few of them take an active role in deciding where the next observations should come from. Furthermore, the problem of travelling from the camera's current position to the next, known as pathplanning, usually focuses on minimising path length. This approach is ill-suited for reconstruction applications, where learning about the environment is more valuable than speed of traversal. We present a novel Scenic Route Planner that selects paths which maximise information gain, both in terms of total map coverage and reconstruction accuracy. We also introduce a new type of collaborative behaviour into the planning stage called opportunistic collaboration, which allows sensors to switch between acting as independent Structure from Motion (SfM) agents or as a variable baseline stereo pair. We show that Scenic Planning enables similar performance to state-of-the-art batch approaches using less than 0.00027% of the possible stereo pairs (3% of the views). Comparison against length-based pathplanning approaches show that our approach produces more complete and more accurate maps with fewer frames. 
Finally, we demonstrate the Scenic Pathplanner's ability to generalise to live scenarios by mounting cameras on autonomous ground-based sensor platforms and exploring an environment.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mendez_Taking_the_Scenic_ICCV_2017_paper.pdf", @@ -16064,7 +17087,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Mendez_2017_ICCV,\n \n author = {\n Mendez,\n Oscar and Hadfield,\n Simon and Pugeault,\n Nicolas and Bowden,\n Richard\n},\n title = {\n Taking the Scenic Route to 3D: Optimising Reconstruction From Moving Cameras\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Action Detection With Structured Segment Networks", @@ -16072,6 +17096,7 @@ "status": "Poster", "track": "main", "pid": "1218", + "author_site": "Yue Zhao; Yuanjun Xiong; Limin Wang; Zhirong Wu; Xiaoou Tang; Dahua Lin", "author": "Yue Zhao; Yuanjun Xiong; Limin Wang; Zhirong Wu; Xiaoou Tang; Dahua Lin", "abstract": "Detecting actions in untrimmed videos is an important yet challenging task. In this paper, we present the structured segment network (SSN), a novel framework which models the temporal structure of each action instance via a structured temporal pyramid. On top of the pyramid, we further introduce a decomposed discriminative model comprising two classifiers, respectively for classifying actions and determining completeness. This allows the framework to effectively distinguish positive proposals from background or incomplete ones, thus leading to both accurate recognition and localization. These components are integrated into a unified network that can be efficiently trained in an end-to-end fashion. 
Additionally, a simple yet effective temporal action proposal scheme, dubbed temporal actionness grouping (TAG) is devised to generate high quality action proposals. On two challenging benchmarks, THUMOS'14 and ActivityNet, our method remarkably outperforms previous state-of-the-art methods, demonstrating superior accuracy and strong adaptivity in handling actions with various temporal structures.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhao_Temporal_Action_Detection_ICCV_2017_paper.pdf", @@ -16087,7 +17112,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhao_Temporal_Action_Detection_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhao_Temporal_Action_Detection_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhao_2017_ICCV,\n \n author = {\n Zhao,\n Yue and Xiong,\n Yuanjun and Wang,\n Limin and Wu,\n Zhirong and Tang,\n Xiaoou and Lin,\n Dahua\n},\n title = {\n Temporal Action Detection With Structured Segment Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Context Network for Activity Localization in Videos", @@ -16095,6 +17121,7 @@ "status": "Poster", "track": "main", "pid": "2601", + "author_site": "Xiyang Dai; Bharat Singh; Guyue Zhang; Larry S. Davis; Yan Qiu Chen", "author": "Xiyang Dai; Bharat Singh; Guyue Zhang; Larry S. Davis; Yan Qiu Chen", "abstract": "We present a Temporal Context Network (TCN) for precise temporal localization of human activities. Similar to the Faster-RCNN architecture, proposals are placed at equal intervals in a video which span multiple temporal scales. We propose a novel representation for ranking these proposals. 
Since pooling features only inside a segment is not sufficient to predict activity boundaries, we construct a representation which explicitly captures context around a proposal for ranking it. For each temporal segment inside a proposal, features are uniformly sampled at a pair of scales and are input to a temporal convolutional neural network for classification. After ranking proposals, non-maximum suppression is applied and classification is performed to obtain final detections. TCN outperforms state-of-the-art methods on the ActivityNet dataset and the THUMOS14 dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dai_Temporal_Context_Network_ICCV_2017_paper.pdf", @@ -16119,7 +17146,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Dai_2017_ICCV,\n \n author = {\n Dai,\n Xiyang and Singh,\n Bharat and Zhang,\n Guyue and Davis,\n Larry S. and Qiu Chen,\n Yan\n},\n title = {\n Temporal Context Network for Activity Localization in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Dynamic Graph LSTM for Action-Driven Video Object Detection", @@ -16127,6 +17155,7 @@ "status": "Poster", "track": "main", "pid": "664", + "author_site": "Yuan Yuan; Xiaodan Liang; Xiaolong Wang; Dit-Yan Yeung; Abhinav Gupta", "author": "Yuan Yuan; Xiaodan Liang; Xiaolong Wang; Dit-Yan Yeung; Abhinav Gupta", "abstract": "In this paper, we investigate a weakly-supervised object detection framework. Most existing frameworks focus on using static images to learn object detectors. However, these detectors often fail to generalize to videos because of the existing domain shift. Therefore, we investigate learning these detectors directly from boring videos of daily activities. 
Instead of using bounding boxes, we explore the use of action descriptions as supervision since they are relatively easy to gather. A common issue, however, is that objects of interest that are not involved in human actions are often absent in global action descriptions known as \"missing label\". To tackle this problem, we propose a novel temporal dynamic graph Long Short-Term Memory network (TD- Graph LSTM). TD-Graph LSTM enables global temporal reasoning by constructing a dynamic graph that is based on temporal correlations of object proposals and spans the entire video. The missing label issue for each individual frame can thus be significantly alleviated by transferring knowledge across correlated objects proposals in the whole video. Extensive evaluations on a large-scale daily-life action dataset (i.e., Charades) demonstrates the superiority of our proposed method. We also release object bounding-box annotations for more than 5,000 frames in Charades. We believe this annotated data can also benefit other research on video-based object recognition in the future.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yuan_Temporal_Dynamic_Graph_ICCV_2017_paper.pdf", @@ -16141,7 +17170,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yuan_Temporal_Dynamic_Graph_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Yuan_Temporal_Dynamic_Graph_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Yuan_2017_ICCV,\n \n author = {\n Yuan,\n Yuan and Liang,\n Xiaodan and Wang,\n Xiaolong and Yeung,\n Dit-Yan and Gupta,\n Abhinav\n},\n title = {\n Temporal Dynamic Graph LSTM for Action-Driven Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Generative Adversarial Nets With Singular Value Clipping", @@ -16149,6 
+17179,7 @@ "status": "Poster", "track": "main", "pid": "1230", + "author_site": "Masaki Saito; Eiichi Matsumoto; Shunta Saito", "author": "Masaki Saito; Eiichi Matsumoto; Shunta Saito", "abstract": "In this paper, we propose a generative model, Temporal Generative Adversarial Nets (TGAN), which can learn a semantic representation of unlabeled videos, and is capable of generating videos. Unlike existing Generative Adversarial Nets (GAN)-based methods that generate videos with a single generator consisting of 3D deconvolutional layers, our model exploits two different types of generators: a temporal generator and an image generator. The temporal generator takes a single latent variable as input and outputs a set of latent variables, each of which corresponds to an image frame in a video. The image generator transforms a set of such latent variables into a video. To deal with instability in training of GAN with such advanced networks, we adopt a recently proposed model, Wasserstein GAN, and propose a novel method to train it stably in an end-to-end manner. 
The experimental results demonstrate the effectiveness of our methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Saito_Temporal_Generative_Adversarial_ICCV_2017_paper.pdf", @@ -16166,14 +17197,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Saito_Temporal_Generative_Adversarial_ICCV_2017_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Preferred Networks Inc.", + "aff_unique_norm": "Preferred Networks inc.", "aff_unique_dep": "", "aff_unique_url": "https://www.preferred-networks.com", "aff_unique_abbr": "PFN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Saito_2017_ICCV,\n \n author = {\n Saito,\n Masaki and Matsumoto,\n Eiichi and Saito,\n Shunta\n},\n title = {\n Temporal Generative Adversarial Nets With Singular Value Clipping\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Non-Volume Preserving Approach to Facial Age-Progression and Age-Invariant Face Recognition", @@ -16181,10 +17213,11 @@ "status": "Oral", "track": "main", "pid": "1939", + "author_site": "Chi Nhan Duong; Kha Gia Quach; Khoa Luu; Ngan Le; Marios Savvides", "author": "Chi Nhan Duong; Kha Gia Quach; Khoa Luu; Ngan Le; Marios Savvides", "abstract": "Modeling the long-term facial aging process is extremely challenging due to the presence of large and non-linear variations during the face development stages. In order to efficiently address the problem, this work first decomposes the aging process into multiple short-term stages. Then, a novel generative probabilistic model, named Temporal Non-Volume Preserving (TNVP) transformation, is presented to model the facial aging process at each stage. 
Unlike Generative Adversarial Networks (GANs), which requires an empirical balance threshold, and Restricted Boltzmann Machines (RBM), an intractable model, our proposed TNVP approach guarantees a tractable density function, exact inference and evaluation for embedding the feature transformations between faces in consecutive stages. Our model shows its advantages not only in capturing the non-linear age related variance in each stage but also producing a smooth synthesis in age progression across faces. Our approach can model any face in the wild provided with only four basic landmark points. Moreover, the structure can be transformed into a deep convolutional network while keeping the advantages of probabilistic models with tractable log-likelihood density estimation. Our method is evaluated in both terms of synthesizing age-progressed faces and cross-age face verification and consistently shows the state-of-the-art results in various face aging databases, i.e. FG-NET, MORPH, AginG Faces in the Wild (AGFW), and Cross-Age Celebrity Dataset (CACD). 
A large-scale face verification on Megaface challenge 1 is also performed to further show the advantages of our proposed approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Duong_Temporal_Non-Volume_Preserving_ICCV_2017_paper.pdf", - "aff": "Computer Science and Software Engineering, Concordia University, Montr \u00b4eal, Qu \u00b4ebec, Canada + CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; Computer Science and Software Engineering, Concordia University, Montr \u00b4eal, Qu \u00b4ebec, Canada + CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA", + "aff": "Computer Science and Software Engineering, Concordia University, Montr ´eal, Qu ´ebec, Canada + CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; Computer Science and Software Engineering, Concordia University, Montr ´eal, Qu ´ebec, Canada + CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; CyLab Biometrics Center and the Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA", 
"project": "", "github": "", "supp": "", @@ -16203,9 +17236,10 @@ "aff_unique_url": "https://www.concordia.ca;https://www.cmu.edu", "aff_unique_abbr": "Concordia;CMU", "aff_campus_unique_index": "0+1;0+1;1;1;1", - "aff_campus_unique": "Montr\u00e9al;Pittsburgh", + "aff_campus_unique": "Montréal;Pittsburgh", "aff_country_unique_index": "0+1;0+1;1;1;1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Duong_2017_ICCV,\n \n author = {\n Nhan Duong,\n Chi and Gia Quach,\n Kha and Luu,\n Khoa and Le,\n Ngan and Savvides,\n Marios\n},\n title = {\n Temporal Non-Volume Preserving Approach to Facial Age-Progression and Age-Invariant Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Shape Super-Resolution by Intra-Frame Motion Encoding Using High-Fps Structured Light", @@ -16213,6 +17247,7 @@ "status": "Spotlight", "track": "main", "pid": "2216", + "author_site": "Yuki Shiba; Satoshi Ono; Ryo Furukawa; Shinsaku Hiura; Hiroshi Kawasaki", "author": "Yuki Shiba; Satoshi Ono; Ryo Furukawa; Shinsaku Hiura; Hiroshi Kawasaki", "abstract": "One of the solutions of depth imaging of moving scene is to project a static pattern on the object and use just a single image for reconstruction. However, if the motion of the object is too fast with respect to the exposure time of the image sensor, patterns on the captured image are blurred and reconstruction fails. In this paper, we impose multiple projection patterns into each single captured image to realize temporal super resolution of the depth image sequences. With our method, multiple patterns are projected onto the object with higher fps than possible with a camera. 
In this case, the observed pattern varies depending on the depth and motion of the object, so we can extract temporal information of the scene from each single image. The decoding process is realized using a learning-based approach where no geometric calibration is needed. Experiments confirm the effectiveness of our method where sequential shapes are reconstructed from a single image. Both quantitative evaluations and comparisons with recent techniques were also conducted.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shiba_Temporal_Shape_Super-Resolution_ICCV_2017_paper.pdf", @@ -16237,7 +17272,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Shiba_2017_ICCV,\n \n author = {\n Shiba,\n Yuki and Ono,\n Satoshi and Furukawa,\n Ryo and Hiura,\n Shinsaku and Kawasaki,\n Hiroshi\n},\n title = {\n Temporal Shape Super-Resolution by Intra-Frame Motion Encoding Using High-Fps Structured Light\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Superpixels Based on Proximity-Weighted Patch Matching", @@ -16245,6 +17281,7 @@ "status": "Poster", "track": "main", "pid": "1302", + "author_site": "Se-Ho Lee; Won-Dong Jang; Chang-Su Kim", "author": "Se-Ho Lee; Won-Dong Jang; Chang-Su Kim", "abstract": "A temporal superpixel algorithm based on proximity-weighted patch matching (TS-PPM) is proposed in this work. We develop the proximity-weighted patch matching (PPM), which estimates the motion vector of a superpixel robustly, by considering the patch matching distances of neighboring superpixels as well as the target superpixel. In each frame, we initialize superpixels by transferring the superpixel labels of the previous frame using PPM motion vectors. 
Then, we update the superpixel labels of boundary pixels, based on a cost function, composed of color, spatial, contour, and temporal consistency terms. Finally, we execute superpixel splitting, merging, and relabeling to regularize superpixel sizes and reduce incorrect labels. Experiments show that the proposed algorithm outperforms the state-of-the-art conventional algorithms significantly.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_Temporal_Superpixels_Based_ICCV_2017_paper.pdf", @@ -16259,7 +17296,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Temporal_Superpixels_Based_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Temporal_Superpixels_Based_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Se-Ho and Jang,\n Won-Dong and Kim,\n Chang-Su\n},\n title = {\n Temporal Superpixels Based on Proximity-Weighted Patch Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Temporal Tessellation: A Unified Approach for Video Analysis", @@ -16267,6 +17305,7 @@ "status": "Spotlight", "track": "main", "pid": "1712", + "author_site": "Dotan Kaufman; Gil Levi; Tal Hassner; Lior Wolf", "author": "Dotan Kaufman; Gil Levi; Tal Hassner; Lior Wolf", "abstract": "We present a general approach to video understanding, inspired by semantic transfer techniques that have been successfully used for 2D image analysis. Our method considers a video to be a 1D sequence of clips, each one associated with its own semantics. The nature of these semantics -- natural language captions or other labels -- depends on the task at hand. 
A test video is processed by forming correspondences between its clips and the clips of reference videos with known semantics, following which, reference semantics can be transferred to the test video. We describe two matching methods, both designed to ensure that (a) reference clips appear similar to test clips and (b), taken together, the semantics of the selected reference clips is consistent and maintains temporal coherence. We use our method for video captioning on the LSMDC'16 benchmark, video summarization on the SumMe and TVSum benchmarks, Temporal Action Detection on the Thumos2015 benchmark, and sound prediction on the Greatest Hits benchmark. Our method not only surpasses the state of the art, in four out of five benchmarks, but importantly, it is the only single method we know of that was successfully applied to such a diverse range of tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kaufman_Temporal_Tessellation_A_ICCV_2017_paper.pdf", @@ -16282,7 +17321,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kaufman_Temporal_Tessellation_A_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kaufman_Temporal_Tessellation_A_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kaufman_2017_ICCV,\n \n author = {\n Kaufman,\n Dotan and Levi,\n Gil and Hassner,\n Tal and Wolf,\n Lior\n},\n title = {\n Temporal Tessellation: A Unified Approach for Video Analysis\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Tensor RPCA by Bayesian CP Factorization With Complex Noise", @@ -16290,10 +17330,11 @@ "status": "Poster", "track": "main", "pid": "2055", + "author_site": "Qiong Luo; Zhi Han; Xi'ai Chen; Yao Wang; Deyu Meng; Dong Liang; Yandong Tang", "author": "Qiong Luo; Zhi Han; Xi'ai Chen; Yao Wang; Deyu Meng; Dong Liang; Yandong Tang", 
"abstract": "The RPCA model has achieved good performances in various applications. However, two defects limit its effectiveness. Firstly, it is designed for dealing with data in matrix form, which fails to exploit the structure information of higher order tensor data in some pratical situations. Secondly, it adopts L1-norm to tackle noise part which makes it only valid for sparse noise. In this paper, we propose a tensor RPCA model based on CP decomposition and model data noise by Mixture of Gaussians (MoG). The use of tensor structure to raw data allows us to make full use of the inherent structure priors, and MoG is a general approximator to any blends of consecutive distributions, which makes our approach capable of regaining the low dimensional linear subspace from a wide range of noises or their mixture. The model is solved by a new proposed algorithm inferred under a variational Bayesian framework. The superiority of our approach over the existing state-of-the-art approaches is demonstrated by extensive experiments on both of synthetic and real data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Luo_Tensor_RPCA_by_ICCV_2017_paper.pdf", - "aff": "State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences + University of Chinese Academy of Sciences; State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences; State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences + University of Chinese Academy of Sciences; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences", + "aff": "State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences + University of Chinese Academy of Sciences; State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of 
Sciences; State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences + University of Chinese Academy of Sciences; Xi’an Jiaotong University; Xi’an Jiaotong University; Xi’an Jiaotong University; State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Luo_Tensor_RPCA_by_ICCV_2017_supplemental.pdf", @@ -16307,14 +17348,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Luo_Tensor_RPCA_by_ICCV_2017_paper.html", "aff_unique_index": "0+1;0;0+1;2;2;2;0", - "aff_unique_norm": "Shenyang Institute of Automation;University of Chinese Academy of Sciences;Xi'an Jiao Tong University", + "aff_unique_norm": "Shenyang Institute of Automation;University of Chinese Academy of Sciences;Xi'an Jiaotong University", "aff_unique_dep": "State Key Laboratory of Robotics;;", "aff_unique_url": "http://www.sia.cas.cn;http://www.ucas.ac.cn;https://www.xjtu.edu.cn", "aff_unique_abbr": ";UCAS;XJTU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2017_ICCV,\n \n author = {\n Luo,\n Qiong and Han,\n Zhi and Chen,\n Xi'ai and Wang,\n Yao and Meng,\n Deyu and Liang,\n Dong and Tang,\n Yandong\n},\n title = {\n Tensor RPCA by Bayesian CP Factorization With Complex Noise\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "The \"Something Something\" Video Database for Learning and Evaluating Visual Common Sense", @@ -16322,7 +17364,7 @@ "status": "Poster", "track": "main", "pid": "3105", - "author_site": "Raghav Goyal; Samira Ebrahimi Kahou; Vincent Michalski; Joanna Materzy\u00c5\u0084ska; Susanne Westphal; Heuna Kim; Valentin 
Haenel; Ingo Fruend; Peter Yianilos; Moritz Mueller-Freitag; Florian Hoppe; Christian Thurau; Ingo Bax; Roland Memisevic", + "author_site": "Raghav Goyal; Samira Ebrahimi Kahou; Vincent Michalski; Joanna Materzyńska; Susanne Westphal; Heuna Kim; Valentin Haenel; Ingo Fruend; Peter Yianilos; Moritz Mueller-Freitag; Florian Hoppe; Christian Thurau; Ingo Bax; Roland Memisevic", "author": "Raghav Goyal; Samira Ebrahimi Kahou; Vincent Michalski; Joanna Materzynska; Susanne Westphal; Heuna Kim; Valentin Haenel; Ingo Fruend; Peter Yianilos; Moritz Mueller-Freitag; Florian Hoppe; Christian Thurau; Ingo Bax; Roland Memisevic", "abstract": "Neural networks trained on datasets such as ImageNet have led to major advances in visual object classification. One obstacle that prevents networks from reasoning more deeply about complex scenes and situations, and from integrating visual knowledge with natural language, like humans do, is their lack of common sense knowledge about the physical world. Videos, unlike still images, contain a wealth of detailed information about the physical world. However, most labelled video datasets represent high-level concepts rather than detailed physical aspects about actions and scenes. In this work, we describe our ongoing collection of the \"something-something\" database of video prediction tasks whose solutions require a common sense understanding of the depicted situation. The database currently contains more than 100,000 videos across 174 classes, which are defined as caption-templates. 
We also describe the challenges in crowd-sourcing this data at scale.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Goyal_The_Something_Something_ICCV_2017_paper.pdf", @@ -16347,7 +17389,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "Germany;" + "aff_country_unique": "Germany;", + "bibtex": "@InProceedings{Goyal_2017_ICCV,\n \n author = {\n Goyal,\n Raghav and Ebrahimi Kahou,\n Samira and Michalski,\n Vincent and Materzynska,\n Joanna and Westphal,\n Susanne and Kim,\n Heuna and Haenel,\n Valentin and Fruend,\n Ingo and Yianilos,\n Peter and Mueller-Freitag,\n Moritz and Hoppe,\n Florian and Thurau,\n Christian and Bax,\n Ingo and Memisevic,\n Roland\n},\n title = {\n The \"Something Something\" Video Database for Learning and Evaluating Visual Common Sense\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "The Mapillary Vistas Dataset for Semantic Understanding of Street Scenes", @@ -16355,7 +17398,7 @@ "status": "Poster", "track": "main", "pid": "2264", - "author_site": "Gerhard Neuhold; Tobias Ollmann; Samuel Rota Bul\u00c3\u00b2; Peter Kontschieder", + "author_site": "Gerhard Neuhold; Tobias Ollmann; Samuel Rota Bulò; Peter Kontschieder", "author": "Gerhard Neuhold; Tobias Ollmann; Samuel Rota Bulo; Peter Kontschieder", "abstract": "The Mapillary Vistas Dataset is a novel, large-scale street-level image dataset containing 25,000 high-resolution images annotated into 66 object categories with additional, instance-specific labels for 37 classes. Annotation is performed in a dense and fine-grained style by using polygons for delineating individual objects. 
Our dataset is 5x larger than the total amount of fine annotations for Cityscapes and contains images from all around the world, captured at various conditions regarding weather, season and daytime. Images come from different imaging devices (mobile phones, tablets, action cameras, professional capturing rigs) and differently experienced photographers. In such a way, our dataset has been designed and compiled to cover diversity, richness of detail and geographic extent. As default benchmark tasks, we define semantic image segmentation and instance-specific image segmentation, aiming to significantly further the development of state-of-the-art methods for visual road-scene understanding.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Neuhold_The_Mapillary_Vistas_ICCV_2017_paper.pdf", @@ -16380,7 +17423,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Neuhold_2017_ICCV,\n \n author = {\n Neuhold,\n Gerhard and Ollmann,\n Tobias and Rota Bulo,\n Samuel and Kontschieder,\n Peter\n},\n title = {\n The Mapillary Vistas Dataset for Semantic Understanding of Street Scenes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "The Pose Knows: Video Forecasting by Generating Pose Futures", @@ -16388,6 +17432,7 @@ "status": "Poster", "track": "main", "pid": "1530", + "author_site": "Jacob Walker; Kenneth Marino; Abhinav Gupta; Martial Hebert", "author": "Jacob Walker; Kenneth Marino; Abhinav Gupta; Martial Hebert", "abstract": "Current approaches to video forecasting attempt to generate videos directly in pixel space using Generative Adversarial Networks (GANs) or Variational Autoencoders (VAEs). 
However, since these approaches try to model all the structure and scene dynamics at once, in unconstrained settings they often generate uninterpretable results. Our insight is that forecasting needs to be done first at a higher level of abstraction. Specifically, we exploit human pose detectors as a free source of supervision and break the video forecasting problem into two discrete steps. First we explicitly model the high level structure of active objects in the scene (humans) and use a VAE to model the possible future movements of humans in the pose space. We then use the future poses generated as conditional information to a GAN to predict the future frames of the video in pixel space. By using the structured space of pose as an intermediate representation, we sidestep the problems that GANs have in generating video pixels directly. We show through quantitative and qualitative evaluation that our method outperforms state-of-the-art methods for video prediction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Walker_The_Pose_Knows_ICCV_2017_paper.pdf", @@ -16403,7 +17448,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Walker_The_Pose_Knows_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Walker_The_Pose_Knows_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Walker_2017_ICCV,\n \n author = {\n Walker,\n Jacob and Marino,\n Kenneth and Gupta,\n Abhinav and Hebert,\n Martial\n},\n title = {\n The Pose Knows: Video Forecasting by Generating Pose Futures\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "ThiNet: A Filter Level Pruning Method for Deep Neural Network Compression", @@ -16411,6 +17457,7 @@ "status": "Poster", "track": "main", "pid": "2276", + "author_site": "Jian-Hao Luo; Jianxin Wu; Weiyao Lin", "author": "Jian-Hao Luo; 
Jianxin Wu; Weiyao Lin", "abstract": "We propose an efficient and unified framework, namely ThiNet, to simultaneously accelerate and compress CNN models in both training and inference stages. We focus on the filter level pruning, i.e., the whole filter would be discarded if it is less important. Our method does not change the original network structure, thus it can be perfectly supported by any off-the-shelf deep learning libraries. We formally establish filter pruning as an optimization problem, and reveal that we need to prune filters based on statistics information computed from its next layer, not the current layer, which differentiates ThiNet from existing methods. Experimental results demonstrate the effectiveness of this strategy, which has advanced the state-of-the-art. We also show the performance of ThiNet on ILSVRC-12 benchmark. ThiNet achieves 3.31x FLOPs reduction and 16.63x compression on VGG-16, with only 0.52% top-5 accuracy drop. Similar experiments with ResNet-50 reveal that even for a compact network, ThiNet can also reduce more than half of the parameters and FLOPs, at the cost of roughly 1% top-5 accuracy drop. 
Moreover, the original VGG-16 model can be further pruned into a very small model with only 5.05MB model size, preserving AlexNet level accuracy but showing much stronger generalization ability.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Luo_ThiNet_A_Filter_ICCV_2017_paper.pdf", @@ -16425,7 +17472,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Luo_ThiNet_A_Filter_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Luo_ThiNet_A_Filter_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Luo_2017_ICCV,\n \n author = {\n Luo,\n Jian-Hao and Wu,\n Jianxin and Lin,\n Weiyao\n},\n title = {\n ThiNet: A Filter Level Pruning Method for Deep Neural Network Compression\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "TorontoCity: Seeing the World With a Million Eyes", @@ -16433,7 +17481,7 @@ "status": "Spotlight", "track": "main", "pid": "1469", - "author_site": "Shenlong Wang; Min Bai; Gell\u00c3\u00a9rt M\u00c3\u00a1ttyus; Hang Chu; Wenjie Luo; Bin Yang; Justin Liang; Joel Cheverie; Sanja Fidler; Raquel Urtasun", + "author_site": "Shenlong Wang; Min Bai; Gellért Máttyus; Hang Chu; Wenjie Luo; Bin Yang; Justin Liang; Joel Cheverie; Sanja Fidler; Raquel Urtasun", "author": "Shenlong Wang; Min Bai; Gellert Mattyus; Hang Chu; Wenjie Luo; Bin Yang; Justin Liang; Joel Cheverie; Sanja Fidler; Raquel Urtasun", "abstract": "In this paper we introduce the TorontoCity benchmark, which covers the full greater Toronto area (GTA) with 712.5km2 of land, 8439km of road and around 400, 000 buildings. Our benchmark provides different perspectives of the world captured from airplanes, drones and cars driving around the city. Manually labeling such a large scale dataset is infeasible. 
Instead, we propose to utilize different sources of high-precision maps to create our ground truth. Towards this goal, we develop algorithms that allow us to align all data sources with the maps while requiring minimal human supervision. We have designed a wide variety of tasks including building height estimation (reconstruction), road centerline and curb extraction, building instance segmentation, building contour extraction (reorganization), semantic labeling and scene type classification (recognition). Our pilot study shows that most of these tasks are still difficult for modern convolutional neural networks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_TorontoCity_Seeing_the_ICCV_2017_paper.pdf", @@ -16458,7 +17506,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0+1;0+1;0+1;0+1;0+1;0+1;0+1;0;0;0+1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Shenlong and Bai,\n Min and Mattyus,\n Gellert and Chu,\n Hang and Luo,\n Wenjie and Yang,\n Bin and Liang,\n Justin and Cheverie,\n Joel and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n TorontoCity: Seeing the World With a Million Eyes\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Toward Perceptually-Consistent Stereo: A Scanline Study", @@ -16466,10 +17515,11 @@ "status": "Poster", "track": "main", "pid": "648", + "author_site": "Jialiang Wang; Daniel Glasner; Todd Zickler", "author": "Jialiang Wang; Daniel Glasner; Todd Zickler", "abstract": "Two types of information exist in a stereo pair: correlation (matching) and decorrelation (half-occlusion). 
Vision science has shown that both types of information are used in the visual cortex, and that people can perceive depth even when correlation cues are absent or very weak, a capability that remains absent from most computational stereo systems. As a step toward stereo algorithms that are more consistent with these perceptual phenomena, we re-examine the topic of scanline stereo as energy minimization. We represent a disparity profile as a piecewise smooth function with explicit breakpoints between its smooth pieces, and we show this allows correlation and decorrelation to be integrated into an objective that requires only two types of local information: the correlation and its spatial gradient. Experimentally, we show the global optimum of this objective matches human perception on a broad collection of wellknown perceptual stimuli, and that it also provides reasonable piecewise-smooth interpretations of depth in natural images, even without exploiting monocular boundary cues.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Toward_Perceptually-Consistent_Stereo_ICCV_2017_paper.pdf", - "aff": "Harvard University; AiCure\u2217 + Harvard University; Harvard University", + "aff": "Harvard University; AiCure∗ + Harvard University; Harvard University", "project": "", "github": "", "supp": "", @@ -16490,7 +17540,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Jialiang and Glasner,\n Daniel and Zickler,\n Todd\n},\n title = {\n Toward Perceptually-Consistent Stereo: A Scanline Study\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards 3D Human Pose Estimation in the Wild: A Weakly-Supervised Approach", @@ -16498,6 +17549,7 @@ 
"status": "Poster", "track": "main", "pid": "119", + "author_site": "Xingyi Zhou; Qixing Huang; Xiao Sun; Xiangyang Xue; Yichen Wei", "author": "Xingyi Zhou; Qixing Huang; Xiao Sun; Xiangyang Xue; Yichen Wei", "abstract": "In this paper, we study the task of 3D human pose estimation in the wild. This task is challenging due to lack of training data, as existing datasets are either in the wild images with 2D pose or in the lab images with 3D pose. We propose a weakly-supervised transfer learning method that uses mixed 2D and 3D labels in a unified deep neutral network that presents two-stage cascaded structure. Our network augments a state-of-the-art 2D pose estimation sub-network with a 3D depth regression sub-network. Unlike previous two stage approaches that train the two sub-networks sequentially and separately, our training is end-to-end and fully exploits the correlation between the 2D pose and depth estimation sub-tasks. The deep features are better learnt through shared representations. In doing so, the 3D pose labels in controlled lab environments are transferred to in the wild images. In addition, we introduce a 3D geometric constraint to regularize the 3D pose prediction, which is effective in the absence of ground truth depth labels. 
Our method achieves competitive results on both 2D and 3D benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhou_Towards_3D_Human_ICCV_2017_paper.pdf", @@ -16515,14 +17567,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhou_Towards_3D_Human_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;2;0;2", - "aff_unique_norm": "Fudan University;University of Texas at Austin;Microsoft", + "aff_unique_norm": "Fudan University;University of Texas at Austin;Microsoft Corporation", "aff_unique_dep": "School of Computer Science;;Microsoft Research", "aff_unique_url": "https://www.fudan.edu.cn;https://www.utexas.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Fudan;UT Austin;MSR", "aff_campus_unique_index": "0+1;1;0", "aff_campus_unique": "Shanghai;Austin;", "aff_country_unique_index": "0+1;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2017_ICCV,\n \n author = {\n Zhou,\n Xingyi and Huang,\n Qixing and Sun,\n Xiao and Xue,\n Xiangyang and Wei,\n Yichen\n},\n title = {\n Towards 3D Human Pose Estimation in the Wild: A Weakly-Supervised Approach\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards Context-Aware Interaction Recognition for Visual Relationship Detection", @@ -16530,6 +17583,7 @@ "status": "Poster", "track": "main", "pid": "100", + "author_site": "Bohan Zhuang; Lingqiao Liu; Chunhua Shen; Ian Reid", "author": "Bohan Zhuang; Lingqiao Liu; Chunhua Shen; Ian Reid", "abstract": "Recognizing how objects interact with each other is a crucial task in visual recognition. 
If we define the context of the interaction to be the objects involved, then most current methods can be categorized as either: (i) training a single classifier on the combination of the interaction and its context; or (ii) aiming to recognize the interaction independently of its explicit context. Both methods suffer limitations: the former scales poorly with the number of combinations and fails to generalize to unseen combinations, while the latter often leads to poor interaction recognition performance due to the difficulty of designing a context-independent interaction classifier. To mitigate those drawbacks, this paper proposes an alternative, context-aware interaction recognition framework. The key to our method is to explicitly construct an interaction classifier which combines the context, and the interaction. The context is encoded via word2vec into a semantic space, and is used to derive a classification result for the interaction. The proposed method still builds one classifier for one interaction (as per type (ii) above), but the classifier built is adaptive to context via weights which are context dependent. The benefit of using the semantic space is that it naturally leads to zero-shot generalizations in which semantically similar contexts (subject-object pairs) can be recognized as suitable contexts for an interaction, even if they were not observed in the training set. Our method also scales with the number of interaction-context pairs since our model parameters do not increase with the number of interactions. Thus our method avoids the limitation of both approaches. 
We demonstrate experimentally that the proposed framework leads to improved performance for all investigated interaction representations and datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhuang_Towards_Context-Aware_Interaction_ICCV_2017_paper.pdf", @@ -16547,14 +17601,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhuang_Towards_Context-Aware_Interaction_ICCV_2017_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "University of Adelaide;Australian Centre for Robotic Vision", + "aff_unique_norm": "The University of Adelaide;Australian Centre for Robotic Vision", "aff_unique_dep": ";", "aff_unique_url": "https://www.adelaide.edu.au;https://roboticvision.org/", "aff_unique_abbr": "Adelaide;ACRV", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Zhuang_2017_ICCV,\n \n author = {\n Zhuang,\n Bohan and Liu,\n Lingqiao and Shen,\n Chunhua and Reid,\n Ian\n},\n title = {\n Towards Context-Aware Interaction Recognition for Visual Relationship Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards Diverse and Natural Image Descriptions via a Conditional GAN", @@ -16562,6 +17617,7 @@ "status": "Oral", "track": "main", "pid": "1076", + "author_site": "Bo Dai; Sanja Fidler; Raquel Urtasun; Dahua Lin", "author": "Bo Dai; Sanja Fidler; Raquel Urtasun; Dahua Lin", "abstract": "Despite the substantial progress in recent years, the problem of image captioning remains far from being satisfactorily tackled. Sentences produced by existing methods, e.g. those based on LSTM, are often overly rigid and lacking in variability. 
This issue is related to a learning principle widely used in practice, that is, to maximize the likelihood of training samples. This principle encourages the high resemblance to the \"ground-truths\", while suppressing other reasonable expressions. Conventional evaluation metrics, e.g. BLEU and METEOR, also favor such restrictive methods. In this paper, we explore an alternative approach, with an aim to improve the naturalness and diversity - two essential properties of human expressions. Specifically, we propose a new framework based on Conditional Generative Adversarial Networks (CGAN), which jointly learns a generator to produce descriptions conditioned on images and an evaluator to assess how well a description fits the visual content. It is noteworthy that training a sequence generator is nontrivial. We overcome the difficulty by Policy Gradient, a strategy stemming from Reinforcement Learning, which allows the generator to receive early feedbacks along the way. We tested our method on two large datasets, where it performed competitively against real people in our user study and outperformed other methods on various tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dai_Towards_Diverse_and_ICCV_2017_paper.pdf", @@ -16576,7 +17632,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dai_Towards_Diverse_and_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dai_Towards_Diverse_and_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Dai_2017_ICCV,\n \n author = {\n Dai,\n Bo and Fidler,\n Sanja and Urtasun,\n Raquel and Lin,\n Dahua\n},\n title = {\n Towards Diverse and Natural Image Descriptions via a Conditional GAN\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards End-To-End Text Spotting With Convolutional Recurrent Neural 
Networks", @@ -16584,6 +17641,7 @@ "status": "Spotlight", "track": "main", "pid": "372", + "author_site": "Hui Li; Peng Wang; Chunhua Shen", "author": "Hui Li; Peng Wang; Chunhua Shen", "abstract": "In this work, we jointly address the problem of text detection and recognition in natural scene images based on convolutional recurrent neural networks. We propose a unified network that simultaneously localizes and recognizes text with a single forward pass, avoiding intermediate processes, such as image cropping, feature re-calculation, word separation, and character grouping. In contrast to existing approaches that consider text detection and recognition as two distinct tasks and tackle them one by one, the proposed framework settles these two tasks concurrently. The whole framework can be trained end-to-end, requiring only images, ground-truth bounding boxes and text labels. The convolutional features are calculated only once and shared by both detection and recognition, which saves processing time. Through multi-task training, the learned features become more informative and improves the overall performance. 
Our proposed method has achieved competitive performance on several benchmark datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Li_Towards_End-To-End_Text_ICCV_2017_paper.pdf", @@ -16598,7 +17656,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Towards_End-To-End_Text_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Li_Towards_End-To-End_Text_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Li_2017_ICCV,\n \n author = {\n Li,\n Hui and Wang,\n Peng and Shen,\n Chunhua\n},\n title = {\n Towards End-To-End Text Spotting With Convolutional Recurrent Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards Large-Pose Face Frontalization in the Wild", @@ -16606,6 +17665,7 @@ "status": "Poster", "track": "main", "pid": "1987", + "author_site": "Xi Yin; Xiang Yu; Kihyuk Sohn; Xiaoming Liu; Manmohan Chandraker", "author": "Xi Yin; Xiang Yu; Kihyuk Sohn; Xiaoming Liu; Manmohan Chandraker", "abstract": "Despite recent advances in face recognition using deep learning, severe accuracy drops are observed for large pose variations in unconstrained environments. Learning pose-invariant features is one solution, but needs expensively labeled large-scale data and carefully designed feature learning algorithms. In this work, we focus on frontalizing faces in the wild under various head poses, including extreme profile views. We propose a novel deep 3D Morphable Model (3DMM) conditioned Face Frontalization Generative Adversarial Network (GAN), termed as FF-GAN, to generate neutral head pose face images. Our framework differs from both traditional GANs and 3DMM based modeling. 
Incorporating 3DMM into the GAN structure provides shape and appearance priors for fast convergence with less training data, while also supporting end-to-end training. The 3DMM-conditioned GAN employs not only the discriminator and generator loss but also a new masked symmetry loss to retain visual quality under occlusions, besides an identity loss to recover high frequency information. Experiments on face recognition, landmark localization and 3D reconstruction consistently show the advantage of our frontalization method on faces in the wild datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yin_Towards_Large-Pose_Face_ICCV_2017_paper.pdf", @@ -16630,7 +17690,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yin_2017_ICCV,\n \n author = {\n Yin,\n Xi and Yu,\n Xiang and Sohn,\n Kihyuk and Liu,\n Xiaoming and Chandraker,\n Manmohan\n},\n title = {\n Towards Large-Pose Face Frontalization in the Wild\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards More Accurate Iris Recognition Using Deeply Learned Spatially Corresponding Features", @@ -16638,6 +17699,7 @@ "status": "Spotlight", "track": "main", "pid": "2114", + "author_site": "Zijing Zhao; Ajay Kumar", "author": "Zijing Zhao; Ajay Kumar", "abstract": "This paper proposes an accurate and generalizable deep learning framework for iris recognition. The proposed framework is based on a fully convolutional network (FCN), which generates spatially corresponding iris feature descriptors. A specially designed Extended Triplet Loss (ETL) function is introduced to incorporate the bit-shifting and non-iris masking, which are found necessary for learning discriminative spatial iris features. 
We also developed a sub-network to provide appropriate information for identifying meaningful iris regions, which serves as essential input for the newly developed ETL. Thorough experiments on four publicly available databases suggest that the proposed framework consistently outperforms several classic and state-of-the-art iris recognition approaches. More importantly, our model exhibits superior generalization capability as, unlike popular methods in the literature, it does not essentially require database-specific parameter tuning, which is another key advantage over other approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhao_Towards_More_Accurate_ICCV_2017_paper.pdf", @@ -16655,14 +17717,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhao_Towards_More_Accurate_ICCV_2017_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Hong Kong Polytechnic University", + "aff_unique_norm": "The Hong Kong Polytechnic University", "aff_unique_dep": "Department of Computing", "aff_unique_url": "https://www.polyu.edu.hk", "aff_unique_abbr": "PolyU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2017_ICCV,\n \n author = {\n Zhao,\n Zijing and Kumar,\n Ajay\n},\n title = {\n Towards More Accurate Iris Recognition Using Deeply Learned Spatially Corresponding Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards a Unified Compositional Model for Visual Pattern Modeling", @@ -16670,6 +17733,7 @@ "status": "Poster", "track": "main", "pid": "1136", + "author_site": "Wei Tang; Pei Yu; Jiahuan Zhou; Ying Wu", "author": "Wei Tang; Pei Yu; Jiahuan Zhou; Ying Wu", "abstract": "Compositional models represent visual patterns as hierarchies of 
meaningful and reusable parts. They are attractive to vision modeling due to their ability to decompose complex patterns into simpler ones and resolve the low-level ambiguities in high-level image interpretations. However, current compositional models separate structure and part discovery from parameter estimation, which generally leads to suboptimal learning and fitting of the model. Moreover, the commonly adopted latent structural learning is not scalable for deep architectures. To address these difficult issues for compositional models, this paper quests for a unified framework for compositional pattern modeling, inference and learning. Represented by And-Or graphs (AOGs), it jointly models the compositional structure, parts, features, and composition/sub-configuration relationships. We show that the inference algorithm of the proposed framework is equivalent to a feed-forward network. Thus, all the parameters can be learned efficiently via the highly-scalable back-propagation (BP) in an end-to-end fashion. We validate the model via the task of handwritten digit recognition. By visualizing the processes of bottom-up composition and top-down parsing, we show that our model is fully interpretable, being able to learn the hierarchical compositions from visual primitives to visual patterns at increasingly higher levels. We apply this new compositional model to natural scene character recognition and generic object detection. 
Experimental results have demonstrated its effectiveness.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tang_Towards_a_Unified_ICCV_2017_paper.pdf", @@ -16685,7 +17749,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tang_Towards_a_Unified_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tang_Towards_a_Unified_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Tang_2017_ICCV,\n \n author = {\n Tang,\n Wei and Yu,\n Pei and Zhou,\n Jiahuan and Wu,\n Ying\n},\n title = {\n Towards a Unified Compositional Model for Visual Pattern Modeling\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Towards a Visual Privacy Advisor: Understanding and Predicting Privacy Risks in Images", @@ -16693,6 +17758,7 @@ "status": "Poster", "track": "main", "pid": "1581", + "author_site": "Tribhuvanesh Orekondy; Bernt Schiele; Mario Fritz", "author": "Tribhuvanesh Orekondy; Bernt Schiele; Mario Fritz", "abstract": "With an increasing number of users sharing information online, privacy implications entailing such actions are a major concern. For explicit content, such as user profile or GPS data, devices (e.g. mobile phones) as well as web services (e.g. facebook) offer to set privacy settings in order to enforce the users' privacy preferences. We propose the first approach that extends this concept to image content in the spirit of a Visual Privacy Advisor. First, we categorize personal information in images into 68 image attributes and collect a dataset, which allows us to train models that predict such information directly from images. Second, we run a user study to understand the privacy preferences of different users w.r.t. such attributes. 
Third, we propose models that predict user specific privacy score from images in order to enforce the users' privacy preferences. Our model is trained to predict the user specific privacy risk and even outperforms the judgment of the users, who often fail to follow their own privacy preferences on image data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Orekondy_Towards_a_Visual_ICCV_2017_paper.pdf", @@ -16717,7 +17783,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Orekondy_2017_ICCV,\n \n author = {\n Orekondy,\n Tribhuvanesh and Schiele,\n Bernt and Fritz,\n Mario\n},\n title = {\n Towards a Visual Privacy Advisor: Understanding and Predicting Privacy Risks in Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Tracking as Online Decision-Making: Learning a Policy From Streaming Videos With Reinforcement Learning", @@ -16725,7 +17792,7 @@ "status": "Poster", "track": "main", "pid": "266", - "author_site": "James Supan\u00c4\u008di\u00c4\u008d, III; Deva Ramanan", + "author_site": "James Supančič, III; Deva Ramanan", "author": "James Supancic;III; Deva Ramanan", "abstract": "We formulate tracking as an online decision-making process, where a tracking agent must follow an object despite ambiguous image frames and a limited computational budget. Crucially, the agent must decide where to look in the upcoming frames, when to reinitialize because it believes the target has been lost, and when to update its appearance model for the tracked object. Such decisions are typically made heuristically. Instead, we propose to learn an optimal decision-making policy by formulating tracking as a partially observable decision-making process (POMDP). 
We learn policies with deep reinforcement learning algorithms that need supervision (a reward signal) only when the track has gone awry. We demonstrate that sparse rewards allow us to quickly train on massive datasets, several orders of magnitude more than past work. Interestingly, by treating the data source of Internet videos as unlimited streams, we both learn and evaluate our trackers in a single, unified computational stream.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Supancic_Tracking_as_Online_ICCV_2017_paper.pdf", @@ -16741,7 +17808,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Supancic_Tracking_as_Online_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Supancic_Tracking_as_Online_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Supancic_2017_ICCV,\n \n author = {\n Supancic,III,\n James and Ramanan,\n Deva\n},\n title = {\n Tracking as Online Decision-Making: Learning a Policy From Streaming Videos With Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Tracking the Untrackable: Learning to Track Multiple Cues With Long-Term Dependencies", @@ -16749,6 +17817,7 @@ "status": "Poster", "track": "main", "pid": "228", + "author_site": "Amir Sadeghian; Alexandre Alahi; Silvio Savarese", "author": "Amir Sadeghian; Alexandre Alahi; Silvio Savarese", "abstract": "The majority of existing solutions to the Multi-Target Tracking (MTT) problem do not combine cues over a long period of time in a coherent fashion. In this paper, we present an online method that encodes long-term temporal dependencies across multiple cues. One key challenge of tracking methods is to accurately track occluded targets or those which share similar appearance properties with surrounding objects. 
To address this challenge, we present a structure of Recurrent Neural Networks (RNN) that jointly reasons on multiple cues over a temporal window. Our method allows to correct data association errors and recover observations from occluded states. We demonstrate the robustness of our data-driven approach by tracking multiple targets using their appearance, motion, and even interactions. Our method outperforms previous works on multiple publicly available datasets including the challenging MOT benchmark.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sadeghian_Tracking_the_Untrackable_ICCV_2017_paper.pdf", @@ -16773,7 +17842,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Sadeghian_2017_ICCV,\n \n author = {\n Sadeghian,\n Amir and Alahi,\n Alexandre and Savarese,\n Silvio\n},\n title = {\n Tracking the Untrackable: Learning to Track Multiple Cues With Long-Term Dependencies\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Training Deep Networks to Be Spatially Sensitive", @@ -16781,6 +17851,7 @@ "status": "Poster", "track": "main", "pid": "2707", + "author_site": "Nicholas Kolkin; Eli Shechtman; Gregory Shakhnarovich", "author": "Nicholas Kolkin; Eli Shechtman; Gregory Shakhnarovich", "abstract": "In many computer vision tasks, for example saliency prediction or semantic segmentation, the desired output is a foreground map that predicts pixels where some criteria is satisfied. Despite the inherently spatial nature of this task commonly used learning objectives do not incorporate the spatial relationships between misclassified pixels and the underlying ground truth. 
The Weighted F-measure, a recently proposed evaluation metric, does reweight errors spatially, and has been shown to closely correlate with human evaluation of quality, and stably rank predictions with respect to noisy ground truths (such as a sloppy human annotator might generate). However it suffers from computational complexity which makes it intractable as an optimization objective for gradient descent, which must be evaluated thousands or millions of times while learning a model's parameters. We propose a differentiable and efficient approximation of this metric. By incorporating spatial information into the objective we can use a simpler model than competing methods without sacrificing accuracy, resulting in faster inference speeds and alleviating the need for pre/post-processing. We match (or improve) performance on several tasks compared to prior state of the art by traditional metrics, and in many cases significantly improve performance by the weighted F-measure.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kolkin_Training_Deep_Networks_ICCV_2017_paper.pdf", @@ -16805,7 +17876,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kolkin_2017_ICCV,\n \n author = {\n Kolkin,\n Nicholas and Shechtman,\n Eli and Shakhnarovich,\n Gregory\n},\n title = {\n Training Deep Networks to Be Spatially Sensitive\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Transferring Objects: Joint Inference of Container and Human Pose", @@ -16813,6 +17885,7 @@ "status": "Poster", "track": "main", "pid": "1262", + "author_site": "Hanqing Wang; Wei Liang; Lap-Fai Yu", "author": "Hanqing Wang; Wei Liang; Lap-Fai Yu", "abstract": "Transferring objects from one place to another place is a 
common task performed by human in daily life. During this process, it is usually intuitive for humans to choose an object as a proper container and to use an efficient pose to carry objects; yet, it is non-trivial for current computer vision and machine learning algorithms. In this paper, we propose an approach to jointly infer container and human pose for transferring objects by minimizing the costs associated both object and pose candidates. Our approach predicts which object to choose as a container while reasoning about how humans interact with physical surroundings to accomplish the task of transferring objects given visual input. In the learning phase, the presented method learns how humans make rational choices of containers and poses for transferring different objects, as well as the physical quantities required by the transfer task (e.g., compatibility between container and containee, energy cost of carrying pose) via a structured learning approach. In the inference phase, given a scanned 3D scene with different object candidates and a dictionary of human poses, our approach infers the best object as a container together with human pose for transferring a given object.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Transferring_Objects_Joint_ICCV_2017_paper.pdf", @@ -16837,7 +17910,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Beijing;Boston", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Hanqing and Liang,\n Wei and Yu,\n Lap-Fai\n},\n title = {\n Transferring Objects: Joint Inference of Container and Human Pose\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Transformed Low-Rank Model for Line Pattern Noise Removal", @@ -16845,6 +17919,7 @@ "status": 
"Poster", "track": "main", "pid": "881", + "author_site": "Yi Chang; Luxin Yan; Sheng Zhong", "author": "Yi Chang; Luxin Yan; Sheng Zhong", "abstract": "This paper addresses the problem of line pattern noise removal from a single image, such as rain streak, hyperspectral stripe and so on. Most of the previous methods model the line pattern noise in original image domain, which fail to explicitly exploit the directional characteristic, thus resulting in a redundant subspace with poor representation ability for those line pattern noise. To achieve a compact subspace for the line pattern structure, in this work, we incorporate a transformation into the image decomposition model so that maps the input image to a domain where the line pattern streak/stripe appearance has an extremely distinct low-rank structure, which naturally allows us to enforce a low-rank prior to extract the line pattern streak/stripe from the noisy image. Moreover, the random noise is usually mixed up with the line pattern noise, which makes the challenging problem much more difficult. While previous methods resort to the spectral or temporal correlation of the multi-images, we give a detailed analysis between the noisy and clean image in both local gradient and nonlocal domain, and propose a compositional directional total variational and low-rank prior for the image layer, thus to simultaneously accommodate both types of noise. 
The proposed method has been evaluated on two different tasks, including remote sensing image mixed random stripe noise removal and rain streak removal, all of which obtain very impressive performances.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Chang_Transformed_Low-Rank_Model_ICCV_2017_paper.pdf", @@ -16869,7 +17944,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chang_2017_ICCV,\n \n author = {\n Chang,\n Yi and Yan,\n Luxin and Zhong,\n Sheng\n},\n title = {\n Transformed Low-Rank Model for Line Pattern Noise Removal\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Transitive Invariance for Self-Supervised Visual Representation Learning", @@ -16877,6 +17953,7 @@ "status": "Poster", "track": "main", "pid": "442", + "author_site": "Xiaolong Wang; Kaiming He; Abhinav Gupta", "author": "Xiaolong Wang; Kaiming He; Abhinav Gupta", "abstract": "Learning visual representations with self-supervised learning has become popular in computer vision. The idea is to design auxiliary tasks where labels are free to obtain. Most of these tasks end up providing data to learn specific kinds of invariance useful for recognition. In this paper, we propose to exploit different self-supervised approaches to learn representations invariant to (i) inter-instance variations (two objects in the same class should have similar features) and (ii) intra-instance variations (viewpoint, pose, deformations, illumination, etc). Instead of combining two approaches with multi-task learning, we argue to organize and reason the data with multiple variations. Specifically, we propose to generate a graph with millions of objects mined from hundreds of thousands of videos. 
The objects are connected by two types of edges which correspond to two types of invariance: \"different instances but a similar viewpoint and category\" and \"different viewpoints of the same instance\". By applying simple transitivity on the graph with these edges, we can obtain pairs of images exhibiting richer visual invariance. We use this data to train a Triplet-Siamese network with VGG16 as the base architecture and apply the learned representations to different recognition tasks. For object detection, we achieve 63.2% mAP on PASCAL VOC 2007 using Fast R-CNN (compare to 67.3% with ImageNet pre-training). For the challenging COCO dataset, our method is surprisingly close (23.5%) to the ImageNet-supervised counterpart (24.4%) using the Faster R-CNN framework. We also show that our network can perform significantly better than the ImageNet network in the surface normal estimation task.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wang_Transitive_Invariance_for_ICCV_2017_paper.pdf", @@ -16891,7 +17968,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_Transitive_Invariance_for_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wang_Transitive_Invariance_for_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wang_2017_ICCV,\n \n author = {\n Wang,\n Xiaolong and He,\n Kaiming and Gupta,\n Abhinav\n},\n title = {\n Transitive Invariance for Self-Supervised Visual Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Trespassing the Boundaries: Labeling Temporal Bounds for Object Interactions in Egocentric Video", @@ -16899,6 +17977,7 @@ "status": "Poster", "track": "main", "pid": "1085", + "author_site": "Davide Moltisanti; Michael Wray; Walterio Mayol-Cuevas; Dima Damen", "author": "Davide Moltisanti; Michael 
Wray; Walterio Mayol-Cuevas; Dima Damen", "abstract": "Manual annotations of temporal bounds for object interactions (i.e. start and end times) are typical training input to recognition, localization and detection algorithms. For three publicly available egocentric datasets, we uncover inconsistencies in ground truth temporal bounds within and across annotators and datasets. We systematically assess the robustness of state-of-the-art approaches to changes in labeled temporal bounds, for object interaction recognition. As boundaries are trespassed, a drop of up to 10% is observed for both Improved Dense Trajectories and Two-Stream Convolutional Neural Network. We demonstrate that such disagreement stems from a limited understanding of the distinct phases of an action, and propose annotating based on the Rubicon Boundaries, inspired by a similarly named cognitive model, for consistent temporal bounds of object interactions. Evaluated on a public dataset, we report a 4% increase in overall accuracy, and an increase in accuracy for 55% of classes when Rubicon Boundaries are used for temporal annotations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Moltisanti_Trespassing_the_Boundaries_ICCV_2017_paper.pdf", @@ -16923,7 +18002,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Moltisanti_2017_ICCV,\n \n author = {\n Moltisanti,\n Davide and Wray,\n Michael and Mayol-Cuevas,\n Walterio and Damen,\n Dima\n},\n title = {\n Trespassing the Boundaries: Labeling Temporal Bounds for Object Interactions in Egocentric Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Truncating Wide Networks Using Binary Tree Architectures", @@ -16931,6 +18011,7 @@ "status": "Poster", "track": "main", 
"pid": "836", + "author_site": "Yan Zhang; Mete Ozay; Shuohao Li; Takayuki Okatani", "author": "Yan Zhang; Mete Ozay; Shuohao Li; Takayuki Okatani", "abstract": "In this paper, we propose a binary tree architecture to truncate architecture of wide networks by reducing the width of the networks. More precisely, in the proposed architecture, the width is incrementally reduced from lower layers to higher layers in order to increase the expressive capacity of networks with a less increase on parameter size. Also, in order to ease the gradient vanishing problem, features obtained at different layers are concatenated to form the output of our architecture. By employing the proposed architecture on a baseline wide network, we can construct and train a new network with same depth but considerably less number of parameters. In our experimental analyses, we observe that the proposed architecture enables us to obtain better parameter size and accuracy trade-off compared to baseline networks using various benchmark image classification datasets. 
The results show that our model can decrease the classification error of a baseline from 20.43% to 19.22% on Cifar-100 using only 28% of parameters that the baseline has.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_Truncating_Wide_Networks_ICCV_2017_paper.pdf", @@ -16955,7 +18036,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1+0;0+0", - "aff_country_unique": "Japan;China" + "aff_country_unique": "Japan;China", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Yan and Ozay,\n Mete and Li,\n Shuohao and Okatani,\n Takayuki\n},\n title = {\n Truncating Wide Networks Using Binary Tree Architectures\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos", @@ -16963,6 +18045,7 @@ "status": "Poster", "track": "main", "pid": "2939", + "author_site": "Rui Hou; Chen Chen; Mubarak Shah", "author": "Rui Hou; Chen Chen; Mubarak Shah", "abstract": "Deep learning has been demonstrated to achieve excellent results for image classification and object detection. However, the impact of deep learning on video analysis (e.g. action detection and recognition) has been limited due to complexity of video data and lack of annotations. Previous convolutional neural networks (CNN) based video action detection approaches usually consist of two major steps: frame-level action proposal detection and association of proposals across frames. Also, these methods employ two-stream CNN framework to handle spatial and temporal feature separately. In this paper, we propose an end-to-end deep network called Tube Convolutional Neural Network (T-CNN) for action detection in videos. The proposed architecture is a unified network that is able to recognize and localize action based on 3D convolution features. 
A video is first divided into equal length clips and for each clip a set of tube proposals are generated next based on 3D Convolutional Network (ConvNet) features. Finally, the tube proposals of different clips are linked together employing network flow and spatio-temporal action detection is performed using these linked video proposals. Extensive experiments on several video datasets demonstrate the superior performance of T-CNN for classifying and localizing actions in both trimmed and untrimmed videos compared to state-of-the-arts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hou_Tube_Convolutional_Neural_ICCV_2017_paper.pdf", @@ -16985,9 +18068,10 @@ "aff_unique_url": "https://www.ucf.edu", "aff_unique_abbr": "UCF", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "UCF", + "aff_campus_unique": "Orlando", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hou_2017_ICCV,\n \n author = {\n Hou,\n Rui and Chen,\n Chen and Shah,\n Mubarak\n},\n title = {\n Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Turning Corners Into Cameras: Principles and Methods", @@ -16995,7 +18079,7 @@ "status": "Spotlight", "track": "main", "pid": "1983", - "author_site": "Katherine L. Bouman; Vickie Ye; Adam B. Yedidia; Fr\u00c3\u00a9do Durand; Gregory W. Wornell; Antonio Torralba; William T. Freeman", + "author_site": "Katherine L. Bouman; Vickie Ye; Adam B. Yedidia; Frédo Durand; Gregory W. Wornell; Antonio Torralba; William T. Freeman", "author": "Katherine L. Bouman; Vickie Ye; Adam B. Yedidia; Fredo Durand; Gregory W. Wornell; Antonio Torralba; William T. 
Freeman", "abstract": "We show that walls and other obstructions with edges can be exploited as naturally-occurring \"cameras\" that reveal the hidden scenes beyond them. In particular, we demonstrate methods for using the subtle spatio-temporal radiance variations that arise on the ground at the base of edges to construct a one-dimensional video of the hidden scene. The resulting technique can be used for a variety of applications in diverse physical settings. From standard RGB video recordings of the variations in intensity, we use edge cameras to recover a 1-D video that reveals the number and trajectories of people moving in an occluded scene. We further show that adjacent vertical edges, such as those that arise in the case of an open doorway, yield a stereo camera from which the 2-D location of hidden, moving objects can be recovered. We demonstrate our technique in a number of indoor and outdoor environments involving varied surfaces and illumination conditions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bouman_Turning_Corners_Into_ICCV_2017_paper.pdf", @@ -17020,7 +18104,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0+1", "aff_campus_unique": "Cambridge;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bouman_2017_ICCV,\n \n author = {\n Bouman,\n Katherine L. and Ye,\n Vickie and Yedidia,\n Adam B. and Durand,\n Fredo and Wornell,\n Gregory W. 
and Torralba,\n Antonio and Freeman,\n William T.\n},\n title = {\n Turning Corners Into Cameras: Principles and Methods\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Two-Phase Learning for Weakly Supervised Object Localization", @@ -17028,6 +18113,7 @@ "status": "Poster", "track": "main", "pid": "1306", + "author_site": "Dahun Kim; Donghyeon Cho; Donggeun Yoo; In So Kweon", "author": "Dahun Kim; Donghyeon Cho; Donggeun Yoo; In So Kweon", "abstract": "Weakly supervised semantic segmentation and localization have a problem of focusing only on the most important parts of an image since they use only image-level annotations. In this paper, we solve this problem fundamentally via two-phase learning. Our networks are trained in two steps. In the first step, a conventional fully convolutional network (FCN) is trained to find the most discriminative parts of an image. In the second step, the activations on the most salient parts are suppressed by inference conditional feedback, and then the second learning is performed to find the area of the next most important parts. By combining the activations of both phases, the entire portion of the target object can be captured. Our proposed training scheme is novel and can be utilized in well-designed techniques for weakly supervised semantic segmentation, salient region detection, and object location prediction. 
Detailed experiments demonstrate the effectiveness of our two-phase learning in each task.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kim_Two-Phase_Learning_for_ICCV_2017_paper.pdf", @@ -17042,7 +18128,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_Two-Phase_Learning_for_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kim_Two-Phase_Learning_for_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Kim_2017_ICCV,\n \n author = {\n Kim,\n Dahun and Cho,\n Donghyeon and Yoo,\n Donggeun and So Kweon,\n In\n},\n title = {\n Two-Phase Learning for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Understanding Low- and High-Level Contributions to Fixation Prediction", @@ -17050,11 +18137,11 @@ "status": "Poster", "track": "main", "pid": "2404", - "author_site": "Matthias K\u00c3\u00bcmmerer; Thomas S. A. Wallis; Leon A. Gatys; Matthias Bethge", + "author_site": "Matthias Kümmerer; Thomas S. A. Wallis; Leon A. Gatys; Matthias Bethge", "author": "Matthias Kummerer; Thomas S. A. Wallis; Leon A. Gatys; Matthias Bethge", "abstract": "Understanding where people look in images is an important problem in computer vision. Despite significant research, it remains unclear to what extent human fixations can be predicted by low-level (contrast) compared to high-level (presence of objects) image features. Here we address this problem by introducing two novel models that use different feature spaces but the same readout architecture. The first model predicts human fixations based on deep neural network features trained on object recognition. 
This model sets a new state-of-the art in fixation prediction by achieving top performance in area under the curve metrics on the MIT300 hold-out benchmark (AUC = 88%, sAUC = 77%, NSS = 2.34). The second model uses purely low-level (isotropic contrast) features. This model achieves better performance than all models not using features pre-trained on object recognition, making it a strong baseline to assess the utility of high-level features. We then evaluate and visualize which fixations are better explained by low-level compared to high-level image features. Surprisingly we find that a substantial proportion of fixations are better explained by the simple low-level model than the state-of-the-art model. Comparing different features within the same powerful readout architecture allows us to better understand the relevance of low- versus high-level features in predicting fixation locations, while simultaneously achieving state-of-the-art saliency prediction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Kummerer_Understanding_Low-_and_ICCV_2017_paper.pdf", - "aff": "University of T \u00a8ubingen, Centre for Integrative Neuroscience; University of T \u00a8ubingen, Centre for Integrative Neuroscience; University of T \u00a8ubingen, Centre for Integrative Neuroscience; University of T \u00a8ubingen, Centre for Integrative Neuroscience", + "aff": "University of T ¨ubingen, Centre for Integrative Neuroscience; University of T ¨ubingen, Centre for Integrative Neuroscience; University of T ¨ubingen, Centre for Integrative Neuroscience; University of T ¨ubingen, Centre for Integrative Neuroscience", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2017/supplemental/Kummerer_Understanding_Low-_and_ICCV_2017_supplemental.pdf", @@ -17068,14 +18155,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Kummerer_Understanding_Low-_and_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0", - 
"aff_unique_norm": "University of T\u00fcbingen", + "aff_unique_norm": "University of Tübingen", "aff_unique_dep": "Centre for Integrative Neuroscience", "aff_unique_url": "https://www.uni-tuebingen.de/", - "aff_unique_abbr": "Uni T\u00fcbingen", + "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Kummerer_2017_ICCV,\n \n author = {\n Kummerer,\n Matthias and Wallis,\n Thomas S. A. and Gatys,\n Leon A. and Bethge,\n Matthias\n},\n title = {\n Understanding Low- and High-Level Contributions to Fixation Prediction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Understanding and Mapping Natural Beauty", @@ -17083,6 +18171,7 @@ "status": "Poster", "track": "main", "pid": "2954", + "author_site": "Scott Workman; Richard Souvenir; Nathan Jacobs", "author": "Scott Workman; Richard Souvenir; Nathan Jacobs", "abstract": "While natural beauty is often considered a subjective property of images, in this paper, we take an objective approach and provide methods for quantifying and predicting the scenicness of an image. Using a dataset containing hundreds of thousands of outdoor images captured throughout Great Britain with crowdsourced ratings of natural beauty, we propose an approach to predict scenicness which explicitly accounts for the variance of human ratings. We demonstrate that quantitative measures of scenicness can benefit semantic image understanding, content-aware image processing, and a novel application of cross-view mapping, where the sparsity of ground-level images can be addressed by incorporating unlabeled overhead images in the training and prediction steps. 
For each application, our methods for scenicness prediction result in quantitative and qualitative improvements over baseline approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Workman_Understanding_and_Mapping_ICCV_2017_paper.pdf", @@ -17107,7 +18196,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Workman_2017_ICCV,\n \n author = {\n Workman,\n Scott and Souvenir,\n Richard and Jacobs,\n Nathan\n},\n title = {\n Understanding and Mapping Natural Beauty\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unified Deep Supervised Domain Adaptation and Generalization", @@ -17115,6 +18205,7 @@ "status": "Poster", "track": "main", "pid": "2594", + "author_site": "Saeid Motiian; Marco Piccirilli; Donald A. Adjeroh; Gianfranco Doretto", "author": "Saeid Motiian; Marco Piccirilli; Donald A. Adjeroh; Gianfranco Doretto", "abstract": "This work addresses the problem of domain adaptation and generalization in a unified fashion. The main idea is to exploit the siamese architecture with the Contrastive Loss to address the domain shift and generalization problems. The framework is general, and can be used with any architecture. One of the main strengths of the approach is the \"speed\" of adaptation, which requires an extremely low number of labeled training samples from the target domain, even only one per category. The same architecture and loss function can be easily extended to domain generalization. 
We present state-of-the-art results for both of these applications.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Motiian_Unified_Deep_Supervised_ICCV_2017_paper.pdf", @@ -17130,7 +18221,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Motiian_Unified_Deep_Supervised_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Motiian_Unified_Deep_Supervised_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Motiian_2017_ICCV,\n \n author = {\n Motiian,\n Saeid and Piccirilli,\n Marco and Adjeroh,\n Donald A. and Doretto,\n Gianfranco\n},\n title = {\n Unified Deep Supervised Domain Adaptation and Generalization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Universal Adversarial Perturbations Against Semantic Image Segmentation", @@ -17138,6 +18230,7 @@ "status": "Poster", "track": "main", "pid": "1046", + "author_site": "Jan Hendrik Metzen; Mummadi Chaithanya Kumar; Thomas Brox; Volker Fischer", "author": "Jan Hendrik Metzen; Mummadi Chaithanya Kumar; Thomas Brox; Volker Fischer", "abstract": "While deep learning is remarkably successful on perceptual tasks, it was also shown to be vulnerable to adversarial perturbations of the input. These perturbations denote noise added to the input that was generated specifically to fool the system while being quasi-imperceptible for humans. More severely, there even exist universal perturbations that are input-agnostic but fool the network on the majority of inputs. While recent work has focused on image classification, this work proposes attacks against semantic image segmentation: we present an approach for generating (universal) adversarial perturbations that make the network yield a desired target segmentation as output. 
We show empirically that there exist barely perceptible universal noise patterns which result in nearly the same predicted segmentation for arbitrary inputs. Furthermore, we also show the existence of universal noise which removes a target class (e.g., all pedestrians) from the segmentation while leaving the segmentation mostly unchanged otherwise.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Metzen_Universal_Adversarial_Perturbations_ICCV_2017_paper.pdf", @@ -17162,7 +18255,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Metzen_2017_ICCV,\n \n author = {\n Hendrik Metzen,\n Jan and Chaithanya Kumar,\n Mummadi and Brox,\n Thomas and Fischer,\n Volker\n},\n title = {\n Universal Adversarial Perturbations Against Semantic Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unlabeled Samples Generated by GAN Improve the Person Re-Identification Baseline in Vitro", @@ -17170,10 +18264,11 @@ "status": "Spotlight", "track": "main", "pid": "467", + "author_site": "Zhedong Zheng; Liang Zheng; Yi Yang", "author": "Zhedong Zheng; Liang Zheng; Yi Yang", "abstract": "The main contribution of this paper is a simple semi-supervised pipeline that only uses the original training set without collecting extra data. It is challenging in 1) how to obtain more training data only from the training set and 2) how to use the newly generated data. In this work, the generative adversarial network (GAN) is used to generate unlabeled samples. We propose the label smoothing regularization for outliers (LSRO). This method assigns a uniform label distribution to the unlabeled images, which regularizes the supervised model and improves the baseline. 
We verify the proposed method on a practical problem: person re-identification (re-ID). This task aims to retrieve a query person from other cameras. We adopt the deep convolutional generative adversarial network (DCGAN) for sample generation, and a baseline convolutional neural network (CNN) for representation learning. Experiments show that adding the GAN-generated data effectively improves the discriminative ability of learned CNN embeddings. On three large-scale datasets, Market-1501, CUHK03 and DukeMTMC-reID, we obtain +4.37%, +1.6% and +2.46% improvement in rank-1 precision over the baseline CNN, respectively. We additionally apply the proposed method to fine-grained bird recognition and achieve a +0.6% improvement over a strong baseline. The code is available at https://github.com/layumi/Person-reID_GAN.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zheng_Unlabeled_Samples_Generated_ICCV_2017_paper.pdf", - "aff": "Centre for Arti\ufb01cial Intelligence, University of Technology Sydney; Centre for Arti\ufb01cial Intelligence, University of Technology Sydney; Centre for Arti\ufb01cial Intelligence, University of Technology Sydney", + "aff": "Centre for Artificial Intelligence, University of Technology Sydney; Centre for Artificial Intelligence, University of Technology Sydney; Centre for Artificial Intelligence, University of Technology Sydney", "project": "", "github": "https://github.com/layumi/Person-reID_GAN", "supp": "", @@ -17188,13 +18283,14 @@ "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zheng_Unlabeled_Samples_Generated_ICCV_2017_paper.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Technology Sydney", - "aff_unique_dep": "Centre for Arti\ufb01cial Intelligence", + "aff_unique_dep": "Centre for Artificial Intelligence", "aff_unique_url": "https://www.uts.edu.au", "aff_unique_abbr": "UTS", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Sydney", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Zheng_2017_ICCV,\n \n author = {\n Zheng,\n Zhedong and Zheng,\n Liang and Yang,\n Yi\n},\n title = {\n Unlabeled Samples Generated by GAN Improve the Person Re-Identification Baseline in Vitro\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unmasking the Abnormal Events in Video", @@ -17202,6 +18298,7 @@ "status": "Poster", "track": "main", "pid": "1101", + "author_site": "Radu Tudor Ionescu; Sorina Smeureanu; Bogdan Alexe; Marius Popescu", "author": "Radu Tudor Ionescu; Sorina Smeureanu; Bogdan Alexe; Marius Popescu", "abstract": "We propose a novel framework for abnormal event detection in video that requires no training sequences. Our framework is based on unmasking, a technique previously used for authorship verification in text documents, which we adapt to our task. We iteratively train a binary classifier to distinguish between two consecutive video sequences while removing at each step the most discriminant features. Higher training accuracy rates of the intermediately obtained classifiers represent abnormal events. To the best of our knowledge, this is the first work to apply unmasking for a computer vision task. We compare our method with several state-of-the-art supervised and unsupervised methods on four benchmark data sets. 
The empirical results indicate that our abnormal event detection framework can achieve state-of-the-art results, while running in real-time at 20 frames per second.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ionescu_Unmasking_the_Abnormal_ICCV_2017_paper.pdf", @@ -17217,7 +18314,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ionescu_Unmasking_the_Abnormal_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ionescu_Unmasking_the_Abnormal_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Ionescu_2017_ICCV,\n \n author = {\n Tudor Ionescu,\n Radu and Smeureanu,\n Sorina and Alexe,\n Bogdan and Popescu,\n Marius\n},\n title = {\n Unmasking the Abnormal Events in Video\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unpaired Image-To-Image Translation Using Cycle-Consistent Adversarial Networks", @@ -17225,6 +18323,7 @@ "status": "Spotlight", "track": "main", "pid": "488", + "author_site": "Jun-Yan Zhu; Taesung Park; Phillip Isola; Alexei A. Efros", "author": "Jun-Yan Zhu; Taesung Park; Phillip Isola; Alexei A. Efros", "abstract": "Image-to-image translation is a class of vision and graphics problems where the goal is to learn the mapping between an input image and an output image using a training set of aligned image pairs. However, for many tasks, paired training data will not be available. We present an approach for learning to translate an image from a source domain X to a target domain Y in the absence of paired examples. Our goal is to learn a mapping G: X -> Y such that the distribution of images from G(X) is indistinguishable from the distribution Y using an adversarial loss. 
Because this mapping is highly under-constrained, we couple it with an inverse mapping F: Y -> X and introduce a cycle consistency loss to push F(G(X)) ~ X (and vice versa). Qualitative results are presented on several tasks where paired training data does not exist, including collection style transfer, object transfiguration, season transfer, photo enhancement, etc. Quantitative comparisons against several prior methods demonstrate the superiority of our approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Unpaired_Image-To-Image_Translation_ICCV_2017_paper.pdf", @@ -17240,7 +18339,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Unpaired_Image-To-Image_Translation_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Unpaired_Image-To-Image_Translation_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Jun-Yan and Park,\n Taesung and Isola,\n Phillip and Efros,\n Alexei A.\n},\n title = {\n Unpaired Image-To-Image Translation Using Cycle-Consistent Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unrestricted Facial Geometry Reconstruction Using Image-To-Image Translation", @@ -17248,6 +18348,7 @@ "status": "Poster", "track": "main", "pid": "838", + "author_site": "Matan Sela; Elad Richardson; Ron Kimmel", "author": "Matan Sela; Elad Richardson; Ron Kimmel", "abstract": "It has been recently shown that neural networks can recover the geometric structure of a face from a single given image. A common denominator of most existing face geometry reconstruction methods is the restriction of the solution space to some low-dimensional subspace. While such a model significantly simplifies the reconstruction problem, it is inherently limited in its expressiveness. 
As an alternative, we propose an Image-to-Image translation network that jointly maps the input image to a depth image and a facial correspondence map. This explicit pixel-based mapping can then be utilized to provide high quality reconstructions of diverse faces under extreme expressions, using a purely geometric refinement process. In the spirit of recent approaches, the network is trained only with synthetic data, and is then evaluated on in-the-wild facial images. Both qualitative and quantitative analyses demonstrate the accuracy and the robustness of our approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sela_Unrestricted_Facial_Geometry_ICCV_2017_paper.pdf", @@ -17272,7 +18373,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Sela_2017_ICCV,\n \n author = {\n Sela,\n Matan and Richardson,\n Elad and Kimmel,\n Ron\n},\n title = {\n Unrestricted Facial Geometry Reconstruction Using Image-To-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unrolled Memory Inner-Products: An Abstract GPU Operator for Efficient Vision-Related Computations", @@ -17280,6 +18382,7 @@ "status": "Spotlight", "track": "main", "pid": "1658", + "author_site": "Yu-Sheng Lin; Wei-Chao Chen; Shao-Yi Chien", "author": "Yu-Sheng Lin; Wei-Chao Chen; Shao-Yi Chien", "abstract": "Recently, convolutional neural networks (CNNs) have achieved great success in fields such as computer vision, natural language processing, and artificial intelligence. Many of these applications utilize parallel processing in GPUs to achieve higher performance. However, it remains a daunting task to optimize for GPUs, and most researchers have to rely on vendor-provided libraries for such purposes. 
In this paper, we discuss an operator that can be used to succinctly express computational kernels in CNNs and various scientific and vision applications. This operator, called Unrolled-Memory-Inner-Product (UMI), is a computationally-efficient operator with smaller code token requirement. Since a naive UMI implementation would increase memory requirement through input data unrolling, we propose a method to achieve optimal memory fetch performance in modern GPUs. We demonstrate this operator by converting several popular applications into the UMI representation and achieve 1.3x-26.4x speedup against frameworks such as OpenCV and Caffe.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lin_Unrolled_Memory_Inner-Products_ICCV_2017_paper.pdf", @@ -17299,12 +18402,13 @@ "aff_unique_index": "0;1;0", "aff_unique_norm": "National Taiwan University;Skywatch Inc.", "aff_unique_dep": ";", - "aff_unique_url": "https://www.ntu.edu.tw;https://www.skywatch.com", + "aff_unique_url": "https://www.ntu.edu.tw;", "aff_unique_abbr": "NTU;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Lin_2017_ICCV,\n \n author = {\n Lin,\n Yu-Sheng and Chen,\n Wei-Chao and Chien,\n Shao-Yi\n},\n title = {\n Unrolled Memory Inner-Products: An Abstract GPU Operator for Efficient Vision-Related Computations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Action Discovery and Localization in Videos", @@ -17312,6 +18416,7 @@ "status": "Poster", "track": "main", "pid": "243", + "author_site": "Khurram Soomro; Mubarak Shah", "author": "Khurram Soomro; Mubarak Shah", "abstract": "This paper is the first to address the problem of unsupervised action localization in videos. 
Given unlabeled data without bounding box annotations, we propose a novel approach that: 1) Discovers action class labels and 2) Spatio-temporally localizes actions in videos. It begins by computing local video features to apply spectral clustering on a set of unlabeled training videos. For each cluster of videos, an undirected graph is constructed to extract a dominant set, which are known for high internal homogeneity and inhomogeneity between vertices outside it. Next, a discriminative clustering approach is applied, by training a classifier for each cluster, to iteratively select videos from the non dominant set and obtain complete video action classes. Once classes are discovered, training videos within each cluster are selected to perform automatic spatio-temporal annotations, by first oversegmenting videos in each discovered class into supervoxels and constructing a directed graph to apply a variant of knapsack problem with temporal constraints. Knapsack optimization jointly collects a subset of supervoxels, by enforcing the annotated action to be spatio-temporally connected and its volume to be the size of an actor. These annotations are used to train SVM action classifiers. During testing, actions are localized using a similar Knapsack approach, where supervoxels are grouped together and SVM, learned using videos from discovered action classes, is used to recognize these actions. We evaluate our approach on UCF Sports, Sub-JHMDB, JHMDB, THUMOS13 and UCF101 datasets. 
Our experiments suggest that despite using no action class labels and no bounding box annotations, we are able to get competitive results to the state-of-the-art supervised methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Soomro_Unsupervised_Action_Discovery_ICCV_2017_paper.pdf", @@ -17326,7 +18431,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Soomro_Unsupervised_Action_Discovery_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Soomro_Unsupervised_Action_Discovery_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Soomro_2017_ICCV,\n \n author = {\n Soomro,\n Khurram and Shah,\n Mubarak\n},\n title = {\n Unsupervised Action Discovery and Localization in Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Adaptation for Deep Stereo", @@ -17334,6 +18440,7 @@ "status": "Poster", "track": "main", "pid": "893", + "author_site": "Alessio Tonioni; Matteo Poggi; Stefano Mattoccia; Luigi Di Stefano", "author": "Alessio Tonioni; Matteo Poggi; Stefano Mattoccia; Luigi Di Stefano", "abstract": "Recent ground-breaking works have shown that deep neural networks can be trained end-to-end to regress dense disparity maps directly from image pairs. Computer generated imagery is deployed to gather the large data corpus required to train such networks, an additional fine-tuning allowing to adapt the model to work well also on real and possibly diverse environments. Yet, besides a few public datasets such as Kitti, the ground-truth needed to adapt the network to a new scenario is hardly available in practice. In this paper we propose a novel unsupervised adaptation approach that enables to fine-tune a deep learning stereo model without any ground-truth information. 
We rely on off-the-shelf stereo algorithms together with state-of-the-art confidence measures, the latter able to ascertain upon correctness of the measurements yielded by former. Thus, we train the network based on a novel loss-function that penalizes predictions disagreeing with the highly confident disparities provided by the algorithm and enforces a smoothness constraint. Experiments on popular datasets (KITTI 2012, KITTI 2015 and Middlebury 2014) and other challenging test images demonstrate the effectiveness of our proposal.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tonioni_Unsupervised_Adaptation_for_ICCV_2017_paper.pdf", @@ -17358,7 +18465,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Tonioni_2017_ICCV,\n \n author = {\n Tonioni,\n Alessio and Poggi,\n Matteo and Mattoccia,\n Stefano and Di Stefano,\n Luigi\n},\n title = {\n Unsupervised Adaptation for Deep Stereo\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Creation of Parameterized Avatars", @@ -17366,6 +18474,7 @@ "status": "Oral", "track": "main", "pid": "1661", + "author_site": "Lior Wolf; Yaniv Taigman; Adam Polyak", "author": "Lior Wolf; Yaniv Taigman; Adam Polyak", "abstract": "We study the problem of mapping an input image to a tied pair consisting of a vector of parameters and an image that is created using a graphical engine from the vector of parameters. The mapping's objective is to have the output image as similar as possible to the input image. During training, no supervision is given in the form of matching inputs and outputs. This learning problem extends two literature problems: unsupervised domain adaptation and cross domain transfer. 
We define a generalization bound that is based on discrepancy, and employ a GAN to implement a network solution that corresponds to this bound. Experimentally, our method is shown to solve the problem of automatically creating avatars.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Wolf_Unsupervised_Creation_of_ICCV_2017_paper.pdf", @@ -17381,7 +18490,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wolf_Unsupervised_Creation_of_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Wolf_Unsupervised_Creation_of_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Wolf_2017_ICCV,\n \n author = {\n Wolf,\n Lior and Taigman,\n Yaniv and Polyak,\n Adam\n},\n title = {\n Unsupervised Creation of Parameterized Avatars\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Domain Adaptation for Face Recognition in Unlabeled Videos", @@ -17389,6 +18499,7 @@ "status": "Poster", "track": "main", "pid": "1499", + "author_site": "Kihyuk Sohn; Sifei Liu; Guangyu Zhong; Xiang Yu; Ming-Hsuan Yang; Manmohan Chandraker", "author": "Kihyuk Sohn; Sifei Liu; Guangyu Zhong; Xiang Yu; Ming-Hsuan Yang; Manmohan Chandraker", "abstract": "Despite rapid advances in face recognition, there remains a clear gap between the performance of still image-based face recognition and video-based face recognition, due to the vast difference in visual quality between the domains and the difficulty of curating diverse large-scale video datasets. This paper addresses both of those challenges, through an image to video feature-level domain adaptation approach, to learn discriminative video frame representations. 
The framework utilizes large-scale unlabeled video data to reduce the gap between different domains while transferring discriminative knowledge from large-scale labeled still images. Given a face recognition network that is pretrained in the image domain, the adaptation is achieved by (i) distilling knowledge from the network to a video adaptation network through feature matching, (ii) performing feature restoration through synthetic data augmentation and (iii) learning a domain-invariant feature through a domain adversarial discriminator. We further improve performance through a discriminator-guided feature fusion that boosts high-quality frames while eliminating those degraded by video domain-specific factors. Experiments on the YouTube Faces and IJB-A datasets demonstrate that each module contributes to our feature-level domain adaptation framework and substantially improves video face recognition performance to achieve state-of-the-art accuracy. We demonstrate qualitatively that the network learns to suppress diverse artifacts in videos such as pose, illumination or occlusion without being explicitly trained for them.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sohn_Unsupervised_Domain_Adaptation_ICCV_2017_paper.pdf", @@ -17413,7 +18524,8 @@ "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Merced;San Diego", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Sohn_2017_ICCV,\n \n author = {\n Sohn,\n Kihyuk and Liu,\n Sifei and Zhong,\n Guangyu and Yu,\n Xiang and Yang,\n Ming-Hsuan and Chandraker,\n Manmohan\n},\n title = {\n Unsupervised Domain Adaptation for Face Recognition in Unlabeled Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Learning From Video to Detect Foreground Objects in 
Single Images", @@ -17421,10 +18533,11 @@ "status": "Poster", "track": "main", "pid": "1897", + "author_site": "Ioana Croitoru; Simion-Vlad Bogolin; Marius Leordeanu", "author": "Ioana Croitoru; Simion-Vlad Bogolin; Marius Leordeanu", "abstract": "Unsupervised learning from visual data is one of the most difficult challenges in computer vision. It is essential for understanding how visual recognition works. Learning from unsupervised input has an immense practical value, as huge quantities of unlabeled videos can be collected at low cost. Here we address the task of unsupervised learning to detect and segment foreground objects in single images. We achieve our goal by training a student pathway, consisting of a deep neural network that learns to predict, from a single input image, the output of a teacher pathway that performs unsupervised object discovery in video. Our approach is different from the published methods that perform unsupervised discovery in videos or in collections of images at test time. We move the unsupervised discovery phase during the training stage, while at test time we apply the standard feed-forward processing along the student pathway. This has a dual benefit: firstly, it allows, in principle, unlimited generalization possibilities during training, while remaining fast at testing. Secondly, the student not only becomes able to detect in single images significantly better than its unsupervised video discovery teacher, but it also achieves state of the art results on two current benchmarks, YouTube Objects and Object Discovery datasets. 
At test time, our system is two orders of magnitude faster than other previous methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Croitoru_Unsupervised_Learning_From_ICCV_2017_paper.pdf", - "aff": "Institute of Mathematics of the Romanian Academy+University \u201dPolitehnica\u201d of Bucharest; Institute of Mathematics of the Romanian Academy+University \u201dPolitehnica\u201d of Bucharest; Institute of Mathematics of the Romanian Academy+University \u201dPolitehnica\u201d of Bucharest", + "aff": "Institute of Mathematics of the Romanian Academy+University ”Politehnica” of Bucharest; Institute of Mathematics of the Romanian Academy+University ”Politehnica” of Bucharest; Institute of Mathematics of the Romanian Academy+University ”Politehnica” of Bucharest", "project": "", "github": "", "supp": "", @@ -17445,7 +18558,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "Romania" + "aff_country_unique": "Romania", + "bibtex": "@InProceedings{Croitoru_2017_ICCV,\n \n author = {\n Croitoru,\n Ioana and Bogolin,\n Simion-Vlad and Leordeanu,\n Marius\n},\n title = {\n Unsupervised Learning From Video to Detect Foreground Objects in Single Images\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Learning of Important Objects From First-Person Videos", @@ -17453,6 +18567,7 @@ "status": "Poster", "track": "main", "pid": "882", + "author_site": "Gedas Bertasius; Hyun Soo Park; Stella X. Yu; Jianbo Shi", "author": "Gedas Bertasius; Hyun Soo Park; Stella X. Yu; Jianbo Shi", "abstract": "A first-person camera, placed at a person's head, captures, which objects are important to the camera wearer. Most prior methods for this task learn to detect such important objects from the manually labeled first-person data in a supervised fashion. 
However, important objects are strongly related to the camera wearer's internal state such as his intentions and attention, and thus, only the person wearing the camera can provide the importance labels. Such a constraint makes the annotation process costly and limited in scalability. In this work, we show that we can detect important objects in first-person images without the supervision by the camera wearer or even third-person labelers. We formulate an important detection problem as an interplay between the 1) segmentation and 2) recognition agents. The segmentation agent first proposes a possible important object segmentation mask for each image, and then feeds it to the recognition agent, which learns to predict an important object mask using visual semantics and spatial features. We implement such an interplay between both agents via an alternating cross-pathway supervision scheme inside our proposed Visual-Spatial Network (VSN). Our VSN consists of spatial (\"where\") and visual (\"what\") pathways, one of which learns common visual semantics while the other focuses on the spatial location cues. Our unsupervised learning is accomplished via a cross-pathway supervision, where one pathway feeds its predictions to a segmentation agent, which proposes a candidate important object segmentation mask that is then used by the other pathway as a supervisory signal. We show our method's success on two different important object datasets, where our method achieves similar or better results as the supervised methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bertasius_Unsupervised_Learning_of_ICCV_2017_paper.pdf", @@ -17477,7 +18592,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bertasius_2017_ICCV,\n \n author = {\n Bertasius,\n Gedas and Soo Park,\n Hyun and Yu,\n Stella X. 
and Shi,\n Jianbo\n},\n title = {\n Unsupervised Learning of Important Objects From First-Person Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Learning of Object Landmarks by Factorized Spatial Embeddings", @@ -17485,6 +18601,7 @@ "status": "Oral", "track": "main", "pid": "1340", + "author_site": "James Thewlis; Hakan Bilen; Andrea Vedaldi", "author": "James Thewlis; Hakan Bilen; Andrea Vedaldi", "abstract": "Automatically learning the structure of object categories remains an important open problem in computer vision. We propose a novel unsupervised approach that can discover and learn to detect landmarks in object categories, thus characterizing their structure. Our approach is based on factorizing image deformations, as induced by a viewpoint change or an object articulation, by learning a deep neural network that detects landmarks compatible with such visual effects. We show that, by requiring the same neural network to be applicable to different object instances, our method naturally induces meaningful correspondences between different object instances in a category. We assess the method qualitatively on a variety of object types, natural an man-made. 
We also show that our unsupervised landmarks are highly predictive of manually-annotated landmarks in faces benchmark datasets, and can be used to regress those with a high degree of accuracy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Thewlis_Unsupervised_Learning_of_ICCV_2017_paper.pdf", @@ -17509,7 +18626,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Thewlis_2017_ICCV,\n \n author = {\n Thewlis,\n James and Bilen,\n Hakan and Vedaldi,\n Andrea\n},\n title = {\n Unsupervised Learning of Object Landmarks by Factorized Spatial Embeddings\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Learning of Stereo Matching", @@ -17517,6 +18635,7 @@ "status": "Poster", "track": "main", "pid": "833", + "author_site": "Chao Zhou; Hong Zhang; Xiaoyong Shen; Jiaya Jia", "author": "Chao Zhou; Hong Zhang; Xiaoyong Shen; Jiaya Jia", "abstract": "In recent years, convolutional neural networks have shown its strong power for stereo matching cost learning. Current approaches learn the parameters of their models from public datasets with ground truth disparity. However, due to the limitations of these datasets and the difficulty of collecting new stereo data, current methods fail in real-life cases. In this work, we present a framework for learning stereo matching cost without human supervision. Our method updates the network parameter in a iterative manner. It starts with randomly initialized network. Correct matchings are carefully picked and used as training data in each round. 
In the end, the networks converges to a stable state, which performs comparably with supervised methods on various benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhou_Unsupervised_Learning_of_ICCV_2017_paper.pdf", @@ -17534,14 +18653,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhou_Unsupervised_Learning_of_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent", "aff_unique_dep": ";Youtu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Tencent", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2017_ICCV,\n \n author = {\n Zhou,\n Chao and Zhang,\n Hong and Shen,\n Xiaoyong and Jia,\n Jiaya\n},\n title = {\n Unsupervised Learning of Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Object Segmentation in Video by Efficient Selection of Highly Probable Positive Features", @@ -17549,6 +18669,7 @@ "status": "Poster", "track": "main", "pid": "2400", + "author_site": "Emanuela Haller; Marius Leordeanu", "author": "Emanuela Haller; Marius Leordeanu", "abstract": "We address an essential problem in computer vision, that of unsupervised foreground object segmentation in video, where a main object of interest in a video sequence should be automatically separated from its background. An efficient solution to this task would enable large-scale video interpretation at a high semantic level in the absence of the costly manual labeling. 
We propose an efficient unsupervised method for generating foreground object soft masks based on automatic selection and learning from highly probable positive features. We show that such features can be selected efficiently by taking into consideration the spatio-temporal appearance and motion consistency of the object in the video sequence. We also emphasize the role of the contrasting properties between the foreground object and its background. Our model is created over several stages: we start from pixel level analysis and move to descriptors that consider information over groups of pixels combined with efficient motion analysis. We also prove theoretical properties of our unsupervised learning method, which under some mild constraints is guaranteed to learn the correct classifier even in the unsupervised case. We achieve competitive and even state of the art results on the challenging Youtube-Objects and SegTrack datasets, while being at least one order of magnitude faster than the competition. 
We believe that the strong performance of our method, along with its theoretical properties, constitute a solid step towards solving unsupervised discovery in video.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Haller_Unsupervised_Object_Segmentation_ICCV_2017_paper.pdf", @@ -17573,7 +18694,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "Romania" + "aff_country_unique": "Romania", + "bibtex": "@InProceedings{Haller_2017_ICCV,\n \n author = {\n Haller,\n Emanuela and Leordeanu,\n Marius\n},\n title = {\n Unsupervised Object Segmentation in Video by Efficient Selection of Highly Probable Positive Features\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Representation Learning by Sorting Sequences", @@ -17581,6 +18703,7 @@ "status": "Poster", "track": "main", "pid": "315", + "author_site": "Hsin-Ying Lee; Jia-Bin Huang; Maneesh Singh; Ming-Hsuan Yang", "author": "Hsin-Ying Lee; Jia-Bin Huang; Maneesh Singh; Ming-Hsuan Yang", "abstract": "We present an unsupervised representation learning approach using videos without semantic labels. We leverage the temporal coherence as a supervisory signal by formulating representation learning as a sequence sorting task. We take temporally shuffled frames (i.e. in non-chronological order) as inputs and train a convolutional neural network to sort the shuffled sequences. Similar to comparison-based sorting algorithms, we propose to extract features from all frame pairs and aggregate them to predict the correct order. As sorting shuffled image sequence requires an understanding of the statistical temporal structure of images, training with such a proxy task allows us to learn rich and generalizable visual representation. 
We validate the effectiveness of the learned representation using our method as pre-training on high-level recognition problems. The experimental results show that our method compares favorably against state-of-the-art methods on action recognition, image classification and object detection tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_Unsupervised_Representation_Learning_ICCV_2017_paper.pdf", @@ -17596,7 +18719,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Unsupervised_Representation_Learning_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_Unsupervised_Representation_Learning_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Hsin-Ying and Huang,\n Jia-Bin and Singh,\n Maneesh and Yang,\n Ming-Hsuan\n},\n title = {\n Unsupervised Representation Learning by Sorting Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Unsupervised Video Understanding by Reconciliation of Posture Similarities", @@ -17604,7 +18728,7 @@ "status": "Poster", "track": "main", "pid": "1863", - "author_site": "Timo Milbich; Miguel Bautista; Ekaterina Sutter; Bj\u00c3\u00b6rn Ommer", + "author_site": "Timo Milbich; Miguel Bautista; Ekaterina Sutter; Björn Ommer", "author": "Timo Milbich; Miguel Bautista; Ekaterina Sutter; Bjorn Ommer", "abstract": "Understanding human activity and being able to explain it in detail surpasses mere action classification by far in both complexity and value. The challenge is thus to describe an activity on the basis of its most fundamental constituents, the individual postures and their distinctive transitions. Supervised learning of such a fine-grained representation based on elementary poses is very tedious and does not scale. 
Therefore, we propose a completely unsupervised deep learning procedure based solely on video sequences, which starts from scratch without requiring pre-trained networks, predefined body models, or keypoints. A combinatorial sequence matching algorithm proposes relations between frames from subsets of the training data, while a CNN is reconciling the transitivity conflicts of the different subsets to learn a single concerted pose embedding despite changes in appearance across sequences. Without any manual annotation, the model learns a structured representation of postures and their temporal development. The model not only enables retrieval of similar postures but also temporal super-resolution. Additionally, based on a recurrent formulation, next frames can be synthesized.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Milbich_Unsupervised_Video_Understanding_ICCV_2017_paper.pdf", @@ -17629,7 +18753,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Milbich_2017_ICCV,\n \n author = {\n Milbich,\n Timo and Bautista,\n Miguel and Sutter,\n Ekaterina and Ommer,\n Bjorn\n},\n title = {\n Unsupervised Video Understanding by Reconciliation of Posture Similarities\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Using Sparse Elimination for Solving Minimal Problems in Computer Vision", @@ -17637,7 +18762,7 @@ "status": "Spotlight", "track": "main", "pid": "1399", - "author_site": "Janne Heikkil\u00c3\u00a4", + "author_site": "Janne Heikkilä", "author": "Janne Heikkila", "abstract": "Finding a closed form solution to a system of polynomial equations is a common problem in computer vision as well as in many other areas of engineering and science. 
Groebner basis techniques are often employed to provide the solution, but implementing an efficient Groebner basis solver to a given problem requires strong expertise in algebraic geometry. One can also convert the equations to a polynomial eigenvalue problem (PEP) and solve it using linear algebra, which is a more accessible approach for those who are not so familiar with algebraic geometry. In previous works PEP has been successfully applied for solving some relative pose problems in computer vision, but its wider exploitation is limited by the problem of finding a compact monomial basis. In this paper, we propose a new algorithm for selecting the basis that is in general more compact than the basis obtained with a state-of-the-art algorithm making PEP a more viable option for solving polynomial equations. Another contribution is that we present two minimal problems for camera self-calibration based on homography, and demonstrate experimentally using synthetic and real data that our algorithm can provide a numerically stable solution to the camera focal length from two homographies of unknown planar scene.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Heikkila_Using_Sparse_Elimination_ICCV_2017_paper.pdf", @@ -17660,7 +18785,8 @@ "aff_unique_url": "https://www.oulu.fi", "aff_unique_abbr": "", "aff_country_unique_index": "0", - "aff_country_unique": "Finland" + "aff_country_unique": "Finland", + "bibtex": "@InProceedings{Heikkila_2017_ICCV,\n \n author = {\n Heikkila,\n Janne\n},\n title = {\n Using Sparse Elimination for Solving Minimal Problems in Computer Vision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "VPGNet: Vanishing Point Guided Network for Lane and Road Marking Detection and Recognition", @@ -17668,6 +18794,7 @@ "status": "Poster", "track": "main", "pid": "822", + "author_site": "Seokju Lee; Junsik Kim; Jae Shin Yoon; 
Seunghak Shin; Oleksandr Bailo; Namil Kim; Tae-Hee Lee; Hyun Seok Hong; Seung-Hoon Han; In So Kweon", "author": "Seokju Lee; Junsik Kim; Jae Shin Yoon; Seunghak Shin; Oleksandr Bailo; Namil Kim; Tae-Hee Lee; Hyun Seok Hong; Seung-Hoon Han; In So Kweon", "abstract": "In this paper, we propose a unified end-to-end trainable multi-task network that jointly handles lane and road marking detection and recognition that is guided by a vanishing point under adverse weather conditions. We tackle rainy and low illumination conditions, which have not been extensively studied until now due to clear challenges. For example, images taken under rainy days are subject to low illumination, while wet roads cause light reflection and distort the appearance of lane and road markings. At night, color distortion occurs under limited illumination. As a result, no benchmark dataset exists and only a few developed algorithms work under poor weather conditions. To address this shortcoming, we build up a lane and road marking benchmark which consists of about 20,000 images with 17 lane and road marking classes under four different scenarios: no rain, rain, heavy rain, and night. We train and evaluate several versions of the proposed multi-task network and validate the importance of each task. The resulting approach, VPGNet, can detect and classify lanes and road markings, and predict a vanishing point with a single forward pass. Experimental results show that our approach achieves high accuracy and robustness under various conditions in real-time (20 fps). 
The benchmark and the VPGNet model will be publicly available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lee_VPGNet_Vanishing_Point_ICCV_2017_paper.pdf", @@ -17685,14 +18812,15 @@ "author_num": 10, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lee_VPGNet_Vanishing_Point_ICCV_2017_paper.html", "aff_unique_index": "0;0;0;0;0;0;1;1;1;0", - "aff_unique_norm": "KAIST;Samsung", + "aff_unique_norm": "KAIST;Samsung Electronics", "aff_unique_dep": "Robotics and Computer Vision Lab.;DMC R&D Center", "aff_unique_url": "https://www.kaist.ac.kr;https://www.samsung.com", "aff_unique_abbr": "KAIST;Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2017_ICCV,\n \n author = {\n Lee,\n Seokju and Kim,\n Junsik and Shin Yoon,\n Jae and Shin,\n Seunghak and Bailo,\n Oleksandr and Kim,\n Namil and Lee,\n Tae-Hee and Seok Hong,\n Hyun and Han,\n Seung-Hoon and So Kweon,\n In\n},\n title = {\n VPGNet: Vanishing Point Guided Network for Lane and Road Marking Detection and Recognition\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "VQS: Linking Segmentations to Questions and Answers for Supervised Attention in VQA and Question-Focused Semantic Segmentation", @@ -17700,6 +18828,7 @@ "status": "Poster", "track": "main", "pid": "667", + "author_site": "Chuang Gan; Yandong Li; Haoxiang Li; Chen Sun; Boqing Gong", "author": "Chuang Gan; Yandong Li; Haoxiang Li; Chen Sun; Boqing Gong", "abstract": "Rich and dense human labeled datasets are the main enabling factor, among others, for the recent exciting work on vision-language understanding. 
Many seemingly distinct annotations (e.g., semantic segmentation and visual questions answering (VQA)) are inherently connected in that they reveal different levels and perspectives of human understandings about the same visual scenes --- and even the same set of MS COCO images. The popularity of MS COCO could strongly correlate those annotations and tasks. Explicitly linking them up, as we envision, can significantly benefit not only individual tasks but also the overarching goal of unified vision-language understand. We present the preliminary work of linking the instance segmentations provided by MS COCO to the questions and answers (QA) in the VQA dataset. We call the collected links visual questions and segmentation answers (VQS). They transfer human supervision between the previously separate tasks, offer more effective leverage to existing problems, and also open the door for new tasks and richer models. We study two applications of the VQS data in this paper: supervised attention for VQA and a novel question-focused semantic segmentation task. For the former, we obtain state-of-the-art results on the VQA real multiple-choice task by simply augmenting multilayer perceptrons with some attention features that are learned by using the segmentation-QA links as explicit supervision. 
To put the latter in perspective, we study two plausible methods and an oracle upper bound.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gan_VQS_Linking_Segmentations_ICCV_2017_paper.pdf", @@ -17717,14 +18846,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Gan_VQS_Linking_Segmentations_ICCV_2017_paper.html", "aff_unique_index": "0;1;2;3;1", - "aff_unique_norm": "Tsinghua University;University of Central Florida;Adobe;Google", - "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;CRCV;Adobe Research;Google Research", + "aff_unique_norm": "Tsinghua University;University of Central Florida;Adobe Research;Google", + "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;CRCV;;Google Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucf.edu;https://research.adobe.com;https://research.google", "aff_unique_abbr": "THU;UCF;Adobe;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Gan_2017_ICCV,\n \n author = {\n Gan,\n Chuang and Li,\n Yandong and Li,\n Haoxiang and Sun,\n Chen and Gong,\n Boqing\n},\n title = {\n VQS: Linking Segmentations to Questions and Answers for Supervised Attention in VQA and Question-Focused Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "VegFru: A Domain-Specific Dataset for Fine-Grained Visual Categorization", @@ -17732,6 +18862,7 @@ "status": "Poster", "track": "main", "pid": "274", + "author_site": "Saihui Hou; Yushan Feng; Zilei Wang", "author": "Saihui Hou; Yushan Feng; Zilei Wang", "abstract": "VegFru: A Domain-Specific Dataset for Fine-grained Visual Categorization In this paper, we propose a novel 
domain-specific dataset named VegFru for fine-grained visual categorization (FGVC). While the existing datasets for FGVC are mainly focused on animal breeds or man-made objects with limited labelled data, VegFru is a larger dataset consisting of vegetables and fruits which are closely associated with the daily life of everyone. Aiming at domestic cooking and food management, VegFru categorizes vegetables and fruits according to their eating characteristics, and each image contains at least one edible part of vegetables or fruits with the same cooking usage. Particularly, all the images are labelled hierarchically. The current version covers vegetables and fruits of 25 upper-level categories and 292 subordinate classes. And it contains more than 160,000 images in total and at least 200 images for each subordinate class. Accompanying the dataset, we also propose an effective framework called HybridNet to exploit the label hierarchy for FGVC. Specifically, multiple granularity features are first extracted by dealing with the hierarchical labels separately. And then they are fused through explicit operation, e.g., Compact Bilinear Pooling, to form a unified representation for the ultimate recognition. The experimental results on the novel VegFru, the public FGVC-Aircraft and CUB-200-2011 indicate that HybridNet achieves one of the top performance on these datasets. 
The dataset and code are available at https://github.com/hshustc/vegfru.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hou_VegFru_A_Domain-Specific_ICCV_2017_paper.pdf", @@ -17756,7 +18887,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hou_2017_ICCV,\n \n author = {\n Hou,\n Saihui and Feng,\n Yushan and Wang,\n Zilei\n},\n title = {\n VegFru: A Domain-Specific Dataset for Fine-Grained Visual Categorization\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Video Deblurring via Semantic Segmentation and Pixel-Wise Non-Linear Kernel", @@ -17764,6 +18896,7 @@ "status": "Poster", "track": "main", "pid": "487", + "author_site": "Wenqi Ren; Jinshan Pan; Xiaochun Cao; Ming-Hsuan Yang", "author": "Wenqi Ren; Jinshan Pan; Xiaochun Cao; Ming-Hsuan Yang", "abstract": "Video deblurring is a challenging problem as the blur is complex and usually caused by the combination of camera shakes, object motions, and depth variations. Optical flow can be used for kernel estimation since it predicts motion trajectories. However, the estimates are often inaccurate in complex scenes at object boundaries, which are crucial in kernel estimation. In this paper, we exploit semantic segmentation in each blurry frame to understand the scene contents and use different motion models for image regions to guide optical flow estimation. While existing pixel-wise blur models assume that the blur kernel is the same as optical flow during the exposure time, this assumption does not hold when the motion blur trajectory at a pixel is different from the estimated linear optical flow. 
We analyze the relationship between motion blur trajectory and optical flow, and present a novel pixel-wise non-linear kernel model to account for motion blur. The proposed blur model is based on the non-linear optical flow, which describes complex motion blur more effectively. Extensive experiments on challenging blurry videos demonstrate the proposed algorithm performs favorably against the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Ren_Video_Deblurring_via_ICCV_2017_paper.pdf", @@ -17779,7 +18912,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ren_Video_Deblurring_via_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Ren_Video_Deblurring_via_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Ren_2017_ICCV,\n \n author = {\n Ren,\n Wenqi and Pan,\n Jinshan and Cao,\n Xiaochun and Yang,\n Ming-Hsuan\n},\n title = {\n Video Deblurring via Semantic Segmentation and Pixel-Wise Non-Linear Kernel\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Video Fill in the Blank Using LR/RL LSTMs With Spatial-Temporal Attentions", @@ -17787,6 +18921,7 @@ "status": "Poster", "track": "main", "pid": "346", + "author_site": "Amir Mazaheri; Dong Zhang; Mubarak Shah", "author": "Amir Mazaheri; Dong Zhang; Mubarak Shah", "abstract": "Given a video and a description sentence with one missing word, \"source sentence\", Video-Fill-In-the-Blank (VFIB) problem is to find the missing word automatically. The contextual information of the sentence, as well as visual cues from the video, are important to infer the missing word accurately. 
Since the source sentence is broken into two fragments: the sentence's left fragment (before the blank) and the sentence's right fragment (after the blank), traditional Recurrent Neural Networks cannot encode this structure accurately because of many possible variations of the missing word in terms of the location and type of the word in the source sentence. For example, a missing word can be the first word or be in the middle of the sentence and it can be a verb or an adjective. In this paper, we propose a framework to tackle the textual encoding: Two separate LSTMs (the LR and RL LSTMs) are employed to encode the left and right sentence fragments and a novel structure is introduced to combine each fragment with an \"external memory\" corresponding to the opposite fragments. For the visual encoding, end-to-end spatial and temporal attention models are employed to select discriminative visual representations to find the missing word. In the experiments, we demonstrate the superior performance of the proposed method on challenging VFIB problem. Furthermore, we introduce an extended and more generalized version of VFIB, which is not limited to a single blank. 
Our experiments indicate the generalization capability of our method in dealing with such more realistic scenarios.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Mazaheri_Video_Fill_in_ICCV_2017_paper.pdf", @@ -17811,7 +18946,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Orlando", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mazaheri_2017_ICCV,\n \n author = {\n Mazaheri,\n Amir and Zhang,\n Dong and Shah,\n Mubarak\n},\n title = {\n Video Fill in the Blank Using LR/RL LSTMs With Spatial-Temporal Attentions\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Video Frame Interpolation via Adaptive Separable Convolution", @@ -17819,6 +18955,7 @@ "status": "Poster", "track": "main", "pid": "310", + "author_site": "Simon Niklaus; Long Mai; Feng Liu", "author": "Simon Niklaus; Long Mai; Feng Liu", "abstract": "Standard video frame interpolation methods first estimate optical flow between input frames and then synthesize an intermediate frame guided by motion. Recent approaches merge these two steps into a single convolution process by convolving input frames with spatially adaptive kernels that account for motion and re-sampling simultaneously. These methods require large kernels to handle large motion, which limits the number of pixels whose kernels can be estimated at once due to the large memory demand. To address this problem, this paper formulates frame interpolation as local separable convolution over input frames using pairs of 1D kernels. Compared to regular 2D kernels, the 1D kernels require significantly fewer parameters to be estimated. Our method develops a deep fully convolutional neural network that takes two input frames and estimates pairs of 1D kernels for all pixels simultaneously. 
Since our method is able to estimate kernels and synthesizes the whole video frame at once, it allows for the incorporation of perceptual loss to train the neural network to produce visually pleasing frames. This deep neural network is trained end-to-end using widely available video data without any human annotation. Both qualitative and quantitative experiments show that our method provides a practical solution to high-quality video frame interpolation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Niklaus_Video_Frame_Interpolation_ICCV_2017_paper.pdf", @@ -17843,7 +18980,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Niklaus_2017_ICCV,\n \n author = {\n Niklaus,\n Simon and Mai,\n Long and Liu,\n Feng\n},\n title = {\n Video Frame Interpolation via Adaptive Separable Convolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Video Frame Synthesis Using Deep Voxel Flow", @@ -17851,6 +18989,7 @@ "status": "Oral", "track": "main", "pid": "1324", + "author_site": "Ziwei Liu; Raymond A. Yeh; Xiaoou Tang; Yiming Liu; Aseem Agarwala", "author": "Ziwei Liu; Raymond A. Yeh; Xiaoou Tang; Yiming Liu; Aseem Agarwala", "abstract": "We address the problem of synthesizing new video frames in an existing video, either in-between existing frames (interpolation), or subsequent to them (extrapolation). This problem is challenging because video appearance and motion can be highly complex. Traditional optical-flow-based solutions often fail where flow estimation is challenging, while newer neural-network-based methods that hallucinate pixel values directly often produce blurry results. 
We combine the advantages of these two methods by training a deep network that learns to synthesize video frames by flowing pixel values from existing ones, which we call deep voxel flow. Our method requires no human supervision, and any video can be used as training data by dropping, and then learning to predict, existing frames. The technique is efficient, and can be applied at any video resolution. We demonstrate that our method produces results that both quantitatively and qualitatively improve upon the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Liu_Video_Frame_Synthesis_ICCV_2017_paper.pdf", @@ -17865,7 +19004,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Video_Frame_Synthesis_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Liu_Video_Frame_Synthesis_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Liu_2017_ICCV,\n \n author = {\n Liu,\n Ziwei and Yeh,\n Raymond A. and Tang,\n Xiaoou and Liu,\n Yiming and Agarwala,\n Aseem\n},\n title = {\n Video Frame Synthesis Using Deep Voxel Flow\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Video Reflection Removal Through Spatio-Temporal Optimization", @@ -17873,6 +19013,7 @@ "status": "Poster", "track": "main", "pid": "1277", + "author_site": "Ajay Nandoriya; Mohamed Elgharib; Changil Kim; Mohamed Hefeeda; Wojciech Matusik", "author": "Ajay Nandoriya; Mohamed Elgharib; Changil Kim; Mohamed Hefeeda; Wojciech Matusik", "abstract": "Reflections can obstruct content during video capture and hence their removal is desirable. Current removal techniques are designed for still images, extracting only one reflection (foreground) and one background layer from the input. 
When extended to videos, unpleasant artifacts such as temporal flickering and incomplete separation are generated. We present a technique for video reflection removal by jointly solving for motion and separation. The novelty of our work is in our optimization formulation as well as the motion initialization strategy. We present a novel spatio-temporal optimization that takes n frames as input and directly estimates 2n frames as output, n for each layer. We aim to fully utilize spatio-temporal information in our objective terms. Our motion initialization is based on iterative frame-to-frame alignment instead of the direct alignment used by current approaches. We compare against advanced video extensions of the state of the art, and we significantly reduce temporal flickering and improve separation. In addition, we reduce image blur and recover moving objects more accurately. We validate our approach through subjective and objective evaluations on real and controlled data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Nandoriya_Video_Reflection_Removal_ICCV_2017_paper.pdf", @@ -17888,7 +19029,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Nandoriya_Video_Reflection_Removal_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Nandoriya_Video_Reflection_Removal_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Nandoriya_2017_ICCV,\n \n author = {\n Nandoriya,\n Ajay and Elgharib,\n Mohamed and Kim,\n Changil and Hefeeda,\n Mohamed and Matusik,\n Wojciech\n},\n title = {\n Video Reflection Removal Through Spatio-Temporal Optimization \n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Video Scene Parsing With Predictive Feature Learning", @@ -17896,6 +19038,7 @@ "status": "Poster", "track": "main", "pid": "2905", + "author_site": 
"Xiaojie Jin; Xin Li; Huaxin Xiao; Xiaohui Shen; Zhe Lin; Jimei Yang; Yunpeng Chen; Jian Dong; Luoqi Liu; Zequn Jie; Jiashi Feng; Shuicheng Yan", "author": "Xiaojie Jin; Xin Li; Huaxin Xiao; Xiaohui Shen; Zhe Lin; Jimei Yang; Yunpeng Chen; Jian Dong; Luoqi Liu; Zequn Jie; Jiashi Feng; Shuicheng Yan", "abstract": "Video scene parsing is challenging due to the following two reasons: firstly, it is non-trivial to learn meaningful video representations for producing the temporally consistent labeling map; secondly, such a learning process becomes more difficult with insufficient labeled video training data. In this work, we propose a unified framework to address the above two problems, which is to our knowledge the first model to employ predictive feature learning in the video scene parsing. The predictive feature learning is carried out in two predictive tasks: frame prediction and predictive parsing. It is experimentally proved that the learned predictive features in our model are able to significantly enhance the video parsing performance by combining with the standard image parsing network. Interestingly, the performance gain brought by the predictive learning is almost costless as the features are learned from a large amount of unlabeled video data in an unsupervised way. 
Extensive experiments over two challenging datasets, Cityscapes and Camvid, have demonstrated the effectiveness of our model by showing remarkable improvement over well-established baselines.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Jin_Video_Scene_Parsing_ICCV_2017_paper.pdf", @@ -17911,7 +19054,8 @@ "aff_domain": ";;;;;;;;;;;", "email": ";;;;;;;;;;;", "author_num": 12, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jin_Video_Scene_Parsing_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Jin_Video_Scene_Parsing_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Jin_2017_ICCV,\n \n author = {\n Jin,\n Xiaojie and Li,\n Xin and Xiao,\n Huaxin and Shen,\n Xiaohui and Lin,\n Zhe and Yang,\n Jimei and Chen,\n Yunpeng and Dong,\n Jian and Liu,\n Luoqi and Jie,\n Zequn and Feng,\n Jiashi and Yan,\n Shuicheng\n},\n title = {\n Video Scene Parsing With Predictive Feature Learning\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "View Adaptive Recurrent Neural Networks for High Performance Human Action Recognition From Skeleton Data", @@ -17919,6 +19063,7 @@ "status": "Poster", "track": "main", "pid": "683", + "author_site": "Pengfei Zhang; Cuiling Lan; Junliang Xing; Wenjun Zeng; Jianru Xue; Nanning Zheng", "author": "Pengfei Zhang; Cuiling Lan; Junliang Xing; Wenjun Zeng; Jianru Xue; Nanning Zheng", "abstract": "Skeleton-based human action recognition has recently attracted increasing attention due to the popularity of 3D skeleton data. One main challenge lies in the large view variations in captured human actions. We propose a novel view adaptation scheme to automatically regulate observation viewpoints during the occurrence of an action. 
Rather than re-positioning the skeletons based on a human defined prior criterion, we design a view adaptive recurrent neural network (RNN) with LSTM architecture, which enables the network itself to adapt to the most suitable observation viewpoints from end to end. Extensive experiment analyses show that the proposed view adaptive RNN model strives to (1) transform the skeletons of various views to much more consistent viewpoints and (2) maintain the continuity of the action rather than transforming every frame to the same position with the same body orientation. Our model achieves significant improvement over the state-of-the-art approaches on three benchmark datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_View_Adaptive_Recurrent_ICCV_2017_paper.pdf", @@ -17933,7 +19078,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_View_Adaptive_Recurrent_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_View_Adaptive_Recurrent_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhang_2017_ICCV,\n \n author = {\n Zhang,\n Pengfei and Lan,\n Cuiling and Xing,\n Junliang and Zeng,\n Wenjun and Xue,\n Jianru and Zheng,\n Nanning\n},\n title = {\n View Adaptive Recurrent Neural Networks for High Performance Human Action Recognition From Skeleton Data\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Visual Forecasting by Imitating Dynamics in Natural Sequences", @@ -17941,6 +19087,7 @@ "status": "Spotlight", "track": "main", "pid": "177", + "author_site": "Kuo-Hao Zeng; William B. Shen; De-An Huang; Min Sun; Juan Carlos Niebles", "author": "Kuo-Hao Zeng; William B. 
Shen; De-An Huang; Min Sun; Juan Carlos Niebles", "abstract": "We introduce a general framework for visual forecasting, which directly imitates visual sequences without additional supervision. As a result, our model can be applied at several semantic levels and does not require any domain knowledge or handcrafted features. We achieve this by formulating visual forecasting as an inverse reinforcement learning (IRL) problem, and directly imitate the dynamics in natural sequences from their raw pixel values. The key challenge is the high-dimensional and continuous state-action space that prohibits the application of previous IRL algorithms. We address this computational bottleneck by extending recent progress in model-free imitation with trainable deep feature representations, which (1) bypasses the exhaustive state-action pair visits in dynamic programming by using a dual formulation and (2) avoids explicit state sampling at gradient computation using a deep feature reparametrization. This allows us to apply IRL at scale and directly imitate the dynamics in high-dimensional continuous visual sequences from the raw pixel values. We evaluate our approach at three different level-of-abstraction, from low level pixels to higher level semantics: future frame generation, action anticipation, visual story forecasting. At all levels, our approach outperforms existing methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zeng_Visual_Forecasting_by_ICCV_2017_paper.pdf", @@ -17955,7 +19102,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zeng_Visual_Forecasting_by_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zeng_Visual_Forecasting_by_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zeng_2017_ICCV,\n \n author = {\n Zeng,\n Kuo-Hao and Shen,\n William B. 
and Huang,\n De-An and Sun,\n Min and Carlos Niebles,\n Juan\n},\n title = {\n Visual Forecasting by Imitating Dynamics in Natural Sequences\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Visual Odometry for Pixel Processor Arrays", @@ -17963,6 +19111,7 @@ "status": "Spotlight", "track": "main", "pid": "2507", + "author_site": "Laurie Bose; Jianing Chen; Stephen J. Carey; Piotr Dudek; Walterio Mayol-Cuevas", "author": "Laurie Bose; Jianing Chen; Stephen J. Carey; Piotr Dudek; Walterio Mayol-Cuevas", "abstract": "We present an approach of estimating constrained motion of a novel Cellular Processor Array (CPA) camera, on which each pixel is capable of limited processing and data storage allowing for fast low power parallel computation to be carried out directly on the focal-plane of the device. Rather than the standard pipeline involved with traditional cameras whereby whole camera images are transferred to a general computer system for processing, our approach performs all computation upon the CPA itself, with the only information being transfered to a standard computer being the camera's estimated motion.This limited data transfer allows for high frame-rate processing at hundreds of hz while consuming less than 1.5 Watts of power.The current implementation is restricted to the estimation of the camera's rotation in yaw and pitch, along with a scaleless estimate of the camera's forward and backward translation. 
We describe methods of image alignment by gradient descent, edge detection, and image scaling, all of which are performed solely on the CPA device itself and which form the core components of detecting camera motion.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Bose_Visual_Odometry_for_ICCV_2017_paper.pdf", @@ -17987,7 +19136,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bose_2017_ICCV,\n \n author = {\n Bose,\n Laurie and Chen,\n Jianing and Carey,\n Stephen J. and Dudek,\n Piotr and Mayol-Cuevas,\n Walterio\n},\n title = {\n Visual Odometry for Pixel Processor Arrays\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Visual Relationship Detection With Internal and External Linguistic Knowledge Distillation", @@ -17995,6 +19145,7 @@ "status": "Poster", "track": "main", "pid": "906", + "author_site": "Ruichi Yu; Ang Li; Vlad I. Morariu; Larry S. Davis", "author": "Ruichi Yu; Ang Li; Vlad I. Morariu; Larry S. Davis", "abstract": "Understanding the visual relationship between two objects involves identifying the subject, the object, and a predicate relating them.We leverage the strong correlations between the predicate and the (subj,obj) pair (both semantically and spatially) to predict predicates conditioned on the subjects and the objects. Modeling the three entities jointly more accurately reflects their relationships compared to modeling them independently, but it complicates learning since the semantic space of visual relationships is huge and training data is limited, especially for long-tail relationships that have few instances. To overcome this, we use knowledge of linguistic statistics to regularize visual model learning. 
We obtain linguistic knowledge by mining from both training annotations (internal knowledge) and publicly available text, e.g., Wikipedia (external knowledge), computing the conditional probability distribution of a predicate given a (subj,obj) pair. As we train the visual model, we distill this knowledge into the deep model to achieve better generalization. Our experimental results on the Visual Relationship Detection (VRD) and Visual Genome datasets suggest that with this linguistic knowledge distillation, our model outperforms the state-of-the-art methods significantly, especially when predicting unseen relationships (e.g., recall improved from 8.45% to 19.17% on VRD zero-shot testing set).", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Yu_Visual_Relationship_Detection_ICCV_2017_paper.pdf", @@ -18019,7 +19170,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yu_2017_ICCV,\n \n author = {\n Yu,\n Ruichi and Li,\n Ang and Morariu,\n Vlad I. and Davis,\n Larry S.\n},\n title = {\n Visual Relationship Detection With Internal and External Linguistic Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Visual Semantic Planning Using Deep Successor Representations", @@ -18027,6 +19179,7 @@ "status": "Poster", "track": "main", "pid": "202", + "author_site": "Yuke Zhu; Daniel Gordon; Eric Kolve; Dieter Fox; Li Fei-Fei; Abhinav Gupta; Roozbeh Mottaghi; Ali Farhadi", "author": "Yuke Zhu; Daniel Gordon; Eric Kolve; Dieter Fox; Li Fei-Fei; Abhinav Gupta; Roozbeh Mottaghi; Ali Farhadi", "abstract": "A crucial capability of real-world intelligent agents is their ability to plan a sequence of actions to achieve their goals in the visual world. 
In this work, we address the problem of visual semantic planning: the task of predicting a sequence of actions from visual observations that transform a dynamic environment from an initial state to a goal state. Doing so entails knowledge about objects and their affordances, as well as actions and their preconditions and effects. We propose learning these through interacting with a visual and dynamic environment. Our proposed solution involves bootstrapping reinforcement learning with imitation learning. To ensure cross task generalization, we develop a deep predictive model based on successor representations. Our experimental results show near optimal results across a wide range of tasks in the challenging THOR environment. The supplementary video can be accessed at the following link: https://goo.gl/vXsbQP.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhu_Visual_Semantic_Planning_ICCV_2017_paper.pdf", @@ -18042,7 +19195,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Visual_Semantic_Planning_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Zhu_Visual_Semantic_Planning_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Zhu_2017_ICCV,\n \n author = {\n Zhu,\n Yuke and Gordon,\n Daniel and Kolve,\n Eric and Fox,\n Dieter and Fei-Fei,\n Li and Gupta,\n Abhinav and Mottaghi,\n Roozbeh and Farhadi,\n Ali\n},\n title = {\n Visual Semantic Planning Using Deep Successor Representations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Visual Transformation Aided Contrastive Learning for Video-Based Kinship Verification", @@ -18050,7 +19204,7 @@ "status": "Poster", "track": "main", "pid": "1107", - "author_site": "Hamdi Dibeklio\u00c4\u009flu", + "author_site": "Hamdi Dibeklioğlu", "author": "Hamdi Dibeklioglu", "abstract": 
"Automatic kinship verification from facial information is a relatively new and open research problem in computer vision. This paper explores the possibility of learning an efficient facial representation for video-based kinship verification by exploiting the visual transformation between facial appearance of kin pairs. To this end, a Siamese-like coupled convolutional encoder-decoder network is proposed. To reveal resemblance patterns of kinship while discarding the similarity patterns that can also be observed between people who do not have a kin relationship, a novel contrastive loss function is defined in the visual appearance space. For further optimization, the learned representation is fine-tuned using a feature-based contrastive loss. An expression matching procedure is employed in the model to minimize the negative influence of expression differences between kin pairs. Each kin video is analyzed by a sliding temporal window to leverage short-term facial dynamics. The effectiveness of the proposed method is assessed on seven different kin relationships using smile videos of kin pairs. 
On the average, 93.65% verification accuracy is achieved, improving the state of the art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Dibeklioglu_Visual_Transformation_Aided_ICCV_2017_paper.pdf", @@ -18065,7 +19219,8 @@ "aff_domain": "", "email": "", "author_num": 1, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dibeklioglu_Visual_Transformation_Aided_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Dibeklioglu_Visual_Transformation_Aided_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Dibeklioglu_2017_ICCV,\n \n author = {\n Dibeklioglu,\n Hamdi\n},\n title = {\n Visual Transformation Aided Contrastive Learning for Video-Based Kinship Verification\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Volumetric Flow Estimation for Incompressible Fluids Using the Stationary Stokes Equations", @@ -18073,6 +19228,7 @@ "status": "Poster", "track": "main", "pid": "1123", + "author_site": "Katrin Lasinger; Christoph Vogel; Konrad Schindler", "author": "Katrin Lasinger; Christoph Vogel; Konrad Schindler", "abstract": "In experimental fluid dynamics, the flow in a volume of fluid is observed by injecting high-contrast tracer particles and tracking them in multi-view video. Fluid dynamics researchers have developed variants of space-carving to reconstruct the 3D particle distribution at a given time-step, and then use relatively simple local matching to recover the motion over time. On the contrary, estimating the optical flow between two consecutive images is a long-standing standard problem in computer vision, but only little work exists about volumetric 3D flow. Here, we propose a variational method for 3D fluid flow estimation from multi-view data. 
We start from a 3D version of the standard variational flow model, and investigate different regularization schemes that ensure divergence-free flow fields, to account for the physics of incompressible fluids. Moreover, we propose a semi-dense formulation, to cope with the computational demands of large volumetric datasets. Flow is estimated and regularized at a lower spatial resolution, while the data term is evaluated at full resolution to preserve the discriminative power and geometric precision of the local particle distribution. Extensive experiments reveal that a simple sum of squared differences (SSD) is the most suitable data term for our application. For regularization, an energy whose Euler-Lagrange equations correspond to the stationary Stokes equations leads to the best results. This strictly enforces a divergence-free flow and additionally penalizes the squared gradient of the flow.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lasinger_Volumetric_Flow_Estimation_ICCV_2017_paper.pdf", @@ -18088,7 +19244,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lasinger_Volumetric_Flow_Estimation_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lasinger_Volumetric_Flow_Estimation_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Lasinger_2017_ICCV,\n \n author = {\n Lasinger,\n Katrin and Vogel,\n Christoph and Schindler,\n Konrad\n},\n title = {\n Volumetric Flow Estimation for Incompressible Fluids Using the Stationary Stokes Equations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Wavelet-SRNet: A Wavelet-Based CNN for Multi-Scale Face Super Resolution", @@ -18096,6 +19253,7 @@ "status": "Poster", "track": "main", "pid": "796", + "author_site": "Huaibo Huang; Ran He; Zhenan Sun; Tieniu Tan", "author": "Huaibo Huang; Ran He; 
Zhenan Sun; Tieniu Tan", "abstract": "Most modern face super-resolution methods resort to convolutional neural networks (CNN) to infer high-resolution (HR) face images. When dealing with very low resolution (LR) images, the performance of these CNN based methods greatly degrades. Meanwhile, these methods tend to produce over-smoothed outputs and miss some textural details. To address these challenges, this paper presents a wavelet-based CNN approach that can ultra-resolve a very low resolution face image of 16x16 or smaller pixel-size to its larger version of multiple scaling factors (2x, 4x, 8x and even 16x) in a unified framework. Different from conventional CNN methods directly inferring HR images, our approach firstly learns to predict the LR's corresponding series of HR's wavelet coefficients before reconstructing HR images from them. To capture both global topology information and local texture details of human faces, we present a flexible and extensible convolutional neural network with three types of loss: wavelet prediction loss, texture loss and full-image loss. 
Extensive experiments demonstrate that the proposed approach achieves more appealing results both quantitatively and qualitatively than state-of-the-art super-resolution methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Huang_Wavelet-SRNet_A_Wavelet-Based_ICCV_2017_paper.pdf", @@ -18120,7 +19278,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2017_ICCV,\n \n author = {\n Huang,\n Huaibo and He,\n Ran and Sun,\n Zhenan and Tan,\n Tieniu\n},\n title = {\n Wavelet-SRNet: A Wavelet-Based CNN for Multi-Scale Face Super Resolution\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "WeText: Scene Text Detection Under Weak Supervision", @@ -18128,6 +19287,7 @@ "status": "Poster", "track": "main", "pid": "515", + "author_site": "Shangxuan Tian; Shijian Lu; Chongshou Li", "author": "Shangxuan Tian; Shijian Lu; Chongshou Li", "abstract": "The requiring of large amounts of annotated training data has become a common constraint on various deep learning systems. In this paper, we propose a weakly supervised scene text detection method (WeText) that trains robust and accurate scene text detection models by learning from unannotated or weakly annotated data. With a \"light\" supervised model trained on a small fully annotated dataset, we explore semi-supervised and weakly supervised learning on a large unannotated dataset and a large weakly annotated dataset, respectively. For the unsupervised learning, the light supervised model is applied to the unannotated dataset to search for more character training samples, which are further combined with the small annotated dataset to retrain a superior character detection model. 
For the weakly supervised learning, the character searching is guided by high-level annotations of words/text lines that are widely available and also much easier to prepare. In addition, we design an unified scene character detector by adapting regression based deep networks, which greatly relieves the error accumulation issue that widely exists in most traditional approaches. Extensive experiments across different unannotated and weakly annotated datasets show that the scene text detection performance can be clearly boosted under both scenarios, where the weakly supervised learning can achieve the state-of-the-art performance by using only 229 fully annotated scene text images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tian_WeText_Scene_Text_ICCV_2017_paper.pdf", @@ -18152,7 +19312,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Tian_2017_ICCV,\n \n author = {\n Tian,\n Shangxuan and Lu,\n Shijian and Li,\n Chongshou\n},\n title = {\n WeText: Scene Text Detection Under Weak Supervision\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Weakly Supervised Learning of Deep Metrics for Stereo Reconstruction", @@ -18160,11 +19321,11 @@ "status": "Poster", "track": "main", "pid": "530", - "author_site": "Stepan Tulyakov; Anton Ivanov; Fran\u00c3\u00a7ois Fleuret", + "author_site": "Stepan Tulyakov; Anton Ivanov; François Fleuret", "author": "Stepan Tulyakov; Anton Ivanov; Francois Fleuret", "abstract": "Deep-learning metrics have recently demonstrated extremely good performance to match image patches for stereo reconstruction. 
However, training such metrics requires large amount of labeled stereo images, which can be difficult or costly to collect for certain applications (consider for example satellite stereo imaging). Moreover, labels from the depth sensors are often noisy. The main contribution of our work is a new weakly-supervised method for learning deep metrics from unlabeled stereo images, given coarse information about the scenes and the optical system. Our method alternatively optimizes the metric with a standard stochastic gradient descent, and applies stereo constraints to regularize its prediction. Experiments on reference data-sets show that, for a given network architecture, training with this new method without ground-truth produces a metric with performance as good as state-of-the-art baselines trained with the said ground-truth. This work has three practical implications. Firstly, it helps to overcome limitations of training sets, in particular noisy ground truth. Secondly it allows to use much more training data during learning. 
Thirdly, it allows to tune deep metric for a particular stereo system, even if ground truth is not available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tulyakov_Weakly_Supervised_Learning_ICCV_2017_paper.pdf", - "aff": "\u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL), Switzerland; \u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL), Switzerland; Idiap Research Institute, Switzerland", + "aff": "´Ecole Polytechnique F´ed´erale de Lausanne (EPFL), Switzerland; ´Ecole Polytechnique F´ed´erale de Lausanne (EPFL), Switzerland; Idiap Research Institute, Switzerland", "project": "", "github": "", "supp": "", @@ -18178,14 +19339,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tulyakov_Weakly_Supervised_Learning_ICCV_2017_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "EPFL;Idiap Research Institute", + "aff_unique_norm": "Ecole Polytechnique Fédérale de Lausanne;Idiap Research Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.idiap.ch", "aff_unique_abbr": "EPFL;Idiap", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Tulyakov_2017_ICCV,\n \n author = {\n Tulyakov,\n Stepan and Ivanov,\n Anton and Fleuret,\n Francois\n},\n title = {\n Weakly Supervised Learning of Deep Metrics for Stereo Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Weakly Supervised Manifold Learning for Dense Semantic Object Correspondence", @@ -18193,6 +19355,7 @@ "status": "Poster", "track": "main", "pid": "916", + "author_site": "Utkarsh Gaur; B. S. Manjunath", "author": "Utkarsh Gaur; B. S. 
Manjunath", "abstract": "The goal of the semantic object correspondence problem is to compute dense association maps for a pair of images such that the same object parts get matched for very different appearing object instances. Our method builds on the recent findings that deep convolutional neural networks (DCNNs) implicitly learn a latent model of object parts even when trained for classification. We also leverage a key correspondence problem insight that the geometric structure between object parts is consistent across multiple object instances. These two concepts are then combined in the form of a novel optimization scheme. This optimization learns a feature embedding by rewarding for projecting features closer on the manifold if they have low feature-space distance. Simultaneously, the optimization penalizes feature clusters whose geometric structure is inconsistent with the observed geometric structure of object parts. In this manner, by accounting for feature space similarities and feature neighborhood context together, a manifold is learned where features belonging to semantically similar object parts cluster together. We also describe transferring these embedded features to the sister tasks of semantic keypoint classification and localization task via a Siamese DCNN. We provide qualitative results on the Pascal VOC 2012 images and quantitative results on the Pascal Berkeley dataset where we improve on the state of the art by over 5% on classification and over 9% on localization tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Gaur_Weakly_Supervised_Manifold_ICCV_2017_paper.pdf", @@ -18217,7 +19380,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gaur_2017_ICCV,\n \n author = {\n Gaur,\n Utkarsh and Manjunath,\n B. 
S.\n},\n title = {\n Weakly Supervised Manifold Learning for Dense Semantic Object Correspondence\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Weakly Supervised Object Localization Using Things and Stuff Transfer", @@ -18225,6 +19389,7 @@ "status": "Poster", "track": "main", "pid": "1318", + "author_site": "Miaojing Shi; Holger Caesar; Vittorio Ferrari", "author": "Miaojing Shi; Holger Caesar; Vittorio Ferrari", "abstract": "We propose to help weakly supervised object localization for classes where location annotations are not available, by transferring things and stuff knowledge from a source set with available annotations. The source and target classes might share similar appearance (e.g. bear fur is similar to cat fur) or appear against similar background (e.g. horse and sheep appear against grass). To exploit this, we acquire three types of knowledge from the source set: a segmentation model trained on both thing and stuff classes; similarity relations between target and source classes; and co-occurrence relations between thing and stuff classes in the source. The segmentation model is used to generate thing and stuff segmentation maps on a target image, while the class similarity and co-occurrence knowledge help refining them. We then incorporate these maps as new cues into a multiple instance learning framework (MIL), propagating the transferred knowledge from the pixel level to the object proposal level. In extensive experiments, we conduct our transfer from the PASCAL Context dataset (source) to the ILSVRC, COCO and PASCAL VOC 2007 datasets (targets). We evaluate our transfer across widely different thing classes, including some that are not similar in appearance, but appear against similar background. 
The results demonstrate significant improvement over standard MIL, and we outperform the state-of-the-art in the transfer setting.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Shi_Weakly_Supervised_Object_ICCV_2017_paper.pdf", @@ -18249,7 +19414,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Shi_2017_ICCV,\n \n author = {\n Shi,\n Miaojing and Caesar,\n Holger and Ferrari,\n Vittorio\n},\n title = {\n Weakly Supervised Object Localization Using Things and Stuff Transfer\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Weakly Supervised Summarization of Web Videos", @@ -18257,6 +19423,7 @@ "status": "Poster", "track": "main", "pid": "1507", + "author_site": "Rameswar Panda; Abir Das; Ziyan Wu; Jan Ernst; Amit K. Roy-Chowdhury", "author": "Rameswar Panda; Abir Das; Ziyan Wu; Jan Ernst; Amit K. Roy-Chowdhury", "abstract": "Most of the prior works summarize videos by either exploring different heuristically designed criteria in an unsupervised way or developing fully supervised algorithms by leveraging human-crafted training data in form of video-summary pairs or importance annotations. However, unsupervised methods are blind to the video category and often fail to produce semantically meaningful video summaries. On the other hand, acquisition of large amount of training data in supervised approaches is non-trivial and may lead to a biased model. Different from existing methods, we introduce a weakly supervised approach that requires only video-level annotation for summarizing web videos. 
Casting the problem as a weakly supervised learning problem, we propose a flexible deep 3D CNN architecture to learn the notion of importance using only video-level annotation, and without any human-crafted training data. Specifically, our main idea is to leverage multiple videos of a category to automatically learn a parametric model for categorizing videos and then adopt the model to find important segments from a given video as the ones which have maximum influence to the model output. Furthermore, to unleash the full potential of our 3D CNN architecture, we also explored a series of good practices to reduce the influence of limited training data while summarizing videos. Experiments on two challenging and diverse datasets well demonstrate that our approach produces superior quality video summaries compared to several recently proposed approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Panda_Weakly_Supervised_Summarization_ICCV_2017_paper.pdf", @@ -18281,7 +19448,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Riverside;", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Panda_2017_ICCV,\n \n author = {\n Panda,\n Rameswar and Das,\n Abir and Wu,\n Ziyan and Ernst,\n Jan and Roy-Chowdhury,\n Amit K.\n},\n title = {\n Weakly Supervised Summarization of Web Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Weakly- and Self-Supervised Learning for Content-Aware Deep Image Retargeting", @@ -18289,6 +19457,7 @@ "status": "Spotlight", "track": "main", "pid": "1241", + "author_site": "Donghyeon Cho; Jinsun Park; Tae-Hyun Oh; Yu-Wing Tai; In So Kweon", "author": "Donghyeon Cho; Jinsun Park; Tae-Hyun Oh; Yu-Wing Tai; In So Kweon", "abstract": "This paper proposes a weakly- and self-supervised deep 
convolutional neural network (WSSDCNN) for content-aware image retargeting. Our network takes a source image and a target aspect ratio, and then directly outputs a retargeted image. Retargeting is performed through a shift map, which is a pixel-wise mapping from the source to the target grid. Our method implicitly learns an attention map, which leads to a content-aware shift map for image retargeting. As a result, discriminative parts in an image are preserved, while background regions are adjusted seamlessly. In the training phase, pairs of an image and its image level annotation are used to compute content and structure losses. We demonstrate the effectiveness of our proposed method for a retargeting application with insightful analyses.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Cho_Weakly-_and_Self-Supervised_ICCV_2017_paper.pdf", @@ -18313,7 +19482,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "South Korea;China" + "aff_country_unique": "South Korea;China", + "bibtex": "@InProceedings{Cho_2017_ICCV,\n \n author = {\n Cho,\n Donghyeon and Park,\n Jinsun and Oh,\n Tae-Hyun and Tai,\n Yu-Wing and So Kweon,\n In\n},\n title = {\n Weakly- and Self-Supervised Learning for Content-Aware Deep Image Retargeting\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Weakly-Supervised Learning of Visual Relations", @@ -18321,6 +19491,7 @@ "status": "Oral", "track": "main", "pid": "835", + "author_site": "Julia Peyre; Josef Sivic; Ivan Laptev; Cordelia Schmid", "author": "Julia Peyre; Josef Sivic; Ivan Laptev; Cordelia Schmid", "abstract": "This paper introduces a novel approach for modeling visual relations between pairs of objects. We call relation a triplet of the form (subject, predicate, object) where the predicate is typically a preposition (eg. 
'under', 'in front of') or a verb ('hold', 'ride') that links a pair of objects (subject, object). Learning such relations is challenging as the objects have different spatial configurations and appearances depending on the relation in which they occur. Another major challenge comes from the difficulty to get annotations, especially at box-level, for all possible triplets, which makes both learning and evaluation difficult. The contributions of this paper are threefold. First, we design strong yet flexible visual features that encode the appearance and spatial configuration for pairs of objects. Second, we propose a weakly-supervised discriminative clustering model to learn relations from image-level labels only. Third we introduce a new challenging dataset of unusual relations (UnRel) together with an exhaustive annotation, that enables accurate evaluation of visual relation retrieval. We show experimentally that our model results in state-of-the-art results on the visual relationship dataset significantly improving performance on previously unseen relations (zero-shot learning), and confirm this observation on our newly introduced UnRel dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Peyre_Weakly-Supervised_Learning_of_ICCV_2017_paper.pdf", @@ -18336,7 +19507,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Peyre_Weakly-Supervised_Learning_of_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Peyre_Weakly-Supervised_Learning_of_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Peyre_2017_ICCV,\n \n author = {\n Peyre,\n Julia and Sivic,\n Josef and Laptev,\n Ivan and Schmid,\n Cordelia\n},\n title = {\n Weakly-Supervised Learning of Visual Relations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "What Actions Are Needed 
for Understanding Human Actions in Videos?", @@ -18344,6 +19516,7 @@ "status": "Poster", "track": "main", "pid": "707", + "author_site": "Gunnar A. Sigurdsson; Olga Russakovsky; Abhinav Gupta", "author": "Gunnar A. Sigurdsson; Olga Russakovsky; Abhinav Gupta", "abstract": "What is the right way to reason about human activities? What directions forward are most promising? In this work, we analyze the current state of human activity understanding in videos. The goal of this paper is to examine datasets, evaluation metrics, algorithms, and potential future directions. We look at the qualitative attributes that define activities such as pose variability, brevity, and density. The experiments consider multiple state-of-the-art algorithms and multiple datasets. The results demonstrate that while there is inherent ambiguity in the temporal extent of activities, current datasets still permit effective benchmarking. We discover that fine-grained understanding of objects and pose when combined with temporal reasoning is likely to yield substantial improvements in algorithmic accuracy. We present the many kinds of information that will be needed to achieve substantial gains in activity understanding: objects, verbs, intent, and sequential reasoning. The software and additional information will be made available to provide other researchers detailed diagnostics to understand their own algorithms.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Sigurdsson_What_Actions_Are_ICCV_2017_paper.pdf", @@ -18359,7 +19532,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sigurdsson_What_Actions_Are_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Sigurdsson_What_Actions_Are_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Sigurdsson_2017_ICCV,\n \n author = {\n Sigurdsson,\n Gunnar A. 
and Russakovsky,\n Olga and Gupta,\n Abhinav\n},\n title = {\n What Actions Are Needed for Understanding Human Actions in Videos?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "What Is Around the Camera?", @@ -18367,6 +19541,7 @@ "status": "Poster", "track": "main", "pid": "2246", + "author_site": "Stamatios Georgoulis; Konstantinos Rematas; Tobias Ritschel; Mario Fritz; Tinne Tuytelaars; Luc Van Gool", "author": "Stamatios Georgoulis; Konstantinos Rematas; Tobias Ritschel; Mario Fritz; Tinne Tuytelaars; Luc Van Gool", "abstract": "How much does a single image reveal about the environment it was taken in? In this paper, we investigate how much of that information can be retrieved from a foreground object, combined with the background (i.e. the visible part of the environment). Assuming it is not perfectly diffuse, the foreground object acts as a complexly shaped and far-from-perfect mirror. An additional challenge is that its appearance confounds the light coming from the environment with the unknown materials it is made of. We propose a learning-based approach to predict the environment from multiple reflectance maps that are computed from approximate surface normals. The proposed method allows us to jointly model the statistics of environments and material properties. We train our system from synthesized training data, but demonstrate its applicability to real-world data. 
Interestingly, our analysis shows that the information obtained from objects made out of multiple materials often is complementary and leads to better performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Georgoulis_What_Is_Around_ICCV_2017_paper.pdf", @@ -18382,7 +19557,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Georgoulis_What_Is_Around_ICCV_2017_paper.html" + "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Georgoulis_What_Is_Around_ICCV_2017_paper.html", + "bibtex": "@InProceedings{Georgoulis_2017_ICCV,\n \n author = {\n Georgoulis,\n Stamatios and Rematas,\n Konstantinos and Ritschel,\n Tobias and Fritz,\n Mario and Tuytelaars,\n Tinne and Van Gool,\n Luc\n},\n title = {\n What Is Around the Camera?\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "What Will Happen Next? Forecasting Player Moves in Sports Videos", @@ -18390,6 +19566,7 @@ "status": "Poster", "track": "main", "pid": "1534", + "author_site": "Panna Felsen; Pulkit Agrawal; Jitendra Malik", "author": "Panna Felsen; Pulkit Agrawal; Jitendra Malik", "abstract": "A large number of very popular team sports involve the act of one team trying to score a goal against the other. During this game play, defending players constantly try to predict the next move of the attackers to prevent them from scoring, whereas attackers constantly try to predict the next move of the defenders in order to defy them and score. Such behavior is a prime example of the general human faculty to make predictions about the future and is an important facet of human intelligence. An algorithmic solution to learning a model of the external world from sensory inputs in order to make forecasts is an important unsolved problem. 
In this work we develop a generic framework for forecasting future events in team sports videos directly from visual inputs. We introduce water polo and basketball datasets towards this end and compare the predictions of the proposed methods against expert and non-expert humans.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Felsen_What_Will_Happen_ICCV_2017_paper.pdf", @@ -18414,7 +19591,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Felsen_2017_ICCV,\n \n author = {\n Felsen,\n Panna and Agrawal,\n Pulkit and Malik,\n Jitendra\n},\n title = {\n What Will Happen Next? Forecasting Player Moves in Sports Videos\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "When Unsupervised Domain Adaptation Meets Tensor Representations", @@ -18422,6 +19600,7 @@ "status": "Poster", "track": "main", "pid": "122", + "author_site": "Hao Lu; Lei Zhang; Zhiguo Cao; Wei Wei; Ke Xian; Chunhua Shen; Anton van den Hengel", "author": "Hao Lu; Lei Zhang; Zhiguo Cao; Wei Wei; Ke Xian; Chunhua Shen; Anton van den Hengel", "abstract": "Domain adaption (DA) allows machine learning methods trained on data sampled from one distribution to be applied to data sampled from another. It is thus of great practical importance to the application of such methods. Despite the fact that tensor representations are widely used in Computer Vision to capture multi-linear relationships that affect the data, most existing DA methods are applicable to vectors only. This renders them incapable of reflecting and preserving important structure in many problems. We thus propose here a learning-based method to adapt the source and target tensor representations directly, without vectorization. 
In particular, a set of alignment matrices is introduced to align the tensor representations from both domains into the invariant tensor subspace. These alignment matrices and the tensor subspace are modeled as a joint optimization problem and can be learned adaptively from the data using the proposed alternative minimization scheme. Extensive experiments show that our approach is capable of preserving the discriminative power of the source domain, of resisting the effects of label noise, and works effectively for small sample sizes, and even one-shot DA. We show that our method outperforms the state-of-the-art on the task of cross-domain visual recognition in both efficacy and efficiency, and particularly that it outperforms all comparators when applied to DA of the convolutional activations of deep convolutional networks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Lu_When_Unsupervised_Domain_ICCV_2017_paper.pdf", @@ -18439,14 +19618,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Lu_When_Unsupervised_Domain_ICCV_2017_paper.html", "aff_unique_index": "0;1;0;1;0;2;2", - "aff_unique_norm": "Huazhong University of Science and Technology;Northwestern Polytechnical University;University of Adelaide", + "aff_unique_norm": "Huazhong University of Science and Technology;Northwestern Polytechnical University;The University of Adelaide", "aff_unique_dep": "School of Automation;School of Computer Science and Engineering;School of Computer Science", "aff_unique_url": "http://www.hust.edu.cn;https://www.nwpu.edu.cn;https://www.adelaide.edu.au", "aff_unique_abbr": "HUST;NWPU;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Lu_2017_ICCV,\n \n author = {\n Lu,\n Hao and Zhang,\n Lei and Cao,\n Zhiguo and Wei,\n Wei and Xian,\n Ke and Shen,\n Chunhua and 
van den Hengel,\n Anton\n},\n title = {\n When Unsupervised Domain Adaptation Meets Tensor Representations\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "WordSup: Exploiting Word Annotations for Character Based Text Detection", @@ -18454,6 +19634,7 @@ "status": "Poster", "track": "main", "pid": "2433", + "author_site": "Han Hu; Chengquan Zhang; Yuxuan Luo; Yuzhuo Wang; Junyu Han; Errui Ding", "author": "Han Hu; Chengquan Zhang; Yuxuan Luo; Yuzhuo Wang; Junyu Han; Errui Ding", "abstract": "Imagery texts are usually organized as a hierarchy of several visual elements, i.e. characters, words, text lines and text blocks. Among these elements, character is the most basic one for various languages such as Western, Chinese, Japanese, mathematical expression and etc. It is natural and convenient to construct a common text detection engine based on character detectors. However, training character detectors requires a vast of location annotated characters, which are expensive to obtain. Actually, the existing real text datasets are mostly annotated in word or line level. To remedy this dilemma, we propose a weakly supervised framework that can utilize word annotations, either in tight quadrangles or the more loose bounding boxes, for character detector training. When applied in scene text detection, we are thus able to train a robust character detector by exploiting word annotations in the rich large-scale real scene text datasets, e.g. ICDAR15 [??] and COCO-text [??]. The character detector acts as a key role in the pipeline of our text detection engine. It achieves the state-of-the-art performance on several challenging scene text detection benchmarks. 
We also demonstrate the flexibility of our pipeline by various scenarios, including deformed text detection and math expression recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Hu_WordSup_Exploiting_Word_ICCV_2017_paper.pdf", @@ -18471,14 +19652,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Hu_WordSup_Exploiting_Word_ICCV_2017_paper.html", "aff_unique_index": "0+1;1;1;1;1;1", - "aff_unique_norm": "Microsoft;Baidu", + "aff_unique_norm": "Microsoft Research Asia;Baidu Research", "aff_unique_dep": "Microsoft Research;IDL", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia;https://research.baidu.com", "aff_unique_abbr": "MSRA;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2017_ICCV,\n \n author = {\n Hu,\n Han and Zhang,\n Chengquan and Luo,\n Yuxuan and Wang,\n Yuzhuo and Han,\n Junyu and Ding,\n Errui\n},\n title = {\n WordSup: Exploiting Word Annotations for Character Based Text Detection\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" }, { "title": "Zero-Order Reverse Filtering", @@ -18486,6 +19668,7 @@ "status": "Poster", "track": "main", "pid": "218", + "author_site": "Xin Tao; Chao Zhou; Xiaoyong Shen; Jue Wang; Jiaya Jia", "author": "Xin Tao; Chao Zhou; Xiaoyong Shen; Jue Wang; Jiaya Jia", "abstract": "In this paper, we study an unconventional but practically meaningful reversibility problem of commonly used image filters. We broadly define filters as operations to smooth images or to produce layers via global or local algorithms. And we raise the intriguingly problem if they are reservable to the status before filtering. 
To answer it, we present a novel strategy to understand general filter via contraction mappings on a metric space. A very simple yet effective zero-order algorithm is proposed. It is able to practically reverse most filters with low computational cost. We present quite a few experiments in the paper and supplementary file to thoroughly verify its performance. This method can also be generalized to solve other inverse problems and enables new applications.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2017/papers/Tao_Zero-Order_Reverse_Filtering_ICCV_2017_paper.pdf", @@ -18503,13 +19686,14 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_iccv_2017/html/Tao_Zero-Order_Reverse_Filtering_ICCV_2017_paper.html", "aff_unique_index": "0;0;1;2;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent;Megvii Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent;Megvii Technology", "aff_unique_dep": ";Youtu Lab;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com;https://www.megvii.com/", "aff_unique_abbr": "CUHK;Tencent;Megvii", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tao_2017_ICCV,\n \n author = {\n Tao,\n Xin and Zhou,\n Chao and Shen,\n Xiaoyong and Wang,\n Jue and Jia,\n Jiaya\n},\n title = {\n Zero-Order Reverse Filtering\n},\n booktitle = {\n Proceedings of the IEEE International Conference on Computer Vision (ICCV)\n},\n month = {\n Oct\n},\n year = {\n 2017\n} \n}" } ] \ No newline at end of file diff --git a/iccv/iccv2019.json b/iccv/iccv2019.json index 9f0689d..a2e90c3 100644 --- a/iccv/iccv2019.json +++ b/iccv/iccv2019.json @@ -30,7 +30,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Yan_2019_ICCV,\n \n author = {\n Yan,\n Bin and Zhao,\n Haojie and Wang,\n Dong and Lu,\n Huchuan and Yang,\n Xiaoyun\n},\n title = {\n 'Skimming-Perusal' Tracking: A Framework for Real-Time and Robust Long-Term Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3C-Net: Category Count and Center Loss for Weakly-Supervised Action Localization", @@ -42,7 +43,7 @@ "author": "Sanath Narayan; Hisham Cholakkal; Fahad Shahbaz Khan; Ling Shao", "abstract": "Temporal action localization is a challenging computer vision problem with numerous real-world applications. Most existing methods require laborious frame-level supervision to train action localization models. In this work, we propose a framework, called 3C-Net, which only requires video-level supervision (weak supervision) in the form of action category labels and the corresponding count. We introduce a novel formulation to learn discriminative action features with enhanced localization capabilities. Our joint formulation has three terms: a classification term to ensure the separability of learned action features, an adapted multi-label center loss term to enhance the action feature discriminability and a counting loss term to delineate adjacent action sequences, leading to improved localization. Comprehensive experiments are performed on two challenging benchmarks: THUMOS14 and ActivityNet 1.2. Our approach sets a new state-of-the-art for weakly-supervised temporal action localization on both datasets. On the THUMOS14 dataset, the proposed method achieves an absolute gain of 4.6% in terms of mean average precision (mAP), compared to the state-of-the-art. 
Source code is available at https://github.com/naraysa/3c-net.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Narayan_3C-Net_Category_Count_and_Center_Loss_for_Weakly-Supervised_Action_Localization_ICCV_2019_paper.pdf", - "aff": "Inception Institute of Arti\ufb01cial Intelligence, UAE; Inception Institute of Arti\ufb01cial Intelligence, UAE; Inception Institute of Arti\ufb01cial Intelligence, UAE; Inception Institute of Arti\ufb01cial Intelligence, UAE", + "aff": "Inception Institute of Artificial Intelligence, UAE; Inception Institute of Artificial Intelligence, UAE; Inception Institute of Artificial Intelligence, UAE; Inception Institute of Artificial Intelligence, UAE", "project": "", "github": "https://github.com/naraysa/3c-net", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Narayan_3C-Net_Category_Count_ICCV_2019_supplemental.zip", @@ -63,7 +64,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Arab Emirates" + "aff_country_unique": "United Arab Emirates", + "bibtex": "@InProceedings{Narayan_2019_ICCV,\n \n author = {\n Narayan,\n Sanath and Cholakkal,\n Hisham and Khan,\n Fahad Shahbaz and Shao,\n Ling\n},\n title = {\n 3C-Net: Category Count and Center Loss for Weakly-Supervised Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3D Face Modeling From Diverse Raw Scan Data", @@ -96,7 +98,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Feng and Tran,\n Luan and Liu,\n Xiaoming\n},\n title = {\n 3D Face Modeling From Diverse Raw Scan Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3D Instance Segmentation via Multi-Task Metric Learning", @@ -120,7 +123,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lahoud_3D_Instance_Segmentation_via_Multi-Task_Metric_Learning_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lahoud_3D_Instance_Segmentation_via_Multi-Task_Metric_Learning_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Lahoud_2019_ICCV,\n \n author = {\n Lahoud,\n Jean and Ghanem,\n Bernard and Pollefeys,\n Marc and Oswald,\n Martin R.\n},\n title = {\n 3D Instance Segmentation via Multi-Task Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3D Point Cloud Generative Adversarial Network Based on Tree Structured Graph Convolutions", @@ -153,7 +157,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Shu_2019_ICCV,\n \n author = {\n Shu,\n Dong Wook and Park,\n Sung Woo and Kwon,\n Junseok\n},\n title = {\n 3D Point Cloud Generative Adversarial Network Based on Tree Structured Graph Convolutions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3D Scene Graph: A Structure for Unified Semantics, 3D Space, and Camera", @@ -177,7 +182,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Armeni_3D_Scene_Graph_A_Structure_for_Unified_Semantics_3D_Space_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Armeni_3D_Scene_Graph_A_Structure_for_Unified_Semantics_3D_Space_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Armeni_2019_ICCV,\n \n author = {\n Armeni,\n Iro and He,\n Zhi-Yang and Gwak,\n JunYoung and Zamir,\n Amir R. and Fischer,\n Martin and Malik,\n Jitendra and Savarese,\n Silvio\n},\n title = {\n 3D Scene Graph: A Structure for Unified Semantics,\n 3D Space,\n and Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3D Scene Reconstruction With Multi-Layer Depth and Epipolar Transformers", @@ -201,7 +207,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shin_3D_Scene_Reconstruction_With_Multi-Layer_Depth_and_Epipolar_Transformers_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shin_3D_Scene_Reconstruction_With_Multi-Layer_Depth_and_Epipolar_Transformers_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Shin_2019_ICCV,\n \n author = {\n Shin,\n Daeyun and Ren,\n Zhile and Sudderth,\n Erik B. 
and Fowlkes,\n Charless C.\n},\n title = {\n 3D Scene Reconstruction With Multi-Layer Depth and Epipolar Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3D-LaneNet: End-to-End 3D Multiple Lane Detection", @@ -225,7 +232,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Garnett_3D-LaneNet_End-to-End_3D_Multiple_Lane_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Garnett_3D-LaneNet_End-to-End_3D_Multiple_Lane_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Garnett_2019_ICCV,\n \n author = {\n Garnett,\n Noa and Cohen,\n Rafi and Pe'er,\n Tomer and Lahav,\n Roee and Levi,\n Dan\n},\n title = {\n 3D-LaneNet: End-to-End 3D Multiple Lane Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3D-RelNet: Joint Object and Relational Network for 3D Prediction", @@ -258,7 +266,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kulkarni_2019_ICCV,\n \n author = {\n Kulkarni,\n Nilesh and Misra,\n Ishan and Tulsiani,\n Shubham and Gupta,\n Abhinav\n},\n title = {\n 3D-RelNet: Joint Object and Relational Network for 3D Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "3DPeople: Modeling the Geometry of Dressed Humans", @@ -284,14 +293,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pumarola_3DPeople_Modeling_the_Geometry_of_Dressed_Humans_ICCV_2019_paper.html", 
"aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Institut de Rob\u00f2tica i Inform\u00e0tica Industrial;Harvard University", + "aff_unique_norm": "Institut de Robòtica i Informàtica Industrial;Harvard University", "aff_unique_dep": "CSIC-UPC;John A. Paulson School of Engineering and Applied Sciences", "aff_unique_url": "http://www.iri.upc.edu/;https://www.harvard.edu", "aff_unique_abbr": "IRI;Harvard", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Barcelona;Cambridge", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Spain;United States" + "aff_country_unique": "Spain;United States", + "bibtex": "@InProceedings{Pumarola_2019_ICCV,\n \n author = {\n Pumarola,\n Albert and Sanchez-Riera,\n Jordi and Choi,\n Gary P. T. and Sanfeliu,\n Alberto and Moreno-Noguer,\n Francesc\n},\n title = {\n 3DPeople: Modeling the Geometry of Dressed Humans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "6-DOF GraspNet: Variational Grasp Generation for Object Manipulation", @@ -317,14 +327,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mousavian_6-DOF_GraspNet_Variational_Grasp_Generation_for_Object_Manipulation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "NVIDIA", - "aff_unique_dep": "NVIDIA Corporation", + "aff_unique_norm": "NVIDIA Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mousavian_2019_ICCV,\n \n author = {\n Mousavian,\n Arsalan and Eppner,\n Clemens and Fox,\n Dieter\n},\n title = {\n 6-DOF GraspNet: Variational Grasp Generation for Object Manipulation\n},\n booktitle = {\n Proceedings of 
the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Bayesian Optimization Framework for Neural Network Compression", @@ -348,7 +359,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ma_A_Bayesian_Optimization_Framework_for_Neural_Network_Compression_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ma_A_Bayesian_Optimization_Framework_for_Neural_Network_Compression_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Ma_2019_ICCV,\n \n author = {\n Ma,\n Xingchen and Triki,\n Amal Rannen and Berman,\n Maxim and Sagonas,\n Christos and Cali,\n Jacques and Blaschko,\n Matthew B.\n},\n title = {\n A Bayesian Optimization Framework for Neural Network Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Camera That CNNs: Towards Embedded Neural Networks on Pixel Processor Arrays", @@ -381,7 +393,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bose_2019_ICCV,\n \n author = {\n Bose,\n Laurie and Chen,\n Jianing and Carey,\n Stephen J. 
and Dudek,\n Piotr and Mayol-Cuevas,\n Walterio\n},\n title = {\n A Camera That CNNs: Towards Embedded Neural Networks on Pixel Processor Arrays\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Closed-Form Solution to Universal Style Transfer", @@ -407,14 +420,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lu_A_Closed-Form_Solution_to_Universal_Style_Transfer_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;0;0", - "aff_unique_norm": "Tsinghua University;Intel", + "aff_unique_norm": "Tsinghua University;Intel Corporation", "aff_unique_dep": "Department of Electronic Engineering;Intel Labs", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.intel.cn", "aff_unique_abbr": "THU;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2019_ICCV,\n \n author = {\n Lu,\n Ming and Zhao,\n Hao and Yao,\n Anbang and Chen,\n Yurong and Xu,\n Feng and Zhang,\n Li\n},\n title = {\n A Closed-Form Solution to Universal Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Comprehensive Overhaul of Feature Distillation", @@ -438,7 +452,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Heo_A_Comprehensive_Overhaul_of_Feature_Distillation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Heo_A_Comprehensive_Overhaul_of_Feature_Distillation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Heo_2019_ICCV,\n \n author = {\n Heo,\n Byeongho and Kim,\n Jeesoo and Yun,\n Sangdoo and Park,\n Hyojin and Kwak,\n Nojun and Choi,\n Jin 
Young\n},\n title = {\n A Comprehensive Overhaul of Feature Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Dataset of Multi-Illumination Images in the Wild", @@ -446,7 +461,7 @@ "status": "Poster", "track": "main", "pid": "3059", - "author_site": "Lukas Murmann, Micha\u00c3\u00abl Gharbi, Miika Aittala, Fr\u00c3\u00a9do Durand", + "author_site": "Lukas Murmann, Michaël Gharbi, Miika Aittala, Frédo Durand", "author": "Lukas Murmann; Michael Gharbi; Miika Aittala; Fredo Durand", "abstract": "Collections of images under a single, uncontrolled illumination have enabled the rapid advancement of core computer vision tasks like classification, detection, and segmentation. But even with modern learning techniques, many inverse problems involving lighting and material understanding remain too severely ill-posed to be solved with single-illumination datasets. The data simply does not contain the necessary supervisory signals. Multi-illumination datasets are notoriously hard to capture, so the data is typically collected at small scale, in controlled environments, either using multiple light sources, or robotic gantries. This leads to image collections that are not representative of the variety and complexity of real world scenes. We introduce a new multi-illumination dataset of more than 1000 real scenes, each captured in high dynamic range and high resolution, under 25 lighting conditions. 
We demonstrate the richness of this dataset by training state-of-the-art models for three challenging applications: single-image illumination estimation, image relighting, and mixed-illuminant white balance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Murmann_A_Dataset_of_Multi-Illumination_Images_in_the_Wild_ICCV_2019_paper.pdf", @@ -462,7 +477,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Murmann_A_Dataset_of_Multi-Illumination_Images_in_the_Wild_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Murmann_A_Dataset_of_Multi-Illumination_Images_in_the_Wild_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Murmann_2019_ICCV,\n \n author = {\n Murmann,\n Lukas and Gharbi,\n Michael and Aittala,\n Miika and Durand,\n Fredo\n},\n title = {\n A Dataset of Multi-Illumination Images in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Decoupled 3D Facial Shape Model by Adversarial Training", @@ -470,7 +486,7 @@ "status": "Oral", "track": "main", "pid": "2899", - "author_site": "Victoria Fern\u00c3\u00a1ndez Abrevaya, Adnane Boukhayma, Stefanie Wuhrer, Edmond Boyer", + "author_site": "Victoria Fernández Abrevaya, Adnane Boukhayma, Stefanie Wuhrer, Edmond Boyer", "author": "Victoria Fernandez Abrevaya; Adnane Boukhayma; Stefanie Wuhrer; Edmond Boyer", "abstract": "Data-driven generative 3D face models are used to compactly encode facial shape data into meaningful parametric representations. A desirable property of these models is their ability to effectively decouple natural sources of variation, in particular identity and expression. 
While factorized representations have been proposed for that purpose, they are still limited in the variability they can capture and may present modeling artifacts when applied to tasks such as expression transfer. In this work, we explore a new direction with Generative Adversarial Networks and show that they contribute to better face modeling performances, especially in decoupling natural factors, while also achieving more diverse samples. To train the model we introduce a novel architecture that combines a 3D generator with a 2D discriminator that leverages conventional CNNs, where the two components are bridged by a geometry mapping layer. We further present a training scheme, based on auxiliary classifiers, to explicitly disentangle identity and expression attributes. Through quantitative and qualitative results on standard face datasets, we illustrate the benefits of our model and demonstrate that it outperforms competing state of the art methods in terms of decoupling and diversity.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Abrevaya_A_Decoupled_3D_Facial_Shape_Model_by_Adversarial_Training_ICCV_2019_paper.pdf", @@ -488,14 +504,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Abrevaya_A_Decoupled_3D_Facial_Shape_Model_by_Adversarial_Training_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "INRIA;University of Oxford", + "aff_unique_norm": "Inria;University of Oxford", "aff_unique_dep": "LJK;", "aff_unique_url": "https://www.inria.fr;https://www.ox.ac.uk", "aff_unique_abbr": "Inria;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "France;United Kingdom" + "aff_country_unique": "France;United Kingdom", + "bibtex": "@InProceedings{Abrevaya_2019_ICCV,\n \n author = {\n Abrevaya,\n Victoria Fernandez and Boukhayma,\n Adnane and Wuhrer,\n Stefanie and Boyer,\n Edmond\n},\n title = {\n A Decoupled 3D Facial 
Shape Model by Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Deep Cybersickness Predictor Based on Brain Signal Analysis for Virtual Reality Contents", @@ -528,7 +545,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2019_ICCV,\n \n author = {\n Kim,\n Jinwoo and Kim,\n Woojae and Oh,\n Heeseok and Lee,\n Seongmin and Lee,\n Sanghoon\n},\n title = {\n A Deep Cybersickness Predictor Based on Brain Signal Analysis for Virtual Reality Contents\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Deep Step Pattern Representation for Multimodal Retinal Image Registration", @@ -540,7 +558,7 @@ "author": "Jimmy Addison Lee; Peng Liu; Jun Cheng; Huazhu Fu", "abstract": "This paper presents a novel feature-based method that is built upon a convolutional neural network (CNN) to learn the deep representation for multimodal retinal image registration. We coined the algorithm deep step patterns, in short DeepSPa. Most existing deep learning based methods require a set of manually labeled training data with known corresponding spatial transformations, which limits the size of training datasets. By contrast, our method is fully automatic and scale well to different image modalities with no human intervention. We generate feature classes from simple step patterns within patches of connecting edges formed by vascular junctions in multiple retinal imaging modalities. We leverage CNN to learn and optimize the input patches to be used for image registration. 
Spatial transformations are estimated based on the output possibility of the fully connected layer of CNN for a pair of images. One of the key advantages of the proposed algorithm is its robustness to non-linear intensity changes, which widely exist on retinal images due to the difference of acquisition modalities. We validate our algorithm on extensive challenging datasets comprising poor quality multimodal retinal images which are adversely affected by pathologies (diseases), speckle noise and low resolutions. The experimental results demonstrate the robustness and accuracy over state-of-the-art multimodal image registration algorithms.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Lee_A_Deep_Step_Pattern_Representation_for_Multimodal_Retinal_Image_Registration_ICCV_2019_paper.pdf", - "aff": "Cixi Institute of Biomedical Engineering, Chinese Academy of Sciences, China; Big Data Research Center at University of Electronic Science and Technology of China, China; Cixi Institute of Biomedical Engineering, Chinese Academy of Sciences, China+UBTech Research, China; Inception Institute of Arti\ufb01cial Intelligence, UAE", + "aff": "Cixi Institute of Biomedical Engineering, Chinese Academy of Sciences, China; Big Data Research Center at University of Electronic Science and Technology of China, China; Cixi Institute of Biomedical Engineering, Chinese Academy of Sciences, China+UBTech Research, China; Inception Institute of Artificial Intelligence, UAE", "project": "", "github": "", "supp": "", @@ -561,7 +579,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Jimmy Addison and Liu,\n Peng and Cheng,\n Jun and Fu,\n Huazhu\n},\n title = {\n A Deep Step Pattern Representation for Multimodal Retinal Image Registration\n},\n booktitle = 
{\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Delay Metric for Video Object Detection: What Average Precision Fails to Tell", @@ -587,14 +606,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mao_A_Delay_Metric_for_Video_Object_Detection_What_Average_Precision_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+1", - "aff_unique_norm": "Stanford University;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "Stanford University;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com", "aff_unique_abbr": "Stanford;NVIDIA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mao_2019_ICCV,\n \n author = {\n Mao,\n Huizi and Yang,\n Xiaodong and Dally,\n William J.\n},\n title = {\n A Delay Metric for Video Object Detection: What Average Precision Fails to Tell\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Differential Volumetric Approach to Multi-View Photometric Stereo", @@ -627,7 +647,8 @@ "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Logothetis_2019_ICCV,\n \n author = {\n Logothetis,\n Fotios and Mecca,\n Roberto and Cipolla,\n Roberto\n},\n title = {\n A Differential Volumetric Approach to Multi-View Photometric Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A 
Dual-Path Model With Adaptive Attention for Vehicle Re-Identification", @@ -660,7 +681,8 @@ "aff_campus_unique_index": "0;0;0;0;1;0", "aff_campus_unique": "College Park;Taiwan", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Khorramshahi_2019_ICCV,\n \n author = {\n Khorramshahi,\n Pirazh and Kumar,\n Amit and Peri,\n Neehar and Rambhatla,\n Sai Saketh and Chen,\n Jun-Cheng and Chellappa,\n Rama\n},\n title = {\n A Dual-Path Model With Adaptive Attention for Vehicle Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Fast and Accurate One-Stage Approach to Visual Grounding", @@ -693,7 +715,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Zhengyuan and Gong,\n Boqing and Wang,\n Liwei and Huang,\n Wenbing and Yu,\n Dong and Luo,\n Jiebo\n},\n title = {\n A Fast and Accurate One-Stage Approach to Visual Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Geometry-Inspired Decision-Based Attack", @@ -705,7 +728,7 @@ "author": "Yujia Liu; Seyed-Mohsen Moosavi-Dezfooli; Pascal Frossard", "abstract": "Deep neural networks have recently achieved tremendous success in image classification. Recent studies have however shown that they are easily misled into incorrect classification decisions by adversarial examples. Adversaries can even craft attacks by querying the model in black-box settings, where no information about the model is released except its final decision. 
Such decision-based attacks usually require lots of queries, while real-world image recognition systems might actually restrict the number of queries. In this paper, we propose qFool, a novel decision-based attack algorithm that can generate adversarial examples using a small number of queries. The qFool method can drastically reduce the number of queries compared to previous decision-based attacks while reaching the same quality of adversarial examples. We also enhance our method by constraining adversarial perturbations in low-frequency subspace, which can make qFool even more computationally efficient. Altogether, we manage to fool commercial image recognition systems with a small number of queries, which demonstrates the actual effectiveness of our new algorithm in practice.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Liu_A_Geometry-Inspired_Decision-Based_Attack_ICCV_2019_paper.pdf", - "aff": "University of Science and Technology of China; \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne, Switzerland; \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne, Switzerland", + "aff": "University of Science and Technology of China; École Polytechnique Fédérale de Lausanne, Switzerland; École Polytechnique Fédérale de Lausanne, Switzerland", "project": "", "github": "", "supp": "", @@ -719,14 +742,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_A_Geometry-Inspired_Decision-Based_Attack_ICCV_2019_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "University of Science and Technology of China;EPFL", + "aff_unique_norm": "University of Science and Technology of China;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.epfl.ch", "aff_unique_abbr": "USTC;EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", - 
"aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Yujia and Moosavi-Dezfooli,\n Seyed-Mohsen and Frossard,\n Pascal\n},\n title = {\n A Geometry-Inspired Decision-Based Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Graph-Based Framework to Bridge Movies and Synopses", @@ -752,14 +776,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xiong_A_Graph-Based_Framework_to_Bridge_Movies_and_Synopses_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of California, Berkeley", + "aff_unique_norm": "The Chinese University of Hong Kong;University of California, Berkeley", "aff_unique_dep": "CUHK - SenseTime Joint Lab;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.berkeley.edu", "aff_unique_abbr": "CUHK;UC Berkeley", "aff_campus_unique_index": "0;0;1;0;0;0", "aff_campus_unique": "Hong Kong SAR;Berkeley", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xiong_2019_ICCV,\n \n author = {\n Xiong,\n Yu and Huang,\n Qingqiu and Guo,\n Lingfeng and Zhou,\n Hang and Zhou,\n Bolei and Lin,\n Dahua\n},\n title = {\n A Graph-Based Framework to Bridge Movies and Synopses\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Learned Representation for Scalable Vector Graphics", @@ -792,7 +817,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lopes_2019_ICCV,\n \n author = {\n 
Lopes,\n Raphael Gontijo and Ha,\n David and Eck,\n Douglas and Shlens,\n Jonathon\n},\n title = {\n A Learned Representation for Scalable Vector Graphics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Neural Network for Detailed Human Depth Estimation From a Single Image", @@ -825,7 +851,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "Canada;China" + "aff_country_unique": "Canada;China", + "bibtex": "@InProceedings{Tang_2019_ICCV,\n \n author = {\n Tang,\n Sicong and Tan,\n Feitong and Cheng,\n Kelvin and Li,\n Zhaoyang and Zhu,\n Siyu and Tan,\n Ping\n},\n title = {\n A Neural Network for Detailed Human Depth Estimation From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Novel Unsupervised Camera-Aware Domain Adaptation Framework for Person Re-Identification", @@ -851,14 +878,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qi_A_Novel_Unsupervised_Camera-Aware_Domain_Adaptation_Framework_for_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;2;0;0", - "aff_unique_norm": "Nanjing University;University of Wollongong;University of Sydney", + "aff_unique_norm": "Nanjing University;University of Wollongong;The University of Sydney", "aff_unique_dep": "State Key Laboratory for Novel Software Technology;School of Computing and Information Technology;School of Electrical and Information Engineering", "aff_unique_url": "http://www.nju.edu.cn;https://www.uow.edu.au;https://www.sydney.edu.au", "aff_unique_abbr": "Nanjing University;UOW;USYD", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Wollongong;Sydney", "aff_country_unique_index": "0;1;0;1;0;0", - 
"aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Qi_2019_ICCV,\n \n author = {\n Qi,\n Lei and Wang,\n Lei and Huo,\n Jing and Zhou,\n Luping and Shi,\n Yinghuan and Gao,\n Yang\n},\n title = {\n A Novel Unsupervised Camera-Aware Domain Adaptation Framework for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Quaternion-Based Certifiably Optimal Solution to the Wahba Problem With Outliers", @@ -891,7 +919,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Heng and Carlone,\n Luca\n},\n title = {\n A Quaternion-Based Certifiably Optimal Solution to the Wahba Problem With Outliers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Robust Learning Approach to Domain Adaptive Object Detection", @@ -915,7 +944,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Khodabandeh_A_Robust_Learning_Approach_to_Domain_Adaptive_Object_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Khodabandeh_A_Robust_Learning_Approach_to_Domain_Adaptive_Object_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Khodabandeh_2019_ICCV,\n \n author = {\n Khodabandeh,\n Mehran and Vahdat,\n Arash and Ranjbar,\n Mani and Macready,\n William G.\n},\n title = {\n A Robust Learning Approach to Domain Adaptive Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Tour of Convolutional Networks Guided by Linear Interpreters", @@ -939,7 +969,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Michelini_A_Tour_of_Convolutional_Networks_Guided_by_Linear_Interpreters_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Michelini_A_Tour_of_Convolutional_Networks_Guided_by_Linear_Interpreters_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Michelini_2019_ICCV,\n \n author = {\n Michelini,\n Pablo Navarrete and Liu,\n Hanwen and Lu,\n Yunhua and Jiang,\n Xingqun\n},\n title = {\n A Tour of Convolutional Networks Guided by Linear Interpreters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A Weakly Supervised Fine Label Classifier Enhanced by Coarse Supervision", @@ -972,7 +1003,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Taherkhani_2019_ICCV,\n \n author = {\n Taherkhani,\n Fariborz and Kazemi,\n Hadi and Dabouei,\n Ali and Dawson,\n Jeremy and Nasrabadi,\n Nasser M.\n},\n title = {\n A Weakly Supervised Fine Label Classifier Enhanced by Coarse Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "A2J: Anchor-to-Joint Regression Network for 3D Articulated Pose Estimation From a Single Depth Image", @@ -1005,7 +1037,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;0;0;0;0;1;2", - "aff_country_unique": "China;Singapore;United States" + "aff_country_unique": "China;Singapore;United States", + "bibtex": 
"@InProceedings{Xiong_2019_ICCV,\n \n author = {\n Xiong,\n Fu and Zhang,\n Boshen and Xiao,\n Yang and Cao,\n Zhiguo and Yu,\n Taidong and Zhou,\n Joey Tianyi and Yuan,\n Junsong\n},\n title = {\n A2J: Anchor-to-Joint Regression Network for 3D Articulated Pose Estimation From a Single Depth Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ABD-Net: Attentive but Diverse Person Re-Identification", @@ -1038,7 +1071,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Tianlong and Ding,\n Shaojin and Xie,\n Jingyi and Yuan,\n Ye and Chen,\n Wuyang and Yang,\n Yang and Ren,\n Zhou and Wang,\n Zhangyang\n},\n title = {\n ABD-Net: Attentive but Diverse Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ACE: Adapting to Changing Environments for Semantic Segmentation", @@ -1071,7 +1105,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Zuxuan and Wang,\n Xin and Gonzalez,\n Joseph E. 
and Goldstein,\n Tom and Davis,\n Larry S.\n},\n title = {\n ACE: Adapting to Changing Environments for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ACFNet: Attentional Class Feature Network for Semantic Segmentation", @@ -1097,14 +1132,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_ACFNet_Attentional_Class_Feature_Network_for_Semantic_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;2;1;2;2;0+1;2;2", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Baidu", - "aff_unique_dep": "Institute of Software;;Baidu Inc.", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Baidu Inc.", + "aff_unique_dep": "Institute of Software;;", "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;https://www.baidu.com", "aff_unique_abbr": "CAS;UCAS;Baidu", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Fan and Chen,\n Yanqin and Li,\n Zhihang and Hong,\n Zhibin and Liu,\n Jingtuo and Ma,\n Feifei and Han,\n Junyu and Ding,\n Errui\n},\n title = {\n ACFNet: Attentional Class Feature Network for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ACMM: Aligned Cross-Modal Memory for Few-Shot Image and Sentence Matching", @@ -1137,7 +1173,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0+0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n 
author = {\n Huang,\n Yan and Wang,\n Liang\n},\n title = {\n ACMM: Aligned Cross-Modal Memory for Few-Shot Image and Sentence Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ACNet: Strengthening the Kernel Skeletons for Powerful CNN via Asymmetric Convolution Blocks", @@ -1161,7 +1198,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ding_ACNet_Strengthening_the_Kernel_Skeletons_for_Powerful_CNN_via_Asymmetric_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ding_ACNet_Strengthening_the_Kernel_Skeletons_for_Powerful_CNN_via_Asymmetric_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Ding_2019_ICCV,\n \n author = {\n Ding,\n Xiaohan and Guo,\n Yuchen and Ding,\n Guiguang and Han,\n Jungong\n},\n title = {\n ACNet: Strengthening the Kernel Skeletons for Powerful CNN via Asymmetric Convolution Blocks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AFD-Net: Aggregated Feature Difference Learning for Cross-Spectral Image Patch Matching", @@ -1194,7 +1232,8 @@ "aff_campus_unique_index": "0+1;0+1;0;0;0;0;0", "aff_campus_unique": "Shaanxi;Kyoto", "aff_country_unique_index": "0+1;0+1;0;0;0;0;0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Quan_2019_ICCV,\n \n author = {\n Quan,\n Dou and Liang,\n Xuefeng and Wang,\n Shuang and Wei,\n Shaowei and Li,\n Yanfeng and Huyan,\n Ning and Jiao,\n Licheng\n},\n title = {\n AFD-Net: Aggregated Feature Difference Learning for Cross-Spectral Image Patch Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, 
{ "title": "AGSS-VOS: Attention Guided Single-Shot Video Object Segmentation", @@ -1220,14 +1259,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lin_AGSS-VOS_Attention_Guided_Single-Shot_Video_Object_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Chinese University of Hong Kong;University of Oxford;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Oxford;Tencent", "aff_unique_dep": ";;YouTu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ox.ac.uk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Oxford;Tencent", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Lin_2019_ICCV,\n \n author = {\n Lin,\n Huaijia and Qi,\n Xiaojuan and Jia,\n Jiaya\n},\n title = {\n AGSS-VOS: Attention Guided Single-Shot Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AM-LFS: AutoML for Loss Function Search", @@ -1253,14 +1293,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_AM-LFS_AutoML_for_Loss_Function_Search_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0;0;1", - "aff_unique_norm": "SenseTime Group Limited;University of Sydney", + "aff_unique_norm": "SenseTime Group Limited;The University of Sydney", "aff_unique_dep": ";", "aff_unique_url": "https://www.sensetime.com;https://www.sydney.edu.au", "aff_unique_abbr": "SenseTime;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Chuming and Yuan,\n 
Xin and Lin,\n Chen and Guo,\n Minghao and Wu,\n Wei and Yan,\n Junjie and Ouyang,\n Wanli\n},\n title = {\n AM-LFS: AutoML for Loss Function Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AMASS: Archive of Motion Capture As Surface Shapes", @@ -1293,7 +1334,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;1;1", - "aff_country_unique": ";Germany;Canada" + "aff_country_unique": ";Germany;Canada", + "bibtex": "@InProceedings{Mahmood_2019_ICCV,\n \n author = {\n Mahmood,\n Naureen and Ghorbani,\n Nima and Troje,\n Nikolaus F. and Pons-Moll,\n Gerard and Black,\n Michael J.\n},\n title = {\n AMASS: Archive of Motion Capture As Surface Shapes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AMP: Adaptive Masked Proxies for Few-Shot Segmentation", @@ -1326,7 +1368,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Siam_2019_ICCV,\n \n author = {\n Siam,\n Mennatullah and Oreshkin,\n Boris N. 
and Jagersand,\n Martin\n},\n title = {\n AMP: Adaptive Masked Proxies for Few-Shot Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ARGAN: Attentive Recurrent Generative Adversarial Network for Shadow Detection and Removal", @@ -1359,7 +1402,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Wuhan;", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ding_2019_ICCV,\n \n author = {\n Ding,\n Bin and Long,\n Chengjiang and Zhang,\n Ling and Xiao,\n Chunxia\n},\n title = {\n ARGAN: Attentive Recurrent Generative Adversarial Network for Shadow Detection and Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AVT: Unsupervised Learning of Transformation Equivariant Representations by Autoencoding Variational Transformations", @@ -1371,7 +1415,7 @@ "author": "Guo-Jun Qi; Liheng Zhang; Chang Wen Chen; Qi Tian", "abstract": "The learning of Transformation-Equivariant Representations (TERs), which is introduced by Hinton et al. [??], has been considered as a principle to reveal visual structures under various transformations. It contains the celebrated Convolutional Neural Networks (CNNs) as a special case that only equivary to the translations. In contrast, we seek to train TERs for a generic class of transformations and train them in an unsupervised fashion. To this end, we present a novel principled method by Autoencoding Variational Transformations (AVT), compared with the conventional approach to autoencoding data. Formally, given transformed images, the AVT seeks to train the networks by maximizing the mutual information between the transformations and representations. 
This ensures the resultant TERs of individual images contain the intrinsic information about their visual structures that would equivary extricably under various transformations in a generalized nonlinear case. Technically, we show that the resultant optimization problem can be efficiently solved by maximizing a variational lower-bound of the mutual information. This variational approach introduces a transformation decoder to approximate the intractable posterior of transformations, resulting in an autoencoding architecture with a pair of the representation encoder and the transformation decoder. Experiments demonstrate the proposed AVT model sets a new record for the performances on unsupervised tasks, greatly closing the performance gap to the supervised models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Qi_AVT_Unsupervised_Learning_of_Transformation_Equivariant_Representations_by_Autoencoding_Variational_ICCV_2019_paper.pdf", - "aff": "Laboratory for MAchine Perception and LEarning (MAPLE)+Huawei Cloud; Laboratory for MAchine Perception and LEarning (MAPLE); The Chinese University of Hong Kong at Shenzhen and Peng Cheng Laboratory; Huawei Noah\u2019s Ark Lab", + "aff": "Laboratory for MAchine Perception and LEarning (MAPLE)+Huawei Cloud; Laboratory for MAchine Perception and LEarning (MAPLE); The Chinese University of Hong Kong at Shenzhen and Peng Cheng Laboratory; Huawei Noah’s Ark Lab", "project": "http://maple-lab.net/projects/AVT.htm", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Qi_AVT_Unsupervised_Learning_ICCV_2019_supplemental.pdf", @@ -1385,14 +1429,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qi_AVT_Unsupervised_Learning_of_Transformation_Equivariant_Representations_by_Autoencoding_Variational_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;2;1", - "aff_unique_norm": "University of Toronto;Huawei;Chinese University of Hong Kong at Shenzhen", + 
"aff_unique_norm": "University of Toronto;Huawei;The Chinese University of Hong Kong at Shenzhen", "aff_unique_dep": "Laboratory for MAchine Perception and LEarning (MAPLE);Huawei Cloud;", "aff_unique_url": "https://maple.cs.toronto.edu/;https://www.huaweicloud.com;https://www.cuhk.edu.cn", "aff_unique_abbr": "MAPLE;Huawei Cloud;CUHK", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+1;0;1;1", - "aff_country_unique": "Canada;China" + "aff_country_unique": "Canada;China", + "bibtex": "@InProceedings{Qi_2019_ICCV,\n \n author = {\n Qi,\n Guo-Jun and Zhang,\n Liheng and Chen,\n Chang Wen and Tian,\n Qi\n},\n title = {\n AVT: Unsupervised Learning of Transformation Equivariant Representations by Autoencoding Variational Transformations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AWSD: Adaptive Weighted Spatiotemporal Distillation for Video Representation", @@ -1416,7 +1461,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tavakolian_AWSD_Adaptive_Weighted_Spatiotemporal_Distillation_for_Video_Representation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tavakolian_AWSD_Adaptive_Weighted_Spatiotemporal_Distillation_for_Video_Representation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Tavakolian_2019_ICCV,\n \n author = {\n Tavakolian,\n Mohammad and Tavakoli,\n Hamed R. 
and Hadid,\n Abdenour\n},\n title = {\n AWSD: Adaptive Weighted Spatiotemporal Distillation for Video Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Accelerate CNN via Recursive Bayesian Pruning", @@ -1428,7 +1474,7 @@ "author": "Yuefu Zhou; Ya Zhang; Yanfeng Wang; Qi Tian", "abstract": "Channel Pruning, widely used for accelerating Convolutional Neural Networks, is an NP-hard problem due to the inter-layer dependency of channel redundancy. Existing methods generally ignored the above dependency for computation simplicity. To solve the problem, under the Bayesian framework, we here propose a layer-wise Recursive Bayesian Pruning method (RBP). A new dropout-based measurement of redundancy, which facilitate the computation of posterior assuming inter-layer dependency, is introduced. Specifically, we model the noise across layers as a Markov chain and target its posterior to reflect the inter-layer dependency. Considering the closed form solution for posterior is intractable, we derive a sparsity-inducing Dirac-like prior which regularizes the distribution of the designed noise to automatically approximate the posterior. Compared with the existing methods, no additional overhead is required when the inter-layer dependency assumed. The redundant channels can be simply identified by tiny dropout noise and directly pruned layer by layer. Experiments on popular CNN architectures have shown that the proposed method outperforms several state-of-the-arts. 
Particularly, we achieve up to 5.0x, 2.2x and 1.7x FLOPs reduction with little accuracy loss on the large scale dataset ILSVRC2012 for VGG16, ResNet50 and MobileNetV2, respectively.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhou_Accelerate_CNN_via_Recursive_Bayesian_Pruning_ICCV_2019_paper.pdf", - "aff": "Cooperative Medianet Innovation Center, Shanghai Jiao Tong University + MediaSmart Technology; Cooperative Medianet Innovation Center, Shanghai Jiao Tong University; Cooperative Medianet Innovation Center, Shanghai Jiao Tong University; Huawei Noah\u2019s Ark Lab", + "aff": "Cooperative Medianet Innovation Center, Shanghai Jiao Tong University + MediaSmart Technology; Cooperative Medianet Innovation Center, Shanghai Jiao Tong University; Cooperative Medianet Innovation Center, Shanghai Jiao Tong University; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -1443,13 +1489,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Accelerate_CNN_via_Recursive_Bayesian_Pruning_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0;2", "aff_unique_norm": "Shanghai Jiao Tong University;MediaSmart Technology;Huawei", - "aff_unique_dep": "Cooperative Medianet Innovation Center;;Noah\u2019s Ark Lab", + "aff_unique_dep": "Cooperative Medianet Innovation Center;;Noah’s Ark Lab", "aff_unique_url": "https://www.sjtu.edu.cn;;https://www.huawei.com", "aff_unique_abbr": "SJTU;;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Yuefu and Zhang,\n Ya and Wang,\n Yanfeng and Tian,\n Qi\n},\n title = {\n Accelerate CNN via Recursive Bayesian Pruning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Accelerate Learning of Deep 
Hashing With Gradient Attention", @@ -1482,7 +1529,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Long-Kai and Chen,\n Jianda and Pan,\n Sinno Jialin\n},\n title = {\n Accelerate Learning of Deep Hashing With Gradient Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Accelerated Gravitational Point Set Alignment With Altered Physical Laws", @@ -1508,14 +1556,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Golyanik_Accelerated_Gravitational_Point_Set_Alignment_With_Altered_Physical_Laws_ICCV_2019_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Max Planck Institute for Informatics;University of Kaiserslautern;Deutsches Forschungszentrum f\u00fcr K\u00fcnstliche Intelligenz", + "aff_unique_norm": "Max Planck Institute for Informatics;University of Kaiserslautern;Deutsches Forschungszentrum für Künstliche Intelligenz", "aff_unique_dep": "Informatics;;", "aff_unique_url": "https://www.mpi-inf.mpg.de;https://www.uni-kl.de;https://www.dfki.de", "aff_unique_abbr": "MPII;Uni KL;DFKI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Golyanik_2019_ICCV,\n \n author = {\n Golyanik,\n Vladislav and Theobalt,\n Christian and Stricker,\n Didier\n},\n title = {\n Accelerated Gravitational Point Set Alignment With Altered Physical Laws\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Accurate Monocular 3D Object Detection via Color-Embedded 3D 
Reconstruction for Autonomous Driving", @@ -1541,14 +1590,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ma_Accurate_Monocular_3D_Object_Detection_via_Color-Embedded_3D_Reconstruction_for_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0+1;0;2;0+1", - "aff_unique_norm": "Dalian University of Technology;Key Laboratory for Ubiquitous Network and Service Software;University of Sydney", + "aff_unique_norm": "Dalian University of Technology;Key Laboratory for Ubiquitous Network and Service Software;The University of Sydney", "aff_unique_dep": ";Liaoning Province;", "aff_unique_url": "http://www.dlut.edu.cn/;;https://www.sydney.edu.au", "aff_unique_abbr": "DUT;;USYD", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0;1;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Ma_2019_ICCV,\n \n author = {\n Ma,\n Xinzhu and Wang,\n Zhihui and Li,\n Haojie and Zhang,\n Pengbo and Ouyang,\n Wanli and Fan,\n Xin\n},\n title = {\n Accurate Monocular 3D Object Detection via Color-Embedded 3D Reconstruction for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Action Assessment by Joint Relation Graphs", @@ -1574,14 +1624,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pan_Action_Assessment_by_Joint_Relation_Graphs_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", - "aff_unique_norm": "Sun Yat-sen University;Pengcheng Laboratory;Key Laboratory of Machine Intelligence and Advanced Computing", - "aff_unique_dep": "School of Data and Computer Science;Peng Cheng Laboratory;Ministry of Education", + "aff_unique_norm": "Sun Yat-sen University;Peng Cheng Laboratory;Key Laboratory of Machine Intelligence and Advanced Computing", + "aff_unique_dep": 
"School of Data and Computer Science;;Ministry of Education", "aff_unique_url": "http://www.sysu.edu.cn/;;", "aff_unique_abbr": "SYSU;;", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pan_2019_ICCV,\n \n author = {\n Pan,\n Jia-Hui and Gao,\n Jibin and Zheng,\n Wei-Shi\n},\n title = {\n Action Assessment by Joint Relation Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Action Recognition With Spatial-Temporal Discriminative Filter Banks", @@ -1589,7 +1640,7 @@ "status": "Poster", "track": "main", "pid": "6483", - "author_site": "Brais Mart\u00c3\u00adnez, Davide Modolo, Yuanjun Xiong, Joseph Tighe", + "author_site": "Brais Martínez, Davide Modolo, Yuanjun Xiong, Joseph Tighe", "author": "Brais Martinez; Davide Modolo; Yuanjun Xiong; Joseph Tighe", "abstract": "Action recognition has seen a dramatic performance improvement in the last few years. Most of the current state-of-the-art literature either aims at improving performance through changes to the backbone CNN network, or exploring different trade-offs between computational efficiency and performance, again through altering the backbone network. However, almost all of these works maintain the same last layers of the network, which simply consist of a global average pooling followed by a fully connected layer. In this work we focus on how to improve the representation capacity of the network, but rather than altering the backbone, we focus on improving the last layers of the network, where changes have low impact in terms of computational cost. 
In particular, we hypothesize that current architectures have poor sensitivity to finer details and we exploit recent advances in the fine-grained recognition literature to improve our model in this aspect. With the proposed approach, we obtain state-of-the-art performance on Kinetics-400 and Something-Something-V1, the two major large-scale action recognition benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Martinez_Action_Recognition_With_Spatial-Temporal_Discriminative_Filter_Banks_ICCV_2019_paper.pdf", @@ -1607,14 +1658,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Martinez_Action_Recognition_With_Spatial-Temporal_Discriminative_Filter_Banks_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Amazon.com, Inc.", + "aff_unique_norm": "Amazon.com, Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Martinez_2019_ICCV,\n \n author = {\n Martinez,\n Brais and Modolo,\n Davide and Xiong,\n Yuanjun and Tighe,\n Joseph\n},\n title = {\n Action Recognition With Spatial-Temporal Discriminative Filter Banks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Active Learning for Deep Detection Neural Networks", @@ -1622,7 +1674,7 @@ "status": "Poster", "track": "main", "pid": "4697", - "author_site": "Hamed H. Aghdam, Abel Gonzalez-Garcia, Joost van de Weijer, Antonio M. L\u00c3\u00b3pez", + "author_site": "Hamed H. Aghdam, Abel Gonzalez-Garcia, Joost van de Weijer, Antonio M. López", "author": "Hamed H. Aghdam; Abel Gonzalez-Garcia; Joost van de Weijer; Antonio M. 
Lopez", "abstract": "The cost of drawing object bounding boxes (i.e. labeling) for millions of images is prohibitively high. For instance, labeling pedestrians in a regular urban image could take 35 seconds on average. Active learning aims to reduce the cost of labeling by selecting only those images that are informative to improve the detection network accuracy. In this paper, we propose a method to perform active learning of object detectors based on convolutional neural networks. We propose a new image-level scoring process to rank unlabeled images for their automatic selection, which clearly outperforms classical scores. The proposed method can be applied to videos and sets of still images. In the former case, temporal selection rules can complement our scoring process. As a relevant use case, we extensively study the performance of our method on the task of pedestrian detection. Overall, the experiments show that the proposed method performs better than random selection.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Aghdam_Active_Learning_for_Deep_Detection_Neural_Networks_ICCV_2019_paper.pdf", @@ -1640,14 +1692,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Aghdam_Active_Learning_for_Deep_Detection_Neural_Networks_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1", - "aff_unique_norm": "Computer Vision Center;Universitat Aut\u00f2noma de Barcelona", + "aff_unique_norm": "Computer Vision Center;Universitat Autònoma de Barcelona", "aff_unique_dep": "Computer Vision;Computer Science Department", "aff_unique_url": "https://www.cvc.uab.cat/;https://www.uab.cat", "aff_unique_abbr": "CVC;UAB", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Aghdam_2019_ICCV,\n \n author = {\n Aghdam,\n Hamed H. 
and Gonzalez-Garcia,\n Abel and Weijer,\n Joost van de and Lopez,\n Antonio M.\n},\n title = {\n Active Learning for Deep Detection Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AdaTransform: Adaptive Data Transformation", @@ -1680,7 +1733,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tang_2019_ICCV,\n \n author = {\n Tang,\n Zhiqiang and Peng,\n Xi and Li,\n Tingfeng and Zhu,\n Yizhe and Metaxas,\n Dimitris N.\n},\n title = {\n AdaTransform: Adaptive Data Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AdaptIS: Adaptive Instance Selection Network", @@ -1706,14 +1760,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sofiiuk_AdaptIS_Adaptive_Instance_Selection_Network_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Samsung", + "aff_unique_norm": "Samsung AI Center", "aff_unique_dep": "AI Center", "aff_unique_url": "https://www.samsung.com/global/careers/ai-center/", "aff_unique_abbr": "Samsung AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Sofiiuk_2019_ICCV,\n \n author = {\n Sofiiuk,\n Konstantin and Barinova,\n Olga and Konushin,\n Anton\n},\n title = {\n AdaptIS: Adaptive Instance Selection Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adaptative Inference Cost With Convolutional 
Neural Mixture Models", @@ -1737,7 +1792,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ruiz_Adaptative_Inference_Cost_With_Convolutional_Neural_Mixture_Models_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ruiz_Adaptative_Inference_Cost_With_Convolutional_Neural_Mixture_Models_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Ruiz_2019_ICCV,\n \n author = {\n Ruiz,\n Adria and Verbeek,\n Jakob\n},\n title = {\n Adaptative Inference Cost With Convolutional Neural Mixture Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adaptive Activation Thresholding: Dynamic Routing Type Behavior for Interpretability in Convolutional Neural Networks", @@ -1770,7 +1826,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Madison;Chicago", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Yiyou and Ravi,\n Sathya N. 
and Singh,\n Vikas\n},\n title = {\n Adaptive Activation Thresholding: Dynamic Routing Type Behavior for Interpretability in Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adaptive Context Network for Scene Parsing", @@ -1803,7 +1860,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fu_2019_ICCV,\n \n author = {\n Fu,\n Jun and Liu,\n Jing and Wang,\n Yuhang and Li,\n Yong and Bao,\n Yongjun and Tang,\n Jinhui and Lu,\n Hanqing\n},\n title = {\n Adaptive Context Network for Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adaptive Density Map Generation for Crowd Counting", @@ -1836,7 +1894,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wan_2019_ICCV,\n \n author = {\n Wan,\n Jia and Chan,\n Antoni\n},\n title = {\n Adaptive Density Map Generation for Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding", @@ -1862,14 +1921,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Adaptive_Reconstruction_Network_for_Weakly_Supervised_Referring_Expression_Grounding_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0;2;0+1;1+3", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of 
Science and Technology of China;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Computing Technology;;;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Science and Technology of China;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Computing Technology;;;", "aff_unique_url": "http://www.ict.cas.cn;http://www.ucas.ac.cn;http://www.ustc.edu.cn;", "aff_unique_abbr": "CAS;UCAS;USTC;", "aff_campus_unique_index": "0+0;0;0;1;0+0;0+2", "aff_campus_unique": "Beijing;Hefei;Shenzhen", "aff_country_unique_index": "0+0;0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Xuejing and Li,\n Liang and Wang,\n Shuhui and Zha,\n Zheng-Jun and Meng,\n Dechao and Huang,\n Qingming\n},\n title = {\n Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adaptive Wing Loss for Robust Face Alignment via Heatmap Regression", @@ -1893,7 +1953,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Adaptive_Wing_Loss_for_Robust_Face_Alignment_via_Heatmap_Regression_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Adaptive_Wing_Loss_for_Robust_Face_Alignment_via_Heatmap_Regression_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Xinyao and Bo,\n Liefeng and Fuxin,\n Li\n},\n title = {\n Adaptive Wing Loss for Robust Face Alignment via Heatmap Regression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Addressing Model Vulnerability 
to Distributional Shifts Over Image Transformation Sets", @@ -1919,14 +1980,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Volpi_Addressing_Model_Vulnerability_to_Distributional_Shifts_Over_Image_Transformation_Sets_ICCV_2019_paper.html", "aff_unique_index": "0;0+1+2", - "aff_unique_norm": "Istituto Italiano di Tecnologia;University of Verona;Huawei", - "aff_unique_dep": ";;Huawei Technologies", + "aff_unique_norm": "Istituto Italiano di Tecnologia;University of Verona;Huawei Technologies", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.iit.it;https://www.univr.it;https://www.huawei.com/ie/en", "aff_unique_abbr": "IIT;UniVR;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0+1", - "aff_country_unique": "Italy;Ireland" + "aff_country_unique": "Italy;Ireland", + "bibtex": "@InProceedings{Volpi_2019_ICCV,\n \n author = {\n Volpi,\n Riccardo and Murino,\n Vittorio\n},\n title = {\n Addressing Model Vulnerability to Distributional Shifts Over Image Transformation Sets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AdvIT: Adversarial Frames Identifier Based on Temporal Consistency in Videos", @@ -1950,7 +2012,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xiao_AdvIT_Adversarial_Frames_Identifier_Based_on_Temporal_Consistency_in_Videos_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xiao_AdvIT_Adversarial_Frames_Identifier_Based_on_Temporal_Consistency_in_Videos_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Xiao_2019_ICCV,\n \n author = {\n Xiao,\n Chaowei and Deng,\n Ruizhi and Li,\n Bo and Lee,\n Taesung and Edwards,\n Benjamin and Yi,\n Jinfeng and Song,\n Dawn and Liu,\n Mingyan and Molloy,\n Ian\n},\n title = {\n AdvIT: Adversarial 
Frames Identifier Based on Temporal Consistency in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adversarial Defense by Restricting the Hidden Space of Deep Neural Networks", @@ -1974,7 +2037,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mustafa_Adversarial_Defense_by_Restricting_the_Hidden_Space_of_Deep_Neural_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mustafa_Adversarial_Defense_by_Restricting_the_Hidden_Space_of_Deep_Neural_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Mustafa_2019_ICCV,\n \n author = {\n Mustafa,\n Aamir and Khan,\n Salman and Hayat,\n Munawar and Goecke,\n Roland and Shen,\n Jianbing and Shao,\n Ling\n},\n title = {\n Adversarial Defense by Restricting the Hidden Space of Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adversarial Defense via Learning to Generate Diverse Attacks", @@ -2007,7 +2071,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Jang_2019_ICCV,\n \n author = {\n Jang,\n Yunseok and Zhao,\n Tianchen and Hong,\n Seunghoon and Lee,\n Honglak\n},\n title = {\n Adversarial Defense via Learning to Generate Diverse Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adversarial Feedback Loop", @@ -2040,7 +2105,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1", - 
"aff_country_unique": "Israel;China" + "aff_country_unique": "Israel;China", + "bibtex": "@InProceedings{Shama_2019_ICCV,\n \n author = {\n Shama,\n Firas and Mechrez,\n Roey and Shoshan,\n Alon and Zelnik-Manor,\n Lihi\n},\n title = {\n Adversarial Feedback Loop\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adversarial Fine-Grained Composition Learning for Unseen Attribute-Object Recognition", @@ -2073,7 +2139,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2019_ICCV,\n \n author = {\n Wei,\n Kun and Yang,\n Muli and Wang,\n Hao and Deng,\n Cheng and Liu,\n Xianglong\n},\n title = {\n Adversarial Fine-Grained Composition Learning for Unseen Attribute-Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adversarial Learning With Margin-Based Triplet Embedding Regularization", @@ -2106,7 +2173,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhong_2019_ICCV,\n \n author = {\n Zhong,\n Yaoyao and Deng,\n Weihong\n},\n title = {\n Adversarial Learning With Margin-Based Triplet Embedding Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adversarial Representation Learning for Text-to-Image Matching", @@ -2139,7 +2207,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": 
"United States", + "bibtex": "@InProceedings{Sarafianos_2019_ICCV,\n \n author = {\n Sarafianos,\n Nikolaos and Xu,\n Xiang and Kakadiaris,\n Ioannis A.\n},\n title = {\n Adversarial Representation Learning for Text-to-Image Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Adversarial Robustness vs. Model Compression, or Both?", @@ -2163,7 +2232,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ye_Adversarial_Robustness_vs._Model_Compression_or_Both_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ye_Adversarial_Robustness_vs._Model_Compression_or_Both_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Ye_2019_ICCV,\n \n author = {\n Ye,\n Shaokai and Xu,\n Kaidi and Liu,\n Sijia and Cheng,\n Hao and Lambrechts,\n Jan-Henrik and Zhang,\n Huan and Zhou,\n Aojun and Ma,\n Kaisheng and Wang,\n Yanzhi and Lin,\n Xue\n},\n title = {\n Adversarial Robustness vs. 
Model Compression,\n or Both?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Aggregation via Separation: Boosting Facial Landmark Detector With Semi-Supervised Style Translation", @@ -2187,7 +2257,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qian_Aggregation_via_Separation_Boosting_Facial_Landmark_Detector_With_Semi-Supervised_Style_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qian_Aggregation_via_Separation_Boosting_Facial_Landmark_Detector_With_Semi-Supervised_Style_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Qian_2019_ICCV,\n \n author = {\n Qian,\n Shengju and Sun,\n Keqiang and Wu,\n Wayne and Qian,\n Chen and Jia,\n Jiaya\n},\n title = {\n Aggregation via Separation: Boosting Facial Landmark Detector With Semi-Supervised Style Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Agile Depth Sensing Using Triangulation Light Curtains", @@ -2220,7 +2291,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bartels_2019_ICCV,\n \n author = {\n Bartels,\n Joseph R. 
and Wang,\n Jian and Whittaker,\n William \"Red\" and Narasimhan,\n Srinivasa G.\n},\n title = {\n Agile Depth Sensing Using Triangulation Light Curtains\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Algebraic Characterization of Essential Matrices and Their Averaging in Multiview Settings", @@ -2253,7 +2325,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Kasten_2019_ICCV,\n \n author = {\n Kasten,\n Yoni and Geifman,\n Amnon and Galun,\n Meirav and Basri,\n Ronen\n},\n title = {\n Algebraic Characterization of Essential Matrices and Their Averaging in Multiview Settings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "0243cd4b25", @@ -2275,14 +2348,15 @@ "email": "pku.edu.cn;deepwise.com;pku.edu.cn;pku.edu.cn;pku.edu.cn;pku.edu.cn", "author_num": 6, "aff_unique_index": "0;1;0;0;0+1+2;1+2", - "aff_unique_norm": "Peking University;Deepwise AI Lab;Pengcheng Laboratory", - "aff_unique_dep": "Department of Computer Science;AI Lab;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Deepwise AI Lab;Peng Cheng Laboratory", + "aff_unique_dep": "Department of Computer Science;AI Lab;", "aff_unique_url": "http://www.pku.edu.cn;;http://www.pcl.ac.cn", "aff_unique_abbr": "Peking U;;PCL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Jingyu and Zhao,\n Gangming and Fei,\n Yu and Zhang,\n Ming and Wang,\n Yizhou and Yu,\n Yizhou\n},\n title = {\n Align,\n Attend and Locate: Chest X-Ray Diagnosis 
via Contrast Induced Attention Network With Limited Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Align2Ground: Weakly Supervised Phrase Grounding Guided by Image-Caption Alignment", @@ -2308,14 +2382,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Datta_Align2Ground_Weakly_Supervised_Phrase_Grounding_Guided_by_Image-Caption_Alignment_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0;0;1+2;0", - "aff_unique_norm": "SRI International;Georgia Institute of Technology;Meta", + "aff_unique_norm": "SRI International;Georgia Institute of Technology;Facebook", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.sri.com;https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "SRI;Georgia Tech;FAIR", "aff_campus_unique_index": "0;0;0;0;;0", "aff_campus_unique": "Princeton;", "aff_country_unique_index": "0+0;0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Datta_2019_ICCV,\n \n author = {\n Datta,\n Samyak and Sikka,\n Karan and Roy,\n Anirban and Ahuja,\n Karuna and Parikh,\n Devi and Divakaran,\n Ajay\n},\n title = {\n Align2Ground: Weakly Supervised Phrase Grounding Guided by Image-Caption Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Aligning Latent Spaces for 3D Hand Pose Estimation", @@ -2339,7 +2414,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Aligning_Latent_Spaces_for_3D_Hand_Pose_Estimation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Aligning_Latent_Spaces_for_3D_Hand_Pose_Estimation_ICCV_2019_paper.html", + "bibtex": 
"@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Linlin and Li,\n Shile and Lee,\n Dongheui and Yao,\n Angela\n},\n title = {\n Aligning Latent Spaces for 3D Hand Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "cb1b88fa28", @@ -2361,14 +2437,15 @@ "email": ";;;;", "author_num": 5, "aff_unique_index": "0;0;1;0;1", - "aff_unique_norm": "Johns Hopkins University;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "Johns Hopkins University;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.jhu.edu;https://www.nvidia.com", "aff_unique_abbr": "JHU;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Fengze and Xia,\n Yingda and Yang,\n Dong and Yuille,\n Alan L. and Xu,\n Daguang\n},\n title = {\n An Alarm System for Segmentation Algorithm Based on Shape Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "An Efficient Solution to the Homography-Based Relative Pose Problem With a Common Reference Direction", @@ -2380,7 +2457,7 @@ "author": "Yaqing Ding; Jian Yang; Jean Ponce; Hui Kong", "abstract": "In this paper, we propose a novel approach to two-view minimal-case relative pose problems based on homography with a common reference direction. We explore the rank-1 constraint on the difference between the Euclidean homography matrix and the corresponding rotation, and propose an efficient two-step solution for solving both the calibrated and partially calibrated (unknown focal length) problems. 
We derive new 3.5-point, 3.5-point, 4-point solvers for two cameras such that the two focal lengths are unknown but equal, one of them is unknown, and both are unknown and possibly different, respectively. We present detailed analyses and comparisons with existing 6 and 7-point solvers, including results with smart phone images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Ding_An_Efficient_Solution_to_the_Homography-Based_Relative_Pose_Problem_With_ICCV_2019_paper.pdf", - "aff": "Nanjing University of Science and Technology; Nanjing University of Science and Technology; INRIA+D\u00b4epartement d\u2019informatique de l\u2019ENS, ENS, CNRS, PSL University; Nanjing University of Science and Technology+IAAI Nanjing, Horizon Robotics", + "aff": "Nanjing University of Science and Technology; Nanjing University of Science and Technology; INRIA+D´epartement d’informatique de l’ENS, ENS, CNRS, PSL University; Nanjing University of Science and Technology+IAAI Nanjing, Horizon Robotics", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Ding_An_Efficient_Solution_ICCV_2019_supplemental.pdf", @@ -2394,14 +2471,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ding_An_Efficient_Solution_to_the_Homography-Based_Relative_Pose_Problem_With_ICCV_2019_paper.html", "aff_unique_index": "0;0;1+2;0+3", - "aff_unique_norm": "Nanjing University of Science and Technology;INRIA;Ecole Normale Sup\u00e9rieure (ENS);IAAI Nanjing", - "aff_unique_dep": ";;D\u00e9partement d\u2019informatique;", + "aff_unique_norm": "Nanjing University of Science and Technology;INRIA;Ecole Normale Supérieure (ENS);IAAI Nanjing", + "aff_unique_dep": ";;Département d’informatique;", "aff_unique_url": "http://www.nust.edu.cn/;https://www.inria.fr;https://www.ens.fr;", "aff_unique_abbr": "NUST;INRIA;ENS;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+1;0+0", - 
"aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Ding_2019_ICCV,\n \n author = {\n Ding,\n Yaqing and Yang,\n Jian and Ponce,\n Jean and Kong,\n Hui\n},\n title = {\n An Efficient Solution to the Homography-Based Relative Pose Problem With a Common Reference Direction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "An Empirical Study of Spatial Attention Mechanisms in Deep Networks", @@ -2427,14 +2505,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhu_An_Empirical_Study_of_Spatial_Attention_Mechanisms_in_Deep_Networks_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;1;1;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "USTC;MSR Asia", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2019_ICCV,\n \n author = {\n Zhu,\n Xizhou and Cheng,\n Dazhi and Zhang,\n Zheng and Lin,\n Stephen and Dai,\n Jifeng\n},\n title = {\n An Empirical Study of Spatial Attention Mechanisms in Deep Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "An Internal Learning Approach to Video Inpainting", @@ -2467,7 +2546,8 @@ "aff_campus_unique_index": "0;", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;1+0;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United 
States;United Kingdom", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Haotian and Mai,\n Long and Xu,\n Ning and Wang,\n Zhaowen and Collomosse,\n John and Jin,\n Hailin\n},\n title = {\n An Internal Learning Approach to Video Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Analyzing the Variety Loss in the Context of Probabilistic Trajectory Prediction", @@ -2479,7 +2559,7 @@ "author": "Luca Anthony Thiede; Pratik Prabhanjan Brahma", "abstract": "Trajectory or behavior prediction of traffic agents is an important component of autonomous driving and robot planning in general. It can be framed as a probabilistic future sequence generation problem and recent literature has studied the applicability of generative models in this context. The variety or Minimum over N (MoN) loss, which tries to minimize the error between the ground truth and the closest of N output predictions, has been used in these recent learning models to improve the diversity of predictions. In this work, we present a proof to show that the MoN loss does not lead to the ground truth probability density function, but approximately to its square root instead. We validate this finding with extensive experiments on both simulated toy as well as real world datasets. 
We also propose multiple solutions to compensate for the dilation to show improvement of log likelihood of the ground truth samples in the corrected probability density function.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Thiede_Analyzing_the_Variety_Loss_in_the_Context_of_Probabilistic_Trajectory_ICCV_2019_paper.pdf", - "aff": "Georg-August-Universit\u00e4t G\u00f6ttingen + Volkswagen Group of America Innovation and Engineering Center California; Volkswagen Group of America", + "aff": "Georg-August-Universität Göttingen + Volkswagen Group of America Innovation and Engineering Center California; Volkswagen Group of America", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Thiede_Analyzing_the_Variety_ICCV_2019_supplemental.pdf", @@ -2493,14 +2573,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Thiede_Analyzing_the_Variety_Loss_in_the_Context_of_Probabilistic_Trajectory_ICCV_2019_paper.html", "aff_unique_index": "0+1;1", - "aff_unique_norm": "Georg-August-Universit\u00e4t G\u00f6ttingen;Volkswagen Group of America", + "aff_unique_norm": "Georg-August-Universität Göttingen;Volkswagen Group of America", "aff_unique_dep": ";Innovation and Engineering Center", "aff_unique_url": "https://www.uni-goettingen.de;https://www.volkswagenag.com", "aff_unique_abbr": "GAU;VW Group", "aff_campus_unique_index": "0+1", - "aff_campus_unique": "G\u00f6ttingen;California;", + "aff_campus_unique": "Göttingen;California;", "aff_country_unique_index": "0+1;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Thiede_2019_ICCV,\n \n author = {\n Thiede,\n Luca Anthony and Brahma,\n Pratik Prabhanjan\n},\n title = {\n Analyzing the Variety Loss in the Context of Probabilistic Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Anchor Diffusion for Unsupervised Video Object Segmentation", @@ -2533,7 +2614,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0+0;1;0;0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Zhao and Wang,\n Qiang and Bertinetto,\n Luca and Hu,\n Weiming and Bai,\n Song and Torr,\n Philip H. S.\n},\n title = {\n Anchor Diffusion for Unsupervised Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Anchor Loss: Modulating Loss Scale Based on Prediction Difficulty", @@ -2566,7 +2648,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ryou_2019_ICCV,\n \n author = {\n Ryou,\n Serim and Jeong,\n Seong-Gyun and Perona,\n Pietro\n},\n title = {\n Anchor Loss: Modulating Loss Scale Based on Prediction Difficulty\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Anomaly Detection in Video Sequence With Appearance-Motion Correspondence", @@ -2599,7 +2682,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Nguyen_2019_ICCV,\n \n author = {\n Nguyen,\n Trong-Nguyen and Meunier,\n Jean\n},\n title = {\n Anomaly Detection in Video Sequence With Appearance-Motion Correspondence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Approximated Bilinear Modules for Temporal Modeling", @@ -2625,14 +2709,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhu_Approximated_Bilinear_Modules_for_Temporal_Modeling_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;0", - "aff_unique_norm": "University of Sydney;Shanghai Jiao Tong University", + "aff_unique_norm": "The University of Sydney;Shanghai Jiao Tong University", "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "https://www.sydney.edu.au;https://www.sjtu.edu.cn", "aff_unique_abbr": "USYD;SJTU", "aff_campus_unique_index": "0;0;1;1;0", "aff_campus_unique": "Darlington;Shanghai", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Zhu_2019_ICCV,\n \n author = {\n Zhu,\n Xinqi and Xu,\n Chang and Hui,\n Langwen and Lu,\n Cewu and Tao,\n Dacheng\n},\n title = {\n Approximated Bilinear Modules for Temporal Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Asymmetric Cross-Guided Attention Network for Actor and Action Video Segmentation From Natural Language Query", @@ -2644,7 +2729,7 @@ "author": "Hao Wang; Cheng Deng; Junchi Yan; Dacheng Tao", "abstract": "Actor and action video segmentation from natural language query aims to selectively segment the actor and its action in a video based on an input textual description. Previous works mostly focus on learning simple correlation between two heterogeneous features of vision and language via dynamic convolution or fully convolutional classification. However, they ignore the linguistic variation of natural language query and have difficulty in modeling global visual context, which leads to unsatisfactory segmentation performance. 
To address these issues, we propose an asymmetric cross-guided attention network for actor and action video segmentation from natural language query. Specifically, we frame an asymmetric cross-guided attention network, which consists of vision guided language attention to reduce the linguistic variation of input query and language guided vision attention to incorporate query-focused global visual context simultaneously. Moreover, we adopt multi-resolution fusion scheme and weighted loss for foreground and background pixels to obtain further performance improvement. Extensive experiments on Actor-Action Dataset Sentences and J-HMDB Sentences show that our proposed approach notably outperforms state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wang_Asymmetric_Cross-Guided_Attention_Network_for_Actor_and_Action_Video_Segmentation_ICCV_2019_paper.pdf", - "aff": "School of Electronic Engineering, Xidian University; School of Electronic Engineering, Xidian University + Tencent AI Lab; Department of CSE, and MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University; UBTECH Sydney AI Centre, School of Computer Science, FEIT, University of Sydney", + "aff": "School of Electronic Engineering, Xidian University; School of Electronic Engineering, Xidian University + Tencent AI Lab; Department of CSE, and MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University; UBTECH Sydney AI Centre, School of Computer Science, FEIT, University of Sydney", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Wang_Asymmetric_Cross-Guided_Attention_ICCV_2019_supplemental.pdf", @@ -2665,7 +2750,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0+0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Hao and Deng,\n Cheng and 
Yan,\n Junchi and Tao,\n Dacheng\n},\n title = {\n Asymmetric Cross-Guided Attention Network for Actor and Action Video Segmentation From Natural Language Query\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Asymmetric Non-Local Neural Networks for Semantic Segmentation", @@ -2689,7 +2775,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhu_Asymmetric_Non-Local_Neural_Networks_for_Semantic_Segmentation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhu_Asymmetric_Non-Local_Neural_Networks_for_Semantic_Segmentation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhu_2019_ICCV,\n \n author = {\n Zhu,\n Zhen and Xu,\n Mengde and Bai,\n Song and Huang,\n Tengteng and Bai,\n Xiang\n},\n title = {\n Asymmetric Non-Local Neural Networks for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Asynchronous Single-Photon 3D Imaging", @@ -2722,7 +2809,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gupta_2019_ICCV,\n \n author = {\n Gupta,\n Anant and Ingle,\n Atul and Gupta,\n Mohit\n},\n title = {\n Asynchronous Single-Photon 3D Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AttPool: Towards Hierarchical Feature Representation in Graph Convolutional Networks via Attention Mechanism", @@ -2748,14 +2836,15 @@ "author_num": 5, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Huang_AttPool_Towards_Hierarchical_Feature_Representation_in_Graph_Convolutional_Networks_via_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Tencent", - "aff_unique_dep": "School of Electronic and Computer Engineering;Peng Cheng Laboratory;Media Lab", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Tencent Media Lab", + "aff_unique_dep": "School of Electronic and Computer Engineering;;Media Lab", "aff_unique_url": "http://www.pku.edu.cn;;https://www.tencent.com", "aff_unique_abbr": "PKU;;Tencent ML", "aff_campus_unique_index": "0+1;0+1;0+1;2;0+1", "aff_campus_unique": "Shenzhen Graduate School;Shenzhen;Palo Alto", "aff_country_unique_index": "0+0;0+0;0+0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Jingjia and Li,\n Zhangheng and Li,\n Nannan and Liu,\n Shan and Li,\n Ge\n},\n title = {\n AttPool: Towards Hierarchical Feature Representation in Graph Convolutional Networks via Attention Mechanism\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attacking Optical Flow", @@ -2767,7 +2856,7 @@ "author": "Anurag Ranjan; Joel Janai; Andreas Geiger; Michael J. Black", "abstract": "Deep neural nets achieve state-of-the-art performance on the problem of optical flow estimation. Since optical flow is used in several safety-critical applications like self-driving cars, it is important to gain insights into the robustness of those techniques. Recently, it has been shown that adversarial attacks easily fool deep neural networks to misclassify objects. The robustness of optical flow networks to adversarial attacks, however, has not been studied so far. 
In this paper, we extend adversarial patch attacks to optical flow networks and show that such attacks can compromise their performance. We show that corrupting a small patch of less than 1% of the image size can significantly affect optical flow estimates. Our attacks lead to noisy flow estimates that extend significantly beyond the region of the attack, in many cases even completely erasing the motion of objects in the scene. While networks using an encoder-decoder architecture are very sensitive to these attacks, we found that networks using a spatial pyramid architecture are less affected. We analyse the success and failure of attacking both architectures by visualizing their feature maps and comparing them to classical optical flow techniques which are robust to these attacks. We also demonstrate that such attacks are practical by placing a printed pattern into real scenes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Ranjan_Attacking_Optical_Flow_ICCV_2019_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems\u2020; Max Planck Institute for Intelligent Systems\u2021; Max Planck Institute for Intelligent Systems\u2020; Max Planck Institute for Intelligent Systems\u2020", + "aff": "Max Planck Institute for Intelligent Systems†; Max Planck Institute for Intelligent Systems‡; Max Planck Institute for Intelligent Systems†; Max Planck Institute for Intelligent Systems†", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Ranjan_Attacking_Optical_Flow_ICCV_2019_supplemental.pdf", @@ -2788,7 +2877,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Ranjan_2019_ICCV,\n \n author = {\n Ranjan,\n Anurag and Janai,\n Joel and Geiger,\n Andreas and Black,\n Michael J.\n},\n title = {\n Attacking Optical Flow\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attention Augmented Convolutional Networks", @@ -2821,7 +2911,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bello_2019_ICCV,\n \n author = {\n Bello,\n Irwan and Zoph,\n Barret and Vaswani,\n Ashish and Shlens,\n Jonathon and Le,\n Quoc V.\n},\n title = {\n Attention Augmented Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attention Bridging Network for Knowledge Transfer", @@ -2854,7 +2945,8 @@ "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Kunpeng and Zhang,\n Yulun and Li,\n Kai and Li,\n Yuanyuan and Fu,\n Yun\n},\n title = {\n Attention Bridging Network for Knowledge Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attention on Attention for Image Captioning", @@ -2880,14 +2972,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Huang_Attention_on_Attention_for_Image_Captioning_ICCV_2019_paper.html", "aff_unique_index": "0;0+1+2;0+1;1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Macau University of Science and Technology", - "aff_unique_dep": "School of Electronic and Computer Engineering;Peng Cheng Laboratory;", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Macau University of Science and 
Technology", + "aff_unique_dep": "School of Electronic and Computer Engineering;;", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn;https://www.must.edu.mo", "aff_unique_abbr": "PKU;PCL;MUST", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Macau SAR", "aff_country_unique_index": "0;0+0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Lun and Wang,\n Wenmin and Chen,\n Jie and Wei,\n Xiao-Yong\n},\n title = {\n Attention on Attention for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attention-Aware Polarity Sensitive Embedding for Affective Image Retrieval", @@ -2920,7 +3013,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;1;0;3;0", - "aff_country_unique": "China;United States;;United Kingdom" + "aff_country_unique": "China;United States;;United Kingdom", + "bibtex": "@InProceedings{Yao_2019_ICCV,\n \n author = {\n Yao,\n Xingxu and She,\n Dongyu and Zhao,\n Sicheng and Liang,\n Jie and Lai,\n Yu-Kun and Yang,\n Jufeng\n},\n title = {\n Attention-Aware Polarity Sensitive Embedding for Affective Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attention-Based Autism Spectrum Disorder Screening With Privileged Modality", @@ -2953,7 +3047,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Shi and Zhao,\n Qi\n},\n title = {\n Attention-Based Autism Spectrum Disorder Screening With Privileged Modality\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AttentionRNN: A Structured Spatial Attention Mechanism", @@ -2986,7 +3081,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Khandelwal_2019_ICCV,\n \n author = {\n Khandelwal,\n Siddhesh and Sigal,\n Leonid\n},\n title = {\n AttentionRNN: A Structured Spatial Attention Mechanism\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attentional Feature-Pair Relation Networks for Accurate Face Recognition", @@ -3019,7 +3115,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0+1;1+1;0;1", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Kang_2019_ICCV,\n \n author = {\n Kang,\n Bong-Nam and Kim,\n Yonghyun and Jun,\n Bongjin and Kim,\n Daijin\n},\n title = {\n Attentional Feature-Pair Relation Networks for Accurate Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attentional Neural Fields for Crowd Counting", @@ -3031,7 +3128,7 @@ "author": "Anran Zhang; Lei Yue; Jiayi Shen; Fan Zhu; Xiantong Zhen; Xianbin Cao; Ling Shao", "abstract": "Crowd counting has recently generated huge popularity in computer vision, and is extremely challenging due to the huge scale variations of objects. In this paper, we propose the Attentional Neural Field (ANF) for crowd counting via density estimation. 
Within the encoder-decoder network, we introduce conditional random fields (CRFs) to aggregate multi-scale features, which can build more informative representations. To better model pair-wise potentials in CRFs, we incorperate non-local attention mechanism implemented as inter- and intra-layer attentions to expand the receptive field to the entire image respectively within the same layer and across different layers, which captures long-range dependencies to conquer huge scale variations. The CRFs coupled with the attention mechanism are seamlessly integrated into the encoder-decoder network, establishing an ANF that can be optimized end-to-end by back propagation. We conduct extensive experiments on four public datasets, including ShanghaiTech, WorldEXPO 10, UCF-CC-50 and UCF-QNRF. The results show that our ANF achieves high counting performance, surpassing most previous methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhang_Attentional_Neural_Fields_for_Crowd_Counting_ICCV_2019_paper.pdf", - "aff": "School of Electronic and Information Engineering, Beihang University, Beijing, China; School of Electronic and Information Engineering, Beihang University, Beijing, China; School of Electronic and Information Engineering, Beihang University, Beijing, China; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE; School of Electronic and Information Engineering, Beihang University, Beijing, China + Key Laboratory of Advanced Technology of Near Space Information System (Beihang University), Ministry of Industry and Information Technology of China, Beijing, China + Beijing Advanced Innovation Center for Big Data-Based Precision Medicine, Beijing, China; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE", + "aff": "School of Electronic and Information Engineering, Beihang University, Beijing, China; School of Electronic and Information Engineering, 
Beihang University, Beijing, China; School of Electronic and Information Engineering, Beihang University, Beijing, China; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE; School of Electronic and Information Engineering, Beihang University, Beijing, China + Key Laboratory of Advanced Technology of Near Space Information System (Beihang University), Ministry of Industry and Information Technology of China, Beijing, China + Beijing Advanced Innovation Center for Big Data-Based Precision Medicine, Beijing, China; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE", "project": "", "github": "", "supp": "", @@ -3052,7 +3149,8 @@ "aff_campus_unique_index": "0;0;0;1;1;0+0;1", "aff_campus_unique": "Beijing;Abu Dhabi;", "aff_country_unique_index": "0;0;0;1;1;0+0+0;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Anran and Yue,\n Lei and Shen,\n Jiayi and Zhu,\n Fan and Zhen,\n Xiantong and Cao,\n Xianbin and Shao,\n Ling\n},\n title = {\n Attentional Neural Fields for Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attract or Distract: Exploit the Margin of Open Set", @@ -3085,7 +3183,8 @@ "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Sydney;Pittsburgh;", "aff_country_unique_index": "0;1;0+2;0", - "aff_country_unique": "Australia;United States;China" + "aff_country_unique": "Australia;United States;China", + "bibtex": "@InProceedings{Feng_2019_ICCV,\n \n author = {\n Feng,\n Qianyu and Kang,\n Guoliang and Fan,\n Hehe and Yang,\n Yi\n},\n title = {\n Attract or Distract: Exploit the Margin of Open Set\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attribute Attention for Semantic Disambiguation in Zero-Shot Learning", @@ -3118,7 +3217,8 @@ "aff_campus_unique_index": "0;0+2;0;", "aff_campus_unique": "Hangzhou;;Guangzhou", "aff_country_unique_index": "0+0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Yang and Guo,\n Jishun and Cai,\n Deng and He,\n Xiaofei\n},\n title = {\n Attribute Attention for Semantic Disambiguation in Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attribute Manipulation Generative Adversarial Networks for Fashion Images", @@ -3151,7 +3251,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Ak_2019_ICCV,\n \n author = {\n Ak,\n Kenan E. 
and Lim,\n Joo Hwee and Tham,\n Jo Yew and Kassim,\n Ashraf A.\n},\n title = {\n Attribute Manipulation Generative Adversarial Networks for Fashion Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attribute-Driven Spontaneous Motion in Unpaired Image Translation", @@ -3177,14 +3278,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wu_Attribute-Driven_Spontaneous_Motion_in_Unpaired_Image_Translation_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent;Harbin Institute of Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent;Harbin Institute of Technology", "aff_unique_dep": ";YouTu Lab;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com;http://en.hhit.edu.cn/", "aff_unique_abbr": "CUHK;Tencent;HIT", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Hong Kong SAR;;Shenzhen", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Ruizheng and Tao,\n Xin and Gu,\n Xiaodong and Shen,\n Xiaoyong and Jia,\n Jiaya\n},\n title = {\n Attribute-Driven Spontaneous Motion in Unpaired Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Attributing Fake Images to GANs: Learning and Analyzing GAN Fingerprints", @@ -3217,7 +3319,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0+1;0;1", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Ning and Davis,\n Larry S. 
and Fritz,\n Mario\n},\n title = {\n Attributing Fake Images to GANs: Learning and Analyzing GAN Fingerprints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Auto-FPN: Automatic Network Architecture Adaptation for Object Detection Beyond Classification", @@ -3229,7 +3332,7 @@ "author": "Hang Xu; Lewei Yao; Wei Zhang; Xiaodan Liang; Zhenguo Li", "abstract": "Abstract Neural architecture search (NAS) has shown great potential in automating the manual process of designing a good CNN architecture for image classification. In this paper, we study NAS for object detection, a core computer vision task that classifies and localizes object instances in an image. Existing works focus on transferring the searched architecture from classification task (ImageNet) to the detector backbone, while the rest of the architecture of the detector remains unchanged. However, this pipeline is not task-specific or data-oriented network search which cannot guarantee optimal adaptation to any dataset. Therefore, we propose an architecture search framework named Auto-FPN specifically designed for detection beyond simply searching a classification backbone. Specifically, we propose two auto search modules for detection: Auto-fusion to search a better fusion of the multi-level features; Auto-head to search a better structure for classification and bounding-box(bbox) regression. Instead of searching for one repeatable cell structure, we relax the constraint and allow different cells. The search space of both modules covers many popular designs of detectors and allows efficient gradient-based architecture search with resource constraint (2 days for COCO on 8 GPU cards). Extensive experiments on Pascal VOC, COCO, BDD, VisualGenome and ADE demonstrate the effectiveness of the proposed method, e.g. 
achieving around 5% improvement than FPN in terms of mAP while requiring around 50% fewer parameters on the searched modules.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Xu_Auto-FPN_Automatic_Network_Architecture_Adaptation_for_Object_Detection_Beyond_Classification_ICCV_2019_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Sun Yat-sen University; Huawei Noah\u2019s Ark Lab", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Sun Yat-sen University; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -3244,13 +3347,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_Auto-FPN_Automatic_Network_Architecture_Adaptation_for_Object_Detection_Beyond_Classification_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Huawei;Sun Yat-sen University", - "aff_unique_dep": "Noah\u2019s Ark Lab;", + "aff_unique_dep": "Noah’s Ark Lab;", "aff_unique_url": "https://www.huawei.com;http://www.sysu.edu.cn/", "aff_unique_abbr": "Huawei;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Hang and Yao,\n Lewei and Zhang,\n Wei and Liang,\n Xiaodan and Li,\n Zhenguo\n},\n title = {\n Auto-FPN: Automatic Network Architecture Adaptation for Object Detection Beyond Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Auto-ReID: Searching for a Part-Aware ConvNet for Person Re-Identification", @@ -3283,7 +3387,8 @@ "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+1;0+1;0+1;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + 
"bibtex": "@InProceedings{Quan_2019_ICCV,\n \n author = {\n Quan,\n Ruijie and Dong,\n Xuanyi and Wu,\n Yu and Zhu,\n Linchao and Yang,\n Yi\n},\n title = {\n Auto-ReID: Searching for a Part-Aware ConvNet for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AutoDispNet: Improving Disparity Estimation With AutoML", @@ -3316,7 +3421,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Saikia_2019_ICCV,\n \n author = {\n Saikia,\n Tonmoy and Marrakchi,\n Yassine and Zela,\n Arber and Hutter,\n Frank and Brox,\n Thomas\n},\n title = {\n AutoDispNet: Improving Disparity Estimation With AutoML\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AutoFocus: Efficient Multi-Scale Inference", @@ -3349,7 +3455,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Najibi_2019_ICCV,\n \n author = {\n Najibi,\n Mahyar and Singh,\n Bharat and Davis,\n Larry S.\n},\n title = {\n AutoFocus: Efficient Multi-Scale Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "AutoGAN: Neural Architecture Search for Generative Adversarial Networks", @@ -3382,7 +3489,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gong_2019_ICCV,\n \n 
author = {\n Gong,\n Xinyu and Chang,\n Shiyu and Jiang,\n Yifan and Wang,\n Zhangyang\n},\n title = {\n AutoGAN: Neural Architecture Search for Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Automatic and Robust Skull Registration Based on Discrete Uniformization", @@ -3411,11 +3519,12 @@ "aff_unique_norm": "Qingdao University;Stony Brook University;Dalian University of Technology", "aff_unique_dep": "School of Data Science and Software Engineering;Department of Computer Science;International School of Information Science and Engineering", "aff_unique_url": "https://www.qingdaouni.edu.cn;https://www.stonybrook.edu;http://en.dlut.edu.cn/", - "aff_unique_abbr": ";SBU;DUT", + "aff_unique_abbr": ";SBU;", "aff_campus_unique_index": "0;1;1;2;1", "aff_campus_unique": "Qingdao;Stony Brook;Dalian", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Junli and Qi,\n Xin and Wen,\n Chengfeng and Lei,\n Na and Gu,\n Xianfeng\n},\n title = {\n Automatic and Robust Skull Registration Based on Discrete Uniformization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "BAE-NET: Branched Autoencoder for Shape Co-Segmentation", @@ -3448,7 +3557,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Mumbai", "aff_country_unique_index": "0;0;1;1+2;0", - "aff_country_unique": "Canada;United States;India" + "aff_country_unique": "Canada;United States;India", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Zhiqin and Yin,\n Kangxue and Fisher,\n Matthew and Chaudhuri,\n Siddhartha and Zhang,\n Hao\n},\n title = {\n BAE-NET: Branched 
Autoencoder for Shape Co-Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "BMN: Boundary-Matching Network for Temporal Action Proposal Generation", @@ -3474,14 +3584,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lin_BMN_Boundary-Matching_Network_for_Temporal_Action_Proposal_Generation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Baidu", + "aff_unique_norm": "Baidu Inc.", "aff_unique_dep": "Department of Computer Vision Technology (VIS)", "aff_unique_url": "https://www.baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2019_ICCV,\n \n author = {\n Lin,\n Tianwei and Liu,\n Xiao and Li,\n Xin and Ding,\n Errui and Wen,\n Shilei\n},\n title = {\n BMN: Boundary-Matching Network for Temporal Action Proposal Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Balanced Datasets Are Not Enough: Estimating and Mitigating Gender Bias in Deep Image Representations", @@ -3514,7 +3625,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Tianlu and Zhao,\n Jieyu and Yatskar,\n Mark and Chang,\n Kai-Wei and Ordonez,\n Vicente\n},\n title = {\n Balanced Datasets Are Not Enough: Estimating and Mitigating Gender Bias in Deep Image Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Batch DropBlock Network for Person Re-Identification and Beyond", @@ -3547,7 +3659,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Dai_2019_ICCV,\n \n author = {\n Dai,\n Zuozhuo and Chen,\n Mingqiang and Gu,\n Xiaodong and Zhu,\n Siyu and Tan,\n Ping\n},\n title = {\n Batch DropBlock Network for Person Re-Identification and Beyond\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Batch Weight for Domain Adaptation With Mass Shift", @@ -3555,11 +3668,11 @@ "status": "Poster", "track": "main", "pid": "6579", - "author_site": "Mikolaj Bi\u00c5\u0084kowski, Devon Hjelm, Aaron Courville", + "author_site": "Mikolaj Bińkowski, Devon Hjelm, Aaron Courville", "author": "Mikolaj Binkowski; Devon Hjelm; Aaron Courville", "abstract": "Unsupervised domain transfer is the task of transferring or translating samples from a source distribution to a different target distribution. Current solutions unsupervised domain transfer often operate on data on which the modes of the distribution are well-matched, for instance have the same frequencies of classes between source and target distributions. However, these models do not perform well when the modes are not well-matched, as would be the case when samples are drawn independently from two different, but related, domains. This mode imbalance is problematic as generative adversarial networks (GANs), a successful approach in this setting, are sensitive to mode frequency, which results in a mismatch of semantics between source samples and generated samples of the target distribution. 
We propose a principled method of re-weighting training samples to correct for such mass shift between the transferred distributions, which we call batch weight. We also provide rigorous probabilistic setting for domain transfer and new simplified objective for training transfer networks, an alternative to complex, multi-component loss functions used in the current state-of-the art image-to-image translation models. The new objective stems from the discrimination of joint distributions and enforces cycle-consistency in an abstract, high-level, rather than pixel-wise, sense. Lastly, we experimentally show the effectiveness of the proposed methods in several image-to-image translation tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Binkowski_Batch_Weight_for_Domain_Adaptation_With_Mass_Shift_ICCV_2019_paper.pdf", - "aff": "Mila, Universit\u00e9 de Montr\u00e9al+Imperial College London; Mila, Universit\u00e9 de Montr\u00e9al+Microsoft Research; Mila, Universit\u00e9 de Montr\u00e9al+CIFAR Fellow", + "aff": "Mila, Université de Montréal+Imperial College London; Mila, Université de Montréal+Microsoft Research; Mila, Université de Montréal+CIFAR Fellow", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Binkowski_Batch_Weight_for_ICCV_2019_supplemental.pdf", @@ -3573,14 +3686,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Binkowski_Batch_Weight_for_Domain_Adaptation_With_Mass_Shift_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+2;0+3", - "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Imperial College London;Microsoft;CIFAR", + "aff_unique_norm": "Université de Montréal;Imperial College London;Microsoft Corporation;CIFAR", "aff_unique_dep": "Mila;;Microsoft Research;", "aff_unique_url": "https://umontreal.ca;https://www.imperial.ac.uk;https://www.microsoft.com/en-us/research;https://www.cifar.ca", "aff_unique_abbr": "UdeM;ICL;MSR;CIFAR", 
"aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Montr\u00e9al;", + "aff_campus_unique": "Montréal;", "aff_country_unique_index": "0+1;0+2;0+0", - "aff_country_unique": "Canada;United Kingdom;United States" + "aff_country_unique": "Canada;United Kingdom;United States", + "bibtex": "@InProceedings{Binkowski_2019_ICCV,\n \n author = {\n Binkowski,\n Mikolaj and Hjelm,\n Devon and Courville,\n Aaron\n},\n title = {\n Batch Weight for Domain Adaptation With Mass Shift\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bayes-Factor-VAE: Hierarchical Bayesian Deep Auto-Encoder Models for Factor Disentanglement", @@ -3606,14 +3720,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kim_Bayes-Factor-VAE_Hierarchical_Bayesian_Deep_Auto-Encoder_Models_for_Factor_Disentanglement_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;1;0+1", - "aff_unique_norm": "Samsung;Rutgers University", + "aff_unique_norm": "Samsung AI Center;Rutgers University", "aff_unique_dep": "AI Center;Department of Computer Science", "aff_unique_url": "https://www.samsung.com/global/research-innovation/ai-research/;https://www.rutgers.edu", "aff_unique_abbr": "SAC;Rutgers", "aff_campus_unique_index": "0+1;0+1;1;0+1", "aff_campus_unique": "Cambridge;New Brunswick", "aff_country_unique_index": "0+1;0+1;1;0+1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Kim_2019_ICCV,\n \n author = {\n Kim,\n Minyoung and Wang,\n Yuting and Sahu,\n Pritish and Pavlovic,\n Vladimir\n},\n title = {\n Bayes-Factor-VAE: Hierarchical Bayesian Deep Auto-Encoder Models for Factor Disentanglement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bayesian 
Adaptive Superpixel Segmentation", @@ -3646,7 +3761,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Uziel_2019_ICCV,\n \n author = {\n Uziel,\n Roy and Ronen,\n Meitar and Freifeld,\n Oren\n},\n title = {\n Bayesian Adaptive Superpixel Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bayesian Graph Convolution LSTM for Skeleton Based Action Recognition", @@ -3679,7 +3795,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Rui and Wang,\n Kang and Su,\n Hui and Ji,\n Qiang\n},\n title = {\n Bayesian Graph Convolution LSTM for Skeleton Based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bayesian Loss for Crowd Count Estimation With Point Supervision", @@ -3691,7 +3808,7 @@ "author": "Zhiheng Ma; Xing Wei; Xiaopeng Hong; Yihong Gong", "abstract": "In crowd counting datasets, each person is annotated by a point, which is usually the center of the head. And the task is to estimate the total count in a crowd scene. Most of the state-of-the-art methods are based on density map estimation, which convert the sparse point annotations into a \"ground truth\" density map through a Gaussian kernel, and then use it as the learning target to train a density map estimator. However, such a \"ground-truth\" density map is imperfect due to occlusions, perspective effects, variations in object shapes, etc. 
On the contrary, we propose Bayesian loss, a novel loss function which constructs a density contribution probability model from the point annotations. Instead of constraining the value at every pixel in the density map, the proposed training loss adopts a more reliable supervision on the count expectation at each annotated point. Without bells and whistles, the loss function makes substantial improvements over the baseline loss on all tested datasets. Moreover, our proposed loss function equipped with a standard backbone network, without using any external detectors or multi-scale architectures, plays favourably against the state of the arts. Our method outperforms previous best approaches by a large margin on the latest and largest UCF-QNRF dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Ma_Bayesian_Loss_for_Crowd_Count_Estimation_With_Point_Supervision_ICCV_2019_paper.pdf", - "aff": "Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University; Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University; Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University + Research Center for Arti\ufb01cial Intelligence, Peng Cheng Laborotory; Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University", + "aff": "Faculty of Electronic and Information Engineering, Xi’an Jiaotong University; Faculty of Electronic and Information Engineering, Xi’an Jiaotong University; Faculty of Electronic and Information Engineering, Xi’an Jiaotong University + Research Center for Artificial Intelligence, Peng Cheng Laborotory; Faculty of Electronic and Information Engineering, Xi’an Jiaotong University", "project": "", "github": "https://github.com/ZhihengCV/Baysian-Crowd-Counting", "supp": "", @@ -3705,14 +3822,15 @@ "author_num": 4, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Ma_Bayesian_Loss_for_Crowd_Count_Estimation_With_Point_Supervision_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0", - "aff_unique_norm": "Xi'an Jiao Tong University;Research Center for Arti\ufb01cial Intelligence", - "aff_unique_dep": "Faculty of Electronic and Information Engineering;Arti\ufb01cial Intelligence", + "aff_unique_norm": "Xi'an Jiaotong University;Research Center for Artificial Intelligence", + "aff_unique_dep": "Faculty of Electronic and Information Engineering;Artificial Intelligence", "aff_unique_url": "http://www.xjtu.edu.cn;", "aff_unique_abbr": "XJTU;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2019_ICCV,\n \n author = {\n Ma,\n Zhiheng and Wei,\n Xing and Hong,\n Xiaopeng and Gong,\n Yihong\n},\n title = {\n Bayesian Loss for Crowd Count Estimation With Point Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bayesian Optimized 1-Bit CNNs", @@ -3724,7 +3842,7 @@ "author": "Jiaxin Gu; Junhe Zhao; Xiaolong Jiang; Baochang Zhang; Jianzhuang Liu; Guodong Guo; Rongrong Ji", "abstract": "Deep convolutional neural networks (DCNNs) have dominated the recent developments in computer vision through making various record-breaking models. However, it is still a great challenge to achieve powerful DCNNs in resource-limited environments, such as on embedded devices and smart phones. Researchers have realized that 1-bit CNNs can be one feasible solution to resolve the issue; however, they are baffled by the inferior performance compared to the full-precision DCNNs. 
In this paper, we propose a novel approach, called Bayesian optimized 1-bit CNNs (denoted as BONNs), taking the advantage of Bayesian learning, a well-established strategy for hard problems, to significantly improve the performance of extreme 1-bit CNNs. We incorporate the prior distributions of full-precision kernels and features into the Bayesian framework to construct 1-bit CNNs in an end-to-end manner, which have not been considered in any previous related methods. The Bayesian losses are achieved with a theoretical support to optimize the network simultaneously in both continuous and discrete spaces, aggregating different losses jointly to improve the model capacity. Extensive experiments on the ImageNet and CIFAR datasets show that BONNs achieve the best classification performance compared to state-of-the-art 1-bit CNNs.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Gu_Bayesian_Optimized_1-Bit_CNNs_ICCV_2019_paper.pdf", - "aff": "Beihang University; Beihang University; Beihang University+Institute of Deep Learning, Baidu Research+National Engineering Laboratory for Deep Learning Technology and Application; Beihang University; Huawei Noah\u2019s Ark Lab; Institute of Deep Learning, Baidu Research+National Engineering Laboratory for Deep Learning Technology and Application; School of Information Science and Engineering, Xiamen University+Peng Cheng Lab", + "aff": "Beihang University; Beihang University; Beihang University+Institute of Deep Learning, Baidu Research+National Engineering Laboratory for Deep Learning Technology and Application; Beihang University; Huawei Noah’s Ark Lab; Institute of Deep Learning, Baidu Research+National Engineering Laboratory for Deep Learning Technology and Application; School of Information Science and Engineering, Xiamen University+Peng Cheng Lab", "project": "", "github": "", "supp": "", @@ -3738,14 +3856,15 @@ "author_num": 7, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Gu_Bayesian_Optimized_1-Bit_CNNs_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1+2;0;3;1+2;4+5", - "aff_unique_norm": "Beihang University;Baidu;National Engineering Laboratory for Deep Learning Technology and Application;Huawei;Xiamen University;Pengcheng Laboratory", - "aff_unique_dep": ";Institute of Deep Learning;;Noah\u2019s Ark Lab;School of Information Science and Engineering;Peng Cheng Lab", + "aff_unique_norm": "Beihang University;Baidu Research;National Engineering Laboratory for Deep Learning Technology and Application;Huawei;Xiamen University;Peng Cheng Lab", + "aff_unique_dep": ";Institute of Deep Learning;;Noah’s Ark Lab;School of Information Science and Engineering;", "aff_unique_url": "http://www.buaa.edu.cn/;https://baidu.com;;https://www.huawei.com;https://www.xmu.edu.cn;", "aff_unique_abbr": "BUAA;Baidu;;Huawei;XMU;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0+0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gu_2019_ICCV,\n \n author = {\n Gu,\n Jiaxin and Zhao,\n Junhe and Jiang,\n Xiaolong and Zhang,\n Baochang and Liu,\n Jianzhuang and Guo,\n Guodong and Ji,\n Rongrong\n},\n title = {\n Bayesian Optimized 1-Bit CNNs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bayesian Relational Memory for Semantic Visual Navigation", @@ -3771,14 +3890,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wu_Bayesian_Relational_Memory_for_Semantic_Visual_Navigation_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0;1;1", - "aff_unique_norm": "University of California, Berkeley;Meta;Technion - Israel Institute of Technology", + "aff_unique_norm": "University of California, Berkeley;Facebook;Technion - Israel Institute of Technology", 
"aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.berkeley.edu;https://research.facebook.com;https://www.technion.ac.il/en/", "aff_unique_abbr": "UC Berkeley;FAIR;Technion", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "United States;Israel" + "aff_country_unique": "United States;Israel", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Yi and Wu,\n Yuxin and Tamar,\n Aviv and Russell,\n Stuart and Gkioxari,\n Georgia and Tian,\n Yuandong\n},\n title = {\n Bayesian Relational Memory for Semantic Visual Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Be Your Own Teacher: Improve the Performance of Convolutional Neural Networks via Self Distillation", @@ -3811,7 +3931,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Linfeng and Song,\n Jiebo and Gao,\n Anni and Chen,\n Jingwei and Bao,\n Chenglong and Ma,\n Kaisheng\n},\n title = {\n Be Your Own Teacher: Improve the Performance of Convolutional Neural Networks via Self Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Better and Faster: Exponential Loss for Image Patch Matching", @@ -3844,7 +3965,8 @@ "aff_campus_unique_index": "0;0;0+1;0;0;0;0", "aff_campus_unique": "Shaanxi;Kyoto", "aff_country_unique_index": "0;0;0+1;0;0;0;0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Shuang and Li,\n Yanfeng and Liang,\n Xuefeng and Quan,\n Dou 
and Yang,\n Bowu and Wei,\n Shaowei and Jiao,\n Licheng\n},\n title = {\n Better and Faster: Exponential Loss for Image Patch Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Better to Follow, Follow to Be Better: Towards Precise Supervision of Feature Super-Resolution for Small Object Detection", @@ -3877,7 +3999,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Noh_2019_ICCV,\n \n author = {\n Noh,\n Junhyug and Bae,\n Wonho and Lee,\n Wonhee and Seo,\n Jinhwan and Kim,\n Gunhee\n},\n title = {\n Better to Follow,\n Follow to Be Better: Towards Precise Supervision of Feature Super-Resolution for Small Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Beyond Cartesian Representations for Local Descriptors", @@ -3889,7 +4012,7 @@ "author": "Patrick Ebel; Anastasiia Mishchuk; Kwang Moo Yi; Pascal Fua; Eduard Trulls", "abstract": "The dominant approach for learning local patch descriptors relies on small image regions whose scale must be properly estimated a priori by a keypoint detector. In other words, if two patches are not in correspondence, their descriptors will not match. A strategy often used to alleviate this problem is to \"pool\" the pixel-wise features over log-polar regions, rather than regularly spaced ones. By contrast, we propose to extract the \"support region\" directly with a log-polar sampling scheme. We show that this provides us with a better representation by simultaneously oversampling the immediate neighbourhood of the point and undersampling regions far away from it. 
We demonstrate that this representation is particularly amenable to learning descriptors with deep networks. Our models can match descriptors across a much wider range of scales than was possible before, and also leverage much larger support regions without suffering from occlusions. We report state-of-the-art results on three different datasets", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Ebel_Beyond_Cartesian_Representations_for_Local_Descriptors_ICCV_2019_paper.pdf", - "aff": "Computer Vision Lab, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne; Computer Vision Lab, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne; Visual Computing Group, University of Victoria; Computer Vision Lab, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne; Google Switzerland", + "aff": "Computer Vision Lab, École Polytechnique Fédérale de Lausanne; Computer Vision Lab, École Polytechnique Fédérale de Lausanne; Visual Computing Group, University of Victoria; Computer Vision Lab, École Polytechnique Fédérale de Lausanne; Google Switzerland", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Ebel_Beyond_Cartesian_Representations_ICCV_2019_supplemental.pdf", @@ -3903,14 +4026,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ebel_Beyond_Cartesian_Representations_for_Local_Descriptors_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;2", - "aff_unique_norm": "EPFL;University of Victoria;Google", - "aff_unique_dep": "Computer Vision Lab;Visual Computing Group;Google", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;University of Victoria;Google", + "aff_unique_dep": "Computer Vision Lab;Visual Computing Group;", "aff_unique_url": "https://www.epfl.ch;https://www.uvic.ca;https://www.google.ch", "aff_unique_abbr": "EPFL;UVic;Google CH", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Victoria;Switzerland", "aff_country_unique_index": "0;0;1;0;0", - 
"aff_country_unique": "Switzerland;Canada" + "aff_country_unique": "Switzerland;Canada", + "bibtex": "@InProceedings{Ebel_2019_ICCV,\n \n author = {\n Ebel,\n Patrick and Mishchuk,\n Anastasiia and Yi,\n Kwang Moo and Fua,\n Pascal and Trulls,\n Eduard\n},\n title = {\n Beyond Cartesian Representations for Local Descriptors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Beyond Human Parts: Dual Part-Aligned Representations for Person Re-Identification", @@ -3922,7 +4046,7 @@ "author": "Jianyuan Guo; Yuhui Yuan; Lang Huang; Chao Zhang; Jin-Ge Yao; Kai Han", "abstract": "Person re-identification is a challenging task due to various complex factors. Recent studies have attempted to integrate human parsing results or externally defined attributes to help capture human parts or important object regions. On the other hand, there still exist many useful contextual cues that do not fall into the scope of predefined human parts or attributes. In this paper, we address the missed contextual cues by exploiting both the accurate human parts and the coarse non-human parts. In our implementation, we apply a human parsing model to extract the binary human part masks and a self-attention mechanism to capture the soft latent (non-human) part masks. We verify the effectiveness of our approach with new state-of-the-art performance on three challenging benchmarks: Market-1501, DukeMTMC-reID and CUHK03. 
Our implementation is available at https://github.com/ggjy/P2Net.pytorch.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Guo_Beyond_Human_Parts_Dual_Part-Aligned_Representations_for_Person_Re-Identification_ICCV_2019_paper.pdf", - "aff": "Key Laboratory of Machine Perception (MOE), Peking University; Microsoft Research Asia+Institute of Computing Technology+University of Chinese Academy of Sciences; Key Laboratory of Machine Perception (MOE), Peking University; Key Laboratory of Machine Perception (MOE), Peking University; Microsoft Research Asia; Noah\u2019s Ark Lab", + "aff": "Key Laboratory of Machine Perception (MOE), Peking University; Microsoft Research Asia+Institute of Computing Technology+University of Chinese Academy of Sciences; Key Laboratory of Machine Perception (MOE), Peking University; Key Laboratory of Machine Perception (MOE), Peking University; Microsoft Research Asia; Noah’s Ark Lab", "project": "", "github": "https://github.com/ggjy/P2Net.pytorch", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Guo_Beyond_Human_Parts_ICCV_2019_supplemental.pdf", @@ -3936,14 +4060,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Guo_Beyond_Human_Parts_Dual_Part-Aligned_Representations_for_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;1+2+3;0;0;1;4", - "aff_unique_norm": "Peking University;Microsoft;Institute of Computing Technology;University of Chinese Academy of Sciences;Noah\u2019s Ark Lab", + "aff_unique_norm": "Peking University;Microsoft Research;Institute of Computing Technology;University of Chinese Academy of Sciences;Noah’s Ark Lab", "aff_unique_dep": "Key Laboratory of Machine Perception (MOE);Research;;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia;http://www.ict.ac.cn;http://www.ucas.ac.cn;", "aff_unique_abbr": "PKU;MSR Asia;;UCAS;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", 
"aff_country_unique_index": "0;0+0+0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Guo_2019_ICCV,\n \n author = {\n Guo,\n Jianyuan and Yuan,\n Yuhui and Huang,\n Lang and Zhang,\n Chao and Yao,\n Jin-Ge and Han,\n Kai\n},\n title = {\n Beyond Human Parts: Dual Part-Aligned Representations for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bidirectional One-Shot Unsupervised Domain Mapping", @@ -3969,14 +4094,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cohen_Bidirectional_One-Shot_Unsupervised_Domain_Mapping_ICCV_2019_paper.html", "aff_unique_index": "0;1+0", - "aff_unique_norm": "Tel Aviv University;Meta", + "aff_unique_norm": "Tel Aviv University;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.tau.ac.il;https://research.facebook.com", "aff_unique_abbr": "TAU;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Cohen_2019_ICCV,\n \n author = {\n Cohen,\n Tomer and Wolf,\n Lior\n},\n title = {\n Bidirectional One-Shot Unsupervised Domain Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bilateral Adversarial Training: Towards Fast Training of More Robust Models Against Adversarial Attacks", @@ -4002,14 +4128,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Bilateral_Adversarial_Training_Towards_Fast_Training_of_More_Robust_Models_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Baidu", + "aff_unique_norm": "Baidu Research", 
"aff_unique_dep": "Research", "aff_unique_url": "https://research.baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "0;0", "aff_campus_unique": "USA", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Jianyu and Zhang,\n Haichao\n},\n title = {\n Bilateral Adversarial Training: Towards Fast Training of More Robust Models Against Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bilinear Attention Networks for Person Retrieval", @@ -4042,7 +4169,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Fang_2019_ICCV,\n \n author = {\n Fang,\n Pengfei and Zhou,\n Jieming and Roy,\n Soumava Kumar and Petersson,\n Lars and Harandi,\n Mehrtash\n},\n title = {\n Bilinear Attention Networks for Person Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bit-Flip Attack: Crushing Neural Network With Progressive Bit Search", @@ -4075,7 +4203,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Tempe", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Rakin_2019_ICCV,\n \n author = {\n Rakin,\n Adnan Siraj and He,\n Zhezhi and Fan,\n Deliang\n},\n title = {\n Bit-Flip Attack: Crushing Neural Network With Progressive Bit Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Block 
Annotation: Better Image Annotation With Sub-Image Decomposition", @@ -4099,7 +4228,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lin_Block_Annotation_Better_Image_Annotation_With_Sub-Image_Decomposition_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lin_Block_Annotation_Better_Image_Annotation_With_Sub-Image_Decomposition_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Lin_2019_ICCV,\n \n author = {\n Lin,\n Hubert and Upchurch,\n Paul and Bala,\n Kavita\n},\n title = {\n Block Annotation: Better Image Annotation With Sub-Image Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Boosting Few-Shot Visual Learning With Self-Supervision", @@ -4107,11 +4237,11 @@ "status": "Poster", "track": "main", "pid": "4124", - "author_site": "Spyros Gidaris, Andrei Bursuc, Nikos Komodakis, Patrick P\u00c3\u00a9rez, Matthieu Cord", + "author_site": "Spyros Gidaris, Andrei Bursuc, Nikos Komodakis, Patrick Pérez, Matthieu Cord", "author": "Spyros Gidaris; Andrei Bursuc; Nikos Komodakis; Patrick Perez; Matthieu Cord", "abstract": "Few-shot learning and self-supervised learning address different facets of the same problem: how to train a model with little or no labeled data. Few-shot learning aims for optimization methods and models that can learn efficiently to recognize patterns in the low data regime. Self-supervised learning focuses instead on unlabeled data and looks into it for the supervisory signal to feed high capacity deep neural networks. In this work we exploit the complementarity of these two domains and propose an approach for improving few-shot learning through self-supervision. 
We use self-supervision as an auxiliary task in a few-shot learning pipeline, enabling feature extractors to learn richer and more transferable visual representations while still using few annotated samples. Through self-supervision, our approach can be naturally extended towards using diverse unlabeled data from other datasets in the few-shot setting. We report consistent improvements across an array of architectures, datasets and self-supervision techniques. We provide the implementation code at: https://github.com/valeoai/BF3S", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Gidaris_Boosting_Few-Shot_Visual_Learning_With_Self-Supervision_ICCV_2019_paper.pdf", - "aff": "valeo.ai; valeo.ai; LIGM, Ecole des Pont ParisTech; valeo.ai; valeo.ai+LIGM, Ecole des Pont ParisTech+Sorbonne Universit\u00e9", + "aff": "valeo.ai; valeo.ai; LIGM, Ecole des Pont ParisTech; valeo.ai; valeo.ai+LIGM, Ecole des Pont ParisTech+Sorbonne Université", "project": "", "github": "https://github.com/valeoai/BF3S", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Gidaris_Boosting_Few-Shot_Visual_ICCV_2019_supplemental.pdf", @@ -4125,14 +4255,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gidaris_Boosting_Few-Shot_Visual_Learning_With_Self-Supervision_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;0+1+2", - "aff_unique_norm": "Valeo;Ecole des Ponts ParisTech;Sorbonne Universit\u00e9", + "aff_unique_norm": "Valeo;Ecole des Ponts ParisTech;Sorbonne Université", "aff_unique_dep": ";LIGM;", "aff_unique_url": "https://www.valeo.com;https://www.ponts.org;https://www.sorbonne-universite.fr", "aff_unique_abbr": "Valeo;ENPC;Sorbonne U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Gidaris_2019_ICCV,\n \n author = {\n Gidaris,\n Spyros and Bursuc,\n Andrei and Komodakis,\n Nikos 
and Perez,\n Patrick and Cord,\n Matthieu\n},\n title = {\n Boosting Few-Shot Visual Learning With Self-Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bottleneck Potentials in Markov Random Fields", @@ -4165,7 +4296,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Abbas_2019_ICCV,\n \n author = {\n Abbas,\n Ahmed and Swoboda,\n Paul\n},\n title = {\n Bottleneck Potentials in Markov Random Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Boundary-Aware Feature Propagation for Scene Segmentation", @@ -4198,7 +4330,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1;2", - "aff_country_unique": "Singapore;Switzerland;China" + "aff_country_unique": "Singapore;Switzerland;China", + "bibtex": "@InProceedings{Ding_2019_ICCV,\n \n author = {\n Ding,\n Henghui and Jiang,\n Xudong and Liu,\n Ai Qun and Thalmann,\n Nadia Magnenat and Wang,\n Gang\n},\n title = {\n Boundary-Aware Feature Propagation for Scene Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Boundless: Generative Adversarial Networks for Image Extension", @@ -4231,7 +4364,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Teterwak_2019_ICCV,\n \n author = {\n Teterwak,\n Piotr and Sarna,\n Aaron and Krishnan,\n Dilip and Maschinot,\n Aaron 
and Belanger,\n David and Liu,\n Ce and Freeman,\n William T.\n},\n title = {\n Boundless: Generative Adversarial Networks for Image Extension\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bridging the Domain Gap for Ground-to-Aerial Image Matching", @@ -4264,7 +4398,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Orlando", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Regmi_2019_ICCV,\n \n author = {\n Regmi,\n Krishna and Shah,\n Mubarak\n},\n title = {\n Bridging the Domain Gap for Ground-to-Aerial Image Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Bridging the Gap Between Detection and Tracking: A Unified Approach", @@ -4297,7 +4432,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Lianghua and Zhao,\n Xin and Huang,\n Kaiqi\n},\n title = {\n Bridging the Gap Between Detection and Tracking: A Unified Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Budget-Aware Adapters for Multi-Domain Learning", @@ -4305,7 +4441,7 @@ "status": "Poster", "track": "main", "pid": "4541", - "author_site": "Rodrigo Berriel, St\u00c3\u00a9phane Lathuill\u00c3\u00a8re, Moin Nabi, Tassilo Klein, Thiago Oliveira-Santos, Nicu Sebe, Elisa Ricci", + "author_site": "Rodrigo Berriel, Stéphane Lathuillère, Moin Nabi, Tassilo Klein, Thiago Oliveira-Santos, Nicu Sebe, Elisa Ricci", 
"author": "Rodrigo Berriel; Stephane Lathuillere; Moin Nabi; Tassilo Klein; Thiago Oliveira-Santos; Nicu Sebe; Elisa Ricci", "abstract": "Multi-Domain Learning (MDL) refers to the problem of learning a set of models derived from a common deep architecture, each one specialized to perform a task in a certain domain (e.g., photos, sketches, paintings). This paper tackles MDL with a particular interest in obtaining domain-specific models with an adjustable budget in terms of the number of network parameters and computational complexity. Our intuition is that, as in real applications the number of domains and tasks can be very large, an effective MDL approach should not only focus on accuracy but also on having as few parameters as possible. To implement this idea we derive specialized deep models for each domain by adapting a pre-trained architecture but, differently from other methods, we propose a novel strategy to automatically adjust the computational complexity of the network. To this aim, we introduce Budget-Aware Adapters that select the most relevant feature channels to better handle data from a novel domain. Some constraints on the number of active switches are imposed in order to obtain a network respecting the desired complexity budget. 
Experimentally, we show that our approach leads to recognition accuracy competitive with state-of-the-art approaches but with much lighter networks both in terms of storage and computation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Berriel_Budget-Aware_Adapters_for_Multi-Domain_Learning_ICCV_2019_paper.pdf", @@ -4326,11 +4462,12 @@ "aff_unique_norm": "Loyola College of Art and Design;University of Trento;SAP SE;Fondazione Bruno Kessler", "aff_unique_dep": ";DISI;Machine Learning Research;", "aff_unique_url": "https://www.lcad.loyola.edu;https://www.unitn.it;https://www.sap.com;https://www.fbk.eu", - "aff_unique_abbr": "LCAD;UniTN;SAP;FBK", + "aff_unique_abbr": "LCAD;;SAP;FBK", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;1;1;1", - "aff_country_unique": "United States;Italy;Germany" + "aff_country_unique": "United States;Italy;Germany", + "bibtex": "@InProceedings{Berriel_2019_ICCV,\n \n author = {\n Berriel,\n Rodrigo and Lathuillere,\n Stephane and Nabi,\n Moin and Klein,\n Tassilo and Oliveira-Santos,\n Thiago and Sebe,\n Nicu and Ricci,\n Elisa\n},\n title = {\n Budget-Aware Adapters for Multi-Domain Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "C-MIDN: Coupled Multiple Instance Detection Network With Segmentation Guidance for Weakly Supervised Object Detection", @@ -4358,12 +4495,13 @@ "aff_unique_index": "0+1;0+1;0+1;0+1;1;0+1;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences", "aff_unique_dep": "Institute of Computing Technology;", - "aff_unique_url": "http://www.cas.cn;http://www.ucas.ac.cn", + "aff_unique_url": "http://www.cas.ac.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "CAS;UCAS", "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0;0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": 
"0+0;0+0;0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Yan and Liu,\n Boxiao and Guo,\n Nan and Ye,\n Xiaochun and Wan,\n Fang and You,\n Haihang and Fan,\n Dongrui\n},\n title = {\n C-MIDN: Coupled Multiple Instance Detection Network With Segmentation Guidance for Weakly Supervised Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "C3DPO: Canonical 3D Pose Networks for Non-Rigid Structure From Motion", @@ -4389,14 +4527,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Novotny_C3DPO_Canonical_3D_Pose_Networks_for_Non-Rigid_Structure_From_Motion_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Novotny_2019_ICCV,\n \n author = {\n Novotny,\n David and Ravi,\n Nikhila and Graham,\n Benjamin and Neverova,\n Natalia and Vedaldi,\n Andrea\n},\n title = {\n C3DPO: Canonical 3D Pose Networks for Non-Rigid Structure From Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "46b8e01192", @@ -4425,7 +4564,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Gang and Song,\n Zhigang and Sun,\n Zhuo and Ku,\n Calvin 
and Yang,\n Zhe and Liu,\n Cancheng and Wang,\n Shuhao and Ma,\n Jianpeng and Xu,\n Wei\n},\n title = {\n CAMEL: A Weakly Supervised Learning Framework for Histopathology Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CAMP: Cross-Modal Adaptive Message Passing for Text-Image Retrieval", @@ -4451,14 +4591,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_CAMP_Cross-Modal_Adaptive_Message_Passing_for_Text-Image_Retrieval_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;2;0;2", - "aff_unique_norm": "Chinese University of Hong Kong;Beihang University;SenseTime", + "aff_unique_norm": "The Chinese University of Hong Kong;Beihang University;SenseTime", "aff_unique_dep": "CUHK-SenseTime Joint Lab;;SenseTime Research", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.buaa.edu.cn/;https://www.sensetime.com", "aff_unique_abbr": "CUHK;BUAA;SenseTime", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Zihao and Liu,\n Xihui and Li,\n Hongsheng and Sheng,\n Lu and Yan,\n Junjie and Wang,\n Xiaogang and Shao,\n Jing\n},\n title = {\n CAMP: Cross-Modal Adaptive Message Passing for Text-Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CARAFE: Content-Aware ReAssembly of FEatures", @@ -4484,14 +4625,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_CARAFE_Content-Aware_ReAssembly_of_FEatures_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;Nanyang 
Technological University", + "aff_unique_norm": "The Chinese University of Hong Kong;Nanyang Technological University", "aff_unique_dep": "CUHK - SenseTime Joint Lab;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;NTU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Jiaqi and Chen,\n Kai and Xu,\n Rui and Liu,\n Ziwei and Loy,\n Chen Change and Lin,\n Dahua\n},\n title = {\n CARAFE: Content-Aware ReAssembly of FEatures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CCNet: Criss-Cross Attention for Semantic Segmentation", @@ -4517,14 +4659,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Huang_CCNet_Criss-Cross_Attention_for_Semantic_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;2+3;0", - "aff_unique_norm": "Huazhong University of Science and Technology;Horizon Robotics;University of Technology Sydney;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Huazhong University of Science and Technology;Horizon Robotics;University of Technology, Sydney;University of Illinois at Urbana-Champaign", "aff_unique_dep": "School of EIC;;ReLER;Beckman Institute", "aff_unique_url": "http://www.hust.edu.cn;https://www.horizon-robotics.com/;https://www.uts.edu.au;https://www.illinois.edu", "aff_unique_abbr": "HUST;Horizon Robotics;UTS;UIUC", "aff_campus_unique_index": "1+2", "aff_campus_unique": ";Sydney;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;1+2;0", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n 
\n author = {\n Huang,\n Zilong and Wang,\n Xinggang and Huang,\n Lichao and Huang,\n Chang and Wei,\n Yunchao and Liu,\n Wenyu\n},\n title = {\n CCNet: Criss-Cross Attention for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CDPN: Coordinates-Based Disentangled Pose Network for Real-Time RGB-Based 6-DoF Object Pose Estimation", @@ -4557,7 +4700,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Zhigang and Wang,\n Gu and Ji,\n Xiangyang\n},\n title = {\n CDPN: Coordinates-Based Disentangled Pose Network for Real-Time RGB-Based 6-DoF Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CDTB: A Color and Depth Visual Object Tracking Dataset and Benchmark", @@ -4565,7 +4709,7 @@ "status": "Poster", "track": "main", "pid": "1743", - "author_site": "Alan Luke\u00c5\u00bei\u00c4\u008d, Ugur Kart, Jani K\u00c3\u00a4pyl\u00c3\u00a4, Ahmed Durmush, Joni-Kristian K\u00c3\u00a4m\u00c3\u00a4r\u00c3\u00a4inen, Ji\u00c5\u0099\u00c3\u00ad Matas, Matej Kristan", + "author_site": "Alan Lukežič, Ugur Kart, Jani Käpylä, Ahmed Durmush, Joni-Kristian Kämäräinen, Jiří Matas, Matej Kristan", "author": "Alan Lukezic; Ugur Kart; Jani Kapyla; Ahmed Durmush; Joni-Kristian Kamarainen; Jiri Matas; Matej Kristan", "abstract": "We propose a new color-and-depth general visual object tracking benchmark (CDTB). CDTB is recorded by several passive and active RGB-D setups and contains indoor as well as outdoor sequences acquired in direct sunlight. 
The CDTB dataset is the largest and most diverse dataset in RGB-D tracking, with an order of magnitude larger number of frames than related datasets. The sequences have been carefully recorded to contain significant object pose change, clutter, occlusion, and periods of long-term target absence to enable tracker evaluation under realistic conditions. Sequences are per-frame annotated with 13 visual attributes for detailed analysis. Experiments with RGB and RGB-D trackers show that CDTB is more challenging than previous datasets. State-of-the-art RGB trackers outperform the recent RGB-D trackers, indicating a large gap between the two fields, which has not been previously detected by the prior benchmarks. Based on the results of the analysis we point out opportunities for future research in RGB-D tracker design.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Lukezic_CDTB_A_Color_and_Depth_Visual_Object_Tracking_Dataset_and_ICCV_2019_paper.pdf", @@ -4590,7 +4734,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;1;1;1;1;2;0", - "aff_country_unique": "Slovenia;Finland;Czech Republic" + "aff_country_unique": "Slovenia;Finland;Czech Republic", + "bibtex": "@InProceedings{Lukezic_2019_ICCV,\n \n author = {\n Lukezic,\n Alan and Kart,\n Ugur and Kapyla,\n Jani and Durmush,\n Ahmed and Kamarainen,\n Joni-Kristian and Matas,\n Jiri and Kristan,\n Matej\n},\n title = {\n CDTB: A Color and Depth Visual Object Tracking Dataset and Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CFSNet: Toward a Controllable Feature Space for Image Restoration", @@ -4623,7 +4768,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": 
"@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Wei and Guo,\n Ruiming and Tian,\n Yapeng and Yang,\n Wenming\n},\n title = {\n CFSNet: Toward a Controllable Feature Space for Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CIIDefence: Defeating Adversarial Attacks by Fusing Class-Specific Image Inpainting and Image Denoising", @@ -4656,7 +4802,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Indore;", "aff_country_unique_index": "0;1", - "aff_country_unique": "India;Finland" + "aff_country_unique": "India;Finland", + "bibtex": "@InProceedings{Gupta_2019_ICCV,\n \n author = {\n Gupta,\n Puneet and Rahtu,\n Esa\n},\n title = {\n CIIDefence: Defeating Adversarial Attacks by Fusing Class-Specific Image Inpainting and Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "COCO-GAN: Generation by Parts via Conditional Coordinating", @@ -4689,7 +4836,8 @@ "aff_campus_unique_index": "0;0;0;1;1;0", "aff_campus_unique": "Taiwan;Mountain View", "aff_country_unique_index": "0;0;0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Lin_2019_ICCV,\n \n author = {\n Lin,\n Chieh Hubert and Chang,\n Chia-Che and Chen,\n Yu-Sheng and Juan,\n Da-Cheng and Wei,\n Wei and Chen,\n Hwann-Tzong\n},\n title = {\n COCO-GAN: Generation by Parts via Conditional Coordinating\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Calibration Wizard: A Guidance System for Camera Calibration Based on Modelling Geometric and Corner Uncertainty", @@ -4701,7 +4849,7 @@ "author": "Songyou Peng; Peter Sturm", 
"abstract": "It is well known that the accuracy of a calibration depends strongly on the choice of camera poses from which images of a calibration object are acquired. We present a system -- Calibration Wizard -- that interactively guides a user towards taking optimal calibration images. For each new image to be taken, the system computes, from all previously acquired images, the pose that leads to the globally maximum reduction of expected uncertainty on intrinsic parameters and then guides the user towards that pose. We also show how to incorporate uncertainty in corner point position in a novel principled manner, for both, calibration and computation of the next best pose. Synthetic and real-world experiments are performed to demonstrate the effectiveness of Calibration Wizard.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Peng_Calibration_Wizard_A_Guidance_System_for_Camera_Calibration_Based_on_ICCV_2019_paper.pdf", - "aff": "ETH Zurich + INRIA Grenoble \u2013 Rh\u00f4ne-Alpes; INRIA Grenoble \u2013 Rh\u00f4ne-Alpes", + "aff": "ETH Zurich + INRIA Grenoble – Rhône-Alpes; INRIA Grenoble – Rhône-Alpes", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Peng_Calibration_Wizard_A_ICCV_2019_supplemental.pdf", @@ -4722,7 +4870,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Grenoble", "aff_country_unique_index": "0+1;1", - "aff_country_unique": "Switzerland;France" + "aff_country_unique": "Switzerland;France", + "bibtex": "@InProceedings{Peng_2019_ICCV,\n \n author = {\n Peng,\n Songyou and Sturm,\n Peter\n},\n title = {\n Calibration Wizard: A Guidance System for Camera Calibration Based on Modelling Geometric and Corner Uncertainty\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Calibration of Axial Fisheye Cameras Through Generic Virtual Central Models", @@ 
-4730,11 +4879,11 @@ "status": "Poster", "track": "main", "pid": "5053", - "author_site": "Pierre-Andr\u00c3\u00a9 Brousseau, S\u00c3\u00a9bastien Roy", + "author_site": "Pierre-André Brousseau, Sébastien Roy", "author": "Pierre-Andre Brousseau; Sebastien Roy", "abstract": "Fisheye cameras are notoriously hard to calibrate using traditional plane-based methods. This paper proposes a new calibration method for large field of view cameras. Similarly to planar calibration, it relies on multiple images of a planar calibration grid with dense correspondences, typically obtained using structured light. By relying on the grids themselves instead of the distorted image plane, we can build a rectilinear Generic Virtual Central (GVC) camera. Instead of relying on a single GVC camera, our method proposes a selection of multiple GVC cameras which can cover any field of view and be trivially aligned to provide a very accurate generic central model. We demonstrate that this approach can directly model axial cameras, assuming the distortion center is located on the camera axis. Experimental validation is provided on both synthetic and real fisheye cameras featuring up to a 280deg field of view. 
To our knowledge, this is one of the only practical methods to calibrate axial cameras.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Brousseau_Calibration_of_Axial_Fisheye_Cameras_Through_Generic_Virtual_Central_Models_ICCV_2019_paper.pdf", - "aff": "D\u00e9partement d\u2019informatique et de recherche op\u00e9rationnelle, Universit\u00e9 de Montr\u00e9al; D\u00e9partement d\u2019informatique et de recherche op\u00e9rationnelle, Universit\u00e9 de Montr\u00e9al", + "aff": "Département d’informatique et de recherche opérationnelle, Université de Montréal; Département d’informatique et de recherche opérationnelle, Université de Montréal", "project": "", "github": "", "supp": "", @@ -4748,14 +4897,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Brousseau_Calibration_of_Axial_Fisheye_Cameras_Through_Generic_Virtual_Central_Models_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al", - "aff_unique_dep": "D\u00e9partement d\u2019informatique et de recherche op\u00e9rationnelle", + "aff_unique_norm": "Université de Montréal", + "aff_unique_dep": "Département d’informatique et de recherche opérationnelle", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UdeM", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "Montr\u00e9al", + "aff_campus_unique": "Montréal", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Brousseau_2019_ICCV,\n \n author = {\n Brousseau,\n Pierre-Andre and Roy,\n Sebastien\n},\n title = {\n Calibration of Axial Fisheye Cameras Through Generic Virtual Central Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CamNet: Coarse-to-Fine Retrieval for Camera Re-Localization", @@ -4781,14 +4931,15 @@ "author_num": 5, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Ding_CamNet_Coarse-to-Fine_Retrieval_for_Camera_Re-Localization_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;1;0", - "aff_unique_norm": "University of Hong Kong;SenseTime;Chinese University of Hong Kong", + "aff_unique_norm": "The University of Hong Kong;SenseTime;The Chinese University of Hong Kong", "aff_unique_dep": ";SenseTime Research;", "aff_unique_url": "https://www.hku.hk;https://www.sensetime.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "HKU;SenseTime;CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ding_2019_ICCV,\n \n author = {\n Ding,\n Mingyu and Wang,\n Zhe and Sun,\n Jiankai and Shi,\n Jianping and Luo,\n Ping\n},\n title = {\n CamNet: Coarse-to-Fine Retrieval for Camera Re-Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Camera Distance-Aware Top-Down Approach for 3D Multi-Person Pose Estimation From a Single RGB Image", @@ -4821,7 +4972,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Moon_2019_ICCV,\n \n author = {\n Moon,\n Gyeongsik and Chang,\n Ju Yong and Lee,\n Kyoung Mu\n},\n title = {\n Camera Distance-Aware Top-Down Approach for 3D Multi-Person Pose Estimation From a Single RGB Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Canonical Surface Mapping via Geometric Cycle Consistency", @@ -4847,14 +4999,15 @@ "author_num": 3, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Kulkarni_Canonical_Surface_Mapping_via_Geometric_Cycle_Consistency_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;1", - "aff_unique_norm": "Carnegie Mellon University;Meta", + "aff_unique_norm": "Carnegie Mellon University;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.cmu.edu;https://research.facebook.com", "aff_unique_abbr": "CMU;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kulkarni_2019_ICCV,\n \n author = {\n Kulkarni,\n Nilesh and Gupta,\n Abhinav and Tulsiani,\n Shubham\n},\n title = {\n Canonical Surface Mapping via Geometric Cycle Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cap2Det: Learning to Amplify Weak Caption Supervision for Object Detection", @@ -4887,7 +5040,8 @@ "aff_campus_unique_index": "0;0;0;1;1;1", "aff_campus_unique": "Pittsburgh;Zurich", "aff_country_unique_index": "0;0;0;1;1;1", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Ye_2019_ICCV,\n \n author = {\n Ye,\n Keren and Zhang,\n Mingda and Kovashka,\n Adriana and Li,\n Wei and Qin,\n Danfeng and Berent,\n Jesse\n},\n title = {\n Cap2Det: Learning to Amplify Weak Caption Supervision for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CapsuleVOS: Semi-Supervised Video Object Segmentation Using Capsule Routing", @@ -4920,7 +5074,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + 
"bibtex": "@InProceedings{Duarte_2019_ICCV,\n \n author = {\n Duarte,\n Kevin and Rawat,\n Yogesh S. and Shah,\n Mubarak\n},\n title = {\n CapsuleVOS: Semi-Supervised Video Object Segmentation Using Capsule Routing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cascaded Context Pyramid for Full-Resolution 3D Semantic Scene Completion", @@ -4948,12 +5103,13 @@ "aff_unique_index": "0;1;2;0+3;3", "aff_unique_norm": "Dalian University of Technology;University of Adelaide;Sichuan University;China Science IntelliCloud Technology Co., Ltd.", "aff_unique_dep": ";;;", - "aff_unique_url": "http://www.dlut.edu.cn/;https://www.adelaide.edu.au;https://www.scu.edu.cn;", - "aff_unique_abbr": "DUT;Adelaide;SCU;", + "aff_unique_url": "http://www.dlut.edu.cn/;https://www.adelaide.edu.au;https://www.scu.edu.cn;https://www.csicloud.com", + "aff_unique_abbr": "DUT;Adelaide;SCU;CSIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0+0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Pingping and Liu,\n Wei and Lei,\n Yinjie and Lu,\n Huchuan and Yang,\n Xiaoyun\n},\n title = {\n Cascaded Context Pyramid for Full-Resolution 3D Semantic Scene Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cascaded Parallel Filtering for Memory-Efficient Image-Based Localization", @@ -4986,7 +5142,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Cheng_2019_ICCV,\n \n author = {\n Cheng,\n Wentao and Lin,\n Weisi and Chen,\n Kan and 
Zhang,\n Xinfeng\n},\n title = {\n Cascaded Parallel Filtering for Memory-Efficient Image-Based Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CenterNet: Keypoint Triplets for Object Detection", @@ -4998,7 +5155,7 @@ "author": "Kaiwen Duan; Song Bai; Lingxi Xie; Honggang Qi; Qingming Huang; Qi Tian", "abstract": "In object detection, keypoint-based approaches often experience the drawback of a large number of incorrect object bounding boxes, arguably due to the lack of an additional assessment inside cropped regions. This paper presents an efficient solution that explores the visual patterns within individual cropped regions with minimal costs. We build our framework upon a representative one-stage keypoint-based detector named CornerNet. Our approach, named CenterNet, detects each object as a triplet, rather than a pair, of keypoints, which improves both precision and recall. Accordingly, we design two customized modules, cascade corner pooling, and center pooling, that enrich information collected by both the top-left and bottom-right corners and provide more recognizable information from the central regions. On the MS-COCO dataset, CenterNet achieves an AP of 47.0 %, outperforming all existing one-stage detectors by at least 4.9%. Furthermore, with a faster inference speed than the top-ranked two-stage detectors, CenterNet demonstrates a comparable performance to these detectors. 
Code is available at https://github.com/Duankaiwen/CenterNet.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Duan_CenterNet_Keypoint_Triplets_for_Object_Detection_ICCV_2019_paper.pdf", - "aff": "University of Chinese Academy of Sciences; Huazhong University of Science and Technology; Huawei Noah\u2019s Ark Lab; University of Chinese Academy of Sciences + Key Laboratory of Big Data Mining and Knowledge Management, UCAS + Peng Cheng Laboratory; University of Chinese Academy of Sciences + Key Laboratory of Big Data Mining and Knowledge Management, UCAS + Peng Cheng Laboratory; Huawei Noah\u2019s Ark Lab", + "aff": "University of Chinese Academy of Sciences; Huazhong University of Science and Technology; Huawei Noah’s Ark Lab; University of Chinese Academy of Sciences + Key Laboratory of Big Data Mining and Knowledge Management, UCAS + Peng Cheng Laboratory; University of Chinese Academy of Sciences + Key Laboratory of Big Data Mining and Knowledge Management, UCAS + Peng Cheng Laboratory; Huawei Noah’s Ark Lab", "project": "", "github": "https://github.com/Duankaiwen/CenterNet", "supp": "", @@ -5012,14 +5169,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Duan_CenterNet_Keypoint_Triplets_for_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0+0+3;0+0+3;2", - "aff_unique_norm": "University of Chinese Academy of Sciences;Huazhong University of Science and Technology;Huawei;Pengcheng Laboratory", - "aff_unique_dep": ";;Noah\u2019s Ark Lab;Peng Cheng Laboratory", + "aff_unique_norm": "University of Chinese Academy of Sciences;Huazhong University of Science and Technology;Huawei;Peng Cheng Laboratory", + "aff_unique_dep": ";;Noah’s Ark Lab;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.hust.edu.cn;https://www.huawei.com;http://www.pcl.ac.cn", "aff_unique_abbr": "UCAS;HUST;Huawei;PCL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0+0;0+0+0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Duan_2019_ICCV,\n \n author = {\n Duan,\n Kaiwen and Bai,\n Song and Xie,\n Lingxi and Qi,\n Honggang and Huang,\n Qingming and Tian,\n Qi\n},\n title = {\n CenterNet: Keypoint Triplets for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Chinese Street View Text: Large-Scale Chinese Text Reading With Partially Supervised Learning", @@ -5045,14 +5203,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sun_Chinese_Street_View_Text_Large-Scale_Chinese_Text_Reading_With_Partially_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;0;0", - "aff_unique_norm": "Baidu;University of Hong Kong", + "aff_unique_norm": "Baidu Inc.;The University of Hong Kong", "aff_unique_dep": "Department of Computer Vision Technology (VIS);Department of Computer Science", "aff_unique_url": "https://www.baidu.com;https://www.hku.hk", "aff_unique_abbr": "Baidu;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Yipeng and Liu,\n Jiaming and Liu,\n Wei and Han,\n Junyu and Ding,\n Errui and Liu,\n Jingtuo\n},\n title = {\n Chinese Street View Text: Large-Scale Chinese Text Reading With Partially Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Closed-Form Optimal Two-View Triangulation Based on Angular Errors", @@ -5085,7 +5244,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": 
"@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Seong Hun and Civera,\n Javier\n},\n title = {\n Closed-Form Optimal Two-View Triangulation Based on Angular Errors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ClothFlow: A Flow-Based Model for Clothed Person Generation", @@ -5118,7 +5278,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2019_ICCV,\n \n author = {\n Han,\n Xintong and Hu,\n Xiaojun and Huang,\n Weilin and Scott,\n Matthew R.\n},\n title = {\n ClothFlow: A Flow-Based Model for Clothed Person Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cluster Alignment With a Teacher for Unsupervised Domain Adaptation", @@ -5151,7 +5312,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Deng_2019_ICCV,\n \n author = {\n Deng,\n Zhijie and Luo,\n Yucen and Zhu,\n Jun\n},\n title = {\n Cluster Alignment With a Teacher for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ClusterSLAM: A SLAM Backend for Simultaneous Rigid Body Clustering and Motion Estimation", @@ -5184,7 +5346,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0+0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n 
Jiahui and Yang,\n Sheng and Zhao,\n Zishuo and Lai,\n Yu-Kun and Hu,\n Shi-Min\n},\n title = {\n ClusterSLAM: A SLAM Backend for Simultaneous Rigid Body Clustering and Motion Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Clustered Object Detection in Aerial Images", @@ -5217,7 +5380,8 @@ "aff_campus_unique_index": "0;0;0;2+0", "aff_campus_unique": "Philadelphia;;Stony Brook", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Fan and Fan,\n Heng and Chu,\n Peng and Blasch,\n Erik and Ling,\n Haibin\n},\n title = {\n Clustered Object Detection in Aerial Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Co-Evolutionary Compression for Unpaired Image Translation", @@ -5229,7 +5393,7 @@ "author": "Han Shu; Yunhe Wang; Xu Jia; Kai Han; Hanting Chen; Chunjing Xu; Qi Tian; Chang Xu", "abstract": "Generative adversarial networks (GANs) have been successfully used for considerable computer vision tasks, especially the image-to-image translation. However, generators in these networks are of complicated architectures with large number of parameters and huge computational complexities. Existing methods are mainly designed for compressing and speeding-up deep neural networks in the classification task, and cannot be directly applied on GANs for image translation, due to their different objectives and training procedures. To this end, we develop a novel co-evolutionary approach for reducing their memory usage and FLOPs simultaneously. 
In practice, generators for two image domains are encoded as two populations and synergistically optimized for investigating the most important convolution filters iteratively. Fitness of each individual is calculated using the number of parameters, a discriminator-aware regularization, and the cycle consistency. Extensive experiments conducted on benchmark datasets demonstrate the effectiveness of the proposed method for obtaining compact and effective generators.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Shu_Co-Evolutionary_Compression_for_Unpaired_Image_Translation_ICCV_2019_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Key Lab of Machine Perception (MOE), CMIC, School of EECS, Peking University, China; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; School of Computer Science, Faculty of Engineering, The University of Sydney, Australia", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Key Lab of Machine Perception (MOE), CMIC, School of EECS, Peking University, China; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; School of Computer Science, Faculty of Engineering, The University of Sydney, Australia", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Shu_Co-Evolutionary_Compression_for_ICCV_2019_supplemental.pdf", @@ -5243,14 +5407,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shu_Co-Evolutionary_Compression_for_Unpaired_Image_Translation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;1;0;0;2", - "aff_unique_norm": "Huawei;Peking University;University of Sydney", - "aff_unique_dep": "Noah\u2019s Ark Lab;School of EECS;School of Computer Science", + "aff_unique_norm": "Huawei;Peking University;The University of Sydney", + "aff_unique_dep": "Noah’s Ark Lab;School of EECS;School of Computer Science", 
"aff_unique_url": "https://www.huawei.com;http://www.pku.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "Huawei;Peking U;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Shu_2019_ICCV,\n \n author = {\n Shu,\n Han and Wang,\n Yunhe and Jia,\n Xu and Han,\n Kai and Chen,\n Hanting and Xu,\n Chunjing and Tian,\n Qi and Xu,\n Chang\n},\n title = {\n Co-Evolutionary Compression for Unpaired Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Co-Mining: Deep Face Recognition With Noisy Labels", @@ -5274,7 +5439,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Co-Mining_Deep_Face_Recognition_With_Noisy_Labels_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Co-Mining_Deep_Face_Recognition_With_Noisy_Labels_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Xiaobo and Wang,\n Shuo and Wang,\n Jun and Shi,\n Hailin and Mei,\n Tao\n},\n title = {\n Co-Mining: Deep Face Recognition With Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Co-Segmentation Inspired Attention Networks for Video-Based Person Re-Identification", @@ -5307,7 +5473,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Subramaniam_2019_ICCV,\n \n author = {\n Subramaniam,\n Arulkumar and Nambiar,\n Athira and Mittal,\n Anurag\n},\n title = {\n 
Co-Segmentation Inspired Attention Networks for Video-Based Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Co-Separating Sounds of Visual Objects", @@ -5333,14 +5500,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gao_Co-Separating_Sounds_of_Visual_Objects_ICCV_2019_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "University of Texas at Austin;Meta", + "aff_unique_norm": "University of Texas at Austin;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.utexas.edu;https://research.facebook.com", "aff_unique_abbr": "UT Austin;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Ruohan and Grauman,\n Kristen\n},\n title = {\n Co-Separating Sounds of Visual Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Coherent Semantic Attention for Image Inpainting", @@ -5373,7 +5541,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Hongyu and Jiang,\n Bin and Xiao,\n Yi and Yang,\n Chao\n},\n title = {\n Coherent Semantic Attention for Image Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Collect and Select: Semantic Alignment Metric Learning for Few-Shot Learning", @@ -5385,7 +5554,7 @@ "author": "Fusheng 
Hao; Fengxiang He; Jun Cheng; Lei Wang; Jianzhong Cao; Dacheng Tao", "abstract": "Few-shot learning aims to learn latent patterns from few training examples and has shown promises in practice. However, directly calculating the distances between the query image and support image in existing methods may cause ambiguity because dominant objects can locate anywhere on images. To address this issue, this paper proposes a Semantic Alignment Metric Learning (SAML) method for few-shot learning that aligns the semantically relevant dominant objects through a \"collect-and-select\" strategy. Specifically, we first calculate a relation matrix (RM) to \"collect\" the distances of each local region pairs of the 3D tensor extracted from a query image and the mean tensor of the support images. Then, the attention technique is adapted to \"select\" the semantically relevant pairs and put more weights on them. Afterwards, a multi-layer perceptron (MLP) is utilized to map the reweighted RMs to their corresponding similarity scores. Theoretical analysis demonstrates the generalization ability of SAML and gives a theoretical guarantee. Empirical results demonstrate that semantic alignment is achieved. Extensive experiments on benchmark datasets validate the strengths of the proposed approach and demonstrate that SAML significantly outperforms the current state-of-the-art methods. 
The source code is available at https://github.com/haofusheng/SAML.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Hao_Collect_and_Select_Semantic_Alignment_Metric_Learning_for_Few-Shot_Learning_ICCV_2019_paper.pdf", - "aff": "CAS Key Laboratory of Human-Machine Intelligence-Synergy Systems, Shenzhen Institutes of Advanced Technology, CAS, China + The Chinese University of Hong Kong, Hong Kong, China; UBTECH Sydney AI Centre, School of Computer Science, Faculty of Engineering, The University of Sydney, Darlington, NSW 2008, Australia; CAS Key Laboratory of Human-Machine Intelligence-Synergy Systems, Shenzhen Institutes of Advanced Technology, CAS, China + The Chinese University of Hong Kong, Hong Kong, China; CAS Key Laboratory of Human-Machine Intelligence-Synergy Systems, Shenzhen Institutes of Advanced Technology, CAS, China + The Chinese University of Hong Kong, Hong Kong, China; Xi\u2019an Institute of Optics and Precision Mechanics, CAS, China; UBTECH Sydney AI Centre, School of Computer Science, Faculty of Engineering, The University of Sydney, Darlington, NSW 2008, Australia", + "aff": "CAS Key Laboratory of Human-Machine Intelligence-Synergy Systems, Shenzhen Institutes of Advanced Technology, CAS, China + The Chinese University of Hong Kong, Hong Kong, China; UBTECH Sydney AI Centre, School of Computer Science, Faculty of Engineering, The University of Sydney, Darlington, NSW 2008, Australia; CAS Key Laboratory of Human-Machine Intelligence-Synergy Systems, Shenzhen Institutes of Advanced Technology, CAS, China + The Chinese University of Hong Kong, Hong Kong, China; CAS Key Laboratory of Human-Machine Intelligence-Synergy Systems, Shenzhen Institutes of Advanced Technology, CAS, China + The Chinese University of Hong Kong, Hong Kong, China; Xi’an Institute of Optics and Precision Mechanics, CAS, China; UBTECH Sydney AI Centre, School of Computer Science, Faculty of Engineering, The University of Sydney, Darlington, NSW 2008, Australia", 
"project": "", "github": "https://github.com/haofusheng/SAML", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Hao_Collect_and_Select_ICCV_2019_supplemental.pdf", @@ -5399,14 +5568,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hao_Collect_and_Select_Semantic_Alignment_Metric_Learning_for_Few-Shot_Learning_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;0+1;0+1;3;2", - "aff_unique_norm": "Chinese Academy of Sciences;Chinese University of Hong Kong;University of Sydney;Xi'an Institute of Optics and Precision Mechanics", + "aff_unique_norm": "Chinese Academy of Sciences;The Chinese University of Hong Kong;The University of Sydney;Xi'an Institute of Optics and Precision Mechanics", "aff_unique_dep": "Key Laboratory of Human-Machine Intelligence-Synergy Systems;;School of Computer Science;", "aff_unique_url": "http://www.cas.cn;https://www.cuhk.edu.hk;https://www.sydney.edu.au;http://www.xiopia.ac.cn", "aff_unique_abbr": "CAS;CUHK;USYD;XIOPM", "aff_campus_unique_index": "0+1;2;0+1;0+1;2", "aff_campus_unique": "Shenzhen;Hong Kong;Darlington;", "aff_country_unique_index": "0+0;1;0+0;0+0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Hao_2019_ICCV,\n \n author = {\n Hao,\n Fusheng and He,\n Fengxiang and Cheng,\n Jun and Wang,\n Lei and Cao,\n Jianzhong and Tao,\n Dacheng\n},\n title = {\n Collect and Select: Semantic Alignment Metric Learning for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Compact Trilinear Interaction for Visual Question Answering", @@ -5439,7 +5609,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0;0;0;0", - "aff_country_unique": "Singapore;United Kingdom" + "aff_country_unique": "Singapore;United Kingdom", + "bibtex": 
"@InProceedings{Do_2019_ICCV,\n \n author = {\n Do,\n Tuong and Do,\n Thanh-Toan and Tran,\n Huy and Tjiputra,\n Erman and Tran,\n Quang D.\n},\n title = {\n Compact Trilinear Interaction for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CompenNet++: End-to-End Full Projector Compensation", @@ -5472,7 +5643,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Bingyao and Ling,\n Haibin\n},\n title = {\n CompenNet++: End-to-End Full Projector Compensation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CompoNet: Learning to Generate the Unseen by Part Synthesis and Composition", @@ -5505,7 +5677,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Israel;Canada" + "aff_country_unique": "Israel;Canada", + "bibtex": "@InProceedings{Schor_2019_ICCV,\n \n author = {\n Schor,\n Nadav and Katzir,\n Oren and Zhang,\n Hao and Cohen-Or,\n Daniel\n},\n title = {\n CompoNet: Learning to Generate the Unseen by Part Synthesis and Composition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Composite Shape Modeling via Latent Space Factorization", @@ -5513,7 +5686,7 @@ "status": "Poster", "track": "main", "pid": "827", - "author_site": "Anastasia Dubrovina, Fei Xia, Panos Achlioptas, Mira Shalah, Rapha\u00c3\u00abl Groscot, Leonidas J. 
Guibas", + "author_site": "Anastasia Dubrovina, Fei Xia, Panos Achlioptas, Mira Shalah, Raphaël Groscot, Leonidas J. Guibas", "author": "Anastasia Dubrovina; Fei Xia; Panos Achlioptas; Mira Shalah; Raphael Groscot; Leonidas J. Guibas", "abstract": "We present a novel neural network architecture, termed Decomposer-Composer, for semantic structure-aware 3D shape modeling. Our method utilizes an auto-encoder-based pipeline, and produces a novel factorized shape embedding space, where the semantic structure of the shape collection translates into a data-dependent sub-space factorization, and where shape composition and decomposition become simple linear operations on the embedding coordinates. We further propose to model shape assembly using an explicit learned part deformation module, which utilizes a 3D spatial transformer network to perform an in-network volumetric grid deformation, and which allows us to train the whole system end-to-end. The resulting network allows us to perform part-level shape manipulation, unattainable by existing approaches. 
Our extensive ablation study, comparison to baseline methods and qualitative analysis demonstrate the improved performance of the proposed method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Dubrovina_Composite_Shape_Modeling_via_Latent_Space_Factorization_ICCV_2019_paper.pdf", @@ -5538,7 +5711,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "United States;France" + "aff_country_unique": "United States;France", + "bibtex": "@InProceedings{Dubrovina_2019_ICCV,\n \n author = {\n Dubrovina,\n Anastasia and Xia,\n Fei and Achlioptas,\n Panos and Shalah,\n Mira and Groscot,\n Raphael and Guibas,\n Leonidas J.\n},\n title = {\n Composite Shape Modeling via Latent Space Factorization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Compositional Video Prediction", @@ -5564,14 +5738,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ye_Compositional_Video_Prediction_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+2;2", - "aff_unique_norm": "Carnegie Mellon University;Verisk Analytics;Meta", + "aff_unique_norm": "Carnegie Mellon University;Verisk Analytics;Facebook", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.cmu.edu;https://www.verisk.com;https://research.facebook.com", - "aff_unique_abbr": "CMU;Verisk;FAIR", + "aff_unique_abbr": "CMU;;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ye_2019_ICCV,\n \n author = {\n Ye,\n Yufei and Singh,\n Maneesh and Gupta,\n Abhinav and Tulsiani,\n Shubham\n},\n title = {\n Compositional Video Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Computational Hyperspectral Imaging Based on Dimension-Discriminative Low-Rank Tensor Recovery", @@ -5583,7 +5758,7 @@ "author": "Shipeng Zhang; Lizhi Wang; Ying Fu; Xiaoming Zhong; Hua Huang", "abstract": "Exploiting the prior information is fundamental for the image reconstruction in computational hyperspectral imaging. Existing methods usually unfold the 3D signal as a 1D vector and treat the prior information within different dimensions in an indiscriminative manner, which ignores the high-dimensionality nature of hyperspectral image (HSI) and thus results in poor quality reconstruction. In this paper, we propose to make full use of the high-dimensionality structure of the desired HSI to boost the reconstruction quality. We first build a high-order tensor by exploiting the nonlocal similarity in HSI. Then, we propose a dimension-discriminative low-rank tensor recovery (DLTR) model to characterize the structure prior adaptively in each dimension. By integrating the structure prior in DLTR with the system imaging process, we develop an optimization framework for HSI reconstruction, which is finally solved via the alternating minimization algorithm. 
Extensive experiments implemented with both synthetic and real data demonstrate that our method outperforms state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhang_Computational_Hyperspectral_Imaging_Based_on_Dimension-Discriminative_Low-Rank_Tensor_Recovery_ICCV_2019_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Beijing Institute of Technology; Beijing Institute of Technology; Beijing Institute of Space Mechanics and Electricity; Beijing Institute of Technology", + "aff": "Xi’an Jiaotong University; Beijing Institute of Technology; Beijing Institute of Technology; Beijing Institute of Space Mechanics and Electricity; Beijing Institute of Technology", "project": "", "github": "", "supp": "", @@ -5597,14 +5772,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Computational_Hyperspectral_Imaging_Based_on_Dimension-Discriminative_Low-Rank_Tensor_Recovery_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Beijing Institute of Technology;Beijing Institute of Space Mechanics and Electricity", + "aff_unique_norm": "Xi'an Jiaotong University;Beijing Institute of Technology;Beijing Institute of Space Mechanics and Electricity", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xjtu.edu.cn;http://www.bit.edu.cn/;http://www.bisee.ac.cn/", "aff_unique_abbr": "XJTU;BIT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Shipeng and Wang,\n Lizhi and Fu,\n Ying and Zhong,\n Xiaoming and Huang,\n Hua\n},\n title = {\n Computational Hyperspectral Imaging Based on Dimension-Discriminative Low-Rank Tensor Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2019\n} \n}" }, { "title": "Conditional Coupled Generative Adversarial Networks for Zero-Shot Domain Adaptation", @@ -5637,7 +5813,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Jinghua and Jiang,\n Jianmin\n},\n title = {\n Conditional Coupled Generative Adversarial Networks for Zero-Shot Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "a231f41943", @@ -5666,7 +5843,8 @@ "aff_campus_unique_index": "1;1;2+1;1", "aff_campus_unique": ";Madison;Arlington", "aff_country_unique_index": "0+0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hwang_2019_ICCV,\n \n author = {\n Hwang,\n Seong Jae and Tao,\n Zirui and Kim,\n Won Hwa and Singh,\n Vikas\n},\n title = {\n Conditional Recurrent Flow: Conditional Generation of Longitudinal Samples With Applications to Neuroimaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Confidence Regularized Self-Training", @@ -5692,14 +5870,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zou_Confidence_Regularized_Self-Training_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;2", - "aff_unique_norm": "Carnegie Mellon University;NVIDIA;General Motors", - "aff_unique_dep": ";NVIDIA Corporation;R&D", + "aff_unique_norm": "Carnegie Mellon University;NVIDIA Corporation;General Motors", + "aff_unique_dep": ";;R&D", "aff_unique_url": "https://www.cmu.edu;https://www.nvidia.com;https://www.gm.com", "aff_unique_abbr": "CMU;NVIDIA;GM", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zou_2019_ICCV,\n \n author = {\n Zou,\n Yang and Yu,\n Zhiding and Liu,\n Xiaofeng and Kumar,\n B.V.K. Vijaya and Wang,\n Jinsong\n},\n title = {\n Confidence Regularized Self-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Consensus Maximization Tree Search Revisited", @@ -5723,7 +5902,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cai_Consensus_Maximization_Tree_Search_Revisited_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cai_Consensus_Maximization_Tree_Search_Revisited_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Cai_2019_ICCV,\n \n author = {\n Cai,\n Zhipeng and Chin,\n Tat-Jun and Koltun,\n Vladlen\n},\n title = {\n Consensus Maximization Tree Search Revisited\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Conservative Wasserstein Training for Pose Estimation", @@ -5749,14 +5929,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Conservative_Wasserstein_Training_for_Pose_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;2;3;3;4;0", - "aff_unique_norm": "Carnegie Mellon University;Harvard University;Mila;Chinese Academy of Sciences;Hong Kong Polytechnic University", + "aff_unique_norm": "Carnegie Mellon University;Harvard University;MILA;Chinese Academy of Sciences;The Hong Kong Polytechnic University", "aff_unique_dep": ";;;CIOMP;", "aff_unique_url": "https://www.cmu.edu;https://www.harvard.edu;https://mila.quebec;http://www.cas.cn;https://www.polyu.edu.hk", "aff_unique_abbr": 
"CMU;Harvard;MILA;CAS;PolyU", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;1;2;2;2;0", - "aff_country_unique": "United States;Canada;China" + "aff_country_unique": "United States;Canada;China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Xiaofeng and Zou,\n Yang and Che,\n Tong and Ding,\n Peng and Jia,\n Ping and You,\n Jane and Kumar,\n B.V.K. Vijaya\n},\n title = {\n Conservative Wasserstein Training for Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Constructing Self-Motivated Pyramid Curriculums for Cross-Domain Semantic Segmentation: A Non-Adversarial Approach", @@ -5783,13 +5964,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lian_Constructing_Self-Motivated_Pyramid_Curriculums_for_Cross-Domain_Semantic_Segmentation_A_Non-Adversarial_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Electronic Science and Technology of China;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.uestc.edu.cn;https://www.google.com", "aff_unique_abbr": "UESTC;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Lian_2019_ICCV,\n \n author = {\n Lian,\n Qing and Lv,\n Fengmao and Duan,\n Lixin and Gong,\n Boqing\n},\n title = {\n Constructing Self-Motivated Pyramid Curriculums for Cross-Domain Semantic Segmentation: A Non-Adversarial Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Content and Style Disentanglement for Artistic Style Transfer", @@ 
-5797,7 +5979,7 @@ "status": "Poster", "track": "main", "pid": "2684", - "author_site": "Dmytro Kotovenko, Artsiom Sanakoyeu, Sabine Lang, Bj\u00c3\u00b6rn Ommer", + "author_site": "Dmytro Kotovenko, Artsiom Sanakoyeu, Sabine Lang, Björn Ommer", "author": "Dmytro Kotovenko; Artsiom Sanakoyeu; Sabine Lang; Bjorn Ommer", "abstract": "Artists rarely paint in a single style throughout their career. More often they change styles or develop variations of it. In addition, artworks in different styles and even within one style depict real content differently: while Picasso's Blue Period displays a vase in a blueish tone but as a whole, his Cubist works deconstruct the object. To produce artistically convincing stylizations, style transfer models must be able to reflect these changes and variations. Recently many works have aimed to improve the style transfer task, but neglected to address the described observations. We present a novel approach which captures particularities of style and the variations within and separates style and content. This is achieved by introducing two novel losses: a fixpoint triplet style loss to learn subtle variations within one style or between different styles and a disentanglement loss to ensure that the stylization is not conditioned on the real input photo. In addition the paper proposes various evaluation methods to measure the importance of both losses on the validity, quality and variability of final stylizations. 
We provide qualitative results to demonstrate the performance of our approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Kotovenko_Content_and_Style_Disentanglement_for_Artistic_Style_Transfer_ICCV_2019_paper.pdf", @@ -5813,7 +5995,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kotovenko_Content_and_Style_Disentanglement_for_Artistic_Style_Transfer_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kotovenko_Content_and_Style_Disentanglement_for_Artistic_Style_Transfer_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Kotovenko_2019_ICCV,\n \n author = {\n Kotovenko,\n Dmytro and Sanakoyeu,\n Artsiom and Lang,\n Sabine and Ommer,\n Bjorn\n},\n title = {\n Content and Style Disentanglement for Artistic Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Context-Aware Emotion Recognition Networks", @@ -5825,7 +6008,7 @@ "author": "Jiyoung Lee; Seungryong Kim; Sunok Kim; Jungin Park; Kwanghoon Sohn", "abstract": "Traditional techniques for emotion recognition have focused on the facial expression analysis only, thus providing limited ability to encode context that comprehensively represents the emotional responses. We present deep networks for context-aware emotion recognition, called CAER-Net, that exploit not only human facial expression but also context information in a joint and boosting manner. The key idea is to hide human faces in a visual scene and seek other contexts based on an attention mechanism. Our networks consist of two sub-networks, including two-stream encoding networks to separately extract the features of face and context regions, and adaptive fusion networks to fuse such features in an adaptive fashion. 
We also introduce a novel benchmark for context-aware emotion recognition, called CAER, that is appropriate than existing benchmarks both qualitatively and quantitatively. On several benchmarks, CAER-Net proves the effect of context for emotion recognition. Our dataset is available at http://caer-dataset.github.io.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Lee_Context-Aware_Emotion_Recognition_Networks_ICCV_2019_paper.pdf", - "aff": "Yonsei University; \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL); Yonsei University; Yonsei University; Yonsei University", + "aff": "Yonsei University; École Polytechnique Fédérale de Lausanne (EPFL); Yonsei University; Yonsei University; Yonsei University", "project": "http://caer-dataset.github.io", "github": "", "supp": "", @@ -5839,14 +6022,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lee_Context-Aware_Emotion_Recognition_Networks_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;0", - "aff_unique_norm": "Yonsei University;EPFL", + "aff_unique_norm": "Yonsei University;Ecole Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.epfl.ch", "aff_unique_abbr": "Yonsei;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "South Korea;Switzerland" + "aff_country_unique": "South Korea;Switzerland", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Jiyoung and Kim,\n Seungryong and Kim,\n Sunok and Park,\n Jungin and Sohn,\n Kwanghoon\n},\n title = {\n Context-Aware Emotion Recognition Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Context-Aware Feature and Label Fusion for Facial Action Unit Intensity Estimation With Partially Labeled Data", @@ -5879,7 +6063,8
@@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;2", - "aff_country_unique": "China;Singapore;United States" + "aff_country_unique": "China;Singapore;United States", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Yong and Jiang,\n Haiyong and Wu,\n Baoyuan and Fan,\n Yanbo and Ji,\n Qiang\n},\n title = {\n Context-Aware Feature and Label Fusion for Facial Action Unit Intensity Estimation With Partially Labeled Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Context-Aware Image Matting for Simultaneous Foreground and Alpha Estimation", @@ -5912,7 +6097,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hou_2019_ICCV,\n \n author = {\n Hou,\n Qiqi and Liu,\n Feng\n},\n title = {\n Context-Aware Image Matting for Simultaneous Foreground and Alpha Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Contextual Attention for Hand Detection in the Wild", @@ -5940,12 +6126,13 @@ "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Stony Brook University;California Institute of Technology;VinAI Research", "aff_unique_dep": ";;", - "aff_unique_url": "https://www.stonybrook.edu;https://www.caltech.edu;https://www.vinai.io/", + "aff_unique_url": "https://www.stonybrook.edu;https://www.caltech.edu;https://www.vinai.io", "aff_unique_abbr": "SBU;Caltech;VinAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "United States;Vietnam" + "aff_country_unique": "United States;Vietnam", + "bibtex": 
"@InProceedings{Narasimhaswamy_2019_ICCV,\n \n author = {\n Narasimhaswamy,\n Supreeth and Wei,\n Zhengwei and Wang,\n Yang and Zhang,\n Justin and Hoai,\n Minh\n},\n title = {\n Contextual Attention for Hand Detection in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Continual Learning by Asymmetric Loss Approximation With Single-Side Overestimation", @@ -5971,14 +6158,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Park_Continual_Learning_by_Asymmetric_Loss_Approximation_With_Single-Side_Overestimation_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0;0", - "aff_unique_norm": "Seoul National University;Samsung", - "aff_unique_dep": "Department of Electrical and Computer Engineering & Advanced Software Research Institute;Samsung Electronics", + "aff_unique_norm": "Seoul National University;Samsung Electronics", + "aff_unique_dep": "Department of Electrical and Computer Engineering & Advanced Software Research Institute;", "aff_unique_url": "https://www.snu.ac.kr;https://www.samsung.com", "aff_unique_abbr": "SNU;Samsung", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2019_ICCV,\n \n author = {\n Park,\n Dongmin and Hong,\n Seokil and Han,\n Bohyung and Lee,\n Kyoung Mu\n},\n title = {\n Continual Learning by Asymmetric Loss Approximation With Single-Side Overestimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Controllable Artistic Text Style Transfer via Shape-Matching GAN", @@ -6011,7 +6199,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;0;0", - 
"aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Shuai and Wang,\n Zhangyang and Wang,\n Zhaowen and Xu,\n Ning and Liu,\n Jiaying and Guo,\n Zongming\n},\n title = {\n Controllable Artistic Text Style Transfer via Shape-Matching GAN\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Controllable Attention for Structured Layered Video Decomposition", @@ -6019,7 +6208,7 @@ "status": "Poster", "track": "main", "pid": "4034", - "author_site": "Jean-Baptiste Alayrac, Jo\u00c3\u00a3o Carreira, Relja Arandjelovi\u00c4\u0087, Andrew Zisserman", + "author_site": "Jean-Baptiste Alayrac, João Carreira, Relja Arandjelović, Andrew Zisserman", "author": "Jean-Baptiste Alayrac; Joao Carreira; Relja Arandjelovic; Andrew Zisserman", "abstract": "The objective of this paper is to be able to separate a video into its natural layers, and to control which of the separated layers to attend to. For example, to be able to separate reflections, transparency or object motion. We make the following three contributions: (i) we introduce a new structured neural network architecture that explicitly incorporates layers (as spatial masks) into its design. 
This improves separation performance over previous general purpose networks for this task; (ii) we demonstrate that we can augment the architecture to leverage external cues such as audio for controllability and to help disambiguation; and (iii) we experimentally demonstrate the effectiveness of our approach and training procedure with controlled experiments while also showing that the proposed model can be successfully applied to real-word applications such as reflection removal and action recognition in cluttered scenes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Alayrac_Controllable_Attention_for_Structured_Layered_Video_Decomposition_ICCV_2019_paper.pdf", @@ -6044,7 +6233,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Alayrac_2019_ICCV,\n \n author = {\n Alayrac,\n Jean-Baptiste and Carreira,\n Joao and Arandjelovic,\n Relja and Zisserman,\n Andrew\n},\n title = {\n Controllable Attention for Structured Layered Video Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Controllable Video Captioning With POS Sequence Guidance Based on Gated Fusion Network", @@ -6077,7 +6267,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Bairui and Ma,\n Lin and Zhang,\n Wei and Jiang,\n Wenhao and Wang,\n Jingwen and Liu,\n Wei\n},\n title = {\n Controllable Video Captioning With POS Sequence Guidance Based on Gated Fusion Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2019\n} \n}" }, { "title": "Controlling Neural Networks via Energy Dissipation", @@ -6085,7 +6276,7 @@ "status": "Poster", "track": "main", "pid": "2246", - "author_site": "Michael Moeller, Thomas M\u00c3\u00b6llenhoff, Daniel Cremers", + "author_site": "Michael Moeller, Thomas Möllenhoff, Daniel Cremers", "author": "Michael Moeller; Thomas Mollenhoff; Daniel Cremers", "abstract": "The last decade has shown a tremendous success in solving various computer vision problems with the help of deep learning techniques. Lately, many works have demonstrated that learning-based approaches with suitable network architectures even exhibit superior performance for the solution of (ill-posed) image reconstruction problems such as deblurring, super-resolution, or medical image reconstruction. The drawback of purely learning-based methods, however, is that they cannot provide provable guarantees for the trained network to follow a given data formation process during inference. In this work we propose energy dissipating networks that iteratively compute a descent direction with respect to a given cost function or energy at the currently estimated reconstruction. Therefore, an adaptive step size rule such as a line-search, along with a suitable number of iterations can guarantee the reconstruction to follow a given data formation model encoded in the energy to arbitrary precision, and hence control the model's behavior even during test time. We prove that under standard assumptions, descent using the direction predicted by the network converges (linearly) to the global minimum of the energy. 
We illustrate the effectiveness of the proposed approach in experiments on single image super resolution and computed tomography (CT) reconstruction, and further illustrate extensions to convex feasibility problems.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Moeller_Controlling_Neural_Networks_via_Energy_Dissipation_ICCV_2019_paper.pdf", @@ -6110,7 +6301,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Moeller_2019_ICCV,\n \n author = {\n Moeller,\n Michael and Mollenhoff,\n Thomas and Cremers,\n Daniel\n},\n title = {\n Controlling Neural Networks via Energy Dissipation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Convex Relaxations for Consensus and Non-Minimal Problems in 3D Vision", @@ -6134,7 +6326,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Probst_Convex_Relaxations_for_Consensus_and_Non-Minimal_Problems_in_3D_Vision_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Probst_Convex_Relaxations_for_Consensus_and_Non-Minimal_Problems_in_3D_Vision_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Probst_2019_ICCV,\n \n author = {\n Probst,\n Thomas and Paudel,\n Danda Pani and Chhatkuli,\n Ajad and Gool,\n Luc Van\n},\n title = {\n Convex Relaxations for Consensus and Non-Minimal Problems in 3D Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Convex Shape Prior for Multi-Object Segmentation Using a Single Level Set Function", @@ -6167,7 +6360,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", 
"aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Luo_2019_ICCV,\n \n author = {\n Luo,\n Shousheng and Tai,\n Xue-Cheng and Huo,\n Limei and Wang,\n Yang and Glowinski,\n Roland\n},\n title = {\n Convex Shape Prior for Multi-Object Segmentation Using a Single Level Set Function\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Convolutional Approximations to the General Non-Line-of-Sight Imaging Operator", @@ -6200,7 +6394,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ahn_2019_ICCV,\n \n author = {\n Ahn,\n Byeongjoo and Dave,\n Akshat and Veeraraghavan,\n Ashok and Gkioulekas,\n Ioannis and Sankaranarayanan,\n Aswin C.\n},\n title = {\n Convolutional Approximations to the General Non-Line-of-Sight Imaging Operator\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Convolutional Character Networks", @@ -6233,7 +6428,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;1;0+0;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Xing_2019_ICCV,\n \n author = {\n Xing,\n Linjie and Tian,\n Zhi and Huang,\n Weilin and Scott,\n Matthew R.\n},\n title = {\n Convolutional Character Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Convolutional Sequence Generation for Skeleton-Based Action Synthesis", @@ -6259,14 
+6455,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yan_Convolutional_Sequence_Generation_for_Skeleton-Based_Action_Synthesis_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Information Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2019_ICCV,\n \n author = {\n Yan,\n Sijie and Li,\n Zhizhong and Xiong,\n Yuanjun and Yan,\n Huahan and Lin,\n Dahua\n},\n title = {\n Convolutional Sequence Generation for Skeleton-Based Action Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Copy-and-Paste Networks for Deep Video Inpainting", @@ -6290,7 +6487,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lee_Copy-and-Paste_Networks_for_Deep_Video_Inpainting_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lee_Copy-and-Paste_Networks_for_Deep_Video_Inpainting_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Sungho and Oh,\n Seoung Wug and Won,\n DaeYeun and Kim,\n Seon Joo\n},\n title = {\n Copy-and-Paste Networks for Deep Video Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Correlation Congruence for Knowledge Distillation", @@ -6316,14 +6514,15 @@ "author_num": 8, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Peng_Correlation_Congruence_for_Knowledge_Distillation_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;3;1;3;1;0", - "aff_unique_norm": "National University of Defense Technology;SenseTime Group;Beihang University;Chinese University of Hong Kong", + "aff_unique_norm": "National University of Defense Technology;Sensetime Group;Beihang University;The Chinese University of Hong Kong", "aff_unique_dep": ";;;", - "aff_unique_url": "http://www.nudt.edu.cn/;https://www.sensetime.com;http://www.buaa.edu.cn/;https://www.cuhk.edu.hk", + "aff_unique_url": "http://www.nudt.edu.cn/;https://www.sensetime.com/;http://www.buaa.edu.cn/;https://www.cuhk.edu.hk", "aff_unique_abbr": "NUDT;SenseTime;BUAA;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2019_ICCV,\n \n author = {\n Peng,\n Baoyun and Jin,\n Xiao and Liu,\n Jiaheng and Li,\n Dongsheng and Wu,\n Yichao and Liu,\n Yu and Zhou,\n Shunfeng and Zhang,\n Zhaoning\n},\n title = {\n Correlation Congruence for Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cost-Aware Fine-Grained Recognition for IoTs Based on Sequential Fixations", @@ -6356,7 +6555,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Hanxiao and Saligrama,\n Venkatesh and Sclaroff,\n Stan and Ablavsky,\n Vitaly\n},\n title = {\n Cost-Aware Fine-Grained Recognition for IoTs Based on Sequential Fixations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Counterfactual Critic Multi-Agent Training for Scene Graph Generation", @@ -6380,7 +6580,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Counterfactual_Critic_Multi-Agent_Training_for_Scene_Graph_Generation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Counterfactual_Critic_Multi-Agent_Training_for_Scene_Graph_Generation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Long and Zhang,\n Hanwang and Xiao,\n Jun and He,\n Xiangnan and Pu,\n Shiliang and Chang,\n Shih-Fu\n},\n title = {\n Counterfactual Critic Multi-Agent Training for Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Counting With Focus for Free", @@ -6404,7 +6605,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shi_Counting_With_Focus_for_Free_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shi_Counting_With_Focus_for_Free_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Shi_2019_ICCV,\n \n author = {\n Shi,\n Zenglin and Mettes,\n Pascal and Snoek,\n Cees G. 
M.\n},\n title = {\n Counting With Focus for Free\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Creativity Inspired Zero-Shot Learning", @@ -6430,14 +6632,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Elhoseiny_Creativity_Inspired_Zero-Shot_Learning_ICCV_2019_paper.html", "aff_unique_index": "0+1;2", - "aff_unique_norm": "Meta;King Abdullah University of Science and Technology;University of Central Florida", + "aff_unique_norm": "Facebook;King Abdullah University of Science and Technology;University of Central Florida", "aff_unique_dep": "Facebook AI Research;;", "aff_unique_url": "https://research.facebook.com;https://www.kaust.edu.sa;https://www.ucf.edu", "aff_unique_abbr": "FAIR;KAUST;UCF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0", - "aff_country_unique": "United States;Saudi Arabia" + "aff_country_unique": "United States;Saudi Arabia", + "bibtex": "@InProceedings{Elhoseiny_2019_ICCV,\n \n author = {\n Elhoseiny,\n Mohamed and Elfeki,\n Mohamed\n},\n title = {\n Creativity Inspired Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cross View Fusion for 3D Human Pose Estimation", @@ -6463,14 +6666,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qiu_Cross_View_Fusion_for_3D_Human_Pose_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft;TuSimple", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research;TuSimple", "aff_unique_dep": ";Research;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.tusimple.com", 
"aff_unique_abbr": "USTC;MSR Asia;TuSimple", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Qiu_2019_ICCV,\n \n author = {\n Qiu,\n Haibo and Wang,\n Chunyu and Wang,\n Jingdong and Wang,\n Naiyan and Zeng,\n Wenjun\n},\n title = {\n Cross View Fusion for 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cross-Dataset Person Re-Identification via Unsupervised Pose Disentanglement and Adaptation", @@ -6498,12 +6702,13 @@ "aff_unique_index": "0+1+2;0+1;0;0+1+2", "aff_unique_norm": "National Taiwan University;MOST Joint Research Center for AI Technology;ASUS Intelligent Cloud Services", "aff_unique_dep": ";AI Technology;", - "aff_unique_url": "https://www.ntu.edu.tw;;https://www.asus.com/", + "aff_unique_url": "https://www.ntu.edu.tw;;https://www.asus.com", "aff_unique_abbr": "NTU;;ASUS", "aff_campus_unique_index": "0+0+0;0+0;0;0+0+0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0+0+0;0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Yu-Jhe and Lin,\n Ci-Siang and Lin,\n Yan-Bo and Wang,\n Yu-Chiang Frank\n},\n title = {\n Cross-Dataset Person Re-Identification via Unsupervised Pose Disentanglement and Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cross-Domain Adaptation for Animal Pose Estimation", @@ -6529,14 +6734,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cao_Cross-Domain_Adaptation_for_Animal_Pose_Estimation_ICCV_2019_paper.html", "aff_unique_index": 
"0+1;0+1;0;1;0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;Tencent", - "aff_unique_dep": ";Tencent Holdings Limited", + "aff_unique_norm": "Shanghai Jiao Tong University;Tencent Holdings Limited", + "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.tencent.com", "aff_unique_abbr": "SJTU;Tencent", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2019_ICCV,\n \n author = {\n Cao,\n Jinkun and Tang,\n Hongyang and Fang,\n Hao-Shu and Shen,\n Xiaoyong and Lu,\n Cewu and Tai,\n Yu-Wing\n},\n title = {\n Cross-Domain Adaptation for Animal Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cross-View Policy Learning for Street Navigation", @@ -6569,7 +6775,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Mountain View;London", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Ang and Hu,\n Huiyi and Mirowski,\n Piotr and Farajtabar,\n Mehrdad\n},\n title = {\n Cross-View Policy Learning for Street Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Cross-X Learning for Fine-Grained Visual Categorization", @@ -6595,14 +6802,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Luo_Cross-X_Learning_for_Fine-Grained_Visual_Categorization_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;0;1+2;1;3;4;2", - "aff_unique_norm": "South China Agricultural University;University of Maryland;Meta;Massachusetts Institute of 
Technology;Nanjing University of Science and Technology", + "aff_unique_norm": "South China Agricultural University;University of Maryland;Facebook;Massachusetts Institute of Technology;Nanjing University of Science and Technology", "aff_unique_dep": ";;Facebook AI;;", "aff_unique_url": "http://www.scau.edu.cn;https://www/umd.edu;https://www.facebook.com;https://web.mit.edu;http://www.nust.edu.cn/", "aff_unique_abbr": "SCAU;UMD;Facebook AI;MIT;NUST", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0+1;1;0;1+1;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Luo_2019_ICCV,\n \n author = {\n Luo,\n Wei and Yang,\n Xitong and Mo,\n Xianjie and Lu,\n Yuheng and Davis,\n Larry S. and Li,\n Jun and Yang,\n Jian and Lim,\n Ser-Nam\n},\n title = {\n Cross-X Learning for Fine-Grained Visual Categorization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Crowd Counting With Deep Structured Scale Integration Network", @@ -6628,14 +6836,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Crowd_Counting_With_Deep_Structured_Scale_Integration_Network_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;1;0+2", - "aff_unique_norm": "Sun Yat-sen University;University of Sydney;DarkMatter AI Research", + "aff_unique_norm": "Sun Yat-sen University;The University of Sydney;DarkMatter AI Research", "aff_unique_dep": ";;AI Research", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.sydney.edu.au;", "aff_unique_abbr": "SYSU;USYD;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0+2", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n 
author = {\n Liu,\n Lingbo and Qiu,\n Zhilin and Li,\n Guanbin and Liu,\n Shufan and Ouyang,\n Wanli and Lin,\n Liang\n},\n title = {\n Crowd Counting With Deep Structured Scale Integration Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Customizing Student Networks From Heterogeneous Teachers via Adaptive Knowledge Amalgamation", @@ -6668,7 +6877,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+0;0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Shen_2019_ICCV,\n \n author = {\n Shen,\n Chengchao and Xue,\n Mengqi and Wang,\n Xinchao and Song,\n Jie and Sun,\n Li and Song,\n Mingli\n},\n title = {\n Customizing Student Networks From Heterogeneous Teachers via Adaptive Knowledge Amalgamation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "CutMix: Regularization Strategy to Train Strong Classifiers With Localizable Features", @@ -6692,7 +6902,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yun_CutMix_Regularization_Strategy_to_Train_Strong_Classifiers_With_Localizable_Features_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yun_CutMix_Regularization_Strategy_to_Train_Strong_Classifiers_With_Localizable_Features_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yun_2019_ICCV,\n \n author = {\n Yun,\n Sangdoo and Han,\n Dongyoon and Oh,\n Seong Joon and Chun,\n Sanghyuk and Choe,\n Junsuk and Yoo,\n Youngjoon\n},\n title = {\n CutMix: Regularization Strategy to Train Strong Classifiers With Localizable Features\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DADA: Depth-Aware Domain Adaptation in Semantic Segmentation", @@ -6700,7 +6911,7 @@ "status": "Poster", "track": "main", "pid": "159", - "author_site": "Tuan-Hung Vu, Himalaya Jain, Maxime Bucher, Matthieu Cord, Patrick P\u00c3\u00a9rez", + "author_site": "Tuan-Hung Vu, Himalaya Jain, Maxime Bucher, Matthieu Cord, Patrick Pérez", "author": "Tuan-Hung Vu; Himalaya Jain; Maxime Bucher; Matthieu Cord; Patrick Perez", "abstract": "Unsupervised domain adaptation (UDA) is important for applications where large scale annotation of representative data is challenging. For semantic segmentation in particular, it helps deploy on real \"target domain\" data models that are trained on annotated images from a different \"source domain\", notably a virtual environment. To this end, most previous works consider semantic segmentation as the only mode of supervision for source domain data, while ignoring other, possibly available, information like depth. In this work, we aim at exploiting at best such a privileged information while training the UDA model. We propose a unified depth-aware UDA framework that leverages in several complementary ways the knowledge of dense depth in the source domain. As a result, the performance of the trained semantic segmentation model on the target domain is boosted. 
Our novel approach indeed achieves state-of-the-art performance on different challenging synthetic-2-real benchmarks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Vu_DADA_Depth-Aware_Domain_Adaptation_in_Semantic_Segmentation_ICCV_2019_paper.pdf", @@ -6718,14 +6929,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Vu_DADA_Depth-Aware_Domain_Adaptation_in_Semantic_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0+1;0", - "aff_unique_norm": "Valeo.ai;Sorbonne University", + "aff_unique_norm": "valeo.ai;Sorbonne University", "aff_unique_dep": ";", "aff_unique_url": "https://www.valeo.ai;https://www.sorbonne.universite.fr", "aff_unique_abbr": ";Sorbonne", "aff_campus_unique_index": "0;0;0;0+0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Vu_2019_ICCV,\n \n author = {\n Vu,\n Tuan-Hung and Jain,\n Himalaya and Bucher,\n Maxime and Cord,\n Matthieu and Perez,\n Patrick\n},\n title = {\n DADA: Depth-Aware Domain Adaptation in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DAGMapper: Learning to Map by Discovering Lane Topology", @@ -6758,7 +6970,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+0;0;0;0;0+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Homayounfar_2019_ICCV,\n \n author = {\n Homayounfar,\n Namdar and Ma,\n Wei-Chiu and Liang,\n Justin and Wu,\n Xinyu and Fan,\n Jack and Urtasun,\n Raquel\n},\n title = {\n DAGMapper: Learning to Map by Discovering Lane Topology\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2019\n} \n}" }, { "title": "DANet: Divergent Activation for Weakly Supervised Object Localization", @@ -6784,14 +6997,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xue_DANet_Divergent_Activation_for_Weakly_Supervised_Object_Localization_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0;2;0+1", - "aff_unique_norm": "University of Chinese Academy of Sciences;Pengcheng Laboratory;Tsinghua University", - "aff_unique_dep": ";Peng Cheng Laboratory;", + "aff_unique_norm": "University of Chinese Academy of Sciences;Peng Cheng Laboratory;Tsinghua University", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.pcl.ac.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UCAS;PCL;THU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xue_2019_ICCV,\n \n author = {\n Xue,\n Haolan and Liu,\n Chang and Wan,\n Fang and Jiao,\n Jianbin and Ji,\n Xiangyang and Ye,\n Qixiang\n},\n title = {\n DANet: Divergent Activation for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DDSL: Deep Differentiable Simplex Layer for Learning Geometric Signals", @@ -6799,7 +7013,7 @@ "status": "Poster", "track": "main", "pid": "3695", - "author_site": "Chiyu \"Max\" Jiang, Dana Lansigan, Philip Marcus, Matthias Nie\u00c3\u009fner", + "author_site": "Chiyu \"Max\" Jiang, Dana Lansigan, Philip Marcus, Matthias Nießner", "author": "Chiyu \"Max\" Jiang; Dana Lansigan; Philip Marcus; Matthias Niessner", "abstract": "We present a Deep Differentiable Simplex Layer (DDSL) for neural networks for geometric deep learning. 
The DDSL is a differentiable layer compatible with deep neural networks for bridging simplex mesh-based geometry representations (point clouds, line mesh, triangular mesh, tetrahedral mesh) with raster images (e.g., 2D/3D grids). The DDSL uses Non-Uniform Fourier Transform (NUFT) to perform differentiable, efficient, anti- aliased rasterization of simplex-based signals. We present a complete theoretical framework for the process as well as an efficient backpropagation algorithm. Compared to previous differentiable renderers and rasterizers, the DDSL generalizes to arbitrary simplex degrees and dimensions. In particular, we explore its applications to 2D shapes and illustrate two applications of this method: (1) mesh editing and optimization guided by neural network outputs, and (2) using DDSL for a differentiable rasterization loss to facilitate end-to-end training of polygon generators. We are able to validate the effectiveness of gradient-based shape optimization with the example of airfoil optimization, and using the differentiable rasterization loss to facilitate end-to-end training, we surpass state of the art for polygonal image segmentation given ground-truth bounding boxes.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Jiang_DDSL_Deep_Differentiable_Simplex_Layer_for_Learning_Geometric_Signals_ICCV_2019_paper.pdf", @@ -6815,7 +7029,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Jiang_DDSL_Deep_Differentiable_Simplex_Layer_for_Learning_Geometric_Signals_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Jiang_DDSL_Deep_Differentiable_Simplex_Layer_for_Learning_Geometric_Signals_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Chiyu \"Max\" and Lansigan,\n Dana and Marcus,\n Philip and Niessner,\n Matthias\n},\n title = {\n DDSL: Deep Differentiable Simplex Layer for Learning Geometric 
Signals\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DF2Net: A Dense-Fine-Finer Network for Detailed 3D Face Reconstruction", @@ -6841,14 +7056,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zeng_DF2Net_A_Dense-Fine-Finer_Network_for_Detailed_3D_Face_Reconstruction_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0", - "aff_unique_norm": "Shenzhen Institute of Advanced Technology;University of Chinese Academy of Sciences", + "aff_unique_norm": "Shenzhen Institutes of Advanced Technology;University of Chinese Academy of Sciences", "aff_unique_dep": "Key Lab of Computer Vision and Pattern Recognition;", "aff_unique_url": "http://www.siat.ac.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "SIAT;UCAS", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zeng_2019_ICCV,\n \n author = {\n Zeng,\n Xiaoxing and Peng,\n Xiaojiang and Qiao,\n Yu\n},\n title = {\n DF2Net: A Dense-Fine-Finer Network for Detailed 3D Face Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DMM-Net: Differentiable Mask-Matching Network for Video Object Segmentation", @@ -6874,14 +7090,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zeng_DMM-Net_Differentiable_Mask-Matching_Network_for_Video_Object_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;0+1+2;0;0+1+2;0+1+3;0+1+2+4", - "aff_unique_norm": "University of Toronto;Vector Institute;Uber Advanced Technologies Group;NVIDIA;Canadian Institute for Advanced Research", - "aff_unique_dep": ";;Uber ATG;NVIDIA Corporation;", + "aff_unique_norm": "University of 
Toronto;Vector Institute;Uber Advanced Technologies Group;NVIDIA Corporation;Canadian Institute for Advanced Research", + "aff_unique_dep": ";;Uber ATG;;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.uber.com;https://www.nvidia.com;https://www.cifar.ca", "aff_unique_abbr": "U of T;Vector Institute;Uber ATG;NVIDIA;CIFAR", "aff_campus_unique_index": "1;1;1;;1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0+0+0;0+0+0;0;0+0+0;0+0+1;0+0+0+0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Zeng_2019_ICCV,\n \n author = {\n Zeng,\n Xiaohui and Liao,\n Renjie and Gu,\n Li and Xiong,\n Yuwen and Fidler,\n Sanja and Urtasun,\n Raquel\n},\n title = {\n DMM-Net: Differentiable Mask-Matching Network for Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DPOD: 6D Pose Object Detector and Refiner", @@ -6914,7 +7131,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Zakharov_2019_ICCV,\n \n author = {\n Zakharov,\n Sergey and Shugurov,\n Ivan and Ilic,\n Slobodan\n},\n title = {\n DPOD: 6D Pose Object Detector and Refiner\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DSConv: Efficient Convolution Operator", @@ -6940,14 +7158,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/do_Nascimento_DSConv_Efficient_Convolution_Operator_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;0+1", - "aff_unique_norm": "University of Oxford;Active Vision Lab;Intel", - "aff_unique_dep": ";;Intel Corporation", + 
"aff_unique_norm": "University of Oxford;Active Vision Lab;Intel Corporation", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;;https://www.intel.com", "aff_unique_abbr": "Oxford;;Intel", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0", - "aff_country_unique": "United Kingdom;;United States" + "aff_country_unique": "United Kingdom;;United States", + "bibtex": "@InProceedings{Nascimento_2019_ICCV,\n \n author = {\n Nascimento,\n Marcelo Gennari do and Fawcett,\n Roger and Prisacariu,\n Victor Adrian\n},\n title = {\n DSConv: Efficient Convolution Operator\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DSIC: Deep Stereo Image Compression", @@ -6959,7 +7178,7 @@ "author": "Jerry Liu; Shenlong Wang; Raquel Urtasun", "abstract": "In this paper we tackle the problem of stereo image compression, and leverage the fact that the two images have overlapping fields of view to further compress the representations. Our approach leverages state-of-the-art single-image compression autoencoders and enhances the compression with novel parametric skip functions to feed fully differentiable, disparity-warped features at all levels to the encoder/decoder of the second image. Moreover, we model the probabilistic dependence between the image codes using a conditional entropy model. 
Our experiments show an impressive 30 - 50% reduction in the second image bitrate at low bitrates compared to deep single-image compression, and a 10 - 20% reduction at higher bitrates.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Liu_DSIC_Deep_Stereo_Image_Compression_ICCV_2019_paper.pdf", - "aff": "Uber ATG\u2021; Uber ATG; Uber ATG", + "aff": "Uber ATG‡; Uber ATG; Uber ATG", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Liu_DSIC_Deep_Stereo_ICCV_2019_supplemental.pdf", @@ -6980,7 +7199,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Jerry and Wang,\n Shenlong and Urtasun,\n Raquel\n},\n title = {\n DSIC: Deep Stereo Image Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "aea679c682", @@ -6989,7 +7209,7 @@ "author": "Haoliang Sun; Ronak Mehta; Hao H. Zhou; Zhichun Huang; Sterling C. Johnson; Vivek Prabhakaran; Vikas Singh", "abstract": "Positron emission tomography (PET) imaging is an imaging modality for diagnosing a number of neurological diseases. In contrast to Magnetic Resonance Imaging (MRI), PET is costly and involves injecting a radioactive substance into the patient. Motivated by developments in modality transfer in vision, we study the generation of certain types of PET images from MRI data. We derive new flow-based generative models which we show perform well in this small sample size regime (much smaller than dataset sizes available in standard vision tasks). Our formulation, DUAL-GLOW, is based on two invertible networks and a relation network that maps the latent spaces to each other. 
We discuss how given the prior distribution, learning the conditional distribution of PET given the MRI image reduces to obtaining the conditional distribution between the two latent codes w.r.t. the two image types. We also extend our framework to leverage \"side\" information (or attributes) when available. By controlling the PET generation through \"conditioning\" on age, our model is also able to capture brain FDG-PET (hypometabolism) changes, as a function of age. We present experiments on the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset with 826 subjects, and obtain good performance in PET image synthesis, qualitatively and quantitatively better than recent works.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Sun_DUAL-GLOW_Conditional_Flow-Based_Generative_Model_for_Modality_Transfer_ICCV_2019_paper.pdf", - "aff": "University of Wisconsin-Madison+Shandong University+Inception Institute of Arti\ufb01cial Intelligence; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison", + "aff": "University of Wisconsin-Madison+Shandong University+Inception Institute of Artificial Intelligence; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison; University of Wisconsin-Madison", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Sun_DUAL-GLOW_Conditional_Flow-Based_ICCV_2019_supplemental.pdf", @@ -7009,7 +7229,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0+1+1;0;0;0;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Haoliang and Mehta,\n Ronak and Zhou,\n Hao H. 
and Huang,\n Zhichun and Johnson,\n Sterling C. and Prabhakaran,\n Vivek and Singh,\n Vikas\n},\n title = {\n DUAL-GLOW: Conditional Flow-Based Generative Model for Modality Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DUP-Net: Denoiser and Upsampler Network for 3D Adversarial Point Clouds Defense", @@ -7042,7 +7263,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Hang and Chen,\n Kejiang and Zhang,\n Weiming and Fang,\n Han and Zhou,\n Wenbo and Yu,\n Nenghai\n},\n title = {\n DUP-Net: Denoiser and Upsampler Network for 3D Adversarial Point Clouds Defense\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Data-Free Learning of Student Networks", @@ -7066,7 +7288,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Data-Free_Learning_of_Student_Networks_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Data-Free_Learning_of_Student_Networks_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Hanting and Wang,\n Yunhe and Xu,\n Chang and Yang,\n Zhaohui and Liu,\n Chuanjian and Shi,\n Boxin and Xu,\n Chunjing and Xu,\n Chao and Tian,\n Qi\n},\n title = {\n Data-Free Learning of Student Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Data-Free Quantization Through Weight Equalization and Bias Correction", @@ -7090,7 +7313,8 @@ "aff_domain": 
";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Nagel_Data-Free_Quantization_Through_Weight_Equalization_and_Bias_Correction_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Nagel_Data-Free_Quantization_Through_Weight_Equalization_and_Bias_Correction_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Nagel_2019_ICCV,\n \n author = {\n Nagel,\n Markus and Baalen,\n Mart van and Blankevoort,\n Tijmen and Welling,\n Max\n},\n title = {\n Data-Free Quantization Through Weight Equalization and Bias Correction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DeCaFA: Deep Convolutional Cascade for Face Alignment in the Wild", @@ -7102,7 +7326,7 @@ "author": "Arnaud Dapogny; Kevin Bailly; Matthieu Cord", "abstract": "Face Alignment is an active computer vision domain, that consists in localizing a number of facial landmarks that vary across datasets. State-of-the-art face alignment methods either consist in end-to-end regression, or in refining the shape in a cascaded manner, starting from an initial guess. In this paper, we introduce an end-to-end deep convolutional cascade (DeCaFA) architecture for face alignment. Face Alignment is an active computer vision domain, that consists in localizing a number of facial landmarks that vary across datasets. State-of-the-art face alignment methods either consist in end-to-end regression, or in refining the shape in a cascaded manner, starting from an initial guess. In this paper, we introduce DeCaFA, an end-to-end deep convolutional cascade architecture for face alignment. DeCaFA uses fully-convolutional stages to keep full spatial resolution throughout the cascade. 
Between each cascade stage, DeCaFA uses multiple chained transfer layers with spatial softmax to produce landmark-wise attention maps for each of several landmark alignment tasks. Weighted intermediate supervision, as well as efficient feature fusion between the stages allow to learn to progressively refine the attention maps in an end-to-end manner. We show experimentally that DeCaFA significantly outperforms existing approaches on 300W, CelebA and WFLW databases. In addition, we show that DeCaFA can learn fine alignment with reasonable accuracy from very few images using coarsely annotated data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Dapogny_DeCaFA_Deep_Convolutional_Cascade_for_Face_Alignment_in_the_Wild_ICCV_2019_paper.pdf", - "aff": "LIP6, Sorbonne Universit\u00e9, CNRS; Datakalab + ISIR, Sorbonne Universit\u00e9, CNRS; LIP6, Sorbonne Universit\u00e9, CNRS", + "aff": "LIP6, Sorbonne Université, CNRS; Datakalab + ISIR, Sorbonne Université, CNRS; LIP6, Sorbonne Université, CNRS", "project": "", "github": "", "supp": "", @@ -7116,14 +7340,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Dapogny_DeCaFA_Deep_Convolutional_Cascade_for_Face_Alignment_in_the_Wild_ICCV_2019_paper.html", "aff_unique_index": "0;1+0;0", - "aff_unique_norm": "Sorbonne Universit\u00e9;Datakalab", + "aff_unique_norm": "Sorbonne Université;Datakalab", "aff_unique_dep": "LIP6;", "aff_unique_url": "https://www.sorbonne-universite.fr;", "aff_unique_abbr": "Sorbonne U;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "France;China" + "aff_country_unique": "France;China", + "bibtex": "@InProceedings{Dapogny_2019_ICCV,\n \n author = {\n Dapogny,\n Arnaud and Bailly,\n Kevin and Cord,\n Matthieu\n},\n title = {\n DeCaFA: Deep Convolutional Cascade for Face Alignment in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DeblurGAN-v2: Deblurring (Orders-of-Magnitude) Faster and Better", @@ -7152,11 +7377,12 @@ "aff_unique_norm": "Ukrainian Catholic University;SoftServe;Texas A&M University", "aff_unique_dep": ";;Department of Computer Science and Engineering", "aff_unique_url": "https://ucu.edu.ua;https://www.softserveinc.com;https://www.tamu.edu", - "aff_unique_abbr": "UCU;;TAMU", + "aff_unique_abbr": ";;TAMU", "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Lviv;", "aff_country_unique_index": "0+0;0;1;1", - "aff_country_unique": "Ukraine;United States" + "aff_country_unique": "Ukraine;United States", + "bibtex": "@InProceedings{Kupyn_2019_ICCV,\n \n author = {\n Kupyn,\n Orest and Martyniuk,\n Tetiana and Wu,\n Junru and Wang,\n Zhangyang\n},\n title = {\n DeblurGAN-v2: Deblurring (Orders-of-Magnitude) Faster and Better\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DeceptionNet: Network-Driven Domain Randomization", @@ -7189,7 +7415,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0+0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Zakharov_2019_ICCV,\n \n author = {\n Zakharov,\n Sergey and Kehl,\n Wadim and Ilic,\n Slobodan\n},\n title = {\n DeceptionNet: Network-Driven Domain Randomization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Appearance Maps", @@ -7197,7 +7424,7 @@ "status": "Poster", "track": "main", "pid": "4021", - "author_site": "Maxim Maximov, Laura Leal-Taix\u00c3\u00a9, Mario Fritz, Tobias Ritschel", + "author_site": "Maxim Maximov, Laura Leal-Taixé, Mario Fritz, Tobias Ritschel", "author": "Maxim 
Maximov; Laura Leal-Taixe; Mario Fritz; Tobias Ritschel", "abstract": "We propose a deep representation of appearance, i.e. the relation of color, surface orientation, viewer position, material and illumination. Previous approaches have used deep learning to extract classic appearance representations relating to reflectance model parameters (e.g. Phong) or illumination (e.g. HDR environment maps). We suggest to directly represent appearance itself as a network we call a deep appearance map (DAM). This is a 4D generalization over 2D reflectance maps, which held the view direction fixed. First, we show how a DAM can be learned from images or video frames and later be used to synthesize appearance, given new surface orientations and viewer positions. Second, we demonstrate how another network can be used to map from an image or video frames to a DAM network to reproduce this appearance, without using a lengthy optimization such as stochastic gradient descent (learning-to-learn). Finally, we show the example of an appearance estimation-and-segmentation task, mapping from an image showing multiple materials to multiple deep appearance maps.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Maximov_Deep_Appearance_Maps_ICCV_2019_paper.pdf", @@ -7213,7 +7440,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Maximov_Deep_Appearance_Maps_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Maximov_Deep_Appearance_Maps_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Maximov_2019_ICCV,\n \n author = {\n Maximov,\n Maxim and Leal-Taixe,\n Laura and Fritz,\n Mario and Ritschel,\n Tobias\n},\n title = {\n Deep Appearance Maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Blind Hyperspectral Image Fusion", @@ -7246,7 +7474,8 @@ 
"aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Wu and Zeng,\n Weihong and Huang,\n Yue and Ding,\n Xinghao and Paisley,\n John\n},\n title = {\n Deep Blind Hyperspectral Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep CG2Real: Synthetic-to-Real Translation via Image Disentanglement", @@ -7279,7 +7508,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bi_2019_ICCV,\n \n author = {\n Bi,\n Sai and Sunkavalli,\n Kalyan and Perazzi,\n Federico and Shechtman,\n Eli and Kim,\n Vladimir G. 
and Ramamoorthi,\n Ravi\n},\n title = {\n Deep CG2Real: Synthetic-to-Real Translation via Image Disentanglement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Closest Point: Learning Representations for Point Cloud Registration", @@ -7312,7 +7542,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Yue and Solomon,\n Justin M.\n},\n title = {\n Deep Closest Point: Learning Representations for Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Clustering by Gaussian Mixture Variational Autoencoders With Graph Embedding", @@ -7345,7 +7576,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Linxiao and Cheung,\n Ngai-Man and Li,\n Jiaying and Fang,\n Jun\n},\n title = {\n Deep Clustering by Gaussian Mixture Variational Autoencoders With Graph Embedding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Comprehensive Correlation Mining for Image Clustering", @@ -7374,11 +7606,12 @@ "aff_unique_norm": "Shandong University;SenseTime;Peking University", "aff_unique_dep": "School of Computer Science and Technology;SenseTime Research;School of EECS", "aff_unique_url": "http://www.sdu.edu.cn;https://www.sensetime.com;http://www.pku.edu.cn", - "aff_unique_abbr": ";SenseTime;PKU", + 
"aff_unique_abbr": "SDU;SenseTime;PKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Jianlong and Long,\n Keyu and Wang,\n Fei and Qian,\n Chen and Li,\n Cheng and Lin,\n Zhouchen and Zha,\n Hongbin\n},\n title = {\n Deep Comprehensive Correlation Mining for Image Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Constrained Dominant Sets for Person Re-Identification", @@ -7390,7 +7623,7 @@ "author": "Leulseged Tesfaye Alemu; Marcello Pelillo; Mubarak Shah", "abstract": "In this work, we propose an end-to-end constrained clustering scheme to tackle the person re-identification (re-id) problem. Deep neural networks (DNN) have recently proven to be effective on person re-identification task. In particular, rather than leveraging solely a probe-gallery similarity, diffusing the similarities among the gallery images in an end-to-end manner has proven to be effective in yielding a robust probe-gallery affinity. However, existing methods do not apply probe image as a constraint, and are prone to noise propagation during the similarity diffusion process. To overcome this, we propose an intriguing scheme which treats person-image retrieval problem as a constrained clustering optimization problem, called deep constrained dominant sets (DCDS). Given a probe and gallery images, we re-formulate person re-id problem as finding a constrained cluster, where the probe image is taken as a constraint (seed) and each cluster corresponds to a set of images corresponding to the same person. By optimizing the constrained clustering in an end-to-end manner, we naturally leverage the contextual knowledge of a set of images corresponding to the given person-images. 
We further enhance the performance by integrating an auxiliary net alongside DCDS, which employs a multi-scale ResNet. To validate the effectiveness of our method we present experiments on several benchmark datasets and show that the proposed method can outperform state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Alemu_Deep_Constrained_Dominant_Sets_for_Person_Re-Identification_ICCV_2019_paper.pdf", - "aff": "Ca\u2019 Foscari University of Venice; ECLT, Venezia+Ca\u2019 Foscari University of Venice; CRCV, University of Central Florida", + "aff": "Ca’ Foscari University of Venice; ECLT, Venezia+Ca’ Foscari University of Venice; CRCV, University of Central Florida", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Alemu_Deep_Constrained_Dominant_ICCV_2019_supplemental.pdf", @@ -7404,14 +7637,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Alemu_Deep_Constrained_Dominant_Sets_for_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;1+0;2", - "aff_unique_norm": "Ca\u2019 Foscari University of Venice;ECLT;University of Central Florida", - "aff_unique_dep": ";;Center for Research in Computer Vision", + "aff_unique_norm": "Ca’ Foscari University of Venice;ECLT;University of Central Florida", + "aff_unique_dep": ";;CRCV", "aff_unique_url": "https://www.unive.it;;https://www.ucf.edu", - "aff_unique_abbr": "Ca\u2019 Foscari;;UCF", - "aff_campus_unique_index": "0;1+0;2", - "aff_campus_unique": "Venice;Venezia;Orlando", + "aff_unique_abbr": "Ca’ Foscari;;UCF", + "aff_campus_unique_index": "0;1+0", + "aff_campus_unique": "Venice;Venezia;", "aff_country_unique_index": "0;0+0;1", - "aff_country_unique": "Italy;United States" + "aff_country_unique": "Italy;United States", + "bibtex": "@InProceedings{Alemu_2019_ICCV,\n \n author = {\n Alemu,\n Leulseged Tesfaye and Pelillo,\n Marcello and Shah,\n Mubarak\n},\n title = {\n Deep Constrained 
Dominant Sets for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Contextual Attention for Human-Object Interaction Detection", @@ -7423,7 +7657,7 @@ "author": "Tiancai Wang; Rao Muhammad Anwer; Muhammad Haris Khan; Fahad Shahbaz Khan; Yanwei Pang; Ling Shao; Jorma Laaksonen", "abstract": "Human-object interaction detection is an important and relatively new class of visual relationship detection tasks, essential for deeper scene understanding. Most existing approaches decompose the problem into object localization and interaction recognition. Despite showing progress, these approaches only rely on the appearances of humans and objects and overlook the available context information, crucial for capturing subtle interactions between them. We propose a contextual attention framework for human-object interaction detection. Our approach leverages context by learning contextually-aware appearance features for human and object instances. The proposed attention module then adaptively selects relevant instance-centric context information to highlight image regions likely to contain human-object interactions. Experiments are performed on three benchmarks: V-COCO, HICO-DET and HCVRD. Our approach outperforms the state-of-the-art on all datasets. 
On the V-COCO dataset, our method achieves a relative gain of 4.4% in terms of role mean average precision (mAP role ), compared to the existing best approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wang_Deep_Contextual_Attention_for_Human-Object_Interaction_Detection_ICCV_2019_paper.pdf", - "aff": "School of Electrical and Information Engineering, Tianjin University; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; School of Electrical and Information Engineering, Tianjin University; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Department of Computer Science, Aalto University School of Science, Finland", + "aff": "School of Electrical and Information Engineering, Tianjin University; Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; School of Electrical and Information Engineering, Tianjin University; Inception Institute of Artificial Intelligence (IIAI), UAE; Department of Computer Science, Aalto University School of Science, Finland", "project": "", "github": "", "supp": "", @@ -7444,7 +7678,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;1;2", - "aff_country_unique": "China;United Arab Emirates;Finland" + "aff_country_unique": "China;United Arab Emirates;Finland", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Tiancai and Anwer,\n Rao Muhammad and Khan,\n Muhammad Haris and Khan,\n Fahad Shahbaz and Pang,\n Yanwei and Shao,\n Ling and Laaksonen,\n Jorma\n},\n title = {\n Deep Contextual Attention for Human-Object Interaction Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2019\n} \n}" }, { "title": "Deep Depth From Aberration Map", @@ -7477,7 +7712,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Kashiwagi_2019_ICCV,\n \n author = {\n Kashiwagi,\n Masako and Mishima,\n Nao and Kozakaya,\n Tatsuo and Hiura,\n Shinsaku\n},\n title = {\n Deep Depth From Aberration Map\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Elastic Networks With Model Selection for Multi-Task Learning", @@ -7510,7 +7746,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Seoul;Oxford", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "South Korea;United Kingdom" + "aff_country_unique": "South Korea;United Kingdom", + "bibtex": "@InProceedings{Ahn_2019_ICCV,\n \n author = {\n Ahn,\n Chanho and Kim,\n Eunwoo and Oh,\n Songhwai\n},\n title = {\n Deep Elastic Networks With Model Selection for Multi-Task Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep End-to-End Alignment and Refinement for Time-of-Flight RGB-D Module", @@ -7536,14 +7773,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qiu_Deep_End-to-End_Alignment_and_Refinement_for_Time-of-Flight_RGB-D_Module_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;0", - "aff_unique_norm": "SenseTime;Chinese University of Hong Kong", + "aff_unique_norm": "SenseTime;The Chinese University of Hong Kong", "aff_unique_dep": "SenseTime Research;", "aff_unique_url": "https://www.sensetime.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "SenseTime;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qiu_2019_ICCV,\n \n author = {\n Qiu,\n Di and Pang,\n Jiahao and Sun,\n Wenxiu and Yang,\n Chengxi\n},\n title = {\n Deep End-to-End Alignment and Refinement for Time-of-Flight RGB-D Module\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Floor Plan Recognition Using a Multi-Task Network With Room-Boundary-Guided Attention", @@ -7569,14 +7807,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zeng_Deep_Floor_Plan_Recognition_Using_a_Multi-Task_Network_With_Room-Boundary-Guided_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zeng_2019_ICCV,\n \n author = {\n Zeng,\n Zhiliang and Li,\n Xianzhi and Yu,\n Ying Kin and Fu,\n Chi-Wing\n},\n title = {\n Deep Floor Plan Recognition Using a Multi-Task Network With Room-Boundary-Guided Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Graphical Feature Learning for the Feature Matching Problem", @@ -7609,7 +7848,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;2", - "aff_country_unique": "Australia;;Singapore" + "aff_country_unique": "Australia;;Singapore", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Zhen and Lee,\n Wee Sun\n},\n title = {\n Deep Graphical 
Feature Learning for the Feature Matching Problem\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Head Pose Estimation Using Synthetic Images and Partial Adversarial Domain Adaption for Continuous Label Spaces", @@ -7617,7 +7857,7 @@ "status": "Poster", "track": "main", "pid": "6072", - "author_site": "Felix Kuhnke, J\u00c3\u00b6rn Ostermann", + "author_site": "Felix Kuhnke, Jörn Ostermann", "author": "Felix Kuhnke; Jorn Ostermann", "abstract": "Head pose estimation aims at predicting an accurate pose from an image. Current approaches rely on supervised deep learning, which typically requires large amounts of labeled data. Manual or sensor-based annotations of head poses are prone to errors. A solution is to generate synthetic training data by rendering 3D face models. However, the differences (domain gap) between rendered (source-domain) and real-world (target-domain) images can cause low performance. Advances in visual domain adaptation allow reducing the influence of domain differences using adversarial neural networks, which match the feature spaces between domains by enforcing domain-invariant features. While previous work on visual domain adaptation generally assumes discrete and shared label spaces, these assumptions are both invalid for pose estimation tasks. We are the first to present domain adaptation for head pose estimation with a focus on partially shared and continuous label spaces. More precisely, we adapt the predominant weighting approaches to continuous label spaces by applying a weighted resampling of the source domain during training. To evaluate our approach, we revise and extend existing datasets resulting in a new benchmark for visual domain adaption. 
Our experiments show that our method improves the accuracy of head pose estimation for real-world images despite using only labels from synthetic images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Kuhnke_Deep_Head_Pose_Estimation_Using_Synthetic_Images_and_Partial_Adversarial_ICCV_2019_paper.pdf", @@ -7633,7 +7873,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kuhnke_Deep_Head_Pose_Estimation_Using_Synthetic_Images_and_Partial_Adversarial_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kuhnke_Deep_Head_Pose_Estimation_Using_Synthetic_Images_and_Partial_Adversarial_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Kuhnke_2019_ICCV,\n \n author = {\n Kuhnke,\n Felix and Ostermann,\n Jorn\n},\n title = {\n Deep Head Pose Estimation Using Synthetic Images and Partial Adversarial Domain Adaption for Continuous Label Spaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Hough Voting for 3D Object Detection in Point Clouds", @@ -7657,7 +7898,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qi_Deep_Hough_Voting_for_3D_Object_Detection_in_Point_Clouds_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qi_Deep_Hough_Voting_for_3D_Object_Detection_in_Point_Clouds_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Qi_2019_ICCV,\n \n author = {\n Qi,\n Charles R. 
and Litany,\n Or and He,\n Kaiming and Guibas,\n Leonidas J.\n},\n title = {\n Deep Hough Voting for 3D Object Detection in Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Joint-Semantics Reconstructing Hashing for Large-Scale Unsupervised Cross-Modal Retrieval", @@ -7690,7 +7932,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Su_2019_ICCV,\n \n author = {\n Su,\n Shupeng and Zhong,\n Zhisheng and Zhang,\n Chao\n},\n title = {\n Deep Joint-Semantics Reconstructing Hashing for Large-Scale Unsupervised Cross-Modal Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Learning for Light Field Saliency Detection", @@ -7723,7 +7966,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Tiantian and Piao,\n Yongri and Li,\n Xiao and Zhang,\n Lihe and Lu,\n Huchuan\n},\n title = {\n Deep Learning for Light Field Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Learning for Seeing Through Window With Raindrops", @@ -7756,7 +8000,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Guangzhou;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Quan_2019_ICCV,\n \n author = {\n Quan,\n Yuhui and Deng,\n Shijie and Chen,\n Yixin and 
Ji,\n Hui\n},\n title = {\n Deep Learning for Seeing Through Window With Raindrops\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Mesh Reconstruction From Single RGB Images via Topology Modification Networks", @@ -7782,14 +8027,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pan_Deep_Mesh_Reconstruction_From_Single_RGB_Images_via_Topology_Modification_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0;0", - "aff_unique_norm": "South China University of Technology;Chinese University of Hong Kong (Shenzhen);University of Southern California", + "aff_unique_norm": "South China University of Technology;The Chinese University of Hong Kong (Shenzhen);University of Southern California", "aff_unique_dep": "School of Electronic and Information Engineering;Shenzhen Research Institute of Big Data;Institute for Creative Technologies", "aff_unique_url": "https://www.scut.edu.cn;https://www.cuhk.edu.cn;https://ict.usc.edu", "aff_unique_abbr": "SCUT;CUHK (Shenzhen);USC ICT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Pan_2019_ICCV,\n \n author = {\n Pan,\n Junyi and Han,\n Xiaoguang and Chen,\n Weikai and Tang,\n Jiapeng and Jia,\n Kui\n},\n title = {\n Deep Mesh Reconstruction From Single RGB Images via Topology Modification Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Meta Functionals for Shape Representation", @@ -7815,14 +8061,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Littwin_Deep_Meta_Functionals_for_Shape_Representation_ICCV_2019_paper.html",
"aff_unique_index": "0;0+1", - "aff_unique_norm": "Tel Aviv University;Meta", + "aff_unique_norm": "Tel Aviv University;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.tau.ac.il;https://research.facebook.com", "aff_unique_abbr": "TAU;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Littwin_2019_ICCV,\n \n author = {\n Littwin,\n Gidi and Wolf,\n Lior\n},\n title = {\n Deep Meta Functionals for Shape Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Meta Learning for Real-Time Target-Aware Visual Tracking", @@ -7855,7 +8102,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choi_2019_ICCV,\n \n author = {\n Choi,\n Janghoon and Kwon,\n Junseok and Lee,\n Kyoung Mu\n},\n title = {\n Deep Meta Learning for Real-Time Target-Aware Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Meta Metric Learning", @@ -7888,7 +8136,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Guangyi and Zhang,\n Tianren and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Deep Meta Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" 
}, { "title": "Deep Metric Learning With Tuplet Margin Loss", @@ -7914,14 +8163,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Deep_Metric_Learning_With_Tuplet_Margin_Loss_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Sydney", + "aff_unique_norm": "The University of Sydney", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.sydney.edu.au", "aff_unique_abbr": "USYD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darlington", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Baosheng and Tao,\n Dacheng\n},\n title = {\n Deep Metric Learning With Tuplet Margin Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Multi-Model Fusion for Single-Image Dehazing", @@ -7945,7 +8195,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Deng_Deep_Multi-Model_Fusion_for_Single-Image_Dehazing_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Deng_Deep_Multi-Model_Fusion_for_Single-Image_Dehazing_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Deng_2019_ICCV,\n \n author = {\n Deng,\n Zijun and Zhu,\n Lei and Hu,\n Xiaowei and Fu,\n Chi-Wing and Xu,\n Xuemiao and Zhang,\n Qing and Qin,\n Jing and Heng,\n Pheng-Ann\n},\n title = {\n Deep Multi-Model Fusion for Single-Image Dehazing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Multiple-Attribute-Perceived Network for Real-World Texture Recognition", @@ -7969,7 +8220,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Zhai_Deep_Multiple-Attribute-Perceived_Network_for_Real-World_Texture_Recognition_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhai_Deep_Multiple-Attribute-Perceived_Network_for_Real-World_Texture_Recognition_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhai_2019_ICCV,\n \n author = {\n Zhai,\n Wei and Cao,\n Yang and Zhang,\n Jing and Zha,\n Zheng-Jun\n},\n title = {\n Deep Multiple-Attribute-Perceived Network for Real-World Texture Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Non-Rigid Structure From Motion", @@ -8002,7 +8254,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kong_2019_ICCV,\n \n author = {\n Kong,\n Chen and Lucey,\n Simon\n},\n title = {\n Deep Non-Rigid Structure From Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Optics for Monocular Depth Estimation and 3D Object Detection", @@ -8035,7 +8288,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chang_2019_ICCV,\n \n author = {\n Chang,\n Julie and Wetzstein,\n Gordon\n},\n title = {\n Deep Optics for Monocular Depth Estimation and 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Parametric Indoor Lighting Estimation", @@ -8043,11 +8297,11 @@ "status": "Poster", 
"track": "main", "pid": "5073", - "author_site": "Marc-Andr\u00c3\u00a9 Gardner, Yannick Hold-Geoffroy, Kalyan Sunkavalli, Christian Gagn\u00c3\u00a9, Jean-Fran\u00c3\u00a7ois Lalonde", + "author_site": "Marc-André Gardner, Yannick Hold-Geoffroy, Kalyan Sunkavalli, Christian Gagné, Jean-François Lalonde", "author": "Marc-Andre Gardner; Yannick Hold-Geoffroy; Kalyan Sunkavalli; Christian Gagne; Jean-Francois Lalonde", "abstract": "We present a method to estimate lighting from a single image of an indoor scene. Previous work has used an environment map representation that does not account for the localized nature of indoor lighting. Instead, we represent lighting as a set of discrete 3D lights with geometric and photometric parameters. We train a deep neural network to regress these parameters from a single image, on a dataset of environment maps annotated with depth. We propose a differentiable layer to convert these parameters to an environment map to compute our loss; this bypasses the challenge of establishing correspondences between estimated and ground truth lights. 
We demonstrate, via quantitative and qualitative evaluations, that our representation and training scheme lead to more accurate results compared to previous work, while allowing for more realistic 3D object compositing with spatially-varying lighting.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Gardner_Deep_Parametric_Indoor_Lighting_Estimation_ICCV_2019_paper.pdf", - "aff": "Universit\u00e9 Laval+Adobe Research; Adobe Research; Adobe Research; Universit\u00e9 Laval; Universit\u00e9 Laval", + "aff": "Université Laval+Adobe Research; Adobe Research; Adobe Research; Université Laval; Université Laval", "project": "https://lvsn.github.io/deepparametric/", "github": "", "supp": "", @@ -8061,14 +8315,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gardner_Deep_Parametric_Indoor_Lighting_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;1;0;0", - "aff_unique_norm": "Universit\u00e9 Laval;Adobe", + "aff_unique_norm": "Université Laval;Adobe", "aff_unique_dep": ";Adobe Research", "aff_unique_url": "https://www.ulaval.ca;https://research.adobe.com", "aff_unique_abbr": "ULaval;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;0;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Gardner_2019_ICCV,\n \n author = {\n Gardner,\n Marc-Andre and Hold-Geoffroy,\n Yannick and Sunkavalli,\n Kalyan and Gagne,\n Christian and Lalonde,\n Jean-Francois\n},\n title = {\n Deep Parametric Indoor Lighting Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Reinforcement Active Learning for Human-in-the-Loop Person Re-Identification", @@ -8094,14 +8349,15 @@ "author_num": 5, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Deep_Reinforcement_Active_Learning_for_Human-in-the-Loop_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0;1", - "aff_unique_norm": "Dalian University of Technology;University of Sydney;Queen Mary University of London", + "aff_unique_norm": "Dalian University of Technology;The University of Sydney;Queen Mary University of London", "aff_unique_dep": ";UBTECH Sydney AI Center;", "aff_unique_url": "http://www.dlut.edu.cn/;https://www.sydney.edu.au;https://www.qmul.ac.uk", "aff_unique_abbr": "DUT;USYD;QMUL", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Sydney;London", "aff_country_unique_index": "0;1;2;0;1", - "aff_country_unique": "China;Australia;United Kingdom" + "aff_country_unique": "China;Australia;United Kingdom", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Zimo and Wang,\n Jingya and Gong,\n Shaogang and Lu,\n Huchuan and Tao,\n Dacheng\n},\n title = {\n Deep Reinforcement Active Learning for Human-in-the-Loop Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Residual Learning in the JPEG Transform Domain", @@ -8134,7 +8390,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ehrlich_2019_ICCV,\n \n author = {\n Ehrlich,\n Max and Davis,\n Larry S.\n},\n title = {\n Deep Residual Learning in the JPEG Transform Domain\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Restoration of Vintage Photographs From Scanned Halftone Prints", @@ -8167,7 +8424,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": 
"", "aff_country_unique_index": "0;1;1+0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Qifan and Shu,\n Xiao and Wu,\n Xiaolin\n},\n title = {\n Deep Restoration of Vintage Photographs From Scanned Halftone Prints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep SR-ITM: Joint Learning of Super-Resolution and Inverse Tone-Mapping for 4K UHD HDR Applications", @@ -8191,7 +8449,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kim_Deep_SR-ITM_Joint_Learning_of_Super-Resolution_and_Inverse_Tone-Mapping_for_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kim_Deep_SR-ITM_Joint_Learning_of_Super-Resolution_and_Inverse_Tone-Mapping_for_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Kim_2019_ICCV,\n \n author = {\n Kim,\n Soo Ye and Oh,\n Jihyong and Kim,\n Munchurl\n},\n title = {\n Deep SR-ITM: Joint Learning of Super-Resolution and Inverse Tone-Mapping for 4K UHD HDR Applications\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Self-Learning From Noisy Labels", @@ -8217,14 +8476,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Han_Deep_Self-Learning_From_Noisy_Labels_ICCV_2019_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong;The University of Hong Kong", "aff_unique_dep": "CUHK-SenseTime Joint Laboratory;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.hku.hk", "aff_unique_abbr": "CUHK;HKU", "aff_campus_unique_index": "0;0;0", 
"aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2019_ICCV,\n \n author = {\n Han,\n Jiangfan and Luo,\n Ping and Wang,\n Xiaogang\n},\n title = {\n Deep Self-Learning From Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Single-Image Portrait Relighting", @@ -8250,14 +8510,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Deep_Single-Image_Portrait_Relighting_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;2;0", - "aff_unique_norm": "University of Maryland;Amazon;Adobe", - "aff_unique_dep": ";Amazon.com, Inc.;Adobe Research", + "aff_unique_norm": "University of Maryland;Amazon.com, Inc.;Adobe", + "aff_unique_dep": ";;Adobe Research", "aff_unique_url": "https://www/umd.edu;https://www.amazon.com;https://research.adobe.com", "aff_unique_abbr": "UMD;Amazon;Adobe", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Hao and Hadap,\n Sunil and Sunkavalli,\n Kalyan and Jacobs,\n David W.\n},\n title = {\n Deep Single-Image Portrait Relighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Supervised Hashing With Anchor Graph", @@ -8269,7 +8530,7 @@ "author": "Yudong Chen; Zhihui Lai; Yujuan Ding; Kaiyi Lin; Wai Keung Wong", "abstract": "Recently, a series of deep supervised hashing methods were proposed for binary code learning. 
However, due to the high computation cost and the limited hardware's memory, these methods will first select a subset from the training set, and then form a mini-batch data to update the network in each iteration. Therefore, the remaining labeled data cannot be fully utilized and the model cannot directly obtain the binary codes of the entire training set for retrieval. To address these problems, this paper proposes an interesting regularized deep model to seamlessly integrate the advantages of deep hashing and efficient binary code learning by using the anchor graph. As such, the deep features and label matrix can be jointly used to optimize the binary codes, and the network can obtain more discriminative feedback from the linear combinations of the learned bits. Moreover, we also reveal the algorithm mechanism and its computation essence. Experiments on three large-scale datasets indicate that the proposed method achieves better retrieval performance with less training time compared to previous deep hashing methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Chen_Deep_Supervised_Hashing_With_Anchor_Graph_ICCV_2019_paper.pdf", - "aff": "College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen University, Shenzhen, China; College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China+Shenzhen Institute of Arti\ufb01cial Intelligence and Robotics for Society, Shenzhen, China; Institute of Textiles and Clothing, The Hong Kong Polytechnic University, Hong Kong, China; School of Software and Microelectronics, Peking University, Beijing, China; Institute of Textiles and Clothing, The Hong Kong Polytechnic University, Hong Kong, China", + "aff": "College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen 
University, Shenzhen, China; College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China+Shenzhen Institute of Artificial Intelligence and Robotics for Society, Shenzhen, China; Institute of Textiles and Clothing, The Hong Kong Polytechnic University, Hong Kong, China; School of Software and Microelectronics, Peking University, Beijing, China; Institute of Textiles and Clothing, The Hong Kong Polytechnic University, Hong Kong, China", "project": "", "github": "", "supp": "", @@ -8283,14 +8544,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Deep_Supervised_Hashing_With_Anchor_Graph_ICCV_2019_paper.html", "aff_unique_index": "0+0;0+1;2;3;2", - "aff_unique_norm": "Shenzhen University;Shenzhen Institute of Artificial Intelligence and Robotics for Society;Hong Kong Polytechnic University;Peking University", + "aff_unique_norm": "Shenzhen University;Shenzhen Institute of Artificial Intelligence and Robotics for Society;The Hong Kong Polytechnic University;Peking University", "aff_unique_dep": "College of Computer Science and Software Engineering;;Institute of Textiles and Clothing;School of Software and Microelectronics", "aff_unique_url": "https://www.szu.edu.cn;;https://www.polyu.edu.hk;http://www.pku.edu.cn", "aff_unique_abbr": "SZU;;PolyU;PKU", "aff_campus_unique_index": "0+0;0+0;1;2;1", "aff_campus_unique": "Shenzhen;Hong Kong;Beijing", "aff_country_unique_index": "0+0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yudong and Lai,\n Zhihui and Ding,\n Yujuan and Lin,\n Kaiyi and Wong,\n Wai Keung\n},\n title = {\n Deep Supervised Hashing With Anchor Graph\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deep Tensor ADMM-Net for Snapshot Compressive Imaging", @@ -8323,7 +8585,8 
@@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ma_2019_ICCV,\n \n author = {\n Ma,\n Jiawei and Liu,\n Xiao-Yang and Shou,\n Zheng and Yuan,\n Xin\n},\n title = {\n Deep Tensor ADMM-Net for Snapshot Compressive Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DeepGCNs: Can GCNs Go As Deep As CNNs?", @@ -8331,7 +8594,7 @@ "status": "Oral", "track": "main", "pid": "153", - "author_site": "Guohao Li, Matthias M\u00c3\u00bcller, Ali Thabet, Bernard Ghanem", + "author_site": "Guohao Li, Matthias Müller, Ali Thabet, Bernard Ghanem", "author": "Guohao Li; Matthias Muller; Ali Thabet; Bernard Ghanem", "abstract": "Convolutional Neural Networks (CNNs) achieve impressive performance in a wide variety of fields. Their success benefited from a massive boost when very deep CNN models were able to be reliably trained. Despite their merits, CNNs fail to properly address problems with non-Euclidean data. To overcome this challenge, Graph Convolutional Networks (GCNs) build graphs to represent non-Euclidean data, borrow concepts from CNNs, and apply them in training. GCNs show promising results, but they are usually limited to very shallow models due to the vanishing gradient problem. As a result, most state-of-the-art GCN models are no deeper than 3 or 4 layers. In this work, we present new ways to successfully train very deep GCNs. We do this by borrowing concepts from CNNs, specifically residual/dense connections and dilated convolutions, and adapting them to GCN architectures. Extensive experiments show the positive effect of these deep GCN frameworks. 
Finally, we use these new concepts to build a very deep 56-layer GCN, and show how it significantly boosts performance (+3.7% mIoU over state-of-the-art) in the task of point cloud semantic segmentation. We believe that the community can greatly benefit from this work, as it opens up many opportunities for advancing GCN-based research.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_DeepGCNs_Can_GCNs_Go_As_Deep_As_CNNs_ICCV_2019_paper.pdf", @@ -8356,7 +8619,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Thuwal", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Guohao and Muller,\n Matthias and Thabet,\n Ali and Ghanem,\n Bernard\n},\n title = {\n DeepGCNs: Can GCNs Go As Deep As CNNs?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DeepHuman: 3D Human Reconstruction From a Single Image", @@ -8389,7 +8653,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zheng_2019_ICCV,\n \n author = {\n Zheng,\n Zerong and Yu,\n Tao and Wei,\n Yixuan and Dai,\n Qionghai and Liu,\n Yebin\n},\n title = {\n DeepHuman: 3D Human Reconstruction From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DeepPruner: Learning Efficient Stereo Matching via Differentiable PatchMatch", @@ -8422,7 +8687,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0+0;0;0+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": 
"@InProceedings{Duggal_2019_ICCV,\n \n author = {\n Duggal,\n Shivam and Wang,\n Shenlong and Ma,\n Wei-Chiu and Hu,\n Rui and Urtasun,\n Raquel\n},\n title = {\n DeepPruner: Learning Efficient Stereo Matching via Differentiable PatchMatch\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DeepVCP: An End-to-End Deep Neural Network for Point Cloud Registration", @@ -8455,7 +8721,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2019_ICCV,\n \n author = {\n Lu,\n Weixin and Wan,\n Guowei and Zhou,\n Yao and Fu,\n Xiangyu and Yuan,\n Pengfei and Song,\n Shiyu\n},\n title = {\n DeepVCP: An End-to-End Deep Neural Network for Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Defending Against Universal Perturbations With Shared Adversarial Training", @@ -8467,7 +8734,7 @@ "author": "Chaithanya Kumar Mummadi; Thomas Brox; Jan Hendrik Metzen", "abstract": "Classifiers such as deep neural networks have been shown to be vulnerable against adversarial perturbations on problems with high-dimensional input space. While adversarial training improves the robustness of image classifiers against such adversarial perturbations, it leaves them sensitive to perturbations on a non-negligible fraction of the inputs. In this work, we show that adversarial training is more effective in preventing universal perturbations, where the same perturbation needs to fool a classifier on many inputs. 
Moreover, we investigate the trade-off between robustness against universal perturbations and performance on unperturbed data and propose an extension of adversarial training that handles this trade-off more gracefully. We present results for image classification and semantic segmentation to showcase that universal perturbations that fool a model hardened with adversarial training become clearly perceptible and show patterns of the target scene.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Mummadi_Defending_Against_Universal_Perturbations_With_Shared_Adversarial_Training_ICCV_2019_paper.pdf", - "aff": "University of Freiburg + Bosch Center for Arti\ufb01cial Intelligence, Germany; University of Freiburg; Bosch Center for Arti\ufb01cial Intelligence, Germany", + "aff": "University of Freiburg + Bosch Center for Artificial Intelligence, Germany; University of Freiburg; Bosch Center for Artificial Intelligence, Germany", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Mummadi_Defending_Against_Universal_ICCV_2019_supplemental.pdf", @@ -8481,14 +8748,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mummadi_Defending_Against_Universal_Perturbations_With_Shared_Adversarial_Training_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;1", - "aff_unique_norm": "University of Freiburg;Bosch Center for Arti\ufb01cial Intelligence", + "aff_unique_norm": "University of Freiburg;Bosch Center for Artificial Intelligence", "aff_unique_dep": ";Artificial Intelligence", "aff_unique_url": "https://www.uni-freiburg.de;https://www.bosch-ai.com", "aff_unique_abbr": "UoF;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Mummadi_2019_ICCV,\n \n author = {\n Mummadi,\n Chaithanya Kumar and Brox,\n Thomas and Metzen,\n Jan Hendrik\n},\n title = 
{\n Defending Against Universal Perturbations With Shared Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Deformable Surface Tracking by Graph Matching", @@ -8514,14 +8782,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Deformable_Surface_Tracking_by_Graph_Matching_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;2", - "aff_unique_norm": "Beijing Jiao Tong University;Stony Brook University;HiScene Information Technologies", + "aff_unique_norm": "Beijing Jiaotong University;Stony Brook University;HiScene Information Technologies", "aff_unique_dep": ";;", - "aff_unique_url": "http://www.njtu.edu.cn/en;https://www.stonybrook.edu;", + "aff_unique_url": "https://www.bjtu.edu.cn;https://www.stonybrook.edu;", "aff_unique_abbr": "BJTU;SBU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;United States;" + "aff_country_unique": "China;United States;", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Tao and Ling,\n Haibin and Lang,\n Congyan and Feng,\n Songhe and Hou,\n Xiaohui\n},\n title = {\n Deformable Surface Tracking by Graph Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Delving Deep Into Hybrid Annotations for 3D Human Recovery in the Wild", @@ -8547,14 +8816,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Rong_Delving_Deep_Into_Hybrid_Annotations_for_3D_Human_Recovery_in_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;2;3", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime;Stanford University;Nanyang Technological University", + "aff_unique_norm": "The Chinese University of Hong Kong;SenseTime;Stanford University;Nanyang Technological University", "aff_unique_dep": "CUHK - 
SenseTime Joint Lab;SenseTime Research;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com;https://www.stanford.edu;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;SenseTime;Stanford;NTU", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Hong Kong SAR;;Stanford", "aff_country_unique_index": "0;0;0;1;2", - "aff_country_unique": "China;United States;Singapore" + "aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Rong_2019_ICCV,\n \n author = {\n Rong,\n Yu and Liu,\n Ziwei and Li,\n Cheng and Cao,\n Kaidi and Loy,\n Chen Change\n},\n title = {\n Delving Deep Into Hybrid Annotations for 3D Human Recovery in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Delving Into Robust Object Detection From Unmanned Aerial Vehicles: A Deep Nuisance Disentanglement Approach", @@ -8587,7 +8857,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Zhenyu and Suresh,\n Karthik and Narayanan,\n Priya and Xu,\n Hongyu and Kwon,\n Heesung and Wang,\n Zhangyang\n},\n title = {\n Delving Into Robust Object Detection From Unmanned Aerial Vehicles: A Deep Nuisance Disentanglement Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DensePoint: Learning Densely Contextual Representation for Efficient Point Cloud Processing", @@ -8599,7 +8870,7 @@ "author": "Yongcheng Liu; Bin Fan; Gaofeng Meng; Jiwen Lu; Shiming Xiang; Chunhong Pan", "abstract": "Point cloud processing is very challenging, as the diverse shapes formed by irregular points are often indistinguishable. 
A thorough grasp of the elusive shape requires sufficiently contextual semantic information, yet few works devote to this. Here we propose DensePoint, a general architecture to learn densely contextual representation for point cloud processing. Technically, it extends regular grid CNN to irregular point configuration by generalizing a convolution operator, which holds the permutation invariance of points, and achieves efficient inductive learning of local patterns. Architecturally, it finds inspiration from dense connection mode, to repeatedly aggregate multi-level and multi-scale semantics in a deep hierarchy. As a result, densely contextual information along with rich semantics, can be acquired by DensePoint in an organic manner, making it highly effective. Extensive experiments on challenging benchmarks across four tasks, as well as thorough model analysis, verify DensePoint achieves the state of the arts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Liu_DensePoint_Learning_Densely_Contextual_Representation_for_Efficient_Point_Cloud_Processing_ICCV_2019_paper.pdf", - "aff": "National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences\u2021School of Arti\ufb01cial Intelligence, University of Chinese Academy of Sciences; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences; Department of Automation, Tsinghua University; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences\u2021School of Arti\ufb01cial Intelligence, University of Chinese Academy of Sciences; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences", + "aff": "National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences‡School of Artificial Intelligence, University of Chinese Academy of 
Sciences; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences; Department of Automation, Tsinghua University; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences‡School of Artificial Intelligence, University of Chinese Academy of Sciences; National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Liu_DensePoint_Learning_Densely_ICCV_2019_supplemental.pdf", @@ -8620,7 +8891,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Yongcheng and Fan,\n Bin and Meng,\n Gaofeng and Lu,\n Jiwen and Xiang,\n Shiming and Pan,\n Chunhong\n},\n title = {\n DensePoint: Learning Densely Contextual Representation for Efficient Point Cloud Processing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DenseRaC: Joint 3D Pose and Shape Estimation by Dense Render-and-Compare", @@ -8646,14 +8918,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_DenseRaC_Joint_3D_Pose_and_Shape_Estimation_by_Dense_Render-and-Compare_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;0", - "aff_unique_norm": "Meta;University of California, Los Angeles", - "aff_unique_dep": "Facebook Reality Labs;", + "aff_unique_norm": "Facebook Reality Labs;University of California, Los Angeles", + "aff_unique_dep": ";", "aff_unique_url": "https://www.facebook.com/realitylabs;https://www.ucla.edu", "aff_unique_abbr": "FRL;UCLA", "aff_campus_unique_index": "0+1;1;0", 
"aff_campus_unique": "Sausalito;Los Angeles", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Yuanlu and Zhu,\n Song-Chun and Tung,\n Tony\n},\n title = {\n DenseRaC: Joint 3D Pose and Shape Estimation by Dense Render-and-Compare\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Depth Completion From Sparse LiDAR Data With Depth-Normal Constraints", @@ -8679,14 +8952,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_Depth_Completion_From_Sparse_LiDAR_Data_With_Depth-Normal_Constraints_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;1;0;2;2;1", - "aff_unique_norm": "SenseTime;Chinese University of Hong Kong;Zhejiang University", + "aff_unique_norm": "SenseTime;The Chinese University of Hong Kong;Zhejiang University", "aff_unique_dep": "SenseTime Research;;State Key Lab of CAD&CG", "aff_unique_url": "https://www.sensetime.com;https://www.cuhk.edu.hk;http://www.zju.edu.cn", "aff_unique_abbr": "SenseTime;CUHK;ZJU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Yan and Zhu,\n Xinge and Shi,\n Jianping and Zhang,\n Guofeng and Bao,\n Hujun and Li,\n Hongsheng\n},\n title = {\n Depth Completion From Sparse LiDAR Data With Depth-Normal Constraints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Depth From Videos in the Wild: Unsupervised Monocular Depth Learning From Unknown Cameras", @@ -8719,7 +8993,8 @@ "aff_campus_unique_index": "0+0;0;0+0;0+0", 
"aff_campus_unique": "Mountain View", "aff_country_unique_index": "0+0;0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gordon_2019_ICCV,\n \n author = {\n Gordon,\n Ariel and Li,\n Hanhan and Jonschkowski,\n Rico and Angelova,\n Anelia\n},\n title = {\n Depth From Videos in the Wild: Unsupervised Monocular Depth Learning From Unknown Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Depth-Induced Multi-Scale Recurrent Attention Network for Saliency Detection", @@ -8752,7 +9027,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Piao_2019_ICCV,\n \n author = {\n Piao,\n Yongri and Ji,\n Wei and Li,\n Jingjing and Zhang,\n Miao and Lu,\n Huchuan\n},\n title = {\n Depth-Induced Multi-Scale Recurrent Attention Network for Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Detecting 11K Classes: Large Scale Object Detection Without Fine-Grained Bounding Boxes", @@ -8778,14 +9054,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Detecting_11K_Classes_Large_Scale_Object_Detection_Without_Fine-Grained_Bounding_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Amazon Web Services", + "aff_unique_norm": "Amazon Web Services", + "aff_unique_dep": "", "aff_unique_url": "https://aws.amazon.com", "aff_unique_abbr": "AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Hao and Wu,\n Hao and Chen,\n Hao\n},\n title = {\n Detecting 11K Classes: Large Scale Object Detection Without Fine-Grained Bounding Boxes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Detecting Photoshopped Faces by Scripting Photoshop", @@ -8818,7 +9095,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Sheng-Yu and Wang,\n Oliver and Owens,\n Andrew and Zhang,\n Richard and Efros,\n Alexei A.\n},\n title = {\n Detecting Photoshopped Faces by Scripting Photoshop\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Detecting Unseen Visual Relations Using Analogies", @@ -8842,7 +9120,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Peyre_Detecting_Unseen_Visual_Relations_Using_Analogies_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Peyre_Detecting_Unseen_Visual_Relations_Using_Analogies_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Peyre_2019_ICCV,\n \n author = {\n Peyre,\n Julia and Laptev,\n Ivan and Schmid,\n Cordelia and Sivic,\n Josef\n},\n title = {\n Detecting Unseen Visual Relations Using Analogies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Detecting the Unexpected via Image Resynthesis", @@ -8866,7 +9145,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Lis_Detecting_the_Unexpected_via_Image_Resynthesis_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lis_Detecting_the_Unexpected_via_Image_Resynthesis_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Lis_2019_ICCV,\n \n author = {\n Lis,\n Krzysztof and Nakka,\n Krishna and Fua,\n Pascal and Salzmann,\n Mathieu\n},\n title = {\n Detecting the Unexpected via Image Resynthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DewarpNet: Single-Image Document Unwarping With Stacked 3D and 2D Regression Networks", @@ -8899,7 +9179,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Das_2019_ICCV,\n \n author = {\n Das,\n Sagnik and Ma,\n Ke and Shu,\n Zhixin and Samaras,\n Dimitris and Shilkrot,\n Roy\n},\n title = {\n DewarpNet: Single-Image Document Unwarping With Stacked 3D and 2D Regression Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Differentiable Kernel Evolution", @@ -8924,15 +9205,16 @@ "email": "ee.cuhk.edu.hk;sensetime.com;cse.cuhk.edu.hk;ee.cuhk.edu.hk", "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Differentiable_Kernel_Evolution_ICCV_2019_paper.html", - "aff_unique_index": "0+0;1;0;0+0", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime", - "aff_unique_dep": "CUHK-SenseTime Joint Laboratory;SenseTime Research", - "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com", - "aff_unique_abbr": "CUHK;SenseTime", + "aff_unique_index": "0+1;2;1;0+1", + "aff_unique_norm": "Chinese University of Hong 
Kong;The Chinese University of Hong Kong;SenseTime", + "aff_unique_dep": "CUHK-SenseTime Joint Laboratory;;SenseTime Research", + "aff_unique_url": "https://www.cuhk.edu.hk;https://www.cuhk.edu.hk;https://www.sensetime.com", + "aff_unique_abbr": "CUHK;CUHK;SenseTime", "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Yu and Liu,\n Jihao and Zeng,\n Ailing and Wang,\n Xiaogang\n},\n title = {\n Differentiable Kernel Evolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Differentiable Learning-to-Group Channels via Groupable Convolutional Neural Networks", @@ -8958,14 +9240,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Differentiable_Learning-to-Group_Channels_via_Groupable_Convolutional_Neural_Networks_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;1;1;0;2", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime;University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong;SenseTime;The University of Hong Kong", "aff_unique_dep": "CUHK-SenseTime Joint Laboratory;SenseTime Research;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com;https://www.hku.hk", "aff_unique_abbr": "CUHK;SenseTime;HKU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Zhaoyang and Li,\n Jingyu and Shao,\n Wenqi and Peng,\n Zhanglin and Zhang,\n Ruimao and Wang,\n Xiaogang and Luo,\n Ping\n},\n title = {\n Differentiable Learning-to-Group Channels via Groupable 
Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Differentiable Soft Quantization: Bridging Full-Precision and Low-Bit Neural Networks", @@ -8998,7 +9281,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gong_2019_ICCV,\n \n author = {\n Gong,\n Ruihao and Liu,\n Xianglong and Jiang,\n Shenghu and Li,\n Tianxiang and Hu,\n Peng and Lin,\n Jiazhen and Yu,\n Fengwei and Yan,\n Junjie\n},\n title = {\n Differentiable Soft Quantization: Bridging Full-Precision and Low-Bit Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Digging Into Self-Supervised Monocular Depth Estimation", @@ -9006,7 +9290,7 @@ "status": "Poster", "track": "main", "pid": "3670", - "author_site": "Cl\u00c3\u00a9ment Godard, Oisin Mac Aodha, Michael Firman, Gabriel J. Brostow", + "author_site": "Clément Godard, Oisin Mac Aodha, Michael Firman, Gabriel J. Brostow", "author": "Clement Godard; Oisin Mac Aodha; Michael Firman; Gabriel J. Brostow", "abstract": "Per-pixel ground-truth depth data is challenging to acquire at scale. To overcome this limitation, self-supervised learning has emerged as a promising alternative for training models to perform monocular depth estimation. In this paper, we propose a set of improvements, which together result in both quantitatively and qualitatively improved depth maps compared to competing self-supervised methods. 
Research on self-supervised monocular training usually explores increasingly complex architectures, loss functions, and image formation models, all of which have recently helped to close the gap with fully-supervised methods. We show that a surprisingly simple model, and associated design choices, lead to superior predictions. In particular, we propose (i) a minimum reprojection loss, designed to robustly handle occlusions, (ii) a full-resolution multi-scale sampling method that reduces visual artifacts, and (iii) an auto-masking loss to ignore training pixels that violate camera motion assumptions. We demonstrate the effectiveness of each component in isolation, and show high quality, state-of-the-art results on the KITTI benchmark.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Godard_Digging_Into_Self-Supervised_Monocular_Depth_Estimation_ICCV_2019_paper.pdf", @@ -9022,7 +9306,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Godard_Digging_Into_Self-Supervised_Monocular_Depth_Estimation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Godard_Digging_Into_Self-Supervised_Monocular_Depth_Estimation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Godard_2019_ICCV,\n \n author = {\n Godard,\n Clement and Mac Aodha,\n Oisin and Firman,\n Michael and Brostow,\n Gabriel J.\n},\n title = {\n Digging Into Self-Supervised Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "3443673ccd", @@ -9051,7 +9336,8 @@ "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Madison;Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhen_2019_ICCV,\n \n author = {\n Zhen,\n Xingjian and Chakraborty,\n 
Rudrasis and Vogt,\n Nicholas and Bendlin,\n Barbara B. and Singh,\n Vikas\n},\n title = {\n Dilated Convolutional Neural Networks for Sequential Manifold-Valued Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DiscoNet: Shapes Learning on Disconnected Manifolds for 3D Editing", @@ -9059,7 +9345,7 @@ "status": "Poster", "track": "main", "pid": "3502", - "author_site": "\u00c3\u0089loi Mehr, Ariane Jourdan, Nicolas Thome, Matthieu Cord, Vincent Guitteny", + "author_site": "Éloi Mehr, Ariane Jourdan, Nicolas Thome, Matthieu Cord, Vincent Guitteny", "author": "Eloi Mehr; Ariane Jourdan; Nicolas Thome; Matthieu Cord; Vincent Guitteny", "abstract": "Editing 3D models is a very challenging task, as it requires complex interactions with the 3D shape to reach the targeted design, while preserving the global consistency and plausibility of the shape. In this work, we present an intelligent and user-friendly 3D editing tool, where the edited model is constrained to lie onto a learned manifold of realistic shapes. Due to the topological variability of real 3D models, they often lie close to a disconnected manifold, which cannot be learned with a common learning algorithm. Therefore, our tool is based on a new deep learning model, DiscoNet, which extends 3D surface autoencoders in two ways. Firstly, our deep learning model uses several autoencoders to automatically learn each connected component of a disconnected manifold, without any supervision. Secondly, each autoencoder infers the output 3D surface by deforming a pre-learned 3D template specific to each connected component. 
Both advances translate into improved 3D synthesis, thus enhancing the quality of our 3D editing tool.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Mehr_DiscoNet_Shapes_Learning_on_Disconnected_Manifolds_for_3D_Editing_ICCV_2019_paper.pdf", @@ -9084,7 +9370,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Mehr_2019_ICCV,\n \n author = {\n Mehr,\n Eloi and Jourdan,\n Ariane and Thome,\n Nicolas and Cord,\n Matthieu and Guitteny,\n Vincent\n},\n title = {\n DiscoNet: Shapes Learning on Disconnected Manifolds for 3D Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Discrete Laplace Operator Estimation for Dynamic 3D Reconstruction", @@ -9117,7 +9404,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hoboken", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Xiangyu and Dunn,\n Enrique\n},\n title = {\n Discrete Laplace Operator Estimation for Dynamic 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Discriminative Feature Learning With Consistent Attention Regularization for Person Re-Identification", @@ -9129,7 +9417,7 @@ "author": "Sanping Zhou; Fei Wang; Zeyi Huang; Jinjun Wang", "abstract": "Person re-identification (Re-ID) has undergone a rapid development with the blooming of deep neural network. Most methods are very easily affected by target misalignment and background clutter in the training process. 
In this paper, we propose a simple yet effective feedforward attention network to address the two mentioned problems, in which a novel consistent attention regularizer and an improved triplet loss are designed to learn foreground attentive features for person Re-ID. Specifically, the consistent attention regularizer aims to keep the deduced foreground masks similar from the low-level, mid-level and high-level feature maps. As a result, the network will focus on the foreground regions at the lower layers, which is benefit to learn discriminative features from the foreground regions at the higher layers. Last but not least, the improved triplet loss is introduced to enhance the feature learning capability, which can jointly minimize the intra-class distance and maximize the inter-class distance in each triplet unit. Experimental results on the Market1501, DukeMTMC-reID and CUHK03 datasets have shown that our method outperforms most of the state-of-the-art approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhou_Discriminative_Feature_Learning_With_Consistent_Attention_Regularization_for_Person_Re-Identification_ICCV_2019_paper.pdf", - "aff": "The Institute of Arti\ufb01cial Intelligence and Robotic, Xi\u2019an Jiaotong University; School of Computer Science and Technology, Xi\u2019an Jiaotong University; Robotics Institute, Carnegie Mellon University; The Institute of Arti\ufb01cial Intelligence and Robotic, Xi\u2019an Jiaotong University", + "aff": "The Institute of Artificial Intelligence and Robotic, Xi’an Jiaotong University; School of Computer Science and Technology, Xi’an Jiaotong University; Robotics Institute, Carnegie Mellon University; The Institute of Artificial Intelligence and Robotic, Xi’an Jiaotong University", "project": "", "github": "", "supp": "", @@ -9143,14 +9431,15 @@ "author_num": 4, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Discriminative_Feature_Learning_With_Consistent_Attention_Regularization_for_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Xi'an Jiao Tong University;Carnegie Mellon University", + "aff_unique_norm": "Xi'an Jiaotong University;Carnegie Mellon University", "aff_unique_dep": "The Institute of Artificial Intelligence and Robotic;Robotics Institute", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.cmu.edu", "aff_unique_abbr": "XJTU;CMU", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Xi'an;Pittsburgh", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Sanping and Wang,\n Fei and Huang,\n Zeyi and Wang,\n Jinjun\n},\n title = {\n Discriminative Feature Learning With Consistent Attention Regularization for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Discriminative Feature Transformation for Occluded Pedestrian Detection", @@ -9183,7 +9472,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0+1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Chunluan and Yang,\n Ming and Yuan,\n Junsong\n},\n title = {\n Discriminative Feature Transformation for Occluded Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Discriminatively Learned Convex Models for Set Based Face Recognition", @@ -9216,7 +9506,8 @@ "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "T\u00fcrkiye" + "aff_country_unique": "Turkey", + "bibtex": "@InProceedings{Cevikalp_2019_ICCV,\n \n author = {\n Cevikalp,\n Hakan and Dordinejad,\n Golara Ghorban\n},\n title = {\n Discriminatively Learned Convex Models for Set Based Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Disentangled Image Matting", @@ -9249,7 +9540,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cai_2019_ICCV,\n \n author = {\n Cai,\n Shaofan and Zhang,\n Xiaoshuai and Fan,\n Haoqiang and Huang,\n Haibin and Liu,\n Jiangyu and Liu,\n Jiaming and Liu,\n Jiaying and Wang,\n Jue and Sun,\n Jian\n},\n title = {\n Disentangled Image Matting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Disentangling Monocular 3D Object Detection", @@ -9257,7 +9549,7 @@ "status": "Poster", "track": "main", "pid": "3921", - "author_site": "Andrea Simonelli, Samuel Rota Bul\u00c3\u00b2, Lorenzo Porzi, Manuel L\u00c3\u00b3pez-Antequera, Peter Kontschieder", + "author_site": "Andrea Simonelli, Samuel Rota Bulò, Lorenzo Porzi, Manuel López-Antequera, Peter Kontschieder", "author": "Andrea Simonelli; Samuel Rota Bulo; Lorenzo Porzi; Manuel Lopez-Antequera; Peter Kontschieder", "abstract": "In this paper we propose an approach for monocular 3D object detection from a single RGB image, which leverages a novel disentangling transformation for 2D and 3D detection losses and a novel, self-supervised confidence score for 3D bounding boxes. 
Our proposed loss disentanglement has the twofold advantage of simplifying the training dynamics in the presence of losses with complex interactions of parameters, and sidestepping the issue of balancing independent regression terms. Our solution overcomes these issues by isolating the contribution made by groups of parameters to a given loss, without changing its nature. We further apply loss disentanglement to another novel, signed Intersection-over-Union criterion-driven loss for improving 2D detection results. Besides our methodological innovations, we critically review the AP metric used in KITTI3D, which emerged as the most important dataset for comparing 3D detection results. We identify and resolve a flaw in the 11-point interpolated AP metric, affecting all previously published detection results and particularly biases the results of monocular 3D detection. We provide extensive experimental evaluations and ablation studies and set a new state-of-the-art on the KITTI3D Car class.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Simonelli_Disentangling_Monocular_3D_Object_Detection_ICCV_2019_paper.pdf", @@ -9273,7 +9565,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Simonelli_Disentangling_Monocular_3D_Object_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Simonelli_Disentangling_Monocular_3D_Object_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Simonelli_2019_ICCV,\n \n author = {\n Simonelli,\n Andrea and Bulo,\n Samuel Rota and Porzi,\n Lorenzo and Lopez-Antequera,\n Manuel and Kontschieder,\n Peter\n},\n title = {\n Disentangling Monocular 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Disentangling Propagation and Generation for Video Prediction", @@ -9306,7 
+9599,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Hang and Xu,\n Huazhe and Cai,\n Qi-Zhi and Wang,\n Ruth and Yu,\n Fisher and Darrell,\n Trevor\n},\n title = {\n Disentangling Propagation and Generation for Video Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DistInit: Learning Video Representations Without a Single Labeled Video", @@ -9332,14 +9626,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Girdhar_DistInit_Learning_Video_Representations_Without_a_Single_Labeled_Video_ICCV_2019_paper.html", "aff_unique_index": "0;1;2+1;0+3", - "aff_unique_norm": "Carnegie Mellon University;Meta;Dartmouth College;Argo AI", - "aff_unique_dep": ";Facebook, Inc.;;", + "aff_unique_norm": "Carnegie Mellon University;Facebook, Inc.;Dartmouth College;Argo AI", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cmu.edu;https://www.facebook.com;https://www.dartmouth.edu;https://www.argo.ai", "aff_unique_abbr": "CMU;FB;Dartmouth;Argo AI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Girdhar_2019_ICCV,\n \n author = {\n Girdhar,\n Rohit and Tran,\n Du and Torresani,\n Lorenzo and Ramanan,\n Deva\n},\n title = {\n DistInit: Learning Video Representations Without a Single Labeled Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Distill Knowledge From NRSfM for Weakly Supervised 3D Pose 
Learning", @@ -9372,7 +9667,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Chaoyang and Kong,\n Chen and Lucey,\n Simon\n},\n title = {\n Distill Knowledge From NRSfM for Weakly Supervised 3D Pose Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Distillation-Based Training for Multi-Exit Architectures", @@ -9405,7 +9701,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Phuong_2019_ICCV,\n \n author = {\n Phuong,\n Mary and Lampert,\n Christoph H.\n},\n title = {\n Distillation-Based Training for Multi-Exit Architectures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Distilling Knowledge From a Deep Pose Regressor Network", @@ -9438,7 +9735,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Saputra_2019_ICCV,\n \n author = {\n Saputra,\n Muhamad Risqi U. and Gusmao,\n Pedro P. B. 
de and Almalioglu,\n Yasin and Markham,\n Andrew and Trigoni,\n Niki\n},\n title = {\n Distilling Knowledge From a Deep Pose Regressor Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Diverse Image Synthesis From Semantic Layouts via Conditional IMLE", @@ -9471,7 +9769,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Ke and Zhang,\n Tianhao and Malik,\n Jitendra\n},\n title = {\n Diverse Image Synthesis From Semantic Layouts via Conditional IMLE\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Diversity With Cooperation: Ensemble Methods for Few-Shot Classification", @@ -9504,7 +9803,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Grenoble", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Dvornik_2019_ICCV,\n \n author = {\n Dvornik,\n Nikita and Schmid,\n Cordelia and Mairal,\n Julien\n},\n title = {\n Diversity With Cooperation: Ensemble Methods for Few-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Domain Adaptation for Semantic Segmentation With Maximum Squares Loss", @@ -9537,7 +9837,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Hangzhou;", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Minghao and 
Xue,\n Hongyang and Cai,\n Deng\n},\n title = {\n Domain Adaptation for Semantic Segmentation With Maximum Squares Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Domain Adaptation for Structured Output via Discriminative Patch Representations", @@ -9561,7 +9862,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tsai_Domain_Adaptation_for_Structured_Output_via_Discriminative_Patch_Representations_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tsai_Domain_Adaptation_for_Structured_Output_via_Discriminative_Patch_Representations_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Tsai_2019_ICCV,\n \n author = {\n Tsai,\n Yi-Hsuan and Sohn,\n Kihyuk and Schulter,\n Samuel and Chandraker,\n Manmohan\n},\n title = {\n Domain Adaptation for Structured Output via Discriminative Patch Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Domain Intersection and Domain Difference", @@ -9585,7 +9887,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Benaim_Domain_Intersection_and_Domain_Difference_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Benaim_Domain_Intersection_and_Domain_Difference_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Benaim_2019_ICCV,\n \n author = {\n Benaim,\n Sagie and Khaitov,\n Michael and Galanti,\n Tomer and Wolf,\n Lior\n},\n title = {\n Domain Intersection and Domain Difference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Domain Randomization and Pyramid 
Consistency: Simulation-to-Real Generalization Without Accessing Target Domain Data", @@ -9612,13 +9915,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yue_Domain_Randomization_and_Pyramid_Consistency_Simulation-to-Real_Generalization_Without_Accessing_Target_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;0;2", "aff_unique_norm": "University of California, Berkeley;University of Central Florida;Google", - "aff_unique_dep": ";;Google", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.ucf.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;UCF;Google", "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yue_2019_ICCV,\n \n author = {\n Yue,\n Xiangyu and Zhang,\n Yang and Zhao,\n Sicheng and Sangiovanni-Vincentelli,\n Alberto and Keutzer,\n Kurt and Gong,\n Boqing\n},\n title = {\n Domain Randomization and Pyramid Consistency: Simulation-to-Real Generalization Without Accessing Target Domain Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Domain-Adaptive Single-View 3D Reconstruction", @@ -9642,7 +9946,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pinheiro_Domain-Adaptive_Single-View_3D_Reconstruction_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pinheiro_Domain-Adaptive_Single-View_3D_Reconstruction_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Pinheiro_2019_ICCV,\n \n author = {\n Pinheiro,\n Pedro O. 
and Rostamzadeh,\n Negar and Ahn,\n Sungjin\n},\n title = {\n Domain-Adaptive Single-View 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Drive&Act: A Multi-Modal Dataset for Fine-Grained Driver Behavior Recognition in Autonomous Vehicles", @@ -9650,7 +9955,7 @@ "status": "Poster", "track": "main", "pid": "2", - "author_site": "Manuel Martin, Alina Roitberg, Monica Haurilet, Matthias Horne, Simon Rei\u00c3\u009f, Michael Voit, Rainer Stiefelhagen", + "author_site": "Manuel Martin, Alina Roitberg, Monica Haurilet, Matthias Horne, Simon Reiß, Michael Voit, Rainer Stiefelhagen", "author": "Manuel Martin; Alina Roitberg; Monica Haurilet; Matthias Horne; Simon Reiss; Michael Voit; Rainer Stiefelhagen", "abstract": "We introduce the novel domain-specific Drive&Act benchmark for fine-grained categorization of driver behavior. Our dataset features twelve hours and over 9.6 million frames of people engaged in distractive activities during both, manual and automated driving. We capture color, infrared, depth and 3D body pose information from six views and densely label the videos with a hierarchical annotation scheme, resulting in 83 categories. The key challenges of our dataset are: (1) recognition of fine-grained behavior inside the vehicle cabin; (2) multi-modal activity recognition, focusing on diverse data streams; and (3) a cross view recognition benchmark, where a model handles data from an unfamiliar domain, as sensor type and placement in the cabin can change between vehicles. 
Finally, we provide challenging benchmarks by adopting prominent methods for video- and body pose-based action recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Martin_DriveAct_A_Multi-Modal_Dataset_for_Fine-Grained_Driver_Behavior_Recognition_in_ICCV_2019_paper.pdf", @@ -9666,7 +9971,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Martin_DriveAct_A_Multi-Modal_Dataset_for_Fine-Grained_Driver_Behavior_Recognition_in_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Martin_DriveAct_A_Multi-Modal_Dataset_for_Fine-Grained_Driver_Behavior_Recognition_in_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Martin_2019_ICCV,\n \n author = {\n Martin,\n Manuel and Roitberg,\n Alina and Haurilet,\n Monica and Horne,\n Matthias and Reiss,\n Simon and Voit,\n Michael and Stiefelhagen,\n Rainer\n},\n title = {\n Drive&Act: A Multi-Modal Dataset for Fine-Grained Driver Behavior Recognition in Autonomous Vehicles\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Drop an Octave: Reducing Spatial Redundancy in Convolutional Neural Networks With Octave Convolution", @@ -9692,14 +9998,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Drop_an_Octave_Reducing_Spatial_Redundancy_in_Convolutional_Neural_Networks_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0;0;1+2;1", - "aff_unique_norm": "Meta;National University of Singapore;YITU Technology", + "aff_unique_norm": "Facebook;National University of Singapore;Yitu Technology", "aff_unique_dep": "Facebook AI;;", "aff_unique_url": "https://www.facebook.com;https://www.nus.edu.sg;https://www.yITU.cn", "aff_unique_abbr": "Facebook AI;NUS;YITU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0;0;0;1+2;1", - "aff_country_unique": "United States;Singapore;China" + "aff_country_unique": "United States;Singapore;China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yunpeng and Fan,\n Haoqi and Xu,\n Bing and Yan,\n Zhicheng and Kalantidis,\n Yannis and Rohrbach,\n Marcus and Yan,\n Shuicheng and Feng,\n Jiashi\n},\n title = {\n Drop an Octave: Reducing Spatial Redundancy in Convolutional Neural Networks With Octave Convolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Drop to Adapt: Learning Discriminative Features for Unsupervised Domain Adaptation", @@ -9725,14 +10032,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lee_Drop_to_Adapt_Learning_Discriminative_Features_for_Unsupervised_Domain_Adaptation_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;1;2", - "aff_unique_norm": "Seoul National University;NAVER LABS;CODE42.ai", + "aff_unique_norm": "Seoul National University;NAVER Labs;CODE42.ai", "aff_unique_dep": ";;", "aff_unique_url": "https://www.snu.ac.kr;https://www.naverlabs.com;https://www.code42.com", "aff_unique_abbr": "SNU;NAVER Labs;CODE42.ai", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Seungmin and Kim,\n Dongwan and Kim,\n Namil and Jeong,\n Seong-Gyun\n},\n title = {\n Drop to Adapt: Learning Discriminative Features for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dual Adversarial Inference for Text-to-Image Synthesis", @@ -9765,7 +10073,8 @@ "aff_campus_unique_index": 
";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Lao_2019_ICCV,\n \n author = {\n Lao,\n Qicheng and Havaei,\n Mohammad and Pesaranghader,\n Ahmad and Dutil,\n Francis and Jorio,\n Lisa Di and Fevens,\n Thomas\n},\n title = {\n Dual Adversarial Inference for Text-to-Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dual Attention Matching for Audio-Visual Event Localization", @@ -9798,7 +10107,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+1;1;2;1", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Yu and Zhu,\n Linchao and Yan,\n Yan and Yang,\n Yi\n},\n title = {\n Dual Attention Matching for Audio-Visual Event Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dual Directed Capsule Network for Very Low Resolution Image Recognition", @@ -9831,7 +10141,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Singh_2019_ICCV,\n \n author = {\n Singh,\n Maneet and Nagpal,\n Shruti and Singh,\n Richa and Vatsa,\n Mayank\n},\n title = {\n Dual Directed Capsule Network for Very Low Resolution Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dual Student: Breaking the Limits of the Teacher in Semi-Supervised Learning", @@ 
-9864,7 +10175,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ke_2019_ICCV,\n \n author = {\n Ke,\n Zhanghan and Wang,\n Daoye and Yan,\n Qiong and Ren,\n Jimmy and Lau,\n Rynson W.H.\n},\n title = {\n Dual Student: Breaking the Limits of the Teacher in Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic Anchor Feature Selection for Single-Shot Object Detection", @@ -9890,14 +10202,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Dynamic_Anchor_Feature_Selection_for_Single-Shot_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;1;1;0+1", - "aff_unique_norm": "Hong Kong Polytechnic University;Alibaba Group", + "aff_unique_norm": "The Hong Kong Polytechnic University;Alibaba Group", "aff_unique_dep": ";DAMO Academy", "aff_unique_url": "https://www.polyu.edu.hk;https://www.alibaba-group.com", "aff_unique_abbr": "PolyU;Alibaba", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Shuai and Yang,\n Lingxiao and Huang,\n Jianqiang and Hua,\n Xian-Sheng and Zhang,\n Lei\n},\n title = {\n Dynamic Anchor Feature Selection for Single-Shot Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic Context Correspondence Network for Semantic Alignment", @@ -9930,7 +10243,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Shuaiyi and Wang,\n Qiuyue and Zhang,\n Songyang and Yan,\n Shipeng and He,\n Xuming\n},\n title = {\n Dynamic Context Correspondence Network for Semantic Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic Curriculum Learning for Imbalanced Data Classification", @@ -9963,7 +10277,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Yiru and Gan,\n Weihao and Yang,\n Jie and Wu,\n Wei and Yan,\n Junjie\n},\n title = {\n Dynamic Curriculum Learning for Imbalanced Data Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic Graph Attention for Referring Expression Comprehension", @@ -9989,14 +10304,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Dynamic_Graph_Attention_for_Referring_Expression_Comprehension_ICCV_2019_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "University of Hong Kong;Sun Yat-sen University;Deepwise AI Lab", + "aff_unique_norm": "The University of Hong Kong;Sun Yat-sen University;Deepwise AI Lab", "aff_unique_dep": ";;AI Lab", "aff_unique_url": "https://www.hku.hk;http://www.sysu.edu.cn/;", "aff_unique_abbr": "HKU;SYSU;", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Sibei and Li,\n Guanbin and Yu,\n Yizhou\n},\n 
title = {\n Dynamic Graph Attention for Referring Expression Comprehension\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic Kernel Distillation for Efficient Pose Estimation in Videos", @@ -10029,7 +10345,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;0", - "aff_country_unique": "Singapore;United States;China" + "aff_country_unique": "Singapore;United States;China", + "bibtex": "@InProceedings{Nie_2019_ICCV,\n \n author = {\n Nie,\n Xuecheng and Li,\n Yuncheng and Luo,\n Linjie and Zhang,\n Ning and Feng,\n Jiashi\n},\n title = {\n Dynamic Kernel Distillation for Efficient Pose Estimation in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic Multi-Scale Filters for Semantic Segmentation", @@ -10055,14 +10372,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0", - "aff_unique_norm": "Shenzhen Institute of Advanced Technology;Shanghai Jiao Tong University", + "aff_unique_norm": "Shenzhen Institutes of Advanced Technology;Shanghai Jiao Tong University", "aff_unique_dep": "Key Lab of Computer Vision and Pattern Recognition;", "aff_unique_url": "http://www.siat.ac.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "SIAT;SJTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Junjun and Deng,\n Zhongying and Qiao,\n Yu\n},\n title = {\n Dynamic Multi-Scale Filters for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic PET Image Reconstruction Using Nonnegative Matrix Factorization Incorporated With Deep Image Prior", @@ -10095,7 +10413,8 @@ "aff_campus_unique_index": "0+1;0;1;2;0", "aff_campus_unique": "Nagoya;Tokyo;Wakayama", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Yokota_2019_ICCV,\n \n author = {\n Yokota,\n Tatsuya and Kawai,\n Kazuya and Sakata,\n Muneyuki and Kimura,\n Yuichi and Hontani,\n Hidekata\n},\n title = {\n Dynamic PET Image Reconstruction Using Nonnegative Matrix Factorization Incorporated With Deep Image Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic Points Agglomeration for Hierarchical Point Sets Learning", @@ -10107,7 +10426,7 @@ "author": "Jinxian Liu; Bingbing Ni; Caiyuan Li; Jiancheng Yang; Qi Tian", "abstract": "Many previous works on point sets learning achieve excellent performance with hierarchical architecture. Their strategies towards points agglomeration, however, only perform points sampling and grouping in original Euclidean space in a fixed way. These heuristic and task-irrelevant strategies severely limit their ability to adapt to more varied scenarios. To this end, we develop a novel hierarchical point sets learning architecture, with dynamic points agglomeration. By exploiting the relation of points in semantic space, a module based on graph convolution network is designed to learn a soft points cluster agglomeration. We construct a hierarchical architecture that gradually agglomerates points by stacking this learnable and lightweight module. In contrast to fixed points agglomeration strategy, our method can handle more diverse situations robustly and efficiently. 
Moreover, we propose a parameter sharing scheme for reducing memory usage and computational burden induced by the agglomeration module. Extensive experimental results on several point cloud analytic tasks, including classification and segmentation, well demonstrate the superior performance of our dynamic hierarchical learning framework over current state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Liu_Dynamic_Points_Agglomeration_for_Hierarchical_Point_Sets_Learning_ICCV_2019_paper.pdf", - "aff": "Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University; Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University; Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University; Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University; Huawei Noah\u2019s Ark Lab", + "aff": "Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -10122,13 +10441,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Dynamic_Points_Agglomeration_for_Hierarchical_Point_Sets_Learning_ICCV_2019_paper.html", "aff_unique_index": "0+0;0+0;0+0;0+0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": 
"https://www.sjtu.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SJTU;Huawei", - "aff_campus_unique_index": "1;1;1;1", - "aff_campus_unique": ";Shanghai", + "aff_campus_unique_index": ";;;", + "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Jinxian and Ni,\n Bingbing and Li,\n Caiyuan and Yang,\n Jiancheng and Tian,\n Qi\n},\n title = {\n Dynamic Points Agglomeration for Hierarchical Point Sets Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Dynamic-Net: Tuning the Objective Without Re-Training for Synthesis Tasks", @@ -10161,7 +10481,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Shoshan_2019_ICCV,\n \n author = {\n Shoshan,\n Alon and Mechrez,\n Roey and Zelnik-Manor,\n Lihi\n},\n title = {\n Dynamic-Net: Tuning the Objective Without Re-Training for Synthesis Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "DynamoNet: Dynamic Action and Motion Network", @@ -10173,7 +10494,7 @@ "author": "Ali Diba; Vivek Sharma; Luc Van Gool; Rainer Stiefelhagen", "abstract": "In this paper, we are interested in self-supervised learning the motion cues in videos using dynamic motion filters for a better motion representation to finally boost human action recognition in particular. 
Thus far, the vision community has focused on spatio-temporal approaches using standard filters, rather we here propose dynamic filters that adaptively learn the video-specific internal motion representation by predicting the short-term future frames. We name this new motion representation, as dynamic motion representation (DMR) and is embedded inside of 3D convolutional network as a new layer, which captures the visual appearance and motion dynamics throughout entire video clip via end-to-end network learning. Simultaneously, we utilize these motion representation to enrich video classification. We have designed the frame prediction task as an auxiliary task to empower the classification problem. With these overall objectives, to this end, we introduce a novel unified spatio-temporal 3D-CNN architecture (DynamoNet) that jointly optimizes the video classification and learning motion representation by predicting future frames as a multi-task learning problem. We conduct experiments on challenging human action datasets: Kinetics 400, UCF101, HMDB51. 
The experiments using the proposed DynamoNet show promising results on all the datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Diba_DynamoNet_Dynamic_Action_and_Motion_Network_ICCV_2019_paper.pdf", - "aff": "ESAT-PSI, KU Leuven; CV:HCI, KIT; ESAT-PSI, KU Leuven+CVL, ETH Z\u00fcrich; CV:HCI, KIT", + "aff": "ESAT-PSI, KU Leuven; CV:HCI, KIT; ESAT-PSI, KU Leuven+CVL, ETH Zürich; CV:HCI, KIT", "project": "", "github": "", "supp": "", @@ -10187,14 +10508,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Diba_DynamoNet_Dynamic_Action_and_Motion_Network_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+2;1", - "aff_unique_norm": "KU Leuven;Karlsruhe Institute of Technology;ETH Zurich", + "aff_unique_norm": "KU Leuven;Karlsruhe Institute of Technology;ETH Zürich", "aff_unique_dep": "ESAT-PSI;Computer Vision and Human-Computer Interaction;Computer Vision Laboratory", "aff_unique_url": "https://www.kuleuven.be;https://www.kit.edu;https://www.ethz.ch", "aff_unique_abbr": "KU Leuven;KIT;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0+2;1", - "aff_country_unique": "Belgium;Germany;Switzerland" + "aff_country_unique": "Belgium;Germany;Switzerland", + "bibtex": "@InProceedings{Diba_2019_ICCV,\n \n author = {\n Diba,\n Ali and Sharma,\n Vivek and Gool,\n Luc Van and Stiefelhagen,\n Rainer\n},\n title = {\n DynamoNet: Dynamic Action and Motion Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "EGNet: Edge Guidance Network for Salient Object Detection", @@ -10227,7 +10549,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Jia-Xing and Liu,\n Jiang-Jiang and Fan,\n 
Deng-Ping and Cao,\n Yang and Yang,\n Jufeng and Cheng,\n Ming-Ming\n},\n title = {\n EGNet: Edge Guidance Network for Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ELF: Embedded Localisation of Features in Pre-Trained CNN", @@ -10235,11 +10558,11 @@ "status": "Poster", "track": "main", "pid": "3882", - "author_site": "Assia Benbihi, Matthieu Geist, C\u00c3\u00a9dric Pradalier", + "author_site": "Assia Benbihi, Matthieu Geist, Cédric Pradalier", "author": "Assia Benbihi; Matthieu Geist; Cedric Pradalier", "abstract": "This paper introduces a novel feature detector based only on information embedded inside a CNN trained on standard tasks (e.g. classification). While previous works already show that the features of a trained CNN are suitable descriptors, we show here how to extract the feature locations from the network to build a detector. This information is computed from the gradient of the feature map with respect to the input image. This provides a saliency map with local maxima on relevant keypoint locations. Contrary to recent CNN-based detectors, this method requires neither supervised training nor finetuning. We evaluate how repeatable and how 'matchable' the detected keypoints are with the repeatability and matching scores. Matchability is measured with a simple descriptor introduced for the sake of the evaluation. This novel detector reaches similar performances on the standard evaluation HPatches dataset, as well as comparable robustness against illumination and viewpoint changes on Webcam and photo-tourism images. 
These results show that a CNN trained on a standard task embeds feature location information that is as relevant as when the CNN is specifically trained for feature detection.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Benbihi_ELF_Embedded_Localisation_of_Features_in_Pre-Trained_CNN_ICCV_2019_paper.pdf", - "aff": "UMI2958 GeorgiaTech-CNRS+Centrale Sup\u00e9lec+Universit\u00e9 Paris-Saclay; Google Research+Brain Team; GeorgiaTech Lorraine-UMI2958+GeorgiaTech-CNRS", + "aff": "UMI2958 GeorgiaTech-CNRS+Centrale Supélec+Université Paris-Saclay; Google Research+Brain Team; GeorgiaTech Lorraine-UMI2958+GeorgiaTech-CNRS", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Benbihi_ELF_Embedded_Localisation_ICCV_2019_supplemental.pdf", @@ -10253,14 +10576,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Benbihi_ELF_Embedded_Localisation_of_Features_in_Pre-Trained_CNN_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;3+4;0+0", - "aff_unique_norm": "Georgia Institute of Technology;Centrale Sup\u00e9lec;Universit\u00e9 Paris-Saclay;Google;Brain Team", + "aff_unique_norm": "Georgia Institute of Technology;Centrale Supélec;Université Paris-Saclay;Google;Brain Team", "aff_unique_dep": ";;;Google Research;", "aff_unique_url": "https://www.gatech.edu;https://www.centralesupelec.fr;https://www.universite-paris-saclay.fr;https://research.google;", "aff_unique_abbr": "GeorgiaTech;CS;UPSaclay;Google Research;", "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Mountain View;Lorraine", "aff_country_unique_index": "0+1+1;0;0+0", - "aff_country_unique": "United States;France;" + "aff_country_unique": "United States;France;", + "bibtex": "@InProceedings{Benbihi_2019_ICCV,\n \n author = {\n Benbihi,\n Assia and Geist,\n Matthieu and Pradalier,\n Cedric\n},\n title = {\n ELF: Embedded Localisation of Features in Pre-Trained CNN\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "EM-Fusion: Dynamic Object-Level SLAM With Probabilistic Data Association", @@ -10268,7 +10592,7 @@ "status": "Poster", "track": "main", "pid": "5672", - "author_site": "Michael Strecke, J\u00c3\u00b6rg St\u00c3\u00bcckler", + "author_site": "Michael Strecke, Jörg Stückler", "author": "Michael Strecke; Jorg Stuckler", "abstract": "The majority of approaches for acquiring dense 3D environment maps with RGB-D cameras assumes static environments or rejects moving objects as outliers. The representation and tracking of moving objects, however, has significant potential for applications in robotics or augmented reality. In this paper, we propose a novel approach to dynamic SLAM with dense object-level representations. We represent rigid objects in local volumetric signed distance function (SDF) maps, and formulate multi-object tracking as direct alignment of RGB-D images with the SDF representations. Our main novelty is a probabilistic formulation which naturally leads to strategies for data association and occlusion handling. 
We analyze our approach in experiments and demonstrate that our approach compares favorably with the state-of-the-art methods in terms of robustness and accuracy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Strecke_EM-Fusion_Dynamic_Object-Level_SLAM_With_Probabilistic_Data_Association_ICCV_2019_paper.pdf", @@ -10289,11 +10613,12 @@ "aff_unique_norm": "Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Embodied Vision Group", "aff_unique_url": "https://www.mpi-is.mpg.de", - "aff_unique_abbr": "", + "aff_unique_abbr": "MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Strecke_2019_ICCV,\n \n author = {\n Strecke,\n Michael and Stuckler,\n Jorg\n},\n title = {\n EM-Fusion: Dynamic Object-Level SLAM With Probabilistic Data Association\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "EMPNet: Neural Localisation and Mapping Using Embedded Memory Points", @@ -10326,7 +10651,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Avraham_2019_ICCV,\n \n author = {\n Avraham,\n Gil and Zuo,\n Yan and Dharmasiri,\n Thanuja and Drummond,\n Tom\n},\n title = {\n EMPNet: Neural Localisation and Mapping Using Embedded Memory Points\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "EPIC-Fusion: Audio-Visual Temporal Binding for Egocentric Action Recognition", @@ -10350,7 +10676,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Kazakos_EPIC-Fusion_Audio-Visual_Temporal_Binding_for_Egocentric_Action_Recognition_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kazakos_EPIC-Fusion_Audio-Visual_Temporal_Binding_for_Egocentric_Action_Recognition_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Kazakos_2019_ICCV,\n \n author = {\n Kazakos,\n Evangelos and Nagrani,\n Arsha and Zisserman,\n Andrew and Damen,\n Dima\n},\n title = {\n EPIC-Fusion: Audio-Visual Temporal Binding for Egocentric Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ERL-Net: Entangled Representation Learning for Single Image De-Raining", @@ -10383,7 +10710,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Guoqing and Sun,\n Changming and Sowmya,\n Arcot\n},\n title = {\n ERL-Net: Entangled Representation Learning for Single Image De-Raining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Efficient Learning on Point Clouds With Basis Point Sets", @@ -10409,14 +10737,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Prokudin_Efficient_Learning_on_Point_Clouds_With_Basis_Point_Sets_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;1", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Amazon", - "aff_unique_dep": "Intelligent Systems;Amazon.com, Inc.", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Amazon.com, Inc.", + "aff_unique_dep": "Intelligent Systems;", "aff_unique_url": 
"https://www.mpi-is.mpg.de;https://www.amazon.com", "aff_unique_abbr": "MPI-IS;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Prokudin_2019_ICCV,\n \n author = {\n Prokudin,\n Sergey and Lassner,\n Christoph and Romero,\n Javier\n},\n title = {\n Efficient Learning on Point Clouds With Basis Point Sets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Efficient Segmentation: Learning Downsampling Near Semantic Boundaries", @@ -10440,7 +10769,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Marin_Efficient_Segmentation_Learning_Downsampling_Near_Semantic_Boundaries_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Marin_Efficient_Segmentation_Learning_Downsampling_Near_Semantic_Boundaries_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Marin_2019_ICCV,\n \n author = {\n Marin,\n Dmitrii and He,\n Zijian and Vajda,\n Peter and Chatterjee,\n Priyam and Tsai,\n Sam and Yang,\n Fei and Boykov,\n Yuri\n},\n title = {\n Efficient Segmentation: Learning Downsampling Near Semantic Boundaries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Efficient and Accurate Arbitrary-Shaped Text Detection With Pixel Aggregation Network", @@ -10466,14 +10796,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Efficient_and_Accurate_Arbitrary-Shaped_Text_Detection_With_Pixel_Aggregation_Network_ICCV_2019_paper.html", "aff_unique_index": "0;1+2+3;0;4;1;0;2+3;5", - "aff_unique_norm": "Nanjing University;Tongji 
University;MEGVII;Technology Inc.;University of Electronic Science and Technology of China;University of Adelaide", + "aff_unique_norm": "Nanjing University;Tongji University;Megvii;Technology Inc.;University of Electronic Science and Technology of China;University of Adelaide", "aff_unique_dep": "National Key Lab for Novel Software Technology;;;;;", "aff_unique_url": "http://www.nju.edu.cn;https://www.tongji.edu.cn;https://www.megvii.com;;https://www.uestc.edu.cn;https://www.adelaide.edu.au", "aff_unique_abbr": "Nanjing U;Tongji;Megvii;;UESTC;Adelaide", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0;2", - "aff_country_unique": "China;;Australia" + "aff_country_unique": "China;;Australia", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Wenhai and Xie,\n Enze and Song,\n Xiaoge and Zang,\n Yuhang and Wang,\n Wenjia and Lu,\n Tong and Yu,\n Gang and Shen,\n Chunhua\n},\n title = {\n Efficient and Accurate Arbitrary-Shaped Text Detection With Pixel Aggregation Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Efficient and Robust Registration on the 3D Special Euclidean Group", @@ -10506,7 +10837,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Bhattacharya_2019_ICCV,\n \n author = {\n Bhattacharya,\n Uttaran and Govindu,\n Venu Madhav\n},\n title = {\n Efficient and Robust Registration on the 3D Special Euclidean Group\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Ego-Pose Estimation and Forecasting As Real-Time PD Control", @@ -10539,7 +10871,8 @@ "aff_campus_unique_index": 
"", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yuan_2019_ICCV,\n \n author = {\n Yuan,\n Ye and Kitani,\n Kris\n},\n title = {\n Ego-Pose Estimation and Forecasting As Real-Time PD Control\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Elaborate Monocular Point and Line SLAM With Robust Initialization", @@ -10572,7 +10905,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Sang Jun and Hwang,\n Sung Soo\n},\n title = {\n Elaborate Monocular Point and Line SLAM With Robust Initialization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Embedded Block Residual Network: A Recursive Restoration Model for Single-Image Super-Resolution", @@ -10605,7 +10939,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Qiu_2019_ICCV,\n \n author = {\n Qiu,\n Yajun and Wang,\n Ruxin and Tao,\n Dapeng and Cheng,\n Jun\n},\n title = {\n Embedded Block Residual Network: A Recursive Restoration Model for Single-Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Embodied Amodal Recognition: Learning to Move to Perceive Objects", @@ -10629,7 +10964,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Embodied_Amodal_Recognition_Learning_to_Move_to_Perceive_Objects_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Embodied_Amodal_Recognition_Learning_to_Move_to_Perceive_Objects_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Jianwei and Ren,\n Zhile and Xu,\n Mingze and Chen,\n Xinlei and Crandall,\n David J. and Parikh,\n Devi and Batra,\n Dhruv\n},\n title = {\n Embodied Amodal Recognition: Learning to Move to Perceive Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Employing Deep Part-Object Relationships for Salient Object Detection", @@ -10657,12 +10993,13 @@ "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Xidian University;University of Warwick", "aff_unique_dep": "School of Mechano-Electronic Engineering;WMG", - "aff_unique_url": "http://www.xidian.edu.cn/;https://www.wmg.warwick.ac.uk/", - "aff_unique_abbr": "Xidian;WMG", + "aff_unique_url": "http://www.xidian.edu.cn/;https://warwick.ac.uk", + "aff_unique_abbr": "Xidian;Warwick", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Yi and Zhang,\n Qiang and Zhang,\n Dingwen and Han,\n Jungong\n},\n title = {\n Employing Deep Part-Object Relationships for Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "End-to-End CAD Model Retrieval and 9DoF Alignment in 3D Scans", @@ -10670,7 +11007,7 @@ "status": "Poster", "track": "main", "pid": "1933", - "author_site": "Armen Avetisyan, Angela Dai, 
Matthias Nie\u00c3\u009fner", + "author_site": "Armen Avetisyan, Angela Dai, Matthias Nießner", "author": "Armen Avetisyan; Angela Dai; Matthias Niessner", "abstract": "We present a novel, end-to-end approach to align CAD models to an 3D scan of a scene, enabling transformation of a noisy, incomplete 3D scan to a compact, CAD reconstruction with clean, complete object geometry. Our main contribution lies in formulating a differentiable Procrustes alignment that is paired with a symmetry-aware dense object correspondence prediction. To simultaneously align CAD models to all the objects of a scanned scene, our approach detects object locations, then predicts symmetry-aware dense object correspondences between scan and CAD geometry in a unified object space, as well as a nearest neighbor CAD model, both of which are then used to inform a differentiable Procrustes alignment. Our approach operates in a fully-convolutional fashion, enabling alignment of CAD models to the objects of a scan in a single forward pass. 
This enables our method to outperform state-of-the-art approaches by 19.04% for CAD model alignment to scans, with approximately 250x faster runtime than previous data-driven approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Avetisyan_End-to-End_CAD_Model_Retrieval_and_9DoF_Alignment_in_3D_Scans_ICCV_2019_paper.pdf", @@ -10686,7 +11023,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Avetisyan_End-to-End_CAD_Model_Retrieval_and_9DoF_Alignment_in_3D_Scans_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Avetisyan_End-to-End_CAD_Model_Retrieval_and_9DoF_Alignment_in_3D_Scans_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Avetisyan_2019_ICCV,\n \n author = {\n Avetisyan,\n Armen and Dai,\n Angela and Niessner,\n Matthias\n},\n title = {\n End-to-End CAD Model Retrieval and 9DoF Alignment in 3D Scans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "End-to-End Hand Mesh Recovery From a Monocular RGB Image", @@ -10719,7 +11057,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Xiong and Li,\n Qiang and Mo,\n Hong and Zhang,\n Wenbo and Zheng,\n Wen\n},\n title = {\n End-to-End Hand Mesh Recovery From a Monocular RGB Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "End-to-End Learning for Graph Decomposition", @@ -10731,7 +11070,7 @@ "author": "Jie Song; Bjoern Andres; Michael J. 
Black; Otmar Hilliges; Siyu Tang", "abstract": "Deep neural networks provide powerful tools for pattern recognition, while classical graph algorithms are widely used to solve combinatorial problems. In computer vision, many tasks combine elements of both pattern recognition and graph reasoning. In this paper, we study how to connect deep networks with graph decomposition into an end-to-end trainable framework. More specifically, the minimum cost multicut problem is first converted to an unconstrained binary cubic formulation where cycle consistency constraints are incorporated into the objective function. The new optimization problem can be viewed as a Conditional Random Field (CRF) in which the random variables are associated with the binary edge labels. Cycle constraints are introduced into the CRF as high-order potentials. A standard Convolutional Neural Network (CNN) provides the front-end features for the fully differentiable CRF. The parameters of both parts are optimized in an end-to-end manner. 
The efficacy of the proposed learning algorithm is demonstrated via experiments on clustering MNIST images and on the challenging task of real-world multi-people pose estimation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Song_End-to-End_Learning_for_Graph_Decomposition_ICCV_2019_paper.pdf", - "aff": "ETH Zurich; MPI for Intelligent Systems + Bosch Center for AI; MPI for Intelligent Systems; ETH Zurich + University of T\u00fcbingen; ETH Zurich + MPI for Intelligent Systems + University of T\u00fcbingen", + "aff": "ETH Zurich; MPI for Intelligent Systems + Bosch Center for AI; MPI for Intelligent Systems; ETH Zurich + University of Tübingen; ETH Zurich + MPI for Intelligent Systems + University of Tübingen", "project": "", "github": "", "supp": "", @@ -10745,14 +11084,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Song_End-to-End_Learning_for_Graph_Decomposition_ICCV_2019_paper.html", "aff_unique_index": "0;1+2;1;0+3;0+1+3", - "aff_unique_norm": "ETH Zurich;Max Planck Institute for Intelligent Systems;Bosch Center for AI;University of T\u00fcbingen", + "aff_unique_norm": "ETH Zurich;Max Planck Institute for Intelligent Systems;Bosch Center for AI;University of Tübingen", "aff_unique_dep": ";;Center for AI;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de;https://www.bosch-ai.com;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "ETHZ;MPI-IS;BCAI;Uni T\u00fcbingen", + "aff_unique_abbr": "ETHZ;MPI-IS;BCAI;Uni Tübingen", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;1+1;1;0+1;0+1+1", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Song_2019_ICCV,\n \n author = {\n Song,\n Jie and Andres,\n Bjoern and Black,\n Michael J. 
and Hilliges,\n Otmar and Tang,\n Siyu\n},\n title = {\n End-to-End Learning for Graph Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "End-to-End Learning of Representations for Asynchronous Event-Based Data", @@ -10785,7 +11125,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Switzerland;Canada" + "aff_country_unique": "Switzerland;Canada", + "bibtex": "@InProceedings{Gehrig_2019_ICCV,\n \n author = {\n Gehrig,\n Daniel and Loquercio,\n Antonio and Derpanis,\n Konstantinos G. and Scaramuzza,\n Davide\n},\n title = {\n End-to-End Learning of Representations for Asynchronous Event-Based Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "End-to-End Wireframe Parsing", @@ -10818,7 +11159,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Yichao and Qi,\n Haozhi and Ma,\n Yi\n},\n title = {\n End-to-End Wireframe Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Enforcing Geometric Constraints of Virtual Normal for Depth Prediction", @@ -10830,7 +11172,7 @@ "author": "Wei Yin; Yifan Liu; Chunhua Shen; Youliang Yan", "abstract": "Monocular depth prediction plays a crucial role in understanding 3D scene geometry. Although recent methods have achieved impressive progress in evaluation metrics such as the pixel-wise relative error, most methods neglect the geometric constraints in the 3D space. 
In this work, we show the importance of the high-order 3D geometric constraints for depth prediction. By designing a loss term that enforces one simple type of geometric constraints, namely, virtual normal directions determined by randomly sampled three points in the reconstructed 3D space, we can considerably improve the depth prediction accuracy. Furthermore, we can not only predict accurate depth but also achieve high-quality other 3D information from the depth without retraining new parameters, Significantly, the byproduct of this predicted depth being sufficiently accurate is that we are now able to recover good 3D structures of the scene such as the point cloud and surface normal directly from the depth, eliminating the necessity of training new sub-models as was previously done. Experiments on two challenging benchmarks: NYU Depth-V2 and KITTI demonstrate the effectiveness of our method and state-of-the-art performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Yin_Enforcing_Geometric_Constraints_of_Virtual_Normal_for_Depth_Prediction_ICCV_2019_paper.pdf", - "aff": "The University of Adelaide, Australia; The University of Adelaide, Australia; The University of Adelaide, Australia; Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "The University of Adelaide, Australia; The University of Adelaide, Australia; The University of Adelaide, Australia; Noah’s Ark Lab, Huawei Technologies", "project": "https://tinyurl.com/virtualnormal", "github": "", "supp": "", @@ -10844,14 +11186,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yin_Enforcing_Geometric_Constraints_of_Virtual_Normal_for_Depth_Prediction_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "University of Adelaide;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_norm": "The University of Adelaide;Huawei Technologies", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": 
"https://www.adelaide.edu.au;https://www.huawei.com", "aff_unique_abbr": "Adelaide;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Yin_2019_ICCV,\n \n author = {\n Yin,\n Wei and Liu,\n Yifan and Shen,\n Chunhua and Yan,\n Youliang\n},\n title = {\n Enforcing Geometric Constraints of Virtual Normal for Depth Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Enhancing 2D Representation via Adjacent Views for 3D Shape Retrieval", @@ -10880,11 +11223,12 @@ "aff_unique_norm": "Beihang University;Duke University;National University of Defense Technology", "aff_unique_dep": "School of Computer Science & Engineering;;National Laboratory for Parallel and Distributed Processing", "aff_unique_url": "http://www.buaa.edu.cn;https://www.duke.edu;http://www.nudt.edu.cn/", - "aff_unique_abbr": "BUAA;Duke;NUDT", + "aff_unique_abbr": "Beihang;Duke;NUDT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;1;0+0+0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Cheng and Li,\n Zhaoqun and Qiu,\n Qiang and Leng,\n Biao and Jiang,\n Jingfei\n},\n title = {\n Enhancing 2D Representation via Adjacent Views for 3D Shape Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Enhancing Adversarial Example Transferability With an Intermediate Level Attack", @@ -10910,14 +11254,15 @@ "author_num": 6, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Huang_Enhancing_Adversarial_Example_Transferability_With_an_Intermediate_Level_Attack_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0;1", - "aff_unique_norm": "Cornell University;Meta", + "aff_unique_norm": "Cornell University;Facebook", "aff_unique_dep": ";Facebook AI", "aff_unique_url": "https://www.cornell.edu;https://www.facebook.com", "aff_unique_abbr": "Cornell;Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Qian and Katsman,\n Isay and He,\n Horace and Gu,\n Zeqi and Belongie,\n Serge and Lim,\n Ser-Nam\n},\n title = {\n Enhancing Adversarial Example Transferability With an Intermediate Level Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Enhancing Low Light Videos by Exploring High Sensitivity Camera Noise", @@ -10941,7 +11286,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Enhancing_Low_Light_Videos_by_Exploring_High_Sensitivity_Camera_Noise_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Enhancing_Low_Light_Videos_by_Exploring_High_Sensitivity_Camera_Noise_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Wei and Chen,\n Xin and Yang,\n Cheng and Li,\n Xiang and Hu,\n Xuemei and Yue,\n Tao\n},\n title = {\n Enhancing Low Light Videos by Exploring High Sensitivity Camera Noise\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Enriched Feature Guided Refinement Network for 
Object Detection", @@ -10953,7 +11299,7 @@ "author": "Jing Nie; Rao Muhammad Anwer; Hisham Cholakkal; Fahad Shahbaz Khan; Yanwei Pang; Ling Shao", "abstract": "We propose a single-stage detection framework that jointly tackles the problem of multi-scale object detection and class imbalance. Rather than designing deeper networks, we introduce a simple yet effective feature enrichment scheme to produce multi-scale contextual features. We further introduce a cascaded refinement scheme which first instills multi-scale contextual features into the prediction layers of the single-stage detector in order to enrich their discriminative power for multi-scale detection. Second, the cascaded refinement scheme counters the class imbalance problem by refining the anchors and enriched features to improve classification and regression. Experiments are performed on two benchmarks: PASCAL VOC and MS COCO. For a 320x320 input on the MS COCO test-dev, our detector achieves state-of-the-art single-stage detection accuracy with a COCO AP of 33.2 in the case of single-scale inference, while operating at 21 milliseconds on a Titan XP GPU. For a 512x512 input on the MS COCO test-dev, our approach obtains an absolute gain of 1.6% in terms of COCO AP, compared to the best reported single-stage results[5]. 
Source code and models are available at: https://github.com/Ranchentx/EFGRNet.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Nie_Enriched_Feature_Guided_Refinement_Network_for_Object_Detection_ICCV_2019_paper.pdf", - "aff": "School of Electrical and Information Engineering, Tianjin University+Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; School of Electrical and Information Engineering, Tianjin University; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE", + "aff": "School of Electrical and Information Engineering, Tianjin University+Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; School of Electrical and Information Engineering, Tianjin University; Inception Institute of Artificial Intelligence (IIAI), UAE", "project": "", "github": "https://github.com/Ranchentx/EFGRNet", "supp": "", @@ -10974,7 +11320,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;0;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Nie_2019_ICCV,\n \n author = {\n Nie,\n Jing and Anwer,\n Rao Muhammad and Cholakkal,\n Hisham and Khan,\n Fahad Shahbaz and Pang,\n Yanwei and Shao,\n Ling\n},\n title = {\n Enriched Feature Guided Refinement Network for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Entangled Transformer for Image Captioning", @@ -11007,7 +11354,8 @@ "aff_campus_unique_index": 
"0;0;0;0", "aff_campus_unique": "Sydney", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Guang and Zhu,\n Linchao and Liu,\n Ping and Yang,\n Yi\n},\n title = {\n Entangled Transformer for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Episodic Training for Domain Generalization", @@ -11033,14 +11381,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Episodic_Training_for_Domain_Generalization_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;1;3;1;4+0+1", - "aff_unique_norm": "Samsung;University of Surrey;University of Science and Technology of China;iFLYTEK;University of Edinburgh", + "aff_unique_norm": "Samsung AI Center;University of Surrey;University of Science and Technology of China;iFlytek;University of Edinburgh", "aff_unique_dep": "AI Center;CVSSP;;Research;", "aff_unique_url": "https://www.samsung.com/global/research-innovation/ai-research-centers/samsung-ai-center-cambridge/;https://www.surrey.ac.uk;http://www.ustc.edu.cn;https://www.iflytek.com;https://www.ed.ac.uk", "aff_unique_abbr": "SAC;Surrey;USTC;iFlytek;Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0+0;1;0;1;0;0+0+0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Da and Zhang,\n Jianshu and Yang,\n Yongxin and Liu,\n Cong and Song,\n Yi-Zhe and Hospedales,\n Timothy M.\n},\n title = {\n Episodic Training for Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Equivariant 
Multi-View Networks", @@ -11073,7 +11422,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Philadelphia", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Esteves_2019_ICCV,\n \n author = {\n Esteves,\n Carlos and Xu,\n Yinshuang and Allen-Blanchette,\n Christine and Daniilidis,\n Kostas\n},\n title = {\n Equivariant Multi-View Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Escaping Plato's Cave: 3D Shape From Adversarial Rendering", @@ -11106,7 +11456,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Henzler_2019_ICCV,\n \n author = {\n Henzler,\n Philipp and Mitra,\n Niloy J. and Ritschel,\n Tobias\n},\n title = {\n Escaping Plato's Cave: 3D Shape From Adversarial Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Estimating the Fundamental Matrix Without Point Correspondences With Application to Transmission Imaging", @@ -11114,11 +11465,11 @@ "status": "Poster", "track": "main", "pid": "5105", - "author_site": "Tobias W\u00c3\u00bcrfl, Andr\u00c3\u00a9 Aichert, Nicole Maa\u00c3\u009f, Frank Dennerlein, Andreas Maier", + "author_site": "Tobias Würfl, André Aichert, Nicole Maaß, Frank Dennerlein, Andreas Maier", "author": "Tobias Wurfl; Andre Aichert; Nicole Maass; Frank Dennerlein; Andreas Maier", "abstract": "We present a general method to estimate the fundamental matrix from a pair of images under perspective projection without the need for image point correspondences. 
Our method is particularly well-suited for transmission imaging, where state-of-the-art feature detection and matching approaches generally do not perform well. Estimation of the fundamental matrix plays a central role in auto-calibration methods for reflection imaging. Such methods are currently not applicable to transmission imaging. Furthermore, our method extends an existing technique proposed for reflection imaging which potentially avoids the outlier-prone feature matching step from an orthographic projection model to a perspective model. Our method exploits the idea that under a linear attenuation model line integrals along corresponding epipolar lines are equal if we compute their derivatives in orthogonal direction to their common epipolar plane. We use the fundamental matrix to parametrize this equality. Our method estimates the matrix by formulating a non-convex optimization problem, minimizing an error in our measurement of this equality. We believe this technique will enable the application of the large body of work on image-based camera pose estimation to transmission imaging leading to more accurate and more general motion compensation and auto-calibration algorithms, particularly in medical X-ray and Computed Tomography imaging.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wurfl_Estimating_the_Fundamental_Matrix_Without_Point_Correspondences_With_Application_to_ICCV_2019_paper.pdf", - "aff": "Pattern Recognition Lab, Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg (FAU); Siemens Healthcare GmbH; Siemens Healthcare GmbH; Siemens Healthcare GmbH; Pattern Recognition Lab, Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg (FAU)", + "aff": "Pattern Recognition Lab, Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU); Siemens Healthcare GmbH; Siemens Healthcare GmbH; Siemens Healthcare GmbH; Pattern Recognition Lab, Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU)", "project": "https://www5.cs.fau.de", 
"github": "", "supp": "", @@ -11132,14 +11483,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wurfl_Estimating_the_Fundamental_Matrix_Without_Point_Correspondences_With_Application_to_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;1;0", - "aff_unique_norm": "Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg;Siemens Healthcare", + "aff_unique_norm": "Friedrich-Alexander-Universität Erlangen-Nürnberg;Siemens Healthcare", "aff_unique_dep": "Pattern Recognition Lab;", "aff_unique_url": "https://www fau.de;https://www.siemens-healthineers.com", "aff_unique_abbr": "FAU;Siemens Healthcare", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "Erlangen-N\u00fcrnberg;", + "aff_campus_unique": "Erlangen-Nürnberg;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Wurfl_2019_ICCV,\n \n author = {\n Wurfl,\n Tobias and Aichert,\n Andre and Maass,\n Nicole and Dennerlein,\n Frank and Maier,\n Andreas\n},\n title = {\n Estimating the Fundamental Matrix Without Point Correspondences With Application to Transmission Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "EvalNorm: Estimating Batch Normalization Statistics for Evaluation", @@ -11172,7 +11524,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Mountain View;College Park", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Singh_2019_ICCV,\n \n author = {\n Singh,\n Saurabh and Shrivastava,\n Abhinav\n},\n title = {\n EvalNorm: Estimating Batch Normalization Statistics for Evaluation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": 
"Evaluating Robustness of Deep Image Super-Resolution Against Adversarial Attacks", @@ -11205,7 +11558,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Choi_2019_ICCV,\n \n author = {\n Choi,\n Jun-Ho and Zhang,\n Huan and Kim,\n Jun-Hyuk and Hsieh,\n Cho-Jui and Lee,\n Jong-Seok\n},\n title = {\n Evaluating Robustness of Deep Image Super-Resolution Against Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Event-Based Motion Segmentation by Motion Compensation", @@ -11238,7 +11592,8 @@ "aff_campus_unique_index": ";1;;1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0+0;1;0+0;0;1", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + "bibtex": "@InProceedings{Stoffregen_2019_ICCV,\n \n author = {\n Stoffregen,\n Timo and Gallego,\n Guillermo and Drummond,\n Tom and Kleeman,\n Lindsay and Scaramuzza,\n Davide\n},\n title = {\n Event-Based Motion Segmentation by Motion Compensation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Everybody Dance Now", @@ -11262,7 +11617,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chan_Everybody_Dance_Now_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chan_Everybody_Dance_Now_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Chan_2019_ICCV,\n \n author = {\n Chan,\n Caroline and Ginosar,\n Shiry and Zhou,\n Tinghui and Efros,\n Alexei A.\n},\n title = {\n Everybody Dance Now\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Evolving Space-Time Neural Architectures for Videos", @@ -11295,7 +11651,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Piergiovanni_2019_ICCV,\n \n author = {\n Piergiovanni,\n AJ and Angelova,\n Anelia and Toshev,\n Alexander and Ryoo,\n Michael S.\n},\n title = {\n Evolving Space-Time Neural Architectures for Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Expectation-Maximization Attention Networks for Semantic Segmentation", @@ -11319,7 +11676,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Expectation-Maximization_Attention_Networks_for_Semantic_Segmentation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Expectation-Maximization_Attention_Networks_for_Semantic_Segmentation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Xia and Zhong,\n Zhisheng and Wu,\n Jianlong and Yang,\n Yibo and Lin,\n Zhouchen and Liu,\n Hong\n},\n title = {\n Expectation-Maximization Attention Networks for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Expert Sample Consensus Applied to Camera Re-Localization", @@ -11343,7 +11701,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Brachmann_Expert_Sample_Consensus_Applied_to_Camera_Re-Localization_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Brachmann_Expert_Sample_Consensus_Applied_to_Camera_Re-Localization_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Brachmann_2019_ICCV,\n \n author = {\n Brachmann,\n Eric and Rother,\n Carsten\n},\n title = {\n Expert Sample Consensus Applied to Camera Re-Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Explaining Neural Networks Semantically and Quantitatively", @@ -11376,7 +11735,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Runjin and Chen,\n Hao and Ren,\n Jie and Huang,\n Ge and Zhang,\n Quanshi\n},\n title = {\n Explaining Neural Networks Semantically and Quantitatively\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Explaining the Ambiguity of Object Detection and 6D Pose From Visual Data", @@ -11384,7 +11744,7 @@ "status": "Poster", "track": "main", "pid": "1768", - "author_site": "Fabian Manhardt, Diego Mart\u00c3\u00adn Arroyo, Christian Rupprecht, Benjamin Busam, Tolga Birdal, Nassir Navab, Federico Tombari", + "author_site": "Fabian Manhardt, Diego Martín Arroyo, Christian Rupprecht, Benjamin Busam, Tolga Birdal, Nassir Navab, Federico Tombari", "author": "Fabian Manhardt; Diego Martin Arroyo; Christian Rupprecht; Benjamin Busam; Tolga Birdal; Nassir Navab; Federico Tombari", "abstract": "3D object detection and pose estimation from a single image are two inherently ambiguous problems. 
Oftentimes, objects appear similar from different viewpoints due to shape symmetries, occlusion and repetitive textures. This ambiguity in both detection and pose estimation means that an object instance can be perfectly described by several different poses and even classes. In this work we propose to explicitly deal with these ambiguities. For each object instance we predict multiple 6D pose outcomes to estimate the specific pose distribution generated by symmetries and repetitive textures. The distribution collapses to a single outcome when the visual appearance uniquely identifies just one valid pose. We show the benefits of our approach which provides not only a better explanation for pose ambiguity, but also a higher accuracy in terms of pose estimation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Manhardt_Explaining_the_Ambiguity_of_Object_Detection_and_6D_Pose_From_ICCV_2019_paper.pdf", @@ -11402,14 +11762,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Manhardt_Explaining_the_Ambiguity_of_Object_Detection_and_6D_Pose_From_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0+2;3;0;0+4", - "aff_unique_norm": "Technical University of Munich;University of Oxford;Huawei;Stanford University;Google", - "aff_unique_dep": ";;Huawei Technologies Co., Ltd.;;Google", + "aff_unique_norm": "Technical University of Munich;University of Oxford;Huawei Technologies Co., Ltd.;Stanford University;Google", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.tum.de;https://www.ox.ac.uk;https://www.huawei.com;https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "TUM;Oxford;Huawei;Stanford;Google", "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Stanford;Mountain View", "aff_country_unique_index": "0;0;1;0+2;3;0;0+3", - "aff_country_unique": "Germany;United Kingdom;China;United States" + "aff_country_unique": "Germany;United Kingdom;China;United States", + "bibtex": 
"@InProceedings{Manhardt_2019_ICCV,\n \n author = {\n Manhardt,\n Fabian and Arroyo,\n Diego Martin and Rupprecht,\n Christian and Busam,\n Benjamin and Birdal,\n Tolga and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n Explaining the Ambiguity of Object Detection and 6D Pose From Visual Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Explicit Shape Encoding for Real-Time Instance Segmentation", @@ -11421,7 +11782,7 @@ "author": "Wenqiang Xu; Haiyang Wang; Fubo Qi; Cewu Lu", "abstract": "In this paper, we propose a novel top-down instance segmentation framework based on explicit shape encoding, named ESE-Seg. It largely reduces the computational consumption of the instance segmentation by explicitly decoding the multiple object shapes with tensor operations, thus performs the instance segmentation at almost the same speed as the object detection. ESE-Seg is based on a novel shape signature Inner-center Radius (IR), Chebyshev polynomial fitting and the strong modern object detectors. 
ESE-Seg with YOLOv3 outperforms the Mask R-CNN on Pascal VOC 2012 at mAP^r@0.5 while 7 times faster.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Xu_Explicit_Shape_Encoding_for_Real-Time_Instance_Segmentation_ICCV_2019_paper.pdf", - "aff": "Department of Computer Science and Engineering, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University, and SJTU-SenseTime AI lab", + "aff": "Department of Computer Science and Engineering, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, and SJTU-SenseTime AI lab", "project": "", "github": "", "supp": "", @@ -11442,7 +11803,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Wenqiang and Wang,\n Haiyang and Qi,\n Fubo and Lu,\n Cewu\n},\n title = {\n Explicit Shape Encoding for Real-Time Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Exploiting Spatial-Temporal Relationships for 3D Pose Estimation via Graph Convolutional Networks", @@ -11475,7 +11837,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;0;0;0+1;0;2;0", - "aff_country_unique": 
"Singapore;Australia;United States" + "aff_country_unique": "Singapore;Australia;United States", + "bibtex": "@InProceedings{Cai_2019_ICCV,\n \n author = {\n Cai,\n Yujun and Ge,\n Liuhao and Liu,\n Jun and Cai,\n Jianfei and Cham,\n Tat-Jen and Yuan,\n Junsong and Thalmann,\n Nadia Magnenat\n},\n title = {\n Exploiting Spatial-Temporal Relationships for 3D Pose Estimation via Graph Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Exploiting Temporal Consistency for Real-Time Video Depth Estimation", @@ -11487,7 +11850,7 @@ "author": "Haokui Zhang; Chunhua Shen; Ying Li; Yuanzhouhan Cao; Yu Liu; Youliang Yan", "abstract": "Accuracy of depth estimation from static images has been significantly improved recently, by exploiting hierarchical features from deep convolutional neural networks (CNNs). Compared with static images, vast information exists among video frames and can be exploited to improve the depth estimation performance. In this work, we focus on exploring temporal information from monocular videos for depth estimation. Specifically, we take the advantage of convolutional long short-term memory (CLSTM) and propose a novel spatial-temporal CSLTM (ST-CLSTM) structure. Our ST-CLSTM structure can capture not only the spatial features but also the temporal correlations/consistency among consecutive video frames with negligible increase in computational cost. Additionally, in order to maintain the temporal consistency among the estimated depth frames, we apply the generative adversarial learning scheme and design a temporal consistency loss. The temporal consistency loss is combined with the spatial loss to update the model in an end-to-end fashion. By taking advantage of the temporal information, we build a video depth estimation framework that runs in real-time and generates visually pleasant results. 
Moreover, our approach is flexible and can be generalized to most existing depth estimation frameworks. Code is available at: https://tinyurl.com/STCLSTM", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhang_Exploiting_Temporal_Consistency_for_Real-Time_Video_Depth_Estimation_ICCV_2019_paper.pdf", - "aff": "Northwestern Polytechnical University; University of Adelaide; Northwestern Polytechnical University; University of Adelaide; University of Adelaide; Noah\u2019s Ark Lab, Huawei", + "aff": "Northwestern Polytechnical University; University of Adelaide; Northwestern Polytechnical University; University of Adelaide; University of Adelaide; Noah’s Ark Lab, Huawei", "project": "https://tinyurl.com/STCLSTM", "github": "", "supp": "", @@ -11502,13 +11865,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Exploiting_Temporal_Consistency_for_Real-Time_Video_Depth_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;1;1;2", "aff_unique_norm": "Northwestern Polytechnical University;University of Adelaide;Huawei", - "aff_unique_dep": ";;Noah\u2019s Ark Lab", + "aff_unique_dep": ";;Noah’s Ark Lab", "aff_unique_url": "https://www.nwpu.edu.cn;https://www.adelaide.edu.au;https://www.huawei.com", "aff_unique_abbr": "NWPU;Adelaide;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Haokui and Shen,\n Chunhua and Li,\n Ying and Cao,\n Yuanzhouhan and Liu,\n Yu and Yan,\n Youliang\n},\n title = {\n Exploiting Temporal Consistency for Real-Time Video Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Exploring Overall Contextual Information for Image Captioning in Human-Like Cognitive Style", 
@@ -11520,7 +11884,7 @@ "author": "Hongwei Ge; Zehang Yan; Kai Zhang; Mingde Zhao; Liang Sun", "abstract": "Image captioning is a research hotspot where encoder-decoder models combining convolutional neural network (CNN) and long short-term memory (LSTM) achieve promising results. Despite significant progress, these models generate sentences differently from human cognitive styles. Existing models often generate a complete sentence from the first word to the end, without considering the influence of the following words on the whole sentence generation. In this paper, we explore the utilization of a human-like cognitive style, i.e., building overall cognition for the image to be described and the sentence to be constructed, for enhancing computer image understanding. This paper first proposes a Mutual-aid network structure with Bidirectional LSTMs (MaBi-LSTMs) for acquiring overall contextual information. In the training process, the forward and backward LSTMs encode the succeeding and preceding words into their respective hidden states by simultaneously constructing the whole sentence in a complementary manner. In the captioning process, the LSTM implicitly utilizes the subsequent semantic information contained in its hidden states. In fact, MaBi-LSTMs can generate two sentences in forward and backward directions. To bridge the gap between cross-domain models and generate a sentence with higher quality, we further develop a cross-modal attention mechanism to retouch the two sentences by fusing their salient parts as well as the salient areas of the image. 
Experimental results on the Microsoft COCO dataset show that the proposed model improves the performance of encoder-decoder models and achieves state-of-the-art results.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Ge_Exploring_Overall_Contextual_Information_for_Image_Captioning_in_Human-Like_Cognitive_ICCV_2019_paper.pdf", - "aff": "College of Computer Science and Technology, Dalian University of Technology, Dalian, China; College of Computer Science and Technology, Dalian University of Technology, Dalian, China; College of Computer Science and Technology, Dalian University of Technology, Dalian, China; Mila, McGill University, Montr \u00b4eal, Canada; College of Computer Science and Technology, Dalian University of Technology, Dalian, China", + "aff": "College of Computer Science and Technology, Dalian University of Technology, Dalian, China; College of Computer Science and Technology, Dalian University of Technology, Dalian, China; College of Computer Science and Technology, Dalian University of Technology, Dalian, China; Mila, McGill University, Montréal, Canada; College of Computer Science and Technology, Dalian University of Technology, Dalian, China", "project": "", "github": "", "supp": "", @@ -11539,9 +11903,10 @@ "aff_unique_url": "http://en.dlut.edu.cn/;https://www.mcgill.ca", "aff_unique_abbr": "DUT;McGill", "aff_campus_unique_index": "0;0;0;1;0", - "aff_campus_unique": "Dalian;Montr\u00e9al", + "aff_campus_unique": "Dalian;Montréal", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Ge_2019_ICCV,\n \n author = {\n Ge,\n Hongwei and Yan,\n Zehang and Zhang,\n Kai and Zhao,\n Mingde and Sun,\n Liang\n},\n title = {\n Exploring Overall Contextual Information for Image Captioning in Human-Like Cognitive Style\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
year = {\n 2019\n} \n}" }, { "title": "Exploring Randomly Wired Neural Networks for Image Recognition", @@ -11567,14 +11932,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xie_Exploring_Randomly_Wired_Neural_Networks_for_Image_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xie_2019_ICCV,\n \n author = {\n Xie,\n Saining and Kirillov,\n Alexander and Girshick,\n Ross and He,\n Kaiming\n},\n title = {\n Exploring Randomly Wired Neural Networks for Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Exploring the Limitations of Behavior Cloning for Autonomous Driving", @@ -11582,7 +11948,7 @@ "status": "Oral", "track": "main", "pid": "3693", - "author_site": "Felipe Codevilla, Eder Santana, Antonio M. L\u00c3\u00b3pez, Adrien Gaidon", + "author_site": "Felipe Codevilla, Eder Santana, Antonio M. López, Adrien Gaidon", "author": "Felipe Codevilla; Eder Santana; Antonio M. Lopez; Adrien Gaidon", "abstract": "Driving requires reacting to a wide variety of complex environment conditions and agent behaviors. Explicitly modeling each possible scenario is unrealistic. In contrast, imitation learning can, in theory, leverage data from large fleets of human-driven cars. Behavior cloning in particular has been successfully used to learn simple visuomotor policies end-to-end, but scaling to the full spectrum of driving behaviors remains an unsolved problem. 
In this paper, we propose a new benchmark to experimentally investigate the scalability and limitations of behavior cloning. We show that behavior cloning leads to state-of-the-art results, executing complex lateral and longitudinal maneuvers, even in unseen environments, without being explicitly programmed to do so. However, we confirm some limitations of the behavior cloning approach: some well-known limitations (e.g., dataset bias and overfitting), new generalization issues (e.g., dynamic objects and the lack of a causal modeling), and training instabilities, all requiring further research before behavior cloning can graduate to real-world driving. The code, dataset, benchmark, and agent studied in this paper can be found at github.com/felipecode/coiltraine/blob/master/docs/exploring_limitations.md", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Codevilla_Exploring_the_Limitations_of_Behavior_Cloning_for_Autonomous_Driving_ICCV_2019_paper.pdf", @@ -11598,7 +11964,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Codevilla_Exploring_the_Limitations_of_Behavior_Cloning_for_Autonomous_Driving_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Codevilla_Exploring_the_Limitations_of_Behavior_Cloning_for_Autonomous_Driving_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Codevilla_2019_ICCV,\n \n author = {\n Codevilla,\n Felipe and Santana,\n Eder and Lopez,\n Antonio M. 
and Gaidon,\n Adrien\n},\n title = {\n Exploring the Limitations of Behavior Cloning for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Extreme View Synthesis", @@ -11624,14 +11991,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Choi_Extreme_View_Synthesis_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0;1;0", - "aff_unique_norm": "NVIDIA;Korea Advanced Institute of Science and Technology", - "aff_unique_dep": "NVIDIA Corporation;", + "aff_unique_norm": "NVIDIA Corporation;Korea Advanced Institute of Science and Technology", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nvidia.com;https://www.kaist.ac.kr", "aff_unique_abbr": "NVIDIA;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;1;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Choi_2019_ICCV,\n \n author = {\n Choi,\n Inchang and Gallo,\n Orazio and Troccoli,\n Alejandro and Kim,\n Min H. 
and Kautz,\n Jan\n},\n title = {\n Extreme View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FAB: A Robust Facial Landmark Detection Framework for Motion-Blurred Videos", @@ -11664,7 +12032,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Keqiang and Wu,\n Wayne and Liu,\n Tinghao and Yang,\n Shuo and Wang,\n Quan and Zhou,\n Qiang and Ye,\n Zuochang and Qian,\n Chen\n},\n title = {\n FAB: A Robust Facial Landmark Detection Framework for Motion-Blurred Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FACSIMILE: Fast and Accurate Scans From an Image in Less Than a Second", @@ -11697,7 +12066,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Smith_2019_ICCV,\n \n author = {\n Smith,\n David and Loper,\n Matthew and Hu,\n Xiaochen and Mavroidis,\n Paris and Romero,\n Javier\n},\n title = {\n FACSIMILE: Fast and Accurate Scans From an Image in Less Than a Second\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FAMNet: Joint Learning of Feature, Affinity and Multi-Dimensional Assignment for Online Multiple Object Tracking", @@ -11730,7 +12100,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Philadelphia;Stony Brook", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" 
+ "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chu_2019_ICCV,\n \n author = {\n Chu,\n Peng and Ling,\n Haibin\n},\n title = {\n FAMNet: Joint Learning of Feature,\n Affinity and Multi-Dimensional Assignment for Online Multiple Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FCOS: Fully Convolutional One-Stage Object Detection", @@ -11756,14 +12127,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tian_FCOS_Fully_Convolutional_One-Stage_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Adelaide", + "aff_unique_norm": "The University of Adelaide", "aff_unique_dep": "", "aff_unique_url": "https://www.adelaide.edu.au", "aff_unique_abbr": "Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Tian_2019_ICCV,\n \n author = {\n Tian,\n Zhi and Shen,\n Chunhua and Chen,\n Hao and He,\n Tong\n},\n title = {\n FCOS: Fully Convolutional One-Stage Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FDA: Feature Disruptive Attack", @@ -11796,7 +12168,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0+1;1;1", - "aff_country_unique": "Japan;India" + "aff_country_unique": "Japan;India", + "bibtex": "@InProceedings{Ganeshan_2019_ICCV,\n \n author = {\n Ganeshan,\n Aditya and B.S.,\n Vivek and Babu,\n R. 
Venkatesh\n},\n title = {\n FDA: Feature Disruptive Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FSGAN: Subject Agnostic Face Swapping and Reenactment", @@ -11829,7 +12202,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Nirkin_2019_ICCV,\n \n author = {\n Nirkin,\n Yuval and Keller,\n Yosi and Hassner,\n Tal\n},\n title = {\n FSGAN: Subject Agnostic Face Swapping and Reenactment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FW-GAN: Flow-Navigated Warping GAN for Video Virtual Try-On", @@ -11862,7 +12236,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2019_ICCV,\n \n author = {\n Dong,\n Haoye and Liang,\n Xiaodan and Shen,\n Xiaohui and Wu,\n Bowen and Chen,\n Bing-Cheng and Yin,\n Jian\n},\n title = {\n FW-GAN: Flow-Navigated Warping GAN for Video Virtual Try-On\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Face Alignment With Kernel Density Deep Neural Network", @@ -11895,7 +12270,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Lisha and Su,\n Hui and Ji,\n Qiang\n},\n title = {\n Face Alignment With Kernel Density Deep Neural Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Face De-Occlusion Using 3D Morphable Model and Generative Adversarial Network", @@ -11928,7 +12304,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Incheon", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yuan_2019_ICCV,\n \n author = {\n Yuan,\n Xiaowei and Park,\n In Kyu\n},\n title = {\n Face De-Occlusion Using 3D Morphable Model and Generative Adversarial Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Face Video Deblurring Using 3D Facial Priors", @@ -11954,14 +12331,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ren_Face_Video_Deblurring_Using_3D_Facial_Priors_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;1;0+2;1", - "aff_unique_norm": "Institute of Information Engineering, Chinese Academy of Sciences;Microsoft;University of Chinese Academy of Sciences", + "aff_unique_norm": "Institute of Information Engineering, Chinese Academy of Sciences;Microsoft Research;University of Chinese Academy of Sciences", "aff_unique_dep": "SKLOIS (State Key Laboratory of Information Security);Research;", "aff_unique_url": "http://www.iie.cas.cn;https://www.microsoft.com/en-us/research/group/asia;http://www.ucas.ac.cn", "aff_unique_abbr": "IIE, CAS;MSR Asia;UCAS", "aff_campus_unique_index": "1;1;;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ren_2019_ICCV,\n \n author = {\n Ren,\n Wenqi and Yang,\n Jiaolong and Deng,\n Senyou and Wipf,\n David and Cao,\n Xiaochun and Tong,\n Xin\n},\n title = {\n Face Video Deblurring Using 3D Facial Priors\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Face-to-Parameter Translation for Game Character Auto-Creation", @@ -11987,14 +12365,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shi_Face-to-Parameter_Translation_for_Game_Character_Auto-Creation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;2;3", - "aff_unique_norm": "Netease;University of Michigan;Beihang University;Zhejiang University", + "aff_unique_norm": "NetEase;University of Michigan;Beihang University;Zhejiang University", "aff_unique_dep": "Fuxi AI Lab;;;", "aff_unique_url": "https://www.163.com;https://www.umich.edu;http://www.buaa.edu.cn/;https://www.zju.edu.cn", "aff_unique_abbr": "NetEase;UM;BUAA;ZJU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Shi_2019_ICCV,\n \n author = {\n Shi,\n Tianyang and Yuan,\n Yi and Fan,\n Changjie and Zou,\n Zhengxia and Shi,\n Zhenwei and Liu,\n Yong\n},\n title = {\n Face-to-Parameter Translation for Game Character Auto-Creation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FaceForensics++: Learning to Detect Manipulated Facial Images", @@ -12002,7 +12381,7 @@ "status": "Poster", "track": "main", "pid": "4159", - "author_site": "Andreas R\u00c3\u00b6ssler, Davide Cozzolino, Luisa Verdoliva, Christian Riess, Justus Thies, Matthias Nie\u00c3\u009fner", + "author_site": "Andreas Rössler, Davide Cozzolino, Luisa Verdoliva, Christian Riess, Justus Thies, Matthias Nießner", "author": "Andreas Rossler; Davide Cozzolino; Luisa Verdoliva; Christian Riess; Justus Thies; Matthias Niessner", "abstract": "The rapid progress in 
synthetic image generation and manipulation has now come to a point where it raises significant concerns for the implications towards society. At best, this leads to a loss of trust in digital content, but could potentially cause further harm by spreading false information or fake news. This paper examines the realism of state-of-the-art image manipulations, and how difficult it is to detect them, either automatically or by humans. To standardize the evaluation of detection methods, we propose an automated benchmark for facial manipulation detection. In particular, the benchmark is based on Deep-Fakes, Face2Face, FaceSwap and NeuralTextures as prominent representatives for facial manipulations at random compression level and size. The benchmark is publicly available and contains a hidden test set as well as a database of over 1.8 million manipulated images. This dataset is over an order of magnitude larger than comparable, publicly available, forgery datasets. Based on this data, we performed a thorough analysis of data-driven forgery detectors. 
We show that the use of additional domain-specific knowledge improves forgery detection to unprecedented accuracy, even in the presence of strong compression, and clearly outperforms human observers.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Rossler_FaceForensics_Learning_to_Detect_Manipulated_Facial_Images_ICCV_2019_paper.pdf", @@ -12018,7 +12397,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Rossler_FaceForensics_Learning_to_Detect_Manipulated_Facial_Images_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Rossler_FaceForensics_Learning_to_Detect_Manipulated_Facial_Images_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Rossler_2019_ICCV,\n \n author = {\n Rossler,\n Andreas and Cozzolino,\n Davide and Verdoliva,\n Luisa and Riess,\n Christian and Thies,\n Justus and Niessner,\n Matthias\n},\n title = {\n FaceForensics++: Learning to Detect Manipulated Facial Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fair Loss: Margin-Aware Reinforcement Learning for Deep Face Recognition", @@ -12047,11 +12427,12 @@ "aff_unique_norm": "Beijing University of Posts and Telecommunications;Canon Information Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.bupt.edu.cn/;https://www.canon.com.cn", - "aff_unique_abbr": "BUPT;", + "aff_unique_abbr": "BUPT;CIT", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Bingyu and Deng,\n Weihong and Zhong,\n Yaoyao and Wang,\n Mei and Hu,\n Jiani and Tao,\n Xunqiang and Huang,\n Yaohai\n},\n title = {\n Fair Loss: Margin-Aware Reinforcement Learning for Deep Face 
Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fashion Retrieval via Graph Reasoning Networks on a Similarity Pyramid", @@ -12077,14 +12458,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kuang_Fashion_Retrieval_via_Graph_Reasoning_Networks_on_a_Similarity_Pyramid_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;1;2;0;1;0", - "aff_unique_norm": "SenseTime;Sun Yat-sen University;University of Hong Kong", + "aff_unique_norm": "SenseTime;Sun Yat-sen University;The University of Hong Kong", "aff_unique_dep": "SenseTime Research;;", "aff_unique_url": "https://www.sensetime.com;http://www.sysu.edu.cn/;https://www.hku.hk", "aff_unique_abbr": "SenseTime;SYSU;HKU", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Kuang_2019_ICCV,\n \n author = {\n Kuang,\n Zhanghui and Gao,\n Yiming and Li,\n Guanbin and Luo,\n Ping and Chen,\n Yimin and Lin,\n Liang and Zhang,\n Wayne\n},\n title = {\n Fashion Retrieval via Graph Reasoning Networks on a Similarity Pyramid\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fashion++: Minimal Edits for Outfit Improvement", @@ -12110,14 +12492,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hsiao_Fashion_Minimal_Edits_for_Outfit_Improvement_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;2;0+3", - "aff_unique_norm": "University of Texas at Austin;Cornell University;Georgia Institute of Technology;Meta", + "aff_unique_norm": "University of Texas at Austin;Cornell University;Georgia Institute of Technology;Facebook", "aff_unique_dep": ";;;Facebook AI Research", 
"aff_unique_url": "https://www.utexas.edu;https://tech.cornell.edu;https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "UT Austin;Cornell Tech;Georgia Tech;FAIR", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Austin;New York City;", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hsiao_2019_ICCV,\n \n author = {\n Hsiao,\n Wei-Lin and Katsman,\n Isay and Wu,\n Chao-Yuan and Parikh,\n Devi and Grauman,\n Kristen\n},\n title = {\n Fashion++: Minimal Edits for Outfit Improvement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fast Computation of Content-Sensitive Superpixels and Supervoxels Using Q-Distances", @@ -12150,7 +12533,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Ye_2019_ICCV,\n \n author = {\n Ye,\n Zipeng and Yi,\n Ran and Yu,\n Minjing and Liu,\n Yong-Jin and He,\n Ying\n},\n title = {\n Fast Computation of Content-Sensitive Superpixels and Supervoxels Using Q-Distances\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fast Image Restoration With Multi-Bin Trainable Linear Units", @@ -12183,7 +12567,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;0+1;0", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Gu_2019_ICCV,\n \n author = {\n Gu,\n Shuhang and Li,\n Wen and Gool,\n Luc Van and Timofte,\n Radu\n},\n title = {\n Fast Image Restoration With Multi-Bin Trainable Linear 
Units\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fast Object Detection in Compressed Video", @@ -12216,7 +12601,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Shiyao and Lu,\n Hongchao and Deng,\n Zhidong\n},\n title = {\n Fast Object Detection in Compressed Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fast Point R-CNN", @@ -12242,14 +12628,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Fast_Point_R-CNN_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent", "aff_unique_dep": ";YouTu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Tencent", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yilun and Liu,\n Shu and Shen,\n Xiaoyong and Jia,\n Jiaya\n},\n title = {\n Fast Point R-CNN\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fast Video Object Segmentation via Dynamic Targeting Network", @@ -12282,7 +12669,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": 
"China;United States", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Lu and Lin,\n Zhe and Zhang,\n Jianming and Lu,\n Huchuan and He,\n You\n},\n title = {\n Fast Video Object Segmentation via Dynamic Targeting Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fast and Practical Neural Architecture Search", @@ -12308,14 +12696,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cui_Fast_and_Practical_Neural_Architecture_Search_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;1;1;1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent", "aff_unique_dep": ";YouTu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Tencent", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cui_2019_ICCV,\n \n author = {\n Cui,\n Jiequan and Chen,\n Pengguang and Li,\n Ruiyu and Liu,\n Shu and Shen,\n Xiaoyong and Jia,\n Jiaya\n},\n title = {\n Fast and Practical Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fast-deepKCF Without Boundary Effect", @@ -12339,7 +12728,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zheng_Fast-deepKCF_Without_Boundary_Effect_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zheng_Fast-deepKCF_Without_Boundary_Effect_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zheng_2019_ICCV,\n \n author = {\n Zheng,\n Linyu and 
Tang,\n Ming and Chen,\n Yingying and Wang,\n Jinqiao and Lu,\n Hanqing\n},\n title = {\n Fast-deepKCF Without Boundary Effect\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Feature Weighting and Boosting for Few-Shot Segmentation", @@ -12363,7 +12753,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Nguyen_Feature_Weighting_and_Boosting_for_Few-Shot_Segmentation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Nguyen_Feature_Weighting_and_Boosting_for_Few-Shot_Segmentation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Nguyen_2019_ICCV,\n \n author = {\n Nguyen,\n Khoi and Todorovic,\n Sinisa\n},\n title = {\n Feature Weighting and Boosting for Few-Shot Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Adaptive Gaze Estimation", @@ -12375,7 +12766,7 @@ "author": "Seonwook Park; Shalini De Mello; Pavlo Molchanov; Umar Iqbal; Otmar Hilliges; Jan Kautz", "abstract": "Inter-personal anatomical differences limit the accuracy of person-independent gaze estimation networks. Yet there is a need to lower gaze errors further to enable applications requiring higher quality. Further gains can be achieved by personalizing gaze networks, ideally with few calibration samples. However, over-parameterized neural networks are not amenable to learning from few examples as they can quickly over-fit. We embrace these challenges and propose a novel framework for Few-shot Adaptive GaZE Estimation (Faze) for learning person-specific gaze networks with very few (<= 9) calibration samples. 
Faze learns a rotation-aware latent representation of gaze via a disentangling encoder-decoder architecture along with a highly adaptable gaze estimator trained using meta-learning. It is capable of adapting to any new person to yield significant performance gains with as few as 3 samples, yielding state-of-the-art performance of 3.18-deg on GazeCapture, a 19% improvement over prior art. We open-source our code at https://github.com/NVlabs/few_shot_gaze", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Park_Few-Shot_Adaptive_Gaze_Estimation_ICCV_2019_paper.pdf", - "aff": "NVIDIA+ETH Z\u00fcrich; NVIDIA+ETH Z\u00fcrich; NVIDIA; NVIDIA; ETH Z\u00fcrich; NVIDIA", + "aff": "NVIDIA+ETH Zürich; NVIDIA+ETH Zürich; NVIDIA; NVIDIA; ETH Zürich; NVIDIA", "project": "", "github": "https://github.com/NVlabs/few_shot_gaze1", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Park_Few-Shot_Adaptive_Gaze_ICCV_2019_supplemental.pdf", @@ -12389,14 +12780,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Park_Few-Shot_Adaptive_Gaze_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0;0;1;0", - "aff_unique_norm": "NVIDIA;ETH Zurich", - "aff_unique_dep": "NVIDIA Corporation;", + "aff_unique_norm": "NVIDIA Corporation;ETH Zürich", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nvidia.com;https://www.ethz.ch", "aff_unique_abbr": "NVIDIA;ETHZ", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;0;1;0", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Park_2019_ICCV,\n \n author = {\n Park,\n Seonwook and Mello,\n Shalini De and Molchanov,\n Pavlo and Iqbal,\n Umar and Hilliges,\n Otmar and Kautz,\n Jan\n},\n title = {\n Few-Shot Adaptive Gaze Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Adversarial Learning of Realistic Neural Talking Head Models", @@ -12420,7 +12812,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zakharov_Few-Shot_Adversarial_Learning_of_Realistic_Neural_Talking_Head_Models_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zakharov_Few-Shot_Adversarial_Learning_of_Realistic_Neural_Talking_Head_Models_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zakharov_2019_ICCV,\n \n author = {\n Zakharov,\n Egor and Shysheya,\n Aliaksandra and Burkov,\n Egor and Lempitsky,\n Victor\n},\n title = {\n Few-Shot Adversarial Learning of Realistic Neural Talking Head Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Generalization for Single-Image 3D Reconstruction via Priors", @@ -12453,7 +12846,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wallace_2019_ICCV,\n \n author = {\n Wallace,\n Bram and Hariharan,\n Bharath\n},\n title = {\n Few-Shot Generalization for Single-Image 3D Reconstruction via Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Image Recognition With Knowledge Transfer", @@ -12486,7 +12880,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2019_ICCV,\n \n author = {\n Peng,\n Zhimao and Li,\n Zechao and Zhang,\n Junge and Li,\n Yan and Qi,\n Guo-Jun and Tang,\n Jinhui\n},\n 
title = {\n Few-Shot Image Recognition With Knowledge Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Learning With Embedded Class Models and Shot-Free Meta Training", @@ -12512,14 +12907,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ravichandran_Few-Shot_Learning_With_Embedded_Class_Models_and_Shot-Free_Meta_Training_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "Amazon;University of California, Los Angeles", - "aff_unique_dep": "Amazon Web Services;", + "aff_unique_norm": "Amazon Web Services;University of California, Los Angeles", + "aff_unique_dep": ";", "aff_unique_url": "https://aws.amazon.com;https://www.ucla.edu", "aff_unique_abbr": "AWS;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ravichandran_2019_ICCV,\n \n author = {\n Ravichandran,\n Avinash and Bhotika,\n Rahul and Soatto,\n Stefano\n},\n title = {\n Few-Shot Learning With Embedded Class Models and Shot-Free Meta Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Learning With Global Class Representations", @@ -12531,7 +12927,7 @@ "author": "Aoxue Li; Tiange Luo; Tao Xiang; Weiran Huang; Liwei Wang", "abstract": "In this paper, we propose to tackle the challenging few-shot learning (FSL) problem by learning global class representations using both base and novel class training samples. In each training episode, an episodic class mean computed from a support set is registered with the global representation via a registration module. 
This produces a registered global class representation for computing the classification loss using a query set. Though following a similar episodic training pipeline as existing meta learning based approaches, our method differs significantly in that novel class training samples are involved in the training from the beginning. To compensate for the lack of novel class training samples, an effective sample synthesis strategy is developed to avoid overfitting. Importantly, by joint base-novel class training, our approach can be easily extended to a more practical yet challenging FSL setting, i.e., generalized FSL, where the label space of test data is extended to both base and novel classes. Extensive experiments show that our approach is effective for both of the two FSL settings.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_Few-Shot_Learning_With_Global_Class_Representations_ICCV_2019_paper.pdf", - "aff": "School of EECS, Peking University, Beijing, China; School of EECS, Peking University, Beijing, China; Department of Electrical and Electronic Engineering, University of Surrey, UK; Huawei Noah\u2019s Ark Lab, Beijing, China; School of EECS, Peking University, Beijing, China", + "aff": "School of EECS, Peking University, Beijing, China; School of EECS, Peking University, Beijing, China; Department of Electrical and Electronic Engineering, University of Surrey, UK; Huawei Noah’s Ark Lab, Beijing, China; School of EECS, Peking University, Beijing, China", "project": "", "github": "", "supp": "", @@ -12545,14 +12941,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Few-Shot_Learning_With_Global_Class_Representations_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "Peking University;University of Surrey;Huawei", - "aff_unique_dep": "School of EECS;Department of Electrical and Electronic Engineering;Huawei Noah\u2019s Ark Lab", + "aff_unique_norm": "Peking University;University of 
Surrey;Huawei Noah’s Ark Lab", + "aff_unique_dep": "School of EECS;Department of Electrical and Electronic Engineering;", "aff_unique_url": "http://www.pku.edu.cn;https://www.surrey.ac.uk;https://www.huawei.com/en/ai/noahs-ark-lab", - "aff_unique_abbr": "PKU;Surrey;HNA Lab", + "aff_unique_abbr": "Peking U;Surrey;HNA Lab", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Aoxue and Luo,\n Tiange and Xiang,\n Tao and Huang,\n Weiran and Wang,\n Liwei\n},\n title = {\n Few-Shot Learning With Global Class Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Object Detection via Feature Reweighting", @@ -12576,7 +12973,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kang_Few-Shot_Object_Detection_via_Feature_Reweighting_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kang_Few-Shot_Object_Detection_via_Feature_Reweighting_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Kang_2019_ICCV,\n \n author = {\n Kang,\n Bingyi and Liu,\n Zhuang and Wang,\n Xin and Yu,\n Fisher and Feng,\n Jiashi and Darrell,\n Trevor\n},\n title = {\n Few-Shot Object Detection via Feature Reweighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Few-Shot Unsupervised Image-to-Image Translation", @@ -12602,14 +13000,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Few-Shot_Unsupervised_Image-to-Image_Translation_ICCV_2019_paper.html", "aff_unique_index": 
"0;0+1;0;0;0;0+2;0", - "aff_unique_norm": "NVIDIA;Cornell University;Aalto University", - "aff_unique_dep": "NVIDIA Corporation;;", + "aff_unique_norm": "NVIDIA Corporation;Cornell University;Aalto University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.nvidia.com;https://www.cornell.edu;https://www.aalto.fi", "aff_unique_abbr": "NVIDIA;Cornell;Aalto", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0+1;0", - "aff_country_unique": "United States;Finland" + "aff_country_unique": "United States;Finland", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Ming-Yu and Huang,\n Xun and Mallya,\n Arun and Karras,\n Tero and Aila,\n Timo and Lehtinen,\n Jaakko and Kautz,\n Jan\n},\n title = {\n Few-Shot Unsupervised Image-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FiNet: Compatible and Diverse Fashion Image Inpainting", @@ -12642,7 +13041,8 @@ "aff_campus_unique_index": "1;2;1;1;2", "aff_campus_unique": ";Shenzhen;College Park", "aff_country_unique_index": "0+0;1;0+0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Han_2019_ICCV,\n \n author = {\n Han,\n Xintong and Wu,\n Zuxuan and Huang,\n Weilin and Scott,\n Matthew R. 
and Davis,\n Larry S.\n},\n title = {\n FiNet: Compatible and Diverse Fashion Image Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fine-Grained Action Retrieval Through Multiple Parts-of-Speech Embeddings", @@ -12666,7 +13066,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wray_Fine-Grained_Action_Retrieval_Through_Multiple_Parts-of-Speech_Embeddings_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wray_Fine-Grained_Action_Retrieval_Through_Multiple_Parts-of-Speech_Embeddings_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wray_2019_ICCV,\n \n author = {\n Wray,\n Michael and Larlus,\n Diane and Csurka,\n Gabriela and Damen,\n Dima\n},\n title = {\n Fine-Grained Action Retrieval Through Multiple Parts-of-Speech Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fine-Grained Segmentation Networks: Self-Supervised Segmentation for Improved Long-Term Visual Localization", @@ -12674,7 +13075,7 @@ "status": "Poster", "track": "main", "pid": "4075", - "author_site": "M\u00c3\u00a5ns Larsson, Erik Stenborg, Carl Toft, Lars Hammarstrand, Torsten Sattler, Fredrik Kahl", + "author_site": "Måns Larsson, Erik Stenborg, Carl Toft, Lars Hammarstrand, Torsten Sattler, Fredrik Kahl", "author": "Mans Larsson; Erik Stenborg; Carl Toft; Lars Hammarstrand; Torsten Sattler; Fredrik Kahl", "abstract": "Long-term visual localization is the problem of estimating the camera pose of a given query image in a scene whose appearance changes over time. It is an important problem in practice that is, for example, encountered in autonomous driving. 
In order to gain robustness to such changes, long-term localization approaches often use segmantic segmentations as an invariant scene representation, as the semantic meaning of each scene part should not be affected by seasonal and other changes. However, these representations are typically not very discriminative due to the very limited number of available classes. In this paper, we propose a novel neural network, the Fine-Grained Segmentation Network (FGSN), that can be used to provide image segmentations with a larger number of labels and can be trained in a self-supervised fashion. In addition, we show how FGSNs can be trained to output consistent labels across seasonal changes. We show through extensive experiments that integrating the fine-grained segmentations produced by our FGSNs into existing localization algorithms leads to substantial improvements in localization performance.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Larsson_Fine-Grained_Segmentation_Networks_Self-Supervised_Segmentation_for_Improved_Long-Term_Visual_Localization_ICCV_2019_paper.pdf", @@ -12699,7 +13100,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Larsson_2019_ICCV,\n \n author = {\n Larsson,\n Mans and Stenborg,\n Erik and Toft,\n Carl and Hammarstrand,\n Lars and Sattler,\n Torsten and Kahl,\n Fredrik\n},\n title = {\n Fine-Grained Segmentation Networks: Self-Supervised Segmentation for Improved Long-Term Visual Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fingerspelling Recognition in the Wild With Iterative Visual Attention", @@ -12732,7 +13134,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0;0;0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shi_2019_ICCV,\n \n author = {\n Shi,\n Bowen and Rio,\n Aurora Martinez Del and Keane,\n Jonathan and Brentari,\n Diane and Shakhnarovich,\n Greg and Livescu,\n Karen\n},\n title = {\n Fingerspelling Recognition in the Wild With Iterative Visual Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Flare in Interference-Based Hyperspectral Cameras", @@ -12765,7 +13168,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Sassoon_2019_ICCV,\n \n author = {\n Sassoon,\n Eden and Schechner,\n Yoav Y. and Treibitz,\n Tali\n},\n title = {\n Flare in Interference-Based Hyperspectral Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Floor-SP: Inverse CAD for Floorplans by Sequential Room-Wise Shortest Path", @@ -12798,7 +13202,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";St. 
Louis", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Jiacheng and Liu,\n Chen and Wu,\n Jiaye and Furukawa,\n Yasutaka\n},\n title = {\n Floor-SP: Inverse CAD for Floorplans by Sequential Room-Wise Shortest Path\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Floorplan-Jigsaw: Jointly Estimating Scene Layout and Aligning Partial Scans", @@ -12824,14 +13229,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lin_Floorplan-Jigsaw_Jointly_Estimating_Scene_Layout_and_Aligning_Partial_Scans_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Hong Kong", + "aff_unique_norm": "The University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2019_ICCV,\n \n author = {\n Lin,\n Cheng and Li,\n Changjian and Wang,\n Wenping\n},\n title = {\n Floorplan-Jigsaw: Jointly Estimating Scene Layout and Aligning Partial Scans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fooling Network Interpretation in Image Classification", @@ -12864,7 +13270,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Baltimore County", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Subramanya_2019_ICCV,\n \n author = {\n Subramanya,\n Akshayvarun and Pillai,\n Vipin and 
Pirsiavash,\n Hamed\n},\n title = {\n Fooling Network Interpretation in Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Foreground-Aware Pyramid Reconstruction for Alignment-Free Occluded Person Re-Identification", @@ -12890,14 +13297,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Foreground-Aware_Pyramid_Reconstruction_for_Alignment-Free_Occluded_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;1;2", - "aff_unique_norm": "JD;Chinese Academy of Sciences Institute of Automation;National University of Singapore", - "aff_unique_dep": "JD AI Research;CRIPAC (Computational Intelligence & Pattern Analysis Group) & NLPR (National Laboratory of Pattern Recognition);", + "aff_unique_norm": "JD AI Research;Chinese Academy of Sciences Institute of Automation;National University of Singapore", + "aff_unique_dep": ";CRIPAC (Computational Intelligence & Pattern Analysis Group) & NLPR (National Laboratory of Pattern Recognition);", "aff_unique_url": "https://www.jd.com;http://www.ia.cas.cn;https://www.nus.edu.sg", "aff_unique_abbr": "JD AI;CASIA;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Lingxiao and Wang,\n Yinggang and Liu,\n Wu and Zhao,\n He and Sun,\n Zhenan and Feng,\n Jiashi\n},\n title = {\n Foreground-Aware Pyramid Reconstruction for Alignment-Free Occluded Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ForkNet: Multi-Branch Volumetric Semantic Completion From a Single Depth Image", @@ -12921,7 +13329,8 @@ 
"aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_ForkNet_Multi-Branch_Volumetric_Semantic_Completion_From_a_Single_Depth_Image_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_ForkNet_Multi-Branch_Volumetric_Semantic_Completion_From_a_Single_Depth_Image_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Yida and Tan,\n David Joseph and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n ForkNet: Multi-Branch Volumetric Semantic Completion From a Single Depth Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Frame-to-Frame Aggregation of Active Regions in Web Videos for Weakly Supervised Semantic Segmentation", @@ -12954,7 +13363,8 @@ "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Jungbeom and Kim,\n Eunji and Lee,\n Sungmin and Lee,\n Jangho and Yoon,\n Sungroh\n},\n title = {\n Frame-to-Frame Aggregation of Active Regions in Web Videos for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FrameNet: Learning Local Canonical Frames of 3D Surfaces From a Single RGB Image", @@ -12978,7 +13388,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Huang_FrameNet_Learning_Local_Canonical_Frames_of_3D_Surfaces_From_a_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Huang_FrameNet_Learning_Local_Canonical_Frames_of_3D_Surfaces_From_a_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Jingwei and Zhou,\n Yichao and Funkhouser,\n Thomas and Guibas,\n Leonidas J.\n},\n title = {\n FrameNet: Learning Local Canonical Frames of 3D Surfaces From a Single RGB Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Free-Form Image Inpainting With Gated Convolution", @@ -13004,14 +13415,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Free-Form_Image_Inpainting_With_Gated_Convolution_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Adobe;ByteDance", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Adobe;ByteDance", "aff_unique_dep": ";Adobe Research;AI Lab", "aff_unique_url": "https://illinois.edu;https://research.adobe.com;https://www.bytedance.com", "aff_unique_abbr": "UIUC;Adobe;ByteDance", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Jiahui and Lin,\n Zhe and Yang,\n Jimei and Shen,\n Xiaohui and Lu,\n Xin and Huang,\n Thomas S.\n},\n title = {\n Free-Form Image Inpainting With Gated Convolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Free-Form Video Inpainting With 3D Gated Convolution and Temporal PatchGAN", @@ -13044,7 +13456,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", 
- "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chang_2019_ICCV,\n \n author = {\n Chang,\n Ya-Liang and Liu,\n Zhe Yu and Lee,\n Kuan-Ying and Hsu,\n Winston\n},\n title = {\n Free-Form Video Inpainting With 3D Gated Convolution and Temporal PatchGAN\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "FreiHAND: A Dataset for Markerless Capture of Hand Pose and Shape From Single RGB Images", @@ -13068,7 +13481,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zimmermann_FreiHAND_A_Dataset_for_Markerless_Capture_of_Hand_Pose_and_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zimmermann_FreiHAND_A_Dataset_for_Markerless_Capture_of_Hand_Pose_and_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zimmermann_2019_ICCV,\n \n author = {\n Zimmermann,\n Christian and Ceylan,\n Duygu and Yang,\n Jimei and Russell,\n Bryan and Argus,\n Max and Brox,\n Thomas\n},\n title = {\n FreiHAND: A Dataset for Markerless Capture of Hand Pose and Shape From Single RGB Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "From Open Set to Closed Set: Counting Objects by Spatial Divide-and-Conquer", @@ -13094,14 +13508,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xiong_From_Open_Set_to_Closed_Set_Counting_Objects_by_Spatial_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;0;1", - "aff_unique_norm": "Huazhong University of Science and Technology;University of Adelaide", + "aff_unique_norm": "Huazhong University of Science and Technology;The University of Adelaide", "aff_unique_dep": ";", "aff_unique_url": "http://www.hust.edu.cn;https://www.adelaide.edu.au", 
"aff_unique_abbr": "HUST;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Xiong_2019_ICCV,\n \n author = {\n Xiong,\n Haipeng and Lu,\n Hao and Liu,\n Chengxin and Liu,\n Liang and Cao,\n Zhiguo and Shen,\n Chunhua\n},\n title = {\n From Open Set to Closed Set: Counting Objects by Spatial Divide-and-Conquer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "From Strings to Things: Knowledge-Enabled VQA Model That Can Read and Reason", @@ -13125,7 +13540,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Singh_From_Strings_to_Things_Knowledge-Enabled_VQA_Model_That_Can_Read_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Singh_From_Strings_to_Things_Knowledge-Enabled_VQA_Model_That_Can_Read_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Singh_2019_ICCV,\n \n author = {\n Singh,\n Ajeet Kumar and Mishra,\n Anand and Shekhar,\n Shashank and Chakraborty,\n Anirban\n},\n title = {\n From Strings to Things: Knowledge-Enabled VQA Model That Can Read and Reason\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fully Convolutional Geometric Features", @@ -13149,7 +13565,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Choy_Fully_Convolutional_Geometric_Features_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Choy_Fully_Convolutional_Geometric_Features_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Choy_2019_ICCV,\n \n author = {\n Choy,\n Christopher 
and Park,\n Jaesik and Koltun,\n Vladlen\n},\n title = {\n Fully Convolutional Geometric Features\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Fully Convolutional Pixel Adaptive Image Denoiser", @@ -13182,7 +13599,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cha_2019_ICCV,\n \n author = {\n Cha,\n Sungmin and Moon,\n Taesup\n},\n title = {\n Fully Convolutional Pixel Adaptive Image Denoiser\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "G3raphGround: Graph-Based Language Grounding", @@ -13208,14 +13626,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bajaj_G3raphGround_Graph-Based_Language_Grounding_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;0+1+3", - "aff_unique_norm": "University of British Columbia;Vector Institute for AI;Huawei;Canadian Institute for Advanced Research", - "aff_unique_dep": ";;Huawei Technologies;AI Chair", + "aff_unique_norm": "University of British Columbia;Vector Institute for AI;Huawei Technologies;Canadian Institute for Advanced Research", + "aff_unique_dep": ";;;AI Chair", "aff_unique_url": "https://www.ubc.ca;https://vectorinstitute.ai/;https://www.huawei.com;https://www.cifar.ca", "aff_unique_abbr": "UBC;Vector AI;Huawei;CIFAR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0+0+0", - "aff_country_unique": "Canada;China" + "aff_country_unique": "Canada;China", + "bibtex": "@InProceedings{Bajaj_2019_ICCV,\n \n author = {\n Bajaj,\n Mohit and Wang,\n Lanjun and Sigal,\n Leonid\n},\n title = {\n G3raphGround: Graph-Based Language Grounding\n},\n booktitle 
= {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GA-DAN: Geometry-Aware Domain Adaptation Network for Scene Text Detection and Recognition", @@ -13248,7 +13667,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhan_2019_ICCV,\n \n author = {\n Zhan,\n Fangneng and Xue,\n Chuhui and Lu,\n Shijian\n},\n title = {\n GA-DAN: Geometry-Aware Domain Adaptation Network for Scene Text Detection and Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GAN-Based Projector for Faster Recovery With Convergence Guarantees in Linear Inverse Problems", @@ -13274,14 +13694,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Raj_GAN-Based_Projector_for_Faster_Recovery_With_Convergence_Guarantees_in_Linear_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Raj_2019_ICCV,\n \n author = {\n Raj,\n Ankit and Li,\n Yuqi and Bresler,\n Yoram\n},\n title = {\n GAN-Based Projector for Faster Recovery With Convergence Guarantees in Linear Inverse Problems\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GAN-Tree: An 
Incrementally Learned Hierarchical Generative Framework for Multi-Modal Data Distributions", @@ -13314,7 +13735,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Bangalore", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Kundu_2019_ICCV,\n \n author = {\n Kundu,\n Jogendra Nath and Gor,\n Maharshi and Agrawal,\n Dakshit and Babu,\n R. Venkatesh\n},\n title = {\n GAN-Tree: An Incrementally Learned Hierarchical Generative Framework for Multi-Modal Data Distributions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GANalyze: Toward Visual Definitions of Cognitive Image Properties", @@ -13347,7 +13769,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0", - "aff_country_unique": "United States;Belgium" + "aff_country_unique": "United States;Belgium", + "bibtex": "@InProceedings{Goetschalckx_2019_ICCV,\n \n author = {\n Goetschalckx,\n Lore and Andonian,\n Alex and Oliva,\n Aude and Isola,\n Phillip\n},\n title = {\n GANalyze: Toward Visual Definitions of Cognitive Image Properties\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GEOBIT: A Geodesic-Based Binary Descriptor Invariant to Non-Rigid Deformations for RGB-D Images", @@ -13380,7 +13803,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0+1;0;0;2", - "aff_country_unique": "Brazil;France;United States" + "aff_country_unique": "Brazil;France;United States", + "bibtex": "@InProceedings{Nascimento_2019_ICCV,\n \n author = {\n Nascimento,\n Erickson R. and Potje,\n Guilherme and Martins,\n Renato and Cadar,\n Felipe and Campos,\n Mario F. M. 
and Bajcsy,\n Ruzena\n},\n title = {\n GEOBIT: A Geodesic-Based Binary Descriptor Invariant to Non-Rigid Deformations for RGB-D Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "c8e2d26c96", @@ -13409,7 +13833,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Truong_2019_ICCV,\n \n author = {\n Truong,\n Prune and Apostolopoulos,\n Stefanos and Mosinska,\n Agata and Stucky,\n Samuel and Ciller,\n Carlos and Zanet,\n Sandro De\n},\n title = {\n GLAMpoints: Greedily Learned Accurate Match Points\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GLoSH: Global-Local Spherical Harmonics for Intrinsic Image Decomposition", @@ -13442,7 +13867,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Hao and Yu,\n Xiang and Jacobs,\n David W.\n},\n title = {\n GLoSH: Global-Local Spherical Harmonics for Intrinsic Image Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GODS: Generalized One-Class Discriminative Subspaces for Anomaly Detection", @@ -13475,7 +13901,8 @@ "aff_campus_unique_index": "0+1;1", "aff_campus_unique": "Canberra;Cambridge", "aff_country_unique_index": "0+1;1", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author 
= {\n Wang,\n Jue and Cherian,\n Anoop\n},\n title = {\n GODS: Generalized One-Class Discriminative Subspaces for Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GP2C: Geometric Projection Parameter Consensus for Joint 3D Pose and Focal Length Estimation in the Wild", @@ -13508,7 +13935,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Graz;", "aff_country_unique_index": "0;0;1+0", - "aff_country_unique": "Austria;France" + "aff_country_unique": "Austria;France", + "bibtex": "@InProceedings{Grabner_2019_ICCV,\n \n author = {\n Grabner,\n Alexander and Roth,\n Peter M. and Lepetit,\n Vincent\n},\n title = {\n GP2C: Geometric Projection Parameter Consensus for Joint 3D Pose and Focal Length Estimation in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GSLAM: A General SLAM Framework and Benchmark", @@ -13541,7 +13969,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Yong and Xu,\n Shibiao and Bu,\n Shuhui and Jiang,\n Hongkai and Han,\n Pengcheng\n},\n title = {\n GSLAM: A General SLAM Framework and Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GarNet: A Two-Stream Network for Fast and Accurate 3D Cloth Draping", @@ -13567,14 +13996,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gundogdu_GarNet_A_Two-Stream_Network_for_Fast_and_Accurate_3D_Cloth_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;0;0", - "aff_unique_norm": 
"EPFL;Fision Technologies", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;Fision Technologies", "aff_unique_dep": "CVLab;", "aff_unique_url": "https://cvlab.epfl.ch;", "aff_unique_abbr": "EPFL;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Gundogdu_2019_ICCV,\n \n author = {\n Gundogdu,\n Erhan and Constantin,\n Victor and Seifoddini,\n Amrollah and Dang,\n Minh and Salzmann,\n Mathieu and Fua,\n Pascal\n},\n title = {\n GarNet: A Two-Stream Network for Fast and Accurate 3D Cloth Draping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Gated-SCNN: Gated Shape CNNs for Semantic Segmentation", @@ -13600,14 +14030,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Takikawa_Gated-SCNN_Gated_Shape_CNNs_for_Semantic_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+2+3;0;0+2+3", - "aff_unique_norm": "NVIDIA;University of Waterloo;University of Toronto;Vector Institute", - "aff_unique_dep": "NVIDIA Corporation;;;", + "aff_unique_norm": "NVIDIA Corporation;University of Waterloo;University of Toronto;Vector Institute", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nvidia.com;https://uwaterloo.ca;https://www.utoronto.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "NVIDIA;UW;U of T;Vector Institute", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1+1;0;0+1+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Takikawa_2019_ICCV,\n \n author = {\n Takikawa,\n Towaki and Acuna,\n David and Jampani,\n Varun and Fidler,\n Sanja\n},\n title = {\n Gated-SCNN: Gated Shape CNNs for Semantic Segmentation\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Gated2Depth: Real-Time Dense Lidar From Gated Images", @@ -13631,7 +14062,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gruber_Gated2Depth_Real-Time_Dense_Lidar_From_Gated_Images_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gruber_Gated2Depth_Real-Time_Dense_Lidar_From_Gated_Images_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Gruber_2019_ICCV,\n \n author = {\n Gruber,\n Tobias and Julca-Aguilar,\n Frank and Bijelic,\n Mario and Heide,\n Felix\n},\n title = {\n Gated2Depth: Real-Time Dense Lidar From Gated Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Gaussian Affinity for Max-Margin Class Imbalanced Learning", @@ -13643,7 +14075,7 @@ "author": "Munawar Hayat; Salman Khan; Syed Waqas Zamir; Jianbing Shen; Ling Shao", "abstract": "Real-world object classes appear in imbalanced ratios. This poses a significant challenge for classifiers which get biased towards frequent classes. We hypothesize that improving the generalization capability of a classifier should improve learning on imbalanced datasets. Here, we introduce the first hybrid loss function that jointly performs classification and clustering in a single formulation. Our approach is based on an `affinity measure' in Euclidean space that leads to the following benefits: (1) direct enforcement of maximum margin constraints on classification boundaries, (2) a tractable way to ensure uniformly spaced and equidistant cluster centers, (3) flexibility to learn multiple class prototypes to support diversity and discriminability in feature space. 
Our extensive experiments demonstrate the significant performance improvements on visual classification and verification tasks on multiple imbalanced datasets. The proposed loss can easily be plugged in any deep architecture as a differentiable block and demonstrates robustness against different levels of data imbalance and corrupted labels.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Hayat_Gaussian_Affinity_for_Max-Margin_Class_Imbalanced_Learning_ICCV_2019_paper.pdf", - "aff": "Inception Institute of Arti\ufb01cial Intelligence+University of Canberra; Inception Institute of Arti\ufb01cial Intelligence+Australian National University; Inception Institute of Arti\ufb01cial Intelligence; Inception Institute of Arti\ufb01cial Intelligence+Beijing Institute of Technology; Inception Institute of Arti\ufb01cial Intelligence", + "aff": "Inception Institute of Artificial Intelligence+University of Canberra; Inception Institute of Artificial Intelligence+Australian National University; Inception Institute of Artificial Intelligence; Inception Institute of Artificial Intelligence+Beijing Institute of Technology; Inception Institute of Artificial Intelligence", "project": "", "github": "", "supp": "", @@ -13664,7 +14096,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;0+0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Hayat_2019_ICCV,\n \n author = {\n Hayat,\n Munawar and Khan,\n Salman and Zamir,\n Syed Waqas and Shen,\n Jianbing and Shao,\n Ling\n},\n title = {\n Gaussian Affinity for Max-Margin Class Imbalanced Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Gaussian YOLOv3: An Accurate and Fast Object Detector Using Localization Uncertainty for Autonomous Driving", @@ -13697,7 +14130,8 @@ 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choi_2019_ICCV,\n \n author = {\n Choi,\n Jiwoong and Chun,\n Dayoung and Kim,\n Hyun and Lee,\n Hyuk-Jae\n},\n title = {\n Gaussian YOLOv3: An Accurate and Fast Object Detector Using Localization Uncertainty for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Gaze360: Physically Unconstrained Gaze Estimation in the Wild", @@ -13705,7 +14139,7 @@ "status": "Poster", "track": "main", "pid": "4341", - "author_site": "Petr Kellnhofer, Adri\u00c3\u00a0 Recasens, Simon Stent, Wojciech Matusik, Antonio Torralba", + "author_site": "Petr Kellnhofer, Adrià Recasens, Simon Stent, Wojciech Matusik, Antonio Torralba", "author": "Petr Kellnhofer; Adria Recasens; Simon Stent; Wojciech Matusik; Antonio Torralba", "abstract": "Understanding where people are looking is an informative social cue. In this work, we present Gaze360, a large-scale remote gaze-tracking dataset and method for robust 3D gaze estimation in unconstrained images. Our dataset consists of 238 subjects in indoor and outdoor environments with labelled 3D gaze across a wide range of head poses and distances. It is the largest publicly available dataset of its kind by both subject and variety, made possible by a simple and efficient collection method. Our proposed 3D gaze model extends existing models to include temporal information and to directly output an estimate of gaze uncertainty. We demonstrate the benefits of our model via an ablation study, and show its generalization performance via a cross-dataset evaluation against other recent gaze benchmark datasets. We furthermore propose a simple self-supervised approach to improve cross-dataset domain adaptation. 
Finally, we demonstrate an application of our model for estimating customer attention in a supermarket setting. Our dataset and models will be made available at http://gaze360.csail.mit.edu.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Kellnhofer_Gaze360_Physically_Unconstrained_Gaze_Estimation_in_the_Wild_ICCV_2019_paper.pdf", @@ -13730,7 +14164,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kellnhofer_2019_ICCV,\n \n author = {\n Kellnhofer,\n Petr and Recasens,\n Adria and Stent,\n Simon and Matusik,\n Wojciech and Torralba,\n Antonio\n},\n title = {\n Gaze360: Physically Unconstrained Gaze Estimation in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Generating Diverse and Descriptive Image Captions Using Visual Paraphrases", @@ -13763,7 +14198,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0+0;0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Lixin and Tang,\n Jiajun and Wan,\n Xiaojun and Guo,\n Zongming\n},\n title = {\n Generating Diverse and Descriptive Image Captions Using Visual Paraphrases\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Generating Easy-to-Understand Referring Expressions for Target Identifications", @@ -13796,7 +14232,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Tanaka_2019_ICCV,\n \n author = {\n 
Tanaka,\n Mikihiro and Itamochi,\n Takayuki and Narioka,\n Kenichi and Sato,\n Ikuro and Ushiku,\n Yoshitaka and Harada,\n Tatsuya\n},\n title = {\n Generating Easy-to-Understand Referring Expressions for Target Identifications\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Generative Adversarial Minority Oversampling", @@ -13829,7 +14266,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Kolkata;Durham", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Mullick_2019_ICCV,\n \n author = {\n Mullick,\n Sankha Subhra and Datta,\n Shounak and Das,\n Swagatam\n},\n title = {\n Generative Adversarial Minority Oversampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Generative Adversarial Networks for Extreme Learned Image Compression", @@ -13862,7 +14300,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Agustsson_2019_ICCV,\n \n author = {\n Agustsson,\n Eirikur and Tschannen,\n Michael and Mentzer,\n Fabian and Timofte,\n Radu and Gool,\n Luc Van\n},\n title = {\n Generative Adversarial Networks for Extreme Learned Image Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Generative Adversarial Training for Weakly Supervised Cloud Matting", @@ -13888,14 +14327,15 @@ "author_num": 5, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Zou_Generative_Adversarial_Training_for_Weakly_Supervised_Cloud_Matting_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;1;0", - "aff_unique_norm": "University of Michigan;Beihang University;Netease", + "aff_unique_norm": "University of Michigan;Beihang University;NetEase", "aff_unique_dep": ";;Fuxi AI Lab", "aff_unique_url": "https://www.umich.edu;http://www.buaa.edu.cn/;https://www.163.com", "aff_unique_abbr": "UM;BUAA;NetEase", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zou_2019_ICCV,\n \n author = {\n Zou,\n Zhengxia and Li,\n Wenyuan and Shi,\n Tianyang and Shi,\n Zhenwei and Ye,\n Jieping\n},\n title = {\n Generative Adversarial Training for Weakly Supervised Cloud Matting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Generative Modeling for Small-Data Object Detection", @@ -13928,7 +14368,8 @@ "aff_campus_unique_index": "0+1;1;1;3", "aff_campus_unique": "Ann Arbor;Mountain View;;Stanford", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Lanlan and Muelly,\n Michael and Deng,\n Jia and Pfister,\n Tomas and Li,\n Li-Jia\n},\n title = {\n Generative Modeling for Small-Data Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Generative Multi-View Human Action Recognition", @@ -13961,7 +14402,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Indianapolis", "aff_country_unique_index": "0;0;0;0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Lichen and Ding,\n Zhengming and Tao,\n Zhiqiang and Liu,\n Yunyu and Fu,\n Yun\n},\n title = {\n Generative Multi-View Human Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GeoStyle: Discovering Fashion Trends and Events", @@ -13987,14 +14429,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mall_GeoStyle_Discovering_Fashion_Trends_and_Events_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;0", - "aff_unique_norm": "Cornell University;Meta", - "aff_unique_dep": ";Facebook, Inc.", + "aff_unique_norm": "Cornell University;Facebook, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.facebook.com", "aff_unique_abbr": "Cornell;FB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mall_2019_ICCV,\n \n author = {\n Mall,\n Utkarsh and Matzen,\n Kevin and Hariharan,\n Bharath and Snavely,\n Noah and Bala,\n Kavita\n},\n title = {\n GeoStyle: Discovering Fashion Trends and Events\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Geometric Disentanglement for Generative Latent Shape Models", @@ -14020,14 +14463,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Aumentado-Armstrong_Geometric_Disentanglement_for_Generative_Latent_Shape_Models_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "University of Toronto;Samsung", + "aff_unique_norm": "University of Toronto;Samsung AI Center", 
"aff_unique_dep": "Vector Institute for AI;AI Center", "aff_unique_url": "https://www.vectorinstitute.ai/;https://www.samsung.com/global/innovation/ai-research/", "aff_unique_abbr": "U of T;SAC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Aumentado-Armstrong_2019_ICCV,\n \n author = {\n Aumentado-Armstrong,\n Tristan and Tsogkas,\n Stavros and Jepson,\n Allan and Dickinson,\n Sven\n},\n title = {\n Geometric Disentanglement for Generative Latent Shape Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Geometry Normalization Networks for Accurate Scene Text Detection", @@ -14060,7 +14504,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Youjiang and Duan,\n Jiaqi and Kuang,\n Zhanghui and Yue,\n Xiaoyu and Sun,\n Hongbin and Guan,\n Yue and Zhang,\n Wayne\n},\n title = {\n Geometry Normalization Networks for Accurate Scene Text Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Global Feature Guided Local Pooling", @@ -14068,6 +14513,7 @@ "status": "Poster", "track": "main", "pid": "4446", + "author_site": "Takumi Kobayashi", "author": "Takumi Kobayashi", "abstract": "In deep convolutional neural networks (CNNs), local pooling operation is a key building block to effectively downsize feature maps for reducing computation cost as well as increasing robustness against input variation. 
There are several types of pooling operation, such as average/max-pooling, from which one has to be manually selected for building CNNs. The optimal pooling type would be dependent on characteristics of features in CNNs and classification tasks, making it hard to find out the proper pooling module in advance. In this paper, we propose a flexible pooling method which adaptively tunes the pooling functionality based on input features without manually fixing it beforehand. In the proposed method, the parameterized pooling form is derived from a probabilistic perspective to flexibly represent various types of pooling and then the parameters are estimated by means of global statistics in the input feature map. Thus, the proposed local pooling guided by global features effectively works in the CNNs trained in an end-to-end manner. The experimental results on image classification tasks demonstrate the effectiveness of the proposed pooling method in various deep CNNs.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Kobayashi_Global_Feature_Guided_Local_Pooling_ICCV_2019_paper.pdf", @@ -14090,7 +14536,8 @@ "aff_unique_url": "https://www.aist.go.jp", "aff_unique_abbr": "AIST", "aff_country_unique_index": "0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Kobayashi_2019_ICCV,\n \n author = {\n Kobayashi,\n Takumi\n},\n title = {\n Global Feature Guided Local Pooling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Global-Local Temporal Representations for Video Person Re-Identification", @@ -14117,13 +14564,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Global-Local_Temporal_Representations_for_Video_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Peking University;Huawei", - "aff_unique_dep": "School of Electronics 
Engineering and Computer Science;Noahs Ark Lab", + "aff_unique_dep": "School of Electronics Engineering and Computer Science;", "aff_unique_url": "http://www.pku.edu.cn;https://www.huawei.com", "aff_unique_abbr": "PKU;Huawei", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "1", + "aff_campus_unique": ";Noahs Ark Lab", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Jianing and Wang,\n Jingdong and Tian,\n Qi and Gao,\n Wen and Zhang,\n Shiliang\n},\n title = {\n Global-Local Temporal Representations for Video Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Goal-Driven Sequential Data Abstraction", @@ -14156,7 +14604,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Muhammad_2019_ICCV,\n \n author = {\n Muhammad,\n Umar Riaz and Yang,\n Yongxin and Hospedales,\n Timothy M. 
and Xiang,\n Tao and Song,\n Yi-Zhe\n},\n title = {\n Goal-Driven Sequential Data Abstraction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GradNet: Gradient-Guided Network for Visual Object Tracking", @@ -14182,14 +14631,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_GradNet_Gradient-Guided_Network_for_Visual_Object_Tracking_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;2;0", - "aff_unique_norm": "Dalian University of Technology;University of Sydney;China Science IntelliCloud Technology Co., Ltd", + "aff_unique_norm": "Dalian University of Technology;The University of Sydney;China Science IntelliCloud Technology Co., Ltd", "aff_unique_dep": ";;", "aff_unique_url": "http://www.dlut.edu.cn/;https://www.sydney.edu.au;", "aff_unique_abbr": "DUT;USYD;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Peixia and Chen,\n Boyu and Ouyang,\n Wanli and Wang,\n Dong and Yang,\n Xiaoyun and Lu,\n Huchuan\n},\n title = {\n GradNet: Gradient-Guided Network for Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Graph Convolutional Networks for Temporal Action Localization", @@ -14215,14 +14665,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zeng_Graph_Convolutional_Networks_for_Temporal_Action_Localization_ICCV_2019_paper.html", "aff_unique_index": "0;1+2;0+3;1;1;1;4", - "aff_unique_norm": "South China University of Technology;Tencent;Tsinghua University;Pengcheng Laboratory;Massachusetts Institute of Technology", - "aff_unique_dep": 
"School of Software Engineering;Tencent AI Lab;Department of Computer Science and Technology;Peng Cheng Laboratory;IBM Watson AI Lab", + "aff_unique_norm": "South China University of Technology;Tencent;Tsinghua University;Peng Cheng Laboratory;Massachusetts Institute of Technology", + "aff_unique_dep": "School of Software Engineering;Tencent AI Lab;Department of Computer Science and Technology;;IBM Watson AI Lab", "aff_unique_url": "https://www.scut.edu.cn;https://ai.tencent.com;https://www.tsinghua.edu.cn;;https://www.mitibmwatsonailab.org", "aff_unique_abbr": "SCUT;Tencent AI Lab;THU;;MIT-IBM AI Lab", - "aff_campus_unique_index": ";1", - "aff_campus_unique": ";Shenzhen", + "aff_campus_unique_index": ";", + "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zeng_2019_ICCV,\n \n author = {\n Zeng,\n Runhao and Huang,\n Wenbing and Tan,\n Mingkui and Rong,\n Yu and Zhao,\n Peilin and Huang,\n Junzhou and Gan,\n Chuang\n},\n title = {\n Graph Convolutional Networks for Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Graph-Based Object Classification for Neuromorphic Vision Sensing", @@ -14255,7 +14706,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bi_2019_ICCV,\n \n author = {\n Bi,\n Yin and Chadha,\n Aaron and Abbas,\n Alhabib and Bourtsoulatze,\n Eirina and Andreopoulos,\n Yiannis\n},\n title = {\n Graph-Based Object Classification for Neuromorphic Vision Sensing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2019\n} \n}" }, { "title": "GraphX-Convolution for Point Cloud Deformation in 2D-to-3D Conversion", @@ -14288,7 +14740,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Nguyen_2019_ICCV,\n \n author = {\n Nguyen,\n Anh-Duc and Choi,\n Seonghwa and Kim,\n Woojae and Lee,\n Sanghoon\n},\n title = {\n GraphX-Convolution for Point Cloud Deformation in 2D-to-3D Conversion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Gravity as a Reference for Estimating a Person's Height From Video", @@ -14296,7 +14749,7 @@ "status": "Poster", "track": "main", "pid": "6294", - "author_site": "Didier Bieler, Semih G\u00c3\u00bcnel, Pascal Fua, Helge Rhodin", + "author_site": "Didier Bieler, Semih Günel, Pascal Fua, Helge Rhodin", "author": "Didier Bieler; Semih Gunel; Pascal Fua; Helge Rhodin", "abstract": "Estimating the metric height of a person from monocular imagery without additional assumptions is ill-posed. Existing solutions either require manual calibration of ground plane and camera geometry, special cameras, or reference objects of known size. We focus on motion cues and exploit gravity on earth as an omnipresent reference 'object' to translate acceleration, and subsequently height, measured in image-pixels to values in meters. We require videos of motion as input, where gravity is the only external force. This limitation is different to those of existing solutions that recover a person's height and, therefore, our method opens up new application fields. 
We show theoretically and empirically that a simple motion trajectory analysis suffices to translate from pixel measurements to the person's metric height, reaching a MAE of up to 3.9 cm on jumping motions, and that this works without camera and ground plane calibration.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Bieler_Gravity_as_a_Reference_for_Estimating_a_Persons_Height_From_ICCV_2019_paper.pdf", @@ -14314,14 +14767,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bieler_Gravity_as_a_Reference_for_Estimating_a_Persons_Height_From_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "EPFL;University of British Columbia", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.ubc.ca", "aff_unique_abbr": "EPFL;UBC", "aff_campus_unique_index": "0;0;0;0+1", "aff_campus_unique": "Lausanne;Vancouver", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "Switzerland;Canada" + "aff_country_unique": "Switzerland;Canada", + "bibtex": "@InProceedings{Bieler_2019_ICCV,\n \n author = {\n Bieler,\n Didier and Gunel,\n Semih and Fua,\n Pascal and Rhodin,\n Helge\n},\n title = {\n Gravity as a Reference for Estimating a Person's Height From Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "GridDehazeNet: Attention-Based Multi-Scale Network for Image Dehazing", @@ -14354,7 +14808,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Xiaohong and Ma,\n Yongrui and Shi,\n Zhihao and Chen,\n Jun\n},\n title = {\n GridDehazeNet: Attention-Based Multi-Scale Network for Image 
Dehazing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Ground-to-Aerial Image Geo-Localization With a Hard Exemplar Reweighting Triplet Loss", @@ -14366,7 +14821,7 @@ "author": "Sudong Cai; Yulan Guo; Salman Khan; Jiwei Hu; Gongjian Wen", "abstract": "The task of ground-to-aerial image geo-localization can be achieved by matching a ground view query image to a reference database of aerial/satellite images. It is highly challenging due to the dramatic viewpoint changes and unknown orientations. In this paper, we propose a novel in-batch reweighting triplet loss to emphasize the positive effect of hard exemplars during end-to-end training. We also integrate an attention mechanism into our model using feature-level contextual information. To analyze the difficulty level of each triplet, we first enforce a modified logistic regression to triplets with a distance rectifying factor. Then, the reference negative distances for corresponding anchors are set, and the relative weights of triplets are computed by comparing their difficulty to the corresponding references. To reduce the influence of extreme hard data and less useful simple exemplars, the final weights are pruned using upper and lower bound constraints. 
Experiments on two benchmark datasets show that the proposed approach significantly outperforms the state-of-the-art methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Cai_Ground-to-Aerial_Image_Geo-Localization_With_a_Hard_Exemplar_Reweighting_Triplet_Loss_ICCV_2019_paper.pdf", - "aff": "National University of Defense Technology; Sun Yat-Sen University; Inception Institute of Arti\ufb01cial Intelligence; Wuhan University of Technology; National University of Defense Technology+Sun Yat-Sen University", + "aff": "National University of Defense Technology; Sun Yat-Sen University; Inception Institute of Artificial Intelligence; Wuhan University of Technology; National University of Defense Technology+Sun Yat-Sen University", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Cai_Ground-to-Aerial_Image_Geo-Localization_ICCV_2019_supplemental.pdf", @@ -14380,14 +14835,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cai_Ground-to-Aerial_Image_Geo-Localization_With_a_Hard_Exemplar_Reweighting_Triplet_Loss_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;3;0+1", - "aff_unique_norm": "National University of Defense Technology;Sun Yat-sen University;Inception Institute of Artificial Intelligence;Wuhan University of Technology", + "aff_unique_norm": "National University of Defense Technology;Sun Yat-Sen University;Inception Institute of Artificial Intelligence;Wuhan University of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.nudt.edu.cn/;http://www.sysu.edu.cn/;https://www.inceptionai.org;http://www.wut.edu.cn", "aff_unique_abbr": "NUDT;SYSU;;WUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cai_2019_ICCV,\n \n author = {\n Cai,\n Sudong and Guo,\n Yulan and Khan,\n Salman and Hu,\n Jiwei and Wen,\n 
Gongjian\n},\n title = {\n Ground-to-Aerial Image Geo-Localization With a Hard Exemplar Reweighting Triplet Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Grounded Human-Object Interaction Hotspots From Video", @@ -14413,14 +14869,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Nagarajan_Grounded_Human-Object_Interaction_Hotspots_From_Video_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+1", - "aff_unique_norm": "University of Texas at Austin;Meta", + "aff_unique_norm": "University of Texas at Austin;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.utexas.edu;https://research.facebook.com", "aff_unique_abbr": "UT Austin;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nagarajan_2019_ICCV,\n \n author = {\n Nagarajan,\n Tushar and Feichtenhofer,\n Christoph and Grauman,\n Kristen\n},\n title = {\n Grounded Human-Object Interaction Hotspots From Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Group-Wise Deep Object Co-Segmentation With Co-Attention Recurrent Neural Network", @@ -14453,7 +14910,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Bo and Sun,\n Zhengxing and Li,\n Qian and Wu,\n Yunjie and Hu,\n Anqi\n},\n title = {\n Group-Wise Deep Object Co-Segmentation With Co-Attention Recurrent Neural Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Grouped Spatial-Temporal Aggregation for Efficient Action Recognition", @@ -14479,14 +14937,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Luo_Grouped_Spatial-Temporal_Aggregation_for_Efficient_Action_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Johns Hopkins University", + "aff_unique_norm": "The Johns Hopkins University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Baltimore", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Luo_2019_ICCV,\n \n author = {\n Luo,\n Chenxu and Yuille,\n Alan L.\n},\n title = {\n Grouped Spatial-Temporal Aggregation for Efficient Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Guessing Smart: Biased Sampling for Efficient Black-Box Adversarial Attacks", @@ -14519,7 +14978,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Brunner_2019_ICCV,\n \n author = {\n Brunner,\n Thomas and Diehl,\n Frederik and Le,\n Michael Truong and Knoll,\n Alois\n},\n title = {\n Guessing Smart: Biased Sampling for Efficient Black-Box Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Guided Curriculum Model Adaptation and Uncertainty-Aware Evaluation for Semantic Nighttime Image Segmentation", @@ -14543,7 +15003,8 @@ "aff_domain": ";;", 
"email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sakaridis_Guided_Curriculum_Model_Adaptation_and_Uncertainty-Aware_Evaluation_for_Semantic_Nighttime_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sakaridis_Guided_Curriculum_Model_Adaptation_and_Uncertainty-Aware_Evaluation_for_Semantic_Nighttime_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Sakaridis_2019_ICCV,\n \n author = {\n Sakaridis,\n Christos and Dai,\n Dengxin and Gool,\n Luc Van\n},\n title = {\n Guided Curriculum Model Adaptation and Uncertainty-Aware Evaluation for Semantic Nighttime Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Guided Image-to-Image Translation With Bi-Directional Feature Transformation", @@ -14576,7 +15037,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{AlBahar_2019_ICCV,\n \n author = {\n AlBahar,\n Badour and Huang,\n Jia-Bin\n},\n title = {\n Guided Image-to-Image Translation With Bi-Directional Feature Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Guided Super-Resolution As Pixel-to-Pixel Transformation", @@ -14600,7 +15062,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/de_Lutio_Guided_Super-Resolution_As_Pixel-to-Pixel_Transformation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/de_Lutio_Guided_Super-Resolution_As_Pixel-to-Pixel_Transformation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Lutio_2019_ICCV,\n \n author = {\n Lutio,\n Riccardo de and 
D'Aronco,\n Stefano and Wegner,\n Jan Dirk and Schindler,\n Konrad\n},\n title = {\n Guided Super-Resolution As Pixel-to-Pixel Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HACS: Human Action Clips and Segments Dataset for Recognition and Temporal Localization", @@ -14624,7 +15087,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_HACS_Human_Action_Clips_and_Segments_Dataset_for_Recognition_and_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_HACS_Human_Action_Clips_and_Segments_Dataset_for_Recognition_and_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Hang and Torralba,\n Antonio and Torresani,\n Lorenzo and Yan,\n Zhicheng\n},\n title = {\n HACS: Human Action Clips and Segments Dataset for Recognition and Temporal Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HAWQ: Hessian AWare Quantization of Neural Networks With Mixed-Precision", @@ -14657,7 +15121,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dong_2019_ICCV,\n \n author = {\n Dong,\n Zhen and Yao,\n Zhewei and Gholami,\n Amir and Mahoney,\n Michael W. 
and Keutzer,\n Kurt\n},\n title = {\n HAWQ: Hessian AWare Quantization of Neural Networks With Mixed-Precision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HBONet: Harmonious Bottleneck on Two Orthogonal Dimensions", @@ -14683,14 +15148,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_HBONet_Harmonious_Bottleneck_on_Two_Orthogonal_Dimensions_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Intel", + "aff_unique_norm": "Intel Corporation", "aff_unique_dep": "Intel Labs", "aff_unique_url": "https://www.intel.cn", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Duo and Zhou,\n Aojun and Yao,\n Anbang\n},\n title = {\n HBONet: Harmonious Bottleneck on Two Orthogonal Dimensions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HEMlets Pose: Learning Part-Centric Heatmap Triplets for Accurate 3D Human Pose Estimation", @@ -14716,14 +15182,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_HEMlets_Pose_Learning_Part-Centric_Heatmap_Triplets_for_Accurate_3D_Human_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;2;0", - "aff_unique_norm": "Shenzhen Cloudream Technology Co., Ltd.;Chinese University of Hong Kong;South China University of Technology", + "aff_unique_norm": "Shenzhen Cloudream Technology Co., Ltd.;The Chinese University of Hong Kong;South China University of Technology", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.cuhk.edu.cn;https://www.scut.edu.cn", "aff_unique_abbr": ";CUHK;SCUT", "aff_campus_unique_index": 
"1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Kun and Han,\n Xiaoguang and Jiang,\n Nianjuan and Jia,\n Kui and Lu,\n Jiangbo\n},\n title = {\n HEMlets Pose: Learning Part-Centric Heatmap Triplets for Accurate 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Habitat: A Platform for Embodied AI Research", @@ -14747,7 +15214,8 @@ "aff_domain": ";;;;;;;;;;;", "email": ";;;;;;;;;;;", "author_num": 12, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Savva_Habitat_A_Platform_for_Embodied_AI_Research_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Savva_Habitat_A_Platform_for_Embodied_AI_Research_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Savva_2019_ICCV,\n \n author = {\n Savva,\n Manolis and Kadian,\n Abhishek and Maksymets,\n Oleksandr and Zhao,\n Yili and Wijmans,\n Erik and Jain,\n Bhavana and Straub,\n Julian and Liu,\n Jia and Koltun,\n Vladlen and Malik,\n Jitendra and Parikh,\n Devi and Batra,\n Dhruv\n},\n title = {\n Habitat: A Platform for Embodied AI Research\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hallucinating IDT Descriptors and I3D Optical Flow Features for Action Recognition With CNNs", @@ -14780,7 +15248,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Lei and Koniusz,\n Piotr and Huynh,\n Du Q.\n},\n title = {\n Hallucinating IDT Descriptors and I3D Optical Flow 
Features for Action Recognition With CNNs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HarDNet: A Low Memory Traffic Network", @@ -14813,7 +15282,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0+1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chao_2019_ICCV,\n \n author = {\n Chao,\n Ping and Kao,\n Chao-Yang and Ruan,\n Yu-Shan and Huang,\n Chien-Hsiang and Lin,\n Youn-Long\n},\n title = {\n HarDNet: A Low Memory Traffic Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HiPPI: Higher-Order Projected Power Iterations for Scalable Multi-Matching", @@ -14837,7 +15307,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bernard_HiPPI_Higher-Order_Projected_Power_Iterations_for_Scalable_Multi-Matching_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bernard_HiPPI_Higher-Order_Projected_Power_Iterations_for_Scalable_Multi-Matching_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Bernard_2019_ICCV,\n \n author = {\n Bernard,\n Florian and Thunberg,\n Johan and Swoboda,\n Paul and Theobalt,\n Christian\n},\n title = {\n HiPPI: Higher-Order Projected Power Iterations for Scalable Multi-Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hiding Video in Audio via Reversible Generative Models", @@ -14863,14 +15334,15 @@ "author_num": 4, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Hiding_Video_in_Audio_via_Reversible_Generative_Models_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Intel", + "aff_unique_norm": "Hong Kong University of Science and Technology;Intel Corporation", "aff_unique_dep": ";Intel Labs", "aff_unique_url": "https://www.ust.hk;https://www.intel.com", "aff_unique_abbr": "HKUST;Intel", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Hyukryul and Ouyang,\n Hao and Koltun,\n Vladlen and Chen,\n Qifeng\n},\n title = {\n Hiding Video in Audio via Reversible Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hierarchical Encoding of Sequential Data With Compact and Sub-Linear Storage Cost", @@ -14894,7 +15366,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Le_Hierarchical_Encoding_of_Sequential_Data_With_Compact_and_Sub-Linear_Storage_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Le_Hierarchical_Encoding_of_Sequential_Data_With_Compact_and_Sub-Linear_Storage_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Le_2019_ICCV,\n \n author = {\n Le,\n Huu and Xu,\n Ming and Hoang,\n Tuan and Milford,\n Michael\n},\n title = {\n Hierarchical Encoding of Sequential Data With Compact and Sub-Linear Storage Cost\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hierarchical Point-Edge Interaction Network for Point Cloud 
Semantic Segmentation", @@ -14920,14 +15393,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Jiang_Hierarchical_Point-Edge_Interaction_Network_for_Point_Cloud_Semantic_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent", "aff_unique_dep": ";YouTu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Tencent", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Li and Zhao,\n Hengshuang and Liu,\n Shu and Shen,\n Xiaoyong and Fu,\n Chi-Wing and Jia,\n Jiaya\n},\n title = {\n Hierarchical Point-Edge Interaction Network for Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hierarchical Self-Attention Network for Action Localization in Videos", @@ -14951,7 +15425,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pramono_Hierarchical_Self-Attention_Network_for_Action_Localization_in_Videos_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pramono_Hierarchical_Self-Attention_Network_for_Action_Localization_in_Videos_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Pramono_2019_ICCV,\n \n author = {\n Pramono,\n Rizard Renanda Adhi and Chen,\n Yie-Tarng and Fang,\n Wen-Hsien\n},\n title = {\n Hierarchical Self-Attention Network for Action Localization in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hierarchical Shot Detector", @@ -14984,7 +15459,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Cao_2019_ICCV,\n \n author = {\n Cao,\n Jiale and Pang,\n Yanwei and Han,\n Jungong and Li,\n Xuelong\n},\n title = {\n Hierarchical Shot Detector\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hierarchy Parsing for Image Captioning", @@ -15010,14 +15486,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yao_Hierarchy_Parsing_for_Image_Captioning_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "JD", - "aff_unique_dep": "JD AI Research", + "aff_unique_norm": "JD AI Research", + "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2019_ICCV,\n \n author = {\n Yao,\n Ting and Pan,\n Yingwei and Li,\n Yehao and Mei,\n Tao\n},\n title = {\n Hierarchy Parsing for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hilbert-Based Generative Defense for Adversarial Examples", @@ -15050,7 +15527,8 @@ "aff_campus_unique_index": "0+0;0+0;1+0;0;0;0+0", "aff_campus_unique": "Shenzhen;Shanghai", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Bai_2019_ICCV,\n \n author = {\n Bai,\n Yang and Feng,\n Yan and Wang,\n Yisen and Dai,\n Tao and 
Xia,\n Shu-Tao and Jiang,\n Yong\n},\n title = {\n Hilbert-Based Generative Defense for Adversarial Examples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "5d9fcb496e", @@ -15079,7 +15557,8 @@ "aff_campus_unique_index": "0;0+1;0;0;1", "aff_campus_unique": "Toronto;St. Jacobs;", "aff_country_unique_index": "0;0+0;0+0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Chan_2019_ICCV,\n \n author = {\n Chan,\n Lyndon and Hosseini,\n Mahdi S. and Rowsell,\n Corwyn and Plataniotis,\n Konstantinos N. and Damaskinos,\n Savvas\n},\n title = {\n HistoSegNet: Semantic Segmentation of Histological Tissue Type in Whole Slide Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Holistic++ Scene Understanding: Single-View 3D Holistic Scene Parsing and Human Pose Estimation With Human-Object Interaction and Physical Commonsense", @@ -15112,7 +15591,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yixin and Huang,\n Siyuan and Yuan,\n Tao and Qi,\n Siyuan and Zhu,\n Yixin and Zhu,\n Song-Chun\n},\n title = {\n Holistic++ Scene Understanding: Single-View 3D Holistic Scene Parsing and Human Pose Estimation With Human-Object Interaction and Physical Commonsense\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HoloGAN: Unsupervised Learning of 3D Representations From Natural Images", @@ -15145,7 +15625,8 @@ "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0", - "aff_country_unique": "United Kingdom;Canada;United States" + "aff_country_unique": "United Kingdom;Canada;United States", + "bibtex": "@InProceedings{Nguyen-Phuoc_2019_ICCV,\n \n author = {\n Nguyen-Phuoc,\n Thu and Li,\n Chuan and Theis,\n Lucas and Richardt,\n Christian and Yang,\n Yong-Liang\n},\n title = {\n HoloGAN: Unsupervised Learning of 3D Representations From Natural Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Homography From Two Orientation- and Scale-Covariant Features", @@ -15153,7 +15634,7 @@ "status": "Poster", "track": "main", "pid": "2962", - "author_site": "D\u00c3\u00a1niel Bar\u00c3\u00a1th, Zuzana Kukelova", + "author_site": "Dániel Baráth, Zuzana Kukelova", "author": "Daniel Barath; Zuzana Kukelova", "abstract": "This paper proposes a geometric interpretation of the angles and scales which the orientation- and scale-covariant feature detectors, e.g. SIFT, provide. Two new general constraints are derived on the scales and rotations which can be used in any geometric model estimation tasks. Using these formulas, two new constraints on homography estimation are introduced. Exploiting the derived equations, a solver for estimating the homography from the minimal number of two correspondences is proposed. Also, it is shown how the normalization of the point correspondences affects the rotation and scale parameters, thus achieving numerically stable results. Due to requiring merely two feature pairs, robust estimators, e.g. RANSAC, do significantly fewer iterations than by using the four-point algorithm. When using covariant features, e.g. SIFT, this additional information is given at no cost. 
The method is tested in a synthetic environment and on publicly available real-world datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Barath_Homography_From_Two_Orientation-_and_Scale-Covariant_Features_ICCV_2019_paper.pdf", @@ -15178,7 +15659,8 @@ "aff_campus_unique_index": "0+1;0", "aff_campus_unique": "Prague;Budapest", "aff_country_unique_index": "0+1;0", - "aff_country_unique": "Czech Republic;Hungary" + "aff_country_unique": "Czech Republic;Hungary", + "bibtex": "@InProceedings{Barath_2019_ICCV,\n \n author = {\n Barath,\n Daniel and Kukelova,\n Zuzana\n},\n title = {\n Homography From Two Orientation- and Scale-Covariant Features\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "How Do Neural Networks See Depth in Single Images?", @@ -15211,7 +15693,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Delft", "aff_country_unique_index": "0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Dijk_2019_ICCV,\n \n author = {\n Dijk,\n Tom van and Croon,\n Guido de\n},\n title = {\n How Do Neural Networks See Depth in Single Images?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "HowTo100M: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips", @@ -15235,7 +15718,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Miech_HowTo100M_Learning_a_Text-Video_Embedding_by_Watching_Hundred_Million_Narrated_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Miech_HowTo100M_Learning_a_Text-Video_Embedding_by_Watching_Hundred_Million_Narrated_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Miech_2019_ICCV,\n \n 
author = {\n Miech,\n Antoine and Zhukov,\n Dimitri and Alayrac,\n Jean-Baptiste and Tapaswi,\n Makarand and Laptev,\n Ivan and Sivic,\n Josef\n},\n title = {\n HowTo100M: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Human Attention in Image Captioning: Dataset and Analysis", @@ -15259,7 +15743,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Human_Attention_in_Image_Captioning_Dataset_and_Analysis_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Human_Attention_in_Image_Captioning_Dataset_and_Analysis_ICCV_2019_paper.html", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Sen and Tavakoli,\n Hamed R. and Borji,\n Ali and Pugeault,\n Nicolas\n},\n title = {\n Human Attention in Image Captioning: Dataset and Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Human Mesh Recovery From Monocular Images via a Skeleton-Disentangled Representation", @@ -15285,14 +15770,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sun_Human_Mesh_Recovery_From_Monocular_Images_via_a_Skeleton-Disentangled_Representation_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;0;0;1", - "aff_unique_norm": "Harbin Institute of Technology;JD", - "aff_unique_dep": ";JD AI Research", + "aff_unique_norm": "Harbin Institute of Technology;JD AI Research", + "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;https://www.jd.com", "aff_unique_abbr": "HIT;JD AI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" 
+ "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Yu and Ye,\n Yun and Liu,\n Wu and Gao,\n Wenpeng and Fu,\n Yili and Mei,\n Tao\n},\n title = {\n Human Mesh Recovery From Monocular Images via a Skeleton-Disentangled Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Human Motion Prediction via Spatio-Temporal Inpainting", @@ -15300,7 +15786,7 @@ "status": "Poster", "track": "main", "pid": "5823", - "author_site": "Alejandro Hernandez, J\u00c3\u00bcrgen Gall, Francesc Moreno-Noguer", + "author_site": "Alejandro Hernandez, Jürgen Gall, Francesc Moreno-Noguer", "author": "Alejandro Hernandez; Jurgen Gall; Francesc Moreno-Noguer", "abstract": "We propose a Generative Adversarial Network (GAN) to forecast 3D human motion given a sequence of past 3D skeleton poses. While recent GANs have shown promising results, they can only forecast plausible motion over relatively short periods of time (few hundred milliseconds) and typically ignore the absolute position of the skeleton w.r.t. the camera. Our scheme provides long term predictions (two seconds or more) for both the body pose and its absolute position. Our approach builds upon three main contributions. First, we represent the data using a spatio-temporal tensor of 3D skeleton coordinates which allows formulating the prediction problem as an inpainting one, for which GANs work particularly well. Secondly, we design an architecture to learn the joint distribution of body poses and global motion, capable to hypothesize large chunks of the input 3D tensor with missing data. And finally, we argue that the L2 metric, considered so far by most approaches, fails to capture the actual distribution of long-term human motion. 
We propose two alternative metrics, based on the distribution of frequencies, that are able to capture more realistic motion patterns. Extensive experiments demonstrate our approach to significantly improve the state of the art, while also handling situations in which past observations are corrupted by occlusions, noise and missing frames.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Hernandez_Human_Motion_Prediction_via_Spatio-Temporal_Inpainting_ICCV_2019_paper.pdf", @@ -15318,14 +15804,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hernandez_Human_Motion_Prediction_via_Spatio-Temporal_Inpainting_ICCV_2019_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Institut de Rob\u00f2tica i Inform\u00e0tica Industrial;University of Bonn", + "aff_unique_norm": "Institut de Robòtica i Informàtica Industrial;University of Bonn", "aff_unique_dep": "CSIC-UPC;Computer Vision Group", "aff_unique_url": "http://www.iri.upc.edu/;https://www.uni-bonn.de", "aff_unique_abbr": "IRI;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Barcelona;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Spain;Germany" + "aff_country_unique": "Spain;Germany", + "bibtex": "@InProceedings{Hernandez_2019_ICCV,\n \n author = {\n Hernandez,\n Alejandro and Gall,\n Jurgen and Moreno-Noguer,\n Francesc\n},\n title = {\n Human Motion Prediction via Spatio-Temporal Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Human Uncertainty Makes Classification More Robust", @@ -15358,7 +15845,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Peterson_2019_ICCV,\n \n author = {\n Peterson,\n Joshua C. and Battleday,\n Ruairidh M. 
and Griffiths,\n Thomas L. and Russakovsky,\n Olga\n},\n title = {\n Human Uncertainty Makes Classification More Robust\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Human-Aware Motion Deblurring", @@ -15391,7 +15879,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;0+1;2;1;0", - "aff_country_unique": "United Arab Emirates;China;United States" + "aff_country_unique": "United Arab Emirates;China;United States", + "bibtex": "@InProceedings{Shen_2019_ICCV,\n \n author = {\n Shen,\n Ziyi and Wang,\n Wenguan and Lu,\n Xiankai and Shen,\n Jianbing and Ling,\n Haibin and Xu,\n Tingfa and Shao,\n Ling\n},\n title = {\n Human-Aware Motion Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hyperpixel Flow: Semantic Correspondence With Multi-Layer Neural Features", @@ -15403,7 +15892,7 @@ "author": "Juhong Min; Jongmin Lee; Jean Ponce; Minsu Cho", "abstract": "Establishing visual correspondences under large intra-class variations requires analyzing images at different levels, from features linked to semantics and context to local patterns, while being invariant to instance-specific details. To tackle these challenges, we represent images by \"hyperpixels\" that leverage a small number of relevant features selected among early to late layers of a convolutional neural network. Taking advantage of the condensed features of hyperpixels, we develop an effective real-time matching algorithm based on Hough geometric voting. 
The proposed method, hyperpixel flow, sets a new state of the art on three standard benchmarks as well as a new dataset, SPair-71k, which contains a significantly larger number of image pairs than existing datasets, with more accurate and richer annotations for in-depth analysis.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Min_Hyperpixel_Flow_Semantic_Correspondence_With_Multi-Layer_Neural_Features_ICCV_2019_paper.pdf", - "aff": "POSTECH+Neural Processing Research Center; POSTECH+Neural Processing Research Center; Inria+D\u00b4epartement d\u2019informatique de l\u2019ENS; POSTECH+Neural Processing Research Center", + "aff": "POSTECH+Neural Processing Research Center; POSTECH+Neural Processing Research Center; Inria+D´epartement d’informatique de l’ENS; POSTECH+Neural Processing Research Center", "project": "http://cvlab.postech.ac.kr/research/HPF/", "github": "", "supp": "", @@ -15417,14 +15906,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Min_Hyperpixel_Flow_Semantic_Correspondence_With_Multi-Layer_Neural_Features_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;2+3;0+1", - "aff_unique_norm": "Pohang University of Science and Technology;Neural Processing Research Center;INRIA;\u00c9cole Normale Sup\u00e9rieure", - "aff_unique_dep": ";;;D\u00e9partement d\u2019informatique", + "aff_unique_norm": "Pohang University of Science and Technology;Neural Processing Research Center;Inria;École Normale Supérieure", + "aff_unique_dep": ";;;Département d’informatique", "aff_unique_url": "https://www.postech.ac.kr;;https://www.inria.fr;https://www.ens.fr", "aff_unique_abbr": "POSTECH;;Inria;ENS", "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0;2+2;0", - "aff_country_unique": "South Korea;;France" + "aff_country_unique": "South Korea;;France", + "bibtex": "@InProceedings{Min_2019_ICCV,\n \n author = {\n Min,\n Juhong and Lee,\n Jongmin and Ponce,\n Jean and Cho,\n 
Minsu\n},\n title = {\n Hyperpixel Flow: Semantic Correspondence With Multi-Layer Neural Features\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Hyperspectral Image Reconstruction Using Deep External and Internal Learning", @@ -15457,7 +15947,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Tao and Fu,\n Ying and Wang,\n Lizhi and Huang,\n Hua\n},\n title = {\n Hyperspectral Image Reconstruction Using Deep External and Internal Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "IL2M: Class Incremental Learning With Dual Memory", @@ -15490,7 +15981,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Belouadah_2019_ICCV,\n \n author = {\n Belouadah,\n Eden and Popescu,\n Adrian\n},\n title = {\n IL2M: Class Incremental Learning With Dual Memory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "IMP: Instance Mask Projection for High Accuracy Semantic Segmentation of Things", @@ -15514,7 +16006,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fu_IMP_Instance_Mask_Projection_for_High_Accuracy_Semantic_Segmentation_of_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fu_IMP_Instance_Mask_Projection_for_High_Accuracy_Semantic_Segmentation_of_ICCV_2019_paper.html", + "bibtex": 
"@InProceedings{Fu_2019_ICCV,\n \n author = {\n Fu,\n Cheng-Yang and Berg,\n Tamara L. and Berg,\n Alexander C.\n},\n title = {\n IMP: Instance Mask Projection for High Accuracy Semantic Segmentation of Things\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Identity From Here, Pose From There: Self-Supervised Disentanglement and Generation of Objects Using Unlabeled Videos", @@ -15538,7 +16031,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xiao_Identity_From_Here_Pose_From_There_Self-Supervised_Disentanglement_and_Generation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xiao_Identity_From_Here_Pose_From_There_Self-Supervised_Disentanglement_and_Generation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Xiao_2019_ICCV,\n \n author = {\n Xiao,\n Fanyi and Liu,\n Haotian and Lee,\n Yong Jae\n},\n title = {\n Identity From Here,\n Pose From There: Self-Supervised Disentanglement and Generation of Objects Using Unlabeled Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Image Aesthetic Assessment Based on Pairwise Comparison A Unified Approach to Score Regression, Binary Classification, and Personalization", @@ -15571,7 +16065,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Jun-Tae and Kim,\n Chang-Su\n},\n title = {\n Image Aesthetic Assessment Based on Pairwise Comparison A Unified Approach to Score Regression,\n Binary Classification,\n and Personalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Image Generation From Small Datasets via Batch Statistics Adaptation", @@ -15604,7 +16099,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Noguchi_2019_ICCV,\n \n author = {\n Noguchi,\n Atsuhiro and Harada,\n Tatsuya\n},\n title = {\n Image Generation From Small Datasets via Batch Statistics Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Image Inpainting With Learnable Bidirectional Attention Maps", @@ -15630,14 +16126,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xie_Image_Inpainting_With_Learnable_Bidirectional_Attention_Maps_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;2;3;0+1;2;2;2", - "aff_unique_norm": "Harbin Institute of Technology;Pengcheng Laboratory;Baidu;Nankai University", - "aff_unique_dep": ";Peng Cheng Laboratory;Department of Computer Vision Technology (VIS);", + "aff_unique_norm": "Harbin Institute of Technology;Peng Cheng Laboratory;Baidu Inc.;Nankai University", + "aff_unique_dep": ";;Department of Computer Vision Technology (VIS);", "aff_unique_url": "http://www.hit.edu.cn/;http://www.pcl.ac.cn;https://www.baidu.com;http://www.nankai.edu.cn", "aff_unique_abbr": "HIT;PCL;Baidu;NKU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2019_ICCV,\n \n author = {\n Xie,\n Chaohao and Liu,\n Shaohui and Li,\n Chao and Cheng,\n Ming-Ming and Zuo,\n Wangmeng and Liu,\n Xiao and Wen,\n Shilei and Ding,\n Errui\n},\n title = {\n Image Inpainting With 
Learnable Bidirectional Attention Maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Image Synthesis From Reconfigurable Layout and Style", @@ -15670,7 +16167,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Wei and Wu,\n Tianfu\n},\n title = {\n Image Synthesis From Reconfigurable Layout and Style\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Image2StyleGAN: How to Embed Images Into the StyleGAN Latent Space?", @@ -15703,7 +16201,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Abdal_2019_ICCV,\n \n author = {\n Abdal,\n Rameen and Qin,\n Yipeng and Wonka,\n Peter\n},\n title = {\n Image2StyleGAN: How to Embed Images Into the StyleGAN Latent Space?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Imitation Learning for Human Pose Prediction", @@ -15727,7 +16226,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Imitation_Learning_for_Human_Pose_Prediction_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Imitation_Learning_for_Human_Pose_Prediction_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Borui and Adeli,\n Ehsan and Chiu,\n Hsu-kuang and Huang,\n De-An and Niebles,\n Juan Carlos\n},\n 
title = {\n Imitation Learning for Human Pose Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Implicit Surface Representations As Layers in Neural Networks", @@ -15760,7 +16260,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Michalkiewicz_2019_ICCV,\n \n author = {\n Michalkiewicz,\n Mateusz and Pontes,\n Jhony K. and Jack,\n Dominic and Baktashmotlagh,\n Mahsa and Eriksson,\n Anders\n},\n title = {\n Implicit Surface Representations As Layers in Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Improved Conditional VRNNs for Video Prediction", @@ -15772,7 +16273,7 @@ "author": "Lluis Castrejon; Nicolas Ballas; Aaron Courville", "abstract": "Predicting future frames for a video sequence is a challenging generative modeling task. Promising approaches include probabilistic latent variable models such as the Variational Auto-Encoder. While VAEs can handle uncertainty and model multiple possible future outcomes, they have a tendency to produce blurry predictions. In this work we argue that this is a sign of underfitting. To address this issue, we propose to increase the expressiveness of the latent distributions and to use higher capacity likelihood models. Our approach relies on a hierarchy of latent variables, which defines a family of flexible prior and posterior distributions in order to better model the probability of future sequences. We validate our proposal through a series of ablation experiments and compare our approach to current state-of-the-art latent variable models. 
Our method performs favorably under several metrics in three different datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Castrejon_Improved_Conditional_VRNNs_for_Video_Prediction_ICCV_2019_paper.pdf", - "aff": "Mila, Universit\u00e9 de Montr\u00e9al; Facebook AI Research; CIFAR, Mila, Universit\u00e9 de Montr\u00e9al", + "aff": "Mila, Université de Montréal; Facebook AI Research; CIFAR, Mila, Université de Montréal", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Castrejon_Improved_Conditional_VRNNs_ICCV_2019_supplemental.pdf", @@ -15786,14 +16287,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Castrejon_Improved_Conditional_VRNNs_for_Video_Prediction_ICCV_2019_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Meta", + "aff_unique_norm": "Université de Montréal;Facebook", "aff_unique_dep": "Mila;Facebook AI Research", "aff_unique_url": "https://umontreal.ca;https://research.facebook.com", "aff_unique_abbr": "UdeM;FAIR", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "Montr\u00e9al;", + "aff_campus_unique": "Montréal;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Castrejon_2019_ICCV,\n \n author = {\n Castrejon,\n Lluis and Ballas,\n Nicolas and Courville,\n Aaron\n},\n title = {\n Improved Conditional VRNNs for Video Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Improved Techniques for Training Adaptive Deep Networks", @@ -15817,7 +16319,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Improved_Techniques_for_Training_Adaptive_Deep_Networks_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Improved_Techniques_for_Training_Adaptive_Deep_Networks_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Hao and Zhang,\n Hong and Qi,\n Xiaojuan and Yang,\n Ruigang and Huang,\n Gao\n},\n title = {\n Improved Techniques for Training Adaptive Deep Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Improving Adversarial Robustness via Guided Complement Entropy", @@ -15850,7 +16353,8 @@ "aff_campus_unique_index": "0;0;0+0;1;1;1;1", "aff_campus_unique": "Taiwan;Mountain View", "aff_country_unique_index": "0;0;0+0;1;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Hao-Yun and Liang,\n Jhao-Hong and Chang,\n Shih-Chieh and Pan,\n Jia-Yu and Chen,\n Yu-Ting and Wei,\n Wei and Juan,\n Da-Cheng\n},\n title = {\n Improving Adversarial Robustness via Guided Complement Entropy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Improving Pedestrian Attribute Recognition With Weakly-Supervised Multi-Scale Attribute-Specific Localization", @@ -15862,7 +16366,7 @@ "author": "Chufeng Tang; Lu Sheng; Zhaoxiang Zhang; Xiaolin Hu", "abstract": "Pedestrian attribute recognition has been an emerging research topic in the area of video surveillance. To predict the existence of a particular attribute, it is demanded to localize the regions related to the attribute. However, in this task, the region annotations are not available. How to carve out these attribute-related regions remains challenging. 
Existing methods applied attribute-agnostic visual attention or heuristic body-part localization mechanisms to enhance the local feature representations, while neglecting to employ attributes to define local feature areas. We propose a flexible Attribute Localization Module (ALM) to adaptively discover the most discriminative regions and learns the regional features for each attribute at multiple levels. Moreover, a feature pyramid architecture is also introduced to enhance the attribute-specific localization at low-levels with high-level semantic guidance. The proposed framework does not require additional region annotations and can be trained end-to-end with multi-level deep supervision. Extensive experiments show that the proposed method achieves state-of-the-art results on three pedestrian attribute datasets, including PETA, RAP, and PA-100K.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Tang_Improving_Pedestrian_Attribute_Recognition_With_Weakly-Supervised_Multi-Scale_Attribute-Specific_Localization_ICCV_2019_paper.pdf", - "aff": "State Key Laboratory of Intelligent Technology and Systems, Institute for Arti\ufb01cial Intelligence, Department of Computer Science and Technology, Beijing National Research Center for Information Science and Technology, Tsinghua University; College of Software, Beihang University; Institute of Automation, Chinese Academy of Sciences; State Key Laboratory of Intelligent Technology and Systems, Institute for Arti\ufb01cial Intelligence, Department of Computer Science and Technology, Beijing National Research Center for Information Science and Technology, Tsinghua University", + "aff": "State Key Laboratory of Intelligent Technology and Systems, Institute for Artificial Intelligence, Department of Computer Science and Technology, Beijing National Research Center for Information Science and Technology, Tsinghua University; College of Software, Beihang University; Institute of Automation, Chinese Academy of Sciences; 
State Key Laboratory of Intelligent Technology and Systems, Institute for Artificial Intelligence, Department of Computer Science and Technology, Beijing National Research Center for Information Science and Technology, Tsinghua University", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Tang_Improving_Pedestrian_Attribute_ICCV_2019_supplemental.pdf", @@ -15883,7 +16387,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2019_ICCV,\n \n author = {\n Tang,\n Chufeng and Sheng,\n Lu and Zhang,\n Zhaoxiang and Hu,\n Xiaolin\n},\n title = {\n Improving Pedestrian Attribute Recognition With Weakly-Supervised Multi-Scale Attribute-Specific Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "InGAN: Capturing and Retargeting the \"DNA\" of a Natural Image", @@ -15907,7 +16412,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shocher_InGAN_Capturing_and_Retargeting_the_DNA_of_a_Natural_Image_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shocher_InGAN_Capturing_and_Retargeting_the_DNA_of_a_Natural_Image_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Shocher_2019_ICCV,\n \n author = {\n Shocher,\n Assaf and Bagon,\n Shai and Isola,\n Phillip and Irani,\n Michal\n},\n title = {\n InGAN: Capturing and Retargeting the \"DNA\" of a Natural Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Incremental Class Discovery for Semantic Segmentation With RGBD Sensing", @@ -15940,7 +16446,8 @@ "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Nakajima_2019_ICCV,\n \n author = {\n Nakajima,\n Yoshikatsu and Kang,\n Byeongkeun and Saito,\n Hideo and Kitani,\n Kris\n},\n title = {\n Incremental Class Discovery for Semantic Segmentation With RGBD Sensing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Incremental Learning Using Conditional Adversarial Networks", @@ -15973,7 +16480,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xiang_2019_ICCV,\n \n author = {\n Xiang,\n Ye and Fu,\n Ying and Ji,\n Pan and Huang,\n Hua\n},\n title = {\n Incremental Learning Using Conditional Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Indices Matter: Learning to Index for Deep Image Matting", @@ -15985,7 +16493,7 @@ "author": "Hao Lu; Yutong Dai; Chunhua Shen; Songcen Xu", "abstract": "We show that existing upsampling operators can be unified using the notion of the index function. This notion is inspired by an observation in the decoding process of deep image matting where indices-guided unpooling can often recover boundary details considerably better than other upsampling operators such as bilinear interpolation. 
By viewing the indices as a function of the feature map, we introduce the concept of 'learning to index', and present a novel index-guided encoder-decoder framework where indices are self-learned adaptively from data and are used to guide the pooling and upsampling operators, without extra training supervision. At the core of this framework is a flexible network module, termed IndexNet, which dynamically generates indices conditioned on the feature map. Due to its flexibility, IndexNet can be used as a plug-in applying to almost all off-the-shelf convolutional networks that have coupled downsampling and upsampling stages. We demonstrate the effectiveness of IndexNet on the task of natural image matting where the quality of learned indices can be visually observed from predicted alpha mattes. Results on the Composition-1k matting dataset show that our model built on MobileNetv2 exhibits at least 16.1% improvement over the seminal VGG-16 based deep matting baseline, with less training data and lower model capacity. 
Code and models have been made available at: https://tinyurl.com/IndexNetV1.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Lu_Indices_Matter_Learning_to_Index_for_Deep_Image_Matting_ICCV_2019_paper.pdf", - "aff": "The University of Adelaide, Australia; The University of Adelaide, Australia; The University of Adelaide, Australia; Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "The University of Adelaide, Australia; The University of Adelaide, Australia; The University of Adelaide, Australia; Noah’s Ark Lab, Huawei Technologies", "project": "https://tinyurl.com/IndexNetV1", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Lu_Indices_Matter_Learning_ICCV_2019_supplemental.pdf", @@ -15999,14 +16507,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lu_Indices_Matter_Learning_to_Index_for_Deep_Image_Matting_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "University of Adelaide;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_norm": "The University of Adelaide;Huawei Technologies", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.adelaide.edu.au;https://www.huawei.com", "aff_unique_abbr": "Adelaide;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Lu_2019_ICCV,\n \n author = {\n Lu,\n Hao and Dai,\n Yutong and Shen,\n Chunhua and Xu,\n Songcen\n},\n title = {\n Indices Matter: Learning to Index for Deep Image Matting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Information Entropy Based Feature Pooling for Convolutional Neural Networks", @@ -16039,7 +16548,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wan_2019_ICCV,\n \n author = {\n Wan,\n Weitao and Chen,\n Jiansheng and Li,\n Tianpeng and Huang,\n Yiqing and Tian,\n Jingqi and Yu,\n Cheng and Xue,\n Youze\n},\n title = {\n Information Entropy Based Feature Pooling for Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "InstaBoost: Boosting Instance Segmentation via Probability Map Guided Copy-Pasting", @@ -16063,7 +16573,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fang_InstaBoost_Boosting_Instance_Segmentation_via_Probability_Map_Guided_Copy-Pasting_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fang_InstaBoost_Boosting_Instance_Segmentation_via_Probability_Map_Guided_Copy-Pasting_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Fang_2019_ICCV,\n \n author = {\n Fang,\n Hao-Shu and Sun,\n Jianhua and Wang,\n Runzhong and Gou,\n Minghao and Li,\n Yong-Lu and Lu,\n Cewu\n},\n title = {\n InstaBoost: Boosting Instance Segmentation via Probability Map Guided Copy-Pasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Instance-Guided Context Rendering for Cross-Domain Person Re-Identification", @@ -16096,7 +16607,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom;" + "aff_country_unique": "United Kingdom;", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yanbei and Zhu,\n Xiatian and Gong,\n Shaogang\n},\n title = {\n Instance-Guided Context Rendering for Cross-Domain Person 
Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Instance-Level Future Motion Estimation in a Single Image Based on Ordinal Regression", @@ -16129,7 +16641,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2019_ICCV,\n \n author = {\n Kim,\n Kyung-Rae and Choi,\n Whan and Koh,\n Yeong Jun and Jeong,\n Seong-Gyun and Kim,\n Chang-Su\n},\n title = {\n Instance-Level Future Motion Estimation in a Single Image Based on Ordinal Regression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Integral Object Mining via Online Attention Accumulation", @@ -16155,14 +16668,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Jiang_Integral_Object_Mining_via_Online_Attention_Accumulation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;1;2", - "aff_unique_norm": "Nankai University;University of Technology Sydney;Shanghai Jiao Tong University", + "aff_unique_norm": "Nankai University;University of Technology, Sydney;Shanghai Jiaotong University", "aff_unique_dep": "Computer Science;;", "aff_unique_url": "http://www.nankai.edu.cn;https://www.uts.edu.au;https://www.sjtu.edu.cn", "aff_unique_abbr": "Nankai U;UTS;SJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Peng-Tao and Hou,\n Qibin and Cao,\n Yang and Cheng,\n Ming-Ming and Wei,\n Yunchao and Xiong,\n Hong-Kai\n},\n title = {\n Integral 
Object Mining via Online Attention Accumulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Interactive Sketch & Fill: Multiclass Sketch-to-Image Translation", @@ -16195,7 +16709,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0+1;1;1+1;0;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Ghosh_2019_ICCV,\n \n author = {\n Ghosh,\n Arnab and Zhang,\n Richard and Dokania,\n Puneet K. and Wang,\n Oliver and Efros,\n Alexei A. and Torr,\n Philip H. S. and Shechtman,\n Eli\n},\n title = {\n Interactive Sketch & Fill: Multiclass Sketch-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Interpolated Convolutional Networks for 3D Point Cloud Understanding", @@ -16221,14 +16736,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mao_Interpolated_Convolutional_Networks_for_3D_Point_Cloud_Understanding_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "CUHK-SenseTime Joint Laboratory", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Mao_2019_ICCV,\n \n author = {\n Mao,\n Jiageng and Wang,\n Xiaogang and Li,\n Hongsheng\n},\n title = {\n Interpolated Convolutional Networks for 3D Point Cloud Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Invariant Information Clustering for Unsupervised Image Classification and Segmentation", @@ -16236,7 +16752,7 @@ "status": "Poster", "track": "main", "pid": "3193", - "author_site": "Xu Ji, Jo\u00c3\u00a3o F. Henriques, Andrea Vedaldi", + "author_site": "Xu Ji, João F. Henriques, Andrea Vedaldi", "author": "Xu Ji; Joao F. Henriques; Andrea Vedaldi", "abstract": "We present a novel clustering objective that learns a neural network classifier from scratch, given only unlabelled data samples. The model discovers clusters that accurately match semantic classes, achieving state-of-the-art results in eight unsupervised clustering benchmarks spanning image classification and segmentation. These include STL10, an unsupervised variant of ImageNet, and CIFAR10, where we significantly beat the accuracy of our closest competitors by 6.6 and 9.5 absolute percentage points respectively. The method is not specialised to computer vision and operates on any paired dataset samples; in our experiments we use random transforms to obtain a pair from each image. The trained network directly outputs semantic labels, rather than high dimensional representations that need external processing to be usable for semantic clustering. The objective is simply to maximise mutual information between the class assignments of each pair. It is easy to implement and rigorously grounded in information theory, meaning we effortlessly avoid degenerate solutions that other clustering methods are susceptible to. In addition to the fully unsupervised mode, we also test two semi-supervised settings. The first achieves 88.8% accuracy on STL10 classification, setting a new global state-of-the-art over all existing methods (whether supervised, semi-supervised or unsupervised). 
The second shows robustness to 90% reductions in label coverage, of relevance to applications that wish to make use of small amounts of labels. github.com/xu-ji/IIC", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Ji_Invariant_Information_Clustering_for_Unsupervised_Image_Classification_and_Segmentation_ICCV_2019_paper.pdf", @@ -16261,7 +16777,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Ji_2019_ICCV,\n \n author = {\n Ji,\n Xu and Henriques,\n Joao F. and Vedaldi,\n Andrea\n},\n title = {\n Invariant Information Clustering for Unsupervised Image Classification and Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Is This the Right Place? Geometric-Semantic Pose Verification for Indoor Visual Localization", @@ -16285,7 +16802,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Taira_Is_This_the_Right_Place_Geometric-Semantic_Pose_Verification_for_Indoor_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Taira_Is_This_the_Right_Place_Geometric-Semantic_Pose_Verification_for_Indoor_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Taira_2019_ICCV,\n \n author = {\n Taira,\n Hajime and Rocco,\n Ignacio and Sedlar,\n Jiri and Okutomi,\n Masatoshi and Sivic,\n Josef and Pajdla,\n Tomas and Sattler,\n Torsten and Torii,\n Akihiko\n},\n title = {\n Is This the Right Place? 
Geometric-Semantic Pose Verification for Indoor Visual Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Is an Affine Constraint Needed for Affine Subspace Clustering?", @@ -16293,7 +16811,7 @@ "status": "Poster", "track": "main", "pid": "4088", - "author_site": "Chong You, Chun-Guang Li, Daniel P. Robinson, Ren\u00c3\u00a9 Vidal", + "author_site": "Chong You, Chun-Guang Li, Daniel P. Robinson, René Vidal", "author": "Chong You; Chun-Guang Li; Daniel P. Robinson; Rene Vidal", "abstract": "Subspace clustering methods based on expressing each data point as a linear combination of other data points have achieved great success in computer vision applications such as motion segmentation, face and digit clustering. In face clustering, the subspaces are linear and subspace clustering methods can be applied directly. In motion segmentation, the subspaces are affine and an additional affine constraint on the coefficients is often enforced. However, since affine subspaces can always be embedded into linear subspaces of one extra dimension, it is unclear if the affine constraint is really necessary. This paper shows, both theoretically and empirically, that when the dimension of the ambient space is high relative to the sum of the dimensions of the affine subspaces, the affine constraint has a negligible effect on clustering performance. Specifically, our analysis provides conditions that guarantee the correctness of affine subspace clustering methods both with and without the affine constraint, and shows that these conditions are satisfied for high-dimensional data. 
Underlying our analysis is the notion of affinely independent subspaces, which not only provides geometrically interpretable correctness conditions, but also clarifies the relationships between existing results for affine subspace clustering.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/You_Is_an_Affine_Constraint_Needed_for_Affine_Subspace_Clustering_ICCV_2019_paper.pdf", @@ -16309,7 +16827,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/You_Is_an_Affine_Constraint_Needed_for_Affine_Subspace_Clustering_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/You_Is_an_Affine_Constraint_Needed_for_Affine_Subspace_Clustering_ICCV_2019_paper.html", + "bibtex": "@InProceedings{You_2019_ICCV,\n \n author = {\n You,\n Chong and Li,\n Chun-Guang and Robinson,\n Daniel P. and Vidal,\n Rene\n},\n title = {\n Is an Affine Constraint Needed for Affine Subspace Clustering?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "JPEG Artifacts Reduction via Deep Convolutional Sparse Coding", @@ -16342,7 +16861,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fu_2019_ICCV,\n \n author = {\n Fu,\n Xueyang and Zha,\n Zheng-Jun and Wu,\n Feng and Ding,\n Xinghao and Paisley,\n John\n},\n title = {\n JPEG Artifacts Reduction via Deep Convolutional Sparse Coding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "5613eeb747", @@ -16371,7 +16891,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Beijing;Cardiff", "aff_country_unique_index": "0;0;0;1;0;0;0", - 
"aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Xiaoping and Wen,\n Ni and Liang,\n Jie and Lai,\n Yu-Kun and She,\n Dongyu and Cheng,\n Ming-Ming and Yang,\n Jufeng\n},\n title = {\n Joint Acne Image Grading and Counting via Label Distribution Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Demosaicking and Denoising by Fine-Tuning of Bursts of Raw Images", @@ -16383,7 +16904,7 @@ "author": "Thibaud Ehret; Axel Davy; Pablo Arias; Gabriele Facciolo", "abstract": "Demosaicking and denoising are the first steps of any camera image processing pipeline and are key for obtaining high quality RGB images. A promising current research trend aims at solving these two problems jointly using convolutional neural networks. Due to the unavailability of ground truth data these networks cannot be currently trained using real RAW images. Instead, they resort to simulated data. In this paper we present a method to learn demosaicking directly from mosaicked images, without requiring ground truth RGB data. We apply this to learn joint demosaicking and denoising only from RAW images, thus enabling the use of real data. 
In addition we show that for this application fine-tuning a network to a specific burst improves the quality of restoration for both demosaicking and denoising.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Ehret_Joint_Demosaicking_and_Denoising_by_Fine-Tuning_of_Bursts_of_Raw_ICCV_2019_paper.pdf", - "aff": "CMLA, CNRS, ENS Paris-Saclay, Universit\u00e9 Paris-Saclay; Universit\u00e9 Paris-Saclay, 94235 Cachan, France; ; ", + "aff": "CMLA, CNRS, ENS Paris-Saclay, Université Paris-Saclay; Université Paris-Saclay, 94235 Cachan, France; ; ", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Ehret_Joint_Demosaicking_and_ICCV_2019_supplemental.pdf", @@ -16396,15 +16917,16 @@ "email": "ens-cachan.fr; ; ; ", "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ehret_Joint_Demosaicking_and_Denoising_by_Fine-Tuning_of_Bursts_of_Raw_ICCV_2019_paper.html", - "aff_unique_index": "0", - "aff_unique_norm": "ENS Paris-Saclay;", + "aff_unique_index": "0;1", + "aff_unique_norm": "ENS Paris-Saclay;Université Paris-Saclay", "aff_unique_dep": ";", - "aff_unique_url": "https://www.ens-paris-saclay.fr;", - "aff_unique_abbr": "ENS Paris-Saclay;", - "aff_campus_unique_index": "0", - "aff_campus_unique": "Paris-Saclay;", - "aff_country_unique_index": "0", - "aff_country_unique": "France;" + "aff_unique_url": "https://www.ens-paris-saclay.fr;https://www.universite-paris-saclay.fr", + "aff_unique_abbr": "ENS Paris-Saclay;UPS", + "aff_campus_unique_index": "0;1", + "aff_campus_unique": "Paris-Saclay;Cachan", + "aff_country_unique_index": "0;0", + "aff_country_unique": "France", + "bibtex": "@InProceedings{Ehret_2019_ICCV,\n \n author = {\n Ehret,\n Thibaud and Davy,\n Axel and Arias,\n Pablo and Facciolo,\n Gabriele\n},\n title = {\n Joint Demosaicking and Denoising by Fine-Tuning of Bursts of Raw Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Embedding of 3D Scan and CAD Objects", @@ -16412,7 +16934,7 @@ "status": "Poster", "track": "main", "pid": "2732", - "author_site": "Manuel Dahnert, Angela Dai, Leonidas J. Guibas, Matthias Nie\u00c3\u009fner", + "author_site": "Manuel Dahnert, Angela Dai, Leonidas J. Guibas, Matthias Nießner", "author": "Manuel Dahnert; Angela Dai; Leonidas J. Guibas; Matthias Niessner", "abstract": "3D scan geometry and CAD models often contain complementary information towards understanding environments, which could be leveraged through establishing a mapping between the two domains. However, this is a challenging task due to strong, lower-level differences between scan and CAD geometry. We propose a novel approach to learn a joint embedding space between scan and CAD geometry, where semantically similar objects from both domains lie close together. To achieve this, we introduce a new 3D CNN-based approach to learn a joint embedding space representing object similarities across these domains. To learn a shared space where scan objects and CAD models can interlace, we propose a stacked hourglass approach to separate foreground and background from a scan object, and transform it to a complete, CAD-like representation to produce a shared embedding space. This embedding space can then be used for CAD model retrieval; to further enable this task, we introduce a new dataset of ranked scan-CAD similarity annotations, enabling new, fine-grained evaluation of CAD model retrieval to cluttered, noisy, partial scans. 
Our learned joint embedding outperforms current state of the art for CAD model retrieval by 12% in instance retrieval accuracy.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Dahnert_Joint_Embedding_of_3D_Scan_and_CAD_Objects_ICCV_2019_paper.pdf", @@ -16428,7 +16950,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Dahnert_Joint_Embedding_of_3D_Scan_and_CAD_Objects_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Dahnert_Joint_Embedding_of_3D_Scan_and_CAD_Objects_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Dahnert_2019_ICCV,\n \n author = {\n Dahnert,\n Manuel and Dai,\n Angela and Guibas,\n Leonidas J. and Niessner,\n Matthias\n},\n title = {\n Joint Embedding of 3D Scan and CAD Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Group Feature Selection and Discriminative Filter Learning for Robust Visual Object Tracking", @@ -16461,7 +16984,8 @@ "aff_campus_unique_index": "0+1;1;0;1", "aff_campus_unique": "Wuxi;Guildford", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Tianyang and Feng,\n Zhen-Hua and Wu,\n Xiao-Jun and Kittler,\n Josef\n},\n title = {\n Joint Group Feature Selection and Discriminative Filter Learning for Robust Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Learning of Saliency Detection and Weakly Supervised Semantic Segmentation", @@ -16494,7 +17018,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Zeng_2019_ICCV,\n \n author = {\n Zeng,\n Yu and Zhuge,\n Yunzhi and Lu,\n Huchuan and Zhang,\n Lihe\n},\n title = {\n Joint Learning of Saliency Detection and Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Learning of Semantic Alignment and Object Landmark Detection", @@ -16506,7 +17031,7 @@ "author": "Sangryul Jeon; Dongbo Min; Seungryong Kim; Kwanghoon Sohn", "abstract": "Convolutional neural networks (CNNs) based approaches for semantic alignment and object landmark detection have improved their performance significantly. Current efforts for the two tasks focus on addressing the lack of massive training data through weakly- or unsupervised learning frameworks. In this paper, we present a joint learning approach for obtaining dense correspondences and discovering object landmarks from semantically similar images. Based on the key insight that the two tasks can mutually provide supervisions to each other, our networks accomplish this through a joint loss function that alternatively imposes a consistency constraint between the two tasks, thereby boosting the performance and addressing the lack of training data in a principled manner. To the best of our knowledge, this is the first attempt to address the lack of training data for the two tasks through the joint learning. To further improve the robustness of our framework, we introduce a probabilistic learning formulation that allows only reliable matches to be used in the joint learning process. 
With the proposed method, state-of-the-art performance is attained on several benchmarks for semantic matching and landmark detection.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Jeon_Joint_Learning_of_Semantic_Alignment_and_Object_Landmark_Detection_ICCV_2019_paper.pdf", - "aff": "Yonsei University; Ewha Womans University; \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL); Yonsei University", + "aff": "Yonsei University; Ewha Womans University; École Polytechnique Fédérale de Lausanne (EPFL); Yonsei University", "project": "", "github": "", "supp": "", @@ -16520,14 +17045,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Jeon_Joint_Learning_of_Semantic_Alignment_and_Object_Landmark_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Yonsei University;Ewha Womans University;EPFL", + "aff_unique_norm": "Yonsei University;Ewha Womans University;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";;", "aff_unique_url": "https://www.yonsei.ac.kr;http://www.ewha.ac.kr;https://www.epfl.ch", "aff_unique_abbr": "Yonsei;Ewha;EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "South Korea;Switzerland" + "aff_country_unique": "South Korea;Switzerland", + "bibtex": "@InProceedings{Jeon_2019_ICCV,\n \n author = {\n Jeon,\n Sangryul and Min,\n Dongbo and Kim,\n Seungryong and Sohn,\n Kwanghoon\n},\n title = {\n Joint Learning of Semantic Alignment and Object Landmark Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Monocular 3D Vehicle Detection and Tracking", @@ -16535,7 +17061,7 @@ "status": "Poster", "track": "main", "pid": "839", - "author_site": "Hou-Ning Hu, Qi-Zhi Cai, Dequan Wang, Ji Lin, Min Sun, Philipp Kr\u00c3\u00a4henb\u00c3\u00bchl, Trevor Darrell, 
Fisher Yu", + "author_site": "Hou-Ning Hu, Qi-Zhi Cai, Dequan Wang, Ji Lin, Min Sun, Philipp Krähenbühl, Trevor Darrell, Fisher Yu", "author": "Hou-Ning Hu; Qi-Zhi Cai; Dequan Wang; Ji Lin; Min Sun; Philipp Krahenbuhl; Trevor Darrell; Fisher Yu", "abstract": "Vehicle 3D extents and trajectories are critical cues for predicting the future location of vehicles and planning future agent ego-motion based on those predictions. In this paper, we propose a novel online framework for 3D vehicle detection and tracking from monocular videos. The framework can not only associate detections of vehicles in motion over time, but also estimate their complete 3D bounding box information from a sequence of 2D images captured on a moving platform. Our method leverages 3D box depth-ordering matching for robust instance association and utilizes 3D trajectory prediction for re-identification of occluded vehicles. We also design a motion learning module based on an LSTM for more accurate long-term motion extrapolation. Our experiments on simulation, KITTI, and Argoverse datasets show that our 3D tracking pipeline offers robust data association and tracking. 
On Argoverse, our image-based method is significantly better for tracking 3D vehicles within 30 meters than the LiDAR-centric baseline methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Hu_Joint_Monocular_3D_Vehicle_Detection_and_Tracking_ICCV_2019_paper.pdf", @@ -16551,7 +17077,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Joint_Monocular_3D_Vehicle_Detection_and_Tracking_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Joint_Monocular_3D_Vehicle_Detection_and_Tracking_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Hu_2019_ICCV,\n \n author = {\n Hu,\n Hou-Ning and Cai,\n Qi-Zhi and Wang,\n Dequan and Lin,\n Ji and Sun,\n Min and Krahenbuhl,\n Philipp and Darrell,\n Trevor and Yu,\n Fisher\n},\n title = {\n Joint Monocular 3D Vehicle Detection and Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Optimization for Cooperative Image Captioning", @@ -16577,14 +17104,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Vered_Joint_Optimization_for_Cooperative_Image_Captioning_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0+1", - "aff_unique_norm": "Bar-Ilan University;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "Bar-Ilan University;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.biu.ac.il;https://www.nvidia.com", "aff_unique_abbr": "BIU;NVIDIA", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0+1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Vered_2019_ICCV,\n \n author = {\n Vered,\n Gilad and Oren,\n Gal and Atzmon,\n Yuval and Chechik,\n Gal\n},\n title = {\n Joint 
Optimization for Cooperative Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Prediction for Kinematic Trajectories in Vehicle-Pedestrian-Mixed Scenes", @@ -16617,7 +17145,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+1;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Bi_2019_ICCV,\n \n author = {\n Bi,\n Huikun and Fang,\n Zhong and Mao,\n Tianlu and Wang,\n Zhaoqi and Deng,\n Zhigang\n},\n title = {\n Joint Prediction for Kinematic Trajectories in Vehicle-Pedestrian-Mixed Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Joint Syntax Representation Learning and Visual Cue Translation for Video Captioning", @@ -16650,7 +17179,8 @@ "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Beijing;Rochester", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Hou_2019_ICCV,\n \n author = {\n Hou,\n Jingyi and Wu,\n Xinxiao and Zhao,\n Wentian and Luo,\n Jiebo and Jia,\n Yunde\n},\n title = {\n Joint Syntax Representation Learning and Visual Cue Translation for Video Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Jointly Aligning Millions of Images With Deep Penalised Reconstruction Congealing", @@ -16683,7 +17213,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": 
"@InProceedings{Annunziata_2019_ICCV,\n \n author = {\n Annunziata,\n Roberto and Sagonas,\n Christos and Cali,\n Jacques\n},\n title = {\n Jointly Aligning Millions of Images With Deep Penalised Reconstruction Congealing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "K-Best Transformation Synchronization", @@ -16716,7 +17247,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Yifan and Zhuo,\n Jiacheng and Mohan,\n Arnav and Huang,\n Qixing\n},\n title = {\n K-Best Transformation Synchronization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "KPConv: Flexible and Deformable Convolution for Point Clouds", @@ -16724,7 +17256,7 @@ "status": "Poster", "track": "main", "pid": "5957", - "author_site": "Hugues Thomas, Charles R. Qi, Jean-Emmanuel Deschaud, Beatriz Marcotegui, Fran\u00c3\u00a7ois Goulette, Leonidas J. Guibas", + "author_site": "Hugues Thomas, Charles R. Qi, Jean-Emmanuel Deschaud, Beatriz Marcotegui, François Goulette, Leonidas J. Guibas", "author": "Hugues Thomas; Charles R. Qi; Jean-Emmanuel Deschaud; Beatriz Marcotegui; Francois Goulette; Leonidas J. Guibas", "abstract": "We present Kernel Point Convolution (KPConv), a new design of point convolution, i.e. that operates on point clouds without any intermediate representation. The convolution weights of KPConv are located in Euclidean space by kernel points, and applied to the input points close to them. Its capacity to use any number of kernel points gives KPConv more flexibility than fixed grid convolutions. 
Furthermore, these locations are continuous in space and can be learned by the network. Therefore, KPConv can be extended to deformable convolutions that learn to adapt kernel points to local geometry. Thanks to a regular subsampling strategy, KPConv is also efficient and robust to varying densities. Whether they use deformable KPConv for complex tasks, or rigid KPconv for simpler tasks, our networks outperform state-of-the-art classification and segmentation approaches on several datasets. We also offer ablation studies and visualizations to provide understanding of what has been learned by KPConv and to validate the descriptive power of deformable KPConv.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Thomas_KPConv_Flexible_and_Deformable_Convolution_for_Point_Clouds_ICCV_2019_paper.pdf", @@ -16742,14 +17274,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Thomas_KPConv_Flexible_and_Deformable_Convolution_for_Point_Clouds_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;0;1+2", - "aff_unique_norm": "MINES ParisTech;Meta;Stanford University", + "aff_unique_norm": "Mines ParisTech;Facebook;Stanford University", "aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.mines-paristech.fr;https://research.facebook.com;https://www.stanford.edu", "aff_unique_abbr": "Mines ParisTech;FAIR;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;0;0;0;1+1", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Thomas_2019_ICCV,\n \n author = {\n Thomas,\n Hugues and Qi,\n Charles R. 
and Deschaud,\n Jean-Emmanuel and Marcotegui,\n Beatriz and Goulette,\n Francois and Guibas,\n Leonidas J.\n},\n title = {\n KPConv: Flexible and Deformable Convolution for Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Kernel Modeling Super-Resolution on Real Low-Resolution Images", @@ -16757,7 +17290,7 @@ "status": "Poster", "track": "main", "pid": "3508", - "author_site": "Ruofan Zhou, Sabine S\u00c3\u00bcsstrunk", + "author_site": "Ruofan Zhou, Sabine Süsstrunk", "author": "Ruofan Zhou; Sabine Susstrunk", "abstract": "Deep convolutional neural networks (CNNs), trained on corresponding pairs of high- and low-resolution images, achieve state-of-the-art performance in single-image super-resolution and surpass previous signal-processing based approaches. However, their performance is limited when applied to real photographs. The reason lies in their training data: low-resolution (LR) images are obtained by bicubic interpolation of the corresponding high-resolution (HR) images. The applied convolution kernel significantly differs from real-world camera-blur. Consequently, while current CNNs well super-resolve bicubic-downsampled LR images, they often fail on camera-captured LR images. To improve generalization and robustness of deep super-resolution CNNs on real photographs, we present a kernel modeling super-resolution network (KMSR) that incorporates blur-kernel modeling in the training. Our proposed KMSR consists of two stages: we first build a pool of realistic blur-kernels with a generative adversarial network (GAN) and then we train a super-resolution network with HR and corresponding LR images constructed with the generated kernels. 
Our extensive experimental validations demonstrate the effectiveness of our single-image super-resolution approach on photographs with unknown blur-kernels.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhou_Kernel_Modeling_Super-Resolution_on_Real_Low-Resolution_Images_ICCV_2019_paper.pdf", @@ -16775,14 +17308,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Kernel_Modeling_Super-Resolution_on_Real_Low-Resolution_Images_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne", "aff_unique_dep": "IC (Computer Science Department)", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Ruofan and Susstrunk,\n Sabine\n},\n title = {\n Kernel Modeling Super-Resolution on Real Low-Resolution Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Key.Net: Keypoint Detection by Handcrafted and Learned CNN Filters", @@ -16815,7 +17349,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2;1;0", - "aff_country_unique": "United Kingdom;Spain;United States" + "aff_country_unique": "United Kingdom;Spain;United States", + "bibtex": "@InProceedings{Barroso-Laguna_2019_ICCV,\n \n author = {\n Barroso-Laguna,\n Axel and Riba,\n Edgar and Ponsa,\n Daniel and Mikolajczyk,\n Krystian\n},\n title = {\n Key.Net: Keypoint Detection by Handcrafted and Learned CNN Filters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": 
"Knowledge Distillation via Route Constrained Optimization", @@ -16848,7 +17383,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jin_2019_ICCV,\n \n author = {\n Jin,\n Xiao and Peng,\n Baoyun and Wu,\n Yichao and Liu,\n Yu and Liu,\n Jiaheng and Liang,\n Ding and Yan,\n Junjie and Hu,\n Xiaolin\n},\n title = {\n Knowledge Distillation via Route Constrained Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "LADN: Local Adversarial Disentangling Network for Facial Makeup and De-Makeup", @@ -16874,14 +17410,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gu_LADN_Local_Adversarial_Disentangling_Network_for_Facial_Makeup_and_De-Makeup_ICCV_2019_paper.html", "aff_unique_index": "0+1;2+1;3;4;1", - "aff_unique_norm": "Carnegie Mellon University;Hong Kong University of Science and Technology;Stanford University;University of Illinois Urbana-Champaign;Tencent", - "aff_unique_dep": ";;;;Tencent Holdings Limited", + "aff_unique_norm": "Carnegie Mellon University;Hong Kong University of Science and Technology;Stanford University;University of Illinois at Urbana-Champaign;Tencent Holdings Limited", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.cmu.edu;https://www.ust.hk;https://www.stanford.edu;https://www.illinois.edu;https://www.tencent.com", "aff_unique_abbr": "CMU;HKUST;Stanford;UIUC;Tencent", "aff_campus_unique_index": "1;2+1;3;1", "aff_campus_unique": ";Hong Kong SAR;Stanford;Urbana-Champaign", "aff_country_unique_index": "0+1;0+1;0;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Gu_2019_ICCV,\n \n author = {\n Gu,\n Qiao and Wang,\n Guanzhi and Chiu,\n 
Mang Tik and Tai,\n Yu-Wing and Tang,\n Chi-Keung\n},\n title = {\n LADN: Local Adversarial Disentangling Network for Facial Makeup and De-Makeup\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "LAP-Net: Level-Aware Progressive Network for Image Dehazing", @@ -16893,7 +17430,7 @@ "author": "Yunan Li; Qiguang Miao; Wanli Ouyang; Zhenxin Ma; Huijuan Fang; Chao Dong; Yining Quan", "abstract": "In this paper, we propose a level-aware progressive network (LAP-Net) for single image dehazing. Unlike previous multi-stage algorithms that generally learn in a coarse-to-fine fashion, each stage of LAP-Net learns different levels of haze with different supervision. Then the network can progressively learn the gradually aggravating haze. With this design, each stage can focus on a region with specific haze level and restore clear details. To effectively fuse the results of varying haze levels at different stages, we develop an adaptive integration strategy to yield the final dehazed image. This strategy is achieved by a hierarchical integration scheme, which is in cooperation with the memory network and the domain knowledge of dehazing to highlight the best-restored regions of each stage. 
Extensive experiments on both real-world images and two dehazing benchmarks validate the effectiveness of our proposed method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_LAP-Net_Level-Aware_Progressive_Network_for_Image_Dehazing_ICCV_2019_paper.pdf", - "aff": "School of Computer Science and Technology, Xidian Univeristy, China+Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China; School of Computer Science and Technology, Xidian Univeristy, China+Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China; The University of Sydney, SenseTime Computer Vision Research Group, Australia; School of Computer Science and Technology, Xidian Univeristy, China+Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China; School of Computer Science and Technology, Xidian Univeristy, China+Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China; Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, China; School of Computer Science and Technology, Xidian Univeristy, China+Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China", + "aff": "School of Computer Science and Technology, Xidian Univeristy, China+Xi’an Key Laboratory of Big Data and Intelligent Vision, China; School of Computer Science and Technology, Xidian Univeristy, China+Xi’an Key Laboratory of Big Data and Intelligent Vision, China; The University of Sydney, SenseTime Computer Vision Research Group, Australia; School of Computer Science and Technology, Xidian Univeristy, China+Xi’an Key Laboratory of Big Data and Intelligent Vision, China; School of Computer Science and Technology, Xidian Univeristy, China+Xi’an Key Laboratory of Big Data and Intelligent Vision, China; Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, China; School of Computer Science and Technology, Xidian Univeristy, China+Xi’an Key Laboratory of Big Data and Intelligent Vision, China", "project": "", "github": "", "supp": 
"http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Li_LAP-Net_Level-Aware_Progressive_ICCV_2019_supplemental.pdf", @@ -16907,14 +17444,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_LAP-Net_Level-Aware_Progressive_Network_for_Image_Dehazing_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;2;0+1;0+1;3;0+1", - "aff_unique_norm": "Xidian University;Xi'an Key Laboratory of Big Data and Intelligent Vision;University of Sydney;Shenzhen Institute of Advanced Technology", + "aff_unique_norm": "Xidian University;Xi'an Key Laboratory of Big Data and Intelligent Vision;The University of Sydney;Shenzhen Institutes of Advanced Technology", "aff_unique_dep": "School of Computer Science and Technology;Key Laboratory of Big Data and Intelligent Vision;;", "aff_unique_url": "http://www.xidian.edu.cn;;https://www.sydney.edu.au;http://www.siat.cas.cn", "aff_unique_abbr": "Xidian;;USYD;SIAT", "aff_campus_unique_index": ";;;;1;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;1;0+0;0+0;0;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Yunan and Miao,\n Qiguang and Ouyang,\n Wanli and Ma,\n Zhenxin and Fang,\n Huijuan and Dong,\n Chao and Quan,\n Yining\n},\n title = {\n LAP-Net: Level-Aware Progressive Network for Image Dehazing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "LIP: Local Importance-Based Pooling", @@ -16947,7 +17485,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Ziteng and Wang,\n Limin and Wu,\n Gangshan\n},\n title = {\n LIP: Local Importance-Based Pooling\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "LPD-Net: 3D Point Cloud Learning for Large-Scale Place Recognition and Environment Analysis", @@ -16973,14 +17512,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_LPD-Net_3D_Point_Cloud_Learning_for_Large-Scale_Place_Recognition_and_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;0;2;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;Carnegie Mellon University;Shanghai Jiao Tong University", + "aff_unique_norm": "The Chinese University of Hong Kong;Carnegie Mellon University;Shanghai Jiao Tong University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.cmu.edu;https://www.sjtu.edu.cn", "aff_unique_abbr": "CUHK;CMU;SJTU", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Zhe and Zhou,\n Shunbo and Suo,\n Chuanzhe and Yin,\n Peng and Chen,\n Wen and Wang,\n Hesheng and Li,\n Haoang and Liu,\n Yun-Hui\n},\n title = {\n LPD-Net: 3D Point Cloud Learning for Large-Scale Place Recognition and Environment Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Label-PEnet: Sequential Label Propagation and Enhancement Networks for Weakly Supervised Instance Segmentation", @@ -17006,14 +17546,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ge_Label-PEnet_Sequential_Label_Propagation_and_Enhancement_Networks_for_Weakly_Supervised_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;0+1;0+1;0+1", - "aff_unique_norm": "Malong Technologies;Malong Artificial 
Intelligence Research Center;University of Hong Kong", + "aff_unique_norm": "Malong Technologies;Malong Artificial Intelligence Research Center;The University of Hong Kong", "aff_unique_dep": ";Artificial Intelligence Research;", "aff_unique_url": ";;https://www.hku.hk", "aff_unique_abbr": ";;HKU", "aff_campus_unique_index": "1+2;1;1;1", "aff_campus_unique": ";Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0+0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ge_2019_ICCV,\n \n author = {\n Ge,\n Weifeng and Guo,\n Sheng and Huang,\n Weilin and Scott,\n Matthew R.\n},\n title = {\n Label-PEnet: Sequential Label Propagation and Enhancement Networks for Weakly Supervised Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Language Features Matter: Effective Language Representations for Vision-Language Tasks", @@ -17046,7 +17587,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Burns_2019_ICCV,\n \n author = {\n Burns,\n Andrea and Tan,\n Reuben and Saenko,\n Kate and Sclaroff,\n Stan and Plummer,\n Bryan A.\n},\n title = {\n Language Features Matter: Effective Language Representations for Vision-Language Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Language-Agnostic Visual-Semantic Embeddings", @@ -17054,11 +17596,11 @@ "status": "Poster", "track": "main", "pid": "6209", - "author_site": "J\u00c3\u00b4natas Wehrmann, Douglas M. Souza, Maur\u00c3\u00adcio A. Lopes, Rodrigo C. Barros", + "author_site": "Jônatas Wehrmann, Douglas M. Souza, Maurício A. Lopes, Rodrigo C. 
Barros", "author": "Jonatas Wehrmann; Douglas M. Souza; Mauricio A. Lopes; Rodrigo C. Barros", "abstract": "This paper proposes a framework for training language-invariant cross-modal retrieval models. We also introduce a novel character-based word-embedding approach, allowing the model to project similar words across languages into the same word-embedding space. In addition, by performing cross-modal retrieval at the character level, the storage requirements for a text encoder decrease substantially, allowing for lighter and more scalable retrieval architectures. The proposed language-invariant textual encoder based on characters is virtually unaffected in terms of storage requirements when novel languages are added to the system. Our contributions include new methods for building character-level-based word-embeddings, an improved loss function, and a novel cross-language alignment module that not only makes the architecture language-invariant, but also presents better predictive performance. We show that our models outperform the current state-of-the-art in both single and multi-language scenarios. This work can be seen as the basis of a new path on retrieval research, now allowing for the effective use of captions in multiple-language scenarios. 
Code is available at https://github.com/jwehrmann/lavse.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wehrmann_Language-Agnostic_Visual-Semantic_Embeddings_ICCV_2019_paper.pdf", - "aff": "School of Technology, Pontif \u00b4\u0131cia Universidade Cat \u00b4olica do Rio Grande do Sul; School of Technology, Pontif \u00b4\u0131cia Universidade Cat \u00b4olica do Rio Grande do Sul; School of Technology, Pontif \u00b4\u0131cia Universidade Cat \u00b4olica do Rio Grande do Sul; School of Technology, Pontif \u00b4\u0131cia Universidade Cat \u00b4olica do Rio Grande do Sul", + "aff": "School of Technology, Pontif ´ıcia Universidade Cat ´olica do Rio Grande do Sul; School of Technology, Pontif ´ıcia Universidade Cat ´olica do Rio Grande do Sul; School of Technology, Pontif ´ıcia Universidade Cat ´olica do Rio Grande do Sul; School of Technology, Pontif ´ıcia Universidade Cat ´olica do Rio Grande do Sul", "project": "", "github": "https://github.com/jwehrmann/lavse", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Wehrmann_Language-Agnostic_Visual-Semantic_Embeddings_ICCV_2019_supplemental.pdf", @@ -17072,14 +17614,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wehrmann_Language-Agnostic_Visual-Semantic_Embeddings_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Pontif\u00edcia Universidade Cat\u00f3lica do Rio Grande do Sul", + "aff_unique_norm": "Pontifícia Universidade Católica do Rio Grande do Sul", "aff_unique_dep": "School of Technology", "aff_unique_url": "https://www.pucrs.br", "aff_unique_abbr": "PUCRS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Brazil" + "aff_country_unique": "Brazil", + "bibtex": "@InProceedings{Wehrmann_2019_ICCV,\n \n author = {\n Wehrmann,\n Jonatas and Souza,\n Douglas M. and Lopes,\n Mauricio A. 
and Barros,\n Rodrigo C.\n},\n title = {\n Language-Agnostic Visual-Semantic Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Language-Conditioned Graph Networks for Relational Reasoning", @@ -17103,7 +17646,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Language-Conditioned_Graph_Networks_for_Relational_Reasoning_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Language-Conditioned_Graph_Networks_for_Relational_Reasoning_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Hu_2019_ICCV,\n \n author = {\n Hu,\n Ronghang and Rohrbach,\n Anna and Darrell,\n Trevor and Saenko,\n Kate\n},\n title = {\n Language-Conditioned Graph Networks for Relational Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Laplace Landmark Localization", @@ -17136,7 +17680,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Robinson_2019_ICCV,\n \n author = {\n Robinson,\n Joseph P. 
and Li,\n Yuncheng and Zhang,\n Ning and Fu,\n Yun and Tulyakov,\n Sergey\n},\n title = {\n Laplace Landmark Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Large-Scale Tag-Based Font Retrieval With Generative Feature Learning", @@ -17169,7 +17714,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Tianlang and Wang,\n Zhaowen and Xu,\n Ning and Jin,\n Hailin and Luo,\n Jiebo\n},\n title = {\n Large-Scale Tag-Based Font Retrieval With Generative Feature Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Larger Norm More Transferable: An Adaptive Feature Norm Approach for Unsupervised Domain Adaptation", @@ -17202,7 +17748,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Ruijia and Li,\n Guanbin and Yang,\n Jihan and Lin,\n Liang\n},\n title = {\n Larger Norm More Transferable: An Adaptive Feature Norm Approach for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Layout-Induced Video Representation for Recognizing Agent-in-Place Actions", @@ -17235,7 +17782,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0+0;0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": 
"United States", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Ruichi and Wang,\n Hongcheng and Li,\n Ang and Zheng,\n Jingxiao and Morariu,\n Vlad I. and Davis,\n Larry S.\n},\n title = {\n Layout-Induced Video Representation for Recognizing Agent-in-Place Actions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "LayoutVAE: Stochastic Scene Layout Generation From a Label Set", @@ -17268,7 +17816,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Jyothi_2019_ICCV,\n \n author = {\n Jyothi,\n Akash Abdu and Durand,\n Thibaut and He,\n Jiawei and Sigal,\n Leonid and Mori,\n Greg\n},\n title = {\n LayoutVAE: Stochastic Scene Layout Generation From a Label Set\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learn to Scale: Generating Multipolar Normalized Density Maps for Crowd Counting", @@ -17294,14 +17843,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_Learn_to_Scale_Generating_Multipolar_Normalized_Density_Maps_for_Crowd_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2;0;0", - "aff_unique_norm": "Huazhong University of Science and Technology;Microsoft;University of Oxford", + "aff_unique_norm": "Huazhong University of Science and Technology;Microsoft Research;University of Oxford", "aff_unique_dep": ";Research;", "aff_unique_url": "http://www.hust.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.ox.ac.uk", "aff_unique_abbr": "HUST;MSR Asia;Oxford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": 
"China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Chenfeng and Qiu,\n Kai and Fu,\n Jianlong and Bai,\n Song and Xu,\n Yongchao and Bai,\n Xiang\n},\n title = {\n Learn to Scale: Generating Multipolar Normalized Density Maps for Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learnable Triangulation of Human Pose", @@ -17327,14 +17877,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Iskakov_Learnable_Triangulation_of_Human_Pose_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0+1;0", - "aff_unique_norm": "Samsung;Skolkovo Institute of Science and Technology", + "aff_unique_norm": "Samsung AI Center;Skolkovo Institute of Science and Technology", "aff_unique_dep": "AI Center;", "aff_unique_url": "https://www.samsung.com/global/innovation/ai-research/;https://www.skoltech.ru", "aff_unique_abbr": "Samsung AI;Skoltech", "aff_campus_unique_index": "0;0+0;0+0;0", "aff_campus_unique": "Moscow", "aff_country_unique_index": "0;0+0;0+0;0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Iskakov_2019_ICCV,\n \n author = {\n Iskakov,\n Karim and Burkov,\n Egor and Lempitsky,\n Victor and Malkov,\n Yury\n},\n title = {\n Learnable Triangulation of Human Pose\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learned Video Compression", @@ -17367,7 +17918,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Rippel_2019_ICCV,\n \n author = {\n Rippel,\n Oren and Nair,\n Sanjay and Lew,\n 
Carissa and Branson,\n Steve and Anderson,\n Alexander G. and Bourdev,\n Lubomir\n},\n title = {\n Learned Video Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Aberrance Repressed Correlation Filters for Real-Time UAV Tracking", @@ -17400,7 +17952,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Ziyuan and Fu,\n Changhong and Li,\n Yiming and Lin,\n Fuling and Lu,\n Peng\n},\n title = {\n Learning Aberrance Repressed Correlation Filters for Real-Time UAV Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Across Tasks and Domains", @@ -17433,7 +17986,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Ramirez_2019_ICCV,\n \n author = {\n Ramirez,\n Pierluigi Zama and Tonioni,\n Alessio and Salti,\n Samuele and Stefano,\n Luigi Di\n},\n title = {\n Learning Across Tasks and Domains\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Combinatorial Embedding Networks for Deep Graph Matching", @@ -17445,7 +17999,7 @@ "author": "Runzhong Wang; Junchi Yan; Xiaokang Yang", "abstract": "Graph matching refers to finding node correspondence between graphs, such that the corresponding node and edge's affinity can be maximized. 
In addition with its NP-completeness nature, another important challenge is effective modeling of the node-wise and structure-wise affinity across graphs and the resulting objective, to guide the matching procedure effectively finding the true matching against noises. To this end, this paper devises an end-to-end differentiable deep network pipeline to learn the affinity for graph matching. It involves a supervised permutation loss regarding with node correspondence to capture the combinatorial nature for graph matching. Meanwhile deep graph embedding models are adopted to parameterize both intra-graph and cross-graph affinity functions, instead of the traditional shallow and simple parametric forms e.g. a Gaussian kernel. The embedding can also effectively capture the higher-order structure beyond second-order edges. The permutation loss model is agnostic to the number of nodes, and the embedding model is shared among nodes such that the network allows for varying numbers of nodes in graphs for training and inference. Moreover, our network is class-agnostic with some generalization capability across different categories. All these features are welcomed for real-world applications. 
Experiments show its superiority against state-of-the-art graph matching learning methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wang_Learning_Combinatorial_Embedding_Networks_for_Deep_Graph_Matching_ICCV_2019_paper.pdf", - "aff": "Department of Computer Science and Engineering, Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University; MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University", + "aff": "Department of Computer Science and Engineering, Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; Department of Computer Science and Engineering, Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Wang_Learning_Combinatorial_Embedding_ICCV_2019_supplemental.pdf", @@ -17463,10 +18017,11 @@ "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", - "aff_campus_unique_index": "1;1;1", - "aff_campus_unique": ";Shanghai", + "aff_campus_unique_index": ";", + "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Runzhong and Yan,\n Junchi and Yang,\n Xiaokang\n},\n title = {\n Learning Combinatorial Embedding Networks for Deep Graph Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month 
= {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Compositional Neural Information Fusion for Human Parsing", @@ -17499,7 +18054,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;2;0;1;0", - "aff_country_unique": "United Arab Emirates;China;United States" + "aff_country_unique": "United Arab Emirates;China;United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Wenguan and Zhang,\n Zhijie and Qi,\n Siyuan and Shen,\n Jianbing and Pang,\n Yanwei and Shao,\n Ling\n},\n title = {\n Learning Compositional Neural Information Fusion for Human Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Compositional Representations for Few-Shot Recognition", @@ -17532,7 +18088,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tokmakov_2019_ICCV,\n \n author = {\n Tokmakov,\n Pavel and Wang,\n Yu-Xiong and Hebert,\n Martial\n},\n title = {\n Learning Compositional Representations for Few-Shot Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Deep Priors for Image Dehazing", @@ -17565,7 +18122,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Yang and Pan,\n Jinshan and Ren,\n Jimmy and Su,\n Zhixun\n},\n title = {\n Learning Deep Priors for Image Dehazing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Discriminative Model Prediction for Tracking", @@ -17577,7 +18135,7 @@ "author": "Goutam Bhat; Martin Danelljan; Luc Van Gool; Radu Timofte", "abstract": "The current strive towards end-to-end trainable computer vision systems imposes major challenges for the task of visual tracking. In contrast to most other vision problems, tracking requires the learning of a robust target-specific appearance model online, during the inference stage. To be end-to-end trainable, the online learning of the target model thus needs to be embedded in the tracking architecture itself. Due to the imposed challenges, the popular Siamese paradigm simply predicts a target feature template, while ignoring the background appearance information during inference. Consequently, the predicted model possesses limited target-background discriminability. We develop an end-to-end tracking architecture, capable of fully exploiting both target and background appearance information for target model prediction. Our architecture is derived from a discriminative learning loss by designing a dedicated optimization process that is capable of predicting a powerful model in only a few iterations. Furthermore, our approach is able to learn key aspects of the discriminative loss itself. The proposed tracker sets a new state-of-the-art on 6 tracking benchmarks, achieving an EAO score of 0.440 on VOT2018, while running at over 40 FPS. 
The code and models are available at https://github.com/visionml/pytracking.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Bhat_Learning_Discriminative_Model_Prediction_for_Tracking_ICCV_2019_paper.pdf", - "aff": "CVL, ETH Z\u00fcrich, Switzerland; CVL, ETH Z\u00fcrich, Switzerland; CVL, ETH Z\u00fcrich, Switzerland; CVL, ETH Z\u00fcrich, Switzerland", + "aff": "CVL, ETH Zürich, Switzerland; CVL, ETH Zürich, Switzerland; CVL, ETH Zürich, Switzerland; CVL, ETH Zürich, Switzerland", "project": "", "github": "https://github.com/visionml/pytracking", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Bhat_Learning_Discriminative_Model_ICCV_2019_supplemental.pdf", @@ -17591,14 +18149,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bhat_Learning_Discriminative_Model_Prediction_for_Tracking_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "CVL", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Bhat_2019_ICCV,\n \n author = {\n Bhat,\n Goutam and Danelljan,\n Martin and Gool,\n Luc Van and Timofte,\n Radu\n},\n title = {\n Learning Discriminative Model Prediction for Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Feature-to-Feature Translator by Alternating Back-Propagation for Generative Zero-Shot Learning", @@ -17631,7 +18190,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": 
"@InProceedings{Zhu_2019_ICCV,\n \n author = {\n Zhu,\n Yizhe and Xie,\n Jianwen and Liu,\n Bingchen and Elgammal,\n Ahmed\n},\n title = {\n Learning Feature-to-Feature Translator by Alternating Back-Propagation for Generative Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Filter Basis for Convolutional Neural Network Compression", @@ -17664,7 +18224,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Yawei and Gu,\n Shuhang and Gool,\n Luc Van and Timofte,\n Radu\n},\n title = {\n Learning Filter Basis for Convolutional Neural Network Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Fixed Points in Generative Adversarial Networks: From Image-to-Image Translation to Disease Detection and Localization", @@ -17676,7 +18237,7 @@ "author": "Md Mahfuzur Rahman Siddiquee; Zongwei Zhou; Nima Tajbakhsh; Ruibin Feng; Michael B. Gotway; Yoshua Bengio; Jianming Liang", "abstract": "Generative adversarial networks (GANs) have ushered in a revolution in image-to-image translation. The development and proliferation of GANs raises an interesting question: can we train a GAN to remove an object, if present, from an image while otherwise preserving the image? Specifically, can a GAN \"virtually heal\" anyone by turning his medical image, with an unknown health status (diseased or healthy), into a healthy one, so that diseased regions could be revealed by subtracting those two images? 
Such a task requires a GAN to identify a minimal subset of target pixels for domain translation, an ability that we call fixed-point translation, which no GAN is equipped with yet. Therefore, we propose a new GAN, called Fixed-Point GAN, trained by (1) supervising same-domain translation through a conditional identity loss, and (2) regularizing cross-domain translation through revised adversarial, domain classification, and cycle consistency loss. Based on fixed-point translation, we further derive a novel framework for disease detection and localization using only image-level annotation. Qualitative and quantitative evaluations demonstrate that the proposed method outperforms the state of the art in multi-domain image-to-image translation and that it surpasses predominant weakly-supervised localization methods in both disease detection and localization. Implementation is available at https://github.com/jlianglab/Fixed-Point-GAN.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Siddiquee_Learning_Fixed_Points_in_Generative_Adversarial_Networks_From_Image-to-Image_Translation_ICCV_2019_paper.pdf", - "aff": "Arizona State University; Arizona State University + Mila \u2013 Quebec Artificial Intelligence Institute; Arizona State University; Arizona State University; Mayo Clinic; Mila \u2013 Quebec Artificial Intelligence Institute; Arizona State University + Mila \u2013 Quebec Artificial Intelligence Institute", + "aff": "Arizona State University; Arizona State University + Mila – Quebec Artificial Intelligence Institute; Arizona State University; Arizona State University; Mayo Clinic; Mila – Quebec Artificial Intelligence Institute; Arizona State University + Mila – Quebec Artificial Intelligence Institute", "project": "", "github": "https://github.com/jlianglab/Fixed-Point-GAN", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Siddiquee_Learning_Fixed_Points_ICCV_2019_supplemental.pdf", @@ -17697,7 +18258,8 @@ 
"aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;0;1;0+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Siddiquee_2019_ICCV,\n \n author = {\n Siddiquee,\n Md Mahfuzur Rahman and Zhou,\n Zongwei and Tajbakhsh,\n Nima and Feng,\n Ruibin and Gotway,\n Michael B. and Bengio,\n Yoshua and Liang,\n Jianming\n},\n title = {\n Learning Fixed Points in Generative Adversarial Networks: From Image-to-Image Translation to Disease Detection and Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Implicit Generative Models by Matching Perceptual Features", @@ -17723,14 +18285,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/dos_Santos_Learning_Implicit_Generative_Models_by_Matching_Perceptual_Features_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "IBM", - "aff_unique_dep": "IBM Research", + "aff_unique_norm": "IBM Research", + "aff_unique_dep": "", "aff_unique_url": "https://www.ibm.com/research", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "T.J. 
Watson Research Center", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Santos_2019_ICCV,\n \n author = {\n Santos,\n Cicero Nogueira dos and Mroueh,\n Youssef and Padhi,\n Inkit and Dognin,\n Pierre\n},\n title = {\n Learning Implicit Generative Models by Matching Perceptual Features\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Joint 2D-3D Representations for Depth Completion", @@ -17763,7 +18326,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yun and Yang,\n Bin and Liang,\n Ming and Urtasun,\n Raquel\n},\n title = {\n Learning Joint 2D-3D Representations for Depth Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Lightweight Lane Detection CNNs by Self Attention Distillation", @@ -17789,14 +18353,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hou_Learning_Lightweight_Lane_Detection_CNNs_by_Self_Attention_Distillation_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime Group Limited;Nanyang Technological University", + "aff_unique_norm": "The Chinese University of Hong Kong;SenseTime Group Limited;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;SenseTime;NTU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", 
"aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Hou_2019_ICCV,\n \n author = {\n Hou,\n Yuenan and Ma,\n Zheng and Liu,\n Chunxiao and Loy,\n Chen Change\n},\n title = {\n Learning Lightweight Lane Detection CNNs by Self Attention Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Local Descriptors With a CDF-Based Dynamic Soft Margin", @@ -17829,7 +18394,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Linguang and Rusinkiewicz,\n Szymon\n},\n title = {\n Learning Local Descriptors With a CDF-Based Dynamic Soft Margin\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Local RGB-to-CAD Correspondences for Object Pose Estimation", @@ -17837,7 +18403,7 @@ "status": "Poster", "track": "main", "pid": "1967", - "author_site": "Georgios Georgakis, Srikrishna Karanam, Ziyan Wu, Jana Ko\u00c5\u00a1eck\u00c3\u00a1", + "author_site": "Georgios Georgakis, Srikrishna Karanam, Ziyan Wu, Jana Košecká", "author": "Georgios Georgakis; Srikrishna Karanam; Ziyan Wu; Jana Kosecka", "abstract": "We consider the problem of 3D object pose estimation. While much recent work has focused on the RGB domain, the reliance on accurately annotated images limits generalizability and scalability. On the other hand, the easily available object CAD models are rich sources of data, providing a large number of synthetically rendered images.
In this paper, we solve this key problem of existing methods requiring expensive 3D pose annotations by proposing a new method that matches RGB images to CAD models for object pose estimation. Our key innovations compared to existing work include removing the need for either real-world textures for CAD models or explicit 3D pose annotations for RGB images. We achieve this through a series of objectives that learn how to select keypoints and enforce viewpoint and modality invariance across RGB images and CAD model renderings. Our experiments demonstrate that the proposed method can reliably estimate object pose in RGB images and generalize to object instances not seen during training.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Georgakis_Learning_Local_RGB-to-CAD_Correspondences_for_Object_Pose_Estimation_ICCV_2019_paper.pdf", @@ -17862,7 +18428,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Georgakis_2019_ICCV,\n \n author = {\n Georgakis,\n Georgios and Karanam,\n Srikrishna and Wu,\n Ziyan and Kosecka,\n Jana\n},\n title = {\n Learning Local RGB-to-CAD Correspondences for Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Meshes for Dense Visual SLAM", @@ -17895,7 +18462,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom;" + "aff_country_unique": "United Kingdom;", + "bibtex": "@InProceedings{Bloesch_2019_ICCV,\n \n author = {\n Bloesch,\n Michael and Laidlow,\n Tristan and Clark,\n Ronald and Leutenegger,\n Stefan and Davison,\n Andrew J.\n},\n title = {\n Learning Meshes for Dense Visual SLAM\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Motion in Feature Space: Locally-Consistent Deformable Convolution Networks for Fine-Grained Action Detection", @@ -17921,14 +18489,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mac_Learning_Motion_in_Feature_Space_Locally-Consistent_Deformable_Convolution_Networks_for_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;1;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;IBM", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;IBM Research", "aff_unique_dep": ";AI", "aff_unique_url": "https://illinois.edu;https://www.ibm.com/research", "aff_unique_abbr": "UIUC;IBM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mac_2019_ICCV,\n \n author = {\n Mac,\n Khoi-Nguyen C. and Joshi,\n Dhiraj and Yeh,\n Raymond A. and Xiong,\n Jinjun and Feris,\n Rogerio S. 
and Do,\n Minh N.\n},\n title = {\n Learning Motion in Feature Space: Locally-Consistent Deformable Convolution Networks for Fine-Grained Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Object-Specific Distance From a Monocular Image", @@ -17961,7 +18530,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Abu Dhabi", "aff_country_unique_index": "0+0+1;0+0+1", - "aff_country_unique": "United States;United Arab Emirates" + "aff_country_unique": "United States;United Arab Emirates", + "bibtex": "@InProceedings{Zhu_2019_ICCV,\n \n author = {\n Zhu,\n Jing and Fang,\n Yi\n},\n title = {\n Learning Object-Specific Distance From a Monocular Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Perspective Undistortion of Portraits", @@ -17985,7 +18555,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_Learning_Perspective_Undistortion_of_Portraits_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_Learning_Perspective_Undistortion_of_Portraits_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Yajie and Huang,\n Zeng and Li,\n Tianye and Chen,\n Weikai and LeGendre,\n Chloe and Ren,\n Xinglei and Shapiro,\n Ari and Li,\n Hao\n},\n title = {\n Learning Perspective Undistortion of Portraits\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Propagation for Arbitrarily-Structured Data", @@ -18011,14 +18582,15 @@ "author_num": 5, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Learning_Propagation_for_Arbitrarily-Structured_Data_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;0;0", - "aff_unique_norm": "NVIDIA;University of California, Merced", - "aff_unique_dep": "NVIDIA Corporation;", + "aff_unique_norm": "NVIDIA Corporation;University of California, Merced", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nvidia.com;https://www.ucmerced.edu", "aff_unique_abbr": "NVIDIA;UC Merced", "aff_campus_unique_index": "1", "aff_campus_unique": ";Merced", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Sifei and Li,\n Xueting and Jampani,\n Varun and Mello,\n Shalini De and Kautz,\n Jan\n},\n title = {\n Learning Propagation for Arbitrarily-Structured Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Relationships for Multi-View 3D Object Recognition", @@ -18051,7 +18623,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Ze and Wang,\n Liwei\n},\n title = {\n Learning Relationships for Multi-View 3D Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Rich Features at High-Speed for Single-Shot Object Detection", @@ -18063,7 +18636,7 @@ "author": "Tiancai Wang; Rao Muhammad Anwer; Hisham Cholakkal; Fahad Shahbaz Khan; Yanwei Pang; Ling Shao", "abstract": "Single-stage object detection methods have received significant attention recently due to their characteristic realtime 
capabilities and high detection accuracies. Generally, most existing single-stage detectors follow two common practices: they employ a network backbone that is pretrained on ImageNet for the classification task and use a top-down feature pyramid representation for handling scale variations. Contrary to common pre-training strategy, recent works have demonstrated the benefits of training from scratch to reduce the task gap between classification and localization, especially at high overlap thresholds. However, detection models trained from scratch require significantly longer training time compared to their typical finetuning based counterparts. We introduce a single-stage detection framework that combines the advantages of both fine-tuning pretrained models and training from scratch. Our framework constitutes a standard network that uses a pre-trained backbone and a parallel light-weight auxiliary network trained from scratch. Further, we argue that the commonly used top-down pyramid representation only focuses on passing high-level semantics from the top layers to bottom layers. We introduce a bi-directional network that efficiently circulates both low-/mid-level and high-level semantic information in the detection framework. Experiments are performed on MS COCO and UAVDT datasets. Compared to the baseline, our detector achieives an absolute gain of 7.4% and 4.2% in average precision (AP) on MS COCO and UAVDT datasets, respectively using VGG backbone. For a 300x300 input on the MS COCO test set, our detector with ResNet backbone surpasses existing single-stage detection methods for single-scale inference achieving 34.3 AP, while operating at an inference time of 19 milliseconds on a single Titan X GPU. 
Code is avail- able at https://github.com/vaesl/LRF-Net.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wang_Learning_Rich_Features_at_High-Speed_for_Single-Shot_Object_Detection_ICCV_2019_paper.pdf", - "aff": "School of Electrical and Information Engineering, Tianjin University+Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE; School of Electrical and Information Engineering, Tianjin University; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE", + "aff": "School of Electrical and Information Engineering, Tianjin University+Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; Inception Institute of Artificial Intelligence (IIAI), UAE; School of Electrical and Information Engineering, Tianjin University; Inception Institute of Artificial Intelligence (IIAI), UAE", "project": "", "github": "https://github.com/vaesl/LRF-Net", "supp": "", @@ -18084,7 +18657,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;0;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Tiancai and Anwer,\n Rao Muhammad and Cholakkal,\n Hisham and Khan,\n Fahad Shahbaz and Pang,\n Yanwei and Shao,\n Ling\n},\n title = {\n Learning Rich Features at High-Speed for Single-Shot Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Robust Facial Landmark Detection via Hierarchical Structured Ensemble", @@ -18108,7 +18682,8 @@ 
"aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zou_Learning_Robust_Facial_Landmark_Detection_via_Hierarchical_Structured_Ensemble_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zou_Learning_Robust_Facial_Landmark_Detection_via_Hierarchical_Structured_Ensemble_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zou_2019_ICCV,\n \n author = {\n Zou,\n Xu and Zhong,\n Sheng and Yan,\n Luxin and Zhao,\n Xiangyun and Zhou,\n Jiahuan and Wu,\n Ying\n},\n title = {\n Learning Robust Facial Landmark Detection via Hierarchical Structured Ensemble\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Semantic-Specific Graph Representation for Multi-Label Image Recognition", @@ -18132,7 +18707,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Learning_Semantic-Specific_Graph_Representation_for_Multi-Label_Image_Recognition_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Learning_Semantic-Specific_Graph_Representation_for_Multi-Label_Image_Recognition_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Tianshui and Xu,\n Muxin and Hui,\n Xiaolu and Wu,\n Hefeng and Lin,\n Liang\n},\n title = {\n Learning Semantic-Specific Graph Representation for Multi-Label Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Shape Templates With Structured Implicit Functions", @@ -18156,7 +18732,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Genova_Learning_Shape_Templates_With_Structured_Implicit_Functions_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Genova_Learning_Shape_Templates_With_Structured_Implicit_Functions_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Genova_2019_ICCV,\n \n author = {\n Genova,\n Kyle and Cole,\n Forrester and Vlasic,\n Daniel and Sarna,\n Aaron and Freeman,\n William T. and Funkhouser,\n Thomas\n},\n title = {\n Learning Shape Templates With Structured Implicit Functions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Similarity Conditions Without Explicit Supervision", @@ -18182,14 +18759,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tan_Learning_Similarity_Conditions_Without_Explicit_Supervision_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "Boston University;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Boston University;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://illinois.edu", "aff_unique_abbr": "BU;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tan_2019_ICCV,\n \n author = {\n Tan,\n Reuben and Vasileva,\n Mariya I. 
and Saenko,\n Kate and Plummer,\n Bryan A.\n},\n title = {\n Learning Similarity Conditions Without Explicit Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Single Camera Depth Estimation Using Dual-Pixels", @@ -18213,7 +18791,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Garg_Learning_Single_Camera_Depth_Estimation_Using_Dual-Pixels_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Garg_Learning_Single_Camera_Depth_Estimation_Using_Dual-Pixels_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Garg_2019_ICCV,\n \n author = {\n Garg,\n Rahul and Wadhwa,\n Neal and Ansari,\n Sameer and Barron,\n Jonathan T.\n},\n title = {\n Learning Single Camera Depth Estimation Using Dual-Pixels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Spatial Awareness to Improve Crowd Counting", @@ -18239,14 +18818,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cheng_Learning_Spatial_Awareness_to_Improve_Crowd_Counting_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+2+1;1;0;2", - "aff_unique_norm": "Southwest Jiao Tong University;Microsoft;Carnegie Mellon University", + "aff_unique_norm": "Southwest Jiaotong University;Microsoft Corporation;Carnegie Mellon University", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://www.swjtu.edu.cn;https://www.microsoft.com/en-us/research;https://www.cmu.edu", "aff_unique_abbr": "SWJTU;MSR;CMU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1+1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": 
"@InProceedings{Cheng_2019_ICCV,\n \n author = {\n Cheng,\n Zhi-Qi and Li,\n Jun-Xiu and Dai,\n Qi and Wu,\n Xiao and Hauptmann,\n Alexander G.\n},\n title = {\n Learning Spatial Awareness to Improve Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Temporal Action Proposals With Fewer Labels", @@ -18279,7 +18859,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ji_2019_ICCV,\n \n author = {\n Ji,\n Jingwei and Cao,\n Kaidi and Niebles,\n Juan Carlos\n},\n title = {\n Learning Temporal Action Proposals With Fewer Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Trajectory Dependencies for Human Motion Prediction", @@ -18305,14 +18886,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mao_Learning_Trajectory_Dependencies_for_Human_Motion_Prediction_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;2;0+1", - "aff_unique_norm": "Australian National University;Australia Centre for Robotic Vision;EPFL", + "aff_unique_norm": "Australian National University;Australia Centre for Robotic Vision;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";;CVLab", - "aff_unique_url": "https://www.anu.edu.au;https://roboticvision.org/;https://cvlab.epfl.ch", - "aff_unique_abbr": "ANU;ACRV;EPFL", + "aff_unique_url": "https://www.anu.edu.au;;https://cvlab.epfl.ch", + "aff_unique_abbr": "ANU;;EPFL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;1;0+0", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + 
"bibtex": "@InProceedings{Mao_2019_ICCV,\n \n author = {\n Mao,\n Wei and Liu,\n Miaomiao and Salzmann,\n Mathieu and Li,\n Hongdong\n},\n title = {\n Learning Trajectory Dependencies for Human Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning Two-View Correspondences and Geometry Using Order-Aware Network", @@ -18338,14 +18920,15 @@ "author_num": 9, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Learning_Two-View_Correspondences_and_Geometry_Using_Order-Aware_Network_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;1;2;1;3;2;1;0+1", - "aff_unique_norm": "Tsinghua University;Hong Kong University of Science and Technology;Intel;Everest Innovation Technology", + "aff_unique_norm": "Tsinghua University;Hong Kong University of Science and Technology;Intel Corporation;Everest Innovation Technology", "aff_unique_dep": ";;Intel Labs;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ust.hk;https://www.intel.cn;", "aff_unique_abbr": "THU;HKUST;Intel;EIT", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", - "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique_index": "0+0;0;0;0;0;0;0;0+0", + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Jiahui and Sun,\n Dawei and Luo,\n Zixin and Yao,\n Anbang and Zhou,\n Lei and Shen,\n Tianwei and Chen,\n Yurong and Quan,\n Long and Liao,\n Hongen\n},\n title = {\n Learning Two-View Correspondences and Geometry Using Order-Aware Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning With Average Precision: Training Image Retrieval With a Listwise Loss", @@ -18353,7 +18936,7 @@ "status": 
"Poster", "track": "main", "pid": "4690", - "author_site": "J\u00c3\u00a9r\u00c3\u00b4me Revaud, Jon Almaz\u00c3\u00a1n, Rafael S. Rezende, C\u00c3\u00a9sar Roberto de Souza", + "author_site": "Jérôme Revaud, Jon Almazán, Rafael S. Rezende, César Roberto de Souza", "author": "Jerome Revaud; Jon Almazan; Rafael S. Rezende; Cesar Roberto de Souza", "abstract": "Image retrieval can be formulated as a ranking problem where the goal is to order database images by decreasing similarity to the query. Recent deep models for image retrieval have outperformed traditional methods by leveraging ranking-tailored loss functions, but important theoretical and practical problems remain. First, rather than directly optimizing the global ranking, they minimize an upper-bound on the essential loss, which does not necessarily result in an optimal mean average precision (mAP). Second, these methods require significant engineering efforts to work well, e.g., special pre-training and hard-negative mining. In this paper we propose instead to directly optimize the global mAP by leveraging recent advances in listwise loss formulations. Using a histogram binning approximation, the AP can be differentiated and thus employed to end-to-end learning. Compared to existing losses, the proposed method considers thousands of images simultaneously at each iteration and eliminates the need for ad hoc tricks. It also establishes a new state of the art on many standard retrieval benchmarks. 
Models and evaluation scripts have been made available at: https://europe.naverlabs.com/Deep-Image-Retrieval/.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Revaud_Learning_With_Average_Precision_Training_Image_Retrieval_With_a_Listwise_ICCV_2019_paper.pdf", @@ -18369,7 +18952,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Revaud_Learning_With_Average_Precision_Training_Image_Retrieval_With_a_Listwise_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Revaud_Learning_With_Average_Precision_Training_Image_Retrieval_With_a_Listwise_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Revaud_2019_ICCV,\n \n author = {\n Revaud,\n Jerome and Almazan,\n Jon and Rezende,\n Rafael S. and Souza,\n Cesar Roberto de\n},\n title = {\n Learning With Average Precision: Training Image Retrieval With a Listwise Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning With Unsure Data for Medical Image Diagnosis", @@ -18395,14 +18979,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wu_Learning_With_Unsure_Data_for_Medical_Image_Diagnosis_ICCV_2019_paper.html", "aff_unique_index": "0;1;2+3+4;0+3+4", - "aff_unique_norm": "Peking University;Microsoft;Capital Medical University;Deepwise AI Lab;Pengcheng Laboratory", - "aff_unique_dep": "Computer Science Dept.;Microsoft Research;Yanjing Medical College;AI Lab;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Microsoft Research;Capital Medical University;Deepwise AI Lab;Peng Cheng Laboratory", + "aff_unique_dep": "Computer Science Dept.;;Yanjing Medical College;AI Lab;", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia;http://www.cmu.edu.cn;;http://www.pcl.ac.cn", "aff_unique_abbr": "PKU;MSR 
Asia;;;PCL", "aff_campus_unique_index": "1;2;", "aff_campus_unique": ";Asia;Yanjing", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Botong and Sun,\n Xinwei and Hu,\n Lingjing and Wang,\n Yizhou\n},\n title = {\n Learning With Unsure Data for Medical Image Diagnosis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning a Mixture of Granularity-Specific Experts for Fine-Grained Categorization", @@ -18435,7 +19020,8 @@ "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Chippendale;Darlington", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Lianbo and Huang,\n Shaoli and Liu,\n Wei and Tao,\n Dacheng\n},\n title = {\n Learning a Mixture of Granularity-Specific Experts for Fine-Grained Categorization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning an Effective Equivariant 3D Descriptor Without Supervision", @@ -18468,7 +19054,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Spezialetti_2019_ICCV,\n \n author = {\n Spezialetti,\n Riccardo and Salti,\n Samuele and Stefano,\n Luigi Di\n},\n title = {\n Learning an Effective Equivariant 3D Descriptor Without Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning an Event Sequence Embedding for Dense Event-Based Deep 
Stereo", @@ -18480,7 +19067,7 @@ "author": "Stepan Tulyakov; Francois Fleuret; Martin Kiefel; Peter Gehler; Michael Hirsch", "abstract": "Today, a frame-based camera is the sensor of choice for machine vision applications. However, these cameras, originally developed for acquisition of static images rather than for sensing of dynamic uncontrolled visual environments, suffer from high power consumption, data rate, latency and low dynamic range. An event-based image sensor addresses these drawbacks by mimicking a biological retina. Instead of measuring the intensity of every pixel in a fixed time-interval, it reports events of significant pixel intensity changes. Every such event is represented by its position, sign of change, and timestamp, accurate to the microsecond. Asynchronous event sequences require special handling, since traditional algorithms work only with synchronous, spatially gridded data. To address this problem we introduce a new module for event sequence embedding, for use in difference applications. The module builds a representation of an event sequence by firstly aggregating information locally across time, using a novel fully-connected layer for an irregularly sampled continuous domain, and then across discrete spatial domain. Based on this module, we design a deep learning-based stereo method for event-based cameras. The proposed method is the first learning-based stereo method for an event-based camera and the only method that produces dense results. 
We show that large performance increases on the Multi Vehicle Stereo Event Camera Dataset (MVSEC), which became the standard set for benchmarking of event-based stereo methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Tulyakov_Learning_an_Event_Sequence_Embedding_for_Dense_Event-Based_Deep_Stereo_ICCV_2019_paper.pdf", - "aff": "Space Engineering Center at \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne; \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne and Idiap Research Institute; Amazon, T\u00fcbingen, Germany; Amazon, T\u00fcbingen, Germany; Amazon, T\u00fcbingen, Germany", + "aff": "Space Engineering Center at École Polytechnique Fédérale de Lausanne; École Polytechnique Fédérale de Lausanne and Idiap Research Institute; Amazon, Tübingen, Germany; Amazon, Tübingen, Germany; Amazon, Tübingen, Germany", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Tulyakov_Learning_an_Event_ICCV_2019_supplemental.pdf", @@ -18494,14 +19081,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tulyakov_Learning_an_Event_Sequence_Embedding_for_Dense_Event-Based_Deep_Stereo_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;1", - "aff_unique_norm": "EPFL;Amazon", - "aff_unique_dep": "Space Engineering Center;Amazon", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;Amazon", + "aff_unique_dep": "Space Engineering Center;", "aff_unique_url": "https://www.epfl.ch;https://www.amazon.de", "aff_unique_abbr": "EPFL;", "aff_campus_unique_index": "1;1;1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;0;1;1;1", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Tulyakov_2019_ICCV,\n \n author = {\n Tulyakov,\n Stepan and Fleuret,\n Francois and Kiefel,\n Martin and Gehler,\n Peter and Hirsch,\n Michael\n},\n title = {\n Learning an 
Event Sequence Embedding for Dense Event-Based Deep Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning the Model Update for Siamese Trackers", @@ -18513,7 +19101,7 @@ "author": "Lichao Zhang; Abel Gonzalez-Garcia; Joost van de Weijer; Martin Danelljan; Fahad Shahbaz Khan", "abstract": "Siamese approaches address the visual tracking problem by extracting an appearance template from the current frame, which is used to localize the target in the next frame. In general, this template is linearly combined with the accumulated template from the previous frame, resulting in an exponential decay of information over time. While such an approach to updating has led to improved results, its simplicity limits the potential gain likely to be obtained by learning to update. Therefore, we propose to replace the handcrafted update function with a method which learns to update. We use a convolutional neural network, called UpdateNet, which given the initial template, the accumulated template and the template of the current frame aims to estimate the optimal template for the next frame. The UpdateNet is compact and can easily be integrated into existing Siamese trackers. We demonstrate the generality of the proposed approach by applying it to two Siamese trackers, SiamFC and DaSiamRPN. Extensive experiments on VOT2016, VOT2018, LaSOT, and TrackingNet datasets demonstrate that our UpdateNet effectively predicts the new target template, outperforming the standard linear update. 
On the large-scale TrackingNet dataset, our UpdateNet improves the results of DaSiamRPN with an absolute gain of 3.9% in terms of success score.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhang_Learning_the_Model_Update_for_Siamese_Trackers_ICCV_2019_paper.pdf", - "aff": "Computer Vision Center, Universitat Autonoma de Barcelona, Spain; Computer Vision Center, Universitat Autonoma de Barcelona, Spain; Computer Vision Center, Universitat Autonoma de Barcelona, Spain; Computer Vision Laboratory, ETH Z\u00fcrich, Switzerland; Inception Institute of Artificial Intelligence, UAE+Computer Vision Laboratory, Link\u00f6ping University, Sweden", + "aff": "Computer Vision Center, Universitat Autonoma de Barcelona, Spain; Computer Vision Center, Universitat Autonoma de Barcelona, Spain; Computer Vision Center, Universitat Autonoma de Barcelona, Spain; Computer Vision Laboratory, ETH Zürich, Switzerland; Inception Institute of Artificial Intelligence, UAE+Computer Vision Laboratory, Linköping University, Sweden", "project": "", "github": "https://github.com/zhanglichao/updatenet", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Zhang_Learning_the_Model_ICCV_2019_supplemental.pdf", @@ -18527,14 +19115,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Learning_the_Model_Update_for_Siamese_Trackers_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;2+3", - "aff_unique_norm": "Universitat Autonoma de Barcelona;ETH Zurich;Inception Institute of Artificial Intelligence;Link\u00f6ping University", + "aff_unique_norm": "Universitat Autonoma de Barcelona;ETH Zürich;Inception Institute of Artificial Intelligence;Linköping University", "aff_unique_dep": "Computer Vision Center;Computer Vision Laboratory;;Computer Vision Laboratory", "aff_unique_url": "https://www.uab.cat;https://www.ethz.ch;;https://www.liu.se", "aff_unique_abbr": ";ETHZ;;", "aff_campus_unique_index": "1;", - "aff_campus_unique": 
";Z\u00fcrich", + "aff_campus_unique": ";Zürich", "aff_country_unique_index": "0;0;0;1;2+3", - "aff_country_unique": "Spain;Switzerland;United Arab Emirates;Sweden" + "aff_country_unique": "Spain;Switzerland;United Arab Emirates;Sweden", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Lichao and Gonzalez-Garcia,\n Abel and Weijer,\n Joost van de and Danelljan,\n Martin and Khan,\n Fahad Shahbaz\n},\n title = {\n Learning the Model Update for Siamese Trackers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Assemble Neural Module Tree Networks for Visual Grounding", @@ -18557,7 +19146,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Learning_to_Assemble_Neural_Module_Tree_Networks_for_Visual_Grounding_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Learning_to_Assemble_Neural_Module_Tree_Networks_for_Visual_Grounding_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Daqing and Zhang,\n Hanwang and Wu,\n Feng and Zha,\n Zheng-Jun\n},\n title = {\n Learning to Assemble Neural Module Tree Networks for Visual Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Caption Images Through a Lifetime by Asking Questions", @@ -18583,14 +19173,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shen_Learning_to_Caption_Images_Through_a_Lifetime_by_Asking_Questions_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", - "aff_unique_norm": "Vector Institute;University of Toronto;NVIDIA", - "aff_unique_dep": ";;NVIDIA Corporation", + "aff_unique_norm": "Vector Institute;University of 
Toronto;NVIDIA Corporation", + "aff_unique_dep": ";;", "aff_unique_url": "https://vectorinstitute.ai/;https://www.utoronto.ca;https://www.nvidia.com", "aff_unique_abbr": "Vector Institute;U of T;NVIDIA", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;0+0+1;0+0+1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Shen_2019_ICCV,\n \n author = {\n Shen,\n Tingke and Kar,\n Amlan and Fidler,\n Sanja\n},\n title = {\n Learning to Caption Images Through a Lifetime by Asking Questions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Collocate Neural Modules for Image Captioning", @@ -18623,7 +19214,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;0+1;0+1", - "aff_country_unique": "Singapore;Australia" + "aff_country_unique": "Singapore;Australia", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Xu and Zhang,\n Hanwang and Cai,\n Jianfei\n},\n title = {\n Learning to Collocate Neural Modules for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Discover Novel Visual Categories via Deep Transfer Clustering", @@ -18656,7 +19248,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Han_2019_ICCV,\n \n author = {\n Han,\n Kai and Vedaldi,\n Andrea and Zisserman,\n Andrew\n},\n title = {\n Learning to Discover Novel Visual Categories via Deep Transfer Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference 
on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Find Common Objects Across Few Image Collections", @@ -18689,7 +19282,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Canberra", "aff_country_unique_index": "0;1;0;1;0;1", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Shaban_2019_ICCV,\n \n author = {\n Shaban,\n Amirreza and Rahimi,\n Amir and Bansal,\n Shray and Gould,\n Stephen and Boots,\n Byron and Hartley,\n Richard\n},\n title = {\n Learning to Find Common Objects Across Few Image Collections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Jointly Generate and Separate Reflections", @@ -18715,14 +19309,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ma_Learning_to_Jointly_Generate_and_Separate_Reflections_ICCV_2019_paper.html", "aff_unique_index": "0+0;1;0+0+2;1;0+0+2", - "aff_unique_norm": "Peking University;Nanyang Technological University;Pengcheng Laboratory", - "aff_unique_dep": "SECE;School of Electrical and Electronic Engineering;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Nanyang Technological University;Peng Cheng Laboratory", + "aff_unique_dep": "SECE;School of Electrical and Electronic Engineering;", "aff_unique_url": "http://www.pku.edu.cn;https://www.ntu.edu.sg;", "aff_unique_abbr": "PKU;NTU;", "aff_campus_unique_index": "0+1;2;0+1+0;2;0+1+0", "aff_campus_unique": "Shenzhen;Beijing;Singapore", "aff_country_unique_index": "0+0;1;0+0+0;1;0+0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Ma_2019_ICCV,\n \n author = {\n Ma,\n Daiqian and Wan,\n Renjie and Shi,\n Boxin and Kot,\n Alex C. 
and Duan,\n Ling-Yu\n},\n title = {\n Learning to Jointly Generate and Separate Reflections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Paint With Model-Based Deep Reinforcement Learning", @@ -18755,7 +19350,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Zhewei and Heng,\n Wen and Zhou,\n Shuchang\n},\n title = {\n Learning to Paint With Model-Based Deep Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Rank Proposals for Object Detection", @@ -18788,7 +19384,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;1+0;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Tan_2019_ICCV,\n \n author = {\n Tan,\n Zhiyu and Nie,\n Xuecheng and Qian,\n Qi and Li,\n Nan and Li,\n Hao\n},\n title = {\n Learning to Rank Proposals for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Reconstruct 3D Human Pose and Shape via Model-Fitting in the Loop", @@ -18821,7 +19418,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Kolotouros_2019_ICCV,\n \n author = {\n Kolotouros,\n Nikos and Pavlakos,\n Georgios and Black,\n Michael J. 
and Daniilidis,\n Kostas\n},\n title = {\n Learning to Reconstruct 3D Human Pose and Shape via Model-Fitting in the Loop\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to Reconstruct 3D Manhattan Wireframes From a Single Image", @@ -18854,7 +19452,8 @@ "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0+1;0;0+1;0;0+1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Yichao and Qi,\n Haozhi and Zhai,\n Yuexiang and Sun,\n Qi and Chen,\n Zhili and Wei,\n Li-Yi and Ma,\n Yi\n},\n title = {\n Learning to Reconstruct 3D Manhattan Wireframes From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Learning to See Moving Objects in the Dark", @@ -18887,7 +19486,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Los Angeles;Tokyo", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Haiyang and Zheng,\n Yinqiang\n},\n title = {\n Learning to See Moving Objects in the Dark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Leveraging Long-Range Temporal Relationships Between Proposals for Video Object Detection", @@ -18920,7 +19520,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chapel Hill;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shvets_2019_ICCV,\n 
\n author = {\n Shvets,\n Mykhailo and Liu,\n Wei and Berg,\n Alexander C.\n},\n title = {\n Leveraging Long-Range Temporal Relationships Between Proposals for Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Lifelong GAN: Continual Learning for Conditional Image Generation", @@ -18953,7 +19554,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Zhai_2019_ICCV,\n \n author = {\n Zhai,\n Mengyao and Chen,\n Lei and Tung,\n Frederick and He,\n Jiawei and Nawhal,\n Megha and Mori,\n Greg\n},\n title = {\n Lifelong GAN: Continual Learning for Conditional Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Linearized Multi-Sampling for Differentiable Image Transformation", @@ -18986,7 +19588,8 @@ "aff_campus_unique_index": "0;0;0+1;1;0", "aff_campus_unique": "Victoria;Mountain View", "aff_country_unique_index": "0;0;0+1;1;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Wei and Sun,\n Weiwei and Tagliasacchi,\n Andrea and Trulls,\n Eduard and Yi,\n Kwang Moo\n},\n title = {\n Linearized Multi-Sampling for Differentiable Image Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Linearly Converging Quasi Branch and Bound Algorithms for Global Rigid Registration", @@ -19019,7 +19622,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dym_2019_ICCV,\n \n author = {\n Dym,\n Nadav and Kovalsky,\n Shahar Ziv\n},\n title = {\n Linearly Converging Quasi Branch and Bound Algorithms for Global Rigid Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Liquid Warping GAN: A Unified Framework for Human Motion Imitation, Appearance Transfer and Novel View Synthesis", @@ -19052,7 +19656,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Wen and Piao,\n Zhixin and Min,\n Jie and Luo,\n Wenhan and Ma,\n Lin and Gao,\n Shenghua\n},\n title = {\n Liquid Warping GAN: A Unified Framework for Human Motion Imitation,\n Appearance Transfer and Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Live Face De-Identification in Video", @@ -19078,14 +19683,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gafni_Live_Face_De-Identification_in_Video_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0", - "aff_unique_norm": "Meta;Tel Aviv University", + "aff_unique_norm": "Facebook;Tel Aviv University", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.tau.ac.il", "aff_unique_abbr": "FAIR;TAU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0", - "aff_country_unique": "United States;Israel" + "aff_country_unique": "United States;Israel", + "bibtex": "@InProceedings{Gafni_2019_ICCV,\n \n author = {\n Gafni,\n Oran and Wolf,\n Lior and 
Taigman,\n Yaniv\n},\n title = {\n Live Face De-Identification in Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Local Aggregation for Unsupervised Learning of Visual Embeddings", @@ -19118,7 +19724,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhuang_2019_ICCV,\n \n author = {\n Zhuang,\n Chengxu and Zhai,\n Alex Lin and Yamins,\n Daniel\n},\n title = {\n Local Aggregation for Unsupervised Learning of Visual Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Local Relation Networks for Image Recognition", @@ -19141,7 +19748,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Local_Relation_Networks_for_Image_Recognition_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Local_Relation_Networks_for_Image_Recognition_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Hu_2019_ICCV,\n \n author = {\n Hu,\n Han and Zhang,\n Zheng and Xie,\n Zhenda and Lin,\n Stephen\n},\n title = {\n Local Relation Networks for Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Local Supports Global: Deep Camera Relocalization With Sequence Enhancement", @@ -19174,7 +19782,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xue_2019_ICCV,\n \n author = {\n 
Xue,\n Fei and Wang,\n Xin and Yan,\n Zike and Wang,\n Qiuyuan and Wang,\n Junqiu and Zha,\n Hongbin\n},\n title = {\n Local Supports Global: Deep Camera Relocalization With Sequence Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Localization of Deep Inpainting Using High-Pass Fully Convolutional Network", @@ -19207,7 +19816,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Haodong and Huang,\n Jiwu\n},\n title = {\n Localization of Deep Inpainting Using High-Pass Fully Convolutional Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Looking to Relations for Future Trajectory Forecast", @@ -19230,7 +19840,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Choi_Looking_to_Relations_for_Future_Trajectory_Forecast_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Choi_Looking_to_Relations_for_Future_Trajectory_Forecast_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Choi_2019_ICCV,\n \n author = {\n Choi,\n Chiho and Dariush,\n Behzad\n},\n title = {\n Looking to Relations for Future Trajectory Forecast\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "M2FPA: A Multi-Yaw Multi-Pitch High-Quality Dataset and Benchmark for Facial Pose Analysis", @@ -19263,7 +19874,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0+0", - "aff_country_unique": 
"China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Peipei and Wu,\n Xiang and Hu,\n Yibo and He,\n Ran and Sun,\n Zhenan\n},\n title = {\n M2FPA: A Multi-Yaw Multi-Pitch High-Quality Dataset and Benchmark for Facial Pose Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "M3D-RPN: Monocular 3D Region Proposal Network for Object Detection", @@ -19296,7 +19908,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Brazil_2019_ICCV,\n \n author = {\n Brazil,\n Garrick and Liu,\n Xiaoming\n},\n title = {\n M3D-RPN: Monocular 3D Region Proposal Network for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MIC: Mining Interclass Characteristics for Improved Metric Learning", @@ -19304,7 +19917,7 @@ "status": "Poster", "track": "main", "pid": "2266", - "author_site": "Karsten Roth, Biagio Brattoli, Bj\u00c3\u00b6rn Ommer", + "author_site": "Karsten Roth, Biagio Brattoli, Björn Ommer", "author": "Karsten Roth; Biagio Brattoli; Bjorn Ommer", "abstract": "Metric learning seeks to embed images of objects such that class-defined relations are captured by the embedding space. However, variability in images is not just due to different depicted object classes, but also depends on other latent characteristics such as viewpoint or illumination. In addition to these structured properties, random noise further obstructs the visual relations of interest. The common approach to metric learning is to enforce a representation that is invariant under all factors but the ones of interest. 
In contrast, we propose to explicitly learn the latent characteristics that are shared by and go across object classes. We can then directly explain away structured visual variability, rather than assuming it to be unknown random noise. We propose a novel surrogate task to learn visual characteristics shared across classes with a separate encoder. This encoder is trained jointly with the encoder for class information by reducing their mutual information. On five standard image retrieval benchmarks the approach significantly improves upon the state-of-the-art. Code is available at https://github.com/Confusezius/metric-learning-mining-interclass-characteristics.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Roth_MIC_Mining_Interclass_Characteristics_for_Improved_Metric_Learning_ICCV_2019_paper.pdf", @@ -19329,7 +19942,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Roth_2019_ICCV,\n \n author = {\n Roth,\n Karsten and Brattoli,\n Biagio and Ommer,\n Bjorn\n},\n title = {\n MIC: Mining Interclass Characteristics for Improved Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MMAct: A Large-Scale Dataset for Cross Modal Human Action Understanding", @@ -19362,7 +19976,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0+1;0", - "aff_country_unique": "Japan;China" + "aff_country_unique": "Japan;China", + "bibtex": "@InProceedings{Kong_2019_ICCV,\n \n author = {\n Kong,\n Quan and Wu,\n Ziming and Deng,\n Ziwei and Klinkigt,\n Martin and Tong,\n Bin and Murakami,\n Tomokazu\n},\n title = {\n MMAct: A Large-Scale Dataset for Cross Modal Human Action Understanding\n},\n booktitle = {\n Proceedings of 
the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MONET: Multiview Semi-Supervised Keypoint Detection via Epipolar Divergence", @@ -19395,7 +20010,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yao_2019_ICCV,\n \n author = {\n Yao,\n Yuan and Jafarian,\n Yasamin and Park,\n Hyun Soo\n},\n title = {\n MONET: Multiview Semi-Supervised Keypoint Detection via Epipolar Divergence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MVP Matching: A Maximum-Value Perfect Matching for Mining Hard Samples, With Application to Person Re-Identification", @@ -19407,7 +20023,7 @@ "author": "Han Sun; Zhiyuan Chen; Shiyang Yan; Lin Xu", "abstract": "How to correctly stress hard samples in metric learning is critical for visual recognition tasks, especially in challenging person re-ID applications. Pedestrians across cameras with significant appearance variations are easily confused, which could bias the learned metric and slow down the convergence rate. In this paper, we propose a novel weighted complete bipartite graph based maximum-value perfect (MVP) matching for mining the hard samples from a batch of samples. It can emphasize the hard positive and negative sample pairs respectively, and thus relieve adverse optimization and sample imbalance problems. We then develop a new batch-wise MVP matching based loss objective and combine it in an end-to-end deep metric learning manner. It leads to significant improvements in both convergence rate and recognition performance. 
Extensive empirical results on five person re-ID benchmark datasets, i.e., Market-1501, CUHK03-Detected, CUHK03-Labeled, Duke-MTMC, and MSMT17, demonstrate the superiority of the proposed method. It can accelerate the convergence rate significantly while achieving state-of-the-art performance. The source code of our method is available at https://github.com/IAAI-CVResearchGroup/MVP-metric.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Sun_MVP_Matching_A_Maximum-Value_Perfect_Matching_for_Mining_Hard_Samples_ICCV_2019_paper.pdf", - "aff": "Nanjing Institute of Advanced Arti\ufb01cial Intelligence+Horizon Robotics; Nanjing Institute of Advanced Arti\ufb01cial Intelligence+Horizon Robotics; Queen\u2019s University Belfast; Nanjing Institute of Advanced Arti\ufb01cial Intelligence+Horizon Robotics", + "aff": "Nanjing Institute of Advanced Artificial Intelligence+Horizon Robotics; Nanjing Institute of Advanced Artificial Intelligence+Horizon Robotics; Queen’s University Belfast; Nanjing Institute of Advanced Artificial Intelligence+Horizon Robotics", "project": "", "github": "https://github.com/IAAI-CVResearchGroup/MVP-metric", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Sun_MVP_Matching_A_ICCV_2019_supplemental.pdf", @@ -19428,7 +20044,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0+0;0+0;1;0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Han and Chen,\n Zhiyuan and Yan,\n Shiyang and Xu,\n Lin\n},\n title = {\n MVP Matching: A Maximum-Value Perfect Matching for Mining Hard Samples,\n With Application to Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MVSCRF: Learning Multi-View Stereo With Conditional 
Random Fields", @@ -19461,7 +20078,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xue_2019_ICCV,\n \n author = {\n Xue,\n Youze and Chen,\n Jiansheng and Wan,\n Weitao and Huang,\n Yiqing and Yu,\n Cheng and Li,\n Tianpeng and Bao,\n Jiayu\n},\n title = {\n MVSCRF: Learning Multi-View Stereo With Conditional Random Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Make a Face: Towards Arbitrary High Fidelity Face Manipulation", @@ -19485,7 +20103,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qian_Make_a_Face_Towards_Arbitrary_High_Fidelity_Face_Manipulation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qian_Make_a_Face_Towards_Arbitrary_High_Fidelity_Face_Manipulation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Qian_2019_ICCV,\n \n author = {\n Qian,\n Shengju and Lin,\n Kwan-Yee and Wu,\n Wayne and Liu,\n Yangxiaokang and Wang,\n Quan and Shen,\n Fumin and Qian,\n Chen and He,\n Ran\n},\n title = {\n Make a Face: Towards Arbitrary High Fidelity Face Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Making History Matter: History-Advantage Sequence Training for Visual Dialog", @@ -19518,7 +20137,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Tianhao and Zha,\n Zheng-Jun and Zhang,\n Hanwang\n},\n title = {\n Making History Matter: 
History-Advantage Sequence Training for Visual Dialog\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Making the Invisible Visible: Action Recognition Through Walls and Occlusions", @@ -19551,7 +20171,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Tianhong and Fan,\n Lijie and Zhao,\n Mingmin and Liu,\n Yingcheng and Katabi,\n Dina\n},\n title = {\n Making the Invisible Visible: Action Recognition Through Walls and Occlusions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Many Task Learning With Task Routing", @@ -19584,7 +20205,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Strezoski_2019_ICCV,\n \n author = {\n Strezoski,\n Gjorgji and Noord,\n Nanne van and Worring,\n Marcel\n},\n title = {\n Many Task Learning With Task Routing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Markerless Outdoor Human Motion Capture Using Multiple Autonomous Micro Aerial Vehicles", @@ -19608,7 +20230,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Saini_Markerless_Outdoor_Human_Motion_Capture_Using_Multiple_Autonomous_Micro_Aerial_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Saini_Markerless_Outdoor_Human_Motion_Capture_Using_Multiple_Autonomous_Micro_Aerial_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Saini_2019_ICCV,\n \n author = {\n Saini,\n Nitin and Price,\n Eric and Tallamraju,\n Rahul and Enficiaud,\n Raffi and Ludwig,\n Roman and Martinovic,\n Igor and Ahmad,\n Aamir and Black,\n Michael J.\n},\n title = {\n Markerless Outdoor Human Motion Capture Using Multiple Autonomous Micro Aerial Vehicles\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Mask-Guided Attention Network for Occluded Pedestrian Detection", @@ -19620,7 +20243,7 @@ "author": "Yanwei Pang; Jin Xie; Muhammad Haris Khan; Rao Muhammad Anwer; Fahad Shahbaz Khan; Ling Shao", "abstract": "Pedestrian detection relying on deep convolution neural networks has made significant progress. Though promising results have been achieved on standard pedestrians, the performance on heavily occluded pedestrians remains far from satisfactory. The main culprits are intra-class occlusions involving other pedestrians and inter-class occlusions caused by other objects, such as cars and bicycles. These results in a multitude of occlusion patterns. We propose an approach for occluded pedestrian detection with the following contributions. First, we introduce a novel mask-guided attention network that fits naturally into popular pedestrian detection pipelines. Our attention network emphasizes on visible pedestrian regions while suppressing the occluded ones by modulating full body features. Second, we empirically demonstrate that coarse-level segmentation annotations provide reasonable approximation to their dense pixel-wise counterparts. Experiments are performed on CityPersons and Caltech datasets. Our approach sets a new state-of-the-art on both datasets. 
Our approach obtains an absolute gain of 9.5% in log-average miss rate, compared to the best reported results [32] on the heavily occluded HO pedestrian set of CityPersons test set. Further, on the HO pedestrian set of Caltech dataset, our method achieves an absolute gain of 5.0% in log-average miss rate, compared to the best reported results [13]. Code and models are available at: https://github.com/Leotju/MGAN.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Pang_Mask-Guided_Attention_Network_for_Occluded_Pedestrian_Detection_ICCV_2019_paper.pdf", - "aff": "Tianjin University; Tianjin University; Inception Institute of Arti\ufb01cial Intelligence, UAE; Inception Institute of Arti\ufb01cial Intelligence, UAE; Inception Institute of Arti\ufb01cial Intelligence, UAE+CVL, Link \u00a8oping University, Sweden; Inception Institute of Arti\ufb01cial Intelligence, UAE", + "aff": "Tianjin University; Tianjin University; Inception Institute of Artificial Intelligence, UAE; Inception Institute of Artificial Intelligence, UAE; Inception Institute of Artificial Intelligence, UAE+CVL, Link ¨oping University, Sweden; Inception Institute of Artificial Intelligence, UAE", "project": "", "github": "https://github.com/Leotju/MGAN", "supp": "", @@ -19634,14 +20257,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pang_Mask-Guided_Attention_Network_for_Occluded_Pedestrian_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;1+2;1", - "aff_unique_norm": "Tianjin University;Inception Institute of Artificial Intelligence;Link\u00f6ping University", + "aff_unique_norm": "Tianjin University;Inception Institute of Artificial Intelligence;Linköping University", "aff_unique_dep": ";;CVL", "aff_unique_url": "http://www.tju.edu.cn;https://www.inceptioniai.org;https://www.liu.se", "aff_unique_abbr": "TJU;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1+2;1", - "aff_country_unique": 
"China;United Arab Emirates;Sweden" + "aff_country_unique": "China;United Arab Emirates;Sweden", + "bibtex": "@InProceedings{Pang_2019_ICCV,\n \n author = {\n Pang,\n Yanwei and Xie,\n Jin and Khan,\n Muhammad Haris and Anwer,\n Rao Muhammad and Khan,\n Fahad Shahbaz and Shao,\n Ling\n},\n title = {\n Mask-Guided Attention Network for Occluded Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Mask-ShadowGAN: Learning to Remove Shadows From Unpaired Data", @@ -19665,7 +20289,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Mask-ShadowGAN_Learning_to_Remove_Shadows_From_Unpaired_Data_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_Mask-ShadowGAN_Learning_to_Remove_Shadows_From_Unpaired_Data_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Hu_2019_ICCV,\n \n author = {\n Hu,\n Xiaowei and Jiang,\n Yitong and Fu,\n Chi-Wing and Heng,\n Pheng-Ann\n},\n title = {\n Mask-ShadowGAN: Learning to Remove Shadows From Unpaired Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Maximum-Margin Hamming Hashing", @@ -19698,7 +20323,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Kang_2019_ICCV,\n \n author = {\n Kang,\n Rong and Cao,\n Yue and Long,\n Mingsheng and Wang,\n Jianmin and Yu,\n Philip S.\n},\n title = {\n Maximum-Margin Hamming Hashing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Memorizing Normality to Detect Anomaly: 
Memory-Augmented Deep Autoencoder for Unsupervised Anomaly Detection", @@ -19724,14 +20350,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gong_Memorizing_Normality_to_Detect_Anomaly_Memory-Augmented_Deep_Autoencoder_for_Unsupervised_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;2;1;0", - "aff_unique_norm": "University of Adelaide;Deakin University;University of Western Australia", + "aff_unique_norm": "The University of Adelaide;Deakin University;University of Western Australia", "aff_unique_dep": ";A2I2;", "aff_unique_url": "https://www.adelaide.edu.au;https://www.deakin.edu.au;https://www.uwa.edu.au", "aff_unique_abbr": "Adelaide;;UWA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Gong_2019_ICCV,\n \n author = {\n Gong,\n Dong and Liu,\n Lingqiao and Le,\n Vuong and Saha,\n Budhaditya and Mansour,\n Moussa Reda and Venkatesh,\n Svetha and Hengel,\n Anton van den\n},\n title = {\n Memorizing Normality to Detect Anomaly: Memory-Augmented Deep Autoencoder for Unsupervised Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Memory-Based Neighbourhood Embedding for Visual Recognition", @@ -19764,7 +20391,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Suichan and Chen,\n Dapeng and Liu,\n Bin and Yu,\n Nenghai and Zhao,\n Rui\n},\n title = {\n Memory-Based Neighbourhood Embedding for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2019\n} \n}" }, { "title": "Mesh R-CNN", @@ -19788,7 +20416,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gkioxari_Mesh_R-CNN_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gkioxari_Mesh_R-CNN_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Gkioxari_2019_ICCV,\n \n author = {\n Gkioxari,\n Georgia and Malik,\n Jitendra and Johnson,\n Justin\n},\n title = {\n Mesh R-CNN\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Meta R-CNN: Towards General Solver for Instance-Level Low-Shot Learning", @@ -19812,7 +20441,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yan_Meta_R-CNN_Towards_General_Solver_for_Instance-Level_Low-Shot_Learning_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yan_Meta_R-CNN_Towards_General_Solver_for_Instance-Level_Low-Shot_Learning_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yan_2019_ICCV,\n \n author = {\n Yan,\n Xiaopeng and Chen,\n Ziliang and Xu,\n Anni and Wang,\n Xiaoxi and Liang,\n Xiaodan and Lin,\n Liang\n},\n title = {\n Meta R-CNN: Towards General Solver for Instance-Level Low-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Meta-Learning to Detect Rare Objects", @@ -19845,7 +20475,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Yu-Xiong and Ramanan,\n Deva and Hebert,\n Martial\n},\n title = {\n Meta-Learning to Detect Rare Objects\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Meta-Sim: Learning to Generate Synthetic Datasets", @@ -19869,7 +20500,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kar_Meta-Sim_Learning_to_Generate_Synthetic_Datasets_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kar_Meta-Sim_Learning_to_Generate_Synthetic_Datasets_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Kar_2019_ICCV,\n \n author = {\n Kar,\n Amlan and Prakash,\n Aayush and Liu,\n Ming-Yu and Cameracci,\n Eric and Yuan,\n Justin and Rusiniak,\n Matt and Acuna,\n David and Torralba,\n Antonio and Fidler,\n Sanja\n},\n title = {\n Meta-Sim: Learning to Generate Synthetic Datasets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MetaPruning: Meta Learning for Automatic Neural Network Channel Pruning", @@ -19893,7 +20525,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_MetaPruning_Meta_Learning_for_Automatic_Neural_Network_Channel_Pruning_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_MetaPruning_Meta_Learning_for_Automatic_Neural_Network_Channel_Pruning_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Zechun and Mu,\n Haoyuan and Zhang,\n Xiangyu and Guo,\n Zichao and Yang,\n Xin and Cheng,\n Kwang-Ting and Sun,\n Jian\n},\n title = {\n MetaPruning: Meta Learning for Automatic Neural Network Channel Pruning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MeteorNet: Deep 
Learning on Dynamic 3D Point Cloud Sequences", @@ -19917,7 +20550,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_MeteorNet_Deep_Learning_on_Dynamic_3D_Point_Cloud_Sequences_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_MeteorNet_Deep_Learning_on_Dynamic_3D_Point_Cloud_Sequences_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Xingyu and Yan,\n Mengyuan and Bohg,\n Jeannette\n},\n title = {\n MeteorNet: Deep Learning on Dynamic 3D Point Cloud Sequences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Metric Learning With HORDE: High-Order Regularizer for Deep Embeddings", @@ -19929,7 +20563,7 @@ "author": "Pierre Jacob; David Picard; Aymeric Histace; Edouard Klein", "abstract": "Learning an effective similarity measure between image representations is key to the success of recent advances in visual search tasks (e.g. verification or zero-shot learning). Although the metric learning part is well addressed, this metric is usually computed over the average of the extracted deep features. This representation is then trained to be discriminative. However, these deep features tend to be scattered across the feature space. Consequently, the representations are not robust to outliers, object occlusions, background variations, etc. In this paper, we tackle this scattering problem with a distribution-aware regularization named HORDE. This regularizer enforces visually-close images to have deep features with the same distribution which are well localized in the feature space. We provide a theoretical analysis supporting this regularization effect. 
We also show the effectiveness of our approach by obtaining state-of-the-art results on 4 well-known datasets (Cub-200-2011, Cars-196, Stanford Online Products and Inshop Clothes Retrieval).", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Jacob_Metric_Learning_With_HORDE_High-Order_Regularizer_for_Deep_Embeddings_ICCV_2019_paper.pdf", - "aff": "ETIS UMR 8051, Universit \u00b4e Paris Seine, UCP, ENSEA, CNRS, F-95000, Cergy, France; ETIS UMR 8051, Universit \u00b4e Paris Seine, UCP, ENSEA, CNRS, F-95000, Cergy, France + LIGM, UMR 8049, \u00b4Ecole des Ponts, UPE, Champs-sur-Marne, France; ETIS UMR 8051, Universit \u00b4e Paris Seine, UCP, ENSEA, CNRS, F-95000, Cergy, France; C3N, P \u02c6ole Judiciaire de la Gendarmerie Nationale, 5 boulevard de l\u2019Hautil, 95000 Cergy, France", + "aff": "ETIS UMR 8051, Universit ´e Paris Seine, UCP, ENSEA, CNRS, F-95000, Cergy, France; ETIS UMR 8051, Universit ´e Paris Seine, UCP, ENSEA, CNRS, F-95000, Cergy, France + LIGM, UMR 8049, ´Ecole des Ponts, UPE, Champs-sur-Marne, France; ETIS UMR 8051, Universit ´e Paris Seine, UCP, ENSEA, CNRS, F-95000, Cergy, France; C3N, P ˆole Judiciaire de la Gendarmerie Nationale, 5 boulevard de l’Hautil, 95000 Cergy, France", "project": "", "github": "https://github.com/pierre-jacob/ICCV2019-Horde", "supp": "", @@ -19943,14 +20577,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Jacob_Metric_Learning_With_HORDE_High-Order_Regularizer_for_Deep_Embeddings_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;2", - "aff_unique_norm": "Universit\u00e9 Paris Seine;Ecole des Ponts ParisTech;P\u00f4le Judiciaire de la Gendarmerie Nationale", - "aff_unique_dep": "ETIS UMR 8051;Laboratoire d'Informatique, de Gestion et des Mod\u00e8les Economiques (LIGM);C3N", + "aff_unique_norm": "Université Paris Seine;Ecole des Ponts ParisTech;Pôle Judiciaire de la Gendarmerie Nationale", + "aff_unique_dep": "ETIS UMR 8051;Laboratoire d'Informatique, de Gestion et des 
Modèles Economiques (LIGM);C3N", "aff_unique_url": "https://www.universite-paris-seine.fr;https://www.ponts.org;", "aff_unique_abbr": "UPS;ENPC;", "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "Cergy;Champs-sur-Marne;", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Jacob_2019_ICCV,\n \n author = {\n Jacob,\n Pierre and Picard,\n David and Histace,\n Aymeric and Klein,\n Edouard\n},\n title = {\n Metric Learning With HORDE: High-Order Regularizer for Deep Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Micro-Baseline Structured Light", @@ -19983,7 +20618,8 @@ "aff_campus_unique_index": "0+1;0;2;0", "aff_campus_unique": "New York City;Pittsburgh;Madison", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Saragadam_2019_ICCV,\n \n author = {\n Saragadam,\n Vishwanath and Wang,\n Jian and Gupta,\n Mohit and Nayar,\n Shree\n},\n title = {\n Micro-Baseline Structured Light\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Minimum Delay Object Detection From Video", @@ -20016,7 +20652,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Lao_2019_ICCV,\n \n author = {\n Lao,\n Dong and Sundaramoorthi,\n Ganesh\n},\n title = {\n Minimum Delay Object Detection From Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Miss Detection vs. 
False Alarm: Adversarial Learning for Small Object Segmentation in Infrared Images", @@ -20040,7 +20677,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Miss_Detection_vs._False_Alarm_Adversarial_Learning_for_Small_Object_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Miss_Detection_vs._False_Alarm_Adversarial_Learning_for_Small_Object_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Huan and Zhou,\n Luping and Wang,\n Lei\n},\n title = {\n Miss Detection vs. False Alarm: Adversarial Learning for Small Object Segmentation in Infrared Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Mixed High-Order Attention Network for Person Re-Identification", @@ -20073,7 +20711,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Binghui and Deng,\n Weihong and Hu,\n Jiani\n},\n title = {\n Mixed High-Order Attention Network for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Mixture-Kernel Graph Attention Network for Situation Recognition", @@ -20106,7 +20745,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Suhail_2019_ICCV,\n \n author = {\n Suhail,\n Mohammed and Sigal,\n Leonid\n},\n title = {\n Mixture-Kernel Graph Attention Network for Situation Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Modeling Inter and Intra-Class Relations in the Triplet Loss for Zero-Shot Learning", @@ -20114,11 +20754,11 @@ "status": "Poster", "track": "main", "pid": "1023", - "author_site": "Yannick Le Cacheux, Herv\u00c3\u00a9 Le Borgne, Michel Crucianu", + "author_site": "Yannick Le Cacheux, Hervé Le Borgne, Michel Crucianu", "author": "Yannick Le Cacheux; Herve Le Borgne; Michel Crucianu", "abstract": "Recognizing visual unseen classes, i.e. for which no training data is available, is known as Zero Shot Learning (ZSL). Some of the best performing methods apply the triplet loss to seen classes to learn a mapping between visual representations of images and attribute vectors that constitute class prototypes. They nevertheless make several implicit assumptions that limit their performance on real use cases, particularly with fine-grained datasets comprising a large number of classes. We identify three of these assumptions and put forward corresponding novel contributions to address them. Our approach consists in taking into account both inter-class and intra-class relations, respectively by being more permissive with confusions between similar classes, and by penalizing visual samples which are atypical to their class. 
The approach is tested on four datasets, including the large-scale ImageNet, and exhibits performances significantly above recent methods, even generative methods based on more restrictive hypotheses.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Le_Cacheux_Modeling_Inter_and_Intra-Class_Relations_in_the_Triplet_Loss_for_ICCV_2019_paper.pdf", - "aff": "CEA LIST; CEA LIST; CEDRIC \u2013 CNAM", + "aff": "CEA LIST; CEA LIST; CEDRIC – CNAM", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Le_Cacheux_Modeling_Inter_and_ICCV_2019_supplemental.pdf", @@ -20132,14 +20772,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Le_Cacheux_Modeling_Inter_and_Intra-Class_Relations_in_the_Triplet_Loss_for_ICCV_2019_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "CEA LIST;Conservatoire National des Arts et M\u00e9tiers", + "aff_unique_norm": "CEA LIST;Conservatoire National des Arts et Métiers", "aff_unique_dep": ";CEDRIC", "aff_unique_url": "https://www-list.cea.fr;https://www.cnam.fr", "aff_unique_abbr": "CEA LIST;CNAM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Cacheux_2019_ICCV,\n \n author = {\n Cacheux,\n Yannick Le and Borgne,\n Herve Le and Crucianu,\n Michel\n},\n title = {\n Modeling Inter and Intra-Class Relations in the Triplet Loss for Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Moment Matching for Multi-Source Domain Adaptation", @@ -20172,7 +20813,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;2", - "aff_country_unique": "United States;China;Canada" + "aff_country_unique": "United States;China;Canada", + "bibtex": 
"@InProceedings{Peng_2019_ICCV,\n \n author = {\n Peng,\n Xingchao and Bai,\n Qinxun and Xia,\n Xide and Huang,\n Zijun and Saenko,\n Kate and Wang,\n Bo\n},\n title = {\n Moment Matching for Multi-Source Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Mono-SF: Multi-View Geometry Meets Single-View Depth for Monocular Scene Flow Estimation of Dynamic Traffic Scenes", @@ -20205,7 +20847,8 @@ "aff_campus_unique_index": "1;2+1", "aff_campus_unique": ";Frankfurt;Trondheim", "aff_country_unique_index": "0+0;0;1+0", - "aff_country_unique": "Germany;Norway" + "aff_country_unique": "Germany;Norway", + "bibtex": "@InProceedings{Brickwedde_2019_ICCV,\n \n author = {\n Brickwedde,\n Fabian and Abraham,\n Steffen and Mester,\n Rudolf\n},\n title = {\n Mono-SF: Multi-View Geometry Meets Single-View Depth for Monocular Scene Flow Estimation of Dynamic Traffic Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MonoLoco: Monocular 3D Pedestrian Localization and Uncertainty Estimation", @@ -20229,7 +20872,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bertoni_MonoLoco_Monocular_3D_Pedestrian_Localization_and_Uncertainty_Estimation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bertoni_MonoLoco_Monocular_3D_Pedestrian_Localization_and_Uncertainty_Estimation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Bertoni_2019_ICCV,\n \n author = {\n Bertoni,\n Lorenzo and Kreiss,\n Sven and Alahi,\n Alexandre\n},\n title = {\n MonoLoco: Monocular 3D Pedestrian Localization and Uncertainty Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Monocular 3D Human Pose Estimation by Generation and Ordinal Ranking", @@ -20262,7 +20906,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Germany;India;" + "aff_country_unique": "Germany;India;", + "bibtex": "@InProceedings{Sharma_2019_ICCV,\n \n author = {\n Sharma,\n Saurabh and Varigonda,\n Pavan Teja and Bindal,\n Prashast and Sharma,\n Abhishek and Jain,\n Arjun\n},\n title = {\n Monocular 3D Human Pose Estimation by Generation and Ordinal Ranking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Monocular Neural Image Based Rendering With Continuous View Control", @@ -20295,7 +20940,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Xu and Song,\n Jie and Hilliges,\n Otmar\n},\n title = {\n Monocular Neural Image Based Rendering With Continuous View Control\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Monocular Piecewise Depth Estimation in Dynamic Scenes by Exploiting Superpixel Relations", @@ -20319,7 +20965,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Di_Monocular_Piecewise_Depth_Estimation_in_Dynamic_Scenes_by_Exploiting_Superpixel_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Di_Monocular_Piecewise_Depth_Estimation_in_Dynamic_Scenes_by_Exploiting_Superpixel_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Di_2019_ICCV,\n \n author = {\n Di,\n Yan and Morimitsu,\n Henrique and 
Gao,\n Shan and Ji,\n Xiangyang\n},\n title = {\n Monocular Piecewise Depth Estimation in Dynamic Scenes by Exploiting Superpixel Relations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Mop Moire Patterns Using MopNet", @@ -20345,14 +20992,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Mop_Moire_Patterns_Using_MopNet_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory", - "aff_unique_dep": "National Engineering Lab for Video Technology;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory", + "aff_unique_dep": "National Engineering Lab for Video Technology;", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": "0;0;0+1;0+1", "aff_campus_unique": "Beijing;Shenzhen", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Bin and Wang,\n Ce and Shi,\n Boxin and Duan,\n Ling-Yu\n},\n title = {\n Mop Moire Patterns Using MopNet\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Motion Guided Attention for Video Salient Object Detection", @@ -20378,14 +21026,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Motion_Guided_Attention_for_Video_Salient_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;1;1+2;0+2", - "aff_unique_norm": "University of Hong Kong;Sun Yat-sen University;Deepwise AI Lab", + "aff_unique_norm": "The University of Hong Kong;Sun Yat-sen University;Deepwise AI Lab", "aff_unique_dep": ";;AI Lab", "aff_unique_url": "https://www.hku.hk;http://www.sysu.edu.cn/;", 
"aff_unique_abbr": "HKU;SYSU;", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Haofeng and Chen,\n Guanqi and Li,\n Guanbin and Yu,\n Yizhou\n},\n title = {\n Motion Guided Attention for Video Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Moulding Humans: Non-Parametric 3D Human Shape Estimation From Single Images", @@ -20393,11 +21042,11 @@ "status": "Poster", "track": "main", "pid": "5728", - "author_site": "Valentin Gabeur, Jean-S\u00c3\u00a9bastien Franco, Xavier Martin, Cordelia Schmid, Gr\u00c3\u00a9gory Rogez", + "author_site": "Valentin Gabeur, Jean-Sébastien Franco, Xavier Martin, Cordelia Schmid, Grégory Rogez", "author": "Valentin Gabeur; Jean-Sebastien Franco; Xavier Martin; Cordelia Schmid; Gregory Rogez", "abstract": "In this paper, we tackle the problem of 3D human shape estimation from single RGB images. While the recent progress in convolutional neural networks has allowed impressive results for 3D human pose estimation, estimating the full 3D shape of a person is still an open issue. Model-based approaches can output precise meshes of naked under-cloth human bodies but fail to estimate details and un-modelled elements such as hair or clothing. On the other hand, non-parametric volumetric approaches can potentially estimate complete shapes but, in practice, they are limited by the resolution of the output grid and cannot produce detailed estimates. In this work, we propose a non-parametric approach that employs a double depth map to represent the 3D shape of a person: a visible depth map and a \"hidden\" depth map are estimated and combined, to reconstruct the human 3D shape as done with a \"mould\". 
This representation through 2D depth maps allows a higher resolution output with a much lower dimension than voxel-based volumetric representations. Additionally, our fully derivable depth-based model allows us to efficiently incorporate a discriminator in an adversarial fashion to improve the accuracy and \"humanness\" of the 3D output. We train and quantitatively validate our approach on SURREAL and on 3D-HUMANS, a new photorealistic dataset made of semi-synthetic in-house videos annotated with 3D ground truth surfaces.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Gabeur_Moulding_Humans_Non-Parametric_3D_Human_Shape_Estimation_From_Single_Images_ICCV_2019_paper.pdf", - "aff": "Inria*; Inria*; Inria*; Inria*; NAVER LABS Europe\u2020", + "aff": "Inria*; Inria*; Inria*; Inria*; NAVER LABS Europe†", "project": "", "github": "", "supp": "", @@ -20411,14 +21060,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gabeur_Moulding_Humans_Non-Parametric_3D_Human_Shape_Estimation_From_Single_Images_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;1", - "aff_unique_norm": "INRIA;NAVER LABS Europe", + "aff_unique_norm": "Inria;NAVER LABS Europe", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.naverlabs.com/europe", "aff_unique_abbr": "Inria;NAVER LABS Europe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "France;Unknown" + "aff_country_unique": "France;Unknown", + "bibtex": "@InProceedings{Gabeur_2019_ICCV,\n \n author = {\n Gabeur,\n Valentin and Franco,\n Jean-Sebastien and Martin,\n Xavier and Schmid,\n Cordelia and Rogez,\n Gregory\n},\n title = {\n Moulding Humans: Non-Parametric 3D Human Shape Estimation From Single Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Moving Indoor: 
Unsupervised Video Depth Learning in Challenging Environments", @@ -20442,7 +21092,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Moving_Indoor_Unsupervised_Video_Depth_Learning_in_Challenging_Environments_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Moving_Indoor_Unsupervised_Video_Depth_Learning_in_Challenging_Environments_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Junsheng and Wang,\n Yuwang and Qin,\n Kaihuai and Zeng,\n Wenjun\n},\n title = {\n Moving Indoor: Unsupervised Video Depth Learning in Challenging Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-Adversarial Faster-RCNN for Unrestricted Object Detection", @@ -20466,7 +21117,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Multi-Adversarial_Faster-RCNN_for_Unrestricted_Object_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Multi-Adversarial_Faster-RCNN_for_Unrestricted_Object_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Zhenwei and Zhang,\n Lei\n},\n title = {\n Multi-Adversarial Faster-RCNN for Unrestricted Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-Agent Reinforcement Learning Based Frame Sampling for Effective Untrimmed Video Recognition", @@ -20492,14 +21144,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wu_Multi-Agent_Reinforcement_Learning_Based_Frame_Sampling_for_Effective_Untrimmed_Video_ICCV_2019_paper.html", "aff_unique_index": 
"0+1;2;2;0;2", - "aff_unique_norm": "Shenzhen Institute of Advanced Technology;University of Chinese Academy of Sciences;Baidu", + "aff_unique_norm": "Shenzhen Institutes of Advanced Technology;University of Chinese Academy of Sciences;Baidu Inc.", "aff_unique_dep": ";;Department of Computer Vision Technology (VIS)", "aff_unique_url": "http://www.siat.cas.cn;http://www.ucas.ac.cn;https://www.baidu.com", "aff_unique_abbr": "SIAT;UCAS;Baidu", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Wenhao and He,\n Dongliang and Tan,\n Xiao and Chen,\n Shifeng and Wen,\n Shilei\n},\n title = {\n Multi-Agent Reinforcement Learning Based Frame Sampling for Effective Untrimmed Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-Angle Point Cloud-VAE: Unsupervised Feature Learning for 3D Point Clouds From Multiple Angles by Joint Self-Reconstruction and Half-to-Half Prediction", @@ -20523,7 +21176,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Han_Multi-Angle_Point_Cloud-VAE_Unsupervised_Feature_Learning_for_3D_Point_Clouds_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Han_Multi-Angle_Point_Cloud-VAE_Unsupervised_Feature_Learning_for_3D_Point_Clouds_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Han_2019_ICCV,\n \n author = {\n Han,\n Zhizhong and Wang,\n Xiyang and Liu,\n Yu-Shen and Zwicker,\n Matthias\n},\n title = {\n Multi-Angle Point Cloud-VAE: Unsupervised Feature Learning for 3D Point Clouds From Multiple Angles by Joint Self-Reconstruction and Half-to-Half Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-Class Part Parsing With Joint Boundary-Semantic Awareness", @@ -20549,14 +21203,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_Multi-Class_Part_Parsing_With_Joint_Boundary-Semantic_Awareness_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;2+1", - "aff_unique_norm": "Beihang University;Pengcheng Laboratory;Peking University", - "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;Peng Cheng Laboratory;School of EE&CS", + "aff_unique_norm": "Beihang University;Peng Cheng Laboratory;Peking University", + "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;;School of EE&CS", "aff_unique_url": "http://www.buaa.edu.cn;;http://www.pku.edu.cn", "aff_unique_abbr": "Beihang;;PKU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Yifan and Li,\n Jia and Zhang,\n Yu and Tian,\n Yonghong\n},\n title = {\n Multi-Class Part Parsing With Joint Boundary-Semantic Awareness\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-Garment Net: Learning to Dress 3D People From Images", @@ -20589,7 +21244,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Saarland", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Bhatnagar_2019_ICCV,\n \n author = {\n Bhatnagar,\n Bharat Lal and Tiwari,\n Garvita and Theobalt,\n Christian and Pons-Moll,\n Gerard\n},\n title = {\n Multi-Garment Net: Learning to Dress 3D People From Images\n},\n booktitle 
= {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-Level Bottom-Top and Top-Bottom Feature Fusion for Crowd Counting", @@ -20622,7 +21278,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Baltimore", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sindagi_2019_ICCV,\n \n author = {\n Sindagi,\n Vishwanath A. and Patel,\n Vishal M.\n},\n title = {\n Multi-Level Bottom-Top and Top-Bottom Feature Fusion for Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-Modality Latent Interaction Network for Visual Question Answering", @@ -20648,14 +21305,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gao_Multi-Modality_Latent_Interaction_Network_for_Visual_Question_Answering_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;Tsinghua University;SenseTime", + "aff_unique_norm": "The Chinese University of Hong Kong;Tsinghua University;SenseTime", "aff_unique_dep": "CUHK-SenseTime Joint Lab;;SenseTime Research", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tsinghua.edu.cn;https://www.sensetime.com", "aff_unique_abbr": "CUHK;THU;SenseTime", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Peng and You,\n Haoxuan and Zhang,\n Zhanpeng and Wang,\n Xiaogang and Li,\n Hongsheng\n},\n title = {\n Multi-Modality Latent Interaction Network for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "f6da4ed074", @@ -20684,7 +21342,8 @@ "aff_campus_unique_index": ";;;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0+0;0+0;0+0;0+0;0+0+0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Takahama_2019_ICCV,\n \n author = {\n Takahama,\n Shusuke and Kurose,\n Yusuke and Mukuta,\n Yusuke and Abe,\n Hiroyuki and Fukayama,\n Masashi and Yoshizawa,\n Akihiko and Kitagawa,\n Masanobu and Harada,\n Tatsuya\n},\n title = {\n Multi-Stage Pathological Image Classification Using Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-View Image Fusion", @@ -20708,7 +21367,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Trinidad_Multi-View_Image_Fusion_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Trinidad_Multi-View_Image_Fusion_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Trinidad_2019_ICCV,\n \n author = {\n Trinidad,\n Marc Comino and Brualla,\n Ricardo Martin and Kainz,\n Florian and Kontkanen,\n Janne\n},\n title = {\n Multi-View Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multi-View Stereo by Temporal Nonparametric Fusion", @@ -20741,7 +21401,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Finland" + "aff_country_unique": "Finland", + "bibtex": "@InProceedings{Hou_2019_ICCV,\n \n author = {\n Hou,\n Yuxin and Kannala,\n Juho and Solin,\n Arno\n},\n title = {\n Multi-View Stereo by Temporal Nonparametric Fusion\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "MultiSeg: Semantically Meaningful, Scale-Diverse Segmentations From Minimal User Input", @@ -20774,7 +21435,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Liew_2019_ICCV,\n \n author = {\n Liew,\n Jun Hao and Cohen,\n Scott and Price,\n Brian and Mai,\n Long and Ong,\n Sim-Heng and Feng,\n Jiashi\n},\n title = {\n MultiSeg: Semantically Meaningful,\n Scale-Diverse Segmentations From Minimal User Input\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multimodal Style Transfer via Graph Cuts", @@ -20798,7 +21460,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Multimodal_Style_Transfer_via_Graph_Cuts_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Multimodal_Style_Transfer_via_Graph_Cuts_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Yulun and Fang,\n Chen and Wang,\n Yilin and Wang,\n Zhaowen and Lin,\n Zhe and Fu,\n Yun and Yang,\n Jimei\n},\n title = {\n Multimodal Style Transfer via Graph Cuts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Multinomial Distribution Learning for Effective Neural Architecture Search", @@ -20810,7 +21473,7 @@ "author": "Xiawu Zheng; Rongrong Ji; Lang Tang; Baochang Zhang; Jianzhuang Liu; Qi Tian", "abstract": "Architectures obtained by Neural Architecture Search (NAS) have achieved highly 
competitive performance in various computer vision tasks. However, the prohibitive computation demand of forward-backward propagation in deep neural networks and searching algorithms makes it difficult to apply NAS in practice. In this paper, we propose a Multinomial Distribution Learning for extremely effective NAS, which considers the search space as a joint multinomial distribution, i.e., the operation between two nodes is sampled from this distribution, and the optimal network structure is obtained by the operations with the most likely probability in this distribution. Therefore, NAS can be transformed to a multinomial distribution learning problem, i.e., the distribution is optimized to have a high expectation of the performance. Besides, a hypothesis that the performance ranking is consistent in every training epoch is proposed and demonstrated to further accelerate the learning process. Experiments on CIFAR-10 and ImageNet demonstrate the effectiveness of our method. On CIFAR-10, the structure searched by our method achieves 2.55% test error, while being 6.0x (only 4 GPU hours on GTX1080Ti) faster compared with state-of-the-art NAS algorithms. On ImageNet, our model achieves 74% top1 accuracy under MobileNet settings (MobileNet V1/V2), while being 1.2x faster with measured GPU latency. 
Test code with pre-trained models are available at https: //github.com/tanglang96/MDENAS", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zheng_Multinomial_Distribution_Learning_for_Effective_Neural_Architecture_Search_ICCV_2019_paper.pdf", - "aff": "Media Analytics and Computing Lab, Department of Artificial Intelligence, School of Informatics, Xiamen University, 361005, China + Peng Cheng Laboratory, Shenzhen, China; Media Analytics and Computing Lab, Department of Artificial Intelligence, School of Informatics, Xiamen University, 361005, China + Peng Cheng Laboratory, Shenzhen, China; Media Analytics and Computing Lab, Department of Artificial Intelligence, School of Informatics, Xiamen University, 361005, China + Peng Cheng Laboratory, Shenzhen, China; Beihang University, China; Huawei Noah\u2019s Ark Lab; Department of Computer Science, University of Texas at San Antonio", + "aff": "Media Analytics and Computing Lab, Department of Artificial Intelligence, School of Informatics, Xiamen University, 361005, China + Peng Cheng Laboratory, Shenzhen, China; Media Analytics and Computing Lab, Department of Artificial Intelligence, School of Informatics, Xiamen University, 361005, China + Peng Cheng Laboratory, Shenzhen, China; Media Analytics and Computing Lab, Department of Artificial Intelligence, School of Informatics, Xiamen University, 361005, China + Peng Cheng Laboratory, Shenzhen, China; Beihang University, China; Huawei Noah’s Ark Lab; Department of Computer Science, University of Texas at San Antonio", "project": "", "github": "https://github.com/tanglang96/MDENAS", "supp": "", @@ -20824,14 +21487,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zheng_Multinomial_Distribution_Learning_for_Effective_Neural_Architecture_Search_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;3;4", - "aff_unique_norm": "Xiamen University;Pengcheng Laboratory;Beihang University;Huawei;University of Texas at San Antonio", 
- "aff_unique_dep": "Department of Artificial Intelligence;Peng Cheng Laboratory;;Noah\u2019s Ark Lab;Department of Computer Science", + "aff_unique_norm": "Xiamen University;Peng Cheng Laboratory;Beihang University;Huawei;University of Texas at San Antonio", + "aff_unique_dep": "Department of Artificial Intelligence;;;Noah’s Ark Lab;Department of Computer Science", "aff_unique_url": "https://www.xmu.edu.cn;;http://www.buaa.edu.cn/;https://www.huawei.com;https://www.utsa.edu", "aff_unique_abbr": "XMU;;BUAA;Huawei;UTSA", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Shenzhen;San Antonio", "aff_country_unique_index": "0+0;0+0;0+0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zheng_2019_ICCV,\n \n author = {\n Zheng,\n Xiawu and Ji,\n Rongrong and Tang,\n Lang and Zhang,\n Baochang and Liu,\n Jianzhuang and Tian,\n Qi\n},\n title = {\n Multinomial Distribution Learning for Effective Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "NLNL: Negative Learning for Noisy Labels", @@ -20864,7 +21528,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2019_ICCV,\n \n author = {\n Kim,\n Youngdong and Yim,\n Junho and Yun,\n Juseung and Kim,\n Junmo\n},\n title = {\n NLNL: Negative Learning for Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "NOTE-RCNN: NOise Tolerant Ensemble RCNN for Semi-Supervised Object Detection", @@ -20891,13 +21556,14 @@ "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Gao_NOTE-RCNN_NOise_Tolerant_Ensemble_RCNN_for_Semi-Supervised_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2;0", "aff_unique_norm": "University of Southern California;Google;Stanford University", - "aff_unique_dep": ";Google;", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.usc.edu;https://www.google.com;https://www.stanford.edu", "aff_unique_abbr": "USC;Google;Stanford", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Los Angeles;;Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Jiyang and Wang,\n Jiang and Dai,\n Shengyang and Li,\n Li-Jia and Nevatia,\n Ram\n},\n title = {\n NOTE-RCNN: NOise Tolerant Ensemble RCNN for Semi-Supervised Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Neighborhood Preserving Hashing for Scalable Video Retrieval", @@ -20930,7 +21596,8 @@ "aff_campus_unique_index": "1;;;1;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0+0;0+0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Shuyan and Chen,\n Zhixiang and Lu,\n Jiwen and Li,\n Xiu and Zhou,\n Jie\n},\n title = {\n Neighborhood Preserving Hashing for Scalable Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Neural 3D Morphable Models: Spiral Convolutional Networks for 3D Shape Representation Learning and Generation", @@ -20963,7 +21630,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0;2+3;0;0+2+3;0", - 
"aff_country_unique": "United Kingdom;;Switzerland;United States" + "aff_country_unique": "United Kingdom;;Switzerland;United States", + "bibtex": "@InProceedings{Bouritsas_2019_ICCV,\n \n author = {\n Bouritsas,\n Giorgos and Bokhnyak,\n Sergiy and Ploumpis,\n Stylianos and Bronstein,\n Michael and Zafeiriou,\n Stefanos\n},\n title = {\n Neural 3D Morphable Models: Spiral Convolutional Networks for 3D Shape Representation Learning and Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Neural Inter-Frame Compression for Video Coding", @@ -20991,12 +21659,13 @@ "aff_unique_index": "0;0;0+1;0", "aff_unique_norm": "Disney Research;ETH Zurich", "aff_unique_dep": "Studios;Department of Computer Science", - "aff_unique_url": "https://www.disneyresearch.com;https://www.ethz.ch", + "aff_unique_url": "https://research.disney.com;https://www.ethz.ch", "aff_unique_abbr": "Disney Research;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Djelouah_2019_ICCV,\n \n author = {\n Djelouah,\n Abdelaziz and Campos,\n Joaquim and Schaub-Meyer,\n Simone and Schroers,\n Christopher\n},\n title = {\n Neural Inter-Frame Compression for Video Coding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Neural Inverse Rendering of an Indoor Scene From a Single Image", @@ -21022,14 +21691,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sengupta_Neural_Inverse_Rendering_of_an_Indoor_Scene_From_a_Single_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;0+3;0;0;1;0", - "aff_unique_norm": "NVIDIA;University of Maryland;University of 
Washington;SenseTime", - "aff_unique_dep": "NVIDIA Corporation;;;", + "aff_unique_norm": "NVIDIA Corporation;University of Maryland;University of Washington;SenseTime", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nvidia.com;https://www/umd.edu;https://www.washington.edu;https://www.sensetime.com", "aff_unique_abbr": "NVIDIA;UMD;UW;SenseTime", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0+0+0;0+1;0;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Sengupta_2019_ICCV,\n \n author = {\n Sengupta,\n Soumyadip and Gu,\n Jinwei and Kim,\n Kihwan and Liu,\n Guilin and Jacobs,\n David W. and Kautz,\n Jan\n},\n title = {\n Neural Inverse Rendering of an Indoor Scene From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Neural Re-Simulation for Generating Bounces in Single Images", @@ -21062,7 +21732,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0+1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Innamorati_2019_ICCV,\n \n author = {\n Innamorati,\n Carlo and Russell,\n Bryan and Kaufman,\n Danny M. 
and Mitra,\n Niloy J.\n},\n title = {\n Neural Re-Simulation for Generating Bounces in Single Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Neural Turtle Graphics for Modeling City Road Layouts", @@ -21088,14 +21759,15 @@ "author_num": 9, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chu_Neural_Turtle_Graphics_for_Modeling_City_Road_Layouts_ICCV_2019_paper.html", "aff_unique_index": "0+1+2;2;0+1+2;0+1+2;0+1+2;0+2;2;3;0+1+2", - "aff_unique_norm": "University of Toronto;Vector Institute;NVIDIA;Massachusetts Institute of Technology", - "aff_unique_dep": ";;NVIDIA Corporation;", + "aff_unique_norm": "University of Toronto;Vector Institute;NVIDIA Corporation;Massachusetts Institute of Technology", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.nvidia.com;https://web.mit.edu", "aff_unique_abbr": "U of T;Vector Institute;NVIDIA;MIT", "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;1;0+0+1;0+0+1;0+0+1;0+1;1;1;0+0+1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Chu_2019_ICCV,\n \n author = {\n Chu,\n Hang and Li,\n Daiqing and Acuna,\n David and Kar,\n Amlan and Shugrina,\n Maria and Wei,\n Xinkai and Liu,\n Ming-Yu and Torralba,\n Antonio and Fidler,\n Sanja\n},\n title = {\n Neural Turtle Graphics for Modeling City Road Layouts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Neural-Guided RANSAC: Learning Where to Sample Model Hypotheses", @@ -21128,7 +21800,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", - "aff_country_unique": ";Germany" + "aff_country_unique": ";Germany", 
+ "bibtex": "@InProceedings{Brachmann_2019_ICCV,\n \n author = {\n Brachmann,\n Eric and Rother,\n Carsten\n},\n title = {\n Neural-Guided RANSAC: Learning Where to Sample Model Hypotheses\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "New Convex Relaxations for MRF Inference With Unknown Graphs", @@ -21140,7 +21813,7 @@ "author": "Zhenhua Wang; Tong Liu; Qinfeng Shi; M. Pawan Kumar; Jianhua Zhang", "abstract": "Treating graph structures of Markov random fields as unknown and estimating them jointly with labels have been shown to be useful for modeling human activity recognition and other related tasks. We propose two novel relaxations for solving this problem. The first is a linear programming (LP) relaxation, which is provably tighter than the existing LP relaxation. The second is a non-convex quadratic programming (QP) relaxation, which admits an efficient concave-convex procedure (CCCP). The CCCP algorithm is initialized by solving a convex QP relaxation of the problem, which is obtained by modifying the diagonal of the matrix that specifies the non-convex QP relaxation. We show that our convex QP relaxation is optimal in the sense that it minimizes the L1 norm of the diagonal modification vector. While the convex QP relaxation is not as tight as the existing and the new LP relaxations, when used in conjunction with the CCCP algorithm for the non-convex QP relaxation, it provides accurate solutions. 
We demonstrate the efficacy of our new relaxations for both synthetic data and human activity recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wang_New_Convex_Relaxations_for_MRF_Inference_With_Unknown_Graphs_ICCV_2019_paper.pdf", - "aff": "Zhejiang University of Technology\u2020; Zhejiang University of Technology\u2020; The University of Adelaide\u2021; University of Oxford\u266f; Zhejiang University of Technology\u2020", + "aff": "Zhejiang University of Technology†; Zhejiang University of Technology†; The University of Adelaide‡; University of Oxford♯; Zhejiang University of Technology†", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Wang_New_Convex_Relaxations_ICCV_2019_supplemental.pdf", @@ -21154,14 +21827,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_New_Convex_Relaxations_for_MRF_Inference_With_Unknown_Graphs_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "Zhejiang University of Technology;University of Adelaide;University of Oxford", + "aff_unique_norm": "Zhejiang University of Technology;The University of Adelaide;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zjut.edu.cn;https://www.adelaide.edu.au;https://www.ox.ac.uk", "aff_unique_abbr": "ZJUT;Adelaide;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "China;Australia;United Kingdom" + "aff_country_unique": "China;Australia;United Kingdom", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Zhenhua and Liu,\n Tong and Shi,\n Qinfeng and Kumar,\n M. 
Pawan and Zhang,\n Jianhua\n},\n title = {\n New Convex Relaxations for MRF Inference With Unknown Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "No Fear of the Dark: Image Retrieval Under Varying Illumination Conditions", @@ -21169,7 +21843,7 @@ "status": "Poster", "track": "main", "pid": "4874", - "author_site": "Tomas Jenicek, Ond\u00c5\u0099ej Chum", + "author_site": "Tomas Jenicek, Ondřej Chum", "author": "Tomas Jenicek; Ondrej Chum", "abstract": "Image retrieval under varying illumination conditions, such as day and night images, is addressed by image preprocessing, both hand-crafted and learned. Prior to extracting image descriptors by a convolutional neural network, images are photometrically normalised in order to reduce the descriptor sensitivity to illumination changes. We propose a learnable normalisation based on the U-Net architecture, which is trained on a combination of single-camera multi-exposure images and a newly constructed collection of similar views of landmarks during day and night. 
We experimentally show that both hand-crafted normalisation based on local histogram equalisation and the learnable normalisation outperform standard approaches in varying illumination conditions, while staying on par with the state-of-the-art methods on daylight illumination benchmarks, such as Oxford or Paris datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Jenicek_No_Fear_of_the_Dark_Image_Retrieval_Under_Varying_Illumination_ICCV_2019_paper.pdf", @@ -21194,7 +21868,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Jenicek_2019_ICCV,\n \n author = {\n Jenicek,\n Tomas and Chum,\n Ondrej\n},\n title = {\n No Fear of the Dark: Image Retrieval Under Varying Illumination Conditions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "No-Frills Human-Object Interaction Detection: Factorization, Layout Encodings, and Training Techniques", @@ -21220,14 +21895,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gupta_No-Frills_Human-Object_Interaction_Detection_Factorization_Layout_Encodings_and_Training_Techniques_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gupta_2019_ICCV,\n \n author = {\n Gupta,\n Tanmay and Schwing,\n Alexander and Hoiem,\n Derek\n},\n title = {\n No-Frills Human-Object 
Interaction Detection: Factorization,\n Layout Encodings,\n and Training Techniques\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Noise Flow: Noise Modeling With Conditional Normalizing Flows", @@ -21250,7 +21926,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Abdelhamed_Noise_Flow_Noise_Modeling_With_Conditional_Normalizing_Flows_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Abdelhamed_Noise_Flow_Noise_Modeling_With_Conditional_Normalizing_Flows_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Abdelhamed_2019_ICCV,\n \n author = {\n Abdelhamed,\n Abdelrahman and Brubaker,\n Marcus A. and Brown,\n Michael S.\n},\n title = {\n Noise Flow: Noise Modeling With Conditional Normalizing Flows\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Non-Local ConvLSTM for Video Compression Artifact Reduction", @@ -21283,7 +21960,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Yi and Gao,\n Longwen and Tian,\n Kai and Zhou,\n Shuigeng and Sun,\n Huyang\n},\n title = {\n Non-Local ConvLSTM for Video Compression Artifact Reduction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Non-Local Intrinsic Decomposition With Near-Infrared Priors", @@ -21316,7 +21994,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "Australia;Japan" + 
"aff_country_unique": "Australia;Japan", + "bibtex": "@InProceedings{Cheng_2019_ICCV,\n \n author = {\n Cheng,\n Ziang and Zheng,\n Yinqiang and You,\n Shaodi and Sato,\n Imari\n},\n title = {\n Non-Local Intrinsic Decomposition With Near-Infrared Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Non-Local Recurrent Neural Memory for Supervised Sequence Modeling", @@ -21342,14 +22021,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fu_Non-Local_Recurrent_Neural_Memory_for_Supervised_Sequence_Modeling_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;1;0;0;1;1", - "aff_unique_norm": "Peking University;Tencent", - "aff_unique_dep": "School of ECE;Tencent Holdings Limited", + "aff_unique_norm": "Peking University;Tencent Holdings Limited", + "aff_unique_dep": "School of ECE;", "aff_unique_url": "http://www.pku.edu.cn;https://www.tencent.com", "aff_unique_abbr": "PKU;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fu_2019_ICCV,\n \n author = {\n Fu,\n Canmiao and Pei,\n Wenjie and Cao,\n Qiong and Zhang,\n Chaopeng and Zhao,\n Yong and Shen,\n Xiaoyong and Tai,\n Yu-Wing\n},\n title = {\n Non-Local Recurrent Neural Memory for Supervised Sequence Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Normalized Wasserstein for Mixture Distributions With Applications in Adversarial Learning and Domain Adaptation", @@ -21382,7 +22062,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Balaji_2019_ICCV,\n \n author = {\n Balaji,\n Yogesh and Chellappa,\n Rama and Feizi,\n Soheil\n},\n title = {\n Normalized Wasserstein for Mixture Distributions With Applications in Adversarial Learning and Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Not All Parts Are Created Equal: 3D Pose Estimation by Modeling Bi-Directional Dependencies of Body Parts", @@ -21415,7 +22096,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Darlington;", "aff_country_unique_index": "0+0;0;1;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Jue and Huang,\n Shaoli and Wang,\n Xinchao and Tao,\n Dacheng\n},\n title = {\n Not All Parts Are Created Equal: 3D Pose Estimation by Modeling Bi-Directional Dependencies of Body Parts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "O2U-Net: A Simple Noisy Label Detection Approach for Deep Neural Networks", @@ -21448,7 +22130,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Jinchi and Qu,\n Lie and Jia,\n Rongfei and Zhao,\n Binqiang\n},\n title = {\n O2U-Net: A Simple Noisy Label Detection Approach for Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Object Guided External Memory Network for Video Object Detection", @@ -21460,7 +22143,7 @@ "author": "Hanming Deng; Yang Hua; Tao Song; Zongpu Zhang; 
Zhengui Xue; Ruhui Ma; Neil Robertson; Haibing Guan", "abstract": "Video object detection is more challenging than image object detection because of the deteriorated frame quality. To enhance the feature representation, state-of-the-art methods propagate temporal information into the deteriorated frame by aligning and aggregating entire feature maps from multiple nearby frames. However, restricted by feature map's low storage-efficiency and vulnerable content-address allocation, long-term temporal information is not fully stressed by these methods. In this work, we propose the first object guided external memory network for online video object detection. Storage-efficiency is handled by object guided hard-attention to selectively store valuable features, and long-term information is protected when stored in an addressable external data matrix. A set of read/write operations are designed to accurately propagate/allocate and delete multi-level memory feature under object guidance. We evaluate our method on the ImageNet VID dataset and achieve state-of-the-art performance as well as good speed-accuracy tradeoff. 
Furthermore, by visualizing the external memory, we show the detailed object-level reasoning process across frames.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Deng_Object_Guided_External_Memory_Network_for_Video_Object_Detection_ICCV_2019_paper.pdf", - "aff": "Shanghai Jiao Tong University; Queen\u2019s University Belfast; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Queen\u2019s University Belfast; Shanghai Jiao Tong University", + "aff": "Shanghai Jiao Tong University; Queen’s University Belfast; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Queen’s University Belfast; Shanghai Jiao Tong University", "project": "", "github": "", "supp": "", @@ -21481,7 +22164,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Deng_2019_ICCV,\n \n author = {\n Deng,\n Hanming and Hua,\n Yang and Song,\n Tao and Zhang,\n Zongpu and Xue,\n Zhengui and Ma,\n Ruhui and Robertson,\n Neil and Guan,\n Haibing\n},\n title = {\n Object Guided External Memory Network for Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Object-Aware Instance Labeling for Weakly Supervised Object Detection", @@ -21507,14 +22191,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Kosugi_Object-Aware_Instance_Labeling_for_Weakly_Supervised_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Tokyo", + "aff_unique_norm": "The University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": 
"UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Kosugi_2019_ICCV,\n \n author = {\n Kosugi,\n Satoshi and Yamasaki,\n Toshihiko and Aizawa,\n Kiyoharu\n},\n title = {\n Object-Aware Instance Labeling for Weakly Supervised Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Object-Driven Multi-Layer Scene Decomposition From a Single Image", @@ -21526,7 +22211,7 @@ "author": "Helisa Dhamo; Nassir Navab; Federico Tombari", "abstract": "We present a method that tackles the challenge of predicting color and depth behind the visible content of an image. Our approach aims at building up a Layered Depth Image (LDI) from a single RGB input, which is an efficient representation that arranges the scene in layers, including originally occluded regions. Unlike previous work, we enable an adaptive scheme for the number of layers and incorporate semantic encoding for better hallucination of partly occluded objects. Additionally, our approach is object-driven, which especially boosts the accuracy for the occluded intermediate objects. The framework consists of two steps. First, we individually complete each object in terms of color and depth, while estimating the scene layout. Second, we rebuild the scene based on the regressed layers and enforce the recomposed image to resemble the structure of the original input. 
The learned representation enables various applications, such as 3D photography and diminished reality, all from a single RGB image.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Dhamo_Object-Driven_Multi-Layer_Scene_Decomposition_From_a_Single_Image_ICCV_2019_paper.pdf", - "aff": "Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen+Google", + "aff": "Technische Universität München; Technische Universität München; Technische Universität München+Google", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Dhamo_Object-Driven_Multi-Layer_Scene_ICCV_2019_supplemental.pdf", @@ -21540,14 +22225,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Dhamo_Object-Driven_Multi-Layer_Scene_Decomposition_From_a_Single_Image_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Google", - "aff_unique_dep": ";Google", + "aff_unique_norm": "Technische Universität München;Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Dhamo_2019_ICCV,\n \n author = {\n Dhamo,\n Helisa and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n Object-Driven Multi-Layer Scene Decomposition From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Objects365: A Large-Scale, High-Quality Dataset for Object Detection", @@ -21580,7 +22266,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shao_2019_ICCV,\n \n author = {\n Shao,\n Shuai and Li,\n Zeming and Zhang,\n Tianyuan and Peng,\n Chao and Yu,\n Gang and Zhang,\n Xiangyu and Li,\n Jing and Sun,\n Jian\n},\n title = {\n Objects365: A Large-Scale,\n High-Quality Dataset for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Occlusion Robust Face Recognition Based on Mask Learning With Pairwise Differential Siamese Network", @@ -21613,7 +22300,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2019_ICCV,\n \n author = {\n Song,\n Lingxue and Gong,\n Dihong and Li,\n Zhifeng and Liu,\n Changsong and Liu,\n Wei\n},\n title = {\n Occlusion Robust Face Recognition Based on Mask Learning With Pairwise Differential Siamese Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Occlusion-Aware Networks for 3D Human Pose Estimation in Video", @@ -21646,7 +22334,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0+0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Cheng_2019_ICCV,\n \n author = {\n Cheng,\n Yu and Yang,\n Bo and Wang,\n Bo and Yan,\n Wending and Tan,\n Robby T.\n},\n title = {\n Occlusion-Aware Networks for 3D Human Pose Estimation in Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Occlusion-Shared and 
Feature-Separated Network for Occlusion Relationship Reasoning", @@ -21679,7 +22368,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2019_ICCV,\n \n author = {\n Lu,\n Rui and Xue,\n Feng and Zhou,\n Menghan and Ming,\n Anlong and Zhou,\n Yu\n},\n title = {\n Occlusion-Shared and Feature-Separated Network for Occlusion Relationship Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Occupancy Flow: 4D Reconstruction by Learning Particle Dynamics", @@ -21691,7 +22381,7 @@ "author": "Michael Niemeyer; Lars Mescheder; Michael Oechsle; Andreas Geiger", "abstract": "Deep learning based 3D reconstruction techniques have recently achieved impressive results. However, while state-of-the-art methods are able to output complex 3D geometry, it is not clear how to extend these results to time-varying topologies. Approaches treating each time step individually lack continuity and exhibit slow inference, while traditional 4D reconstruction methods often utilize a template model or discretize the 4D space at fixed resolution. In this work, we present Occupancy Flow, a novel spatio-temporal representation of time-varying 3D geometry with implicit correspondences. Towards this goal, we learn a temporally and spatially continuous vector field which assigns a motion vector to every point in space and time. In order to perform dense 4D reconstruction from images or sparse point clouds, we combine our method with a continuous 3D representation. Implicitly, our model yields correspondences over time, thus enabling fast inference while providing a sound physical description of the temporal dynamics. 
We show that our method can be used for interpolation and reconstruction tasks, and demonstrate the accuracy of the learned correspondences. We believe that Occupancy Flow is a promising new 4D representation which will be useful for a variety of spatio-temporal reconstruction tasks.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Niemeyer_Occupancy_Flow_4D_Reconstruction_by_Learning_Particle_Dynamics_ICCV_2019_paper.pdf", - "aff": "Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen; Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen; Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen + ETAS GmbH, Bosch Group, Stuttgart; Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen", + "aff": "Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen; Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen; Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen + ETAS GmbH, Bosch Group, Stuttgart; Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Niemeyer_Occupancy_Flow_4D_ICCV_2019_supplemental.pdf", @@ -21712,7 +22402,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Niemeyer_2019_ICCV,\n \n author = {\n Niemeyer,\n Michael and Mescheder,\n Lars and Oechsle,\n Michael and Geiger,\n Andreas\n},\n title = {\n Occupancy Flow: 4D Reconstruction by Learning Particle Dynamics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Omni-Scale Feature Learning for Person 
Re-Identification", @@ -21738,14 +22429,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Omni-Scale_Feature_Learning_for_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0+2", - "aff_unique_norm": "University of Surrey;Queen Mary University of London;Samsung", + "aff_unique_norm": "University of Surrey;Queen Mary University of London;Samsung AI Center", "aff_unique_dep": ";;AI Center", "aff_unique_url": "https://www.surrey.ac.uk;https://www.qmul.ac.uk;https://www.samsung.com/global/research-innovation/ai-research-centers/samsung-ai-center-cambridge/", "aff_unique_abbr": "Surrey;QMUL;SAC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";London;Cambridge", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Kaiyang and Yang,\n Yongxin and Cavallaro,\n Andrea and Xiang,\n Tao\n},\n title = {\n Omni-Scale Feature Learning for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "OmniMVS: End-to-End Learning for Omnidirectional Stereo Matching", @@ -21768,7 +22460,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Won_OmniMVS_End-to-End_Learning_for_Omnidirectional_Stereo_Matching_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Won_OmniMVS_End-to-End_Learning_for_Omnidirectional_Stereo_Matching_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Won_2019_ICCV,\n \n author = {\n Won,\n Changhee and Ryu,\n Jongbin and Lim,\n Jongwoo\n},\n title = {\n OmniMVS: End-to-End Learning for Omnidirectional Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "On Boosting Single-Frame 3D Human Pose Estimation via Monocular Videos", @@ -21780,7 +22473,7 @@ "author": "Zhi Li; Xuan Wang; Fei Wang; Peilin Jiang", "abstract": "The premise of training an accurate 3D human pose estimation network is the possession of huge amount of richly annotated training data. Nonetheless, manually obtaining rich and accurate annotations is, even not impossible, tedious and slow. In this paper, we propose to exploit monocular videos to complement the training dataset for the single-image 3D human pose estimation tasks. At the beginning, a baseline model is trained with a small set of annotations. By fixing some reliable estimations produced by the resulting model, our method automatically collects the annotations across the entire video as solving the 3D trajectory completion problem. Then, the baseline model is further trained with the collected annotations to learn the new poses. We evaluate our method on the broadly-adopted Human3.6M and MPI-INF-3DHP datasets. As illustrated in experiments, given only a small set of annotations, our method successfully makes the model to learn new poses from unlabelled monocular videos, promoting the accuracies of the baseline model by about 10%. 
By contrast with previous approaches, our method does not rely on either multi-view imagery or any explicit 2D keypoint annotations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_On_Boosting_Single-Frame_3D_Human_Pose_Estimation_via_Monocular_Videos_ICCV_2019_paper.pdf", - "aff": "School of Software Engineering, Xi\u2019an Jiaotong University, China; School of Electronic and Information Engineering, Xi\u2019an Jiaotong University, China; School of Electronic and Information Engineering, Xi\u2019an Jiaotong University, China; School of Software Engineering, Xi\u2019an Jiaotong University, China", + "aff": "School of Software Engineering, Xi’an Jiaotong University, China; School of Electronic and Information Engineering, Xi’an Jiaotong University, China; School of Electronic and Information Engineering, Xi’an Jiaotong University, China; School of Software Engineering, Xi’an Jiaotong University, China", "project": "", "github": "", "supp": "", @@ -21794,14 +22487,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_On_Boosting_Single-Frame_3D_Human_Pose_Estimation_via_Monocular_Videos_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Xi'an Jiao Tong University", + "aff_unique_norm": "Xi'an Jiaotong University", "aff_unique_dep": "School of Software Engineering", "aff_unique_url": "http://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Zhi and Wang,\n Xuan and Wang,\n Fei and Jiang,\n Peilin\n},\n title = {\n On Boosting Single-Frame 3D Human Pose Estimation via Monocular Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "On Network 
Design Spaces for Visual Recognition", @@ -21809,7 +22503,7 @@ "status": "Poster", "track": "main", "pid": "2551", - "author_site": "Ilija Radosavovic, Justin Johnson, Saining Xie, Wan-Yen Lo, Piotr Doll\u00c3\u00a1r", + "author_site": "Ilija Radosavovic, Justin Johnson, Saining Xie, Wan-Yen Lo, Piotr Dollár", "author": "Ilija Radosavovic; Justin Johnson; Saining Xie; Wan-Yen Lo; Piotr Dollar", "abstract": "Over the past several years progress in designing better neural network architectures for visual recognition has been substantial. To help sustain this rate of progress, in this work we propose to reexamine the methodology for comparing network architectures. In particular, we introduce a new comparison paradigm of distribution estimates, in which network design spaces are compared by applying statistical techniques to populations of sampled models, while controlling for confounding factors like network complexity. Compared to current methodologies of comparing point and curve estimates of model families, distribution estimates paint a more complete picture of the entire design landscape. As a case study, we examine design spaces used in neural architecture search (NAS). We find significant statistical differences between recent NAS design space variants that have been largely overlooked. Furthermore, our analysis reveals that the design spaces for standard model families like ResNeXt can be comparable to the more complex ones used in recent NAS work. 
We hope these insights into distribution analysis will enable more robust progress toward discovering better networks for visual recognition.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Radosavovic_On_Network_Design_Spaces_for_Visual_Recognition_ICCV_2019_paper.pdf", @@ -21827,14 +22521,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Radosavovic_On_Network_Design_Spaces_for_Visual_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Radosavovic_2019_ICCV,\n \n author = {\n Radosavovic,\n Ilija and Johnson,\n Justin and Xie,\n Saining and Lo,\n Wan-Yen and Dollar,\n Piotr\n},\n title = {\n On Network Design Spaces for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "On the Design of Black-Box Adversarial Examples by Leveraging Gradient-Free Optimization and Operator Splitting Method", @@ -21860,14 +22555,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_On_the_Design_of_Black-Box_Adversarial_Examples_by_Leveraging_Gradient-Free_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;1;0;2;0", - "aff_unique_norm": "Northeastern University;IBM;Syracuse University", + "aff_unique_norm": "Northeastern University;MIT-IBM Watson AI Lab;Syracuse University", "aff_unique_dep": "Department of Electrical and Computer Engineering;AI Lab;", "aff_unique_url": "https://www.northeastern.edu;https://www.ibmwatsonai.org/;https://www.syracuse.edu", 
"aff_unique_abbr": "NU;MIT-IBM AI Lab;Syracuse", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Pu and Liu,\n Sijia and Chen,\n Pin-Yu and Hoang,\n Nghia and Xu,\n Kaidi and Kailkhura,\n Bhavya and Lin,\n Xue\n},\n title = {\n On the Design of Black-Box Adversarial Examples by Leveraging Gradient-Free Optimization and Operator Splitting Method\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "On the Efficacy of Knowledge Distillation", @@ -21900,7 +22596,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cho_2019_ICCV,\n \n author = {\n Cho,\n Jang Hyun and Hariharan,\n Bharath\n},\n title = {\n On the Efficacy of Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "On the Global Optima of Kernelized Adversarial Representation Learning", @@ -21933,7 +22630,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Cyprus" + "aff_country_unique": "United States;Cyprus", + "bibtex": "@InProceedings{Sadeghi_2019_ICCV,\n \n author = {\n Sadeghi,\n Bashir and Yu,\n Runyi and Boddeti,\n Vishnu\n},\n title = {\n On the Global Optima of Kernelized Adversarial Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "On the Over-Smoothing Problem of 
CNN Based Disparity Estimation", @@ -21966,7 +22664,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Chuangrong and Chen,\n Xiaozhi and Cheng,\n Hui\n},\n title = {\n On the Over-Smoothing Problem of CNN Based Disparity Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Once a MAN: Towards Multi-Target Attack via Learning Multi-Target Adversarial Network Once", @@ -21992,14 +22691,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Han_Once_a_MAN_Towards_Multi-Target_Attack_via_Learning_Multi-Target_Adversarial_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;1;1;1;2;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Science and Technology of China;University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Science and Technology of China;The University of Hong Kong", "aff_unique_dep": "CUHK-SenseTime Joint Laboratory;Key Laboratory of Electromagnetic Space Information;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.ustc.edu.cn;https://www.hku.hk", "aff_unique_abbr": "CUHK;USTC;HKU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2019_ICCV,\n \n author = {\n Han,\n Jiangfan and Dong,\n Xiaoyi and Zhang,\n Ruimao and Chen,\n Dongdong and Zhang,\n Weiming and Yu,\n Nenghai and Luo,\n Ping and Wang,\n Xiaogang\n},\n title = {\n Once a MAN: Towards Multi-Target Attack via Learning Multi-Target Adversarial Network Once\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference 
on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "One-Shot Neural Architecture Search via Self-Evaluated Template Network", @@ -22011,7 +22711,7 @@ "author": "Xuanyi Dong; Yi Yang", "abstract": "Neural architecture search (NAS) aims to automate the search procedure of architecture instead of manual design. Even if recent NAS approaches finish the search within days, lengthy training is still required for a specific architecture candidate to get the parameters for its accurate evaluation. Recently one-shot NAS methods are proposed to largely squeeze the tedious training process by sharing parameters across candidates. In this way, the parameters for each candidate can be directly extracted from the shared parameters instead of training them from scratch. However, they have no sense of which candidate will perform better until evaluation so that the candidates to evaluate are randomly sampled and the top-1 candidate is considered the best. In this paper, we propose a Self-Evaluated Template Network (SETN) to improve the quality of the architecture candidates for evaluation so that it is more likely to cover competitive candidates. SETN consists of two components: (1) an evaluator, which learns to indicate the probability of each individual architecture being likely to have a lower validation loss. The candidates for evaluation can thus be selectively sampled according to this evaluator. (2) a template network, which shares parameters among all candidates to amortize the training cost of generated candidates. 
In experiments, the architecture found by SETN achieves the state-of-the-art performance on CIFAR and ImageNet benchmarks within comparable computation costs.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Dong_One-Shot_Neural_Architecture_Search_via_Self-Evaluated_Template_Network_ICCV_2019_paper.pdf", - "aff": "Baidu Research\u2020; ReLER, University of Technology Sydney\u2021", + "aff": "Baidu Research†; ReLER, University of Technology Sydney‡", "project": "", "github": "", "supp": "", @@ -22032,7 +22732,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Dong_2019_ICCV,\n \n author = {\n Dong,\n Xuanyi and Yang,\n Yi\n},\n title = {\n One-Shot Neural Architecture Search via Self-Evaluated Template Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Onion-Peel Networks for Deep Video Completion", @@ -22056,7 +22757,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Oh_Onion-Peel_Networks_for_Deep_Video_Completion_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Oh_Onion-Peel_Networks_for_Deep_Video_Completion_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Oh_2019_ICCV,\n \n author = {\n Oh,\n Seoung Wug and Lee,\n Sungho and Lee,\n Joon-Young and Kim,\n Seon Joo\n},\n title = {\n Onion-Peel Networks for Deep Video Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Online Hyper-Parameter Learning for Auto-Augmentation Strategy", @@ -22082,14 +22784,15 @@ "author_num": 8, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Lin_Online_Hyper-Parameter_Learning_for_Auto-Augmentation_Strategy_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0;0;1;2+3", - "aff_unique_norm": "SenseTime Group Limited;Chinese University of Hong Kong;University of Sydney;SenseTime", + "aff_unique_norm": "SenseTime Group Limited;The Chinese University of Hong Kong;University of Sydney;SenseTime", "aff_unique_dep": ";;;Computer Vision Research Group", "aff_unique_url": "https://www.sensetime.com;https://www.cuhk.edu.hk;https://www.sydney.edu.au;https://www.sensetime.com", "aff_unique_abbr": "SenseTime;CUHK;USYD;SenseTime", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;1+1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Lin_2019_ICCV,\n \n author = {\n Lin,\n Chen and Guo,\n Minghao and Li,\n Chuming and Yuan,\n Xin and Wu,\n Wei and Yan,\n Junjie and Lin,\n Dahua and Ouyang,\n Wanli\n},\n title = {\n Online Hyper-Parameter Learning for Auto-Augmentation Strategy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Online Model Distillation for Efficient Video Inference", @@ -22122,7 +22825,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mullapudi_2019_ICCV,\n \n author = {\n Mullapudi,\n Ravi Teja and Chen,\n Steven and Zhang,\n Keyi and Ramanan,\n Deva and Fatahalian,\n Kayvon\n},\n title = {\n Online Model Distillation for Efficient Video Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Online 
Unsupervised Learning of the 3D Kinematic Structure of Arbitrary Rigid Bodies", @@ -22155,7 +22859,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Nunes_2019_ICCV,\n \n author = {\n Nunes,\n Urbano Miguel and Demiris,\n Yiannis\n},\n title = {\n Online Unsupervised Learning of the 3D Kinematic Structure of Arbitrary Rigid Bodies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "OperatorNet: Recovering 3D Shapes From Difference Operators", @@ -22188,7 +22893,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Ruqi and Rakotosaona,\n Marie-Julie and Achlioptas,\n Panos and Guibas,\n Leonidas J. 
and Ovsjanikov,\n Maks\n},\n title = {\n OperatorNet: Recovering 3D Shapes From Difference Operators\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Optimizing Network Structure for 3D Human Pose Estimation", @@ -22214,14 +22920,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ci_Optimizing_Network_Structure_for_3D_Human_Pose_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+2+3;0+2+3", - "aff_unique_norm": "Peking University;Microsoft;Deepwise AI Lab;Pengcheng Laboratory", - "aff_unique_dep": "Computer Science Dept.;Microsoft Research;AI Lab;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Microsoft Research;Deepwise AI Lab;Peng Cheng Laboratory", + "aff_unique_dep": "Computer Science Dept.;;AI Lab;", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia;;http://www.pcl.ac.cn", "aff_unique_abbr": "PKU;MSR Asia;;PCL", "aff_campus_unique_index": "1;;", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Ci_2019_ICCV,\n \n author = {\n Ci,\n Hai and Wang,\n Chunyu and Ma,\n Xiaoxuan and Wang,\n Yizhou\n},\n title = {\n Optimizing Network Structure for 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Optimizing the F-Measure for Threshold-Free Salient Object Detection", @@ -22233,7 +22940,7 @@ "author": "Kai Zhao; Shanghua Gao; Wenguan Wang; Ming-Ming Cheng", "abstract": "Current CNN-based solutions to salient object detection (SOD) mainly rely on the optimization of cross-entropy loss (CELoss). Then the quality of detected saliency maps is often evaluated in terms of F-measure. 
In this paper, we investigate an interesting issue: can we consistently use the F-measure formulation in both training and evaluation for SOD? By reformulating the standard F-measure we propose the relaxed F-measure which is differentiable w.r.t the posterior and can be easily appended to the back of CNNs as the loss function. Compared to the conventional cross-entropy loss of which the gradients decrease dramatically in the saturated area, our loss function, named FLoss, holds considerable gradients even when the activation approaches the target. Consequently, the FLoss can continuously force the network to produce polarized activations. Comprehensive benchmarks on several popular datasets show that FLoss outperforms the state-of-the-art with a considerable margin. More specifically, due to the polarized predictions, our method is able to obtain high-quality saliency maps without carefully tuning the optimal threshold, showing significant advantages in real-world applications.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhao_Optimizing_the_F-Measure_for_Threshold-Free_Salient_Object_Detection_ICCV_2019_paper.pdf", - "aff": "TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; Inception Institute of Arti\ufb01cial Intelligence; TKLNDST, CS, Nankai University", + "aff": "TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; Inception Institute of Artificial Intelligence; TKLNDST, CS, Nankai University", "project": "http://kaizhao.net/fmeasure", "github": "", "supp": "", @@ -22254,7 +22961,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Kai and Gao,\n Shanghua and Wang,\n Wenguan and Cheng,\n Ming-Ming\n},\n title = {\n Optimizing the F-Measure for Threshold-Free Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Order-Aware Generative Modeling Using the 3D-Craft Dataset", @@ -22280,14 +22988,15 @@ "author_num": 14, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Order-Aware_Generative_Modeling_Using_the_3D-Craft_Dataset_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Zhuoyuan and Guo,\n Demi and Xiao,\n Tong and Xie,\n Saining and Chen,\n Xinlei and Yu,\n Haonan and Gray,\n Jonathan and Srinet,\n Kavya and Fan,\n Haoqi and Ma,\n Jerry and Qi,\n Charles R. and Tulsiani,\n Shubham and Szlam,\n Arthur and Zitnick,\n C. 
Lawrence\n},\n title = {\n Order-Aware Generative Modeling Using the 3D-Craft Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Order-Preserving Wasserstein Discriminant Analysis", @@ -22320,7 +23029,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Beijing;Evanston", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Su_2019_ICCV,\n \n author = {\n Su,\n Bing and Zhou,\n Jiahuan and Wu,\n Ying\n},\n title = {\n Order-Preserving Wasserstein Discriminant Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Orientation-Aware Semantic Segmentation on Icosahedron Spheres", @@ -22353,7 +23063,8 @@ "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Chao and Liwicki,\n Stephan and Smith,\n William and Cipolla,\n Roberto\n},\n title = {\n Orientation-Aware Semantic Segmentation on Icosahedron Spheres\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Overcoming Catastrophic Forgetting With Unlabeled Data in the Wild", @@ -22386,7 +23097,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Kibok and Lee,\n Kimin and Shin,\n Jinwoo and Lee,\n Honglak\n},\n 
title = {\n Overcoming Catastrophic Forgetting With Unlabeled Data in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "P-MVSNet: Learning Patch-Wise Matching Confidence Aggregation for Multi-View Stereo", @@ -22419,7 +23131,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;2;0", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Luo_2019_ICCV,\n \n author = {\n Luo,\n Keyang and Guan,\n Tao and Ju,\n Lili and Huang,\n Haipeng and Luo,\n Yawei\n},\n title = {\n P-MVSNet: Learning Patch-Wise Matching Confidence Aggregation for Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PAMTRI: Pose-Aware Multi-Task Learning for Vehicle Re-Identification Using Highly Randomized Synthetic Data", @@ -22443,7 +23156,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tang_PAMTRI_Pose-Aware_Multi-Task_Learning_for_Vehicle_Re-Identification_Using_Highly_Randomized_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tang_PAMTRI_Pose-Aware_Multi-Task_Learning_for_Vehicle_Re-Identification_Using_Highly_Randomized_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Tang_2019_ICCV,\n \n author = {\n Tang,\n Zheng and Naphade,\n Milind and Birchfield,\n Stan and Tremblay,\n Jonathan and Hodge,\n William and Kumar,\n Ratnesh and Wang,\n Shuo and Yang,\n Xiaodong\n},\n title = {\n PAMTRI: Pose-Aware Multi-Task Learning for Vehicle Re-Identification Using Highly Randomized Synthetic Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "PANet: Few-Shot Image Semantic Segmentation With Prototype Alignment", @@ -22476,7 +23190,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Kaixin and Liew,\n Jun Hao and Zou,\n Yingtian and Zhou,\n Daquan and Feng,\n Jiashi\n},\n title = {\n PANet: Few-Shot Image Semantic Segmentation With Prototype Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PARN: Position-Aware Relation Networks for Few-Shot Learning", @@ -22504,12 +23219,13 @@ "aff_unique_index": "0;0;0;0", "aff_unique_norm": "South China University of Technology", "aff_unique_dep": "School of Electronic and Information Engineering", - "aff_unique_url": "https://www.scut.edu.cn", + "aff_unique_url": "http://www.scut.edu.cn", "aff_unique_abbr": "SCUT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Guangzhou", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Ziyang and Li,\n Yuwei and Guo,\n Lihua and Jia,\n Kui\n},\n title = {\n PARN: Position-Aware Relation Networks for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PIE: A Large-Scale Dataset and Models for Pedestrian Intention Estimation and Trajectory Prediction", @@ -22537,12 +23253,13 @@ "aff_unique_index": "0;0;0;0", "aff_unique_norm": "York University", "aff_unique_dep": "", - "aff_unique_url": "https://www.yorku.ca", + "aff_unique_url": "https://yorku.ca", "aff_unique_abbr": "York U", 
"aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Rasouli_2019_ICCV,\n \n author = {\n Rasouli,\n Amir and Kotseruba,\n Iuliia and Kunic,\n Toni and Tsotsos,\n John K.\n},\n title = {\n PIE: A Large-Scale Dataset and Models for Pedestrian Intention Estimation and Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PIFu: Pixel-Aligned Implicit Function for High-Resolution Clothed Human Digitization", @@ -22566,7 +23283,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Saito_PIFu_Pixel-Aligned_Implicit_Function_for_High-Resolution_Clothed_Human_Digitization_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Saito_PIFu_Pixel-Aligned_Implicit_Function_for_High-Resolution_Clothed_Human_Digitization_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Saito_2019_ICCV,\n \n author = {\n Saito,\n Shunsuke and Huang,\n Zeng and Natsume,\n Ryota and Morishima,\n Shigeo and Kanazawa,\n Angjoo and Li,\n Hao\n},\n title = {\n PIFu: Pixel-Aligned Implicit Function for High-Resolution Clothed Human Digitization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PLMP - Point-Line Minimal Problems in Complete Multi-View Visibility", @@ -22574,7 +23292,7 @@ "status": "Oral", "track": "main", "pid": "5582", - "author_site": "Timothy Duff, Kathl\u00c3\u00a9n Kohn, Anton Leykin, Tomas Pajdla", + "author_site": "Timothy Duff, Kathlén Kohn, Anton Leykin, Tomas Pajdla", "author": "Timothy Duff; Kathlen Kohn; Anton Leykin; Tomas Pajdla", "abstract": "We present a complete 
classification of all minimal problems for generic arrangements of points and lines completely observed by calibrated perspective cameras. We show that there are only 30 minimal problems in total, no problems exist for more than 6 cameras, for more than 5 points, and for more than 6 lines. We present a sequence of tests for detecting minimality starting with counting degrees of freedom and ending with full symbolic and numeric verification of representative examples. For all minimal problems discovered, we present their algebraic degrees, i.e. the number of solutions, which measure their intrinsic difficulty. It shows how exactly the difficulty of problems grows with the number of views. Importantly, several new mini- mal problems have small degrees that might be practical in image matching and 3D reconstruction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Duff_PLMP_-_Point-Line_Minimal_Problems_in_Complete_Multi-View_Visibility_ICCV_2019_paper.pdf", @@ -22590,7 +23308,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Duff_PLMP_-_Point-Line_Minimal_Problems_in_Complete_Multi-View_Visibility_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Duff_PLMP_-_Point-Line_Minimal_Problems_in_Complete_Multi-View_Visibility_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Duff_2019_ICCV,\n \n author = {\n Duff,\n Timothy and Kohn,\n Kathlen and Leykin,\n Anton and Pajdla,\n Tomas\n},\n title = {\n PLMP - Point-Line Minimal Problems in Complete Multi-View Visibility\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "POD: Practical Object Detection With Scale-Sensitive Network", @@ -22623,7 +23342,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0;0", - "aff_country_unique": "China" 
+ "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2019_ICCV,\n \n author = {\n Peng,\n Junran and Sun,\n Ming and Zhang,\n Zhaoxiang and Tan,\n Tieniu and Yan,\n Junjie\n},\n title = {\n POD: Practical Object Detection With Scale-Sensitive Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PR Product: A Substitute for Inner Product in Neural Networks", @@ -22656,7 +23376,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Zhennan and Zou,\n Wenbin and Xu,\n Chen\n},\n title = {\n PR Product: A Substitute for Inner Product in Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PRECOG: PREdiction Conditioned on Goals in Visual Multi-Agent Settings", @@ -22689,7 +23410,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Rhinehart_2019_ICCV,\n \n author = {\n Rhinehart,\n Nicholas and McAllister,\n Rowan and Kitani,\n Kris and Levine,\n Sergey\n},\n title = {\n PRECOG: PREdiction Conditioned on Goals in Visual Multi-Agent Settings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PU-GAN: A Point Cloud Upsampling Adversarial Network", @@ -22715,14 +23437,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_PU-GAN_A_Point_Cloud_Upsampling_Adversarial_Network_ICCV_2019_paper.html", 
"aff_unique_index": "0;0+1;0+2;1;0+2", - "aff_unique_norm": "Chinese University of Hong Kong;Tel Aviv University;Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;Tel Aviv University;Chinese Academy of Sciences", "aff_unique_dep": ";;Guangdong Provincial Key Laboratory of Computer Vision and Virtual Reality Technology", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tau.ac.il;http://www.cas.cn", "aff_unique_abbr": "CUHK;TAU;CAS", "aff_campus_unique_index": "0;0;0+2;0+2", "aff_campus_unique": "Hong Kong SAR;;Shenzhen", "aff_country_unique_index": "0;0+1;0+0;1;0+0", - "aff_country_unique": "China;Israel" + "aff_country_unique": "China;Israel", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Ruihui and Li,\n Xianzhi and Fu,\n Chi-Wing and Cohen-Or,\n Daniel and Heng,\n Pheng-Ann\n},\n title = {\n PU-GAN: A Point Cloud Upsampling Adversarial Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Parametric Majorization for Data-Driven Energy Minimization Methods", @@ -22755,7 +23478,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Geiping_2019_ICCV,\n \n author = {\n Geiping,\n Jonas and Moeller,\n Michael\n},\n title = {\n Parametric Majorization for Data-Driven Energy Minimization Methods\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pareto Meets Huber: Efficiently Avoiding Poor Minima in Robust Estimation", @@ -22788,7 +23512,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Sweden;France" + "aff_country_unique": "Sweden;France", + "bibtex": 
"@InProceedings{Zach_2019_ICCV,\n \n author = {\n Zach,\n Christopher and Bourmaud,\n Guillaume\n},\n title = {\n Pareto Meets Huber: Efficiently Avoiding Poor Minima in Robust Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Patchwork: A Patch-Wise Attention Network for Efficient Object Detection and Segmentation in Video Streams", @@ -22796,6 +23521,7 @@ "status": "Poster", "track": "main", "pid": "5412", + "author_site": "Yuning Chai", "author": "Yuning Chai", "abstract": "Recent advances in single-frame object detection and segmentation techniques have motivated a wide range of works to extend these methods to process video streams. In this paper, we explore the idea of hard attention aimed for latency-sensitive applications. Instead of reasoning about every frame separately, our method selects and only processes a small sub-window of the frame. Our technique then makes predictions for the full frame based on the sub-windows from previous frames and the update from the current sub-window. The latency reduction by this hard attention mechanism comes at the cost of degraded accuracy. We made two contributions to address this. First, we propose a specialized memory cell that recovers lost context when processing sub-windows. Secondly, we adopt a Q-learning-based policy training strategy that enables our approach to intelligently select the sub-windows such that the staleness in the memory hurts the performance the least. Our experiments suggest that our approach reduces the latency by approximately four times without significantly sacrificing the accuracy on the ImageNet VID video object detection dataset and the DAVIS video object segmentation dataset. 
We further demonstrate that we can reinvest the saved computation into other parts of the network, and thus resulting in an accuracy increase at a comparable computational cost as the original system and beating other recently proposed state-of-the-art methods in the low latency range.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Chai_Patchwork_A_Patch-Wise_Attention_Network_for_Efficient_Object_Detection_and_ICCV_2019_paper.pdf", @@ -22814,13 +23540,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chai_Patchwork_A_Patch-Wise_Attention_Network_for_Efficient_Object_Detection_and_ICCV_2019_paper.html", "aff_unique_index": "0+1", "aff_unique_norm": "Google;Waymo", - "aff_unique_dep": "Google;", + "aff_unique_dep": ";", "aff_unique_url": "https://www.google.com;https://www.waymo.com", "aff_unique_abbr": "Google;Waymo", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chai_2019_ICCV,\n \n author = {\n Chai,\n Yuning\n},\n title = {\n Patchwork: A Patch-Wise Attention Network for Efficient Object Detection and Segmentation in Video Streams\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Perceptual Deep Depth Super-Resolution", @@ -22853,7 +23580,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0;0;1+0", - "aff_country_unique": "Russian Federation;United States" + "aff_country_unique": "Russia;United States", + "bibtex": "@InProceedings{Voynov_2019_ICCV,\n \n author = {\n Voynov,\n Oleg and Artemov,\n Alexey and Egiazarian,\n Vage and Notchenko,\n Alexander and Bobrovskikh,\n Gleb and Burnaev,\n Evgeny and Zorin,\n Denis\n},\n title = {\n Perceptual Deep Depth Super-Resolution\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Permutation-Invariant Feature Restructuring for Correlation-Aware Image Set-Based Recognition", @@ -22879,14 +23607,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Permutation-Invariant_Feature_Restructuring_for_Correlation-Aware_Image_Set-Based_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+2;0;3;3;4;0", - "aff_unique_norm": "Carnegie Mellon University;Harvard University;Tsinghua University;Chinese Academy of Sciences;Hong Kong Polytechnic University", + "aff_unique_norm": "Carnegie Mellon University;Harvard University;Tsinghua University;Chinese Academy of Sciences;The Hong Kong Polytechnic University", "aff_unique_dep": ";;Graduate School;CIOMP;", "aff_unique_url": "https://www.cmu.edu;https://www.harvard.edu;https://www.tsinghua.edu.cn;http://www.cas.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "CMU;Harvard;THU;CAS;PolyU", "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0+0;0+1;0;1;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Xiaofeng and Guo,\n Zhenhua and Li,\n Site and Kong,\n Lingsheng and Jia,\n Ping and You,\n Jane and Kumar,\n B.V.K. 
Vijaya\n},\n title = {\n Permutation-Invariant Feature Restructuring for Correlation-Aware Image Set-Based Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Person Search by Text Attribute Query As Zero-Shot Learning", @@ -22919,7 +23648,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom;" + "aff_country_unique": "United Kingdom;", + "bibtex": "@InProceedings{Dong_2019_ICCV,\n \n author = {\n Dong,\n Qi and Gong,\n Shaogang and Zhu,\n Xiatian\n},\n title = {\n Person Search by Text Attribute Query As Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Person-in-WiFi: Fine-Grained Person Perception Using WiFi", @@ -22943,7 +23673,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Person-in-WiFi_Fine-Grained_Person_Perception_Using_WiFi_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Person-in-WiFi_Fine-Grained_Person_Perception_Using_WiFi_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Fei and Zhou,\n Sanping and Panev,\n Stanislav and Han,\n Jinsong and Huang,\n Dong\n},\n title = {\n Person-in-WiFi: Fine-Grained Person Perception Using WiFi\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Personalized Fashion Design", @@ -22976,7 +23707,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Cong and Hu,\n Yang and Chen,\n Yan and Zeng,\n Bing\n},\n title = {\n Personalized Fashion Design\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Perspective-Guided Convolution Networks for Crowd Counting", @@ -23002,14 +23734,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yan_Perspective-Guided_Convolution_Networks_for_Crowd_Counting_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+2;1;0;1;1", - "aff_unique_norm": "Harbin Institute of Technology;Baidu;Pengcheng Laboratory", - "aff_unique_dep": ";Department of Computer Vision Technology (VIS);Peng Cheng Laboratory", + "aff_unique_norm": "Harbin Institute of Technology;Baidu Inc.;Peng Cheng Laboratory", + "aff_unique_dep": ";Department of Computer Vision Technology (VIS);", "aff_unique_url": "http://www.hit.edu.cn/;https://www.baidu.com;", "aff_unique_abbr": "HIT;Baidu;", - "aff_campus_unique_index": "0;0+2;0", - "aff_campus_unique": "Harbin;;Shenzhen", + "aff_campus_unique_index": "0;0;0", + "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2019_ICCV,\n \n author = {\n Yan,\n Zhaoyi and Yuan,\n Yuchen and Zuo,\n Wangmeng and Tan,\n Xiao and Wang,\n Yezhen and Wen,\n Shilei and Ding,\n Errui\n},\n title = {\n Perspective-Guided Convolution Networks for Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Photo-Realistic Facial Details Synthesis From Single Image", @@ -23042,7 +23775,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United Kingdom" + 
"aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Anpei and Chen,\n Zhang and Zhang,\n Guli and Mitchell,\n Kenny and Yu,\n Jingyi\n},\n title = {\n Photo-Realistic Facial Details Synthesis From Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Photo-Realistic Monocular Gaze Redirection Using Generative Adversarial Networks", @@ -23054,7 +23788,7 @@ "author": "Zhe He; Adrian Spurr; Xucong Zhang; Otmar Hilliges", "abstract": "Gaze redirection is the task of changing the gaze to a desired direction for a given monocular eye patch image. Many applications such as videoconferencing, films, games, and generation of training data for gaze estimation require redirecting the gaze, without distorting the appearance of the area surrounding the eye and while producing photo-realistic images. Existing methods lack the ability to generate perceptually plausible images. In this work, we present a novel method to alleviate this problem by leveraging generative adversarial training to synthesize an eye image conditioned on a target gaze direction. Our method ensures perceptual similarity and consistency of synthesized images to the real images. Furthermore, a gaze estimation loss is used to control the gaze direction accurately. To attain high-quality images, we incorporate perceptual and cycle consistency losses into our architecture. In extensive evaluations we show that the proposed method outperforms state-of-the-art approaches in terms of both image quality and redirection precision. 
Finally, we show that generated images can bring significant improvement for the gaze estimation task if used to augment real training data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/He_Photo-Realistic_Monocular_Gaze_Redirection_Using_Generative_Adversarial_Networks_ICCV_2019_paper.pdf", - "aff": "AIT Lab, ETH Z\u00fcrich + Institute of Neuroinformatics, ETH Z\u00fcrich & University of Z\u00fcrich; AIT Lab, ETH Z\u00fcrich; AIT Lab, ETH Z\u00fcrich; AIT Lab, ETH Z\u00fcrich", + "aff": "AIT Lab, ETH Zürich + Institute of Neuroinformatics, ETH Zürich & University of Zürich; AIT Lab, ETH Zürich; AIT Lab, ETH Zürich; AIT Lab, ETH Zürich", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/He_Photo-Realistic_Monocular_Gaze_ICCV_2019_supplemental.pdf", @@ -23068,14 +23802,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Photo-Realistic_Monocular_Gaze_Redirection_Using_Generative_Adversarial_Networks_ICCV_2019_paper.html", "aff_unique_index": "0+0;0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "AIT Lab", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Zhe and Spurr,\n Adrian and Zhang,\n Xucong and Hilliges,\n Otmar\n},\n title = {\n Photo-Realistic Monocular Gaze Redirection Using Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Photorealistic Style Transfer via Wavelet Transforms", @@ -23098,7 +23833,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Yoo_Photorealistic_Style_Transfer_via_Wavelet_Transforms_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yoo_Photorealistic_Style_Transfer_via_Wavelet_Transforms_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yoo_2019_ICCV,\n \n author = {\n Yoo,\n Jaejun and Uh,\n Youngjung and Chun,\n Sanghyuk and Kang,\n Byeongkyu and Ha,\n Jung-Woo\n},\n title = {\n Photorealistic Style Transfer via Wavelet Transforms\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Phrase Localization Without Paired Training Examples", @@ -23131,7 +23867,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Josiah and Specia,\n Lucia\n},\n title = {\n Phrase Localization Without Paired Training Examples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Physical Adversarial Textures That Fool Visual Object Tracking", @@ -23164,7 +23901,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Wiyatno_2019_ICCV,\n \n author = {\n Wiyatno,\n Rey Reza and Xu,\n Anqi\n},\n title = {\n Physical Adversarial Textures That Fool Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Physics-Based Rendering for Improving Robustness to Rain", @@ -23172,11 +23910,11 @@ "status": "Poster", "track": "main", "pid": 
"6981", - "author_site": "Shirsendu Sukanta Halder, Jean-Fran\u00c3\u00a7ois Lalonde, Raoul de Charette", + "author_site": "Shirsendu Sukanta Halder, Jean-François Lalonde, Raoul de Charette", "author": "Shirsendu Sukanta Halder; Jean-Francois Lalonde; Raoul de Charette", "abstract": "To improve the robustness to rain, we present a physically-based rain rendering pipeline for realistically inserting rain into clear weather images. Our rendering relies on a physical particle simulator, an estimation of the scene lighting and an accurate rain photometric modeling to augment images with arbitrary amount of realistic rain or fog. We validate our rendering with a user study, proving our rain is judged 40% more realistic that state-of-the-art. Using our generated weather augmented Kitti and Cityscapes dataset, we conduct a thorough evaluation of deep object detection and semantic segmentation algorithms and show that their performance decreases in degraded weather, on the order of 15% for object detection and 60% for semantic segmentation. Furthermore, we show refining existing networks with our augmented images improves the robustness of both object detection and semantic segmentation algorithms. We experiment on nuScenes and measure an improvement of 15% for object detection and 35% for semantic segmentation compared to original rainy performance. 
Augmented databases and code are available on the project page.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Halder_Physics-Based_Rendering_for_Improving_Robustness_to_Rain_ICCV_2019_paper.pdf", - "aff": "Inria, Paris, France; Universit\u00e9 Laval, Qu\u00e9bec, Canada; Inria, Paris, France", + "aff": "Inria, Paris, France; Université Laval, Québec, Canada; Inria, Paris, France", "project": "https://team.inria.fr/rits/computer-vision/weather-augment/", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Halder_Physics-Based_Rendering_for_ICCV_2019_supplemental.pdf", @@ -23190,14 +23928,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Halder_Physics-Based_Rendering_for_Improving_Robustness_to_Rain_ICCV_2019_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "INRIA;Universit\u00e9 Laval", + "aff_unique_norm": "Inria;Université Laval", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.ulaval.ca", "aff_unique_abbr": "Inria;ULaval", "aff_campus_unique_index": "0;1;0", - "aff_campus_unique": "Paris;Qu\u00e9bec", + "aff_campus_unique": "Paris;Québec", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;Canada" + "aff_country_unique": "France;Canada", + "bibtex": "@InProceedings{Halder_2019_ICCV,\n \n author = {\n Halder,\n Shirsendu Sukanta and Lalonde,\n Jean-Francois and Charette,\n Raoul de\n},\n title = {\n Physics-Based Rendering for Improving Robustness to Rain\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pix2Pose: Pixel-Wise Coordinate Regression of Objects for 6D Pose Estimation", @@ -23230,7 +23969,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Wien", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": 
"@InProceedings{Park_2019_ICCV,\n \n author = {\n Park,\n Kiru and Patten,\n Timothy and Vincze,\n Markus\n},\n title = {\n Pix2Pose: Pixel-Wise Coordinate Regression of Objects for 6D Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pix2Vox: Context-Aware 3D Reconstruction From Single and Multi-View Images", @@ -23256,14 +23996,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xie_Pix2Vox_Context-Aware_3D_Reconstruction_From_Single_and_Multi-View_Images_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;0+2", - "aff_unique_norm": "Harbin Institute of Technology;SenseTime;Pengcheng Laboratory", - "aff_unique_dep": ";SenseTime Research;Peng Cheng Laboratory", + "aff_unique_norm": "Harbin Institute of Technology;SenseTime;Peng Cheng Laboratory", + "aff_unique_dep": ";SenseTime Research;", "aff_unique_url": "http://www.hit.edu.cn/;https://www.sensetime.com;http://www.pcl.ac.cn", "aff_unique_abbr": "HIT;SenseTime;PCL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2019_ICCV,\n \n author = {\n Xie,\n Haozhe and Yao,\n Hongxun and Sun,\n Xiaoshuai and Zhou,\n Shangchen and Zhang,\n Shengping\n},\n title = {\n Pix2Vox: Context-Aware 3D Reconstruction From Single and Multi-View Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pixel2Mesh++: Multi-View 3D Mesh Generation via Deformation", @@ -23287,7 +24028,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wen_Pixel2Mesh_Multi-View_3D_Mesh_Generation_via_Deformation_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Wen_Pixel2Mesh_Multi-View_3D_Mesh_Generation_via_Deformation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wen_2019_ICCV,\n \n author = {\n Wen,\n Chao and Zhang,\n Yinda and Li,\n Zhuwen and Fu,\n Yanwei\n},\n title = {\n Pixel2Mesh++: Multi-View 3D Mesh Generation via Deformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Point-Based Multi-View Stereo Network", @@ -23320,7 +24062,8 @@ "aff_campus_unique_index": "1;2+1;1", "aff_campus_unique": ";San Diego;Hong Kong SAR", "aff_country_unique_index": "0+1;0+1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Rui and Han,\n Songfang and Xu,\n Jing and Su,\n Hao\n},\n title = {\n Point-Based Multi-View Stereo Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Point-to-Point Video Generation", @@ -23353,7 +24096,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Tsun-Hsuan and Cheng,\n Yen-Chi and Lin,\n Chieh Hubert and Chen,\n Hwann-Tzong and Sun,\n Min\n},\n title = {\n Point-to-Point Video Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PointAE: Point Auto-Encoder for 3D Statistical Shape and Texture Modelling", @@ -23377,7 +24121,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Dai_PointAE_Point_Auto-Encoder_for_3D_Statistical_Shape_and_Texture_Modelling_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Dai_PointAE_Point_Auto-Encoder_for_3D_Statistical_Shape_and_Texture_Modelling_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Dai_2019_ICCV,\n \n author = {\n Dai,\n Hang and Shao,\n Ling\n},\n title = {\n PointAE: Point Auto-Encoder for 3D Statistical Shape and Texture Modelling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PointCloud Saliency Maps", @@ -23410,7 +24155,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Buffalo;", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zheng_2019_ICCV,\n \n author = {\n Zheng,\n Tianhang and Chen,\n Changyou and Yuan,\n Junsong and Li,\n Bo and Ren,\n Kui\n},\n title = {\n PointCloud Saliency Maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PointFlow: 3D Point Cloud Generation With Continuous Normalizing Flows", @@ -23434,7 +24180,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_PointFlow_3D_Point_Cloud_Generation_With_Continuous_Normalizing_Flows_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_PointFlow_3D_Point_Cloud_Generation_With_Continuous_Normalizing_Flows_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Guandao and Huang,\n Xun and Hao,\n Zekun and Liu,\n Ming-Yu and Belongie,\n Serge and Hariharan,\n Bharath\n},\n title = {\n PointFlow: 3D Point Cloud Generation With 
Continuous Normalizing Flows\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Polarimetric Relative Pose Estimation", @@ -23458,7 +24205,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cui_Polarimetric_Relative_Pose_Estimation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cui_Polarimetric_Relative_Pose_Estimation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Cui_2019_ICCV,\n \n author = {\n Cui,\n Zhaopeng and Larsson,\n Viktor and Pollefeys,\n Marc\n},\n title = {\n Polarimetric Relative Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pose-Aware Multi-Level Feature Network for Human Object Interaction Detection", @@ -23491,7 +24239,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wan_2019_ICCV,\n \n author = {\n Wan,\n Bo and Zhou,\n Desen and Liu,\n Yongfei and Li,\n Rongjie and He,\n Xuming\n},\n title = {\n Pose-Aware Multi-Level Feature Network for Human Object Interaction Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pose-Guided Feature Alignment for Occluded Person Re-Identification", @@ -23524,7 +24273,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+1;0+1;1;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Miao_2019_ICCV,\n \n author = {\n Miao,\n Jiaxu and Wu,\n Yu and 
Liu,\n Ping and Ding,\n Yuhang and Yang,\n Yi\n},\n title = {\n Pose-Guided Feature Alignment for Occluded Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Predicting 3D Human Dynamics From Video", @@ -23557,7 +24307,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Jason Y. and Felsen,\n Panna and Kanazawa,\n Angjoo and Malik,\n Jitendra\n},\n title = {\n Predicting 3D Human Dynamics From Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Predicting the Future: A Jointly Learnt Model for Action Anticipation", @@ -23590,7 +24341,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Gammulle_2019_ICCV,\n \n author = {\n Gammulle,\n Harshala and Denman,\n Simon and Sridharan,\n Sridha and Fookes,\n Clinton\n},\n title = {\n Predicting the Future: A Jointly Learnt Model for Action Anticipation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Presence-Only Geographical Priors for Fine-Grained Image Classification", @@ -23614,7 +24366,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Aodha_Presence-Only_Geographical_Priors_for_Fine-Grained_Image_Classification_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Aodha_Presence-Only_Geographical_Priors_for_Fine-Grained_Image_Classification_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Aodha_2019_ICCV,\n \n author = {\n Mac Aodha,\n Oisin and Cole,\n Elijah and Perona,\n Pietro\n},\n title = {\n Presence-Only Geographical Priors for Fine-Grained Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Prior Guided Dropout for Robust Visual Localization in Dynamic Environments", @@ -23626,7 +24379,7 @@ "author": "Zhaoyang Huang; Yan Xu; Jianping Shi; Xiaowei Zhou; Hujun Bao; Guofeng Zhang", "abstract": "Camera localization from monocular images has been a long-standing problem, but its robustness in dynamic environments is still not adequately addressed. Compared with classic geometric approaches, modern CNN-based methods (e.g. PoseNet) have manifested the reliability against illumination or viewpoint variations, but they still have the following limitations. First, foreground moving objects are not explicitly handled, which results in poor performance and instability in dynamic environments. Second, the output for each image is a point estimate without uncertainty quantification. In this paper, we propose a framework which can be generally applied to existing CNN-based pose regressors to improve their robustness in dynamic environments. The key idea is a prior guided dropout module coupled with a self-attention module which can guide CNNs to ignore foreground objects during both training and inference. Additionally, the dropout module enables the pose regressor to output multiple hypotheses from which the uncertainty of pose estimates can be quantified and leveraged in the following uncertainty-aware pose-graph optimization to improve the robustness further. 
We achieve an average accuracy of 9.98m/3.63deg on RobotCar dataset, which outperforms the state-of-the-art method by 62.97%/47.08%. The source code of our implementation is available at https://github.com/zju3dv/RVL-dynamic.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Huang_Prior_Guided_Dropout_for_Robust_Visual_Localization_in_Dynamic_Environments_ICCV_2019_paper.pdf", - "aff": "State Key Lab of CAD&CG, Zhejiang University\u2020; SenseTime Research; SenseTime Research; State Key Lab of CAD&CG, Zhejiang University\u2020; State Key Lab of CAD&CG, Zhejiang University\u2020; State Key Lab of CAD&CG, Zhejiang University\u2020", + "aff": "State Key Lab of CAD&CG, Zhejiang University†; SenseTime Research; SenseTime Research; State Key Lab of CAD&CG, Zhejiang University†; State Key Lab of CAD&CG, Zhejiang University†; State Key Lab of CAD&CG, Zhejiang University†", "project": "", "github": "https://github.com/zju3dv/RVL-Dynamic", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Huang_Prior_Guided_Dropout_ICCV_2019_supplemental.pdf", @@ -23647,7 +24400,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Zhaoyang and Xu,\n Yan and Shi,\n Jianping and Zhou,\n Xiaowei and Bao,\n Hujun and Zhang,\n Guofeng\n},\n title = {\n Prior Guided Dropout for Robust Visual Localization in Dynamic Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "04ab601b3c", @@ -23667,7 +24421,8 @@ "gs_version_total": 12, "aff_domain": ";;;;;;;", "email": ";;;;;;;", - "author_num": 8 + "author_num": 8, + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Yuyin and Li,\n Zhe and Bai,\n Song and Wang,\n Chong and Chen,\n Xinlei and 
Han,\n Mei and Fishman,\n Elliot and Yuille,\n Alan L.\n},\n title = {\n Prior-Aware Neural Network for Partially-Supervised Multi-Organ Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Privacy Preserving Image Queries for Camera Localization", @@ -23675,7 +24430,7 @@ "status": "Oral", "track": "main", "pid": "3611", - "author_site": "Pablo Speciale, Johannes L. Sch\u00c3\u00b6nberger, Sudipta N. Sinha, Marc Pollefeys", + "author_site": "Pablo Speciale, Johannes L. Schönberger, Sudipta N. Sinha, Marc Pollefeys", "author": "Pablo Speciale; Johannes L. Schonberger; Sudipta N. Sinha; Marc Pollefeys", "abstract": "Augmented/mixed reality and robotic applications are increasingly relying on cloud-based localization services, which require users to upload query images to perform camera pose estimation on a server. This raises significant privacy concerns when consumers use such services in their homes or in confidential industrial settings. Even if only image features are uploaded, the privacy concerns remain as the images can be reconstructed fairly well from feature locations and descriptors. We propose to conceal the content of the query images from an adversary on the server or a man-in-the-middle intruder. The key insight is to replace the 2D image feature points in the query image with randomly oriented 2D lines passing through their original 2D positions. It will be shown that this feature representation hides the image contents, and thereby protects user privacy, yet still provides sufficient geometric constraints to enable robust and accurate 6-DOF camera pose estimation from feature correspondences. Our proposed method can handle single- and multi-image queries as well as exploit additional information about known structure, gravity, and scale. 
Numerous experiments demonstrate the high practical relevance of our approach.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Speciale_Privacy_Preserving_Image_Queries_for_Camera_Localization_ICCV_2019_paper.pdf", @@ -23691,7 +24446,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Speciale_Privacy_Preserving_Image_Queries_for_Camera_Localization_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Speciale_Privacy_Preserving_Image_Queries_for_Camera_Localization_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Speciale_2019_ICCV,\n \n author = {\n Speciale,\n Pablo and Schonberger,\n Johannes L. and Sinha,\n Sudipta N. and Pollefeys,\n Marc\n},\n title = {\n Privacy Preserving Image Queries for Camera Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pro-Cam SSfM: Projector-Camera System for Structure and Spectral Reflectance From Motion", @@ -23715,7 +24471,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Pro-Cam_SSfM_Projector-Camera_System_for_Structure_and_Spectral_Reflectance_From_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Pro-Cam_SSfM_Projector-Camera_System_for_Structure_and_Spectral_Reflectance_From_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Chunyu and Monno,\n Yusuke and Hidaka,\n Hironori and Okutomi,\n Masatoshi\n},\n title = {\n Pro-Cam SSfM: Projector-Camera System for Structure and Spectral Reflectance From Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Probabilistic Deep Ordinal Regression Based on Gaussian 
Processes", @@ -23748,7 +24505,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Yanzhu and Wang,\n Fan and Kong,\n Adams Wai Kin\n},\n title = {\n Probabilistic Deep Ordinal Regression Based on Gaussian Processes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Probabilistic Face Embeddings", @@ -23781,7 +24539,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shi_2019_ICCV,\n \n author = {\n Shi,\n Yichun and Jain,\n Anil K.\n},\n title = {\n Probabilistic Face Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Program-Guided Image Manipulators", @@ -23805,7 +24564,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mao_Program-Guided_Image_Manipulators_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Mao_Program-Guided_Image_Manipulators_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Mao_2019_ICCV,\n \n author = {\n Mao,\n Jiayuan and Zhang,\n Xiuming and Li,\n Yikai and Freeman,\n William T. and Tenenbaum,\n Joshua B. 
and Wu,\n Jiajun\n},\n title = {\n Program-Guided Image Manipulators\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Progressive Differentiable Architecture Search: Bridging the Depth Gap Between Search and Evaluation", @@ -23817,7 +24577,7 @@ "author": "Xin Chen; Lingxi Xie; Jun Wu; Qi Tian", "abstract": "Recently, differentiable search methods have made major progress in reducing the computational costs of neural architecture search. However, these approaches often report lower accuracy in evaluating the searched architecture or transferring it to another dataset. This is arguably due to the large gap between the architecture depths in search and evaluation scenarios. In this paper, we present an efficient algorithm which allows the depth of searched architectures to grow gradually during the training procedure. This brings two issues, namely, heavier computational overheads and weaker search stability, which we solve using search space approximation and regularization, respectively. With a significantly reduced search time ( 7 hours on a single GPU), our approach achieves state-of-the-art performance on both the proxy dataset (CIFAR10 or CIFAR100) and the target dataset (ImageNet). 
Code is available at https://github.com/chenxin061/pdarts", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Chen_Progressive_Differentiable_Architecture_Search_Bridging_the_Depth_Gap_Between_Search_ICCV_2019_paper.pdf", - "aff": "Tongji University; Huawei Noah\u2019s Ark Lab; Tongji University; Huawei Noah\u2019s Ark Lab", + "aff": "Tongji University; Huawei Noah’s Ark Lab; Tongji University; Huawei Noah’s Ark Lab", "project": "", "github": "https://github.com/chenxin061/pdarts", "supp": "", @@ -23832,13 +24592,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Progressive_Differentiable_Architecture_Search_Bridging_the_Depth_Gap_Between_Search_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Tongji University;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.tongji.edu.cn;https://www.huawei.com", "aff_unique_abbr": "Tongji;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Xin and Xie,\n Lingxi and Wu,\n Jun and Tian,\n Qi\n},\n title = {\n Progressive Differentiable Architecture Search: Bridging the Depth Gap Between Search and Evaluation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Progressive Fusion Video Super-Resolution Network via Exploiting Non-Local Spatio-Temporal Correlations", @@ -23864,14 +24625,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yi_Progressive_Fusion_Video_Super-Resolution_Network_via_Exploiting_Non-Local_Spatio-Temporal_Correlations_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1+2;0", - "aff_unique_norm": "Wuhan University;Harbin Institute of 
Technology;Pengcheng Laboratory", - "aff_unique_dep": ";;Peng Cheng Laboratory", + "aff_unique_norm": "Wuhan University;Harbin Institute of Technology;Peng Cheng Laboratory", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.whu.edu.cn/;http://www.hit.edu.cn/;http://www.pcl.ac.cn", "aff_unique_abbr": "WHU;HIT;PCL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yi_2019_ICCV,\n \n author = {\n Yi,\n Peng and Wang,\n Zhongyuan and Jiang,\n Kui and Jiang,\n Junjun and Ma,\n Jiayi\n},\n title = {\n Progressive Fusion Video Super-Resolution Network via Exploiting Non-Local Spatio-Temporal Correlations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Progressive Reconstruction of Visual Structure for Image Inpainting", @@ -23897,14 +24659,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Progressive_Reconstruction_of_Visual_Structure_for_Image_Inpainting_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;0;1", - "aff_unique_norm": "Wuhan University;University of Sydney", + "aff_unique_norm": "Wuhan University;The University of Sydney", "aff_unique_dep": "School of Computer Science;School of Computer Science", "aff_unique_url": "http://www.whu.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "WHU;USYD", "aff_campus_unique_index": "0;1;0;0;1", "aff_campus_unique": "Wuhan;Sydney", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Jingyuan and He,\n Fengxiang and Zhang,\n Lefei and Du,\n Bo and Tao,\n Dacheng\n},\n title = {\n Progressive Reconstruction of Visual Structure for Image Inpainting\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Progressive Sparse Local Attention for Video Object Detection", @@ -23912,7 +24675,7 @@ "status": "Poster", "track": "main", "pid": "838", - "author_site": "Chaoxu Guo, Bin Fan, Jie Gu, Qian Zhang, Shiming Xiang, V\u00c3\u00a9ronique Prinet, Chunhong Pan", + "author_site": "Chaoxu Guo, Bin Fan, Jie Gu, Qian Zhang, Shiming Xiang, Véronique Prinet, Chunhong Pan", "author": "Chaoxu Guo; Bin Fan; Jie Gu; Qian Zhang; Shiming Xiang; Veronique Prinet; Chunhong Pan", "abstract": "Transferring image-based object detectors to the domain of videos remains a challenging problem. Previous efforts mostly exploit optical flow to propagate features across frames, aiming to achieve a good trade-off between accuracy and efficiency. However, introducing an extra model to estimate optical flow can significantly increase the overall model size. The gap between optical flow and high-level features can also hinder it from establishing spatial correspondence accurately. Instead of relying on optical flow, this paper proposes a novel module called Progressive Sparse Local Attention (PSLA), which establishes the spatial correspondence between features across frames in a local region with progressively sparser stride and uses the correspondence to propagate features. Based on PSLA, Recursive Feature Updating (RFU) and Dense Feature Transforming (DenseFT) are proposed to model temporal appearance and enrich feature representation respectively in a novel video object detection framework. 
Experiments on ImageNet VID show that our method achieves the best accuracy compared to existing methods with smaller model size and acceptable runtime speed.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Guo_Progressive_Sparse_Local_Attention_for_Video_Object_Detection_ICCV_2019_paper.pdf", @@ -23928,7 +24691,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Guo_Progressive_Sparse_Local_Attention_for_Video_Object_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Guo_Progressive_Sparse_Local_Attention_for_Video_Object_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Guo_2019_ICCV,\n \n author = {\n Guo,\n Chaoxu and Fan,\n Bin and Gu,\n Jie and Zhang,\n Qian and Xiang,\n Shiming and Prinet,\n Veronique and Pan,\n Chunhong\n},\n title = {\n Progressive Sparse Local Attention for Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Progressive-X: Efficient, Anytime, Multi-Model Fitting Algorithm", @@ -23936,7 +24700,7 @@ "status": "Poster", "track": "main", "pid": "5018", - "author_site": "D\u00c3\u00a1niel Bar\u00c3\u00a1th, Ji\u00c5\u0099\u00c3\u00ad Matas", + "author_site": "Dániel Baráth, Jiří Matas", "author": "Daniel Barath; Jiri Matas", "abstract": "The Progressive-X algorithm, Prog-X in short, is proposed for geometric multi-model fitting. The method interleaves sampling and consolidation of the current data interpretation via repetitive hypothesis proposal, fast rejection, and integration of the new hypothesis into the kept instance set by labeling energy minimization. Due to exploring the data progressively, the method has several beneficial properties compared with the state-of-the-art. 
First, a clear criterion, adopted from RANSAC, controls the termination and stops the algorithm when the probability of finding a new model with a reasonable number of inliers falls below a threshold. Second, Prog-X is an any-time algorithm. Thus, whenever is interrupted, e.g. due to a time limit, the returned instances cover real and, likely, the most dominant ones. The method is superior to the state-of-the-art in terms of accuracy in both synthetic experiments and on publicly available real-world datasets for homography, two-view motion, and motion segmentation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Barath_Progressive-X_Efficient_Anytime_Multi-Model_Fitting_Algorithm_ICCV_2019_paper.pdf", @@ -23961,7 +24725,8 @@ "aff_campus_unique_index": "0+1;0", "aff_campus_unique": "Prague;Budapest", "aff_country_unique_index": "0+1;0", - "aff_country_unique": "Czech Republic;Hungary" + "aff_country_unique": "Czech Republic;Hungary", + "bibtex": "@InProceedings{Barath_2019_ICCV,\n \n author = {\n Barath,\n Daniel and Matas,\n Jiri\n},\n title = {\n Progressive-X: Efficient,\n Anytime,\n Multi-Model Fitting Algorithm\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Proximal Mean-Field for Neural Network Quantization", @@ -23994,7 +24759,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "Australia;United Kingdom" + "aff_country_unique": "Australia;United Kingdom", + "bibtex": "@InProceedings{Ajanthan_2019_ICCV,\n \n author = {\n Ajanthan,\n Thalaiyasingam and Dokania,\n Puneet K. and Hartley,\n Richard and Torr,\n Philip H. 
S.\n},\n title = {\n Proximal Mean-Field for Neural Network Quantization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "PuppetGAN: Cross-Domain Image Manipulation by Demonstration", @@ -24027,7 +24793,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Usman_2019_ICCV,\n \n author = {\n Usman,\n Ben and Dufour,\n Nick and Saenko,\n Kate and Bregler,\n Chris\n},\n title = {\n PuppetGAN: Cross-Domain Image Manipulation by Demonstration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pushing the Frontiers of Unconstrained Crowd Counting: New Dataset and Benchmark Method", @@ -24060,7 +24827,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Baltimore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sindagi_2019_ICCV,\n \n author = {\n Sindagi,\n Vishwanath A. 
and Yasarla,\n Rajeev and Patel,\n Vishal M.\n},\n title = {\n Pushing the Frontiers of Unconstrained Crowd Counting: New Dataset and Benchmark Method\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Pyramid Graph Networks With Connection Attentions for Region-Based One-Shot Semantic Segmentation", @@ -24093,7 +24861,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;1;2;2", - "aff_country_unique": "Singapore;United States;China" + "aff_country_unique": "Singapore;United States;China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Chi and Lin,\n Guosheng and Liu,\n Fayao and Guo,\n Jiushuang and Wu,\n Qingyao and Yao,\n Rui\n},\n title = {\n Pyramid Graph Networks With Connection Attentions for Region-Based One-Shot Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "QUARCH: A New Quasi-Affine Reconstruction Stratum From Vague Relative Camera Orientation Knowledge", @@ -24101,11 +24870,11 @@ "status": "Poster", "track": "main", "pid": "4811", - "author_site": "Devesh Adlakha, Adlane Habed, Fabio Morbidi, C\u00c3\u00a9dric Demonceaux, Michel de Mathelin", + "author_site": "Devesh Adlakha, Adlane Habed, Fabio Morbidi, Cédric Demonceaux, Michel de Mathelin", "author": "Devesh Adlakha; Adlane Habed; Fabio Morbidi; Cedric Demonceaux; Michel de Mathelin", "abstract": "We present a new quasi-affine reconstruction of a scene and its application to camera self-calibration. We refer to this reconstruction as QUARCH (QUasi-Affine Reconstruction with respect to Camera centers and the Hodographs of horopters). 
A QUARCH can be obtained by solving a semidefinite programming problem when, (i) the images have been captured by a moving camera with constant intrinsic parameters, and (ii) a vague knowledge of the relative orientation (under or over 120 degrees) between camera pairs is available. The resulting reconstruction comes close enough to an affine one allowing thus an easy upgrade of the QUARCH to its affine and metric counterparts. We also present a constrained Levenberg-Marquardt method for nonlinear optimization subject to Linear Matrix Inequality (LMI) constraints so as to ensure that the QUARCH LMIs are satisfied during optimization. Experiments with synthetic and real data show the benefits of QUARCH in reliably obtaining a metric reconstruction.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Adlakha_QUARCH_A_New_Quasi-Affine_Reconstruction_Stratum_From_Vague_Relative_Camera_ICCV_2019_paper.pdf", - "aff": "ICube laboratory, CNRS, University of Strasbourg; ICube laboratory, CNRS, University of Strasbourg; MIS laboratory, University of Picardie Jules Verne; ImViA laboratory, VIBOT ERL CNRS, University of Burgundy - Franche-Comt \u00b4e; ICube laboratory, CNRS, University of Strasbourg", + "aff": "ICube laboratory, CNRS, University of Strasbourg; ICube laboratory, CNRS, University of Strasbourg; MIS laboratory, University of Picardie Jules Verne; ImViA laboratory, VIBOT ERL CNRS, University of Burgundy - Franche-Comté; ICube laboratory, CNRS, University of Strasbourg", "project": "", "github": "", "supp": "", @@ -24119,14 +24888,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Adlakha_QUARCH_A_New_Quasi-Affine_Reconstruction_Stratum_From_Vague_Relative_Camera_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "University of Strasbourg;University of Picardie Jules Verne;University of Burgundy - Franche-Comt\u00e9", + "aff_unique_norm": "University of Strasbourg;University of Picardie Jules 
Verne;University of Burgundy - Franche-Comté", "aff_unique_dep": "ICube laboratory;MIS laboratory;ImViA laboratory, VIBOT ERL CNRS", "aff_unique_url": "https://www.unistra.fr;https://www.univ-picardie.fr;https://www.ubfc.fr", "aff_unique_abbr": "Unistra;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Adlakha_2019_ICCV,\n \n author = {\n Adlakha,\n Devesh and Habed,\n Adlane and Morbidi,\n Fabio and Demonceaux,\n Cedric and Mathelin,\n Michel de\n},\n title = {\n QUARCH: A New Quasi-Affine Reconstruction Stratum From Vague Relative Camera Orientation Knowledge\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Quasi-Globally Optimal and Efficient Vanishing Point Estimation in Manhattan World", @@ -24152,14 +24922,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Quasi-Globally_Optimal_and_Efficient_Vanishing_Point_Estimation_in_Manhattan_World_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;TuSimple;Korea Advanced Institute of Science and Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;TuSimple;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tusimple.com;https://www.kaist.ac.kr", "aff_unique_abbr": "CUHK;;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "China;South Korea" + "aff_country_unique": "China;South Korea", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Haoang and Zhao,\n Ji and Bazin,\n Jean-Charles and Chen,\n Wen and Liu,\n Zhe and Liu,\n Yun-Hui\n},\n title = {\n Quasi-Globally Optimal 
and Efficient Vanishing Point Estimation in Manhattan World\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "RANet: Ranking Attention Network for Fast Video Object Segmentation", @@ -24185,14 +24956,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_RANet_Ranking_Attention_Network_for_Fast_Video_Object_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0+1;2+3;2;2;2", - "aff_unique_norm": "University of Sydney;Xi'an Jiao Tong University;Inception Institute of Artificial Intelligence;Nankai University", + "aff_unique_norm": "The University of Sydney;Xi'an Jiaotong University;Inception Institute of Artificial Intelligence;Nankai University", "aff_unique_dep": ";Institute of Artificial Intelligence and Robotics;;College of Computer Science", "aff_unique_url": "https://www.sydney.edu.au;http://www.xjtu.edu.cn;;http://www.nankai.edu.cn", "aff_unique_abbr": "USYD;XJTU;IIAI;Nankai", "aff_campus_unique_index": "0+1;2+3;2;2;2", "aff_campus_unique": "Sydney;Xi'an;Abu Dhabi;Tianjin", "aff_country_unique_index": "0+1;2+1;2;2;2", - "aff_country_unique": "Australia;China;United Arab Emirates" + "aff_country_unique": "Australia;China;United Arab Emirates", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Ziqin and Xu,\n Jun and Liu,\n Li and Zhu,\n Fan and Shao,\n Ling\n},\n title = {\n RANet: Ranking Attention Network for Fast Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "RGB-Infrared Cross-Modality Person Re-Identification via Joint Pixel and Feature Alignment", @@ -24216,7 +24988,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_RGB-Infrared_Cross-Modality_Person_Re-Identification_via_Joint_Pixel_and_Feature_Alignment_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_RGB-Infrared_Cross-Modality_Person_Re-Identification_via_Joint_Pixel_and_Feature_Alignment_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Guan'an and Zhang,\n Tianzhu and Cheng,\n Jian and Liu,\n Si and Yang,\n Yang and Hou,\n Zengguang\n},\n title = {\n RGB-Infrared Cross-Modality Person Re-Identification via Joint Pixel and Feature Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "RIO: 3D Object Instance Re-Localization in Changing Indoor Environments", @@ -24224,7 +24997,7 @@ "status": "Oral", "track": "main", "pid": "1206", - "author_site": "Johanna Wald, Armen Avetisyan, Nassir Navab, Federico Tombari, Matthias Nie\u00c3\u009fner", + "author_site": "Johanna Wald, Armen Avetisyan, Nassir Navab, Federico Tombari, Matthias Nießner", "author": "Johanna Wald; Armen Avetisyan; Nassir Navab; Federico Tombari; Matthias Niessner", "abstract": "In this work, we introduce the task of 3D object instance re-localization (RIO): given one or multiple objects in an RGB-D scan, we want to estimate their corresponding 6DoF poses in another 3D scan of the same environment taken at a later point in time. We consider RIO a particularly important task in 3D vision since it enables a wide range of practical applications, including AI-assistants or robots that are asked to find a specific object in a 3D scene. To address this problem, we first introduce 3RScan, a novel dataset and benchmark, which features 1482 RGB-D scans of 478 environments across multiple time steps. 
Each scene includes several objects whose positions change over time, together with ground truth annotations of object instances and their respective 6DoF mappings among re-scans. Automatically finding 6DoF object poses leads to a particular challenging feature matching task due to varying partial observations and changes in the surrounding context. To this end, we introduce a new data-driven approach that efficiently finds matching features using a fully-convolutional 3D correspondence network operating on multiple spatial scales. Combined with a 6DoF pose optimization, our method outperforms state-of-the-art baselines on our newly-established benchmark, achieving an accuracy of 30.58%.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wald_RIO_3D_Object_Instance_Re-Localization_in_Changing_Indoor_Environments_ICCV_2019_paper.pdf", @@ -24243,13 +25016,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wald_RIO_3D_Object_Instance_Re-Localization_in_Changing_Indoor_Environments_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0+1;0+1", "aff_unique_norm": "Technical University of Munich;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0+1;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Wald_2019_ICCV,\n \n author = {\n Wald,\n Johanna and Avetisyan,\n Armen and Navab,\n Nassir and Tombari,\n Federico and Niessner,\n Matthias\n},\n title = {\n RIO: 3D Object Instance Re-Localization in Changing Indoor Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Racial Faces in the Wild: Reducing Racial Bias by 
Information Maximization Adaptation Network", @@ -24278,11 +25052,12 @@ "aff_unique_norm": "Beijing University of Posts and Telecommunications;Canon Information Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.bupt.edu.cn/;https://www.canon.com.cn", - "aff_unique_abbr": "BUPT;", + "aff_unique_abbr": "BUPT;CIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Mei and Deng,\n Weihong and Hu,\n Jiani and Tao,\n Xunqiang and Huang,\n Yaohai\n},\n title = {\n Racial Faces in the Wild: Reducing Racial Bias by Information Maximization Adaptation Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "RainFlow: Optical Flow Under Rain Streaks and Rain Veiling Effect", @@ -24290,7 +25065,7 @@ "status": "Poster", "track": "main", "pid": "2908", - "author_site": "Ruoteng Li, Robby T. Tan, Loong-Fah Cheong, Angelica I. Aviles-Rivero, Qingnan Fan, Carola-Bibiane Sch\u00c3\u00b6nlieb", + "author_site": "Ruoteng Li, Robby T. Tan, Loong-Fah Cheong, Angelica I. Aviles-Rivero, Qingnan Fan, Carola-Bibiane Schönlieb", "author": "Ruoteng Li; Robby T. Tan; Loong-Fah Cheong; Angelica I. Aviles-Rivero; Qingnan Fan; Carola-Bibiane Schonlieb", "abstract": "Optical flow in heavy rainy scenes is challenging due to the presence of both rain steaks and rain veiling effect, which break the existing optical flow constraints. Concerning this, we propose a deep-learning based optical flow method designed to handle heavy rain. We introduce a feature multiplier in our network that transforms the features of an image affected by the rain veiling effect into features that are less affected by it, which we call veiling-invariant features. 
We establish a new mapping operation in the feature space to produce streak-invariant features. The operation is based on a feature pyramid structure of the input images, and the basic idea is to preserve the chromatic features of the background scenes while canceling the rain-streak patterns. Both the veiling-invariant and streak-invariant features are computed and optimized automatically based on the the accuracy of our optical flow estimation. Our network is end-to-end, and handles both rain streaks and the veiling effect in an integrated framework. Extensive experiments show the effectiveness of our method, which outperforms the state of the art method and other baseline methods. We also show that our network can robustly maintain good performance on clean (no rain) images even though it is trained under rain image data.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_RainFlow_Optical_Flow_Under_Rain_Streaks_and_Rain_Veiling_Effect_ICCV_2019_paper.pdf", @@ -24315,7 +25090,8 @@ "aff_campus_unique_index": ";1;2;1", "aff_campus_unique": ";Cambridge;Stanford", "aff_country_unique_index": "0;0+0;0;1;2;1", - "aff_country_unique": "Singapore;United Kingdom;United States" + "aff_country_unique": "Singapore;United Kingdom;United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Ruoteng and Tan,\n Robby T. and Cheong,\n Loong-Fah and Aviles-Rivero,\n Angelica I. 
and Fan,\n Qingnan and Schonlieb,\n Carola-Bibiane\n},\n title = {\n RainFlow: Optical Flow Under Rain Streaks and Rain Veiling Effect\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "RankSRGAN: Generative Adversarial Networks With Ranker for Image Super-Resolution", @@ -24341,14 +25117,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_RankSRGAN_Generative_Adversarial_Networks_With_Ranker_for_Image_Super-Resolution_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;0", - "aff_unique_norm": "Shenzhen Institute of Advanced Technology;University of Chinese Academy of Sciences", + "aff_unique_norm": "Shenzhen Institutes of Advanced Technology;University of Chinese Academy of Sciences", "aff_unique_dep": "Key Lab of Computer Vision and Pattern Recognition;", "aff_unique_url": "http://www.siat.ac.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "SIAT;UCAS", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Wenlong and Liu,\n Yihao and Dong,\n Chao and Qiao,\n Yu\n},\n title = {\n RankSRGAN: Generative Adversarial Networks With Ranker for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Re-ID Driven Localization Refinement for Person Search", @@ -24381,7 +25158,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2019_ICCV,\n \n author = {\n Han,\n Chuchu and Ye,\n Jiacheng and Zhong,\n Yunshan and Tan,\n Xin and Zhang,\n Chi and 
Gao,\n Changxin and Sang,\n Nong\n},\n title = {\n Re-ID Driven Localization Refinement for Person Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Real Image Denoising With Feature Attention", @@ -24414,7 +25192,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Anwar_2019_ICCV,\n \n author = {\n Anwar,\n Saeed and Barnes,\n Nick\n},\n title = {\n Real Image Denoising With Feature Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Reasoning About Human-Object Interactions Through Dual Attention Networks", @@ -24440,14 +25219,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xiao_Reasoning_About_Human-Object_Interactions_Through_Dual_Attention_Networks_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2;2;3", - "aff_unique_norm": "University of California, Berkeley;IBM;Massachusetts Institute of Technology;Chinese University of Hong Kong", + "aff_unique_norm": "University of California, Berkeley;MIT-IBM Watson AI Lab;Massachusetts Institute of Technology;The Chinese University of Hong Kong", "aff_unique_dep": ";AI Lab;;", "aff_unique_url": "https://www.berkeley.edu;https://www.ibmwatsonai.org/;https://web.mit.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "UC Berkeley;MIT-IBM AI Lab;MIT;CUHK", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Berkeley;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Xiao_2019_ICCV,\n \n author = {\n Xiao,\n Tete and Fan,\n Quanfu and Gutfreund,\n Dan and 
Monfort,\n Mathew and Oliva,\n Aude and Zhou,\n Bolei\n},\n title = {\n Reasoning About Human-Object Interactions Through Dual Attention Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Reciprocal Multi-Layer Subspace Learning for Multi-View Clustering", @@ -24459,7 +25239,7 @@ "author": "Ruihuang Li; Changqing Zhang; Huazhu Fu; Xi Peng; Tianyi Zhou; Qinghua Hu", "abstract": "Multi-view clustering is a long-standing important research topic, however, remains challenging when handling high-dimensional data and simultaneously exploring the consistency and complementarity of different views. In this work, we present a novel Reciprocal Multi-layer Subspace Learning (RMSL) algorithm for multi-view clustering, which is composed of two main components: Hierarchical Self-Representative Layers (HSRL), and Backward Encoding Networks (BEN). Specifically, HSRL constructs reciprocal multi-layer subspace representations linked with a latent representation to hierarchically recover the underlying low-dimensional subspaces in which the high-dimensional data lie; BEN explores complex relationships among different views and implicitly enforces the subspaces of all views to be consistent with each other and more separable. The latent representation flexibly encodes complementary information from multiple views and depicts data more comprehensively. Our model can be efficiently optimized by an alternating optimization scheme. 
Extensive experiments on benchmark datasets show the superiority of RMSL over other state-of-the-art clustering methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_Reciprocal_Multi-Layer_Subspace_Learning_for_Multi-View_Clustering_ICCV_2019_paper.pdf", - "aff": "Tianjin University; Tianjin University; Inception Institute of Arti\ufb01cial Intelligence; Sichuan University; Institute of High Performance Computing, A*STAR; Tianjin University", + "aff": "Tianjin University; Tianjin University; Inception Institute of Artificial Intelligence; Sichuan University; Institute of High Performance Computing, A*STAR; Tianjin University", "project": "", "github": "", "supp": "", @@ -24480,7 +25260,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Ruihuang and Zhang,\n Changqing and Fu,\n Huazhu and Peng,\n Xi and Zhou,\n Tianyi and Hu,\n Qinghua\n},\n title = {\n Reciprocal Multi-Layer Subspace Learning for Multi-View Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Recognizing Part Attributes With Insufficient Data", @@ -24504,7 +25285,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_Recognizing_Part_Attributes_With_Insufficient_Data_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_Recognizing_Part_Attributes_With_Insufficient_Data_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Xiangyun and Yang,\n Yi and Zhou,\n Feng and Tan,\n Xiao and Yuan,\n Yuchen and Bao,\n Yingze and Wu,\n Ying\n},\n title = {\n Recognizing Part Attributes With Insufficient Data\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Recover and Identify: A Generative Dual Model for Cross-Resolution Person Re-Identification", @@ -24537,7 +25319,8 @@ "aff_campus_unique_index": "0+0;0+0+0;0;0+0+0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0+0;0+0+0;0;1;0+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Yu-Jhe and Chen,\n Yun-Chun and Lin,\n Yen-Yu and Du,\n Xiaofei and Wang,\n Yu-Chiang Frank\n},\n title = {\n Recover and Identify: A Generative Dual Model for Cross-Resolution Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Recurrent U-Net for Resource-Constrained Segmentation", @@ -24563,14 +25346,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Recurrent_U-Net_for_Resource-Constrained_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne", "aff_unique_dep": "CVLab", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Wei and Yu,\n Kaicheng and Hugonot,\n Joachim and Fua,\n Pascal and Salzmann,\n Mathieu\n},\n title = {\n Recurrent U-Net for Resource-Constrained Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { 
"title": "Recursive Cascaded Networks for Unsupervised Medical Image Registration", @@ -24596,14 +25380,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhao_Recursive_Cascaded_Networks_for_Unsupervised_Medical_Image_Registration_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;1;2+1", - "aff_unique_norm": "Tsinghua University;Microsoft;Beihang University", + "aff_unique_norm": "Tsinghua University;Microsoft Corporation;Beihang University", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;Microsoft Research;School of Biological Science and Medical Engineering", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research;http://www.buaa.edu.cn/", "aff_unique_abbr": "THU;MSR;Beihang", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Shengyu and Dong,\n Yue and Chang,\n Eric I-Chao and Xu,\n Yan\n},\n title = {\n Recursive Cascaded Networks for Unsupervised Medical Image Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Recursive Visual Sound Separation Using Minus-Plus Net", @@ -24629,14 +25414,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_Recursive_Visual_Sound_Separation_Using_Minus-Plus_Net_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "CUHK-SenseTime Joint Lab", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Xudong and Dai,\n Bo and Lin,\n Dahua\n},\n title = {\n Recursive Visual Sound Separation Using Minus-Plus Net\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Reflective Decoding Network for Image Captioning", @@ -24662,14 +25448,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ke_Reflective_Decoding_Network_for_Image_Captioning_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;1;1", - "aff_unique_norm": "Hong Kong University of Science and Technology;Tencent", - "aff_unique_dep": ";Tencent Holdings Limited", + "aff_unique_norm": "Hong Kong University of Science and Technology;Tencent Holdings Limited", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.tencent.com", "aff_unique_abbr": "HKUST;Tencent", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ke_2019_ICCV,\n \n author = {\n Ke,\n Lei and Pei,\n Wenjie and Li,\n Ruiyu and Shen,\n Xiaoyong and Tai,\n Yu-Wing\n},\n title = {\n Reflective Decoding Network for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "RelGAN: Multi-Domain Image-to-Image Translation via Relative Attributes", @@ -24702,7 +25489,8 @@ "aff_campus_unique_index": "0;0;0;0+1;0", "aff_campus_unique": "Taiwan;Stanford", "aff_country_unique_index": "0;0;0;0+1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Po-Wei and Lin,\n Yu-Jing and 
Chang,\n Che-Han and Chang,\n Edward Y. and Liao,\n Shih-Wei\n},\n title = {\n RelGAN: Multi-Domain Image-to-Image Translation via Relative Attributes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Relation Distillation Networks for Video Object Detection", @@ -24728,14 +25516,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Deng_Relation_Distillation_Networks_for_Video_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;0;0;1", - "aff_unique_norm": "University of Science and Technology of China;JD", - "aff_unique_dep": "CAS Key Laboratory of GIPAS;JD AI Research", + "aff_unique_norm": "University of Science and Technology of China;JD AI Research", + "aff_unique_dep": "CAS Key Laboratory of GIPAS;", "aff_unique_url": "http://www.ustc.edu.cn;", "aff_unique_abbr": "USTC;", "aff_campus_unique_index": "0;1;1;0;0;1", "aff_campus_unique": "Hefei;Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Deng_2019_ICCV,\n \n author = {\n Deng,\n Jiajun and Pan,\n Yingwei and Yao,\n Ting and Zhou,\n Wengang and Li,\n Houqiang and Mei,\n Tao\n},\n title = {\n Relation Distillation Networks for Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Relation Parsing Neural Network for Human-Object Interaction Detection", @@ -24768,7 +25557,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Penghao and Chi,\n Mingmin\n},\n title = {\n Relation Parsing Neural Network for Human-Object Interaction 
Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Relation-Aware Graph Attention Network for Visual Question Answering", @@ -24801,7 +25591,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Linjie and Gan,\n Zhe and Cheng,\n Yu and Liu,\n Jingjing\n},\n title = {\n Relation-Aware Graph Attention Network for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Relational Attention Network for Crowd Counting", @@ -24813,7 +25604,7 @@ "author": "Anran Zhang; Jiayi Shen; Zehao Xiao; Fan Zhu; Xiantong Zhen; Xianbin Cao; Ling Shao", "abstract": "Crowd counting is receiving rapidly growing research interests due to its potential application value in numerous real-world scenarios. However, due to various challenges such as occlusion, insufficient resolution and dynamic backgrounds, crowd counting remains an unsolved problem in computer vision. Density estimation is a popular strategy for crowd counting, where conventional density estimation methods perform pixel-wise regression without explicitly accounting the interdependence of pixels. As a result, independent pixel-wise predictions can be noisy and inconsistent. In order to address such an issue, we propose a Relational Attention Network (RANet) with a self-attention mechanism for capturing interdependence of pixels. The RANet enhances the self-attention mechanism by accounting both short-range and long-range interdependence of pixels, where we respectively denote these implementations as local self-attention (LSA) and global self-attention (GSA). 
We further introduce a relation module to fuse LSA and GSA to achieve more informative aggregated feature representations. We conduct extensive experiments on four public datasets, including ShanghaiTech A, ShanghaiTech B, UCF-CC-50 and UCF-QNRF. Experimental results on all datasets suggest RANet consistently reduces estimation errors and surpasses the state-of-the-art approaches by large margins.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhang_Relational_Attention_Network_for_Crowd_Counting_ICCV_2019_paper.pdf", - "aff": "School of Electronic and Information Engineering, Beihang University, Beijing, China; School of Electronic and Information Engineering, Beihang University, Beijing, China; School of Electronic and Information Engineering, Beihang University, Beijing, China; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE; School of Electronic and Information Engineering, Beihang University, Beijing, China + Key Laboratory of Advanced Technology of Near Space Information System (Beihang University), Ministry of Industry and Information Technology of China, Beijing, China + Beijing Advanced Innovation Center for Big Data-Based Precision Medicine, Beijing, China; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE", + "aff": "School of Electronic and Information Engineering, Beihang University, Beijing, China; School of Electronic and Information Engineering, Beihang University, Beijing, China; School of Electronic and Information Engineering, Beihang University, Beijing, China; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE; School of Electronic and Information Engineering, Beihang University, Beijing, China + Key Laboratory of Advanced Technology of Near Space Information System (Beihang University), Ministry of Industry and Information Technology of China, 
Beijing, China + Beijing Advanced Innovation Center for Big Data-Based Precision Medicine, Beijing, China; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE", "project": "", "github": "", "supp": "", @@ -24834,7 +25625,8 @@ "aff_campus_unique_index": "0;0;0;1;1;0+0;1", "aff_campus_unique": "Beijing;Abu Dhabi;", "aff_country_unique_index": "0;0;0;1;1;0+0+0;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Anran and Shen,\n Jiayi and Xiao,\n Zehao and Zhu,\n Fan and Zhen,\n Xiantong and Cao,\n Xianbin and Shao,\n Ling\n},\n title = {\n Relational Attention Network for Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Remote Heart Rate Measurement From Highly Compressed Facial Videos: An End-to-End Deep Learning Solution With Video Enhancement", @@ -24846,7 +25638,7 @@ "author": "Zitong Yu; Wei Peng; Xiaobai Li; Xiaopeng Hong; Guoying Zhao", "abstract": "Remote photoplethysmography (rPPG), which aims at measuring heart activities without any contact, has great potential in many applications (e.g., remote healthcare). Existing rPPG approaches rely on analyzing very fine details of facial videos, which are prone to be affected by video compression. Here we propose a two-stage, end-to-end method using hidden rPPG information enhancement and attention networks, which is the first attempt to counter video compression loss and recover rPPG signals from highly compressed videos. The method includes two parts: 1) a Spatio-Temporal Video Enhancement Network (STVEN) for video enhancement, and 2) an rPPG network (rPPGNet) for rPPG signal recovery. 
The rPPGNet can work on its own for robust rPPG measurement, and the STVEN network can be added and jointly trained to further boost the performance especially on highly compressed videos. Comprehensive experiments are performed on two benchmark datasets to show that, 1) the proposed method not only achieves superior performance on compressed videos with high-quality videos pair, 2) it also generalizes well on novel data with only compressed videos available, which implies the promising potential for real-world applications.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Yu_Remote_Heart_Rate_Measurement_From_Highly_Compressed_Facial_Videos_An_ICCV_2019_paper.pdf", - "aff": "Center for Machine Vision and Signal Analysis, University of Oulu, Finland; Center for Machine Vision and Signal Analysis, University of Oulu, Finland; Center for Machine Vision and Signal Analysis, University of Oulu, Finland; Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University, PRC+Peng Cheng Laborotory, China+Center for Machine Vision and Signal Analysis, University of Oulu, Finland; School of Information and Technology, Northwest University, PRC+Center for Machine Vision and Signal Analysis, University of Oulu, Finland", + "aff": "Center for Machine Vision and Signal Analysis, University of Oulu, Finland; Center for Machine Vision and Signal Analysis, University of Oulu, Finland; Center for Machine Vision and Signal Analysis, University of Oulu, Finland; Faculty of Electronic and Information Engineering, Xi’an Jiaotong University, PRC+Peng Cheng Laborotory, China+Center for Machine Vision and Signal Analysis, University of Oulu, Finland; School of Information and Technology, Northwest University, PRC+Center for Machine Vision and Signal Analysis, University of Oulu, Finland", "project": "", "github": "", "supp": "", @@ -24860,14 +25652,15 @@ "author_num": 5, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Remote_Heart_Rate_Measurement_From_Highly_Compressed_Facial_Videos_An_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1+2+0;3+0", - "aff_unique_norm": "University of Oulu;Xi'an Jiao Tong University;Pengcheng Laboratory;Northwest University", - "aff_unique_dep": "Center for Machine Vision and Signal Analysis;Faculty of Electronic and Information Engineering;Peng Cheng Laboratory;School of Information and Technology", + "aff_unique_norm": "University of Oulu;Xi'an Jiaotong University;Peng Cheng Laboratory;Northwest University", + "aff_unique_dep": "Center for Machine Vision and Signal Analysis;Faculty of Electronic and Information Engineering;;School of Information and Technology", "aff_unique_url": "https://www.oulu.fi;http://www.xjtu.edu.cn;;", "aff_unique_abbr": ";XJTU;;", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;0;0;1+1+0;1+0", - "aff_country_unique": "Finland;China" + "aff_country_unique": "Finland;China", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Zitong and Peng,\n Wei and Li,\n Xiaobai and Hong,\n Xiaopeng and Zhao,\n Guoying\n},\n title = {\n Remote Heart Rate Measurement From Highly Compressed Facial Videos: An End-to-End Deep Learning Solution With Video Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "RepPoints: Point Set Representation for Object Detection", @@ -24893,14 +25686,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_RepPoints_Point_Set_Representation_for_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;1+2;2;0;2", - "aff_unique_norm": "Peking University;Tsinghua University;Microsoft", + "aff_unique_norm": "Peking University;Tsinghua University;Microsoft Research", "aff_unique_dep": ";;Research", "aff_unique_url": 
"http://www.pku.edu.cn;https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Peking U;THU;MSR Asia", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Ze and Liu,\n Shaohui and Hu,\n Han and Wang,\n Liwei and Lin,\n Stephen\n},\n title = {\n RepPoints: Point Set Representation for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Rescan: Inductive Instance Segmentation for Indoor RGBD Scans", @@ -24924,7 +25718,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Halber_Rescan_Inductive_Instance_Segmentation_for_Indoor_RGBD_Scans_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Halber_Rescan_Inductive_Instance_Segmentation_for_Indoor_RGBD_Scans_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Halber_2019_ICCV,\n \n author = {\n Halber,\n Maciej and Shi,\n Yifei and Xu,\n Kai and Funkhouser,\n Thomas\n},\n title = {\n Rescan: Inductive Instance Segmentation for Indoor RGBD Scans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Resolving 3D Human Pose Ambiguities With 3D Scene Constraints", @@ -24957,7 +25752,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Hassan_2019_ICCV,\n \n author = {\n Hassan,\n Mohamed and Choutas,\n Vasileios and Tzionas,\n Dimitrios and Black,\n Michael J.\n},\n title = {\n Resolving 3D Human Pose 
Ambiguities With 3D Scene Constraints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Resource Constrained Neural Network Architecture Search: Will a Submodularity Assumption Help?", @@ -24990,7 +25786,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiong_2019_ICCV,\n \n author = {\n Xiong,\n Yunyang and Mehta,\n Ronak and Singh,\n Vikas\n},\n title = {\n Resource Constrained Neural Network Architecture Search: Will a Submodularity Assumption Help?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Restoration of Non-Rigidly Distorted Underwater Images Using a Combination of Compressive Sensing and Local Polynomial Image Representations", @@ -25023,7 +25820,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mumbai", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{James_2019_ICCV,\n \n author = {\n James,\n Jerin Geo and Agrawal,\n Pranay and Rajwade,\n Ajit\n},\n title = {\n Restoration of Non-Rigidly Distorted Underwater Images Using a Combination of Compressive Sensing and Local Polynomial Image Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Rethinking ImageNet Pre-Training", @@ -25031,7 +25829,7 @@ "status": "Poster", "track": "main", "pid": "2571", - "author_site": "Kaiming He, Ross Girshick, Piotr Doll\u00c3\u00a1r", + "author_site": "Kaiming He, Ross Girshick, Piotr Dollár", "author": "Kaiming He; Ross Girshick; Piotr 
Dollar", "abstract": "We report competitive results on object detection and instance segmentation on the COCO dataset using standard models trained from random initialization. The results are no worse than their ImageNet pre-training counterparts even when using the hyper-parameters of the baseline system (Mask R-CNN) that were optimized for fine-tuning pre-trained models, with the sole exception of increasing the number of training iterations so the randomly initialized models may converge. Training from random initialization is surprisingly robust; our results hold even when: (i) using only 10% of the training data, (ii) for deeper and wider models, and (iii) for multiple tasks and metrics. Experiments show that ImageNet pre-training speeds up convergence early in training, but does not necessarily provide regularization or improve final target task accuracy. To push the envelope we demonstrate 50.9 AP on COCO object detection without using any external data---a result on par with the top COCO 2017 competition results that used ImageNet pre-training. 
These observations challenge the conventional wisdom of ImageNet pre-training for dependent tasks and we expect these discoveries will encourage people to rethink the current de facto paradigm of `pre-training and fine-tuning' in computer vision.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/He_Rethinking_ImageNet_Pre-Training_ICCV_2019_paper.pdf", @@ -25047,7 +25845,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Rethinking_ImageNet_Pre-Training_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_Rethinking_ImageNet_Pre-Training_ICCV_2019_paper.html", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Kaiming and Girshick,\n Ross and Dollar,\n Piotr\n},\n title = {\n Rethinking ImageNet Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Rethinking Zero-Shot Learning: A Conditional Visual Classification Perspective", @@ -25080,7 +25879,8 @@ "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Boston;", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Kai and Min,\n Martin Renqiang and Fu,\n Yun\n},\n title = {\n Rethinking Zero-Shot Learning: A Conditional Visual Classification Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Revisiting Point Cloud Classification: A New Benchmark Dataset and Classification Model on Real-World Data", @@ -25113,7 +25913,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;2;3;0", - "aff_country_unique": 
"China;Singapore;Japan;Australia" + "aff_country_unique": "China;Singapore;Japan;Australia", + "bibtex": "@InProceedings{Uy_2019_ICCV,\n \n author = {\n Uy,\n Mikaela Angelina and Pham,\n Quang-Hieu and Hua,\n Binh-Son and Nguyen,\n Thanh and Yeung,\n Sai-Kit\n},\n title = {\n Revisiting Point Cloud Classification: A New Benchmark Dataset and Classification Model on Real-World Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Revisiting Radial Distortion Absolute Pose", @@ -25137,7 +25938,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Larsson_Revisiting_Radial_Distortion_Absolute_Pose_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Larsson_Revisiting_Radial_Distortion_Absolute_Pose_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Larsson_2019_ICCV,\n \n author = {\n Larsson,\n Viktor and Sattler,\n Torsten and Kukelova,\n Zuzana and Pollefeys,\n Marc\n},\n title = {\n Revisiting Radial Distortion Absolute Pose\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Robust Change Captioning", @@ -25161,7 +25963,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Park_Robust_Change_Captioning_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Park_Robust_Change_Captioning_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Park_2019_ICCV,\n \n author = {\n Park,\n Dong Huk and Darrell,\n Trevor and Rohrbach,\n Anna\n},\n title = {\n Robust Change Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": 
"Robust Motion Segmentation From Pairwise Matches", @@ -25173,7 +25976,7 @@ "author": "Federica Arrigoni; Tomas Pajdla", "abstract": "In this paper we consider the problem of motion segmentation, where only pairwise correspondences are assumed as input without prior knowledge about tracks. The problem is formulated as a two-step process. First, motion segmentation is performed on image pairs independently. Secondly, we combine independent pairwise segmentation results in a robust way into the final globally consistent segmentation. Our approach is inspired by the success of averaging methods. We demonstrate in simulated as well as in real experiments that our method is very effective in reducing the errors in the pairwise motion segmentation and can cope with large number of mismatches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Arrigoni_Robust_Motion_Segmentation_From_Pairwise_Matches_ICCV_2019_paper.pdf", - "aff": "CIIRC \u2013 Czech Technical University in Prague; CIIRC \u2013 Czech Technical University in Prague", + "aff": "CIIRC – Czech Technical University in Prague; CIIRC – Czech Technical University in Prague", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Arrigoni_Robust_Motion_Segmentation_ICCV_2019_supplemental.pdf", @@ -25194,7 +25997,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Arrigoni_2019_ICCV,\n \n author = {\n Arrigoni,\n Federica and Pajdla,\n Tomas\n},\n title = {\n Robust Motion Segmentation From Pairwise Matches\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Robust Multi-Modality Multi-Object Tracking", @@ -25227,7 +26031,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;2+1;1;1;0", - "aff_country_unique": "Singapore;China;United Kingdom" + "aff_country_unique": "Singapore;China;United Kingdom", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Wenwei and Zhou,\n Hui and Sun,\n Shuyang and Wang,\n Zhe and Shi,\n Jianping and Loy,\n Chen Change\n},\n title = {\n Robust Multi-Modality Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Robust Person Re-Identification by Modelling Feature Uncertainty", @@ -25253,14 +26058,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Robust_Person_Re-Identification_by_Modelling_Feature_Uncertainty_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;2;0+1", - "aff_unique_norm": "University of Surrey;Samsung;University of Edinburgh", + "aff_unique_norm": "University of Surrey;Samsung AI Centre;University of Edinburgh", "aff_unique_dep": ";AI Centre;", "aff_unique_url": "https://www.surrey.ac.uk;https://www.samsung.com/global/campaign/ai-research-centre/;https://www.ed.ac.uk", "aff_unique_abbr": "Surrey;SAC;Edinburgh", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0+0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Tianyuan and Li,\n Da and Yang,\n Yongxin and Hospedales,\n Timothy M. 
and Xiang,\n Tao\n},\n title = {\n Robust Person Re-Identification by Modelling Feature Uncertainty\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Robust Variational Bayesian Point Set Registration", @@ -25272,7 +26078,7 @@ "author": "Jie Zhou; Xinke Ma; Li Liang; Yang Yang; Shijin Xu; Yuhe Liu; Sim-Heng Ong", "abstract": "In this work, we propose a hierarchical Bayesian network based point set registration method to solve missing correspondences and various massive outliers. We construct this network first using the finite Student s t latent mixture model (TLMM), in which distributions of latent variables are estimated by a tree-structured variational inference (VI) so that to obtain a tighter lower bound under the Bayesian framework. We then divide the TLMM into two different mixtures with isotropic and anisotropic covariances for correspondences recovering and outliers identification, respectively. Finally, the parameters of mixing proportion and covariances are both taken as latent variables, which benefits explaining of missing correspondences and heteroscedastic outliers. In addition, a cooling schedule is adopted to anneal prior on covariances and scale variables within designed two phases of transformation, it anneal priors on global and local variables to perform a coarse-to- fine registration. 
In experiments, our method outperforms five state-of-the-art methods in synthetic point set and realistic imaging registrations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhou_Robust_Variational_Bayesian_Point_Set_Registration_ICCV_2019_paper.pdf", - "aff": "School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Arti\ufb01cial Intelligence, Yunnan Normal University; Department of Electrical and Computer Engineering, National University of Singapore", + "aff": "School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Artificial Intelligence, 
Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; School of Information Science and Technology, Yunnan Normal University + Laboratory of Pattern Recognition and Artificial Intelligence, Yunnan Normal University; Department of Electrical and Computer Engineering, National University of Singapore", "project": "", "github": "", "supp": "", @@ -25285,15 +26091,16 @@ "email": "163.com; ; ; ; ; ; ", "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Robust_Variational_Bayesian_Point_Set_Registration_ICCV_2019_paper.html", - "aff_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0;1", - "aff_unique_norm": "Yunnan Normal University;National University of Singapore", - "aff_unique_dep": "School of Information Science and Technology;Department of Electrical and Computer Engineering", - "aff_unique_url": "http://www.ynnu.edu.cn;https://www.nus.edu.sg", - "aff_unique_abbr": ";NUS", + "aff_unique_index": "1;1;1;1;1;1;2", + "aff_unique_norm": ";Yunnan Normal University;National University of Singapore", + "aff_unique_dep": ";Laboratory of Pattern Recognition and Artificial Intelligence;Department of Electrical and Computer Engineering", + "aff_unique_url": ";http://www.ynnu.edu.cn;https://www.nus.edu.sg", + "aff_unique_abbr": ";;NUS", "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", - "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique_index": "1;1;1;1;1;1;2", + "aff_country_unique": ";China;Singapore", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Jie and Ma,\n Xinke and Liang,\n Li and Yang,\n Yang and Xu,\n Shijin and Liu,\n Yuhe and Ong,\n Sim-Heng\n},\n title = {\n Robust Variational Bayesian Point Set Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "S2GAN: Share Aging Factors Across Ages and Share Aging Trends Among Individuals", @@ -25319,14 +26126,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/He_S2GAN_Share_Aging_Factors_Across_Ages_and_Share_Aging_Trends_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0+1+2;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Computing Technology;;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.cas.ac.cn;http://www.ucas.ac.cn;", "aff_unique_abbr": "CAS;UCAS;", "aff_campus_unique_index": "0+0;0+0;0+0+1;0+0", "aff_campus_unique": "Beijing;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Zhenliang and Kan,\n Meina and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n S2GAN: Share Aging Factors Across Ages and Share Aging Trends Among Individuals\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "S4L: Self-Supervised Semi-Supervised Learning", @@ -25359,7 +26167,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhai_2019_ICCV,\n \n author = {\n Zhai,\n Xiaohua and Oliver,\n Avital and Kolesnikov,\n Alexander and Beyer,\n Lucas\n},\n title = {\n S4L: Self-Supervised Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SANet: Scene Agnostic Network for Camera Localization", @@ -25392,7 +26201,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "Canada;China" + "aff_country_unique": "Canada;China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Luwei and Bai,\n Ziqian and Tang,\n Chengzhou and Li,\n Honghua and Furukawa,\n Yasutaka and Tan,\n Ping\n},\n title = {\n SANet: Scene Agnostic Network for Camera Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SBSGAN: Suppression of Inter-Domain Background Shift for Person Re-Identification", @@ -25425,7 +26235,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Yan and Wu,\n Qiang and Xu,\n JingSong and Zhong,\n Yi\n},\n title = {\n SBSGAN: Suppression of Inter-Domain Background Shift for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SC-FEGAN: Face Editing Generative Adversarial Network With User's Sketch and Color", @@ -25458,7 +26269,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jo_2019_ICCV,\n \n author = {\n Jo,\n Youngjoo and Park,\n Jongyoul\n},\n title = {\n SC-FEGAN: Face Editing Generative Adversarial Network With User's Sketch and Color\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SCRDet: Towards More Robust Detection for Small, Cluttered and Rotated Objects", @@ -25482,7 +26294,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_SCRDet_Towards_More_Robust_Detection_for_Small_Cluttered_and_Rotated_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_SCRDet_Towards_More_Robust_Detection_for_Small_Cluttered_and_Rotated_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Xue and Yang,\n Jirui and Yan,\n Junchi and Zhang,\n Yue and Zhang,\n Tengfei and Guo,\n Zhi and Sun,\n Xian and Fu,\n Kun\n},\n title = {\n SCRDet: Towards More Robust Detection for Small,\n Cluttered and Rotated Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SCSampler: Sampling Salient Clips From Video for Efficient Action Recognition", @@ -25508,14 +26321,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Korbar_SCSampler_Sampling_Salient_Clips_From_Video_for_Efficient_Action_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI", "aff_unique_url": "https://www.facebook.com", "aff_unique_abbr": "Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Korbar_2019_ICCV,\n \n author = {\n Korbar,\n Bruno and Tran,\n Du and Torresani,\n Lorenzo\n},\n title = {\n SCSampler: Sampling Salient Clips From Video for Efficient Action Recognition\n},\n booktitle = 
{\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SENSE: A Shared Encoder Network for Scene-Flow Estimation", @@ -25541,14 +26355,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Jiang_SENSE_A_Shared_Encoder_Network_for_Scene-Flow_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;2;0;1", - "aff_unique_norm": "University of Massachusetts Amherst;NVIDIA;Georgia Institute of Technology", - "aff_unique_dep": ";NVIDIA Corporation;", + "aff_unique_norm": "University of Massachusetts Amherst;NVIDIA Corporation;Georgia Institute of Technology", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.umass.edu;https://www.nvidia.com;https://www.gatech.edu", "aff_unique_abbr": "UMass Amherst;NVIDIA;Georgia Tech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Huaizu and Sun,\n Deqing and Jampani,\n Varun and Lv,\n Zhaoyang and Learned-Miller,\n Erik and Kautz,\n Jan\n},\n title = {\n SENSE: A Shared Encoder Network for Scene-Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SID4VAM: A Benchmark Dataset With Synthetic Images for Visual Attention Modeling", @@ -25556,7 +26371,7 @@ "status": "Poster", "track": "main", "pid": "1404", - "author_site": "David Berga, Xos\u00c3\u00a9 R. Fdez-Vidal, Xavier Otazu, Xos\u00c3\u00a9 M. Pardo", + "author_site": "David Berga, Xosé R. Fdez-Vidal, Xavier Otazu, Xosé M. Pardo", "author": "David Berga; Xose R. Fdez-Vidal; Xavier Otazu; Xose M. Pardo", "abstract": "A benchmark of saliency models performance with a synthetic image dataset is provided. 
Model performance is evaluated through saliency metrics as well as the influence of model inspiration and consistency with human psychophysics. SID4VAM is composed of 230 synthetic images, with known salient regions. Images were generated with 15 distinct types of low-level features (e.g. orientation, brightness, color, size...) with a target-distractor pop-out type of synthetic patterns. We have used Free-Viewing and Visual Search task instructions and 7 feature contrasts for each feature category. Our study reveals that state-of-the-art Deep Learning saliency models do not perform well with synthetic pattern images, instead, models with Spectral/Fourier inspiration outperform others in saliency metrics and are more consistent with human psychophysical experimentation. This study proposes a new way to evaluate saliency models in the forthcoming literature, accounting for synthetic images with uniquely low-level feature contexts, distinct from previous eye tracking image datasets.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Berga_SID4VAM_A_Benchmark_Dataset_With_Synthetic_Images_for_Visual_Attention_ICCV_2019_paper.pdf", @@ -25581,7 +26396,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Berga_2019_ICCV,\n \n author = {\n Berga,\n David and Fdez-Vidal,\n Xose R. 
and Otazu,\n Xavier and Pardo,\n Xose M.\n},\n title = {\n SID4VAM: A Benchmark Dataset With Synthetic Images for Visual Attention Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SILCO: Show a Few Images, Localize the Common Object", @@ -25605,7 +26421,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_SILCO_Show_a_Few_Images_Localize_the_Common_Object_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hu_SILCO_Show_a_Few_Images_Localize_the_Common_Object_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Hu_2019_ICCV,\n \n author = {\n Hu,\n Tao and Mettes,\n Pascal and Huang,\n Jia-Hong and Snoek,\n Cees G. M.\n},\n title = {\n SILCO: Show a Few Images,\n Localize the Common Object\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SME-Net: Sparse Motion Estimation for Parametric Video Prediction Through Reinforcement Learning", @@ -25638,7 +26455,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ho_2019_ICCV,\n \n author = {\n Ho,\n Yung-Han and Cho,\n Chuan-Yuan and Peng,\n Wen-Hsiao and Jin,\n Guo-Lun\n},\n title = {\n SME-Net: Sparse Motion Estimation for Parametric Video Prediction Through Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SO-HandNet: Self-Organizing Network for 3D Hand Pose Estimation With Semi-Supervised Learning", @@ -25671,7 +26489,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": 
";Buffalo", "aff_country_unique_index": "0;0;1;0;0;2", - "aff_country_unique": "China;Singapore;United States" + "aff_country_unique": "China;Singapore;United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yujin and Tu,\n Zhigang and Ge,\n Liuhao and Zhang,\n Dejun and Chen,\n Ruizhi and Yuan,\n Junsong\n},\n title = {\n SO-HandNet: Self-Organizing Network for 3D Hand Pose Estimation With Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SPGNet: Semantic Prediction Guidance for Scene Parsing", @@ -25697,14 +26516,15 @@ "author_num": 9, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cheng_SPGNet_Semantic_Prediction_Guidance_for_Scene_Parsing_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0;0;2;0;0;2+0+3", - "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Technology Sydney;IBM;University of Oregon", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;University of Technology, Sydney;IBM;University of Oregon", "aff_unique_dep": ";ReLER;IBM Research;", "aff_unique_url": "https://www illinois.edu;https://www.uts.edu.au;https://www.ibm.com/research;https://www.uoregon.edu", "aff_unique_abbr": "UIUC;UTS;IBM;UO", "aff_campus_unique_index": "0;0;0+1;0;0;0;0;0", "aff_campus_unique": "Urbana-Champaign;Sydney;", "aff_country_unique_index": "0;0;0+1;0;0;0;0;0;0+0+0", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Cheng_2019_ICCV,\n \n author = {\n Cheng,\n Bowen and Chen,\n Liang-Chieh and Wei,\n Yunchao and Zhu,\n Yukun and Huang,\n Zilong and Xiong,\n Jinjun and Huang,\n Thomas S. 
and Hwu,\n Wen-Mei and Shi,\n Honghui\n},\n title = {\n SPGNet: Semantic Prediction Guidance for Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SPLINE-Net: Sparse Photometric Stereo Through Lighting Interpolation and Normal Estimation Networks", @@ -25730,14 +26550,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zheng_SPLINE-Net_Sparse_Photometric_Stereo_Through_Lighting_Interpolation_and_Normal_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0;1;2+3;0;2+3;0", - "aff_unique_norm": "Nanyang Technological University;Tsinghua University;Peking University;Pengcheng Laboratory", - "aff_unique_dep": "School of Electrical and Electronic Engineering;Department of Precision Instrument;Department of Computer Science;Peng Cheng Laboratory", + "aff_unique_norm": "Nanyang Technological University;Tsinghua University;Peking University;Peng Cheng Laboratory", + "aff_unique_dep": "School of Electrical and Electronic Engineering;Department of Precision Instrument;Department of Computer Science;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.tsinghua.edu.cn;http://www.pku.edu.cn;", "aff_unique_abbr": "NTU;THU;PKU;", "aff_campus_unique_index": "0;1;1+2;0;1+2;0", "aff_campus_unique": "Singapore;Beijing;Shenzhen", "aff_country_unique_index": "0;1;1+1;0;1+1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Zheng_2019_ICCV,\n \n author = {\n Zheng,\n Qian and Jia,\n Yiming and Shi,\n Boxin and Jiang,\n Xudong and Duan,\n Ling-Yu and Kot,\n Alex C.\n},\n title = {\n SPLINE-Net: Sparse Photometric Stereo Through Lighting Interpolation and Normal Estimation Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SRM: 
A Style-Based Recalibration Module for Convolutional Neural Networks", @@ -25770,7 +26591,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n HyunJae and Kim,\n Hyo-Eun and Nam,\n Hyeonseob\n},\n title = {\n SRM: A Style-Based Recalibration Module for Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SROBB: Targeted Perceptual Loss for Single Image Super-Resolution", @@ -25796,14 +26618,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Rad_SROBB_Targeted_Perceptual_Loss_for_Single_Image_Super-Resolution_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;0+2;0", - "aff_unique_norm": "EPFL;Swisscom AG;ITU", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;Swisscom AG;ITU", "aff_unique_dep": "LTS5;AI Lab;SiMiT Lab", "aff_unique_url": "https://www.epfl.ch;https://www.swisscom.ch/en.html;https://www.itu.edu.tr", - "aff_unique_abbr": "EPFL;;", + "aff_unique_abbr": "EPFL;;ITU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+1;0", - "aff_country_unique": "Switzerland;T\u00fcrkiye" + "aff_country_unique": "Switzerland;Turkey", + "bibtex": "@InProceedings{Rad_2019_ICCV,\n \n author = {\n Rad,\n Mohammad Saeed and Bozorgtabar,\n Behzad and Marti,\n Urs-Viktor and Basler,\n Max and Ekenel,\n Hazim Kemal and Thiran,\n Jean-Philippe\n},\n title = {\n SROBB: Targeted Perceptual Loss for Single Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SSAP: Single-Shot Instance Segmentation With Affinity Pyramid", @@ 
-25831,12 +26654,13 @@ "aff_unique_index": "0+1;2;0+1;0+1;2;2;0+1+0", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Horizon Robotics", "aff_unique_dep": "Institute of Automation;;", - "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;https://www.horizon-robotics.com", + "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;https://www.horizon-robotics.com/", "aff_unique_abbr": "CAS;UCAS;Horizon Robotics", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Naiyu and Shan,\n Yanhu and Wang,\n Yupei and Zhao,\n Xin and Yu,\n Yinan and Yang,\n Ming and Huang,\n Kaiqi\n},\n title = {\n SSAP: Single-Shot Instance Segmentation With Affinity Pyramid\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SSF-DAN: Separated Semantic Feature Based Domain Adaptation Network for Semantic Segmentation", @@ -25862,14 +26686,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Du_SSF-DAN_Separated_Semantic_Feature_Based_Domain_Adaptation_Network_for_Semantic_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;1;1;2;0+3", - "aff_unique_norm": "Shanghai Institute of Microsystem and Information Technology;Fudan University;Baidu;ShanghaiTech University", - "aff_unique_dep": "Bionic Vision System Laboratory, State Key Laboratory of Transducer Technology;Institute of Science and Technology for Brain-Inspired Intelligence;Baidu Inc.;", + "aff_unique_norm": "Shanghai Institute of Microsystem and Information Technology;Fudan University;Baidu Inc.;ShanghaiTech University", + "aff_unique_dep": "Bionic Vision System Laboratory, State Key Laboratory of Transducer Technology;Institute of Science 
and Technology for Brain-Inspired Intelligence;;", "aff_unique_url": "http://www.sIMIT.ac.cn;https://www.fudan.edu.cn/en/;https://www.baidu.com;http://www.shanghaitech.edu.cn", "aff_unique_abbr": "SIMIT;Fudan;Baidu;ShanghaiTech", "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Du_2019_ICCV,\n \n author = {\n Du,\n Liang and Tan,\n Jingang and Yang,\n Hongye and Feng,\n Jianfeng and Xue,\n Xiangyang and Zheng,\n Qibao and Ye,\n Xiaoqing and Zhang,\n Xiaolin\n},\n title = {\n SSF-DAN: Separated Semantic Feature Based Domain Adaptation Network for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "STD: Sparse-to-Dense 3D Object Detector for Point Cloud", @@ -25895,14 +26720,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_STD_Sparse-to-Dense_3D_Object_Detector_for_Point_Cloud_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;1", - "aff_unique_norm": "Tencent;Chinese University of Hong Kong", + "aff_unique_norm": "Tencent;The Chinese University of Hong Kong", "aff_unique_dep": "YouTu Lab;", "aff_unique_url": "https://www.tencent.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "Tencent;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Zetong and Sun,\n Yanan and Liu,\n Shu and Shen,\n Xiaoyong and Jia,\n Jiaya\n},\n title = {\n STD: Sparse-to-Dense 3D Object Detector for Point Cloud\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2019\n} \n}" }, { "title": "STGAT: Modeling Spatial-Temporal Interactions for Human Trajectory Prediction", @@ -25935,7 +26761,8 @@ "aff_campus_unique_index": "0+0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Yingfan and Bi,\n Huikun and Li,\n Zhaoxin and Mao,\n Tianlu and Wang,\n Zhaoqi\n},\n title = {\n STGAT: Modeling Spatial-Temporal Interactions for Human Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "STM: SpatioTemporal and Motion Encoding for Action Recognition", @@ -25968,7 +26795,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Boyuan and Wang,\n MengMeng and Gan,\n Weihao and Wu,\n Wei and Yan,\n Junjie\n},\n title = {\n STM: SpatioTemporal and Motion Encoding for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SVD: A Large-Scale Short Video Dataset for Near-Duplicate Video Retrieval", @@ -26001,7 +26829,8 @@ "aff_campus_unique_index": "0;1;1;0;1;0", "aff_campus_unique": "Nanjing;Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Qing-Yuan and He,\n Yi and Li,\n Gen and Lin,\n Jian and Li,\n Lei and Li,\n Wu-Jun\n},\n title = {\n SVD: A Large-Scale Short Video Dataset for Near-Duplicate Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Saliency-Guided Attention Network for Image-Sentence Matching", @@ -26034,7 +26863,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Tianjin;Coventry", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Ji_2019_ICCV,\n \n author = {\n Ji,\n Zhong and Wang,\n Haoran and Han,\n Jungong and Pang,\n Yanwei\n},\n title = {\n Saliency-Guided Attention Network for Image-Sentence Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sampling Wisely: Deep Image Embedding by Top-K Precision Optimization", @@ -26059,15 +26889,16 @@ "email": "jd.com;126.com;gmail.com;pku.edu.cn;live.com", "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lu_Sampling_Wisely_Deep_Image_Embedding_by_Top-K_Precision_Optimization_ICCV_2019_paper.html", - "aff_unique_index": "0;0+1;0;2;0", - "aff_unique_norm": "JD;Harbin Institute of Technology;Peking University", - "aff_unique_dep": "Business Growth BU;;", - "aff_unique_url": "https://www.jd.com;http://www.hit.edu.cn/;http://www.pku.edu.cn", - "aff_unique_abbr": "JD;HIT;Peking U", + "aff_unique_index": "0;1+2;1;3;1", + "aff_unique_norm": "JD;JD AI Research;Harbin Institute of Technology;Peking University", + "aff_unique_dep": "Business Growth BU;;;", + "aff_unique_url": "https://www.jd.com;https://www.jd.com;http://www.hit.edu.cn/;http://www.pku.edu.cn", + "aff_unique_abbr": "JD;JD AI;HIT;Peking U", "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2019_ICCV,\n \n author = {\n Lu,\n Jing and Xu,\n Chaofan and Zhang,\n 
Wei and Duan,\n Ling-Yu and Mei,\n Tao\n},\n title = {\n Sampling Wisely: Deep Image Embedding by Top-K Precision Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sampling-Free Epistemic Uncertainty Estimation Using Approximated Variance Propagation", @@ -26094,13 +26925,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Postels_Sampling-Free_Epistemic_Uncertainty_Estimation_Using_Approximated_Variance_Propagation_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;0;0;0+2", "aff_unique_norm": "Technical University of Munich;Autonomous Intelligent Driving GmbH;Google", - "aff_unique_dep": ";;Google", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;https://www.aid.io;https://www.google.com", "aff_unique_abbr": "TUM;AID;Google", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;0;0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Postels_2019_ICCV,\n \n author = {\n Postels,\n Janis and Ferroni,\n Francesco and Coskun,\n Huseyin and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n Sampling-Free Epistemic Uncertainty Estimation Using Approximated Variance Propagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scalable Place Recognition Under Appearance Change for Autonomous Driving", @@ -26126,14 +26958,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Doan_Scalable_Place_Recognition_Under_Appearance_Change_for_Autonomous_Driving_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;1;0", - "aff_unique_norm": "University of Adelaide;University of Liverpool", + "aff_unique_norm": "The University of 
Adelaide;University of Liverpool", "aff_unique_dep": "School of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.adelaide.edu.au;https://www.liverpool.ac.uk", "aff_unique_abbr": "Adelaide;Liv Uni", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "Australia;United Kingdom" + "aff_country_unique": "Australia;United Kingdom", + "bibtex": "@InProceedings{Doan_2019_ICCV,\n \n author = {\n Doan,\n Anh-Dzung and Latif,\n Yasir and Chin,\n Tat-Jun and Liu,\n Yu and Do,\n Thanh-Toan and Reid,\n Ian\n},\n title = {\n Scalable Place Recognition Under Appearance Change for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scalable Verified Training for Provably Robust Image Classification", @@ -26141,7 +26974,7 @@ "status": "Poster", "track": "main", "pid": "4660", - "author_site": "Sven Gowal, Krishnamurthy (Dj) Dvijotham, Robert Stanforth, Rudy Bunel, Chongli Qin, Jonathan Uesato, Relja Arandjelovi\u00c4\u0087, Timothy Mann, Pushmeet Kohli", + "author_site": "Sven Gowal, Krishnamurthy (Dj) Dvijotham, Robert Stanforth, Rudy Bunel, Chongli Qin, Jonathan Uesato, Relja Arandjelović, Timothy Mann, Pushmeet Kohli", "author": "Sven Gowal; Krishnamurthy (Dj) Dvijotham; Robert Stanforth; Rudy Bunel; Chongli Qin; Jonathan Uesato; Relja Arandjelovic; Timothy Mann; Pushmeet Kohli", "abstract": "Recent work has shown that it is possible to train deep neural networks that are provably robust to norm-bounded adversarial perturbations. Most of these methods are based on minimizing an upper bound on the worst-case loss over all possible adversarial perturbations. While these techniques show promise, they often result in difficult optimization procedures that remain hard to scale to larger networks. 
Through a comprehensive analysis, we show how a simple bounding technique, interval bound propagation (IBP), can be exploited to train large provably robust neural networks that beat the state-of-the-art in verified accuracy. While the upper bound computed by IBP can be quite weak for general networks, we demonstrate that an appropriate loss and clever hyper-parameter schedule allow the network to adapt such that the IBP bound is tight. This results in a fast and stable learning algorithm that outperforms more sophisticated methods and achieves state-of-the-art results on MNIST, CIFAR-10 and SVHN. It also allows us to train the largest model to be verified beyond vacuous bounds on a downscaled version of IMAGENET.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Gowal_Scalable_Verified_Training_for_Provably_Robust_Image_Classification_ICCV_2019_paper.pdf", @@ -26166,7 +26999,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Gowal_2019_ICCV,\n \n author = {\n Gowal,\n Sven and Dvijotham,\n Krishnamurthy (Dj) and Stanforth,\n Robert and Bunel,\n Rudy and Qin,\n Chongli and Uesato,\n Jonathan and Arandjelovic,\n Relja and Mann,\n Timothy and Kohli,\n Pushmeet\n},\n title = {\n Scalable Verified Training for Provably Robust Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scale-Aware Trident Networks for Object Detection", @@ -26199,7 +27033,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;1+0+0;1;0+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Yanghao and Chen,\n Yuntao and Wang,\n Naiyan 
and Zhang,\n Zhaoxiang\n},\n title = {\n Scale-Aware Trident Networks for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scaling Object Detection by Transferring Classification Weights", @@ -26232,7 +27067,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Kuen_2019_ICCV,\n \n author = {\n Kuen,\n Jason and Perazzi,\n Federico and Lin,\n Zhe and Zhang,\n Jianming and Tan,\n Yap-Peng\n},\n title = {\n Scaling Object Detection by Transferring Classification Weights\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scaling Recurrent Models via Orthogonal Approximations in Tensor Trains", @@ -26265,7 +27101,8 @@ "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Madison;Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mehta_2019_ICCV,\n \n author = {\n Mehta,\n Ronak and Chakraborty,\n Rudrasis and Xiong,\n Yunyang and Singh,\n Vikas\n},\n title = {\n Scaling Recurrent Models via Orthogonal Approximations in Tensor Trains\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scaling and Benchmarking Self-Supervised Visual Representation Learning", @@ -26291,14 +27128,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": 
"Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Goyal_2019_ICCV,\n \n author = {\n Goyal,\n Priya and Mahajan,\n Dhruv and Gupta,\n Abhinav and Misra,\n Ishan\n},\n title = {\n Scaling and Benchmarking Self-Supervised Visual Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scene Graph Prediction With Limited Labels", @@ -26306,7 +27144,7 @@ "status": "Poster", "track": "main", "pid": "3739", - "author_site": "Vincent S. Chen, Paroma Varma, Ranjay Krishna, Michael Bernstein, Christopher R\u00c3\u00a9, Li Fei-Fei", + "author_site": "Vincent S. Chen, Paroma Varma, Ranjay Krishna, Michael Bernstein, Christopher Ré, Li Fei-Fei", "author": "Vincent S. Chen; Paroma Varma; Ranjay Krishna; Michael Bernstein; Christopher Re; Li Fei-Fei", "abstract": "Visual knowledge bases such as Visual Genome power numerous applications in computer vision, including visual question answering and captioning, but suffer from sparse, incomplete relationships. All scene graph models to date are limited to training on a small set of visual relationships that have thousands of training labels each. Hiring human annotators is expensive, and using textual knowledge base completion methods are incompatible with visual data. In this paper, we introduce a semi-supervised method that assigns probabilistic relationship labels to a large number of unlabeled images using few labeled examples. 
We analyze visual relationships to suggest two types of image-agnostic features that are used to generate noisy heuristics, whose outputs are aggregated using a factor graph-based generative model. With as few as 10 labeled examples per relationship, the generative model creates enough training data to train any existing state-of-the-art scene graph model. We demonstrate that our method outperforms all baseline approaches on scene graph prediction by5.16 recall@100 for PREDCLS. In our limited label setting, we define a complexity metric for relationships that serves as an indicator (R^2 = 0.778) for conditions under which our method succeeds over transfer learning, the de-facto approach for training with limited labels.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Chen_Scene_Graph_Prediction_With_Limited_Labels_ICCV_2019_paper.pdf", @@ -26331,7 +27169,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Vincent S. and Varma,\n Paroma and Krishna,\n Ranjay and Bernstein,\n Michael and Re,\n Christopher and Fei-Fei,\n Li\n},\n title = {\n Scene Graph Prediction With Limited Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scene Text Visual Question Answering", @@ -26339,7 +27178,7 @@ "status": "Poster", "track": "main", "pid": "5810", - "author_site": "Ali Furkan Biten, Rub\u00c3\u00a8n Tito, Andr\u00c3\u00a9s Mafla, Lluis Gomez, Mar\u00c3\u00a7al Rusi\u00c3\u00b1ol, Ernest Valveny, C.V. Jawahar, Dimosthenis Karatzas", + "author_site": "Ali Furkan Biten, Rubèn Tito, Andrés Mafla, Lluis Gomez, Marçal Rusiñol, Ernest Valveny, C.V. 
Jawahar, Dimosthenis Karatzas", "author": "Ali Furkan Biten; Ruben Tito; Andres Mafla; Lluis Gomez; Marcal Rusinol; Ernest Valveny; C.V. Jawahar; Dimosthenis Karatzas", "abstract": "Current visual question answering datasets do not consider the rich semantic information conveyed by text within an image. In this work, we present a new dataset, ST-VQA, that aims to highlight the importance of exploiting high-level semantic information present in images as textual cues in the Visual Question Answering process. We use this dataset to define a series of tasks of increasing difficulty for which reading the scene text in the context provided by the visual information is necessary to reason and generate an appropriate answer. We propose a new evaluation metric for these tasks to account both for reasoning errors as well as shortcomings of the text recognition module. In addition we put forward a series of baseline methods, which provide further insight to the newly released dataset, and set the scene for further research.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Biten_Scene_Text_Visual_Question_Answering_ICCV_2019_paper.pdf", @@ -26357,14 +27196,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Biten_Scene_Text_Visual_Question_Answering_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0;0;1;0", - "aff_unique_norm": "Universitat Aut\u00f2noma de Barcelona;International Institute of Information Technology, Hyderabad", + "aff_unique_norm": "Universitat Autònoma de Barcelona;International Institute of Information Technology, Hyderabad", "aff_unique_dep": "Computer Vision Center;", "aff_unique_url": "https://www.uab.cat;https://iiit Hyderabad.ac.in", "aff_unique_abbr": "UAB;IIIT Hyderabad", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hyderabad", "aff_country_unique_index": "0;0;0;0;0;0;1;0", - "aff_country_unique": "Spain;India" + "aff_country_unique": "Spain;India", + "bibtex": 
"@InProceedings{Biten_2019_ICCV,\n \n author = {\n Biten,\n Ali Furkan and Tito,\n Ruben and Mafla,\n Andres and Gomez,\n Lluis and Rusinol,\n Marcal and Valveny,\n Ernest and Jawahar,\n C.V. and Karatzas,\n Dimosthenis\n},\n title = {\n Scene Text Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SceneGraphNet: Neural Message Passing for 3D Indoor Scene Augmentation", @@ -26397,7 +27237,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Yang and While,\n Zachary and Kalogerakis,\n Evangelos\n},\n title = {\n SceneGraphNet: Neural Message Passing for 3D Indoor Scene Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Scoot: A Perceptual Metric for Facial Sketches", @@ -26409,7 +27250,7 @@ "author": "Deng-Ping Fan; ShengChuan Zhang; Yu-Huan Wu; Yun Liu; Ming-Ming Cheng; Bo Ren; Paul L. Rosin; Rongrong Ji", "abstract": "While it is trivial for humans to quickly assess the perceptual similarity between two images, the underlying mechanism are thought to be quite complex. Despite this, the most widely adopted perceptual metrics today, such as SSIM and FSIM, are simple, shallow functions, and fail to consider many factors of human perception. Recently, the facial modeling community has observed that the inclusion of both structure and texture has a significant positive benefit for face sketch synthesis (FSS). But how perceptual are these so-called \"perceptual features\"? Which elements are critical for their success? 
In this paper, we design a perceptual metric, called Structure Co-Occurrence Texture (Scoot), which simultaneously considers the block-level spatial structure and co-occurrence texture statistics. To test the quality of metrics, we propose three novel meta-measures based on various reliable properties. Extensive experiments verify that our Scoot metric exceeds the performance of prior work. Besides, we built the first largest scale (152k judgments) human-perception-based sketch database that can evaluate how well a metric consistent with human perception. Our results suggest that \"spatial structure\" and \"co-occurrence texture\" are two generally applicable perceptual features in face sketch synthesis.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Fan_Scoot_A_Perceptual_Metric_for_Facial_Sketches_ICCV_2019_paper.pdf", - "aff": "TKLNDST, CS, Nankai University+Inception Institute of Arti\ufb01cial Intelligence (IIAI); Department of Arti\ufb01cial Intelligence, School of Informatics, Xiamen University; TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; Cardiff University; Department of Arti\ufb01cial Intelligence, School of Informatics, Xiamen University+Peng Cheng Lab", + "aff": "TKLNDST, CS, Nankai University+Inception Institute of Artificial Intelligence (IIAI); Department of Artificial Intelligence, School of Informatics, Xiamen University; TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; TKLNDST, CS, Nankai University; Cardiff University; Department of Artificial Intelligence, School of Informatics, Xiamen University+Peng Cheng Lab", "project": "http://mmcheng.net/scoot/", "github": "", "supp": "", @@ -26423,14 +27264,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fan_Scoot_A_Perceptual_Metric_for_Facial_Sketches_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;0;0;0;0;3;2+4", - 
"aff_unique_norm": "Nankai University;Inception Institute of Artificial Intelligence;Xiamen University;Cardiff University;Pengcheng Laboratory", - "aff_unique_dep": "Computer Science;;Department of Arti\ufb01cial Intelligence, School of Informatics;;Peng Cheng Lab", + "aff_unique_norm": "Nankai University;Inception Institute of Artificial Intelligence;Xiamen University;Cardiff University;Peng Cheng Lab", + "aff_unique_dep": "Computer Science;;Department of Artificial Intelligence, School of Informatics;;", "aff_unique_url": "http://www.nankai.edu.cn;https://www.iiai.cn;https://www.xmu.edu.cn;https://www.cardiff.ac.uk;", "aff_unique_abbr": "Nankai U;IIAI;XMU;Cardiff;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;1;0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Fan_2019_ICCV,\n \n author = {\n Fan,\n Deng-Ping and Zhang,\n ShengChuan and Wu,\n Yu-Huan and Liu,\n Yun and Cheng,\n Ming-Ming and Ren,\n Bo and Rosin,\n Paul L. and Ji,\n Rongrong\n},\n title = {\n Scoot: A Perceptual Metric for Facial Sketches\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Searching for MobileNetV3", @@ -26463,7 +27305,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Howard_2019_ICCV,\n \n author = {\n Howard,\n Andrew and Sandler,\n Mark and Chu,\n Grace and Chen,\n Liang-Chieh and Chen,\n Bo and Tan,\n Mingxing and Wang,\n Weijun and Zhu,\n Yukun and Pang,\n Ruoming and Vasudevan,\n Vijay and Le,\n Quoc V. 
and Adam,\n Hartwig\n},\n title = {\n Searching for MobileNetV3\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Second-Order Non-Local Attention Networks for Person Re-Identification", @@ -26496,7 +27339,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xia_2019_ICCV,\n \n author = {\n Xia,\n Bryan (Ning) and Gong,\n Yuan and Zhang,\n Yizhe and Poellabauer,\n Christian\n},\n title = {\n Second-Order Non-Local Attention Networks for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "See-Through-Text Grouping for Referring Image Segmentation", @@ -26529,7 +27373,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Ding-Jie and Jia,\n Songhao and Lo,\n Yi-Chen and Chen,\n Hwann-Tzong and Liu,\n Tyng-Luh\n},\n title = {\n See-Through-Text Grouping for Referring Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Seeing Motion in the Dark", @@ -26555,14 +27400,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Seeing_Motion_in_the_Dark_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;2", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Hong Kong University of Science and Technology;Intel", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Hong Kong 
University of Science and Technology;Intel Corporation", "aff_unique_dep": ";;Intel Labs", "aff_unique_url": "https://www illinois.edu;https://www.ust.hk;https://www.intel.com", "aff_unique_abbr": "UIUC;HKUST;Intel", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Urbana-Champaign;Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Chen and Chen,\n Qifeng and Do,\n Minh N. and Koltun,\n Vladlen\n},\n title = {\n Seeing Motion in the Dark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Seeing What a GAN Cannot Generate", @@ -26588,14 +27434,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bau_Seeing_What_a_GAN_Cannot_Generate_ICCV_2019_paper.html", "aff_unique_index": "0+0;0;0;0;0;1;0+0", - "aff_unique_norm": "Massachusetts Institute of Technology;Chinese University of Hong Kong", + "aff_unique_norm": "Massachusetts Institute of Technology;The Chinese University of Hong Kong", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;", "aff_unique_url": "https://www.csail.mit.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "MIT CSAIL;CUHK", "aff_campus_unique_index": "0;0;0;0;2;0", "aff_campus_unique": "Cambridge;;Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0;1;0+0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Bau_2019_ICCV,\n \n author = {\n Bau,\n David and Zhu,\n Jun-Yan and Wulff,\n Jonas and Peebles,\n William and Strobelt,\n Hendrik and Zhou,\n Bolei and Torralba,\n Antonio\n},\n title = {\n Seeing What a GAN Cannot Generate\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SegEQA: Video Segmentation Based Visual Attention for Embodied Question Answering", @@ -26619,7 +27466,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Luo_SegEQA_Video_Segmentation_Based_Visual_Attention_for_Embodied_Question_Answering_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Luo_SegEQA_Video_Segmentation_Based_Visual_Attention_for_Embodied_Question_Answering_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Luo_2019_ICCV,\n \n author = {\n Luo,\n Haonan and Lin,\n Guosheng and Liu,\n Zichuan and Liu,\n Fayao and Tang,\n Zhenmin and Yao,\n Yazhou\n},\n title = {\n SegEQA: Video Segmentation Based Visual Attention for Embodied Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SegSort: Segmentation by Discriminative Sorting of Segments", @@ -26643,7 +27491,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hwang_SegSort_Segmentation_by_Discriminative_Sorting_of_Segments_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hwang_SegSort_Segmentation_by_Discriminative_Sorting_of_Segments_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Hwang_2019_ICCV,\n \n author = {\n Hwang,\n Jyh-Jing and Yu,\n Stella X. and Shi,\n Jianbo and Collins,\n Maxwell D. 
and Yang,\n Tien-Ju and Zhang,\n Xiao and Chen,\n Liang-Chieh\n},\n title = {\n SegSort: Segmentation by Discriminative Sorting of Segments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Selective Sparse Sampling for Fine-Grained Image Recognition", @@ -26669,14 +27518,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ding_Selective_Sparse_Sampling_for_Fine-Grained_Image_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;0+1;0", - "aff_unique_norm": "University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": ";Peng Cheng Laboratory", + "aff_unique_norm": "University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": ";", "aff_unique_url": "http://www.ucas.ac.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "UCAS;PCL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ding_2019_ICCV,\n \n author = {\n Ding,\n Yao and Zhou,\n Yanzhao and Zhu,\n Yi and Ye,\n Qixiang and Jiao,\n Jianbin\n},\n title = {\n Selective Sparse Sampling for Fine-Grained Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Selectivity or Invariance: Boundary-Aware Salient Object Detection", @@ -26702,14 +27552,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Su_Selectivity_or_Invariance_Boundary-Aware_Salient_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0;1;2+1", - "aff_unique_norm": "Beihang University;Pengcheng Laboratory;Peking University", - "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;Peng Cheng 
Laboratory;School of EE&CS", + "aff_unique_norm": "Beihang University;Peng Cheng Laboratory;Peking University", + "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;;School of EE&CS", "aff_unique_url": "http://www.buaa.edu.cn;;http://www.pku.edu.cn", "aff_unique_abbr": "Beihang;;PKU", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Su_2019_ICCV,\n \n author = {\n Su,\n Jinming and Li,\n Jia and Zhang,\n Yu and Xia,\n Changqun and Tian,\n Yonghong\n},\n title = {\n Selectivity or Invariance: Boundary-Aware Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Critical Attention Learning for Person Re-Identification", @@ -26742,7 +27593,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Guangyi and Lin,\n Chunze and Ren,\n Liangliang and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Self-Critical Attention Learning for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Ensembling With GAN-Based Data Augmentation for Domain Adaptation in Semantic Segmentation", @@ -26775,7 +27627,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choi_2019_ICCV,\n \n author = {\n Choi,\n Jaehoon and Kim,\n Taekyung and Kim,\n 
Changick\n},\n title = {\n Self-Ensembling With GAN-Based Data Augmentation for Domain Adaptation in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Guided Network for Fast Image Denoising", @@ -26808,7 +27661,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Gu_2019_ICCV,\n \n author = {\n Gu,\n Shuhang and Li,\n Yawei and Gool,\n Luc Van and Timofte,\n Radu\n},\n title = {\n Self-Guided Network for Fast Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Similarity Grouping: A Simple Unsupervised Cross Domain Adaptation Approach for Person Re-Identification", @@ -26832,7 +27686,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fu_Self-Similarity_Grouping_A_Simple_Unsupervised_Cross_Domain_Adaptation_Approach_for_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fu_Self-Similarity_Grouping_A_Simple_Unsupervised_Cross_Domain_Adaptation_Approach_for_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Fu_2019_ICCV,\n \n author = {\n Fu,\n Yang and Wei,\n Yunchao and Wang,\n Guanshuo and Zhou,\n Yuqian and Shi,\n Honghui and Huang,\n Thomas S.\n},\n title = {\n Self-Similarity Grouping: A Simple Unsupervised Cross Domain Adaptation Approach for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Supervised Deep Depth Denoising", @@ -26856,7 +27711,8 @@ 
"aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sterzentsenko_Self-Supervised_Deep_Depth_Denoising_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sterzentsenko_Self-Supervised_Deep_Depth_Denoising_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Sterzentsenko_2019_ICCV,\n \n author = {\n Sterzentsenko,\n Vladimiros and Saroglou,\n Leonidas and Chatzitofis,\n Anargyros and Thermos,\n Spyridon and Zioulis,\n Nikolaos and Doumanoglou,\n Alexandros and Zarpalas,\n Dimitrios and Daras,\n Petros\n},\n title = {\n Self-Supervised Deep Depth Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Supervised Difference Detection for Weakly-Supervised Semantic Segmentation", @@ -26880,7 +27736,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shimoda_Self-Supervised_Difference_Detection_for_Weakly-Supervised_Semantic_Segmentation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shimoda_Self-Supervised_Difference_Detection_for_Weakly-Supervised_Semantic_Segmentation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Shimoda_2019_ICCV,\n \n author = {\n Shimoda,\n Wataru and Yanai,\n Keiji\n},\n title = {\n Self-Supervised Difference Detection for Weakly-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Supervised Learning With Geometric Constraints in Monocular Video: Connecting Flow, Depth, and Camera", @@ -26913,7 +27770,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "United 
States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Yuhua and Schmid,\n Cordelia and Sminchisescu,\n Cristian\n},\n title = {\n Self-Supervised Learning With Geometric Constraints in Monocular Video: Connecting Flow,\n Depth,\n and Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Supervised Monocular Depth Hints", @@ -26937,7 +27795,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Watson_Self-Supervised_Monocular_Depth_Hints_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Watson_Self-Supervised_Monocular_Depth_Hints_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Watson_2019_ICCV,\n \n author = {\n Watson,\n Jamie and Firman,\n Michael and Brostow,\n Gabriel J. 
and Turmukhambetov,\n Daniyar\n},\n title = {\n Self-Supervised Monocular Depth Hints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Supervised Moving Vehicle Tracking With Stereo Sound", @@ -26963,14 +27822,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gan_Self-Supervised_Moving_Vehicle_Tracking_With_Stereo_Sound_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;1;0+1;0", - "aff_unique_norm": "Massachusetts Institute of Technology;IBM", + "aff_unique_norm": "Massachusetts Institute of Technology;IBM Research", "aff_unique_dep": "IBM Watson AI Lab;AI", "aff_unique_url": "https://www.mitibmwatsonailab.org;https://www.ibm.com/research", "aff_unique_abbr": "MIT-IBM AI Lab;IBM", "aff_campus_unique_index": ";1;;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0+0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gan_2019_ICCV,\n \n author = {\n Gan,\n Chuang and Zhao,\n Hang and Chen,\n Peihao and Cox,\n David and Torralba,\n Antonio\n},\n title = {\n Self-Supervised Moving Vehicle Tracking With Stereo Sound\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Supervised Representation Learning From Multi-Domain Data", @@ -26996,14 +27856,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Feng_Self-Supervised_Representation_Learning_From_Multi-Domain_Data_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Sydney", + "aff_unique_norm": "The University of Sydney", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.sydney.edu.au", "aff_unique_abbr": "USYD", "aff_campus_unique_index": "0;0;0", 
"aff_campus_unique": "Darlington", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Feng_2019_ICCV,\n \n author = {\n Feng,\n Zeyu and Xu,\n Chang and Tao,\n Dacheng\n},\n title = {\n Self-Supervised Representation Learning From Multi-Domain Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Supervised Representation Learning via Neighborhood-Relational Encoding", @@ -27036,7 +27897,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Iran;United States" + "aff_country_unique": "Iran;United States", + "bibtex": "@InProceedings{Sabokrou_2019_ICCV,\n \n author = {\n Sabokrou,\n Mohammad and Khalooei,\n Mohammad and Adeli,\n Ehsan\n},\n title = {\n Self-Supervised Representation Learning via Neighborhood-Relational Encoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Training With Progressive Augmentation for Unsupervised Cross-Domain Person Re-Identification", @@ -27062,14 +27924,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Self-Training_With_Progressive_Augmentation_for_Unsupervised_Cross-Domain_Person_Re-Identification_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "Tongji University;University of Adelaide", + "aff_unique_norm": "Tongji University;The University of Adelaide", "aff_unique_dep": ";", "aff_unique_url": "https://www.tongji.edu.cn;https://www.adelaide.edu.au", "aff_unique_abbr": "Tongji;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": 
"China;Australia", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Xinyu and Cao,\n Jiewei and Shen,\n Chunhua and You,\n Mingyu\n},\n title = {\n Self-Training With Progressive Augmentation for Unsupervised Cross-Domain Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Self-Training and Adversarial Background Regularization for Unsupervised Domain Adaptive One-Stage Object Detection", @@ -27102,7 +27965,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2019_ICCV,\n \n author = {\n Kim,\n Seunghyeon and Choi,\n Jaehoon and Kim,\n Taekyung and Kim,\n Changick\n},\n title = {\n Self-Training and Adversarial Background Regularization for Unsupervised Domain Adaptive One-Stage Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semantic Adversarial Attacks: Parametric Transformations That Fool Deep Classifiers", @@ -27135,7 +27999,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Joshi_2019_ICCV,\n \n author = {\n Joshi,\n Ameya and Mukherjee,\n Amitangshu and Sarkar,\n Soumik and Hegde,\n Chinmay\n},\n title = {\n Semantic Adversarial Attacks: Parametric Transformations That Fool Deep Classifiers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semantic Part Detection via Matching: Learning to Generalize to Novel Viewpoints From 
Limited Training Data", @@ -27147,7 +28012,7 @@ "author": "Yutong Bai; Qing Liu; Lingxi Xie; Weichao Qiu; Yan Zheng; Alan L. Yuille", "abstract": "Detecting semantic parts of an object is a challenging task, particularly because it is hard to annotate semantic parts and construct large datasets. In this paper, we present an approach which can learn from a small annotated dataset containing a limited range of viewpoints and generalize to detect semantic parts for a much larger range of viewpoints. The approach is based on our matching algorithm, which is used for finding accurate spatial correspondence between two images and transplanting semantic parts annotated on one image to the other. Images in the training set are matched to synthetic images rendered from a 3D CAD model, following which a clustering algorithm is used to automatically annotate semantic parts of the CAD model. During the testing period, this CAD model can synthesize annotated images under every viewpoint. These synthesized images are matched to images in the testing set to detect semantic parts in novel viewpoints. Our algorithm is simple, intuitive, and contains very few parameters. Experiments show our method outperforms standard deep learning approaches and, in particular, performs much better on novel viewpoints. 
For facilitating the future research, code is available: https://github.com/ytongbai/SemanticPartDetection", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Bai_Semantic_Part_Detection_via_Matching_Learning_to_Generalize_to_Novel_ICCV_2019_paper.pdf", - "aff": "Johns Hopkins University; Johns Hopkins University; Johns Hopkins University+Huawei Noah\u2019s Ark Lab; Johns Hopkins University; University of Texas at Austin; Johns Hopkins University", + "aff": "Johns Hopkins University; Johns Hopkins University; Johns Hopkins University+Huawei Noah’s Ark Lab; Johns Hopkins University; University of Texas at Austin; Johns Hopkins University", "project": "", "github": "https://github.com/ytongbai/SemanticPartDetection", "supp": "", @@ -27162,13 +28027,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bai_Semantic_Part_Detection_via_Matching_Learning_to_Generalize_to_Novel_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0;2;0", "aff_unique_norm": "Johns Hopkins University;Huawei;University of Texas at Austin", - "aff_unique_dep": ";Noah\u2019s Ark Lab;", + "aff_unique_dep": ";Noah’s Ark Lab;", "aff_unique_url": "https://www.jhu.edu;https://www.huawei.com;https://www.utexas.edu", "aff_unique_abbr": "JHU;Huawei;UT Austin", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0+1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Bai_2019_ICCV,\n \n author = {\n Bai,\n Yutong and Liu,\n Qing and Xie,\n Lingxi and Qiu,\n Weichao and Zheng,\n Yan and Yuille,\n Alan L.\n},\n title = {\n Semantic Part Detection via Matching: Learning to Generalize to Novel Viewpoints From Limited Training Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semantic Stereo Matching With Pyramid Cost Volumes", @@ -27196,12 
+28062,13 @@ "aff_unique_index": "0+1;0+1;2;0+2+1;0+2+1", "aff_unique_norm": "University of South Carolina;Farsee2 Technology Ltd;Wuhan University", "aff_unique_dep": ";;", - "aff_unique_url": "https://www.sc.edu;;http://www.whu.edu.cn", + "aff_unique_url": "https://www.sc.edu;;http://www.whu.edu.cn/", "aff_unique_abbr": "USC;;WHU", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;1;0+1+1;0+1+1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Zhenyao and Wu,\n Xinyi and Zhang,\n Xiaoping and Wang,\n Song and Ju,\n Lili\n},\n title = {\n Semantic Stereo Matching With Pyramid Cost Volumes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semantic-Aware Knowledge Preservation for Zero-Shot Sketch-Based Image Retrieval", @@ -27213,7 +28080,7 @@ "author": "Qing Liu; Lingxi Xie; Huiyu Wang; Alan L. Yuille", "abstract": "Sketch-based image retrieval (SBIR) is widely recognized as an important vision problem which implies a wide range of real-world applications. Recently, research interests arise in solving this problem under the more realistic and challenging setting of zero-shot learning. In this paper, we investigate this problem from the viewpoint of domain adaptation which we show is critical in improving feature embedding in the zero-shot scenario. Based on a framework which starts with a pre-trained model on ImageNet and fine-tunes it on the training set of SBIR benchmark, we advocate the importance of preserving previously acquired knowledge, e.g., the rich discriminative features learned from ImageNet, to improve the model's transfer ability. 
For this purpose, we design an approach named Semantic-Aware Knowledge prEservation (SAKE), which fine-tunes the pre-trained model in an economical way and leverages semantic information, e.g., inter-class relationship, to achieve the goal of knowledge preservation. Zero-shot experiments on two extended SBIR datasets, TU-Berlin and Sketchy, verify the superior performance of our approach. Extensive diagnostic experiments validate that knowledge preserved benefits SBIR in zero-shot settings, as a large fraction of the performance gain is from the more properly structured feature embedding for photo images.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Liu_Semantic-Aware_Knowledge_Preservation_for_Zero-Shot_Sketch-Based_Image_Retrieval_ICCV_2019_paper.pdf", - "aff": "Johns Hopkins University; Johns Hopkins University + Noah\u2019s Ark Lab, Huawei Inc.; Johns Hopkins University; Johns Hopkins University", + "aff": "Johns Hopkins University; Johns Hopkins University + Noah’s Ark Lab, Huawei Inc.; Johns Hopkins University; Johns Hopkins University", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Liu_Semantic-Aware_Knowledge_Preservation_ICCV_2019_supplemental.pdf", @@ -27227,14 +28094,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Semantic-Aware_Knowledge_Preservation_for_Zero-Shot_Sketch-Based_Image_Retrieval_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;0", - "aff_unique_norm": "Johns Hopkins University;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_norm": "Johns Hopkins University;Huawei Inc.", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.jhu.edu;https://www.huawei.com", "aff_unique_abbr": "JHU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": 
"@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Qing and Xie,\n Lingxi and Wang,\n Huiyu and Yuille,\n Alan L.\n},\n title = {\n Semantic-Aware Knowledge Preservation for Zero-Shot Sketch-Based Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "b0e81cc9ac", @@ -27263,7 +28131,8 @@ "aff_campus_unique_index": "0+0+1;0+0;0+0+1;0+0+1", "aff_campus_unique": "Shenyang;Beijing", "aff_country_unique_index": "0+0+0;0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2019_ICCV,\n \n author = {\n Dong,\n Jiahua and Cong,\n Yang and Sun,\n Gan and Hou,\n Dongdong\n},\n title = {\n Semantic-Transferable Weakly-Supervised Endoscopic Lesions Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SemanticKITTI: A Dataset for Semantic Scene Understanding of LiDAR Sequences", @@ -27271,7 +28140,7 @@ "status": "Oral", "track": "main", "pid": "2944", - "author_site": "Jens Behley, Martin Garbade, Andres Milioto, Jan Quenzel, Sven Behnke, Cyrill Stachniss, J\u00c3\u00bcrgen Gall", + "author_site": "Jens Behley, Martin Garbade, Andres Milioto, Jan Quenzel, Sven Behnke, Cyrill Stachniss, Jürgen Gall", "author": "Jens Behley; Martin Garbade; Andres Milioto; Jan Quenzel; Sven Behnke; Cyrill Stachniss; Jurgen Gall", "abstract": "Semantic scene understanding is important for various applications. In particular, self-driving cars need a fine-grained understanding of the surfaces and objects in their vicinity. Light detection and ranging (LiDAR) provides precise geometric information about the environment and is thus a part of the sensor suites of almost all self-driving cars. 
Despite the relevance of semantic scene understanding for this application, there is a lack of a large dataset for this task which is based on an automotive LiDAR. In this paper, we introduce a large dataset to propel research on laser-based semantic segmentation. We annotated all sequences of the KITTI Vision Odometry Benchmark and provide dense point-wise annotations for the complete 360-degree field-of-view of the employed automotive LiDAR. We propose three benchmark tasks based on this dataset: (i) semantic segmentation of point clouds using a single scan, (ii) semantic segmentation using multiple past scans, and (iii) semantic scene completion, which requires to anticipate the semantic scene in the future. We provide baseline experiments and show that there is a need for more sophisticated models to efficiently tackle these tasks. Our dataset opens the door for the development of more advanced methods, but also provides plentiful data to investigate new research directions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Behley_SemanticKITTI_A_Dataset_for_Semantic_Scene_Understanding_of_LiDAR_Sequences_ICCV_2019_paper.pdf", @@ -27287,7 +28156,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Behley_SemanticKITTI_A_Dataset_for_Semantic_Scene_Understanding_of_LiDAR_Sequences_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Behley_SemanticKITTI_A_Dataset_for_Semantic_Scene_Understanding_of_LiDAR_Sequences_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Behley_2019_ICCV,\n \n author = {\n Behley,\n Jens and Garbade,\n Martin and Milioto,\n Andres and Quenzel,\n Jan and Behnke,\n Sven and Stachniss,\n Cyrill and Gall,\n Jurgen\n},\n title = {\n SemanticKITTI: A Dataset for Semantic Scene Understanding of LiDAR Sequences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semantics-Enhanced Adversarial Nets for Text-to-Image Synthesis", @@ -27313,14 +28183,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tan_Semantics-Enhanced_Adversarial_Nets_for_Text-to-Image_Synthesis_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;0+2", - "aff_unique_norm": "Dalian University of Technology;Louisiana State University;Pengcheng Laboratory", - "aff_unique_dep": ";;Peng Cheng Laboratory", + "aff_unique_norm": "Dalian University of Technology;Louisiana State University;Peng Cheng Laboratory", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.dlut.edu.cn/;https://www.lsu.edu;http://www.pcl.ac.cn", - "aff_unique_abbr": "DUT;LSU;PCL", + "aff_unique_abbr": "DUT;LSU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tan_2019_ICCV,\n \n author = {\n Tan,\n Hongchen and Liu,\n Xiuping and Li,\n Xin and Zhang,\n Yi and Yin,\n Baocai\n},\n title = {\n Semantics-Enhanced Adversarial Nets for Text-to-Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semi-Supervised Domain Adaptation via Minimax Entropy", @@ -27353,7 +28224,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Saito_2019_ICCV,\n \n author = {\n Saito,\n Kuniaki and Kim,\n Donghyun and Sclaroff,\n Stan and Darrell,\n Trevor and Saenko,\n Kate\n},\n title = {\n Semi-Supervised Domain Adaptation via Minimax Entropy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semi-Supervised Learning by Augmented Distribution Alignment", @@ -27386,7 +28258,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Qin and Li,\n Wen and Gool,\n Luc Van\n},\n title = {\n Semi-Supervised Learning by Augmented Distribution Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semi-Supervised Monocular 3D Face Reconstruction With End-to-End Shape-Preserved Domain Transfer", @@ -27412,14 +28285,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Piao_Semi-Supervised_Monocular_3D_Face_Reconstruction_With_End-to-End_Shape-Preserved_Domain_Transfer_ICCV_2019_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime", + "aff_unique_norm": "The Chinese University of Hong Kong;Sensetime", "aff_unique_dep": "CUHK-SenseTime Joint Laboratory;Research", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com/", "aff_unique_abbr": "CUHK;SenseTime", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Piao_2019_ICCV,\n \n author = {\n Piao,\n Jingtan and Qian,\n Chen and Li,\n Hongsheng\n},\n title = {\n Semi-Supervised Monocular 3D Face Reconstruction With End-to-End Shape-Preserved Domain Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semi-Supervised Pedestrian Instance Synthesis and Detection With 
Mutual Reinforcement", @@ -27452,7 +28326,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Si and Lin,\n Sihao and Wu,\n Wenhao and Azzam,\n Mohamed and Wong,\n Hau-San\n},\n title = {\n Semi-Supervised Pedestrian Instance Synthesis and Detection With Mutual Reinforcement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semi-Supervised Skin Detection by Network With Mutual Guidance", @@ -27485,7 +28360,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Yi and Shi,\n Jiayuan and Wang,\n Chuan and Huang,\n Haibin and Liu,\n Jiaming and Li,\n Guanbin and Liu,\n Risheng and Wang,\n Jue\n},\n title = {\n Semi-Supervised Skin Detection by Network With Mutual Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Semi-Supervised Video Salient Object Detection Using Pseudo-Labels", @@ -27511,14 +28387,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yan_Semi-Supervised_Video_Salient_Object_Detection_Using_Pseudo-Labels_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;2;3;0+1;0+1", - "aff_unique_norm": "Sun Yat-sen University;DarkMatter AI Research;Chinese University of Hong Kong (Shenzhen);Megvii Technology", + "aff_unique_norm": "Sun Yat-sen University;DarkMatter AI Research;the Chinese University of Hong Kong (Shenzhen);Megvii Technology", "aff_unique_dep": ";AI Research;Shenzhen Research Institute of Big 
Data;", "aff_unique_url": "http://www.sysu.edu.cn/;;https://www.cuhk.edu.cn;https://www.megvii.com", "aff_unique_abbr": "SYSU;;CUHK (Shenzhen);Megvii", "aff_campus_unique_index": ";1;;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0+1;0;0;0+1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yan_2019_ICCV,\n \n author = {\n Yan,\n Pengxiang and Li,\n Guanbin and Xie,\n Yuan and Li,\n Zhen and Wang,\n Chuan and Chen,\n Tianshui and Lin,\n Liang\n},\n title = {\n Semi-Supervised Video Salient Object Detection Using Pseudo-Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Seq-SG2SL: Inferring Semantic Layout From Scene Graph Through Sequence to Sequence Learning", @@ -27551,7 +28428,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Boren and Zhuang,\n Boyu and Li,\n Mingyang and Gu,\n Jian\n},\n title = {\n Seq-SG2SL: Inferring Semantic Layout From Scene Graph Through Sequence to Sequence Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sequence Level Semantics Aggregation for Video Object Detection", @@ -27584,7 +28462,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2+2+2;1;2+2+2", - "aff_country_unique": "Canada;United States;China" + "aff_country_unique": "Canada;United States;China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Haiping and Chen,\n Yuntao and Wang,\n Naiyan and Zhang,\n Zhaoxiang\n},\n title = {\n Sequence Level Semantics Aggregation for Video Object 
Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sequential Adversarial Learning for Self-Supervised Deep Visual Odometry", @@ -27617,7 +28496,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Shunkai and Xue,\n Fei and Wang,\n Xin and Yan,\n Zike and Zha,\n Hongbin\n},\n title = {\n Sequential Adversarial Learning for Self-Supervised Deep Visual Odometry\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sequential Latent Spaces for Modeling the Intention During Diverse Image Captioning", @@ -27643,14 +28523,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Aneja_Sequential_Latent_Spaces_for_Modeling_the_Intention_During_Diverse_Image_ICCV_2019_paper.html", "aff_unique_index": "0;1;1+2;0", - "aff_unique_norm": "University of Illinois;Georgia Institute of Technology;Meta", + "aff_unique_norm": "University of Illinois;Georgia Institute of Technology;Facebook", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://illinois.edu;https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "UIUC;Georgia Tech;FAIR", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aneja_2019_ICCV,\n \n author = {\n Aneja,\n Jyoti and Agrawal,\n Harsh and Batra,\n Dhruv and Schwing,\n Alexander\n},\n title = {\n Sequential Latent Spaces for Modeling the Intention During Diverse Image Captioning\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Shadow Removal via Shadow Image Decomposition", @@ -27683,7 +28564,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Le_2019_ICCV,\n \n author = {\n Le,\n Hieu and Samaras,\n Dimitris\n},\n title = {\n Shadow Removal via Shadow Image Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Shape Reconstruction Using Differentiable Projections and Deep Priors", @@ -27716,7 +28598,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gadelha_2019_ICCV,\n \n author = {\n Gadelha,\n Matheus and Wang,\n Rui and Maji,\n Subhransu\n},\n title = {\n Shape Reconstruction Using Differentiable Projections and Deep Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Shape-Aware Human Pose and Shape Reconstruction Using Multi-View Images", @@ -27749,7 +28632,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liang_2019_ICCV,\n \n author = {\n Liang,\n Junbang and Lin,\n Ming C.\n},\n title = {\n Shape-Aware Human Pose and Shape Reconstruction Using Multi-View Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2019\n} \n}" }, { "title": "ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors", @@ -27782,7 +28666,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Mountain View;Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kuo_2019_ICCV,\n \n author = {\n Kuo,\n Weicheng and Angelova,\n Anelia and Malik,\n Jitendra and Lin,\n Tsung-Yi\n},\n title = {\n ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Shapeglot: Learning Language for Shape Differentiation", @@ -27815,7 +28700,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Achlioptas_2019_ICCV,\n \n author = {\n Achlioptas,\n Panos and Fan,\n Judy and Hawkins,\n Robert and Goodman,\n Noah and Guibas,\n Leonidas J.\n},\n title = {\n Shapeglot: Learning Language for Shape Differentiation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sharpen Focus: Learning With Attention Separability and Consistency", @@ -27848,7 +28734,8 @@ "aff_campus_unique_index": "0;1;1;1;1;0;0", "aff_campus_unique": "New Brunswick;Princeton", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Lezi and Wu,\n Ziyan and Karanam,\n Srikrishna and Peng,\n Kuan-Chuan and Singh,\n Rajat Vikram and Liu,\n Bo and Metaxas,\n Dimitris N.\n},\n title = {\n Sharpen Focus: Learning 
With Attention Separability and Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ShellNet: Efficient Point Cloud Convolutional Neural Networks Using Concentric Shells Statistics", @@ -27872,7 +28759,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_ShellNet_Efficient_Point_Cloud_Convolutional_Neural_Networks_Using_Concentric_Shells_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_ShellNet_Efficient_Point_Cloud_Convolutional_Neural_Networks_Using_Concentric_Shells_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Zhiyuan and Hua,\n Binh-Son and Yeung,\n Sai-Kit\n},\n title = {\n ShellNet: Efficient Point Cloud Convolutional Neural Networks Using Concentric Shells Statistics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Siamese Networks: The Tale of Two Manifolds", @@ -27905,7 +28793,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Roy_2019_ICCV,\n \n author = {\n Roy,\n Soumava Kumar and Harandi,\n Mehrtash and Nock,\n Richard and Hartley,\n Richard\n},\n title = {\n Siamese Networks: The Tale of Two Manifolds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Significance-Aware Information Bottleneck for Domain Adaptive Semantic Segmentation", @@ -27938,7 +28827,8 @@ "aff_campus_unique_index": "1;1;;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": 
"0+1;1;0;0;1+0", - "aff_country_unique": "China;Australia;" + "aff_country_unique": "China;Australia;", + "bibtex": "@InProceedings{Luo_2019_ICCV,\n \n author = {\n Luo,\n Yawei and Liu,\n Ping and Guan,\n Tao and Yu,\n Junqing and Yang,\n Yi\n},\n title = {\n Significance-Aware Information Bottleneck for Domain Adaptive Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Similarity-Preserving Knowledge Distillation", @@ -27971,7 +28861,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Tung_2019_ICCV,\n \n author = {\n Tung,\n Frederick and Mori,\n Greg\n},\n title = {\n Similarity-Preserving Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Simultaneous Multi-View Instance Detection With Learned Geometric Soft-Constraints", @@ -27979,11 +28870,11 @@ "status": "Poster", "track": "main", "pid": "3964", - "author_site": "Ahmed Samy Nassar, S\u00c3\u00a9bastien Lef\u00c3\u00a8vre, Jan Dirk Wegner", + "author_site": "Ahmed Samy Nassar, Sébastien Lefèvre, Jan Dirk Wegner", "author": "Ahmed Samy Nassar; Sebastien Lefevre; Jan Dirk Wegner", "abstract": "We propose to jointly learn multi-view geometry and warping between views of the same object instances for robust cross-view object detection. What makes multi-view object instance detection difficult are strong changes in viewpoint, lighting conditions, high similarity of neighbouring objects, and strong variability in scale. 
By turning object detection and instance re-identification in different views into a joint learning task, we are able to incorporate both image appearance and geometric soft constraints into a single, multi-view detection process that is learnable end-to-end. We validate our method on a new, large data set of street-level panoramas of urban objects and show superior performance compared to various baselines. Our contribution is threefold: a large-scale, publicly available data set for multi-view instance detection and re-identification; an annotation tool custom-tailored for multi-view instance detection; and a novel, holistic multi-view instance detection and re-identification method that jointly models geometry and appearance across views.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Nassar_Simultaneous_Multi-View_Instance_Detection_With_Learned_Geometric_Soft-Constraints_ICCV_2019_paper.pdf", - "aff": "IRISA, Universit \u00b4e Bretagne Sud; IRISA, Universit \u00b4e Bretagne Sud; EcoVision Lab, Photogrammetry and Remote Sensing group, ETH Zurich", + "aff": "IRISA, Universit ´e Bretagne Sud; IRISA, Universit ´e Bretagne Sud; EcoVision Lab, Photogrammetry and Remote Sensing group, ETH Zurich", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Nassar_Simultaneous_Multi-View_Instance_ICCV_2019_supplemental.pdf", @@ -27997,14 +28888,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Nassar_Simultaneous_Multi-View_Instance_Detection_With_Learned_Geometric_Soft-Constraints_ICCV_2019_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Universit\u00e9 Bretagne Sud;ETH Zurich", + "aff_unique_norm": "Université Bretagne Sud;ETH Zurich", "aff_unique_dep": "IRISA;Photogrammetry and Remote Sensing group", "aff_unique_url": "https://www.univ-ubs.fr;https://www.ethz.ch", "aff_unique_abbr": "UBS;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;1", - "aff_country_unique": "France;Switzerland" + "aff_country_unique": "France;Switzerland", + "bibtex": "@InProceedings{Nassar_2019_ICCV,\n \n author = {\n Nassar,\n Ahmed Samy and Lefevre,\n Sebastien and Wegner,\n Jan Dirk\n},\n title = {\n Simultaneous Multi-View Instance Detection With Learned Geometric Soft-Constraints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SinGAN: Learning a Generative Model From a Single Natural Image", @@ -28028,7 +28920,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shaham_SinGAN_Learning_a_Generative_Model_From_a_Single_Natural_Image_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shaham_SinGAN_Learning_a_Generative_Model_From_a_Single_Natural_Image_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Shaham_2019_ICCV,\n \n author = {\n Shaham,\n Tamar Rott and Dekel,\n Tali and Michaeli,\n Tomer\n},\n title = {\n SinGAN: Learning a Generative Model From a Single Natural Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Single-Network Whole-Body Pose Estimation", @@ -28054,14 +28947,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hidalgo_Single-Network_Whole-Body_Pose_Estimation_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;2;0;0", - "aff_unique_norm": "Carnegie Mellon University;RetailNext;Meta", + "aff_unique_norm": "Carnegie Mellon University;RetailNext;Facebook", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.cmu.edu;https://www.retailnext.com;https://research.facebook.com", "aff_unique_abbr": "CMU;RetailNext;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hidalgo_2019_ICCV,\n \n author = {\n Hidalgo,\n Gines and Raaj,\n Yaadhav and Idrees,\n Haroon and Xiang,\n Donglai and Joo,\n Hanbyul and Simon,\n Tomas and Sheikh,\n Yaser\n},\n title = {\n Single-Network Whole-Body Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Single-Stage Multi-Person Pose Machines", @@ -28087,14 +28981,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Nie_Single-Stage_Multi-Person_Pose_Machines_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "National University of Singapore;YITU Technology", + "aff_unique_norm": "National University of Singapore;Yitu Technology", "aff_unique_dep": "Department of Electrical and Computer Engineering;", "aff_unique_url": "https://www.nus.edu.sg;https://www.yITU.cn", "aff_unique_abbr": "NUS;YITU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Nie_2019_ICCV,\n \n author = {\n Nie,\n Xuecheng and Feng,\n Jiashi and Zhang,\n Jianfeng and Yan,\n Shuicheng\n},\n title = {\n Single-Stage Multi-Person Pose Machines\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Situational Fusion of Visual Representation for Visual Navigation", @@ -28118,7 +29013,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Shen_Situational_Fusion_of_Visual_Representation_for_Visual_Navigation_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Shen_Situational_Fusion_of_Visual_Representation_for_Visual_Navigation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Shen_2019_ICCV,\n \n author = {\n Shen,\n William B. and Xu,\n Danfei and Zhu,\n Yuke and Guibas,\n Leonidas J. and Fei-Fei,\n Li and Savarese,\n Silvio\n},\n title = {\n Situational Fusion of Visual Representation for Visual Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Skeleton-Aware 3D Human Shape Reconstruction From Point Clouds", @@ -28151,7 +29047,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "Singapore;Australia" + "aff_country_unique": "Singapore;Australia", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Haiyong and Cai,\n Jianfei and Zheng,\n Jianmin\n},\n title = {\n Skeleton-Aware 3D Human Shape Reconstruction From Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SkyScapes Fine-Grained Semantic Understanding of Aerial Scenes", @@ -28184,7 +29081,8 @@ "aff_campus_unique_index": "0;0;1;1;0", "aff_campus_unique": "Wessling;Karlsruhe", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Azimi_2019_ICCV,\n \n author = {\n Azimi,\n Seyed Majid and Henry,\n Corentin and Sommer,\n Lars and Schumann,\n Arne and Vig,\n Eleonora\n},\n title = {\n SkyScapes Fine-Grained Semantic Understanding of Aerial Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SlowFast Networks for Video Recognition", @@ -28210,14 +29108,15 
@@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Feichtenhofer_2019_ICCV,\n \n author = {\n Feichtenhofer,\n Christoph and Fan,\n Haoqi and Malik,\n Jitendra and He,\n Kaiming\n},\n title = {\n SlowFast Networks for Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Small Steps and Giant Leaps: Minimal Newton Solvers for Deep Learning", @@ -28225,7 +29124,7 @@ "status": "Poster", "track": "main", "pid": "5148", - "author_site": "Jo\u00c3\u00a3o F. Henriques, Sebastien Ehrhardt, Samuel Albanie, Andrea Vedaldi", + "author_site": "João F. Henriques, Sebastien Ehrhardt, Samuel Albanie, Andrea Vedaldi", "author": "Joao F. Henriques; Sebastien Ehrhardt; Samuel Albanie; Andrea Vedaldi", "abstract": "We propose a fast second-order method that can be used as a drop-in replacement for current deep learning solvers. Compared to stochastic gradient descent (SGD), it only requires two additional forward-mode automatic differentiation operations per iteration, which has a computational cost comparable to two standard forward passes and is easy to implement. Our method addresses long-standing issues with current second-order solvers, which invert an approximate Hessian matrix every iteration exactly or by conjugate-gradient methods, procedures that are much slower than a SGD step. 
Instead, we propose to keep a single estimate of the gradient projected by the inverse Hessian matrix, and update it once per iteration with just two passes over the network. This estimate has the same size and is similar to the momentum variable that is commonly used in SGD . No estimate of the Hessian is maintained. We first validate our method, called CurveBall, on small problems with known solutions (noisy Rosenbrock function and degenerate 2-layer linear networks), where current deep learning solvers struggle. We then train several large models on CIFAR and ImageNet, including ResNet and VGG-f networks, where we demonstrate faster convergence with no hyperparameter tuning. We also show our optimiser's generality by testing on a large set of randomly generated architectures.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Henriques_Small_Steps_and_Giant_Leaps_Minimal_Newton_Solvers_for_Deep_ICCV_2019_paper.pdf", @@ -28250,7 +29149,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Henriques_2019_ICCV,\n \n author = {\n Henriques,\n Joao F. 
and Ehrhardt,\n Sebastien and Albanie,\n Samuel and Vedaldi,\n Andrea\n},\n title = {\n Small Steps and Giant Leaps: Minimal Newton Solvers for Deep Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Soft Rasterizer: A Differentiable Renderer for Image-Based 3D Reasoning", @@ -28283,7 +29183,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+0;0+0;0;0+0+1", - "aff_country_unique": "United States;Israel" + "aff_country_unique": "United States;Israel", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Shichen and Li,\n Tianye and Chen,\n Weikai and Li,\n Hao\n},\n title = {\n Soft Rasterizer: A Differentiable Renderer for Image-Based 3D Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SoftTriple Loss: Deep Metric Learning Without Triplet Sampling", @@ -28307,7 +29208,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qian_SoftTriple_Loss_Deep_Metric_Learning_Without_Triplet_Sampling_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qian_SoftTriple_Loss_Deep_Metric_Learning_Without_Triplet_Sampling_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Qian_2019_ICCV,\n \n author = {\n Qian,\n Qi and Shang,\n Lei and Sun,\n Baigui and Hu,\n Juhua and Li,\n Hao and Jin,\n Rong\n},\n title = {\n SoftTriple Loss: Deep Metric Learning Without Triplet Sampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Solving Vision Problems via Filtering", @@ -28340,7 +29242,8 @@ "aff_campus_unique_index": "0;0", 
"aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Young_2019_ICCV,\n \n author = {\n Young,\n Sean I. and Naman,\n Aous T. and Girod,\n Bernd and Taubman,\n David\n},\n title = {\n Solving Vision Problems via Filtering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SpaceNet MVOI: A Multi-View Overhead Imagery Dataset", @@ -28364,7 +29267,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Weir_SpaceNet_MVOI_A_Multi-View_Overhead_Imagery_Dataset_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Weir_SpaceNet_MVOI_A_Multi-View_Overhead_Imagery_Dataset_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Weir_2019_ICCV,\n \n author = {\n Weir,\n Nicholas and Lindenbaum,\n David and Bastidas,\n Alexei and Etten,\n Adam Van and McPherson,\n Sean and Shermeyer,\n Jacob and Kumar,\n Varun and Tang,\n Hanlin\n},\n title = {\n SpaceNet MVOI: A Multi-View Overhead Imagery Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sparse and Imperceivable Adversarial Attacks", @@ -28376,7 +29280,7 @@ "author": "Francesco Croce; Matthias Hein", "abstract": "Neural networks have been proven to be vulnerable to a variety of adversarial attacks. From a safety perspective, highly sparse adversarial attacks are particularly dangerous. On the other hand the pixelwise perturbations of sparse attacks are typically large and thus can be potentially detected. We propose a new black-box technique to craft adversarial examples aiming at minimizing l_0-distance to the original image. 
Extensive experiments show that our attack is better or competitive to the state of the art. Moreover, we can integrate additional bounds on the componentwise perturbation. Allowing pixels to change only in region of high variation and avoiding changes along axis-aligned edges makes our adversarial examples almost non-perceivable. Moreover, we adapt the Projected Gradient Descent attack to the l_0-norm integrating componentwise constraints. This allows us to do adversarial training to enhance the robustness of classifiers against sparse and imperceivable adversarial manipulations.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Croce_Sparse_and_Imperceivable_Adversarial_Attacks_ICCV_2019_paper.pdf", - "aff": "University of T\u00fcbingen; University of T\u00fcbingen", + "aff": "University of Tübingen; University of Tübingen", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Croce_Sparse_and_Imperceivable_ICCV_2019_supplemental.pdf", @@ -28390,14 +29294,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Croce_Sparse_and_Imperceivable_Adversarial_Attacks_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of T\u00fcbingen", + "aff_unique_norm": "University of Tübingen", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-tuebingen.de/", - "aff_unique_abbr": "Uni T\u00fcbingen", + "aff_unique_abbr": "Uni Tübingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Croce_2019_ICCV,\n \n author = {\n Croce,\n Francesco and Hein,\n Matthias\n},\n title = {\n Sparse and Imperceivable Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SparseMask: Differentiable 
Connectivity Learning for Dense Image Prediction", @@ -28430,7 +29335,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Huikai and Zhang,\n Junge and Huang,\n Kaiqi\n},\n title = {\n SparseMask: Differentiable Connectivity Learning for Dense Image Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Spatial Correspondence With Generative Adversarial Network: Learning Depth From Monocular Videos", @@ -28458,12 +29364,13 @@ "aff_unique_index": "0+1;0+1;2;0+2+1;0+2+1", "aff_unique_norm": "University of South Carolina;Farsee2 Technology Ltd;Wuhan University", "aff_unique_dep": ";;", - "aff_unique_url": "https://www.sc.edu;;http://www.whu.edu.cn", + "aff_unique_url": "https://www.sc.edu;;http://www.whu.edu.cn/", "aff_unique_abbr": "USC;;WHU", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;1;0+1+1;0+1+1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Zhenyao and Wu,\n Xinyi and Zhang,\n Xiaoping and Wang,\n Song and Ju,\n Lili\n},\n title = {\n Spatial Correspondence With Generative Adversarial Network: Learning Depth From Monocular Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Spatial-Temporal Relation Networks for Multi-Object Tracking", @@ -28489,14 +29396,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_Spatial-Temporal_Relation_Networks_for_Multi-Object_Tracking_ICCV_2019_paper.html", "aff_unique_index": "0+1;2+1;1;1", - 
"aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft;Tsinghua University", + "aff_unique_norm": "Hong Kong University of Science and Technology;Microsoft Research;Tsinghua University", "aff_unique_dep": ";Research;School of Software", "aff_unique_url": "https://www.ust.hk;https://www.microsoft.com/en-us/research/group/asia;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HKUST;MSR Asia;THU", "aff_campus_unique_index": "0+1;1;1;1", "aff_campus_unique": "Hong Kong SAR;Asia;", "aff_country_unique_index": "0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Jiarui and Cao,\n Yue and Zhang,\n Zheng and Hu,\n Han\n},\n title = {\n Spatial-Temporal Relation Networks for Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SpatialSense: An Adversarially Crowdsourced Benchmark for Spatial Relation Recognition", @@ -28529,7 +29437,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Kaiyu and Russakovsky,\n Olga and Deng,\n Jia\n},\n title = {\n SpatialSense: An Adversarially Crowdsourced Benchmark for Spatial Relation Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Spatio-Temporal Filter Adaptive Network for Video Deblurring", @@ -28557,12 +29466,13 @@ "aff_unique_index": "0;0;1;0+2;2;0", "aff_unique_norm": "SenseTime;Nanjing University of Science and Technology;Harbin Institute of Technology", "aff_unique_dep": "SenseTime Research;;", - "aff_unique_url": 
"https://www.sensetime.com;http://www.nust.edu.cn;http://www.hit.edu.cn/", + "aff_unique_url": "https://www.sensetime.com;http://www.nust.edu.cn/;http://www.hit.edu.cn/", "aff_unique_abbr": "SenseTime;NUST;HIT", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Nanjing;Harbin", "aff_country_unique_index": "0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Shangchen and Zhang,\n Jiawei and Pan,\n Jinshan and Xie,\n Haozhe and Zuo,\n Wangmeng and Ren,\n Jimmy\n},\n title = {\n Spatio-Temporal Filter Adaptive Network for Video Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Spatio-Temporal Fusion Based Convolutional Sequence Learning for Lip Reading", @@ -28595,7 +29505,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Xingxuan and Cheng,\n Feng and Wang,\n Shilin\n},\n title = {\n Spatio-Temporal Fusion Based Convolutional Sequence Learning for Lip Reading\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Spatiotemporal Feature Residual Propagation for Action Prediction", @@ -28628,7 +29539,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n He and Wildes,\n Richard P.\n},\n title = {\n Spatiotemporal Feature Residual Propagation for Action Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Specifying Object Attributes and Relations in Interactive Scene Generation", @@ -28652,7 +29564,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ashual_Specifying_Object_Attributes_and_Relations_in_Interactive_Scene_Generation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ashual_Specifying_Object_Attributes_and_Relations_in_Interactive_Scene_Generation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Ashual_2019_ICCV,\n \n author = {\n Ashual,\n Oron and Wolf,\n Lior\n},\n title = {\n Specifying Object Attributes and Relations in Interactive Scene Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Spectral Feature Transformation for Person Re-Identification", @@ -28685,7 +29598,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;1;0+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Luo_2019_ICCV,\n \n author = {\n Luo,\n Chuanchen and Chen,\n Yuntao and Wang,\n Naiyan and Zhang,\n Zhaoxiang\n},\n title = {\n Spectral Feature Transformation for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Spectral Regularization for Combating Mode Collapse in GANs", @@ -28697,7 +29611,7 @@ "author": "Kanglin Liu; Wenming Tang; Fei Zhou; Guoping Qiu", "abstract": "Despite excellent progress in recent years, mode collapse remains a major unsolved problem in generative adversarial networks (GANs). 
In this paper, we present spectral regularization for GANs (SR-GANs), a new and robust method for combating the mode collapse problem in GANs. Theoretical analysis shows that the optimal solution to the discriminator has a strong relationship to the spectral distributions of the weight matrix. Therefore, we monitor the spectral distribution in the discriminator of spectral normalized GANs (SN-GANs), and discover a phenomenon which we refer to as spectral collapse, where a large number of singular values of the weight matrices drop dramatically when mode collapse occurs. We show that there are strong evidence linking mode collapse to spectral collapse; and based on this link, we set out to tackle spectral collapse as a surrogate of mode collapse. We have developed a spectral regularization method where we compensate the spectral distributions of the weight matrices to prevent them from collapsing, which in turn successfully prevents mode collapse in GANs. We provide theoretical explanations for why SR-GANs are more stable and can provide better performances than SN-GANs. 
We also present extensive experimental results and analysis to show that SR-GANs not only always outperform SN-GANs but also always succeed in combating mode collapse where SN-GANs fail.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Liu_Spectral_Regularization_for_Combating_Mode_Collapse_in_GANs_ICCV_2019_paper.pdf", - "aff": "Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen Institute of Arti\ufb01cial Intelligence and Robotics for Society, Shenzhen, China; Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen Institute of Arti\ufb01cial Intelligence and Robotics for Society, Shenzhen, China; Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen Institute of Arti\ufb01cial Intelligence and Robotics for Society, Shenzhen, China; Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen Institute of Arti\ufb01cial Intelligence and Robotics for Society, Shenzhen, China+University of Nottingham, Nottingham, United Kingdom", + "aff": "Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen Institute of Artificial Intelligence and Robotics for Society, Shenzhen, China; Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen Institute of Artificial Intelligence and Robotics for Society, Shenzhen, China; Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen Institute of Artificial Intelligence and Robotics for Society, Shenzhen, China; Shenzhen University, Shenzhen, China+Guangdong Key Laboratory of Intelligent Information Processing, Shenzhen, China+Shenzhen 
Institute of Artificial Intelligence and Robotics for Society, Shenzhen, China+University of Nottingham, Nottingham, United Kingdom", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Liu_Spectral_Regularization_for_ICCV_2019_supplemental.pdf", @@ -28718,7 +29632,8 @@ "aff_campus_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0+1", "aff_campus_unique": "Shenzhen;Nottingham", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0+1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Kanglin and Tang,\n Wenming and Zhou,\n Fei and Qiu,\n Guoping\n},\n title = {\n Spectral Regularization for Combating Mode Collapse in GANs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SplitNet: Sim2Sim and Task2Task Transfer for Embodied Visual Navigation", @@ -28744,14 +29659,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gordon_SplitNet_Sim2Sim_and_Task2Task_Transfer_for_Embodied_Visual_Navigation_ICCV_2019_paper.html", "aff_unique_index": "0;1;1+2;1+2;1+2", - "aff_unique_norm": "University of Washington;Meta;Georgia Institute of Technology", + "aff_unique_norm": "University of Washington;Facebook;Georgia Institute of Technology", "aff_unique_dep": "Paul G. 
Allen School of Computer Science;Facebook AI Research;", - "aff_unique_url": "https://www.washington.edu;https://research.facebook.com;https://www.gatech.edu", + "aff_unique_url": "https://www.cs.washington.edu;https://research.facebook.com;https://www.gatech.edu", "aff_unique_abbr": "UW;FAIR;Georgia Tech", "aff_campus_unique_index": "0;;;", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0+0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gordon_2019_ICCV,\n \n author = {\n Gordon,\n Daniel and Kadian,\n Abhishek and Parikh,\n Devi and Hoffman,\n Judy and Batra,\n Dhruv\n},\n title = {\n SplitNet: Sim2Sim and Task2Task Transfer for Embodied Visual Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Stacked Cross Refinement Network for Edge-Aware Salient Object Detection", @@ -28777,14 +29693,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wu_Stacked_Cross_Refinement_Network_for_Edge-Aware_Salient_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0+0+1;0+0+1+2;0+0+1+2", - "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "School of Computer Science and Technology;Institute of Computing Technology;Peng Cheng Laboratory", + "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "School of Computer Science and Technology;Institute of Computing Technology;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ict.cas.cn;", "aff_unique_abbr": "UCAS;CAS;", "aff_campus_unique_index": "0+0+0;0+0+0+1;0+0+0+1", "aff_campus_unique": "Beijing;ShenZhen", "aff_country_unique_index": "0+0+0;0+0+0+0;0+0+0+0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Zhe and Su,\n Li and Huang,\n Qingming\n},\n title = {\n Stacked Cross Refinement Network for Edge-Aware Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "StartNet: Online Detection of Action Start in Untrimmed Videos", @@ -28817,7 +29734,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2019_ICCV,\n \n author = {\n Gao,\n Mingfei and Xu,\n Mingze and Davis,\n Larry S. and Socher,\n Richard and Xiong,\n Caiming\n},\n title = {\n StartNet: Online Detection of Action Start in Untrimmed Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Stochastic Attraction-Repulsion Embedding for Large Scale Image Localization", @@ -28850,7 +29768,8 @@ "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Canberra;;Xian", "aff_country_unique_index": "0+0;0+0;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Liu and Li,\n Hongdong and Dai,\n Yuchao\n},\n title = {\n Stochastic Attraction-Repulsion Embedding for Large Scale Image Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Stochastic Exposure Coding for Handling Multi-ToF-Camera Interference", @@ -28883,7 +29802,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + 
"aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Jongho and Gupta,\n Mohit\n},\n title = {\n Stochastic Exposure Coding for Handling Multi-ToF-Camera Interference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Stochastic Filter Groups for Multi-Task CNNs: Learning Specialist and Generalist Convolution Kernels", @@ -28916,7 +29836,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bragman_2019_ICCV,\n \n author = {\n Bragman,\n Felix J.S. and Tanno,\n Ryutaro and Ourselin,\n Sebastien and Alexander,\n Daniel C. and Cardoso,\n Jorge\n},\n title = {\n Stochastic Filter Groups for Multi-Task CNNs: Learning Specialist and Generalist Convolution Kernels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "StructureFlow: Image Inpainting via Structure-Aware Appearance Flow", @@ -28942,14 +29863,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ren_StructureFlow_Image_Inpainting_via_Structure-Aware_Appearance_Flow_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;1;0+0;2;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Tencent", - "aff_unique_dep": "School of Electronics and Computer Engineering;Peng Cheng Laboratory;Tencent America", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Tencent America", + "aff_unique_dep": "School of Electronics and Computer Engineering;;", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn;https://www.tencent.com/en-us", "aff_unique_abbr": "PKU;PCL;Tencent America", "aff_campus_unique_index": ";;1;", 
"aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0+0;0;0+0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ren_2019_ICCV,\n \n author = {\n Ren,\n Yurui and Yu,\n Xiaoming and Zhang,\n Ruonan and Li,\n Thomas H. and Liu,\n Shan and Li,\n Ge\n},\n title = {\n StructureFlow: Image Inpainting via Structure-Aware Appearance Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Structured Modeling of Joint Deep Feature and Prediction Refinement for Salient Object Detection", @@ -28961,7 +29883,7 @@ "author": "Yingyue Xu; Dan Xu; Xiaopeng Hong; Wanli Ouyang; Rongrong Ji; Min Xu; Guoying Zhao", "abstract": "Recent saliency models extensively explore to incorporate multi-scale contextual information from Convolutional Neural Networks (CNNs). Besides direct fusion strategies, many approaches introduce message-passing to enhance CNN features or predictions. However, the messages are mainly transmitted in two ways, by feature-to-feature passing, and by prediction-to-prediction passing. In this paper, we add message-passing between features and predictions and propose a deep unified CRF saliency model . We design a novel cascade CRFs architecture with CNN to jointly refine deep features and predictions at each scale and progressively compute a final refined saliency map. We formulate the CRF graphical model that involves message-passing of feature-feature, feature-prediction, and prediction-prediction, from the coarse scale to the finer scale, to update the features and the corresponding predictions. Also, we formulate the mean-field updates for joint end-to-end model training with CNN through back propagation. 
The proposed deep unified CRF saliency model is evaluated over six datasets and shows highly competitive performance among the state of the arts.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Xu_Structured_Modeling_of_Joint_Deep_Feature_and_Prediction_Refinement_for_ICCV_2019_paper.pdf", - "aff": "University of Oulu; University of Oxford; Xi\u2019an Jiaotong University + Peng Cheng Laborotory; SenseTime Computer Vision Group, The University of Sydney; Xiamen University + Peng Cheng Laborotory; University of Technology Sydney; University of Oulu", + "aff": "University of Oulu; University of Oxford; Xi’an Jiaotong University + Peng Cheng Laborotory; SenseTime Computer Vision Group, The University of Sydney; Xiamen University + Peng Cheng Laborotory; University of Technology Sydney; University of Oulu", "project": "", "github": "", "supp": "", @@ -28975,14 +29897,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_Structured_Modeling_of_Joint_Deep_Feature_and_Prediction_Refinement_for_ICCV_2019_paper.html", "aff_unique_index": "0;1;2+3;4;5+3;6;0", - "aff_unique_norm": "University of Oulu;University of Oxford;Xi'an Jiao Tong University;Pengcheng Laboratory;University of Sydney;Xiamen University;University of Technology Sydney", - "aff_unique_dep": ";;;Peng Cheng Laboratory;Computer Vision Group;;", + "aff_unique_norm": "University of Oulu;University of Oxford;Xi'an Jiaotong University;Peng Cheng Laboratory;The University of Sydney;Xiamen University;University of Technology Sydney", + "aff_unique_dep": ";;;;Computer Vision Group;;", "aff_unique_url": "https://www.oulu.fi;https://www.ox.ac.uk;https://www.xjtu.edu.cn;http://www.pcl.ac.cn;https://www.sydney.edu.au;https://www.xmu.edu.cn;https://www.uts.edu.au", "aff_unique_abbr": "UOulu;Oxford;XJTU;;USYD;XMU;UTS", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;1;2+2;3;2+2;3;0", - "aff_country_unique": "Finland;United 
Kingdom;China;Australia" + "aff_country_unique": "Finland;United Kingdom;China;Australia", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Yingyue and Xu,\n Dan and Hong,\n Xiaopeng and Ouyang,\n Wanli and Ji,\n Rongrong and Xu,\n Min and Zhao,\n Guoying\n},\n title = {\n Structured Modeling of Joint Deep Feature and Prediction Refinement for Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Structured Prediction Helps 3D Human Motion Modelling", @@ -28994,7 +29917,7 @@ "author": "Emre Aksan; Manuel Kaufmann; Otmar Hilliges", "abstract": "Human motion prediction is a challenging and important task in many computer vision application domains. Existing work only implicitly models the spatial structure of the human skeleton. In this paper, we propose a novel approach that decomposes the prediction into individual joints by means of a structured prediction layer that explicitly models the joint dependencies. This is implemented via a hierarchy of small-sized neural networks connected analogously to the kinematic chains in the human body as well as a joint-wise decomposition in the loss function. The proposed layer is agnostic to the underlying network and can be used with existing architectures for motion modelling. Prior work typically leverages the H3.6M dataset. We show that some state-of-the-art techniques do not perform well when trained and tested on AMASS, a recently released dataset 14 times the size of H3.6M. Our experiments indicate that the proposed layer increases the performance of motion forecasting irrespective of the base network, joint-angle representation, and prediction horizon. We furthermore show that the layer also improves motion predictions qualitatively. 
We make code and models publicly available at https://ait.ethz.ch/projects/2019/spl.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Aksan_Structured_Prediction_Helps_3D_Human_Motion_Modelling_ICCV_2019_paper.pdf", - "aff": "Department of Computer Science, ETH Z\u00fcrich; Department of Computer Science, ETH Z\u00fcrich; Department of Computer Science, ETH Z\u00fcrich", + "aff": "Department of Computer Science, ETH Zürich; Department of Computer Science, ETH Zürich; Department of Computer Science, ETH Zürich", "project": "https://ait.ethz.ch/projects/2019/spl", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Aksan_Structured_Prediction_Helps_ICCV_2019_supplemental.pdf", @@ -29008,14 +29931,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Aksan_Structured_Prediction_Helps_3D_Human_Motion_Modelling_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Aksan_2019_ICCV,\n \n author = {\n Aksan,\n Emre and Kaufmann,\n Manuel and Hilliges,\n Otmar\n},\n title = {\n Structured Prediction Helps 3D Human Motion Modelling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Subspace Structure-Aware Spectral Clustering for Robust Subspace Clustering", @@ -29048,7 +29972,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Yamaguchi_2019_ICCV,\n \n author = {\n 
Yamaguchi,\n Masataka and Irie,\n Go and Kawanishi,\n Takahito and Kashino,\n Kunio\n},\n title = {\n Subspace Structure-Aware Spectral Clustering for Robust Subspace Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Surface Networks via General Covers", @@ -29081,7 +30006,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Haim_2019_ICCV,\n \n author = {\n Haim,\n Niv and Segol,\n Nimrod and Ben-Hamu,\n Heli and Maron,\n Haggai and Lipman,\n Yaron\n},\n title = {\n Surface Networks via General Covers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Surface Normals and Shape From Water", @@ -29114,7 +30040,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Murai_2019_ICCV,\n \n author = {\n Murai,\n Satoshi and Kuo,\n Meng-Yu Jennifer and Kawahara,\n Ryo and Nobuhara,\n Shohei and Nishino,\n Ko\n},\n title = {\n Surface Normals and Shape From Water\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Switchable Whitening for Deep Representation Learning", @@ -29140,14 +30067,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pan_Switchable_Whitening_for_Deep_Representation_Learning_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;0;0+2", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime Group Limited;University of Hong Kong", + "aff_unique_norm": "The Chinese 
University of Hong Kong;SenseTime Group Limited;The University of Hong Kong", "aff_unique_dep": "CUHK-SenseTime Joint Lab;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com;https://www.hku.hk", "aff_unique_abbr": "CUHK;SenseTime;HKU", "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pan_2019_ICCV,\n \n author = {\n Pan,\n Xingang and Zhan,\n Xiaohang and Shi,\n Jianping and Tang,\n Xiaoou and Luo,\n Ping\n},\n title = {\n Switchable Whitening for Deep Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Sym-Parameterized Dynamic Inference for Mixed-Domain Image Translation", @@ -29173,14 +30101,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chang_Sym-Parameterized_Dynamic_Inference_for_Mixed-Domain_Image_Translation_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "Seoul National University;Samsung", - "aff_unique_dep": ";Samsung Electronics", + "aff_unique_norm": "Seoul National University;Samsung Electronics", + "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.samsung.com", "aff_unique_abbr": "SNU;Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Chang_2019_ICCV,\n \n author = {\n Chang,\n Simyung and Park,\n SeongUk and Yang,\n John and Kwak,\n Nojun\n},\n title = {\n Sym-Parameterized Dynamic Inference for Mixed-Domain Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} 
\n}" }, { "title": "Symmetric Cross Entropy for Robust Learning With Noisy Labels", @@ -29205,15 +30134,16 @@ "email": "gmail.com;unimelb.edu.au; ; ; ; ", "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_Symmetric_Cross_Entropy_for_Robust_Learning_With_Noisy_Labels_ICCV_2019_paper.html", - "aff_unique_index": "0;1;2;3;3;1", - "aff_unique_norm": "Shanghai Jiao Tong University;University of Melbourne;Cainiao;JD", - "aff_unique_dep": ";;Cainiao AI;JD AI", - "aff_unique_url": "https://www.sjtu.edu.cn;https://www.unimelb.edu.au;https://www.cainiao.com;https://www.jd.com", - "aff_unique_abbr": "SJTU;UniMelb;Cainiao;JD AI", + "aff_unique_index": "0;1;2;3;4;1", + "aff_unique_norm": "Shanghai Jiao Tong University;University of Melbourne;Cainiao;JD AI;JD AI Research", + "aff_unique_dep": ";;Cainiao AI;;", + "aff_unique_url": "https://www.sjtu.edu.cn;https://www.unimelb.edu.au;https://www.cainiao.com;https://www.jd.com;https://www.jd.com", + "aff_unique_abbr": "SJTU;UniMelb;Cainiao;JD AI;JD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Yisen and Ma,\n Xingjun and Chen,\n Zaiyi and Luo,\n Yuan and Yi,\n Jinfeng and Bailey,\n James\n},\n title = {\n Symmetric Cross Entropy for Robust Learning With Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Symmetric Graph Convolutional Autoencoder for Unsupervised Graph Representation Learning", @@ -29240,13 +30170,14 @@ "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Park_Symmetric_Graph_Convolutional_Autoencoder_for_Unsupervised_Graph_Representation_Learning_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Seoul National 
University;Hanyang University;University of Birmingham", - "aff_unique_dep": "Dept. of Electrical and Computer Engineering;Division of Electrical Engineering;School of Computer Science", + "aff_unique_dep": "Dept. of ECE.;Division of Electrical Engineering;School of Computer Science", "aff_unique_url": "https://www.snu.ac.kr;http://www.hanyang.ac.kr;https://www.birmingham.ac.uk", "aff_unique_abbr": "SNU;HYU;UoB", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Seoul;;Birmingham", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "South Korea;United Kingdom" + "aff_country_unique": "South Korea;United Kingdom", + "bibtex": "@InProceedings{Park_2019_ICCV,\n \n author = {\n Park,\n Jiwoong and Lee,\n Minsik and Chang,\n Hyung Jin and Lee,\n Kyuewang and Choi,\n Jin Young\n},\n title = {\n Symmetric Graph Convolutional Autoencoder for Unsupervised Graph Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Symmetry-Constrained Rectification Network for Scene Text Recognition", @@ -29272,14 +30203,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Symmetry-Constrained_Rectification_Network_for_Scene_Text_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;2+3;1;4;2+3;0", - "aff_unique_norm": "Huazhong University of Science and Technology;Peking University;MEGVII;Unknown Institution;University of Oxford", + "aff_unique_norm": "Huazhong University of Science and Technology;Peking University;Megvii;Unknown Institution;University of Oxford", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.hust.edu.cn;http://www.pku.edu.cn;https://www.megvii.com;;https://www.ox.ac.uk", "aff_unique_abbr": "HUST;Peking U;Megvii;;Oxford", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;2;0;0", - "aff_country_unique": 
"China;;United Kingdom" + "aff_country_unique": "China;;United Kingdom", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Mingkun and Guan,\n Yushuo and Liao,\n Minghui and He,\n Xin and Bian,\n Kaigui and Bai,\n Song and Yao,\n Cong and Bai,\n Xiang\n},\n title = {\n Symmetry-Constrained Rectification Network for Scene Text Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "SynDeMo: Synergistic Deep Feature Alignment for Joint Learning of Depth and Ego-Motion", @@ -29291,7 +30223,7 @@ "author": "Behzad Bozorgtabar; Mohammad Saeed Rad; Dwarikanath Mahapatra; Jean-Philippe Thiran", "abstract": "Despite well-established baselines, learning of scene depth and ego-motion from monocular video remains an ongoing challenge, specifically when handling scaling ambiguity issues and depth inconsistencies in image sequences. Much prior work uses either a supervised mode of learning or stereo images. The former is limited by the amount of labeled data, as it requires expensive sensors, while the latter is not always readily available as monocular sequences. In this work, we demonstrate the benefit of using geometric information from synthetic images, coupled with scene depth information, to recover the scale in depth and ego-motion estimation from monocular videos. We developed our framework using synthetic image-depth pairs and unlabeled real monocular images. We had three training objectives: first, to use deep feature alignment to reduce the domain gap between synthetic and monocular images to yield more accurate depth estimation when presented with only real monocular images at test time. Second, we learn scene specific representation by exploiting self-supervision coming from multi-view synthetic images without the need for depth labels. 
Third, our method uses single-view depth and pose networks, which are capable of jointly training and supervising one another mutually, yielding consistent depth and ego-motion estimates. Extensive experiments demonstrate that our depth and ego-motion models surpass the state-of-the-art, unsupervised methods and compare favorably to early supervised deep models for geometric understanding. We validate the effectiveness of our training objectives against standard benchmarks thorough an ablation study.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Bozorgtabar_SynDeMo_Synergistic_Deep_Feature_Alignment_for_Joint_Learning_of_Depth_ICCV_2019_paper.pdf", - "aff": "\u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL); \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL); ; \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL)", + "aff": "´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL); ´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL); ; ´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL)", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Bozorgtabar_SynDeMo_Synergistic_Deep_ICCV_2019_supplemental.pdf", @@ -29305,14 +30237,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Bozorgtabar_SynDeMo_Synergistic_Deep_Feature_Alignment_for_Joint_Learning_of_Depth_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "Ecole Polytechnique Fédérale de Lausanne", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Bozorgtabar_2019_ICCV,\n \n author = {\n Bozorgtabar,\n Behzad and Rad,\n Mohammad Saeed and Mahapatra,\n Dwarikanath and Thiran,\n 
Jean-Philippe\n},\n title = {\n SynDeMo: Synergistic Deep Feature Alignment for Joint Learning of Depth and Ego-Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TAPA-MVS: Textureless-Aware PAtchMatch Multi-View Stereo", @@ -29345,7 +30278,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Romanoni_2019_ICCV,\n \n author = {\n Romanoni,\n Andrea and Matteucci,\n Matteo\n},\n title = {\n TAPA-MVS: Textureless-Aware PAtchMatch Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TASED-Net: Temporally-Aggregating Spatial Encoder-Decoder Network for Video Saliency Detection", @@ -29369,7 +30303,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Min_TASED-Net_Temporally-Aggregating_Spatial_Encoder-Decoder_Network_for_Video_Saliency_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Min_TASED-Net_Temporally-Aggregating_Spatial_Encoder-Decoder_Network_for_Video_Saliency_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Min_2019_ICCV,\n \n author = {\n Min,\n Kyle and Corso,\n Jason J.\n},\n title = {\n TASED-Net: Temporally-Aggregating Spatial Encoder-Decoder Network for Video Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TRB: A Novel Triplet Representation for Understanding 2D Human Body", @@ -29395,14 +30330,15 @@ "author_num": 6, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Duan_TRB_A_Novel_Triplet_Representation_for_Understanding_2D_Human_Body_ICCV_2019_paper.html", "aff_unique_index": "0;1;1;1;1;2", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime Group Limited;University of Sydney", + "aff_unique_norm": "Chinese University of Hong Kong;SenseTime Group Limited;The University of Sydney", "aff_unique_dep": "Sensetime Joint Lab;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com;https://www.sydney.edu.au", "aff_unique_abbr": "CUHK;SenseTime;USYD", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Duan_2019_ICCV,\n \n author = {\n Duan,\n Haodong and Lin,\n Kwan-Yee and Jin,\n Sheng and Liu,\n Wentao and Qian,\n Chen and Ouyang,\n Wanli\n},\n title = {\n TRB: A Novel Triplet Representation for Understanding 2D Human Body\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TSM: Temporal Shift Module for Efficient Video Understanding", @@ -29435,7 +30371,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2019_ICCV,\n \n author = {\n Lin,\n Ji and Gan,\n Chuang and Han,\n Song\n},\n title = {\n TSM: Temporal Shift Module for Efficient Video Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Tag2Pix: Line Art Colorization Using Text Tag With SECat and Changing Loss", @@ -29468,7 +30405,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2019_ICCV,\n \n author = {\n Kim,\n Hyunsu and Jhoo,\n Ho Young and Park,\n Eunhyeok and Yoo,\n Sungjoo\n},\n title = {\n Tag2Pix: Line Art Colorization Using Text Tag With SECat and Changing Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Taking a HINT: Leveraging Explanations to Make Vision and Language Models More Grounded", @@ -29494,14 +30432,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Selvaraju_Taking_a_HINT_Leveraging_Explanations_to_Make_Vision_and_Language_ICCV_2019_paper.html", "aff_unique_index": "0;1+2;1;1;1;1;0+3;0+3", - "aff_unique_norm": "Georgia Institute of Technology;Samsung;Oregon State University;Meta", - "aff_unique_dep": ";Samsung Research;;Facebook AI Research", + "aff_unique_norm": "Georgia Institute of Technology;Samsung Research;Oregon State University;Facebook", + "aff_unique_dep": ";;;Facebook AI Research", "aff_unique_url": "https://www.gatech.edu;https://research.samsung.com;https://oregonstate.edu;https://research.facebook.com", "aff_unique_abbr": "Georgia Tech;Samsung;OSU;FAIR", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0;1;1;1;1;0+0;0+0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Selvaraju_2019_ICCV,\n \n author = {\n Selvaraju,\n Ramprasaath R. 
and Lee,\n Stefan and Shen,\n Yilin and Jin,\n Hongxia and Ghosh,\n Shalini and Heck,\n Larry and Batra,\n Dhruv and Parikh,\n Devi\n},\n title = {\n Taking a HINT: Leveraging Explanations to Make Vision and Language Models More Grounded\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Talking With Hands 16.2M: A Large-Scale Dataset of Synchronized Body-Finger Motion and Audio for Conversational Motion Analysis and Synthesis", @@ -29527,14 +30466,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Lee_Talking_With_Hands_16.2M_A_Large-Scale_Dataset_of_Synchronized_Body-Finger_ICCV_2019_paper.html", "aff_unique_index": "0;1;2+0;2;0;2", - "aff_unique_norm": "University of Washington;Simon Fraser University;Meta", + "aff_unique_norm": "University of Washington;Simon Fraser University;Facebook Reality Labs", "aff_unique_dep": ";;Facebook Reality Labs", "aff_unique_url": "https://www.washington.edu;https://www.sfu.ca;https://www.facebook.com/realitylabs", "aff_unique_abbr": "UW;SFU;FRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0+0;0;0;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Lee_2019_ICCV,\n \n author = {\n Lee,\n Gilwoo and Deng,\n Zhiwei and Ma,\n Shugao and Shiratori,\n Takaaki and Srinivasa,\n Siddhartha S. 
and Sheikh,\n Yaser\n},\n title = {\n Talking With Hands 16.2M: A Large-Scale Dataset of Synchronized Body-Finger Motion and Audio for Conversational Motion Analysis and Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Targeted Mismatch Adversarial Attack: Query With a Flower to Retrieve the Tower", @@ -29542,7 +30482,7 @@ "status": "Poster", "track": "main", "pid": "5008", - "author_site": "Giorgos Tolias, Filip Radenovic, Ond\u00c5\u0099ej Chum", + "author_site": "Giorgos Tolias, Filip Radenovic, Ondřej Chum", "author": "Giorgos Tolias; Filip Radenovic; Ondrej Chum", "abstract": "Access to online visual search engines implies sharing of private user content -- the query images. We introduce the concept of targeted mismatch attack for deep learning based retrieval systems to generate an adversarial image to conceal the query image. The generated image looks nothing like the user intended query, but leads to identical or very similar retrieval results. Transferring attacks to fully unseen networks is challenging. We show successful attacks to partially unknown systems, by designing various loss functions for the adversarial image construction. These include loss functions, for example, for unknown global pooling operation or unknown input resolution by the retrieval system. 
We evaluate the attacks on standard retrieval benchmarks and compare the results retrieved with the original and adversarial image.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Tolias_Targeted_Mismatch_Adversarial_Attack_Query_With_a_Flower_to_Retrieve_ICCV_2019_paper.pdf", @@ -29558,7 +30498,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tolias_Targeted_Mismatch_Adversarial_Attack_Query_With_a_Flower_to_Retrieve_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tolias_Targeted_Mismatch_Adversarial_Attack_Query_With_a_Flower_to_Retrieve_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Tolias_2019_ICCV,\n \n author = {\n Tolias,\n Giorgos and Radenovic,\n Filip and Chum,\n Ondrej\n},\n title = {\n Targeted Mismatch Adversarial Attack: Query With a Flower to Retrieve the Tower\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Task-Driven Modular Networks for Zero-Shot Compositional Learning", @@ -29584,14 +30525,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Purushwalkam_Task-Driven_Modular_Networks_for_Zero-Shot_Compositional_Learning_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+1;1", - "aff_unique_norm": "Carnegie Mellon University;Meta", + "aff_unique_norm": "Carnegie Mellon University;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.cmu.edu;https://research.facebook.com", "aff_unique_abbr": "CMU;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Purushwalkam_2019_ICCV,\n \n author = {\n Purushwalkam,\n Senthil and Nickel,\n Maximilian and Gupta,\n Abhinav and Ranzato,\n 
Marc'Aurelio\n},\n title = {\n Task-Driven Modular Networks for Zero-Shot Compositional Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Task2Vec: Task Embedding for Meta-Learning", @@ -29617,14 +30559,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Achille_Task2Vec_Task_Embedding_for_Meta-Learning_ICCV_2019_paper.html", "aff_unique_index": "0+1;0;0;0;0+2;0+3;0+1;0+4", - "aff_unique_norm": "Amazon;University of California, Los Angeles;University of Massachusetts Amherst;University of California, Irvine;California Institute of Technology", - "aff_unique_dep": "Amazon Web Services;;;;", + "aff_unique_norm": "Amazon Web Services;University of California, Los Angeles;University of Massachusetts Amherst;University of California, Irvine;California Institute of Technology", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://aws.amazon.com;https://www.ucla.edu;https://www.umass.edu;https://www.uci.edu;https://www.caltech.edu", "aff_unique_abbr": "AWS;UCLA;UMass Amherst;UCI;Caltech", "aff_campus_unique_index": "1;2;3;1;4", "aff_campus_unique": ";Los Angeles;Amherst;Irvine;Pasadena", "aff_country_unique_index": "0+0;0;0;0;0+0;0+0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Achille_2019_ICCV,\n \n author = {\n Achille,\n Alessandro and Lam,\n Michael and Tewari,\n Rahul and Ravichandran,\n Avinash and Maji,\n Subhransu and Fowlkes,\n Charless C. 
and Soatto,\n Stefano and Perona,\n Pietro\n},\n title = {\n Task2Vec: Task Embedding for Meta-Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Teacher Guided Architecture Search", @@ -29657,7 +30600,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0+0+0;1;0+0+0", - "aff_country_unique": "United States;Netherlands" + "aff_country_unique": "United States;Netherlands", + "bibtex": "@InProceedings{Bashivan_2019_ICCV,\n \n author = {\n Bashivan,\n Pouya and Tensen,\n Mark and DiCarlo,\n James J.\n},\n title = {\n Teacher Guided Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Teacher Supervises Students How to Learn From Partially Labeled Images for Facial Landmark Detection", @@ -29690,7 +30634,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Dong_2019_ICCV,\n \n author = {\n Dong,\n Xuanyi and Yang,\n Yi\n},\n title = {\n Teacher Supervises Students How to Learn From Partially Labeled Images for Facial Landmark Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Tell, Draw, and Repeat: Generating and Modifying Images Based on Continual Linguistic Instruction", @@ -29702,7 +30647,7 @@ "author": "Alaaeldin El-Nouby; Shikhar Sharma; Hannes Schulz; Devon Hjelm; Layla El Asri; Samira Ebrahimi Kahou; Yoshua Bengio; Graham W. Taylor", "abstract": "Conditional text-to-image generation is an active area of research, with many possible applications. 
Existing research has primarily focused on generating a single image from available conditioning information in one step. One practical extension beyond one-step generation is a system that generates an image iteratively, conditioned on ongoing linguistic input or feedback. This is significantly more challenging than one-step generation tasks, as such a system must understand the contents of its generated images with respect to the feedback history, the current feedback, as well as the interactions among concepts present in the feedback history. In this work, we present a recurrent image generation model which takes into account both the generated output up to the current step as well as all past instructions for generation. We show that our model is able to generate the background, add new objects, and apply simple transformations to existing objects. We believe our approach is an important step toward interactive generation. Code and data is available at: https://www.microsoft.com/en-us/research/project/generative-neural-visual-artist-geneva/.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/El-Nouby_Tell_Draw_and_Repeat_Generating_and_Modifying_Images_Based_on_ICCV_2019_paper.pdf", - "aff": "University of Guelph+Vector Institute for Arti\ufb01cial Intelligence; Microsoft Research; Microsoft Research; Microsoft Research+University of Montreal+Canadian Institute for Advanced Research; Microsoft Research; Microsoft Research+Montreal Institute for Learning Algorithms+Canadian Institute for Advanced Research; Montreal Institute for Learning Algorithms+University of Montreal+Canadian Institute for Advanced Research; University of Guelph+Vector Institute for Arti\ufb01cial Intelligence+Canadian Institute for Advanced Research", + "aff": "University of Guelph+Vector Institute for Artificial Intelligence; Microsoft Research; Microsoft Research; Microsoft Research+University of Montreal+Canadian Institute for Advanced Research; Microsoft Research; Microsoft 
Research+Montreal Institute for Learning Algorithms+Canadian Institute for Advanced Research; Montreal Institute for Learning Algorithms+University of Montreal+Canadian Institute for Advanced Research; University of Guelph+Vector Institute for Artificial Intelligence+Canadian Institute for Advanced Research", "project": "https://www.microsoft.com/en-us/research/project/generative-neural-visual-artist-geneva/", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/El-Nouby_Tell_Draw_and_ICCV_2019_supplemental.pdf", @@ -29716,14 +30661,15 @@ "author_num": 8, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/El-Nouby_Tell_Draw_and_Repeat_Generating_and_Modifying_Images_Based_on_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;2;2+3+4;2;2+5+4;5+3+4;0+1+4", - "aff_unique_norm": "University of Guelph;Vector Institute for Artificial Intelligence;Microsoft;University of Montreal;Canadian Institute for Advanced Research;Montreal Institute for Learning Algorithms", - "aff_unique_dep": ";Artificial Intelligence;Microsoft Research;;;", + "aff_unique_norm": "University of Guelph;Vector Institute for Artificial Intelligence;Microsoft Corporation;University of Montreal;Canadian Institute for Advanced Research;Montreal Institute for Learning Algorithms", + "aff_unique_dep": ";Artificial Intelligence;Microsoft Research;;;Artificial Intelligence", "aff_unique_url": "https://www.uoguelph.ca;https://vectorinstitute.ai/;https://www.microsoft.com/en-us/research;https://wwwumontreal.ca;https://www.cifar.ca;https://mila.quebec", "aff_unique_abbr": "U of G;Vector Institute;MSR;UM;CIFAR;MILA", - "aff_campus_unique_index": ";;;;", - "aff_campus_unique": "", + "aff_campus_unique_index": ";;1;1;", + "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0+0;1;1;1+0+0;1;1+0+0;0+0+0;0+0+0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{El-Nouby_2019_ICCV,\n \n author = {\n 
El-Nouby,\n Alaaeldin and Sharma,\n Shikhar and Schulz,\n Hannes and Hjelm,\n Devon and Asri,\n Layla El and Kahou,\n Samira Ebrahimi and Bengio,\n Yoshua and Taylor,\n Graham W.\n},\n title = {\n Tell,\n Draw,\n and Repeat: Generating and Modifying Images Based on Continual Linguistic Instruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Temporal Attentive Alignment for Large-Scale Video Domain Adaptation", @@ -29747,7 +30693,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Temporal_Attentive_Alignment_for_Large-Scale_Video_Domain_Adaptation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_Temporal_Attentive_Alignment_for_Large-Scale_Video_Domain_Adaptation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Min-Hung and Kira,\n Zsolt and AlRegib,\n Ghassan and Yoo,\n Jaekwon and Chen,\n Ruxin and Zheng,\n Jian\n},\n title = {\n Temporal Attentive Alignment for Large-Scale Video Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Temporal Knowledge Propagation for Image-to-Video Person Re-Identification", @@ -29771,7 +30718,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gu_Temporal_Knowledge_Propagation_for_Image-to-Video_Person_Re-Identification_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gu_Temporal_Knowledge_Propagation_for_Image-to-Video_Person_Re-Identification_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Gu_2019_ICCV,\n \n author = {\n Gu,\n Xinqian and Ma,\n Bingpeng and Chang,\n Hong and Shan,\n Shiguang and Chen,\n 
Xilin\n},\n title = {\n Temporal Knowledge Propagation for Image-to-Video Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Temporal Recurrent Networks for Online Action Detection", @@ -29804,7 +30752,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Mingze and Gao,\n Mingfei and Chen,\n Yi-Ting and Davis,\n Larry S. and Crandall,\n David J.\n},\n title = {\n Temporal Recurrent Networks for Online Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Temporal Structure Mining for Weakly Supervised Action Detection", @@ -29828,7 +30777,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Temporal_Structure_Mining_for_Weakly_Supervised_Action_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Temporal_Structure_Mining_for_Weakly_Supervised_Action_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Tan and Ren,\n Zhou and Li,\n Yuncheng and Yan,\n Enxu and Xu,\n Ning and Yuan,\n Junsong\n},\n title = {\n Temporal Structure Mining for Weakly Supervised Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TensorMask: A Foundation for Dense Object Segmentation", @@ -29836,7 +30786,7 @@ "status": "Poster", "track": "main", "pid": "669", - "author_site": "Xinlei Chen, Ross Girshick, Kaiming He, Piotr 
Doll\u00c3\u00a1r", + "author_site": "Xinlei Chen, Ross Girshick, Kaiming He, Piotr Dollár", "author": "Xinlei Chen; Ross Girshick; Kaiming He; Piotr Dollar", "abstract": "Sliding-window object detectors that generate bounding-box object predictions over a dense, regular grid have advanced rapidly and proven popular. In contrast, modern instance segmentation approaches are dominated by methods that first detect object bounding boxes, and then crop and segment these regions, as popularized by Mask R-CNN. In this work, we investigate the paradigm of dense sliding-window instance segmentation, which is surprisingly under-explored. Our core observation is that this task is fundamentally different than other dense prediction tasks such as semantic segmentation or bounding-box object detection, as the output at every spatial location is itself a geometric structure with its own spatial dimensions. To formalize this, we treat dense instance segmentation as a prediction task over 4D tensors and present a general framework called TensorMask that explicitly captures this geometry and enables novel operators on 4D tensors. We demonstrate that the tensor view leads to large gains over baselines that ignore this structure, and leads to results comparable to Mask R-CNN. These promising results suggest that TensorMask can serve as a foundation for novel advances in dense mask prediction and a more complete understanding of the task. 
Code will be made available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Chen_TensorMask_A_Foundation_for_Dense_Object_Segmentation_ICCV_2019_paper.pdf", @@ -29852,7 +30802,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_TensorMask_A_Foundation_for_Dense_Object_Segmentation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Chen_TensorMask_A_Foundation_for_Dense_Object_Segmentation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Chen_2019_ICCV,\n \n author = {\n Chen,\n Xinlei and Girshick,\n Ross and He,\n Kaiming and Dollar,\n Piotr\n},\n title = {\n TensorMask: A Foundation for Dense Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Tex2Shape: Detailed Full Human Body Geometry From a Single Image", @@ -29885,7 +30836,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Saarland", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Alldieck_2019_ICCV,\n \n author = {\n Alldieck,\n Thiemo and Pons-Moll,\n Gerard and Theobalt,\n Christian and Magnor,\n Marcus\n},\n title = {\n Tex2Shape: Detailed Full Human Body Geometry From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TextDragon: An End-to-End Framework for Arbitrary Shaped Text Spotting", @@ -29909,7 +30861,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Feng_TextDragon_An_End-to-End_Framework_for_Arbitrary_Shaped_Text_Spotting_ICCV_2019_paper.html" + "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Feng_TextDragon_An_End-to-End_Framework_for_Arbitrary_Shaped_Text_Spotting_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Feng_2019_ICCV,\n \n author = {\n Feng,\n Wei and He,\n Wenhao and Yin,\n Fei and Zhang,\n Xu-Yao and Liu,\n Cheng-Lin\n},\n title = {\n TextDragon: An End-to-End Framework for Arbitrary Shaped Text Spotting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TextPlace: Visual Place Recognition and Topological Localization Through Reading Scene Texts", @@ -29942,7 +30895,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Edinburgh;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Hong_2019_ICCV,\n \n author = {\n Hong,\n Ziyang and Petillot,\n Yvan and Lane,\n David and Miao,\n Yishu and Wang,\n Sen\n},\n title = {\n TextPlace: Visual Place Recognition and Topological Localization Through Reading Scene Texts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Texture Fields: Learning Texture Representations in Function Space", @@ -29954,7 +30908,7 @@ "author": "Michael Oechsle; Lars Mescheder; Michael Niemeyer; Thilo Strauss; Andreas Geiger", "abstract": "In recent years, substantial progress has been achieved in learning-based reconstruction of 3D objects. At the same time, generative models were proposed that can generate highly realistic images. However, despite this success in these closely related tasks, texture reconstruction of 3D objects has received little attention from the research community and state-of-the-art methods are either limited to comparably low resolution or constrained experimental setups. 
A major reason for these limitations is that common representations of texture are inefficient or hard to interface for modern deep learning techniques. In this paper, we propose Texture Fields, a novel texture representation which is based on regressing a continuous 3D function parameterized with a neural network. Our approach circumvents limiting factors like shape discretization and parameterization, as the proposed texture representation is independent of the shape representation of the 3D object. We show that Texture Fields are able to represent high frequency texture and naturally blend with modern deep learning techniques. Experimentally, we find that Texture Fields compare favorably to state-of-the-art methods for conditional texture reconstruction of 3D objects and enable learning of probabilistic generative models for texturing unseen 3D models. We believe that Texture Fields will become an important building block for the next generation of generative 3D models.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Oechsle_Texture_Fields_Learning_Texture_Representations_in_Function_Space_ICCV_2019_paper.pdf", - "aff": "Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen+ETAS GmbH, Bosch Group, Stuttgart; Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen; Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen; ETAS GmbH, Bosch Group, Stuttgart; Autonomous Vision Group, MPI for Intelligent Systems and University of T\u00fcbingen", + "aff": "Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen+ETAS GmbH, Bosch Group, Stuttgart; Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen; Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen; ETAS GmbH, Bosch Group, Stuttgart; Autonomous Vision Group, MPI for Intelligent Systems and University of Tübingen", "project": "", 
"github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Oechsle_Texture_Fields_Learning_ICCV_2019_supplemental.pdf", @@ -29975,7 +30929,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Oechsle_2019_ICCV,\n \n author = {\n Oechsle,\n Michael and Mescheder,\n Lars and Niemeyer,\n Michael and Strauss,\n Thilo and Geiger,\n Andreas\n},\n title = {\n Texture Fields: Learning Texture Representations in Function Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "TexturePose: Supervising Human Mesh Estimation With Texture Consistency", @@ -29999,7 +30954,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pavlakos_TexturePose_Supervising_Human_Mesh_Estimation_With_Texture_Consistency_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Pavlakos_TexturePose_Supervising_Human_Mesh_Estimation_With_Texture_Consistency_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Pavlakos_2019_ICCV,\n \n author = {\n Pavlakos,\n Georgios and Kolotouros,\n Nikos and Daniilidis,\n Kostas\n},\n title = {\n TexturePose: Supervising Human Mesh Estimation With Texture Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "The LogBarrier Adversarial Attack: Making Effective Use of Decision Boundary Information", @@ -30032,7 +30988,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Finlay_2019_ICCV,\n \n author = {\n Finlay,\n Chris and 
Pooladian,\n Aram-Alexandre and Oberman,\n Adam\n},\n title = {\n The LogBarrier Adversarial Attack: Making Effective Use of Decision Boundary Information\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "The Sound of Motions", @@ -30065,7 +31022,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Hang and Gan,\n Chuang and Ma,\n Wei-Chiu and Torralba,\n Antonio\n},\n title = {\n The Sound of Motions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "The Trajectron: Probabilistic Multi-Agent Trajectory Modeling With Dynamic Spatiotemporal Graphs", @@ -30098,7 +31056,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ivanovic_2019_ICCV,\n \n author = {\n Ivanovic,\n Boris and Pavone,\n Marco\n},\n title = {\n The Trajectron: Probabilistic Multi-Agent Trajectory Modeling With Dynamic Spatiotemporal Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Three-D Safari: Learning to Estimate Zebra Pose, Shape, and Texture From Images \"In the Wild\"", @@ -30110,7 +31069,7 @@ "author": "Silvia Zuffi; Angjoo Kanazawa; Tanya Berger-Wolf; Michael J. Black", "abstract": "We present the first method to perform automatic 3D pose, shape and texture capture of animals from images acquired in-the-wild. 
In particular, we focus on the problem of capturing 3D information about Grevy's zebras from a collection of images. The Grevy's zebra is one of the most endangered species in Africa, with only a few thousand individuals left. Capturing the shape and pose of these animals can provide biologists and conservationists with information about animal health and behavior. In contrast to research on human pose, shape and texture estimation, training data for endangered species is limited, the animals are in complex natural scenes with occlusion, they are naturally camouflaged, travel in herds, and look similar to each other. To overcome these challenges, we integrate the recent SMAL animal model into a network-based regression pipeline, which we train end-to-end on synthetically generated images with pose, shape, and background variation. Going beyond state-of-the-art methods for human shape and pose estimation, our method learns a shape space for zebras during training. Learning such a shape space from images using only a photometric loss is novel, and the approach can be used to learn shape in other settings with limited 3D supervision. Moreover, we couple 3D pose and shape prediction with the task of texture synthesis, obtaining a full texture map of the animal from a single image. We show that the predicted texture map allows a novel per-instance unsupervised optimization over the network features. 
This method, SMALST (SMAL with learned Shape and Texture) goes beyond previous work, which assumed manual keypoints and/or segmentation, to regress directly from pixels to 3D animal shape, pose and texture.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zuffi_Three-D_Safari_Learning_to_Estimate_Zebra_Pose_Shape_and_Texture_ICCV_2019_paper.pdf", - "aff": "IMATI-CNR, Milan, Italy; University of California, Berkeley; University of Illinois at Chicago; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "IMATI-CNR, Milan, Italy; University of California, Berkeley; University of Illinois at Chicago; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "", "github": "https://github.com/silviazuffi/smalst", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Zuffi_Three-D_Safari_Learning_ICCV_2019_supplemental.pdf", @@ -30129,9 +31088,10 @@ "aff_unique_url": "https://www.imati.cnr.it;https://www.berkeley.edu;https://www.uic.edu;https://www.mpi-is.mpg.de", "aff_unique_abbr": ";UC Berkeley;UIC;MPI-IS", "aff_campus_unique_index": "0;1;2;3", - "aff_campus_unique": "Milan;Berkeley;Chicago;T\u00fcbingen", + "aff_campus_unique": "Milan;Berkeley;Chicago;Tübingen", "aff_country_unique_index": "0;1;1;2", - "aff_country_unique": "Italy;United States;Germany" + "aff_country_unique": "Italy;United States;Germany", + "bibtex": "@InProceedings{Zuffi_2019_ICCV,\n \n author = {\n Zuffi,\n Silvia and Kanazawa,\n Angjoo and Berger-Wolf,\n Tanya and Black,\n Michael J.\n},\n title = {\n Three-D Safari: Learning to Estimate Zebra Pose,\n Shape,\n and Texture From Images \"In the Wild\"\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Through-Wall Human Mesh Recovery Using Radio Signals", @@ -30164,7 +31124,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Cambridge", 
"aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Mingmin and Liu,\n Yingcheng and Raghu,\n Aniruddh and Li,\n Tianhong and Zhao,\n Hang and Torralba,\n Antonio and Katabi,\n Dina\n},\n title = {\n Through-Wall Human Mesh Recovery Using Radio Signals\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ThunderNet: Towards Real-Time Generic Object Detection on Mobile Devices", @@ -30197,7 +31158,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Qin_2019_ICCV,\n \n author = {\n Qin,\n Zheng and Li,\n Zeming and Zhang,\n Zhaoning and Bao,\n Yiping and Yu,\n Gang and Peng,\n Yuxing and Sun,\n Jian\n},\n title = {\n ThunderNet: Towards Real-Time Generic Object Detection on Mobile Devices\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Topological Map Extraction From Overhead Images", @@ -30205,11 +31167,11 @@ "status": "Poster", "track": "main", "pid": "3034", - "author_site": "Zuoyue Li, Jan Dirk Wegner, Aur\u00c3\u00a9lien Lucchi", + "author_site": "Zuoyue Li, Jan Dirk Wegner, Aurélien Lucchi", "author": "Zuoyue Li; Jan Dirk Wegner; Aurelien Lucchi", "abstract": "We propose a new approach, named PolyMapper, to circumvent the conventional pixel-wise segmentation of (aerial) images and predict objects in a vector representation directly. PolyMapper directly extracts the topological map of a city from overhead images as collections of building footprints and road networks. 
In order to unify the shape representation for different types of objects, we also propose a novel sequentialization method that reformulates a graph structure as closed polygons. Experiments are conducted on both existing and self-collected large-scale datasets of several cities. Our empirical results demonstrate that our end-to-end learnable model is capable of drawing polygons of building footprints and road networks that very closely approximate the structure of existing online map services, in a fully automated manner. Quantitative and qualitative comparison to the state-of-the-arts also show that our approach achieves good levels of performance. To the best of our knowledge, the automatic extraction of large-scale topological maps is a novel contribution in the remote sensing community that we believe will help develop models with more informed geometrical constraints.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_Topological_Map_Extraction_From_Overhead_Images_ICCV_2019_paper.pdf", - "aff": "ETH Z\u00fcrich, Switzerland; ETH Z\u00fcrich, Switzerland; ETH Z\u00fcrich, Switzerland", + "aff": "ETH Zürich, Switzerland; ETH Zürich, Switzerland; ETH Zürich, Switzerland", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Li_Topological_Map_Extraction_ICCV_2019_supplemental.pdf", @@ -30223,14 +31185,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Topological_Map_Extraction_From_Overhead_Images_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Zuoyue and Wegner,\n Jan Dirk 
and Lucchi,\n Aurelien\n},\n title = {\n Topological Map Extraction From Overhead Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Total Denoising: Unsupervised Learning of 3D Point Cloud Cleaning", @@ -30254,7 +31217,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hermosilla_Total_Denoising_Unsupervised_Learning_of_3D_Point_Cloud_Cleaning_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Hermosilla_Total_Denoising_Unsupervised_Learning_of_3D_Point_Cloud_Cleaning_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Hermosilla_2019_ICCV,\n \n author = {\n Hermosilla,\n Pedro and Ritschel,\n Tobias and Ropinski,\n Timo\n},\n title = {\n Total Denoising: Unsupervised Learning of 3D Point Cloud Cleaning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Toward Real-World Single Image Super-Resolution: A New Benchmark and a New Model", @@ -30280,14 +31244,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Cai_Toward_Real-World_Single_Image_Super-Resolution_A_New_Benchmark_and_a_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;2;0+1", - "aff_unique_norm": "Hong Kong Polytechnic University;Alibaba Group;DJI", + "aff_unique_norm": "The Hong Kong Polytechnic University;Alibaba Group;DJI", "aff_unique_dep": ";DAMO Academy;", "aff_unique_url": "https://www.polyu.edu.hk;https://www.alibaba-group.com;https://www.dji.com", "aff_unique_abbr": "PolyU;Alibaba;DJI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cai_2019_ICCV,\n \n author 
= {\n Cai,\n Jianrui and Zeng,\n Hui and Yong,\n Hongwei and Cao,\n Zisheng and Zhang,\n Lei\n},\n title = {\n Toward Real-World Single Image Super-Resolution: A New Benchmark and a New Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Adversarially Robust Object Detection", @@ -30313,14 +31278,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhang_Towards_Adversarially_Robust_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Baidu", + "aff_unique_norm": "Baidu Research", "aff_unique_dep": "Research", "aff_unique_url": "https://research.baidu.com", - "aff_unique_abbr": "Baidu Res.", + "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Sunnyvale", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Haichao and Wang,\n Jianyu\n},\n title = {\n Towards Adversarially Robust Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Bridging Semantic Gap to Improve Semantic Segmentation", @@ -30332,7 +31298,7 @@ "author": "Yanwei Pang; Yazhao Li; Jianbing Shen; Ling Shao", "abstract": "Aggregating multi-level features is essential for capturing multi-scale context information for precise scene semantic segmentation. However, the improvement by directly fusing shallow features and deep features becomes limited as the semantic gap between them increases. To solve this problem, we explore two strategies for robust feature fusion. 
One is enhancing shallow features using a semantic enhancement module (SeEM) to alleviate the semantic gap between shallow features and deep features. The other strategy is feature attention, which involves discovering complementary information (i.e., boundary information) from low-level features to enhance high-level features for precise segmentation. By embedding these two strategies, we construct a parallel feature pyramid towards improving multi-level feature fusion. A Semantic Enhanced Network called SeENet is constructed with the parallel pyramid to implement precise segmentation. Experiments on three benchmark datasets demonstrate the effectiveness of our method for robust multi-level feature aggregation. As a result, our SeENet has achieved better performance than other state-of-the-art methods for semantic segmentation.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Pang_Towards_Bridging_Semantic_Gap_to_Improve_Semantic_Segmentation_ICCV_2019_paper.pdf", - "aff": "Tianjin University, Tianjin, China; Tianjin University, Tianjin, China; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE", + "aff": "Tianjin University, Tianjin, China; Tianjin University, Tianjin, China; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE", "project": "", "github": "", "supp": "", @@ -30353,7 +31319,8 @@ "aff_campus_unique_index": "0;0;1;1", "aff_campus_unique": "Tianjin;Abu Dhabi", "aff_country_unique_index": "0;0;1;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Pang_2019_ICCV,\n \n author = {\n Pang,\n Yanwei and Li,\n Yazhao and Shen,\n Jianbing and Shao,\n Ling\n},\n title = {\n Towards Bridging Semantic Gap to Improve Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards High-Resolution Salient Object Detection", @@ -30379,14 +31346,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zeng_Towards_High-Resolution_Salient_Object_Detection_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;1;0", - "aff_unique_norm": "Dalian University of Technology;Adobe", - "aff_unique_dep": ";Adobe Research", + "aff_unique_norm": "Dalian University of Technology;Adobe Research", + "aff_unique_dep": ";", "aff_unique_url": "http://www.dlut.edu.cn/;https://research.adobe.com", "aff_unique_abbr": "DUT;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zeng_2019_ICCV,\n \n author = {\n Zeng,\n Yi and Zhang,\n Pingping and Zhang,\n Jianming and Lin,\n Zhe and Lu,\n Huchuan\n},\n title = {\n Towards High-Resolution Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Interpretable Face Recognition", @@ -30412,14 +31380,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yin_Towards_Interpretable_Face_Recognition_ICCV_2019_paper.html", "aff_unique_index": "0;0;1+2;3+2;0", - "aff_unique_norm": "Michigan State University;Wormpex AI Research;Adobe;ByteDance", - "aff_unique_dep": ";AI Research;Adobe Inc.;AI Lab", + "aff_unique_norm": "Michigan State University;Wormpex AI Research;Adobe Inc.;ByteDance", + "aff_unique_dep": ";AI Research;;AI Lab", "aff_unique_url": "https://www.msu.edu;;https://www.adobe.com;https://www.bytedance.com", "aff_unique_abbr": "MSU;Wormpex AI;Adobe;ByteDance", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0+0;1+0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Yin_2019_ICCV,\n \n author = {\n Yin,\n Bangjie and Tran,\n Luan and Li,\n Haoxiang and Shen,\n Xiaohui and Liu,\n Xiaoming\n},\n title = {\n Towards Interpretable Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Interpretable Object Detection by Unfolding Latent Structures", @@ -30452,7 +31421,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Tianfu and Song,\n Xi\n},\n title = {\n Towards Interpretable Object Detection by Unfolding Latent Structures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Latent Attribute Discovery From Triplet Similarities", @@ -30485,7 +31455,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nigam_2019_ICCV,\n \n author = {\n Nigam,\n Ishan and Tokmakov,\n Pavel and Ramanan,\n Deva\n},\n title = {\n Towards Latent Attribute Discovery From Triplet Similarities\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Multi-Pose Guided Virtual Try-On Network", @@ -30518,7 +31489,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0+0;0+0;1;0+0", - "aff_country_unique": "China;United States" + 
"aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dong_2019_ICCV,\n \n author = {\n Dong,\n Haoye and Liang,\n Xiaodan and Shen,\n Xiaohui and Wang,\n Bochao and Lai,\n Hanjiang and Zhu,\n Jia and Hu,\n Zhiting and Yin,\n Jian\n},\n title = {\n Towards Multi-Pose Guided Virtual Try-On Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Photorealistic Reconstruction of Highly Multiplexed Lensless Images", @@ -30551,7 +31523,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madras;", "aff_country_unique_index": "0;0;1;1;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Khan_2019_ICCV,\n \n author = {\n Khan,\n Salman S. and ,\n Adarsh V. R. and Boominathan,\n Vivek and Tan,\n Jasper and Veeraraghavan,\n Ashok and Mitra,\n Kaushik\n},\n title = {\n Towards Photorealistic Reconstruction of Highly Multiplexed Lensless Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Precise End-to-End Weakly Supervised Object Detection Network", @@ -30584,7 +31557,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Ke and Li,\n Dongsheng and Dou,\n Yong\n},\n title = {\n Towards Precise End-to-End Weakly Supervised Object Detection Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Unconstrained End-to-End Text Spotting", @@ -30617,7 +31591,8 @@ "aff_campus_unique_index": "0;0;0;0;0", 
"aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Qin_2019_ICCV,\n \n author = {\n Qin,\n Siyang and Bissacco,\n Alessandro and Raptis,\n Michalis and Fujii,\n Yasuhisa and Xiao,\n Ying\n},\n title = {\n Towards Unconstrained End-to-End Text Spotting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Towards Unsupervised Image Captioning With Shared Multimodal Embeddings", @@ -30629,7 +31604,7 @@ "author": "Iro Laina; Christian Rupprecht; Nassir Navab", "abstract": "Understanding images without explicit supervision has become an important problem in computer vision. In this paper, we address image captioning by generating language descriptions of scenes without learning from annotated pairs of images and their captions. The core component of our approach is a shared latent space that is structured by visual concepts. In this space, the two modalities should be indistinguishable. A language model is first trained to encode sentences into semantically structured embeddings. Image features that are translated into this embedding space can be decoded into descriptions through the same language model, similarly to sentence embeddings. This translation is learned from weakly paired images and text using a loss robust to noisy assignments and a conditional adversarial component. Our approach allows to exploit large text corpora outside the annotated distributions of image/caption data. 
Our experiments show that the proposed domain alignment learns a semantically meaningful representation which outperforms previous work.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Laina_Towards_Unsupervised_Image_Captioning_With_Shared_Multimodal_Embeddings_ICCV_2019_paper.pdf", - "aff": "Technische Universit \u00a8at M \u00a8unchen; University of Oxford; Technische Universit \u00a8at M \u00a8unchen", + "aff": "Technische Universität München; University of Oxford; Technische Universität München", "project": "", "github": "", "supp": "", @@ -30643,14 +31618,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Laina_Towards_Unsupervised_Image_Captioning_With_Shared_Multimodal_Embeddings_ICCV_2019_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;University of Oxford", + "aff_unique_norm": "Technische Universität München;University of Oxford", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.ox.ac.uk", "aff_unique_abbr": "TUM;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Laina_2019_ICCV,\n \n author = {\n Laina,\n Iro and Rupprecht,\n Christian and Navab,\n Nassir\n},\n title = {\n Towards Unsupervised Image Captioning With Shared Multimodal Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Toyota Smarthome: Real-World Activities of Daily Living", @@ -30674,7 +31650,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Das_Toyota_Smarthome_Real-World_Activities_of_Daily_Living_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Das_2019_ICCV,\n \n author = {\n Das,\n Srijan and Dai,\n Rui and Koperski,\n Michal and Minciullo,\n Luca and Garattoni,\n Lorenzo and Bremond,\n Francois and Francesca,\n Gianpiero\n},\n title = {\n Toyota Smarthome: Real-World Activities of Daily Living\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Tracking Without Bells and Whistles", @@ -30682,7 +31659,7 @@ "status": "Poster", "track": "main", "pid": "2441", - "author_site": "Philipp Bergmann, Tim Meinhardt, Laura Leal-Taix\u00c3\u00a9", + "author_site": "Philipp Bergmann, Tim Meinhardt, Laura Leal-Taixé", "author": "Philipp Bergmann; Tim Meinhardt; Laura Leal-Taixe", "abstract": "The problem of tracking multiple objects in a video sequence poses several challenging tasks. For tracking-by-detection, these include object re-identification, motion prediction and dealing with occlusions. We present a tracker (without bells and whistles) that accomplishes tracking without specifically targeting any of these tasks, in particular, we perform no training or optimization on tracking data. To this end, we exploit the bounding box regression of an object detector to predict the position of an object in the next frame, thereby converting a detector into a Tracktor. We demonstrate the potential of Tracktor and provide a new state-of-the-art on three multi-object tracking benchmarks by extending it with a straightforward re-identification and camera motion compensation. We then perform an analysis on the performance and failure cases of several state-of-the-art tracking methods in comparison to our Tracktor. 
Surprisingly, none of the dedicated tracking methods are considerably better in dealing with complex tracking scenarios, namely, small and occluded objects or missing detections. However, our approach tackles most of the easy tracking scenarios. Therefore, we motivate our approach as a new tracking paradigm and point out promising future research directions. Overall, Tracktor yields superior tracking performance than any current tracking method and our analysis exposes remaining and unsolved tracking challenges to inspire future research directions.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Bergmann_Tracking_Without_Bells_and_Whistles_ICCV_2019_paper.pdf", @@ -30707,7 +31684,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Bergmann_2019_ICCV,\n \n author = {\n Bergmann,\n Philipp and Meinhardt,\n Tim and Leal-Taixe,\n Laura\n},\n title = {\n Tracking Without Bells and Whistles\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Transductive Episodic-Wise Adaptive Metric for Few-Shot Learning", @@ -30733,14 +31711,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Qiao_Transductive_Episodic-Wise_Adaptive_Metric_for_Few-Shot_Learning_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;2+1;1;0+1;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Beihang University", - "aff_unique_dep": "Center for Data Science;Peng Cheng Laboratory;State Key Laboratory of Virtual Reality Technology and Systems, SCSE", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Beihang University", + "aff_unique_dep": "Center for Data Science;;State Key Laboratory of Virtual Reality Technology and Systems, SCSE", "aff_unique_url": 
"http://www.pku.edu.cn;;http://www.buaa.edu.cn", "aff_unique_abbr": "PKU;;Beihang", - "aff_campus_unique_index": "0;;;;", - "aff_campus_unique": "Beijing;", + "aff_campus_unique_index": "0+1;1;1;1;1;1", + "aff_campus_unique": "Beijing;Shenzhen;", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qiao_2019_ICCV,\n \n author = {\n Qiao,\n Limeng and Shi,\n Yemin and Li,\n Jia and Wang,\n Yaowei and Huang,\n Tiejun and Tian,\n Yonghong\n},\n title = {\n Transductive Episodic-Wise Adaptive Metric for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Transductive Learning for Zero-Shot Object Detection", @@ -30763,7 +31742,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Rahman_Transductive_Learning_for_Zero-Shot_Object_Detection_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Rahman_Transductive_Learning_for_Zero-Shot_Object_Detection_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Rahman_2019_ICCV,\n \n author = {\n Rahman,\n Shafin and Khan,\n Salman and Barnes,\n Nick\n},\n title = {\n Transductive Learning for Zero-Shot Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Transferability and Hardness of Supervised Classification Tasks", @@ -30789,14 +31769,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tran_Transferability_and_Hardness_of_Supervised_Classification_Tasks_ICCV_2019_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "VinAI Research;Amazon;Meta", - "aff_unique_dep": ";Amazon Web Services;Facebook AI", - "aff_unique_url": 
"https://www.vinai.io/;https://aws.amazon.com;https://www.facebook.com", + "aff_unique_norm": "VinAI Research;Amazon Web Services;Facebook", + "aff_unique_dep": ";;Facebook AI", + "aff_unique_url": "https://www.vinai.io;https://aws.amazon.com;https://www.facebook.com", "aff_unique_abbr": "VinAI;AWS;Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "Vietnam;United States" + "aff_country_unique": "Vietnam;United States", + "bibtex": "@InProceedings{Tran_2019_ICCV,\n \n author = {\n Tran,\n Anh T. and Nguyen,\n Cuong V. and Hassner,\n Tal\n},\n title = {\n Transferability and Hardness of Supervised Classification Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Transferable Contrastive Network for Generalized Zero-Shot Learning", @@ -30824,12 +31805,13 @@ "aff_unique_index": "0+1+2+3;0+1;0+1;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Shanghai Institute of Microsystem and Information Technology;ShanghaiTech University", "aff_unique_dep": "Institute of Computing Technology;;;School of Information Science and Technology", - "aff_unique_url": "http://www.cas.ac.cn;http://www.ucas.ac.cn;;https://www.shanghaitech.edu.cn", + "aff_unique_url": "http://www.cas.ac.cn;http://www.ucas.ac.cn;;http://www.shanghaitech.edu.cn", "aff_unique_abbr": "CAS;UCAS;;ShanghaiTech", "aff_campus_unique_index": "0+0+1+1;0+0;0+0;0+0", "aff_campus_unique": "Beijing;Shanghai", "aff_country_unique_index": "0+0+0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2019_ICCV,\n \n author = {\n Jiang,\n Huajie and Wang,\n Ruiping and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n Transferable Contrastive Network for Generalized Zero-Shot Learning\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Transferable Representation Learning in Vision-and-Language Navigation", @@ -30862,7 +31844,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Huang_2019_ICCV,\n \n author = {\n Huang,\n Haoshuo and Jain,\n Vihan and Mehta,\n Harsh and Ku,\n Alexander and Magalhaes,\n Gabriel and Baldridge,\n Jason and Ie,\n Eugene\n},\n title = {\n Transferable Representation Learning in Vision-and-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Transferable Semi-Supervised 3D Object Detection From RGB-D Data", @@ -30895,7 +31878,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Tang_2019_ICCV,\n \n author = {\n Tang,\n Yew Siang and Lee,\n Gim Hee\n},\n title = {\n Transferable Semi-Supervised 3D Object Detection From RGB-D Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Transformable Bottleneck Networks", @@ -30921,14 +31905,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Olszewski_Transformable_Bottleneck_Networks_ICCV_2019_paper.html", "aff_unique_index": "0+0+1;2;2;0+0+1;3", - "aff_unique_norm": "University of Southern California;Pinscreen Inc.;Snap Inc.;ByteDance", + "aff_unique_norm": "University of Southern California;Pinscreen;Snap Inc.;ByteDance", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.usc.edu;https://www.pinscreen.com;https://www.snapinc.com;https://www.bytedance.com", "aff_unique_abbr": "USC;Pinscreen;Snap;ByteDance", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+0+0;0;0;0+0+0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Olszewski_2019_ICCV,\n \n author = {\n Olszewski,\n Kyle and Tulyakov,\n Sergey and Woodford,\n Oliver and Li,\n Hao and Luo,\n Linjie\n},\n title = {\n Transformable Bottleneck Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Two-Stream Action Recognition-Oriented Video Super-Resolution", @@ -30961,7 +31946,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hefei", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Haochen and Liu,\n Dong and Xiong,\n Zhiwei\n},\n title = {\n Two-Stream Action Recognition-Oriented Video Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "U-CAM: Visual Explanation Using Uncertainty Based Class Activation Maps", @@ -30994,7 +31980,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Kanpur", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Patro_2019_ICCV,\n \n author = {\n Patro,\n Badri N. 
and Lunayach,\n Mayank and Patel,\n Shivansh and Namboodiri,\n Vinay P.\n},\n title = {\n U-CAM: Visual Explanation Using Uncertainty Based Class Activation Maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "U4D: Unsupervised 4D Dynamic Scene Understanding", @@ -31027,7 +32014,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Mustafa_2019_ICCV,\n \n author = {\n Mustafa,\n Armin and Russell,\n Chris and Hilton,\n Adrian\n},\n title = {\n U4D: Unsupervised 4D Dynamic Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "UM-Adapt: Unsupervised Multi-Task Adaptation Using Adversarial Cross-Task Distillation", @@ -31060,7 +32048,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Bangalore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Kundu_2019_ICCV,\n \n author = {\n Kundu,\n Jogendra Nath and Lakkakula,\n Nishank and Babu,\n R. 
Venkatesh\n},\n title = {\n UM-Adapt: Unsupervised Multi-Task Adaptation Using Adversarial Cross-Task Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "USIP: Unsupervised Stable Interest Point Detection From 3D Point Clouds", @@ -31093,7 +32082,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Jiaxin and Lee,\n Gim Hee\n},\n title = {\n USIP: Unsupervised Stable Interest Point Detection From 3D Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Uncertainty Modeling of Contextual-Connections Between Tracklets for Unconstrained Video-Based Face Recognition", @@ -31126,7 +32116,8 @@ "aff_campus_unique_index": "0;0;2;0;0;0", "aff_campus_unique": "College Park;;Taiwan", "aff_country_unique_index": "0;0+0;1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zheng_2019_ICCV,\n \n author = {\n Zheng,\n Jingxiao and Yu,\n Ruichi and Chen,\n Jun-Cheng and Lu,\n Boyu and Castillo,\n Carlos D. 
and Chellappa,\n Rama\n},\n title = {\n Uncertainty Modeling of Contextual-Connections Between Tracklets for Unconstrained Video-Based Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Uncertainty-Aware Audiovisual Activity Recognition Using Deep Bayesian Variational Inference", @@ -31152,14 +32143,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Subedar_Uncertainty-Aware_Audiovisual_Activity_Recognition_Using_Deep_Bayesian_Variational_Inference_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Intel", + "aff_unique_norm": "Intel Corporation", "aff_unique_dep": "Intel Labs", "aff_unique_url": "https://www.intel.com", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Subedar_2019_ICCV,\n \n author = {\n Subedar,\n Mahesh and Krishnan,\n Ranganath and Meyer,\n Paulo Lopez and Tickoo,\n Omesh and Huang,\n Jonathan\n},\n title = {\n Uncertainty-Aware Audiovisual Activity Recognition Using Deep Bayesian Variational Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unconstrained Foreground Object Search", @@ -31192,7 +32184,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2019_ICCV,\n \n author = {\n Zhao,\n Yinan and Price,\n Brian and Cohen,\n Scott and Gurari,\n Danna\n},\n title = {\n Unconstrained Foreground Object Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unconstrained Motion Deblurring for Dual-Lens Cameras", @@ -31225,7 +32218,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Mohan_2019_ICCV,\n \n author = {\n Mohan,\n M. R. Mahesh and Girish,\n Sharath and Rajagopalan,\n A. N.\n},\n title = {\n Unconstrained Motion Deblurring for Dual-Lens Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Understanding Deep Networks via Extremal Perturbations and Smooth Masks", @@ -31251,14 +32245,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Fong_Understanding_Deep_Networks_via_Extremal_Perturbations_and_Smooth_Masks_ICCV_2019_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "University of Oxford;Meta", + "aff_unique_norm": "University of Oxford;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.ox.ac.uk;https://research.facebook.com", "aff_unique_abbr": "Oxford;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Fong_2019_ICCV,\n \n author = {\n Fong,\n Ruth and Patrick,\n Mandela and Vedaldi,\n Andrea\n},\n title = {\n Understanding Deep Networks via Extremal Perturbations and Smooth Masks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Understanding Generalized Whitening and Coloring Transform for Universal Style Transfer", @@ -31266,6 +32261,7 @@ 
"status": "Oral", "track": "main", "pid": "5752", + "author_site": "Tai-Yin Chiu", "author": "Tai-Yin Chiu", "abstract": "Style transfer is a task of rendering images in the styles of other images. In the past few years, neural style transfer has achieved a great success in this task, yet suffers from either the inability to generalize to unseen style images or fast style transfer. Recently, an universal style transfer technique that applies zero-phase component analysis (ZCA) for whitening and coloring image features realizes fast and arbitrary style transfer. However, using ZCA for style transfer is empirical and does not have any theoretical support. In addition, other whitening and coloring transforms (WCT) than ZCA have not been investigated. In this report, we generalize ZCA to the general form of WCT, provide an analytical performance analysis from the angle of neural style transfer, and show why ZCA is a good choice for style transfer among different WCTs and why some WCTs are not well applicable for style transfer.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Chiu_Understanding_Generalized_Whitening_and_Coloring_Transform_for_Universal_Style_Transfer_ICCV_2019_paper.pdf", @@ -31290,7 +32286,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chiu_2019_ICCV,\n \n author = {\n Chiu,\n Tai-Yin\n},\n title = {\n Understanding Generalized Whitening and Coloring Transform for Universal Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Understanding Human Gaze Communication by Spatio-Temporal Graph Reasoning", @@ -31302,7 +32299,7 @@ "author": "Lifeng Fan; Wenguan Wang; Siyuan Huang; Xinyu Tang; Song-Chun Zhu", "abstract": "This paper addresses a new 
problem of understanding human gaze communication in social videos from both atomic-level and event-level, which is significant for studying human social interactions. To tackle this novel and challenging problem, we contribute a large-scale video dataset, VACATION, which covers diverse daily social scenes and gaze communication behaviors with complete annotations of objects and human faces, human attention, and communication structures and labels in both atomic-level and event-level. Together with VACATION, we propose a spatio-temporal graph neural network to explicitly represent the diverse gaze interactions in the social scenes and to infer atomic-level gaze communication by message passing. We further propose an event network with encoder-decoder structure to predict the event-level gaze communication. Our experiments demonstrate that the proposed model improves various baselines significantly in predicting the atomic-level and event-level gaze communications.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Fan_Understanding_Human_Gaze_Communication_by_Spatio-Temporal_Graph_Reasoning_ICCV_2019_paper.pdf", - "aff": "Center for Vision, Cognition, Learning and Autonomy, UCLA, USA+Inception Institute of Arti\ufb01cial Intelligence, UAE; Center for Vision, Cognition, Learning and Autonomy, UCLA, USA; Center for Vision, Cognition, Learning and Autonomy, UCLA, USA; University of Science and Technology of China, China; Center for Vision, Cognition, Learning and Autonomy, UCLA, USA", + "aff": "Center for Vision, Cognition, Learning and Autonomy, UCLA, USA+Inception Institute of Artificial Intelligence, UAE; Center for Vision, Cognition, Learning and Autonomy, UCLA, USA; Center for Vision, Cognition, Learning and Autonomy, UCLA, USA; University of Science and Technology of China, China; Center for Vision, Cognition, Learning and Autonomy, UCLA, USA", "project": "", "github": "https://github.com/LifengFan/Human-Gaze-Communication", "supp": "", @@ -31323,7 
+32320,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+1;0;0;2;0", - "aff_country_unique": "United States;United Arab Emirates;China" + "aff_country_unique": "United States;United Arab Emirates;China", + "bibtex": "@InProceedings{Fan_2019_ICCV,\n \n author = {\n Fan,\n Lifeng and Wang,\n Wenguan and Huang,\n Siyuan and Tang,\n Xinyu and Zhu,\n Song-Chun\n},\n title = {\n Understanding Human Gaze Communication by Spatio-Temporal Graph Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Universal Adversarial Perturbation via Prior Driven Uncertainty Approximation", @@ -31349,14 +32347,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Universal_Adversarial_Perturbation_via_Prior_Driven_Uncertainty_Approximation_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0;2;3;4;4", - "aff_unique_norm": "Xiamen University;Pengcheng Laboratory;Beihang University;Tsinghua University;Tencent", - "aff_unique_dep": "Department of Artificial Intelligence, School of Informatics;Peng Cheng Lab;;;Youtu Lab", + "aff_unique_norm": "Xiamen University;Peng Cheng Lab;Beihang University;Tsinghua University;Tencent", + "aff_unique_dep": "Department of Artificial Intelligence, School of Informatics;;;;Youtu Lab", "aff_unique_url": "https://www.xmu.edu.cn;;http://www.buaa.edu.cn/;https://www.tsinghua.edu.cn;https://www.tencent.com", "aff_unique_abbr": "XMU;;BUAA;THU;Tencent", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Hong and Ji,\n Rongrong and Li,\n Jie and Zhang,\n Baochang and Gao,\n Yue and Wu,\n Yongjian and Huang,\n Feiyue\n},\n title = {\n Universal Adversarial 
Perturbation via Prior Driven Uncertainty Approximation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Universal Perturbation Attack Against Image Retrieval", @@ -31368,7 +32367,7 @@ "author": "Jie Li; Rongrong Ji; Hong Liu; Xiaopeng Hong; Yue Gao; Qi Tian", "abstract": "Universal adversarial perturbations (UAPs), a.k.a. input-agnostic perturbations, has been proved to exist and be able to fool cutting-edge deep learning models on most of the data samples. Existing UAP methods mainly focus on attacking image classification models. Nevertheless, little attention has been paid to attacking image retrieval systems. In this paper, we make the first attempt in attacking image retrieval systems. Concretely, image retrieval attack is to make the retrieval system return irrelevant images to the query at the top ranking list. It plays an important role to corrupt the neighbourhood relationships among features in image retrieval attack. To this end, we propose a novel method to generate retrieval-against UAP to break the neighbourhood relationships of image features via degrading the corresponding ranking metric. To expand the attack method to scenarios with varying input sizes or untouchable network parameters, a multi-scale random resizing scheme and a ranking distillation strategy are proposed. We evaluate the proposed method on four widely-used image retrieval datasets, and report a significant performance drop in terms of different metrics, such as mAP and mP@10. 
Finally, we test our attack methods on the real-world visual search engine, i.e., Google Images, which demonstrates the practical potentials of our methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Li_Universal_Perturbation_Attack_Against_Image_Retrieval_ICCV_2019_paper.pdf", - "aff": "Department of Artificial Intelligence, School of Informatics, Xiamen University; Department of Artificial Intelligence, School of Informatics, Xiamen University + Peng Cheng Lab, Shenzhen, China; Department of Artificial Intelligence, School of Informatics, Xiamen University; MOE Key Lab. for Intelligent Networks and Network Security/Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University, PRC + University of Oulu, Finland; Tsinghua University; Huawei Noah\u2019s Ark Lab", + "aff": "Department of Artificial Intelligence, School of Informatics, Xiamen University; Department of Artificial Intelligence, School of Informatics, Xiamen University + Peng Cheng Lab, Shenzhen, China; Department of Artificial Intelligence, School of Informatics, Xiamen University; MOE Key Lab. 
for Intelligent Networks and Network Security/Faculty of Electronic and Information Engineering, Xi’an Jiaotong University, PRC + University of Oulu, Finland; Tsinghua University; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -31382,14 +32381,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Universal_Perturbation_Attack_Against_Image_Retrieval_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0;2+3;4;5", - "aff_unique_norm": "Xiamen University;Pengcheng Laboratory;Xi'an Jiao Tong University;University of Oulu;Tsinghua University;Huawei", - "aff_unique_dep": "Department of Artificial Intelligence, School of Informatics;Peng Cheng Lab;Faculty of Electronic and Information Engineering;;;Noah\u2019s Ark Lab", + "aff_unique_norm": "Xiamen University;Peng Cheng Lab;Xi'an Jiaotong University;University of Oulu;Tsinghua University;Huawei", + "aff_unique_dep": "Department of Artificial Intelligence, School of Informatics;;Faculty of Electronic and Information Engineering;;;Noah’s Ark Lab", "aff_unique_url": "https://www.xmu.edu.cn;;http://www.xjtu.edu.cn;https://www.oulu.fi;https://www.tsinghua.edu.cn;https://www.huawei.com", "aff_unique_abbr": "XMU;;XJTU;UOulu;THU;Huawei", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0+0;0;0+1;0;0", - "aff_country_unique": "China;Finland" + "aff_country_unique": "China;Finland", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Jie and Ji,\n Rongrong and Liu,\n Hong and Hong,\n Xiaopeng and Gao,\n Yue and Tian,\n Qi\n},\n title = {\n Universal Perturbation Attack Against Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Universal Semi-Supervised Semantic Segmentation", @@ -31422,7 +32422,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Hyderabad;San Diego", 
"aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Kalluri_2019_ICCV,\n \n author = {\n Kalluri,\n Tarun and Varma,\n Girish and Chandraker,\n Manmohan and Jawahar,\n C.V.\n},\n title = {\n Universal Semi-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Universally Slimmable Networks and Improved Training Techniques", @@ -31448,14 +32449,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Universally_Slimmable_Networks_and_Improved_Training_Techniques_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Jiahui and Huang,\n Thomas S.\n},\n title = {\n Universally Slimmable Networks and Improved Training Techniques\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unpaired Image Captioning via Scene Graph Alignments", @@ -31481,14 +32483,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gu_Unpaired_Image_Captioning_via_Scene_Graph_Alignments_ICCV_2019_paper.html", "aff_unique_index": "0;0+1;0+2;3;0;4", - "aff_unique_norm": "Nanyang Technological University;Salesforce Research Asia;Monash University;Adobe;Alibaba Group", - "aff_unique_dep": ";;;Adobe 
Research;", + "aff_unique_norm": "Nanyang Technological University;Salesforce Research Asia;Monash University;Adobe Research;Alibaba Group", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ntu.edu.sg;https://research.salesforce.com;https://www.monash.edu;https://research.adobe.com;https://www.alibaba.com", "aff_unique_abbr": "NTU;SRA;Monash;Adobe;Alibaba", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+1;2;0;3", - "aff_country_unique": "Singapore;Australia;United States;China" + "aff_country_unique": "Singapore;Australia;United States;China", + "bibtex": "@InProceedings{Gu_2019_ICCV,\n \n author = {\n Gu,\n Jiuxiang and Joty,\n Shafiq and Cai,\n Jianfei and Zhao,\n Handong and Yang,\n Xu and Wang,\n Gang\n},\n title = {\n Unpaired Image Captioning via Scene Graph Alignments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unpaired Image-to-Speech Synthesis With Multimodal Information Bottleneck", @@ -31514,14 +32517,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Ma_Unpaired_Image-to-Speech_Synthesis_With_Multimodal_Information_Bottleneck_ICCV_2019_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "State University of New York at Buffalo;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "State University of New York at Buffalo;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.buffalo.edu;https://www.microsoft.com", "aff_unique_abbr": "SUNY Buffalo;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Buffalo;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ma_2019_ICCV,\n \n author = {\n Ma,\n Shuang and McDuff,\n Daniel and Song,\n Yale\n},\n title = {\n Unpaired 
Image-to-Speech Synthesis With Multimodal Information Bottleneck\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised 3D Reconstruction Networks", @@ -31533,7 +32537,7 @@ "author": "Geonho Cha; Minsik Lee; Songhwai Oh", "abstract": "In this paper, we propose 3D unsupervised reconstruction networks (3D-URN), which reconstruct the 3D structures of instances in a given object category from their 2D feature points under an orthographic camera model. 3D-URN consists of a 3D shape reconstructor and a rotation estimator, which are trained in a fully-unsupervised manner incorporating the proposed unsupervised loss functions. The role of the 3D shape reconstructor is to reconstruct the 3D shape of an instance from its 2D feature points, and the rotation estimator infers the camera pose. After training, 3D-URN can infer the 3D structure of an unseen instance in the same category, which is not possible in the conventional schemes of non-rigid structure from motion and structure from category. 
The experimental result shows the state-of-the-art performance, which demonstrates the effectiveness of the proposed method.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Cha_Unsupervised_3D_Reconstruction_Networks_ICCV_2019_paper.pdf", - "aff": "Electrical and Computer Engineering, ASRI, Seoul National University, Korea\u2020; Division of Electrical Engineering, Hanyang University, Korea\u2021; Electrical and Computer Engineering, ASRI, Seoul National University, Korea\u2020", + "aff": "Electrical and Computer Engineering, ASRI, Seoul National University, Korea†; Division of Electrical Engineering, Hanyang University, Korea‡; Electrical and Computer Engineering, ASRI, Seoul National University, Korea†", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Cha_Unsupervised_3D_Reconstruction_ICCV_2019_supplemental.pdf", @@ -31554,7 +32558,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cha_2019_ICCV,\n \n author = {\n Cha,\n Geonho and Lee,\n Minsik and Oh,\n Songhwai\n},\n title = {\n Unsupervised 3D Reconstruction Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Collaborative Learning of Keyframe Detection and Visual Odometry Towards Monocular Deep SLAM", @@ -31580,14 +32585,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sheng_Unsupervised_Collaborative_Learning_of_Keyframe_Detection_and_Visual_Odometry_Towards_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;3", - "aff_unique_norm": "Beihang University;University of Oxford;University of Sydney;Chinese University of Hong Kong", + "aff_unique_norm": "Beihang University;University of Oxford;The University of 
Sydney;The Chinese University of Hong Kong", "aff_unique_dep": "College of Software;;;CUHK-SenseTime Joint Lab", "aff_unique_url": "http://www.buaa.edu.cn;https://www.ox.ac.uk;https://www.sydney.edu.au;https://www.cuhk.edu.hk", "aff_unique_abbr": "Beihang;Oxford;USYD;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;2;0", - "aff_country_unique": "China;United Kingdom;Australia" + "aff_country_unique": "China;United Kingdom;Australia", + "bibtex": "@InProceedings{Sheng_2019_ICCV,\n \n author = {\n Sheng,\n Lu and Xu,\n Dan and Ouyang,\n Wanli and Wang,\n Xiaogang\n},\n title = {\n Unsupervised Collaborative Learning of Keyframe Detection and Visual Odometry Towards Monocular Deep SLAM\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Deep Learning for Structured Shape Matching", @@ -31599,7 +32605,7 @@ "author": "Jean-Michel Roufosse; Abhishek Sharma; Maks Ovsjanikov", "abstract": "We present a novel method for computing correspondences across 3D shapes using unsupervised learning. Our method computes a non-linear transformation of given descriptor functions, while optimizing for global structural properties of the resulting maps, such as their bijectivity or approximate isometry. To this end, we use the functional maps framework, and build upon the recent FMNet architecture for descriptor learning. Unlike that approach, however, we show that learning can be done in a purely unsupervised setting, without having access to any ground truth correspondences. This results in a very general shape matching method that we call SURFMNet for Spectral Unsupervised FMNet, and which can be used to establish correspondences within 3D shape collections without any prior information. 
We demonstrate on a wide range of challenging benchmarks, that our approach leads to state-of-the-art results compared to the existing unsupervised methods and achieves results that are comparable even to the supervised learning techniques. Moreover, our framework is an order of magnitude faster, and does not rely on geodesic distance computation or expensive post-processing.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Roufosse_Unsupervised_Deep_Learning_for_Structured_Shape_Matching_ICCV_2019_paper.pdf", - "aff": "LIX, \u00b4Ecole Polytechnique; LIX, \u00b4Ecole Polytechnique; LIX, \u00b4Ecole Polytechnique", + "aff": "LIX, ´Ecole Polytechnique; LIX, ´Ecole Polytechnique; LIX, ´Ecole Polytechnique", "project": "", "github": "", "supp": "http://openaccess.thecvf.com/content_ICCV_2019/supplemental/Roufosse_Unsupervised_Deep_Learning_ICCV_2019_supplemental.pdf", @@ -31620,7 +32626,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Roufosse_2019_ICCV,\n \n author = {\n Roufosse,\n Jean-Michel and Sharma,\n Abhishek and Ovsjanikov,\n Maks\n},\n title = {\n Unsupervised Deep Learning for Structured Shape Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Domain Adaptation via Regularized Conditional Alignment", @@ -31653,7 +32660,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cicek_2019_ICCV,\n \n author = {\n Cicek,\n Safa and Soatto,\n Stefano\n},\n title = {\n Unsupervised Domain Adaptation via Regularized Conditional Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Graph Association for Person Re-Identification", @@ -31665,9 +32673,9 @@ "author": "Jinlin Wu; Yang Yang; Hao Liu; Shengcai Liao; Zhen Lei; Stan Z. Li", "abstract": "In this paper, we propose an unsupervised graph association (UGA) framework to learn the underlying viewinvariant representations from the video pedestrian tracklets. The core points of UGA are mining the underlying cross-view associations and reducing the damage of noise associations. To this end, UGA is adopts a two-stage training strategy: (1) intra-camera learning stage and (2) intercamera learning stage. The former learns the intra-camera representation for each camera. While the latter builds a cross-view graph (CVG) to associate different cameras. By doing this, we can learn view-invariant representation for all person. Extensive experiments and ablation studies on seven re-id datasets demonstrate the superiority of the proposed UGA over most state-of-the-art unsupervised and domain adaptation re-id methods.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wu_Unsupervised_Graph_Association_for_Person_Re-Identification_ICCV_2019_paper.pdf", - "aff": "CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; Inception Institute of Arti\ufb01cial Intelligence (IIAI), Abu Dhabi, UAE; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, 
China + University of Chinese Academy of Sciences, Beijing, China", + "aff": "CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; Inception Institute of Artificial Intelligence (IIAI), Abu Dhabi, UAE; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; CBSR & NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China", "project": "", - "github": "https://github.com/yichuan9527/Unsupervised-Graph-Association-for-Person-Re-identi\ufb01cation", + "github": "https://github.com/yichuan9527/Unsupervised-Graph-Association-for-Person-Re-identification", "supp": "", "arxiv": "", "pdf_size": 931995, @@ -31686,7 +32694,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;1;0+0;0+0", "aff_campus_unique": "Beijing;Abu Dhabi", "aff_country_unique_index": "0+0;0+0;0+0;1;0+0;0+0", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Jinlin and Yang,\n Yang and Liu,\n Hao and Liao,\n Shengcai and Lei,\n Zhen and Li,\n Stan Z.\n},\n title = {\n Unsupervised Graph Association for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised High-Resolution Depth Learning From Videos With Dual Networks", @@ -31712,14 +32721,15 @@ "author_num": 4, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Unsupervised_High-Resolution_Depth_Learning_From_Videos_With_Dual_Networks_ICCV_2019_paper.html", "aff_unique_index": "0;1;0;1", - "aff_unique_norm": "Tsinghua University;Microsoft", + "aff_unique_norm": "Tsinghua University;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "THU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Junsheng and Wang,\n Yuwang and Qin,\n Kaihuai and Zeng,\n Wenjun\n},\n title = {\n Unsupervised High-Resolution Depth Learning From Videos With Dual Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Learning of Landmarks by Descriptor Vector Exchange", @@ -31752,7 +32762,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "1;1;1", - "aff_country_unique": ";United Kingdom" + "aff_country_unique": ";United Kingdom", + "bibtex": "@InProceedings{Thewlis_2019_ICCV,\n \n author = {\n Thewlis,\n James and Albanie,\n Samuel and Bilen,\n Hakan and Vedaldi,\n Andrea\n},\n title = {\n Unsupervised Learning of Landmarks by Descriptor Vector Exchange\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "id": "611a9645ed", @@ -31774,14 +32785,15 @@ "email": ";;;", "author_num": 4, "aff_unique_index": "0;0+1;0+2;0+2", - "aff_unique_norm": "Tel Aviv University;Meta;Tel-Aviv University", + "aff_unique_norm": "Tel Aviv University;Facebook;Tel-Aviv University", 
"aff_unique_dep": "School of Computer Science;Facebook AI Research;Sagol School of Neuroscience", "aff_unique_url": "https://www.tau.ac.il;https://research.facebook.com;https://www.tau.ac.il", "aff_unique_abbr": "TAU;FAIR;TAU", "aff_campus_unique_index": "0;0;0+2;0+2", "aff_campus_unique": "Tel Aviv;;Tel-Aviv", "aff_country_unique_index": "0;0+1;0+0;0+0", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Gur_2019_ICCV,\n \n author = {\n Gur,\n Shir and Wolf,\n Lior and Golgher,\n Lior and Blinder,\n Pablo\n},\n title = {\n Unsupervised Microvascular Image Segmentation Using an Active Contours Mimicking Neural Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Multi-Task Feature Learning on Point Clouds", @@ -31814,7 +32826,8 @@ "aff_campus_unique_index": "0;1", "aff_campus_unique": "Toronto;San Francisco", "aff_country_unique_index": "0;1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Hassani_2019_ICCV,\n \n author = {\n Hassani,\n Kaveh and Haley,\n Mike\n},\n title = {\n Unsupervised Multi-Task Feature Learning on Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Neural Quantization for Compressed-Domain Similarity Search", @@ -31847,7 +32860,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Moscow", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Morozov_2019_ICCV,\n \n author = {\n Morozov,\n Stanislav and Babenko,\n Artem\n},\n title = {\n Unsupervised Neural Quantization for Compressed-Domain Similarity Search\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Out-of-Distribution Detection by Maximum Classifier Discrepancy", @@ -31873,14 +32887,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yu_Unsupervised_Out-of-Distribution_Detection_by_Maximum_Classifier_Discrepancy_ICCV_2019_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Tokyo", + "aff_unique_norm": "The University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Qing and Aizawa,\n Kiyoharu\n},\n title = {\n Unsupervised Out-of-Distribution Detection by Maximum Classifier Discrepancy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Person Re-Identification by Camera-Aware Similarity Consistency Learning", @@ -31906,14 +32921,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wu_Unsupervised_Person_Re-Identification_by_Camera-Aware_Similarity_Consistency_Learning_ICCV_2019_paper.html", "aff_unique_index": "0;0+1+2;0+3", - "aff_unique_norm": "Sun Yat-sen University;Pengcheng Laboratory;Key Laboratory of Machine Intelligence and Advanced Computing;Guangdong Province Key Laboratory of Information Security", - "aff_unique_dep": "School of Electronics and Information Technology;Peng Cheng Laboratory;Ministry of Education;Information Security", + "aff_unique_norm": "Sun Yat-sen University;Peng Cheng Laboratory;Key Laboratory of Machine Intelligence and Advanced Computing;Guangdong Province Key 
Laboratory of Information Security", + "aff_unique_dep": "School of Electronics and Information Technology;;Ministry of Education;Information Security", "aff_unique_url": "http://www.sysu.edu.cn;;;", "aff_unique_abbr": "SYSU;;;", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2019_ICCV,\n \n author = {\n Wu,\n Ancong and Zheng,\n Wei-Shi and Lai,\n Jian-Huang\n},\n title = {\n Unsupervised Person Re-Identification by Camera-Aware Similarity Consistency Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Pre-Training of Image Features on Non-Curated Data", @@ -31937,7 +32953,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Caron_Unsupervised_Pre-Training_of_Image_Features_on_Non-Curated_Data_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Caron_Unsupervised_Pre-Training_of_Image_Features_on_Non-Curated_Data_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Caron_2019_ICCV,\n \n author = {\n Caron,\n Mathilde and Bojanowski,\n Piotr and Mairal,\n Julien and Joulin,\n Armand\n},\n title = {\n Unsupervised Pre-Training of Image Features on Non-Curated Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Procedure Learning via Joint Dynamic Summarization", @@ -31970,7 +32987,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Elhamifar_2019_ICCV,\n \n author = {\n Elhamifar,\n Ehsan 
and Naing,\n Zwe\n},\n title = {\n Unsupervised Procedure Learning via Joint Dynamic Summarization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Robust Disentangling of Latent Characteristics for Image Synthesis", @@ -31978,7 +32996,7 @@ "status": "Poster", "track": "main", "pid": "3555", - "author_site": "Patrick Esser, Johannes Haux, Bj\u00c3\u00b6rn Ommer", + "author_site": "Patrick Esser, Johannes Haux, Björn Ommer", "author": "Patrick Esser; Johannes Haux; Bjorn Ommer", "abstract": "Deep generative models come with the promise to learn an explainable representation for visual objects that allows image sampling, synthesis, and selective modification. The main challenge is to learn to properly model the independent latent characteristics of an object, especially its appearance and pose. We present a novel approach that learns disentangled representations of these characteristics and explains them individually. Training requires only pairs of images depicting the same object appearance, but no pose annotations. We propose an additional classifier that estimates the minimal amount of regularization required to enforce disentanglement. Thus both representations together can completely explain an image while being independent of each other. Previous methods based on adversarial approaches fail to enforce this independence, while methods based on variational approaches lead to uninformative representations. In experiments on diverse object categories, the approach successfully recombines pose and appearance to reconstruct and retarget novel synthesized images. We achieve significant improvements over state-of-the-art methods which utilize the same level of supervision, and reach performances comparable to those of pose-supervised approaches. 
However, we can handle the vast body of articulated object classes for which no pose models/annotations are available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Esser_Unsupervised_Robust_Disentangling_of_Latent_Characteristics_for_Image_Synthesis_ICCV_2019_paper.pdf", @@ -32003,7 +33021,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Esser_2019_ICCV,\n \n author = {\n Esser,\n Patrick and Haux,\n Johannes and Ommer,\n Bjorn\n},\n title = {\n Unsupervised Robust Disentangling of Latent Characteristics for Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Unsupervised Video Interpolation Using Cycle Consistency", @@ -32029,14 +33048,15 @@ "author_num": 9, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Reda_Unsupervised_Video_Interpolation_Using_Cycle_Consistency_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "NVIDIA", - "aff_unique_dep": "NVIDIA Corporation", + "aff_unique_norm": "NVIDIA Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Reda_2019_ICCV,\n \n author = {\n Reda,\n Fitsum A. and Sun,\n Deqing and Dundar,\n Aysegul and Shoeybi,\n Mohammad and Liu,\n Guilin and Shih,\n Kevin J. 
and Tao,\n Andrew and Kautz,\n Jan and Catanzaro,\n Bryan\n},\n title = {\n Unsupervised Video Interpolation Using Cycle Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "UprightNet: Geometry-Aware Camera Orientation Estimation From Single Images", @@ -32069,7 +33089,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cornell Tech;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xian_2019_ICCV,\n \n author = {\n Xian,\n Wenqi and Li,\n Zhengqi and Fisher,\n Matthew and Eisenmann,\n Jonathan and Shechtman,\n Eli and Snavely,\n Noah\n},\n title = {\n UprightNet: Geometry-Aware Camera Orientation Estimation From Single Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "VTNFP: An Image-Based Virtual Try-On Network With Body and Clothing Feature Preservation", @@ -32102,7 +33123,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yu_2019_ICCV,\n \n author = {\n Yu,\n Ruiyun and Wang,\n Xiaoqi and Xie,\n Xiaohui\n},\n title = {\n VTNFP: An Image-Based Virtual Try-On Network With Body and Clothing Feature Preservation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "VV-Net: Voxel VAE Net With Group Convolutions for Point Cloud Segmentation", @@ -32135,7 +33157,8 @@ "aff_campus_unique_index": "0;2;3;0", "aff_campus_unique": "College Park;;Beijing;Cardiff", "aff_country_unique_index": "0+1;1;2;0", - 
"aff_country_unique": "United States;China;United Kingdom" + "aff_country_unique": "United States;China;United Kingdom", + "bibtex": "@InProceedings{Meng_2019_ICCV,\n \n author = {\n Meng,\n Hsien-Yu and Gao,\n Lin and Lai,\n Yu-Kun and Manocha,\n Dinesh\n},\n title = {\n VV-Net: Voxel VAE Net With Group Convolutions for Point Cloud Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "VaTeX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research", @@ -32159,7 +33182,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_VaTeX_A_Large-Scale_High-Quality_Multilingual_Dataset_for_Video-and-Language_Research_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Wang_VaTeX_A_Large-Scale_High-Quality_Multilingual_Dataset_for_Video-and-Language_Research_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Xin and Wu,\n Jiawei and Chen,\n Junkun and Li,\n Lei and Wang,\n Yuan-Fang and Wang,\n William Yang\n},\n title = {\n VaTeX: A Large-Scale,\n High-Quality Multilingual Dataset for Video-and-Language Research\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Variable Rate Deep Image Compression With a Conditional Autoencoder", @@ -32185,14 +33209,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Choi_Variable_Rate_Deep_Image_Compression_With_a_Conditional_Autoencoder_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Samsung", + "aff_unique_norm": "Samsung Semiconductor Inc.", "aff_unique_dep": "SoC R&D", - "aff_unique_url": "https://www.samsung.com/us\u534a\u5bfc\u4f53/", + "aff_unique_url": 
"https://www.samsung.com/us/", "aff_unique_abbr": "SSI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Choi_2019_ICCV,\n \n author = {\n Choi,\n Yoojin and El-Khamy,\n Mostafa and Lee,\n Jungwon\n},\n title = {\n Variable Rate Deep Image Compression With a Conditional Autoencoder\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Variational Adversarial Active Learning", @@ -32225,7 +33250,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Sinha_2019_ICCV,\n \n author = {\n Sinha,\n Samarth and Ebrahimi,\n Sayna and Darrell,\n Trevor\n},\n title = {\n Variational Adversarial Active Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Variational Few-Shot Learning", @@ -32237,7 +33263,7 @@ "author": "Jian Zhang; Chenglong Zhao; Bingbing Ni; Minghao Xu; Xiaokang Yang", "abstract": "We propose a variational Bayesian framework for enhancing few-shot learning performance. This idea is motivated by the fact that single point based metric learning approaches are inherently noise-vulnerable and easy-to-be-biased. In a nutshell, stochastic variational inference is invoked to approximate bias-eliminated class specific sample distributions. In the meantime, a classifier-free prediction is attained by leveraging the distribution statistics on novel samples. 
Extensive experimental results on several benchmarks well demonstrate the effectiveness of our distribution-driven few-shot learning framework over previous point estimates based methods, in terms of superior classification accuracy and robustness.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Zhang_Variational_Few-Shot_Learning_ICCV_2019_paper.pdf", - "aff": "Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University", + "aff": "Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University", "project": "", "github": "", "supp": "", @@ -32255,10 +33281,11 @@ "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", - "aff_campus_unique_index": "1", - "aff_campus_unique": ";Shanghai", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Jian and Zhao,\n Chenglong and Ni,\n Bingbing and Xu,\n Minghao and Yang,\n Xiaokang\n},\n title = {\n Variational Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Variational Uncalibrated Photometric Stereo Under General Lighting", @@ -32266,7 +33293,7 @@ "status": "Poster", "track": "main", "pid": "1335", - "author_site": "Bjoern Haefner, Zhenzhang Ye, Maolin Gao, Tao Wu, Yvain Qu\u00c3\u00a9au, Daniel Cremers", + "author_site": "Bjoern Haefner, Zhenzhang Ye, Maolin Gao, Tao Wu, Yvain Quéau, Daniel Cremers", "author": "Bjoern Haefner; Zhenzhang Ye; Maolin Gao; Tao 
Wu; Yvain Queau; Daniel Cremers", "abstract": "Photometric stereo (PS) techniques nowadays remain constrained to an ideal laboratory setup where modeling and calibration of lighting is amenable. To eliminate such restrictions, we propose an efficient principled variational approach to uncalibrated PS under general illumination. To this end, the Lambertian reflectance model is approximated through a spherical harmonic expansion, which preserves the spatial invariance of the lighting. The joint recovery of shape, reflectance and illumination is then formulated as a single variational problem. There the shape estimation is carried out directly in terms of the underlying perspective depth map, thus implicitly ensuring integrability and bypassing the need for a subsequent normal integration. To tackle the resulting nonconvex problem numerically, we undertake a two-phase procedure to initialize a balloon-like perspective depth map, followed by a \"lagged\" block coordinate descent scheme. The experiments validate efficiency and robustness of this approach. 
Across a variety of evaluations, we are able to reduce the mean angular error consistently by a factor of 2-3 compared to the state-of-the-art.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Haefner_Variational_Uncalibrated_Photometric_Stereo_Under_General_Lighting_ICCV_2019_paper.pdf", @@ -32291,7 +33318,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;2;0", - "aff_country_unique": "Germany;;France" + "aff_country_unique": "Germany;;France", + "bibtex": "@InProceedings{Haefner_2019_ICCV,\n \n author = {\n Haefner,\n Bjoern and Ye,\n Zhenzhang and Gao,\n Maolin and Wu,\n Tao and Queau,\n Yvain and Cremers,\n Daniel\n},\n title = {\n Variational Uncalibrated Photometric Stereo Under General Lighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Vehicle Re-Identification With Viewpoint-Aware Metric Learning", @@ -32324,7 +33352,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chu_2019_ICCV,\n \n author = {\n Chu,\n Ruihang and Sun,\n Yifan and Li,\n Yadong and Liu,\n Zheng and Zhang,\n Chi and Wei,\n Yichen\n},\n title = {\n Vehicle Re-Identification With Viewpoint-Aware Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Vehicle Re-Identification in Aerial Imagery: Dataset and Approach", @@ -32336,7 +33365,7 @@ "author": "Peng Wang; Bingliang Jiao; Lu Yang; Yifei Yang; Shizhou Zhang; Wei Wei; Yanning Zhang", "abstract": "In this work, we construct a large-scale dataset for vehicle re-identification (ReID), which contains 137k images of 13k vehicle instances captured by UAV-mounted cameras. 
To our knowledge, it is the largest UAV-based vehicle ReID dataset. To increase intra-class variation, each vehicle is captured by at least two UAVs at different locations, with diverse view-angles and flight-altitudes. We manually label a variety of vehicle attributes, including vehicle type, color, skylight, bumper, spare tire and luggage rack. Furthermore, for each vehicle image, the annotator is also required to mark the discriminative parts that helps them to distinguish this particular vehicle from others. Besides the dataset, we also design a specific vehicle ReID algorithm to make full use of the rich annotation information. It is capable of explicitly detecting discriminative parts for each specific vehicle and significantly outperforming the evaluated baselines and state-of-the-art vehicle ReID approaches.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Wang_Vehicle_Re-Identification_in_Aerial_Imagery_Dataset_and_Approach_ICCV_2019_paper.pdf", - "aff": "School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, China+National Engineering Laboratory for Integrated 
Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi\u2019an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China", + "aff": "School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China; School of Computer Science and Engineering, Northwestern Polytechnical University, Xi’an, China+National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, China", "project": "", "github": "", "supp": "", @@ 
-32357,7 +33386,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Peng and Jiao,\n Bingliang and Yang,\n Lu and Yang,\n Yifei and Zhang,\n Shizhou and Wei,\n Wei and Zhang,\n Yanning\n},\n title = {\n Vehicle Re-Identification in Aerial Imagery: Dataset and Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Very Long Natural Scenery Image Prediction by Outpainting", @@ -32383,14 +33413,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Very_Long_Natural_Scenery_Image_Prediction_by_Outpainting_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;1;1;3", - "aff_unique_norm": "Southern University of Science and Technology;University of Technology Sydney;Qihoo 360;YITU Technology", + "aff_unique_norm": "Southern University of Science and Technology;University of Technology Sydney;Qihoo 360;Yitu Technology", "aff_unique_dep": "Joint Centre of CIS;ReLER;;", "aff_unique_url": "https://www.sustech.edu.cn;https://www.uts.edu.au;https://www.qihoo.net;https://www.yITU.cn", "aff_unique_abbr": "SUSTech;UTS;Qihoo;YITU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+1;0;1;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Zongxin and Dong,\n Jian and Liu,\n Ping and Yang,\n Yi and Yan,\n Shuicheng\n},\n title = {\n Very Long Natural Scenery Image Prediction by Outpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": 
"ViCo: Word Embeddings From Visual Co-Occurrences", @@ -32416,14 +33447,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Gupta_ViCo_Word_Embeddings_From_Visual_Co-Occurrences_ICCV_2019_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gupta_2019_ICCV,\n \n author = {\n Gupta,\n Tanmay and Schwing,\n Alexander and Hoiem,\n Derek\n},\n title = {\n ViCo: Word Embeddings From Visual Co-Occurrences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "ViSiL: Fine-Grained Spatio-Temporal Video Similarity Learning", @@ -32456,7 +33488,8 @@ "aff_campus_unique_index": "0+1;0;1;0", "aff_campus_unique": "Thessaloniki;Mile End", "aff_country_unique_index": "0+1;0;1;0", - "aff_country_unique": "Greece;United Kingdom" + "aff_country_unique": "Greece;United Kingdom", + "bibtex": "@InProceedings{Kordopatis-Zilos_2019_ICCV,\n \n author = {\n Kordopatis-Zilos,\n Giorgos and Papadopoulos,\n Symeon and Patras,\n Ioannis and Kompatsiaris,\n Ioannis\n},\n title = {\n ViSiL: Fine-Grained Spatio-Temporal Video Similarity Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Video Classification With Channel-Separated Convolutional Networks", @@ -32482,14 +33515,15 @@ "author_num": 4, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Tran_Video_Classification_With_Channel-Separated_Convolutional_Networks_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI", "aff_unique_url": "https://www.facebook.com", "aff_unique_abbr": "Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tran_2019_ICCV,\n \n author = {\n Tran,\n Du and Wang,\n Heng and Torresani,\n Lorenzo and Feiszli,\n Matt\n},\n title = {\n Video Classification With Channel-Separated Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Video Compression With Rate-Distortion Autoencoders", @@ -32522,7 +33556,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Habibian_2019_ICCV,\n \n author = {\n Habibian,\n Amirhossein and Rozendaal,\n Ties van and Tomczak,\n Jakub M. 
and Cohen,\n Taco S.\n},\n title = {\n Video Compression With Rate-Distortion Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Video Face Clustering With Unknown Number of Clusters", @@ -32548,14 +33583,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tapaswi_Video_Face_Clustering_With_Unknown_Number_of_Clusters_ICCV_2019_paper.html", "aff_unique_index": "0+1+2+3;1+2+3;1+2+3", - "aff_unique_norm": "INRIA;University of Toronto;Vector Institute;NVIDIA", - "aff_unique_dep": ";;;NVIDIA Corporation", + "aff_unique_norm": "Inria;University of Toronto;Vector Institute;NVIDIA Corporation", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.inria.fr;https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.nvidia.com", "aff_unique_abbr": "Inria;U of T;Vector Institute;NVIDIA", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1+1+2;1+1+2;1+1+2", - "aff_country_unique": "France;Canada;United States" + "aff_country_unique": "France;Canada;United States", + "bibtex": "@InProceedings{Tapaswi_2019_ICCV,\n \n author = {\n Tapaswi,\n Makarand and Law,\n Marc T. 
and Fidler,\n Sanja\n},\n title = {\n Video Face Clustering With Unknown Number of Clusters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Video Instance Segmentation", @@ -32581,14 +33617,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Video_Instance_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0+1;2;3", - "aff_unique_norm": "ByteDance;Snap Inc.;University of Illinois Urbana-Champaign;Adobe", + "aff_unique_norm": "ByteDance;Snap Inc.;University of Illinois at Urbana-Champaign;Adobe", "aff_unique_dep": "AI Lab;;;Adobe Research", "aff_unique_url": "https://www.bytedance.com;https://www.snapinc.com;https://www illinois.edu;https://research.adobe.com", "aff_unique_abbr": "ByteDance;Snap;UIUC;Adobe", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0+1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Linjie and Fan,\n Yuchen and Xu,\n Ning\n},\n title = {\n Video Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Video Object Segmentation Using Space-Time Memory Networks", @@ -32612,7 +33649,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Oh_Video_Object_Segmentation_Using_Space-Time_Memory_Networks_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Oh_Video_Object_Segmentation_Using_Space-Time_Memory_Networks_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Oh_2019_ICCV,\n \n author = {\n Oh,\n Seoung Wug and Lee,\n Joon-Young and Xu,\n Ning and Kim,\n Seon Joo\n},\n title = {\n 
Video Object Segmentation Using Space-Time Memory Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "VideoBERT: A Joint Model for Video and Language Representation Learning", @@ -32645,7 +33683,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2019_ICCV,\n \n author = {\n Sun,\n Chen and Myers,\n Austin and Vondrick,\n Carl and Murphy,\n Kevin and Schmid,\n Cordelia\n},\n title = {\n VideoBERT: A Joint Model for Video and Language Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "VideoMem: Constructing, Analyzing, Predicting Short-Term and Long-Term Video Memorability", @@ -32653,7 +33692,7 @@ "status": "Poster", "track": "main", "pid": "5131", - "author_site": "Romain Cohendet, Claire-H\u00c3\u00a9l\u00c3\u00a8ne Demarty, Ngoc Q. K. Duong, Martin Engilberge", + "author_site": "Romain Cohendet, Claire-Hélène Demarty, Ngoc Q. K. Duong, Martin Engilberge", "author": "Romain Cohendet; Claire-Helene Demarty; Ngoc Q. K. Duong; Martin Engilberge", "abstract": "Humans share a strong tendency to memorize/forget some of the visual information they encounter. This paper focuses on understanding the intrinsic memorability of visual content. To address this challenge, we introduce a large scale dataset (VideoMem) composed of 10,000 videos with memorability scores. In contrast to previous work on image memorability -- where memorability was measured a few minutes after memorization -- memory performance is measured twice: a few minutes and again 24-72 hours after memorization. 
Hence, the dataset comes with short-term and long-term memorability annotations. After an in-depth analysis of the dataset, we investigate various deep neural network-based models for the prediction of video memorability. Our best model using a ranking loss achieves a Spearman's rank correlation of 0.494 (respectively 0.256) for short-term (resp. long-term) memorability prediction, while our model with attention mechanism provides insights of what makes a content memorable. The VideoMem dataset with pre-extracted features is publicly available.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Cohendet_VideoMem_Constructing_Analyzing_Predicting_Short-Term_and_Long-Term_Video_Memorability_ICCV_2019_paper.pdf", @@ -32678,7 +33717,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Rennes", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Cohendet_2019_ICCV,\n \n author = {\n Cohendet,\n Romain and Demarty,\n Claire-Helene and Duong,\n Ngoc Q. K. 
and Engilberge,\n Martin\n},\n title = {\n VideoMem: Constructing,\n Analyzing,\n Predicting Short-Term and Long-Term Video Memorability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "View Confusion Feature Learning for Person Re-Identification", @@ -32702,7 +33742,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_View_Confusion_Feature_Learning_for_Person_Re-Identification_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_View_Confusion_Feature_Learning_for_Person_Re-Identification_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Fangyi and Zhang,\n Lei\n},\n title = {\n View Confusion Feature Learning for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "View Independent Generative Adversarial Network for Novel View Synthesis", @@ -32728,14 +33769,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Xu_View_Independent_Generative_Adversarial_Network_for_Novel_View_Synthesis_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong;Tencent", "aff_unique_dep": ";YouTu Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Tencent", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2019_ICCV,\n \n author = {\n Xu,\n Xiaogang and Chen,\n Ying-Cong and Jia,\n Jiaya\n},\n title = {\n View Independent Generative 
Adversarial Network for Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "View N-Gram Network for 3D Object Retrieval", @@ -32768,7 +33810,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{He_2019_ICCV,\n \n author = {\n He,\n Xinwei and Huang,\n Tengteng and Bai,\n Song and Bai,\n Xiang\n},\n title = {\n View N-Gram Network for 3D Object Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "View-Consistent 4D Light Field Superpixel Segmentation", @@ -32792,7 +33835,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Khan_View-Consistent_4D_Light_Field_Superpixel_Segmentation_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Khan_View-Consistent_4D_Light_Field_Superpixel_Segmentation_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Khan_2019_ICCV,\n \n author = {\n Khan,\n Numair and Zhang,\n Qian and Kasser,\n Lucas and Stone,\n Henry and Kim,\n Min H. 
and Tompkin,\n James\n},\n title = {\n View-Consistent 4D Light Field Superpixel Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "View-LSTM: Novel-View Video Synthesis Through View Decomposition", @@ -32825,7 +33869,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United Kingdom;Italy" + "aff_country_unique": "United Kingdom;Italy", + "bibtex": "@InProceedings{Lakhal_2019_ICCV,\n \n author = {\n Lakhal,\n Mohamed Ilyes and Lanz,\n Oswald and Cavallaro,\n Andrea\n},\n title = {\n View-LSTM: Novel-View Video Synthesis Through View Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Vision-Infused Deep Audio Inpainting", @@ -32851,14 +33896,15 @@ "author_num": 5, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Zhou_Vision-Infused_Deep_Audio_Inpainting_ICCV_2019_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong;The University of Hong Kong", "aff_unique_dep": "CUHK - SenseTime Joint Lab;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.hku.hk", "aff_unique_abbr": "CUHK;HKU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2019_ICCV,\n \n author = {\n Zhou,\n Hang and Liu,\n Ziwei and Xu,\n Xudong and Luo,\n Ping and Wang,\n Xiaogang\n},\n title = {\n Vision-Infused Deep Audio Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2019\n} \n}" }, { "title": "Visual Deprojection: Probabilistic Recovery of Collapsed Dimensions", @@ -32866,7 +33912,7 @@ "status": "Poster", "track": "main", "pid": "5128", - "author_site": "Guha Balakrishnan, Adrian V. Dalca, Amy Zhao, John V. Guttag, Fr\u00c3\u00a9do Durand, William T. Freeman", + "author_site": "Guha Balakrishnan, Adrian V. Dalca, Amy Zhao, John V. Guttag, Frédo Durand, William T. Freeman", "author": "Guha Balakrishnan; Adrian V. Dalca; Amy Zhao; John V. Guttag; Fredo Durand; William T. Freeman", "abstract": "We introduce visual deprojection: the task of recovering an image or video that has been collapsed along a dimension. Projections arise in various contexts, such as long-exposure photography, where a dynamic scene is collapsed in time to produce a motion-blurred image, and corner cameras, where reflected light from a scene is collapsed along a spatial dimension because of an edge occluder to yield a 1D video. Deprojection is ill-posed-- often there are many plausible solutions for a given input. We first propose a probabilistic model capturing the ambiguity of the task. We then present a variational inference strategy using convolutional neural networks as functional approximators. Sampling from the inference network at test time yields plausible candidates from the distribution of original signals that are consistent with a given input projection. We evaluate the method on several datasets for both spatial and temporal deprojection tasks. 
We first demonstrate the method can recover human gait videos and face images from spatial projections, and then show that it can recover videos of moving digits from dramatically motion-blurred images obtained via temporal projection.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Balakrishnan_Visual_Deprojection_Probabilistic_Recovery_of_Collapsed_Dimensions_ICCV_2019_paper.pdf", @@ -32891,7 +33937,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Balakrishnan_2019_ICCV,\n \n author = {\n Balakrishnan,\n Guha and Dalca,\n Adrian V. and Zhao,\n Amy and Guttag,\n John V. and Durand,\n Fredo and Freeman,\n William T.\n},\n title = {\n Visual Deprojection: Probabilistic Recovery of Collapsed Dimensions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Visual Semantic Reasoning for Image-Text Matching", @@ -32924,7 +33971,8 @@ "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Kunpeng and Zhang,\n Yulun and Li,\n Kai and Li,\n Yuanyuan and Fu,\n Yun\n},\n title = {\n Visual Semantic Reasoning for Image-Text Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Visualization of Convolutional Neural Networks for Monocular Depth Estimation", @@ -32957,7 +34005,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": 
"@InProceedings{Hu_2019_ICCV,\n \n author = {\n Hu,\n Junjie and Zhang,\n Yan and Okatani,\n Takayuki\n},\n title = {\n Visualization of Convolutional Neural Networks for Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Visualizing the Invisible: Occluded Vehicle Segmentation and Recovery", @@ -32983,14 +34032,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yan_Visualizing_the_Invisible_Occluded_Vehicle_Segmentation_and_Recovery_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0+1;1;2", - "aff_unique_norm": "Fuzhou University;South China University of Technology;University of Hong Kong", + "aff_unique_norm": "Fuzhou University;South China University of Technology;The University of Hong Kong", "aff_unique_dep": "College of Mathematics and Computer Science;School of Computer Science and Engineering;Department of Computer Science", "aff_unique_url": "https://www.fzu.edu.cn;https://www.scut.edu.cn;https://www.hku.hk", "aff_unique_abbr": "FZU;SCUT;HKU", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2019_ICCV,\n \n author = {\n Yan,\n Xiaosheng and Wang,\n Feigege and Liu,\n Wenxi and Yu,\n Yuanlong and He,\n Shengfeng and Pan,\n Jia\n},\n title = {\n Visualizing the Invisible: Occluded Vehicle Segmentation and Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "VrR-VG: Refocusing Visually-Relevant Relationships", @@ -33002,7 +34052,7 @@ "author": "Yuanzhi Liang; Yalong Bai; Wei Zhang; Xueming Qian; Li Zhu; Tao Mei", "abstract": "Relationships encode the interactions among individual instances 
and play a critical role in deep visual scene understanding. Suffering from the high predictability with non-visual information, relationship models tend to fit the statistical bias rather than \"learning\" to infer the relationships from images. To encourage further development in visual relationships, we propose a novel method to mine more valuable relationships by automatically pruning visually-irrelevant relationships. We construct a new scene graph dataset named Visually-Relevant Relationships Dataset (VrR-VG) based on Visual Genome. Compared with existing datasets, the performance gap between learnable and statistical method is more significant in VrR-VG, and frequency-based analysis does not work anymore. Moreover, we propose to learn a relationship-aware representation by jointly considering instances, attributes and relationships. By applying the representation-aware feature learned on VrR-VG, the performances of image captioning and visual question answering are systematically improved, which demonstrates the effectiveness of both our dataset and features embedding schema. 
Both our VrR-VG dataset and representation-aware features will be made publicly available soon.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Liang_VrR-VG_Refocusing_Visually-Relevant_Relationships_ICCV_2019_paper.pdf", - "aff": "Xi\u2019an Jiaotong University+JD AI Research; JD AI Research; JD AI Research; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; JD AI Research", + "aff": "Xi’an Jiaotong University+JD AI Research; JD AI Research; JD AI Research; Xi’an Jiaotong University; Xi’an Jiaotong University; JD AI Research", "project": "http://vrr-vg.com/", "github": "", "supp": "", @@ -33016,14 +34066,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liang_VrR-VG_Refocusing_Visually-Relevant_Relationships_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;1;0;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;JD", - "aff_unique_dep": ";JD AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;JD AI Research", + "aff_unique_dep": ";", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.jd.com", "aff_unique_abbr": "XJTU;JD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liang_2019_ICCV,\n \n author = {\n Liang,\n Yuanzhi and Bai,\n Yalong and Zhang,\n Wei and Qian,\n Xueming and Zhu,\n Li and Mei,\n Tao\n},\n title = {\n VrR-VG: Refocusing Visually-Relevant Relationships\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "WSOD2: Learning Bottom-Up and Top-Down Objectness Distillation for Weakly-Supervised Object Detection", @@ -33049,14 +34100,15 @@ "author_num": 5, "oa": 
"http://openaccess.thecvf.com/content_ICCV_2019/html/Zeng_WSOD2_Learning_Bottom-Up_and_Top-Down_Objectness_Distillation_for_Weakly-Supervised_Object_ICCV_2019_paper.html", "aff_unique_index": "0+0;1;1;0+0;1", - "aff_unique_norm": "Sun Yat-sen University;Microsoft", + "aff_unique_norm": "Sun Yat-sen University;Microsoft Corporation", "aff_unique_dep": "School of Data and Computer Science;Microsoft Research", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "SYSU;MSR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;1;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zeng_2019_ICCV,\n \n author = {\n Zeng,\n Zhaoyang and Liu,\n Bei and Fu,\n Jianlong and Chao,\n Hongyang and Zhang,\n Lei\n},\n title = {\n WSOD2: Learning Bottom-Up and Top-Down Objectness Distillation for Weakly-Supervised Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Wasserstein GAN With Quadratic Transport Cost", @@ -33080,7 +34132,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Wasserstein_GAN_With_Quadratic_Transport_Cost_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Wasserstein_GAN_With_Quadratic_Transport_Cost_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Huidong and Gu,\n Xianfeng and Samaras,\n Dimitris\n},\n title = {\n Wasserstein GAN With Quadratic Transport Cost\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Watch, Listen and Tell: Multi-Modal Weakly Supervised Dense Event Captioning", @@ 
-33113,7 +34166,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Rahman_2019_ICCV,\n \n author = {\n Rahman,\n Tanzila and Xu,\n Bicheng and Sigal,\n Leonid\n},\n title = {\n Watch,\n Listen and Tell: Multi-Modal Weakly Supervised Dense Event Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Wavelet Domain Style Transfer for an Effective Perception-Distortion Tradeoff in Single Image Super-Resolution", @@ -33146,7 +34200,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", - "aff_country_unique": "United Kingdom;Switzerland;China" + "aff_country_unique": "United Kingdom;Switzerland;China", + "bibtex": "@InProceedings{Deng_2019_ICCV,\n \n author = {\n Deng,\n Xin and Yang,\n Ren and Xu,\n Mai and Dragotti,\n Pier Luigi\n},\n title = {\n Wavelet Domain Style Transfer for an Effective Perception-Distortion Tradeoff in Single Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Weakly Aligned Cross-Modal Learning for Multispectral Pedestrian Detection", @@ -33179,7 +34234,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0;0;0+0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2019_ICCV,\n \n author = {\n Zhang,\n Lu and Zhu,\n Xiangyu and Chen,\n Xiangyu and Yang,\n Xu and Lei,\n Zhen and Liu,\n Zhiyong\n},\n title = {\n Weakly Aligned Cross-Modal Learning for Multispectral Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Weakly Supervised Energy-Based Learning for Action Segmentation", @@ -33205,14 +34261,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Weakly_Supervised_Energy-Based_Learning_for_Action_Segmentation_ICCV_2019_paper.html", "aff_unique_index": "0;1+0;0", - "aff_unique_norm": "Oregon State University;Amazon", - "aff_unique_dep": ";Amazon.com Services, Inc.", + "aff_unique_norm": "Oregon State University;Amazon.com Services, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://oregonstate.edu;https://www.amazon.com", "aff_unique_abbr": "OSU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Jun and Lei,\n Peng and Todorovic,\n Sinisa\n},\n title = {\n Weakly Supervised Energy-Based Learning for Action Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Weakly Supervised Object Detection With Segmentation Collaboration", @@ -33238,14 +34295,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Li_Weakly_Supervised_Object_Detection_With_Segmentation_Collaboration_ICCV_2019_paper.html", "aff_unique_index": "0+1;0+1;0+1+2;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Computing Technology;;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.cas.ac.cn;http://www.ucas.ac.cn;", "aff_unique_abbr": "CAS;UCAS;", 
"aff_campus_unique_index": "0+0;0+0;0+0+1;0+0", "aff_campus_unique": "Beijing;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2019_ICCV,\n \n author = {\n Li,\n Xiaoyan and Kan,\n Meina and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n Weakly Supervised Object Detection With Segmentation Collaboration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Weakly Supervised Temporal Action Localization Through Contrast Based Evaluation Networks", @@ -33271,14 +34329,15 @@ "author_num": 7, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Liu_Weakly_Supervised_Temporal_Action_Localization_Through_Contrast_Based_Evaluation_Networks_ICCV_2019_paper.html", "aff_unique_index": "0;0;1;2;2;0;3", - "aff_unique_norm": "Xi'an Jiao Tong University;HERE Technologies;Alibaba Group;Wormpex AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;HERE Technologies;Alibaba Group;Wormpex AI Research", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;;DAMO Academy;AI Research", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.here.com;https://www.alibaba-group.com;", "aff_unique_abbr": "XJTU;HERE;Alibaba;Wormpex AI", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Xi'an;;Israel Lab", "aff_country_unique_index": "0;0;1;0;2;0;3", - "aff_country_unique": "China;Finland;Israel;United States" + "aff_country_unique": "China;Finland;Israel;United States", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Ziyi and Wang,\n Le and Zhang,\n Qilin and Gao,\n Zhanning and Niu,\n Zhenxing and Zheng,\n Nanning and Hua,\n Gang\n},\n title = {\n Weakly Supervised Temporal Action Localization Through Contrast Based Evaluation Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Weakly-Supervised Action Localization With Background Modeling", @@ -33311,7 +34370,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Irvine;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nguyen_2019_ICCV,\n \n author = {\n Nguyen,\n Phuc Xuan and Ramanan,\n Deva and Fowlkes,\n Charless C.\n},\n title = {\n Weakly-Supervised Action Localization With Background Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "What Else Can Fool Deep Learning? Addressing Color Constancy Errors on Deep Neural Network Performance", @@ -33337,14 +34397,15 @@ "author_num": 2, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Afifi_What_Else_Can_Fool_Deep_Learning_Addressing_Color_Constancy_Errors_ICCV_2019_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "York University;Samsung", + "aff_unique_norm": "York University;Samsung AI Center", "aff_unique_dep": ";AI Center", "aff_unique_url": "https://yorku.ca;https://www.samsung.com/global/innovation/ai-research/", "aff_unique_abbr": "York U;SAC", "aff_campus_unique_index": "0;0+0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Afifi_2019_ICCV,\n \n author = {\n Afifi,\n Mahmoud and Brown,\n Michael S.\n},\n title = {\n What Else Can Fool Deep Learning? Addressing Color Constancy Errors on Deep Neural Network Performance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "What Is Wrong With Scene Text Recognition Model Comparisons? 
Dataset and Model Analysis", @@ -33377,7 +34438,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0;0", - "aff_country_unique": "South Korea;Japan" + "aff_country_unique": "South Korea;Japan", + "bibtex": "@InProceedings{Baek_2019_ICCV,\n \n author = {\n Baek,\n Jeonghun and Kim,\n Geewook and Lee,\n Junyeop and Park,\n Sungrae and Han,\n Dongyoon and Yun,\n Sangdoo and Oh,\n Seong Joon and Lee,\n Hwalsuk\n},\n title = {\n What Is Wrong With Scene Text Recognition Model Comparisons? Dataset and Model Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "What Synthesis Is Missing: Depth Adaptation Integrated With Weak Supervision for Indoor Scene Parsing", @@ -33410,7 +34472,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2019_ICCV,\n \n author = {\n Liu,\n Keng-Chi and Shen,\n Yi-Ting and Klopp,\n Jan P. and Chen,\n Liang-Gee\n},\n title = {\n What Synthesis Is Missing: Depth Adaptation Integrated With Weak Supervision for Indoor Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "What Would You Expect? Anticipating Egocentric Actions With Rolling-Unrolling LSTMs and Modality Attention", @@ -33443,7 +34506,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Furnari_2019_ICCV,\n \n author = {\n Furnari,\n Antonino and Farinella,\n Giovanni Maria\n},\n title = {\n What Would You Expect? 
Anticipating Egocentric Actions With Rolling-Unrolling LSTMs and Modality Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Where Is My Mirror?", @@ -33469,14 +34533,15 @@ "author_num": 6, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yang_Where_Is_My_Mirror_ICCV_2019_paper.html", "aff_unique_index": "0;0;0+1;0;0+2;1", - "aff_unique_norm": "Dalian University of Technology;City University of Hong Kong;Pengcheng Laboratory", - "aff_unique_dep": ";;Peng Cheng Laboratory", + "aff_unique_norm": "Dalian University of Technology;City University of Hong Kong;Peng Cheng Laboratory", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.dlut.edu.cn/;https://www.cityu.edu.hk;http://www.pcl.ac.cn", "aff_unique_abbr": "DUT;CityU;PCL", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2019_ICCV,\n \n author = {\n Yang,\n Xin and Mei,\n Haiyang and Xu,\n Ke and Wei,\n Xiaopeng and Yin,\n Baocai and Lau,\n Rynson W.H.\n},\n title = {\n Where Is My Mirror?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Why Does a Visual Question Have Different Answers?", @@ -33509,7 +34574,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Austin;Los Angeles", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bhattacharya_2019_ICCV,\n \n author = {\n Bhattacharya,\n Nilavra and Li,\n Qing and Gurari,\n Danna\n},\n title = {\n Why Does a Visual Question Have Different Answers?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "WoodScape: A Multi-Task, Multi-Camera Fisheye Dataset for Autonomous Driving", @@ -33517,7 +34583,7 @@ "status": "Oral", "track": "main", "pid": "630", - "author_site": "Senthil Yogamani, Ciar\u00c3\u00a1n Hughes, Jonathan Horgan, Ganesh Sistu, Padraig Varley, Derek O'Dea, Michal U\u00c5\u0099i\u00c4\u008d\u00c3\u00a1\u00c5\u0099, Stefan Milz, Martin Simon, Karl Amende, Christian Witt, Hazem Rashed, Sumanth Chennupati, Sanjaya Nayak, Saquib Mansoor, Xavier Perrotton, Patrick P\u00c3\u00a9rez", + "author_site": "Senthil Yogamani, Ciarán Hughes, Jonathan Horgan, Ganesh Sistu, Padraig Varley, Derek O'Dea, Michal Uřičář, Stefan Milz, Martin Simon, Karl Amende, Christian Witt, Hazem Rashed, Sumanth Chennupati, Sanjaya Nayak, Saquib Mansoor, Xavier Perrotton, Patrick Pérez", "author": "Senthil Yogamani; Ciaran Hughes; Jonathan Horgan; Ganesh Sistu; Padraig Varley; Derek O'Dea; Michal Uricar; Stefan Milz; Martin Simon; Karl Amende; Christian Witt; Hazem Rashed; Sumanth Chennupati; Sanjaya Nayak; Saquib Mansoor; Xavier Perrotton; Patrick Perez", "abstract": "Fisheye cameras are commonly employed for obtaining a large field of view in surveillance, augmented reality and in particular automotive applications. In spite of their prevalence, there are few public datasets for detailed evaluation of computer vision algorithms on fisheye images. We release the first extensive fisheye automotive dataset, WoodScape, named after Robert Wood who invented the fisheye camera in 1906. WoodScape comprises of four surround view cameras and nine tasks including segmentation, depth estimation, 3D bounding box detection and soiling detection. Semantic annotation of 40 classes at the instance level is provided for over 10,000 images and annotation for other tasks are provided for over 100,000 images. 
With WoodScape, we would like to encourage the community to adapt computer vision models for fisheye camera instead of using naive rectification.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Yogamani_WoodScape_A_Multi-Task_Multi-Camera_Fisheye_Dataset_for_Autonomous_Driving_ICCV_2019_paper.pdf", @@ -33533,7 +34599,8 @@ "aff_domain": ";;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;", "author_num": 17, - "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yogamani_WoodScape_A_Multi-Task_Multi-Camera_Fisheye_Dataset_for_Autonomous_Driving_ICCV_2019_paper.html" + "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Yogamani_WoodScape_A_Multi-Task_Multi-Camera_Fisheye_Dataset_for_Autonomous_Driving_ICCV_2019_paper.html", + "bibtex": "@InProceedings{Yogamani_2019_ICCV,\n \n author = {\n Yogamani,\n Senthil and Hughes,\n Ciaran and Horgan,\n Jonathan and Sistu,\n Ganesh and Varley,\n Padraig and O'Dea,\n Derek and Uricar,\n Michal and Milz,\n Stefan and Simon,\n Martin and Amende,\n Karl and Witt,\n Christian and Rashed,\n Hazem and Chennupati,\n Sumanth and Nayak,\n Sanjaya and Mansoor,\n Saquib and Perrotton,\n Xavier and Perez,\n Patrick\n},\n title = {\n WoodScape: A Multi-Task,\n Multi-Camera Fisheye Dataset for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "X-Section: Cross-Section Prediction for Enhanced RGB-D Fusion", @@ -33566,7 +34633,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Nicastro_2019_ICCV,\n \n author = {\n Nicastro,\n Andrea and Clark,\n Ronald and Leutenegger,\n Stefan\n},\n title = {\n X-Section: Cross-Section Prediction for Enhanced RGB-D Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "XRAI: Better Attributions Through Regions", @@ -33574,7 +34642,7 @@ "status": "Poster", "track": "main", "pid": "6352", - "author_site": "Andrei Kapishnikov, Tolga Bolukbasi, Fernanda Vi\u00c3\u00a9gas, Michael Terry", + "author_site": "Andrei Kapishnikov, Tolga Bolukbasi, Fernanda Viégas, Michael Terry", "author": "Andrei Kapishnikov; Tolga Bolukbasi; Fernanda Viegas; Michael Terry", "abstract": "Saliency methods can aid understanding of deep neural networks. Recent years have witnessed many improvements to saliency methods, as well as new ways for evaluating them. In this paper, we 1) present a novel region-based attribution method, XRAI, that builds upon integrated gradients (Sundararajan et al. 2017), 2) introduce evaluation methods for empirically assessing the quality of image-based saliency maps (Performance Information Curves (PICs)), and 3) contribute an axiom-based sanity check for attribution methods. 
Through empirical experiments and example results, we show that XRAI produces better results than other saliency methods for common models and the ImageNet dataset.", "pdf": "http://openaccess.thecvf.com/content_ICCV_2019/papers/Kapishnikov_XRAI_Better_Attributions_Through_Regions_ICCV_2019_paper.pdf", @@ -33599,7 +34667,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kapishnikov_2019_ICCV,\n \n author = {\n Kapishnikov,\n Andrei and Bolukbasi,\n Tolga and Viegas,\n Fernanda and Terry,\n Michael\n},\n title = {\n XRAI: Better Attributions Through Regions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "YOLACT: Real-Time Instance Segmentation", @@ -33632,7 +34701,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bolya_2019_ICCV,\n \n author = {\n Bolya,\n Daniel and Zhou,\n Chong and Xiao,\n Fanyi and Lee,\n Yong Jae\n},\n title = {\n YOLACT: Real-Time Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Zero-Shot Anticipation for Instructional Activities", @@ -33665,7 +34735,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Germany;Singapore" + "aff_country_unique": "Germany;Singapore", + "bibtex": "@InProceedings{Sener_2019_ICCV,\n \n author = {\n Sener,\n Fadime and Yao,\n Angela\n},\n title = {\n Zero-Shot Anticipation for Instructional Activities\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Zero-Shot Emotion Recognition via Affective Structural Embedding", @@ -33698,7 +34769,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;1+0;0;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhan_2019_ICCV,\n \n author = {\n Zhan,\n Chi and She,\n Dongyu and Zhao,\n Sicheng and Cheng,\n Ming-Ming and Yang,\n Jufeng\n},\n title = {\n Zero-Shot Emotion Recognition via Affective Structural Embedding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Zero-Shot Grounding of Objects From Natural Language Queries", @@ -33724,14 +34796,15 @@ "author_num": 3, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Sadhu_Zero-Shot_Grounding_of_Objects_From_Natural_Language_Queries_ICCV_2019_paper.html", "aff_unique_index": "0;1+0;0", - "aff_unique_norm": "University of Southern California;Meta", - "aff_unique_dep": ";Facebook", + "aff_unique_norm": "University of Southern California;Facebook", + "aff_unique_dep": ";", "aff_unique_url": "https://www.usc.edu;https://www.facebook.com", "aff_unique_abbr": "USC;FB", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sadhu_2019_ICCV,\n \n author = {\n Sadhu,\n Arka and Chen,\n Kan and Nevatia,\n Ram\n},\n title = {\n Zero-Shot Grounding of Objects From Natural Language Queries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "Zero-Shot Video Object Segmentation 
via Attentive Graph Neural Networks", @@ -33764,7 +34837,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;1;0", - "aff_country_unique": "United Arab Emirates;United States" + "aff_country_unique": "United Arab Emirates;United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Wenguan and Lu,\n Xiankai and Shen,\n Jianbing and Crandall,\n David J. and Shao,\n Ling\n},\n title = {\n Zero-Shot Video Object Segmentation via Attentive Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "advPattern: Physical-World Attacks on Deep Person Re-Identification via Adversarially Transformable Patterns", @@ -33797,7 +34871,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Knoxville", "aff_country_unique_index": "0;0;0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2019_ICCV,\n \n author = {\n Wang,\n Zhibo and Zheng,\n Siyan and Song,\n Mengkai and Wang,\n Qian and Rahimpour,\n Alireza and Qi,\n Hairong\n},\n title = {\n advPattern: Physical-World Attacks on Deep Person Re-Identification via Adversarially Transformable Patterns\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "l-Net: Reconstruct Hyperspectral Images From a Snapshot Measurement", @@ -33823,14 +34898,15 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Miao_l-Net_Reconstruct_Hyperspectral_Images_From_a_Snapshot_Measurement_ICCV_2019_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "University of Texas at Arlington;Nokia Bell Labs;Meta", - "aff_unique_dep": ";;Facebook", + "aff_unique_norm": "University of Texas at Arlington;Nokia Bell 
Labs;Facebook", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.uta.edu;https://www.nokialabs.com;https://www.facebook.com", "aff_unique_abbr": "UTA;Nokia Bell Labs;FB", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Arlington;New Jersey;Menlo Park", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Miao_2019_ICCV,\n \n author = {\n Miao,\n Xin and Yuan,\n Xin and Pu,\n Yunchen and Athitsos,\n Vassilis\n},\n title = {\n l-Net: Reconstruct Hyperspectral Images From a Snapshot Measurement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "nocaps: novel object captioning at scale", @@ -33856,14 +34932,15 @@ "author_num": 10, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Agrawal_nocaps_novel_object_captioning_at_scale_ICCV_2019_paper.html", "aff_unique_index": "0;1;0+2;0+3;1;0+3;3;0+4;0;0", - "aff_unique_norm": "Georgia Institute of Technology;Macquarie University;University of Michigan;Meta;Oregon State University", + "aff_unique_norm": "Georgia Institute of Technology;Macquarie University;University of Michigan;Facebook;Oregon State University", "aff_unique_dep": ";;;Facebook AI Research;", "aff_unique_url": "https://www.gatech.edu;https://www.mq.edu.au;https://www.umich.edu;https://research.facebook.com;https://oregonstate.edu", "aff_unique_abbr": "Georgia Tech;MQ;UM;FAIR;OSU", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0+0;0+0;1;0+0;0;0+0;0;0", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": "@InProceedings{Agrawal_2019_ICCV,\n \n author = {\n Agrawal,\n Harsh and Desai,\n Karan and Wang,\n Yufei and Chen,\n Xinlei and Jain,\n Rishabh and Johnson,\n Mark and Batra,\n Dhruv and Parikh,\n Devi and Lee,\n 
Stefan and Anderson,\n Peter\n},\n title = {\n nocaps: novel object captioning at scale\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" }, { "title": "xR-EgoPose: Egocentric 3D Human Pose From an HMD Camera", @@ -33889,13 +34966,14 @@ "author_num": 4, "oa": "http://openaccess.thecvf.com/content_ICCV_2019/html/Tome_xR-EgoPose_Egocentric_3D_Human_Pose_From_an_HMD_Camera_ICCV_2019_paper.html", "aff_unique_index": "0+1;1;0;1", - "aff_unique_norm": "University College London;Meta", + "aff_unique_norm": "University College London;Facebook Reality Lab", "aff_unique_dep": ";Facebook Reality Lab", "aff_unique_url": "https://www.ucl.ac.uk;https://www.facebook.com/realitylab", "aff_unique_abbr": "UCL;FRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Tome_2019_ICCV,\n \n author = {\n Tome,\n Denis and Peluse,\n Patrick and Agapito,\n Lourdes and Badino,\n Hernan\n},\n title = {\n xR-EgoPose: Egocentric 3D Human Pose From an HMD Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2019\n} \n}" } ] \ No newline at end of file diff --git a/iccv/iccv2021.json b/iccv/iccv2021.json index 6d16d08..7706ed0 100644 --- a/iccv/iccv2021.json +++ b/iccv/iccv2021.json @@ -5,7 +5,8 @@ "status": "Poster", "track": "main", "pid": 3757, - "author": "S\u00e9rgio Agostinho; Aljo\u0161a O\u0161ep; Alessio Del Bue; Laura Leal-Taix\u00e9", + "author_site": "Sérgio Agostinho; Aljoša Ošep; Alessio Del Bue; Laura Leal-Taixé", + "author": "Sérgio Agostinho; Aljoša Ošep; Alessio Del Bue; Laura Leal-Taixé", "abstract": "In this paper, we tackle data-driven 3D point cloud registration. 
Given point correspondences, the standard Kabsch algorithm provides an optimal rotation estimate. This allows to train registration models in an end-to-end manner by differentiating the SVD operation. However, given the initial rotation estimate supplied by Kabsch, we show we can improve point correspondence learning during model training by extending the original optimization problem. In particular, we linearize the governing constraints of the rotation matrix and solve the resulting linear system of equations. We then iteratively produce new solutions by updating the initial estimate. Our experiments show that, by plugging our differentiable layer to existing learning-based registration methods, we improve the correspondence matching quality. This yields up to a 7% decrease in rotation error for correspondence-based data-driven registration methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Agostinho_Just_A_Spoonful_of_Refinements_Helps_the_Registration_Error_Go_ICCV_2021_paper.pdf", "aff": ";;;", @@ -19,7 +20,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Agostinho_Just_A_Spoonful_of_Refinements_Helps_the_Registration_Error_Go_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Agostinho_Just_A_Spoonful_of_Refinements_Helps_the_Registration_Error_Go_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Agostinho_2021_ICCV,\n \n author = {\n Agostinho,\n S\\'ergio and O\\v{s\n}ep,\n Aljo\\v{s\n}a and Del Bue,\n Alessio and Leal-Taix\\'e,\n Laura\n},\n title = {\n (Just) A Spoonful of Refinements Helps the Registration Error Go Down\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6108-6117\n} \n}" }, { "title": "3D Building Reconstruction From Monocular Remote Sensing Images", @@ -27,6 +29,7 @@ "status": "Poster", "track": 
"main", "pid": 6918, + "author_site": "Weijia Li; Lingxuan Meng; Jinwang Wang; Conghui He; Gui-Song Xia; Dahua Lin", "author": "Weijia Li; Lingxuan Meng; Jinwang Wang; Conghui He; Gui-Song Xia; Dahua Lin", "abstract": "3D building reconstruction from monocular remote sensing imagery is an important research problem and an economic solution to large-scale city modeling, compared with reconstruction from LiDAR data and multi-view imagery. However, several challenges such as the partial invisibility of building footprints and facades, the serious shadow effect, and the extreme variance of building height in large-scale areas, have restricted the existing monocular image based building reconstruction studies to certain application scenes, i.e., modeling simple low-rise buildings from near-nadir images. In this study, we propose a novel 3D building reconstruction method for monocular remote sensing images, which tackles the above difficulties, thus providing an appealing solution for more complicated scenarios. We design a multi-task building reconstruction network, named MTBR-Net, to learn the geometric property of oblique images, the key components of a 3D building model and their relations via four semantic-related and three offset-related tasks. The network outputs are further integrated by a prior knowledge based 3D model optimization method to produce the the final 3D building models. 
Results on a public 3D reconstruction dataset and a novel released dataset demonstrate that our method improves the height estimation performance by over 40% and the segmentation F1-score by 2% - 4% compared with current state-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_3D_Building_Reconstruction_From_Monocular_Remote_Sensing_Images_ICCV_2021_paper.pdf", @@ -43,14 +46,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_3D_Building_Reconstruction_From_Monocular_Remote_Sensing_Images_ICCV_2021_paper.html", "aff_unique_index": "0+1;2+3;2+4;2;4;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shanghai AI Laboratory;SenseTime Research;University of Electronic Science and Technology of China;Wuhan University", + "aff_unique_norm": "The Chinese University of Hong Kong;Shanghai AI Laboratory;SenseTime Research;University of Electronic Science and Technology of China;Wuhan University", "aff_unique_dep": "1CUHK-SenseTime Joint Lab;;;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.shanghai-ai-lab.com;https://www.sensetime.com;https://www.uestc.edu.cn;http://www.whu.edu.cn/", "aff_unique_abbr": "CUHK;SAIL;SenseTime;UESTC;WHU", "aff_campus_unique_index": "0;;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Weijia and Meng,\n Lingxuan and Wang,\n Jinwang and He,\n Conghui and Xia,\n Gui-Song and Lin,\n Dahua\n},\n title = {\n 3D Building Reconstruction From Monocular Remote Sensing Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12548-12557\n} \n}" }, { "title": "3D Human Pose Estimation With Spatial and Temporal Transformers", @@ -58,6 +62,7 @@ "status": "Poster", "track": "main", "pid": 
3877, + "author_site": "Ce Zheng; Sijie Zhu; Matias Mendieta; Taojiannan Yang; Chen Chen; Zhengming Ding", "author": "Ce Zheng; Sijie Zhu; Matias Mendieta; Taojiannan Yang; Chen Chen; Zhengming Ding", "abstract": "Transformer architectures have become the model of choice in natural language processing and are now being introduced into computer vision tasks such as image classification, object detection, and semantic segmentation. However, in the field of human pose estimation, convolutional architectures still remain dominant. In this work, we present PoseFormer, a purely transformer-based approach for 3D human pose estimation in videos without convolutional architectures involved. Inspired by recent developments in vision transformers, we design a spatial-temporal transformer structure to comprehensively model the human joint relations within each frame as well as the temporal correlations across frames, then output an accurate 3D human pose of the center frame. We quantitatively and qualitatively evaluate our method on two popular and standard benchmark datasets: Human3.6M and MPI-INF-3DHP. Extensive experiments show that PoseFormer achieves state-of-the-art performance on both datasets. 
Our code and model will be publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_3D_Human_Pose_Estimation_With_Spatial_and_Temporal_Transformers_ICCV_2021_paper.pdf", @@ -78,10 +83,11 @@ "aff_unique_dep": "Center for Research in Computer Vision;Department of Computer Science", "aff_unique_url": "https://www.ucf.edu;https://www.tulane.edu", "aff_unique_abbr": "UCF;Tulane", - "aff_campus_unique_index": "0;0;0;0;0", - "aff_campus_unique": "Central Florida;", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Ce and Zhu,\n Sijie and Mendieta,\n Matias and Yang,\n Taojiannan and Chen,\n Chen and Ding,\n Zhengming\n},\n title = {\n 3D Human Pose Estimation With Spatial and Temporal Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11656-11665\n} \n}" }, { "title": "3D Human Texture Estimation From a Single Image With Transformers", @@ -89,6 +95,7 @@ "status": "Poster", "track": "main", "pid": 1281, + "author_site": "Xiangyu Xu; Chen Change Loy", "author": "Xiangyu Xu; Chen Change Loy", "abstract": "We propose a Transformer-based framework for 3D human texture estimation from a single image. The proposed Transformer is able to effectively exploit the global information of the input image, overcoming the limitations of existing methods that are solely based on convolutional neural networks. In addition, we also propose a mask-fusion strategy to combine the advantages of the RGB-based and texture-flow-based models. We further introduce a part-style loss to help reconstruct high-fidelity colors without introducing unpleasant artifacts. 
Extensive experiments demonstrate the effectiveness of the proposed method against state-of-the-art 3D human texture estimation approaches both quantitatively and qualitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_3D_Human_Texture_Estimation_From_a_Single_Image_With_Transformers_ICCV_2021_paper.pdf", @@ -112,7 +119,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Xiangyu and Loy,\n Chen Change\n},\n title = {\n 3D Human Texture Estimation From a Single Image With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13849-13858\n} \n}" }, { "title": "3D Local Convolutional Neural Networks for Gait Recognition", @@ -120,6 +128,7 @@ "status": "Poster", "track": "main", "pid": 3075, + "author_site": "Zhen Huang; Dixiu Xue; Xu Shen; Xinmei Tian; Houqiang Li; Jianqiang Huang; Xian-Sheng Hua", "author": "Zhen Huang; Dixiu Xue; Xu Shen; Xinmei Tian; Houqiang Li; Jianqiang Huang; Xian-Sheng Hua", "abstract": "The goal of gait recognition is to learn the unique spatio-temporal pattern about the human body shape from its temporal changing characteristics. As different body parts behave differently during walking, it is intuitive to model the spatio-temporal patterns of each part separately. However, existing part-based methods equally divide the feature maps of each frame into fixed horizontal stripes to get local parts. It is obvious that these stripe partition-based methods cannot accurately locate the body parts. First, different body parts can appear at the same stripe (e.g., arms and the torso), and one part can appear at different stripes in different frames (e.g., hands). 
Second, different body parts possess different scales, and even the same part in different frames can appear at different locations and scales. Third, different parts also exhibit distinct movement patterns (e.g., at which frame the movement starts, the position change frequency, how long it lasts). To overcome these issues, we propose novel 3D local operations as a generic family of building blocks for 3D gait recognition backbones. The proposed 3D local operations support the extraction of local 3D volumes of body parts in a sequence with adaptive spatial and temporal scales, locations and lengths. In this way, the spatio-temporal patterns of the body parts are well learned from the 3D local neighborhood in part-specific scales, locations, frequencies and lengths. Experiments demonstrate that our 3D local convolutional neural networks achieve state-of-the-art performance on popular gait datasets. Code is available at: https://github.com/yellowtownhz/3DLocalCNN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_3D_Local_Convolutional_Neural_Networks_for_Gait_Recognition_ICCV_2021_paper.pdf", @@ -143,7 +152,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Zhen and Xue,\n Dixiu and Shen,\n Xu and Tian,\n Xinmei and Li,\n Houqiang and Huang,\n Jianqiang and Hua,\n Xian-Sheng\n},\n title = {\n 3D Local Convolutional Neural Networks for Gait Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14920-14929\n} \n}" }, { "title": "3D Shape Generation and Completion Through Point-Voxel Diffusion", @@ -151,6 +161,7 @@ "status": "Poster", "track": "main", "pid": 9265, + "author_site": "Linqi Zhou; Yilun Du; Jiajun Wu", "author": "Linqi 
Zhou; Yilun Du; Jiajun Wu", "abstract": "We propose a novel approach for probabilistic generative modeling of 3D shapes. Unlike most existing models that learn to deterministically translate a latent vector to a shape, our model, Point-Voxel Diffusion (PVD), is a unified, probabilistic formulation for unconditional shape generation and conditional, multi-modal shape completion. PVDmarries denoising diffusion models with the hybrid, point-voxel representation of 3D shapes. It can be viewed as a series of denoising steps, reversing the diffusion process from observed point cloud data to Gaussian noise, and is trained by optimizing a variational lower bound to the (conditional) likelihood function. Experiments demonstrate that PVD is capable of synthesizing high-fidelity shapes, completing partial point clouds, and generating multiple completion results from single-view depth scans of real objects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_3D_Shape_Generation_and_Completion_Through_Point-Voxel_Diffusion_ICCV_2021_paper.pdf", @@ -174,7 +185,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Linqi and Du,\n Yilun and Wu,\n Jiajun\n},\n title = {\n 3D Shape Generation and Completion Through Point-Voxel Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5826-5835\n} \n}" }, { "title": "3D-FRONT: 3D Furnished Rooms With layOuts and semaNTics", @@ -182,6 +194,7 @@ "status": "Poster", "track": "main", "pid": 5496, + "author_site": "Huan Fu; Bowen Cai; Lin Gao; Ling-Xiao Zhang; Jiaming Wang; Cao Li; Qixun Zeng; Chengyue Sun; Rongfei Jia; Binqiang Zhao; Hao Zhang", "author": "Huan Fu; Bowen Cai; Lin Gao; Ling-Xiao Zhang; 
Jiaming Wang; Cao Li; Qixun Zeng; Chengyue Sun; Rongfei Jia; Binqiang Zhao; Hao Zhang", "abstract": "We introduce 3D-FRONT (3D Furnished Rooms with layOuts and semaNTics), a new, large-scale, and compre- hensive repository of synthetic indoor scenes highlighted by professionally designed layouts and a large number of rooms populated by high-quality textured 3D models with style compatibility. From layout semantics down to texture details of individual objects, our dataset is freely available to the academic community and beyond. Currently, 3D- FRONT contains 6,813 CAD houses, where 18,968 rooms diversely furnished by 3D objects, far surpassing all publicly available scene datasets. The 13,151 furniture objects all come with high-quality textures. While the floorplans and layout designs (i.e., furniture arrangements) are directly sourced from professional creations, the interior de- signs in terms of furniture styles, color, and textures have been carefully curated based on a recommender system we develop to attain consistent styles as expert designs. Furthermore, we release Trescope, a light-weight rendering tool, to support benchmark rendering of 2D images and annotations from 3D-FRONT. 
We demonstrate two applications, interior scene synthesis and texture synthesis, that are especially tailored to the strengths of our new dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fu_3D-FRONT_3D_Furnished_Rooms_With_layOuts_and_semaNTics_ICCV_2021_paper.pdf", @@ -205,7 +218,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;1", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Fu_2021_ICCV,\n \n author = {\n Fu,\n Huan and Cai,\n Bowen and Gao,\n Lin and Zhang,\n Ling-Xiao and Wang,\n Jiaming and Li,\n Cao and Zeng,\n Qixun and Sun,\n Chengyue and Jia,\n Rongfei and Zhao,\n Binqiang and Zhang,\n Hao\n},\n title = {\n 3D-FRONT: 3D Furnished Rooms With layOuts and semaNTics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10933-10942\n} \n}" }, { "title": "3DIAS: 3D Shape Reconstruction With Implicit Algebraic Surfaces", @@ -213,6 +227,7 @@ "status": "Poster", "track": "main", "pid": 5644, + "author_site": "Mohsen Yavartanoo; Jaeyoung Chung; Reyhaneh Neshatavar; Kyoung Mu Lee", "author": "Mohsen Yavartanoo; Jaeyoung Chung; Reyhaneh Neshatavar; Kyoung Mu Lee", "abstract": "3D Shape representation has substantial effects on 3D shape reconstruction. Primitive-based representations approximate a 3D shape mainly by a set of simple implicit primitives, but the low geometrical complexity of the primitives limits the shape resolution. Moreover, setting a sufficient number of primitives for an arbitrary shape is challenging. To overcome these issues, we propose a constrained implicit algebraic surface as the primitive with few learnable coefficients and higher geometrical complexities and a deep neural network to produce these primitives. 
Our experiments demonstrate the superiorities of our method in terms of representation power compared to the state-of-the-art methods in single RGB image 3D shape reconstruction. Furthermore, we show that our method can semantically learn segments of 3D shapes in an unsupervised manner. The code is publicly available from this link.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yavartanoo_3DIAS_3D_Shape_Reconstruction_With_Implicit_Algebraic_Surfaces_ICCV_2021_paper.pdf", @@ -236,7 +251,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yavartanoo_2021_ICCV,\n \n author = {\n Yavartanoo,\n Mohsen and Chung,\n Jaeyoung and Neshatavar,\n Reyhaneh and Lee,\n Kyoung Mu\n},\n title = {\n 3DIAS: 3D Shape Reconstruction With Implicit Algebraic Surfaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12446-12455\n} \n}" }, { "title": "3DStyleNet: Creating 3D Shapes With Geometric and Texture Style Variations", @@ -244,6 +260,7 @@ "status": "Poster", "track": "main", "pid": 6694, + "author_site": "Kangxue Yin; Jun Gao; Maria Shugrina; Sameh Khamis; Sanja Fidler", "author": "Kangxue Yin; Jun Gao; Maria Shugrina; Sameh Khamis; Sanja Fidler", "abstract": "We propose a method to create plausible geometric and texture style variations of 3D objects in the quest to democratize 3D content creation. Given a pair of textured source and target objects, our method predicts a part-aware affine transformation field that naturally warps the source shape to imitate the overall geometric style of the target. In addition, the texture style of the target is transferred to the warped source object with the help of a multi-view differentiable renderer. 
Our model, 3DStyleNet, is composed of two sub-networks trained in two stages. First, the geometric style network is trained on a large set of untextured 3D shapes. Second, we jointly optimize our geometric style network and a pre-trained image style transfer network with losses defined over both the geometry and the rendering of the result. Given a small set of high-quality textured objects, our method can create many novel stylized shapes, resulting in effortless 3D content creation and style-ware data augmentation. We showcase our approach qualitatively on 3D content stylization, and provide user studies to validate the quality of our results. In addition, our method can serve as a valuable tool to create 3D data augmentations for computer vision tasks. Extensive quantitative analysis shows that 3DStyleNet outperforms alternative data augmentation techniques for the downstream task of single-image 3D reconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yin_3DStyleNet_Creating_3D_Shapes_With_Geometric_and_Texture_Style_Variations_ICCV_2021_paper.pdf", @@ -260,14 +277,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yin_3DStyleNet_Creating_3D_Shapes_With_Geometric_and_Texture_Style_Variations_ICCV_2021_paper.html", "aff_unique_index": "0;0+1+2;0;0;0+1+2", - "aff_unique_norm": "NVIDIA;University of Toronto;Vector Institute", - "aff_unique_dep": "NVIDIA Corporation;;", + "aff_unique_norm": "NVIDIA Corporation;University of Toronto;Vector Institute", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.nvidia.com;https://www.utoronto.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "NVIDIA;U of T;Vector Institute", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1+1;0;0;0+1+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Yin_2021_ICCV,\n \n author = {\n Yin,\n Kangxue and Gao,\n Jun 
and Shugrina,\n Maria and Khamis,\n Sameh and Fidler,\n Sanja\n},\n title = {\n 3DStyleNet: Creating 3D Shapes With Geometric and Texture Style Variations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12456-12465\n} \n}" }, { "title": "3DVG-Transformer: Relation Modeling for Visual Grounding on Point Clouds", @@ -275,6 +293,7 @@ "status": "Poster", "track": "main", "pid": 7574, + "author_site": "Lichen Zhao; Daigang Cai; Lu Sheng; Dong Xu", "author": "Lichen Zhao; Daigang Cai; Lu Sheng; Dong Xu", "abstract": "Visual grounding on 3D point clouds is an emerging vision and language task that benefits various applications in understanding the 3D visual world. By formulating this task as a grounding-by-detection problem, lots of recent works focus on how to exploit more powerful detectors and comprehensive language features, but (1) how to model complex relations for generating context-aware object proposals and (2) how to leverage proposal relations to distinguish the true target object from similar proposals are not fully studied yet. Inspired by the well-known transformer architecture, we propose a relation-aware visual grounding method on 3D point clouds, named as 3DVG-Transformer, to fully utilize the contextual clues for relationenhanced proposal generation and cross-modal proposal disambiguation, which are enabled by a newly designed coordinate-guided contextual aggregation (CCA) module in the object proposal generation stage, and a multiplex attention (MA) module in the cross-modal feature fusion stage. 
We validate that our 3DVG-Transformer outperforms the state-of-the-art methods by a large margin, on two point cloud-based visual grounding datasets, ScanRefer and Nr3D/Sr3D from ReferIt3D, especially for complex scenarios containing multiple objects of the same category.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_3DVG-Transformer_Relation_Modeling_for_Visual_Grounding_on_Point_Clouds_ICCV_2021_paper.pdf", @@ -291,14 +310,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_3DVG-Transformer_Relation_Modeling_for_Visual_Grounding_on_Point_Clouds_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Beihang University;University of Sydney", + "aff_unique_norm": "Beihang University;The University of Sydney", "aff_unique_dep": "College of Software;", "aff_unique_url": "http://www.buaa.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "Beihang;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Lichen and Cai,\n Daigang and Sheng,\n Lu and Xu,\n Dong\n},\n title = {\n 3DVG-Transformer: Relation Modeling for Visual Grounding on Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2928-2937\n} \n}" }, { "title": "3DeepCT: Learning Volumetric Scattering Tomography of Clouds", @@ -306,6 +326,7 @@ "status": "Poster", "track": "main", "pid": 9172, + "author_site": "Yael Sde-Chen; Yoav Y. Schechner; Vadim Holodovsky; Eshkol Eytan", "author": "Yael Sde-Chen; Yoav Y. Schechner; Vadim Holodovsky; Eshkol Eytan", "abstract": "We present 3DeepCT, a deep neural network for computed tomography, which performs 3D reconstruction of scattering volumes from multi-view images. 
The architecture is dictated by the stationary nature of atmospheric cloud fields. The task of volumetric scattering tomography aims at recovering a volume from its 2D projections. This problem has been approached by diverse inverse methods based on signal processing and physics models. However, such techniques are typically iterative, exhibiting a high computational load and a long convergence time. We show that 3DeepCT outperforms physics-based inverse scattering methods, in accuracy, as well as offering orders of magnitude improvement in computational run-time. We further introduce a hybrid model that combines 3DeepCT and physics-based analysis. The resultant hybrid technique enjoys fast inference time and improved recovery performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sde-Chen_3DeepCT_Learning_Volumetric_Scattering_Tomography_of_Clouds_ICCV_2021_paper.pdf", @@ -322,14 +343,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sde-Chen_3DeepCT_Learning_Volumetric_Scattering_Tomography_of_Clouds_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Technion - Israel Institute of Technology;Weizmann Institute of Science", + "aff_unique_norm": "Technion - Israel Institute of Technology;The Weizmann Institute of Science", "aff_unique_dep": "Viterbi Faculty of Electrical and Computer Engineering;Department of Earth and Planetary Science", "aff_unique_url": "https://www.technion.ac.il;https://www.weizmann.ac.il", "aff_unique_abbr": "Technion;Weizmann", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Haifa;Rehovot", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Sde-Chen_2021_ICCV,\n \n author = {\n Sde-Chen,\n Yael and Schechner,\n Yoav Y. 
and Holodovsky,\n Vadim and Eytan,\n Eshkol\n},\n title = {\n 3DeepCT: Learning Volumetric Scattering Tomography of Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5671-5682\n} \n}" }, { "title": "4D Cloud Scattering Tomography", @@ -337,6 +359,7 @@ "status": "Poster", "track": "main", "pid": 3380, + "author_site": "Roi Ronen; Yoav Y. Schechner; Eshkol Eytan", "author": "Roi Ronen; Yoav Y. Schechner; Eshkol Eytan", "abstract": "We derive computed tomography (CT) of a time-varying volumetric scattering object, using a small number of moving cameras. We focus on passive tomography of dynamic clouds, as clouds have a major effect on the Earth's climate. State of the art scattering CT assumes a static object. Existing 4D CT methods rely on a linear image formation model and often on significant priors. In this paper, the angular and temporal sampling rates needed for a proper recovery are discussed. Spatiotemporal CT is achieved using gradient-based optimization, which accounts for the correlation time of the dynamic object content. 
We demonstrate this in physics-based simulations and on experimental real-world data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ronen_4D_Cloud_Scattering_Tomography_ICCV_2021_paper.pdf", @@ -353,14 +376,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ronen_4D_Cloud_Scattering_Tomography_ICCV_2021_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Technion - Israel Institute of Technology;Weizmann Institute of Science", + "aff_unique_norm": "Technion - Israel Institute of Technology;The Weizmann Institute of Science", "aff_unique_dep": "Viterbi Faculty of Electrical & Computer Eng.;Department of Earth & Planetary Sciences", "aff_unique_url": "https://www.technion.ac.il;https://www.weizmann.ac.il", "aff_unique_abbr": "Technion;Weizmann", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Haifa;Rehovot", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Ronen_2021_ICCV,\n \n author = {\n Ronen,\n Roi and Schechner,\n Yoav Y. and Eytan,\n Eshkol\n},\n title = {\n 4D Cloud Scattering Tomography\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5520-5529\n} \n}" }, { "title": "4D-Net for Learned Multi-Modal Alignment", @@ -368,6 +392,7 @@ "status": "Poster", "track": "main", "pid": 2743, + "author_site": "AJ Piergiovanni; Vincent Casser; Michael S. Ryoo; Anelia Angelova", "author": "AJ Piergiovanni; Vincent Casser; Michael S. Ryoo; Anelia Angelova", "abstract": "We present 4D-Net, a 3D object detection approach, which utilizes 3D Point Cloud and RGB sensing information, both in time. We are able to incorporate the 4D information by performing a novel dynamic connection learning across various feature representations and levels of abstraction and by observing geometric constraints. 
Our approach outperforms the state-of-the-art and strong baselines on the Waymo Open Dataset. 4D-Net is better able to use motion cues and dense image information to detect distant objects more successfully. We will open source the code.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Piergiovanni_4D-Net_for_Learned_Multi-Modal_Alignment_ICCV_2021_paper.pdf", @@ -382,7 +407,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Piergiovanni_4D-Net_for_Learned_Multi-Modal_Alignment_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Piergiovanni_4D-Net_for_Learned_Multi-Modal_Alignment_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Piergiovanni_2021_ICCV,\n \n author = {\n Piergiovanni,\n AJ and Casser,\n Vincent and Ryoo,\n Michael S. and Angelova,\n Anelia\n},\n title = {\n 4D-Net for Learned Multi-Modal Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15435-15445\n} \n}" }, { "title": "4DComplete: Non-Rigid Motion Estimation Beyond the Observable Surface", @@ -390,7 +416,8 @@ "status": "Poster", "track": "main", "pid": 5666, - "author": "Yang Li; Hikari Takehara; Takafumi Taketomi; Bo Zheng; Matthias Nie\u00dfner", + "author_site": "Yang Li; Hikari Takehara; Takafumi Taketomi; Bo Zheng; Matthias Nießner", + "author": "Yang Li; Hikari Takehara; Takafumi Taketomi; Bo Zheng; Matthias Nießner", "abstract": "Tracking non-rigidly deforming scenes using range sensors has numerous applications including computer vision, AR/VR, and robotics. However, due to occlusions and physical limitations of range sensors, existing methods only handle the visible surface, thus causing discontinuities and incompleteness in the motion field. 
To this end, we introduce 4DComplete, a novel data-driven approach that estimates the non-rigid motion for the unobserved geometry. 4DComplete takes as input a partial shape and motion observation, extracts 4D time-space embedding, and jointly infers the missing geometry and motion field using a sparse fully-convolutional network. For network training, we constructed a large-scale synthetic dataset called DeformingThings4D, which consists of 1,972 animation sequences spanning 31 different animals or humanoid categories with dense 4D annotation. Experiments show that 4DComplete 1) reconstructs high-resolution volumetric shape and motion field from a partial observation, 2) learns an entangled 4D feature representation that benefits both shape and motion estimation, 3) yields more accurate and natural deformation than classic non-rigid priors such as As-RigidAs-Possible (ARAP) deformation, and 4) generalizes well to unseen objects in real-world sequences.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_4DComplete_Non-Rigid_Motion_Estimation_Beyond_the_Observable_Surface_ICCV_2021_paper.pdf", "aff": "The University of Tokyo; Tokyo Research Center, Huawei; Tokyo Research Center, Huawei; Tokyo Research Center, Huawei; Technical University Munich", @@ -413,7 +440,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0;1;1;1;2", - "aff_country_unique": "Japan;China;Germany" + "aff_country_unique": "Japan;China;Germany", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yang and Takehara,\n Hikari and Taketomi,\n Takafumi and Zheng,\n Bo and Nie{\\ss}ner,\n Matthias\n},\n title = {\n 4DComplete: Non-Rigid Motion Estimation Beyond the Observable Surface\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12706-12716\n} \n}" }, { "title": "A Backdoor Attack Against 3D Point Cloud 
Classifiers", @@ -421,6 +449,7 @@ "status": "Poster", "track": "main", "pid": 6274, + "author_site": "Zhen Xiang; David J. Miller; Siheng Chen; Xi Li; George Kesidis", "author": "Zhen Xiang; David J. Miller; Siheng Chen; Xi Li; George Kesidis", "abstract": "Vulnerability of 3D point cloud (PC) classifiers has become a grave concern due to the popularity of 3D sensors in safety-critical applications. Existing adversarial attacks against 3D PC classifiers are all test-time evasion (TTE) attacks that aim to induce test-time misclassifications using knowledge of the classifier. But since the victim classifier is usually not accessible to the attacker, the threat is largely diminished in practice, as PC TTEs typically have poor transferability. Here, we propose the first backdoor attack (BA) against PC classifiers. Originally proposed for images, BAs poison the victim classifier's training set so that the classifier learns to decide to the attacker's target class whenever the attacker's backdoor pattern is present in a given input sample. Significantly, BAs do not require knowledge of the victim classifier. Different from image BAs, we propose to insert a cluster of points into a PC as a robust backdoor pattern customized for 3D PCs. Such clusters are also consistent with a physical attack (i.e., with a captured object in a scene). We optimize the cluster's location using an independently trained surrogate classifier and choose the cluster's local geometry to evade possible PC preprocessing and PC anomaly detectors (ADs). Experimentally, our BA achieves a uniformly high success rate (>=87%) and shows evasiveness against state-of-the-art PC ADs. 
Code is available at https://github.com/zhenxianglance/PCBA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiang_A_Backdoor_Attack_Against_3D_Point_Cloud_Classifiers_ICCV_2021_paper.pdf", @@ -435,7 +464,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xiang_A_Backdoor_Attack_Against_3D_Point_Cloud_Classifiers_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xiang_A_Backdoor_Attack_Against_3D_Point_Cloud_Classifiers_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xiang_2021_ICCV,\n \n author = {\n Xiang,\n Zhen and Miller,\n David J. and Chen,\n Siheng and Li,\n Xi and Kesidis,\n George\n},\n title = {\n A Backdoor Attack Against 3D Point Cloud Classifiers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7597-7607\n} \n}" }, { "title": "A Broad Study on the Transferability of Visual Representations With Contrastive Learning", @@ -443,6 +473,7 @@ "status": "Poster", "track": "main", "pid": 9362, + "author_site": "Ashraful Islam; Chun-Fu (Richard) Chen; Rameswar Panda; Leonid Karlinsky; Richard Radke; Rogerio Feris", "author": "Ashraful Islam; Chun-Fu (Richard) Chen; Rameswar Panda; Leonid Karlinsky; Richard Radke; Rogerio Feris", "abstract": "Tremendous progress has been made in visual representation learning, notably with the recent success of self-supervised contrastive learning methods. Supervised contrastive learning has also been shown to outperform its cross-entropy counterparts by leveraging labels for choosing where to contrast. However, there has been little work to explore the transfer capability of contrastive learning to a different domain. 
In this paper, we conduct a comprehensive study on the transferability of learned representations of different contrastive approaches for linear evaluation, full-network transfer, and few-shot recognition on 12 downstream datasets from different domains, and object detection tasks on MSCOCO and VOC0712. The results show that the contrastive approaches learn representations that are easily transferable to a different downstream task. We further observe that the joint objective of self-supervised contrastive loss with cross-entropy/supervised-contrastive loss leads to better transferability of these models over their supervised counterparts. Our analysis reveals that the representations learned from the contrastive approaches contain more low/mid-level semantics than cross-entropy models, which enables them to quickly adapt to a new task. Our codes and models will be publicly available to facilitate future research on transferability of visual representations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Islam_A_Broad_Study_on_the_Transferability_of_Visual_Representations_With_ICCV_2021_paper.pdf", @@ -466,7 +497,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Islam_2021_ICCV,\n \n author = {\n Islam,\n Ashraful and Chen,\n Chun-Fu (Richard) and Panda,\n Rameswar and Karlinsky,\n Leonid and Radke,\n Richard and Feris,\n Rogerio\n},\n title = {\n A Broad Study on the Transferability of Visual Representations With Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8845-8855\n} \n}" }, { "title": "A Closer Look at Rotation-Invariant Deep Point Cloud Analysis", @@ -474,6 +506,7 @@ "status": "Poster", "track": "main", "pid": 4030, + "author_site": 
"Feiran Li; Kent Fujiwara; Fumio Okura; Yasuyuki Matsushita", "author": "Feiran Li; Kent Fujiwara; Fumio Okura; Yasuyuki Matsushita", "abstract": "We consider the deep point cloud analysis tasks where the inputs of the networks are randomly rotated. Recent progress in rotation-invariant point cloud analysis is mainly driven by converting point clouds into their respective canonical poses, and principal component analysis (PCA) is a practical tool to achieve this. Due to the imperfect alignment of PCA, most of the current works are devoted to developing powerful network structures and features to overcome this deficiency, without thoroughly analyzing the PCA-based canonical poses themselves. In this work, we present a detailed study w.r.t. the PCA-based canonical poses of point clouds. Our investigation reveals that the ambiguity problem associated with the PCA-based canonical poses is handled insufficiently in some recent works. To this end, we develop a simple pose selector module for disambiguation, which presents noticeable enhancement (i.e., 5:3% classification accuracy) over state-of-the-art approaches on the challenging real-world dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_A_Closer_Look_at_Rotation-Invariant_Deep_Point_Cloud_Analysis_ICCV_2021_paper.pdf", @@ -497,7 +530,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Feiran and Fujiwara,\n Kent and Okura,\n Fumio and Matsushita,\n Yasuyuki\n},\n title = {\n A Closer Look at Rotation-Invariant Deep Point Cloud Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16218-16227\n} \n}" }, { "title": "A Confidence-Based Iterative Solver of Depths and Surface Normals for Deep 
Multi-View Stereo", @@ -505,6 +539,7 @@ "status": "Poster", "track": "main", "pid": 4072, + "author_site": "Wang Zhao; Shaohui Liu; Yi Wei; Hengkai Guo; Yong-Jin Liu", "author": "Wang Zhao; Shaohui Liu; Yi Wei; Hengkai Guo; Yong-Jin Liu", "abstract": "In this paper, we introduce a deep multi-view stereo (MVS) system that jointly predicts depths, surface normals and per-view confidence maps. The key to our approach is a novel solver that iteratively solves for per-view depth map and normal map by optimizing an energy potential based upon the local planar assumption. Specifically, the algorithm updates depth map by propagating from neighboring pixels with slanted planes, and updates normal map with local probabilistic plane fitting. Both two steps are monitored by a customized confidence map. This confidence-based solver is not only effective as a post-processing tool for plane based depth refinement and completion, but also differentiable such that it can be efficiently integrated into deep learning pipelines. Our multi-view stereo system employs multiple optimization steps of the solver over the initial prediction of depths and surface normals. The whole system can be trained end-to-end, decoupling the challenging problem of matching pixels within poorly textured regions from the cost volume based neural network. 
Experimental results on ScanNet and RGB-D Scenes V2 demonstrate state-of-the-art performance of the proposed deep MVS system on multi-view depth estimation, with our proposed solver consistently improving the depth quality over both conventional and deep learning based MVS pipelines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_A_Confidence-Based_Iterative_Solver_of_Depths_and_Surface_Normals_for_ICCV_2021_paper.pdf", @@ -519,7 +554,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_A_Confidence-Based_Iterative_Solver_of_Depths_and_Surface_Normals_for_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_A_Confidence-Based_Iterative_Solver_of_Depths_and_Surface_Normals_for_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Wang and Liu,\n Shaohui and Wei,\n Yi and Guo,\n Hengkai and Liu,\n Yong-Jin\n},\n title = {\n A Confidence-Based Iterative Solver of Depths and Surface Normals for Deep Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6168-6177\n} \n}" }, { "title": "A Dark Flash Normal Camera", @@ -527,6 +563,7 @@ "status": "Poster", "track": "main", "pid": 9360, + "author_site": "Zhihao Xia; Jason Lawrence; Supreeth Achar", "author": "Zhihao Xia; Jason Lawrence; Supreeth Achar", "abstract": "Casual photography is often performed in uncontrolled lighting that can result in low quality images and degrade the performance of downstream processing. We consider the problem of estimating surface normal and reflectance maps of scenes depicting people despite these conditions by supplementing the available visible illumination with a single near infrared (NIR) light source and camera, a so-called \"dark flash image\". 
Our method takes as input a single color image captured under arbitrary visible lighting and a single dark flash image captured under controlled front-lit NIR lighting at the same viewpoint, and computes a normal map, a diffuse albedo map, and a specular intensity map of the scene. Since ground truth normal and reflectance maps of faces are difficult to capture, we propose a novel training technique that combines information from two readily available and complementary sources: a stereo depth signal and photometric shading cues. We evaluate our method over a range of subjects and lighting conditions and describe two applications: optimizing stereo geometry and filling the shadows in an image.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xia_A_Dark_Flash_Normal_Camera_ICCV_2021_paper.pdf", @@ -550,7 +587,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "St. Louis;Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xia_2021_ICCV,\n \n author = {\n Xia,\n Zhihao and Lawrence,\n Jason and Achar,\n Supreeth\n},\n title = {\n A Dark Flash Normal Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2430-2439\n} \n}" }, { "title": "A General Recurrent Tracking Framework Without Real Data", @@ -558,6 +596,7 @@ "status": "Poster", "track": "main", "pid": 9971, + "author_site": "Shuai Wang; Hao Sheng; Yang Zhang; Yubin Wu; Zhang Xiong", "author": "Shuai Wang; Hao Sheng; Yang Zhang; Yubin Wu; Zhang Xiong", "abstract": "Recent progress in multi-object tracking (MOT) has shown great significance of a robust scoring mechanism for potential tracks. However, the lack of available data in MOT makes it difficult to learn a general scoring mechanism. 
Multiple cues including appearance, motion and etc., are limitedly utilized in current manual scoring functions. In this paper, we propose a Multiple Nodes Tracking (MNT) framework that adapts to most trackers. Based on this framework, a Recurrent Tracking Unit (RTU) is designed to score potential tracks through long-term information. In addition, we present a method of generating simulated tracking data without real data to overcome the defect of limited available data in MOT. The experiments demonstrate that our simulated tracking data is effective for training RTU and achieves state-of-the-art performance on both MOT17 and MOT16 benchmarks. Meanwhile, RTU can be flexibly plugged into classic trackers such as DeepSORT and MHT, and makes remarkable improvements as well.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_A_General_Recurrent_Tracking_Framework_Without_Real_Data_ICCV_2021_paper.pdf", @@ -581,7 +620,8 @@ "aff_campus_unique_index": "0+1;0+1;0;0+1;0+1", "aff_campus_unique": "Beijing;Hangzhou", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Shuai and Sheng,\n Hao and Zhang,\n Yang and Wu,\n Yubin and Xiong,\n Zhang\n},\n title = {\n A General Recurrent Tracking Framework Without Real Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13219-13228\n} \n}" }, { "title": "A Hierarchical Transformation-Discriminating Generative Model for Few Shot Anomaly Detection", @@ -589,6 +629,7 @@ "status": "Poster", "track": "main", "pid": 7605, + "author_site": "Shelly Sheynin; Sagie Benaim; Lior Wolf", "author": "Shelly Sheynin; Sagie Benaim; Lior Wolf", "abstract": "Anomaly detection, the task of identifying unusual samples in data, often relies on a large set of training samples. 
In this work, we consider the setting of few-shot anomaly detection in images, where only a few images are given at training. We devise a hierarchical generative model that captures the multi-scale patch distribution of each training image. We further enhance the representation of our model by using image transformations and optimize scale-specific patch-discriminators to distinguish between real and fake patches of the image, as well as between different transformations applied to those patches. The anomaly score is obtained by aggregating the patch-based votes of the correct transformation across scales and image regions. We demonstrate the superiority of our method on both the one-shot and few-shot settings, on the datasets of Paris, CIFAR10, MNIST and FashionMNIST as well as in the setting of defect detection on MVTec. In all cases, our method outperforms the recent baseline methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sheynin_A_Hierarchical_Transformation-Discriminating_Generative_Model_for_Few_Shot_Anomaly_Detection_ICCV_2021_paper.pdf", @@ -603,7 +644,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sheynin_A_Hierarchical_Transformation-Discriminating_Generative_Model_for_Few_Shot_Anomaly_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sheynin_A_Hierarchical_Transformation-Discriminating_Generative_Model_for_Few_Shot_Anomaly_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sheynin_2021_ICCV,\n \n author = {\n Sheynin,\n Shelly and Benaim,\n Sagie and Wolf,\n Lior\n},\n title = {\n A Hierarchical Transformation-Discriminating Generative Model for Few Shot Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8495-8504\n} \n}" }, { "title": "A Hierarchical Variational Neural 
Uncertainty Model for Stochastic Video Prediction", @@ -611,6 +653,7 @@ "status": "Poster", "track": "main", "pid": 10802, + "author_site": "Moitreya Chatterjee; Narendra Ahuja; Anoop Cherian", "author": "Moitreya Chatterjee; Narendra Ahuja; Anoop Cherian", "abstract": "Predicting the future frames of a video is a challenging task, in part due to the underlying stochastic real-world phenomena. Prior approaches to solve this task typically estimate a latent prior characterizing this stochasticity, however do not account for the predictive uncertainty of the (deep learning) model. Such approaches often derive the training signal from the mean-squared error (MSE) between the generated frame and the ground truth, which can lead to sub-optimal training, especially when the predictive uncertainty is high. Towards this end, we introduce Neural Uncertainty Quantifier (NUQ) - a stochastic quantification of the model's predictive uncertainty, and use it to weigh the MSE loss. We propose a hierarchical, variational framework to derive NUQ in a principled manner using a deep, Bayesian graphical model. 
Our experiments on three benchmark stochastic video prediction datasets show that our proposed framework trains more effectively compared to the state-of-the-art models (especially when the training sets are small), while demonstrating better video generation quality and diversity against several evaluation metrics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chatterjee_A_Hierarchical_Variational_Neural_Uncertainty_Model_for_Stochastic_Video_Prediction_ICCV_2021_paper.pdf", @@ -627,14 +670,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chatterjee_A_Hierarchical_Variational_Neural_Uncertainty_Model_for_Stochastic_Video_Prediction_ICCV_2021_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Mitsubishi Electric Research Laboratories", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Mitsubishi Electric Research Laboratories", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.merl.com", "aff_unique_abbr": "UIUC;MERL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chatterjee_2021_ICCV,\n \n author = {\n Chatterjee,\n Moitreya and Ahuja,\n Narendra and Cherian,\n Anoop\n},\n title = {\n A Hierarchical Variational Neural Uncertainty Model for Stochastic Video Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9751-9761\n} \n}" }, { "title": "A Hybrid Frequency-Spatial Domain Model for Sparse Image Reconstruction in Scanning Transmission Electron Microscopy", @@ -642,6 +686,7 @@ "status": "Poster", "track": "main", "pid": 10032, + "author_site": "Bintao He; Fa Zhang; Huanshui Zhang; Renmin Han", "author": "Bintao He; Fa Zhang; 
Huanshui Zhang; Renmin Han", "abstract": "Scanning transmission electron microscopy (STEM) is a powerful technique in high-resolution atomic imaging of materials. Decreasing scanning time and reducing electron beam exposure with an acceptable signal-to-noise results are two popular research aspects when applying STEM to beam-sensitive materials. Specifically, partially sampling with fixed electron doses is one of the most important solutions, and then the lost information is restored by computational methods. Following successful applications of deep learning in image in-painting, we have developed an encoder-decoder network to reconstruct STEM images in extremely sparse sampling case. In our model, we combine both local pixel information from convolution operators and global texture features, by applying specific filter operations on frequency domain to acquire initial reconstruction and global structure prior. Our method can effectively restore texture structures and be robust in different sampling ratios with Poisson noise. A comprehensive study demonstrates that our method gains about 50% performance enhancement in comparison with the state-of-art methods. 
Code is available at https://github.com/icthrm/Sparse-Sampling-Reconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_A_Hybrid_Frequency-Spatial_Domain_Model_for_Sparse_Image_Reconstruction_in_ICCV_2021_paper.pdf", @@ -665,7 +710,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Bintao and Zhang,\n Fa and Zhang,\n Huanshui and Han,\n Renmin\n},\n title = {\n A Hybrid Frequency-Spatial Domain Model for Sparse Image Reconstruction in Scanning Transmission Electron Microscopy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2682-2691\n} \n}" }, { "title": "A Hybrid Video Anomaly Detection Framework via Memory-Augmented Flow Reconstruction and Flow-Guided Frame Prediction", @@ -673,6 +719,7 @@ "status": "Poster", "track": "main", "pid": 6834, + "author_site": "Zhian Liu; Yongwei Nie; Chengjiang Long; Qing Zhang; Guiqing Li", "author": "Zhian Liu; Yongwei Nie; Chengjiang Long; Qing Zhang; Guiqing Li", "abstract": "In this paper, we propose HF2-VAD, a Hybrid framework that integrates Flow reconstruction and Frame prediction seamlessly to handle Video Anomaly Detection. Firstly, we design the network of ML-MemAE-SC (Multi-Level Memory modules in an Autoencoder with Skip Connections) to memorize normal patterns for optical flow reconstruction so that abnormal events can be sensitively identified with larger flow reconstruction errors. More importantly, conditioned on the reconstructed flows, we then employ a Conditional Variational Autoencoder (CVAE), which captures the high correlation between video frame and optical flow, to predict the next frame given several previous frames. 
By CVAE, the quality of flow reconstruction essentially influences that of frame prediction. Therefore, poorly reconstructed optical flows of abnormal events further deteriorate the quality of the final predicted future frame, making the anomalies more detectable. Experimental results demonstrate the effectiveness of the proposed method. Code is available at https://github.com/LiUzHiAn/hf2vad.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_A_Hybrid_Video_Anomaly_Detection_Framework_via_Memory-Augmented_Flow_Reconstruction_ICCV_2021_paper.pdf", @@ -689,14 +736,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_A_Hybrid_Video_Anomaly_Detection_Framework_via_Memory-Augmented_Flow_Reconstruction_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "South China University of Technology;JD;Sun Yat-sen University", - "aff_unique_dep": "School of Computer Science and Engineering;JD Finance America Corporation;School of Computer Science and Engineering", + "aff_unique_norm": "South China University of Technology;JD Finance America Corporation;Sun Yat-sen University", + "aff_unique_dep": "School of Computer Science and Engineering;;School of Computer Science and Engineering", "aff_unique_url": "https://www.scut.edu.cn;;http://www.sysu.edu.cn", "aff_unique_abbr": "SCUT;;SYSU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zhian and Nie,\n Yongwei and Long,\n Chengjiang and Zhang,\n Qing and Li,\n Guiqing\n},\n title = {\n A Hybrid Video Anomaly Detection Framework via Memory-Augmented Flow Reconstruction and Flow-Guided Frame Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2021\n},\n pages = {\n 13588-13597\n} \n}" }, { "title": "A Latent Transformer for Disentangled Face Editing in Images and Videos", @@ -704,10 +752,11 @@ "status": "Poster", "track": "main", "pid": 7580, + "author_site": "Xu Yao; Alasdair Newson; Yann Gousseau; Pierre Hellier", "author": "Xu Yao; Alasdair Newson; Yann Gousseau; Pierre Hellier", "abstract": "High quality facial image editing is a challenging problem in the movie post-production industry, requiring a high degree of control and identity preservation. Previous works that attempt to tackle this problem may suffer from the entanglement of facial attributes and the loss of the person's identity. Furthermore, many algorithms are limited to a certain task. To tackle these limitations, we propose to edit facial attributes via the latent space of a StyleGAN generator, by training a dedicated latent transformation network and incorporating explicit disentanglement and identity preservation terms in the loss function. We further introduce a pipeline to generalize our face editing to videos. Our model achieves a disentangled, controllable, and identity-preserving facial attribute editing, even in the challenging case of real (i.e., non-synthetic) images and videos. We conduct extensive experiments on image and video datasets and show that our model outperforms other state-of-the-art methods in visual quality and quantitative evaluation. 
Source codes are available at https://github.com/InterDigitalInc/latent-transformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yao_A_Latent_Transformer_for_Disentangled_Face_Editing_in_Images_and_ICCV_2021_paper.pdf", - "aff": "LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris, France + InterDigital R&I, 975 avenue des Champs Blancs, Cesson-S \u00b4evign \u00b4e, France; LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris, France; LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris, France; InterDigital R&I, 975 avenue des Champs Blancs, Cesson-S \u00b4evign \u00b4e, France", + "aff": "LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris, France + InterDigital R&I, 975 avenue des Champs Blancs, Cesson-S ´evign ´e, France; LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris, France; LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris, France; InterDigital R&I, 975 avenue des Champs Blancs, Cesson-S ´evign ´e, France", "project": "", "github": "https://github.com/InterDigitalInc/latent-transformer", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Yao_A_Latent_Transformer_ICCV_2021_supplemental.pdf", @@ -720,14 +769,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yao_A_Latent_Transformer_for_Disentangled_Face_Editing_in_Images_and_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;1", - "aff_unique_norm": "T\u00e9l\u00e9com Paris;InterDigital R&I", + "aff_unique_norm": "Télécom Paris;InterDigital R&I", "aff_unique_dep": "LTCI;", "aff_unique_url": "https://www.telecom-paris.fr;https://www.interdigital.com", - "aff_unique_abbr": "T\u00e9l\u00e9com Paris;", + "aff_unique_abbr": "Télécom Paris;InterDigital", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Yao_2021_ICCV,\n \n author = {\n Yao,\n Xu 
and Newson,\n Alasdair and Gousseau,\n Yann and Hellier,\n Pierre\n},\n title = {\n A Latent Transformer for Disentangled Face Editing in Images and Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13789-13798\n} \n}" }, { "title": "A Lazy Approach to Long-Horizon Gradient-Based Meta-Learning", @@ -735,6 +785,7 @@ "status": "Poster", "track": "main", "pid": 9304, + "author_site": "Muhammad Abdullah Jamal; Liqiang Wang; Boqing Gong", "author": "Muhammad Abdullah Jamal; Liqiang Wang; Boqing Gong", "abstract": "Gradient-based meta-learning relates task-specific models to a meta-model by gradients. By this design, an algorithm first optimizes the task-specific models by an inner loop and then backpropagates meta-gradients through the loop to update the meta-model. The number of inner-loop optimization steps has to be small (e.g., one step) to avoid high-order derivatives, big memory footprints, and the risk of vanishing or exploding meta-gradients. We propose an intuitive teacher-student scheme to enable the gradient-based meta-learning algorithms to explore long horizons by the inner loop. The key idea is to employ a student network to adequately explore the search space of task-specific models (e.g., by more than ten steps), and a teacher then takes a \"leap\" toward the regions probed by the student. The teacher not only arrives at a high-quality model but also defines a lightweight computation graph for meta-gradients. 
Our approach is generic; it performs well when applied to four meta-learning algorithms over three tasks: few-shot learning, long-tailed classification, and meta-attack.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jamal_A_Lazy_Approach_to_Long-Horizon_Gradient-Based_Meta-Learning_ICCV_2021_paper.pdf", @@ -749,7 +800,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jamal_A_Lazy_Approach_to_Long-Horizon_Gradient-Based_Meta-Learning_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jamal_A_Lazy_Approach_to_Long-Horizon_Gradient-Based_Meta-Learning_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Jamal_2021_ICCV,\n \n author = {\n Jamal,\n Muhammad Abdullah and Wang,\n Liqiang and Gong,\n Boqing\n},\n title = {\n A Lazy Approach to Long-Horizon Gradient-Based Meta-Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6577-6586\n} \n}" }, { "title": "A Light Stage on Every Desk", @@ -757,6 +809,7 @@ "status": "Poster", "track": "main", "pid": 9062, + "author_site": "Soumyadip Sengupta; Brian Curless; Ira Kemelmacher-Shlizerman; Steven M. Seitz", "author": "Soumyadip Sengupta; Brian Curless; Ira Kemelmacher-Shlizerman; Steven M. Seitz", "abstract": "Every time you sit in front of a TV or monitor, your face is actively illuminated by time-varying patterns of light. This paper proposes to use this time-varying illumination for synthetic relighting of your face with any new illumination condition. In doing so, we take inspiration from the light stage work of Debevec et al. [4], who first demonstrated the ability to relight people captured in a controlled lighting environment. 
Whereas existing light stages require expensive, room-scale spherical capture gantries and exist in only a few labs in the world, we demonstrate how to acquire useful data from a normal TV or desktop monitor. Instead of subjecting the user to uncomfortable rapidly flashing light patterns, we operate on images of the user watching a YouTube video or other standard content. We train a deep network on images plus monitor patterns of a given user and learn to predict images of that user under any target illumination (monitor pattern). Experimental evaluation shows that our method produces realistic relighting results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sengupta_A_Light_Stage_on_Every_Desk_ICCV_2021_paper.pdf", @@ -771,7 +824,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sengupta_A_Light_Stage_on_Every_Desk_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sengupta_A_Light_Stage_on_Every_Desk_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sengupta_2021_ICCV,\n \n author = {\n Sengupta,\n Soumyadip and Curless,\n Brian and Kemelmacher-Shlizerman,\n Ira and Seitz,\n Steven M.\n},\n title = {\n A Light Stage on Every Desk\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2420-2429\n} \n}" }, { "title": "A Machine Teaching Framework for Scalable Recognition", @@ -779,6 +833,7 @@ "status": "Poster", "track": "main", "pid": 3586, + "author_site": "Pei Wang; Nuno Vasconcelos", "author": "Pei Wang; Nuno Vasconcelos", "abstract": "We consider the scalable recognition problem in the fine-grained expert domain where large-scale data collection is easy whereas annotation is difficult. Existing solutions are typically based on semi-supervised or self-supervised learning. 
We propose an alternative new framework, MEMORABLE, based on machine teaching and online crowdsourcing platforms. A small amount of data is first labeled by experts and then used to teach online annotators for the classes of interest, who finally label the entire dataset. Preliminary studies show that the accuracy of classifiers trained on the final dataset is a function of the accuracy of the student annotators. A new machine teaching algorithm, CMaxGrad, is then proposed to enhance this accuracy by introducing explanations in a state-of-the-art machine teaching algorithm. For this, CMaxGrad leverages counterfactual explanations, which take into account student predictions, thereby proving feedback that is student-specific, explicitly addresses the causes of student confusion, and adapts to the level of competence of the student. Experiments show that both MEMORABLE and CMaxGrad outperform existing solutions to their respective problems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_A_Machine_Teaching_Framework_for_Scalable_Recognition_ICCV_2021_paper.pdf", @@ -802,7 +857,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Pei and Vasconcelos,\n Nuno\n},\n title = {\n A Machine Teaching Framework for Scalable Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4945-4954\n} \n}" }, { "title": "A Multi-Mode Modulator for Multi-Domain Few-Shot Classification", @@ -810,6 +866,7 @@ "status": "Poster", "track": "main", "pid": 1608, + "author_site": "Yanbin Liu; Juho Lee; Linchao Zhu; Ling Chen; Humphrey Shi; Yi Yang", "author": "Yanbin Liu; Juho Lee; Linchao Zhu; Ling Chen; Humphrey Shi; Yi Yang", "abstract": "Most 
existing few-shot classification methods only consider generalization on one dataset (i.e., single-domain), failing to transfer across various seen and unseen domains. In this paper, we consider the more realistic multi-domain few-shot classification problem to investigate the cross-domain generalization. Two challenges exist in this new setting: (1) how to efficiently generate multi-domain feature representation, and (2) how to explore domain correlations for better cross-domain generalization. We propose a parameter-efficient multi-mode modulator to address both challenges. First, the modulator is designed to maintain multiple modulation parameters (one for each domain) in a single network, thus achieving single-network multi-domain representation. Given a particular domain, domain-aware features can be efficiently generated with the well-devised separative selection module and cooperative query module. Second, we further divide the modulation parameters into the domain-specific set and the domain-cooperative set to explore the intra-domain information and inter-domain correlations, respectively. The intra-domain information describes each domain independently to prevent negative interference. The inter-domain correlations guide information sharing among relevant domains to enrich their own representation. Moreover, unseen domains can utilize the correlations to obtain an adaptive combination of seen domains for extrapolation. 
We demonstrate that the proposed multi-mode modulator achieves state-of-the-art results on the challenging META-DATASET benchmark, especially for unseen test domains.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_A_Multi-Mode_Modulator_for_Multi-Domain_Few-Shot_Classification_ICCV_2021_paper.pdf", @@ -833,7 +890,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2+2;1;1;3+3;1", - "aff_country_unique": "China;Australia;South Korea;United States" + "aff_country_unique": "China;Australia;South Korea;United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yanbin and Lee,\n Juho and Zhu,\n Linchao and Chen,\n Ling and Shi,\n Humphrey and Yang,\n Yi\n},\n title = {\n A Multi-Mode Modulator for Multi-Domain Few-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8453-8462\n} \n}" }, { "title": "A New Journey From SDRTV to HDRTV", @@ -841,6 +899,7 @@ "status": "Poster", "track": "main", "pid": 6924, + "author_site": "Xiangyu Chen; Zhengwen Zhang; Jimmy S. Ren; Lynhoo Tian; Yu Qiao; Chao Dong", "author": "Xiangyu Chen; Zhengwen Zhang; Jimmy S. Ren; Lynhoo Tian; Yu Qiao; Chao Dong", "abstract": "Nowadays modern displays are capable to render video content with high dynamic range (HDR) and wide color gamut (WCG). However, most available resources are still in standard dynamic range (SDR). Therefore, there is an urgent demand to transform existing SDR-TV contents into their HDR-TV versions. In this paper, we conduct an analysis of SDRTV-to-HDRTV task by modeling the formation of SDRTV/HDRTV content. Base on the analysis, we propose a three-step solution pipeline including adaptive global color mapping, local enhancement and highlight generation. 
Moreover, the above analysis inspires us to present a lightweight network that utilizes global statistics as guidance to conduct image-adaptive color mapping. In addition, we construct a dataset using HDR videos in HDR10 standard, named HDRTV1K, and select five metrics to evaluate the results of SDRTV-to-HDRTV algorithms. Furthermore, our final results achieve state-of-the-art performance in quantitative comparisons and visual quality. The code and dataset are available at https://github.com/chxy95/HDRTVNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_A_New_Journey_From_SDRTV_to_HDRTV_ICCV_2021_paper.pdf", @@ -864,7 +923,8 @@ "aff_campus_unique_index": "0;0;2;0;0", "aff_campus_unique": "Shenzhen;;Shanghai", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Xiangyu and Zhang,\n Zhengwen and Ren,\n Jimmy S. and Tian,\n Lynhoo and Qiao,\n Yu and Dong,\n Chao\n},\n title = {\n A New Journey From SDRTV to HDRTV\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4500-4509\n} \n}" }, { "title": "A Robust Loss for Point Cloud Registration", @@ -872,6 +932,7 @@ "status": "Poster", "track": "main", "pid": 8097, + "author_site": "Zhi Deng; Yuxin Yao; Bailin Deng; Juyong Zhang", "author": "Zhi Deng; Yuxin Yao; Bailin Deng; Juyong Zhang", "abstract": "The performance of surface registration relies heavily on the metric used for the alignment error between the source and target shapes. Traditionally, such a metric is based on the point-to-point or point-to-plane distance from the points on the source surface to their closest points on the target surface, which is susceptible to failure due to instability of the closest-point correspondence. 
In this paper, we propose a novel metric based on the intersection points between the two shapes and a random straight line, which does not assume a specific correspondence. We verify the effectiveness of this metric by extensive experiments, including its direct optimization for a single registration problem as well as unsupervised learning for a set of registration problems. The results demonstrate that the algorithms utilizing our proposed metric outperforms the state-of-the-art optimization-based and unsupervised learning-based methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Deng_A_Robust_Loss_for_Point_Cloud_Registration_ICCV_2021_paper.pdf", @@ -895,7 +956,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Deng_2021_ICCV,\n \n author = {\n Deng,\n Zhi and Yao,\n Yuxin and Deng,\n Bailin and Zhang,\n Juyong\n},\n title = {\n A Robust Loss for Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6138-6147\n} \n}" }, { "title": "A Simple Baseline for Semi-Supervised Semantic Segmentation With Strong Data Augmentation", @@ -903,6 +965,7 @@ "status": "Poster", "track": "main", "pid": 6658, + "author_site": "Jianlong Yuan; Yifan Liu; Chunhua Shen; Zhibin Wang; Hao Li", "author": "Jianlong Yuan; Yifan Liu; Chunhua Shen; Zhibin Wang; Hao Li", "abstract": "Recently, significant progress has been made on semantic segmentation. However, the success of supervised semantic segmentation typically relies on a large amount of labeled data, which is time-consuming and costly to obtain. 
Inspired by the success of semi-supervised learning methods in image classification, here we propose a simple yet effective semi-supervised learning framework for semantic segmentation. We demonstrate that the devil is in the details: a set of simple design and training techniques can collectively improve the performance of semi-supervised semantic segmentation significantly. Previous works fail to employ strong augmentation in pseudo label learning efficiently, as the large distribution change caused by strong augmentation harms the batch normalization statistics. We design a new batch normalization, namely distribution-specific batch normalization (DSBN) to address this problem and demonstrate the importance of strong augmentation for semantic segmentation. Moreover, we design a self-correction loss which is effective in noise resistance. We conduct a series of ablation studies to show the effectiveness of each component. Our method achieves state-of-the-art results in the semi-supervised settings on the Cityscapes and Pascal VOC datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_A_Simple_Baseline_for_Semi-Supervised_Semantic_Segmentation_With_Strong_Data_ICCV_2021_paper.pdf", @@ -919,14 +982,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_A_Simple_Baseline_for_Semi-Supervised_Semantic_Segmentation_With_Strong_Data_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "Alibaba Group;University of Adelaide", + "aff_unique_norm": "Alibaba Group;The University of Adelaide", "aff_unique_dep": ";", "aff_unique_url": "https://www.alibaba.com;https://www.adelaide.edu.au", "aff_unique_abbr": "Alibaba;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Jianlong and Liu,\n 
Yifan and Shen,\n Chunhua and Wang,\n Zhibin and Li,\n Hao\n},\n title = {\n A Simple Baseline for Semi-Supervised Semantic Segmentation With Strong Data Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8229-8238\n} \n}" }, { "title": "A Simple Baseline for Weakly-Supervised Scene Graph Generation", @@ -934,6 +998,7 @@ "status": "Poster", "track": "main", "pid": 6323, + "author_site": "Jing Shi; Yiwu Zhong; Ning Xu; Yin Li; Chenliang Xu", "author": "Jing Shi; Yiwu Zhong; Ning Xu; Yin Li; Chenliang Xu", "abstract": "We investigate the weakly-supervised scene graph generation, which is a challenging task since no correspondence of label and object is provided. The previous work regards such correspondence as a latent variable which is iteratively updated via nested optimization of the scene graph generation objective. However, we further reduce the complexity by decoupling it into an efficient first-order graph matching module optimized via contrastive learning to obtain such correspondence, which is used to train a standard scene graph generation model. The extensive experiments show that such a simple pipeline can significantly surpass the previous state-of-the-art by more than 30% on the Visual Genome dataset, both in terms of graph matching accuracy and scene graph quality. 
We believe this work serves as a strong baseline for future research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shi_A_Simple_Baseline_for_Weakly-Supervised_Scene_Graph_Generation_ICCV_2021_paper.pdf", @@ -957,7 +1022,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shi_2021_ICCV,\n \n author = {\n Shi,\n Jing and Zhong,\n Yiwu and Xu,\n Ning and Li,\n Yin and Xu,\n Chenliang\n},\n title = {\n A Simple Baseline for Weakly-Supervised Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16393-16402\n} \n}" }, { "title": "A Simple Feature Augmentation for Domain Generalization", @@ -965,6 +1031,7 @@ "status": "Poster", "track": "main", "pid": 7601, + "author_site": "Pan Li; Da Li; Wei Li; Shaogang Gong; Yanwei Fu; Timothy M. Hospedales", "author": "Pan Li; Da Li; Wei Li; Shaogang Gong; Yanwei Fu; Timothy M. Hospedales", "abstract": "The topical domain generalization (DG) problem asks trained models to perform well on an unseen target domain with different data statistics from the source training domains. In computer vision, data augmentation has proven one of the most effective ways of better exploiting the source data to improve domain generalization. However, existing approaches primarily rely on image-space data augmentation, which requires careful augmentation design, and provides limited diversity of augmented data. We argue that feature augmentation is a more promising direction for DG. We find that an extremely simple technique of perturbing the feature embedding with Gaussian noise during training leads to a classifier with domain-generalization performance comparable to existing state of the art. 
To model more meaningful statistics reflective of cross-domain variability, we further estimate the full class-conditional feature covariance matrix iteratively during training. Subsequent joint stochastic feature augmentation provides an effective domain randomization method, perturbing features in the directions of intra-class/cross-domain variability. We verify our proposed method on three standard domain generalization benchmarks, Digit-DG, VLCS and PACS, and show it is outperforming or comparable to the state of the art in all setups, together with experimental analysis to illustrate how our method works towards training a robust generalisable model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_A_Simple_Feature_Augmentation_for_Domain_Generalization_ICCV_2021_paper.pdf", @@ -981,14 +1048,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_A_Simple_Feature_Augmentation_for_Domain_Generalization_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;0;0;3;2", - "aff_unique_norm": "Queen Mary University of London;Samsung;University of Edinburgh;Fudan University", + "aff_unique_norm": "Queen Mary University of London;Samsung AI Center;University of Edinburgh;Fudan University", "aff_unique_dep": ";AI Center;;", "aff_unique_url": "https://www.qmul.ac.uk;https://www.samsung.com/global/research-innovation/ai-research-centers/samsung-ai-center-cambridge/;https://www.ed.ac.uk;https://www.fudan.edu.cn", "aff_unique_abbr": "QMUL;SAC;Edinburgh;Fudan", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "London;Cambridge;", "aff_country_unique_index": "0;0+0;0;0;1;0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Pan and Li,\n Da and Li,\n Wei and Gong,\n Shaogang and Fu,\n Yanwei and Hospedales,\n Timothy M.\n},\n title = {\n A Simple Feature Augmentation for Domain Generalization\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8886-8895\n} \n}" }, { "title": "A Simple Framework for 3D Lensless Imaging With Programmable Masks", @@ -996,10 +1064,11 @@ "status": "Poster", "track": "main", "pid": 3489, + "author_site": "Yucheng Zheng; Yi Hua; Aswin C. Sankaranarayanan; M. Salman Asif", "author": "Yucheng Zheng; Yi Hua; Aswin C. Sankaranarayanan; M. Salman Asif", "abstract": "Lensless cameras provide a framework to build thin imaging systems by replacing the lens in a conventional camera with an amplitude or phase mask near the sensor. Existing methods for lensless imaging can recover the depth and intensity of the scene, but they require solving computationally-expensive inverse problems. Furthermore, existing methods struggle to recover dense scenes with large depth variations. In this paper, we propose a lensless imaging system that captures a small number of measurements using different patterns on a programmable mask. In this context, we make three contributions. First, we present a fast recovery algorithm to recover textures on a fixed number of depth planes in the scene. Second, we consider the mask design problem, for programmable lensless cameras, and provide a design template for optimizing the mask patterns with the goal of improving depth estimation. Third, we use a refinement network as a post-processing step to identify and remove artifacts in the reconstruction. 
These modifications are evaluated extensively with experimental results on a lensless camera prototype to showcase the performance benefits of the optimized masks and recovery algorithms over the state of the art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_A_Simple_Framework_for_3D_Lensless_Imaging_With_Programmable_Masks_ICCV_2021_paper.pdf", - "aff": "University of California Riverside\u2020; Carnegie Mellon University\u2021; Carnegie Mellon University\u2021; University of California Riverside\u2020", + "aff": "University of California Riverside†; Carnegie Mellon University‡; Carnegie Mellon University‡; University of California Riverside†", "project": "", "github": "", "supp": "", @@ -1019,7 +1088,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Riverside;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Yucheng and Hua,\n Yi and Sankaranarayanan,\n Aswin C. and Asif,\n M. Salman\n},\n title = {\n A Simple Framework for 3D Lensless Imaging With Programmable Masks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2603-2612\n} \n}" }, { "title": "A Style and Semantic Memory Mechanism for Domain Generalization", @@ -1027,6 +1097,7 @@ "status": "Poster", "track": "main", "pid": 10176, + "author_site": "Yang Chen; Yu Wang; Yingwei Pan; Ting Yao; Xinmei Tian; Tao Mei", "author": "Yang Chen; Yu Wang; Yingwei Pan; Ting Yao; Xinmei Tian; Tao Mei", "abstract": "Mainstream state-of-the-art domain generalization algorithms tend to prioritize the assumption on semantic invariance across domains. Meanwhile, the inherent intra-domain style invariance is usually underappreciated and put on the shelf. 
In this paper, we reveal that leveraging intra-domain style invariance is also of pivotal importance in improving the efficiency of domain generalization. We verify that it is critical for the network to be informative on what domain features are invariant and shared among instances, so that the network sharpens its understanding and improves its semantic discriminative ability. Correspondingly, we also propose a novel \"jury\" mechanism, which is particularly effective in learning useful semantic feature commonalities among domains. Our complete model called STEAM can be interpreted as a novel probabilistic graphical model, for which the implementation requires convenient constructions of two kinds of memory banks: semantic feature bank and style feature bank. Empirical results show that our proposed framework surpasses the state-of-the-art methods by clear margins.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_A_Style_and_Semantic_Memory_Mechanism_for_Domain_Generalization_ICCV_2021_paper.pdf", @@ -1043,14 +1114,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_A_Style_and_Semantic_Memory_Mechanism_for_Domain_Generalization_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0;1", - "aff_unique_norm": "University of Science and Technology of China;JD", - "aff_unique_dep": ";JD AI Research", + "aff_unique_norm": "University of Science and Technology of China;JD AI Research", + "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.jd.com", "aff_unique_abbr": "USTC;JD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Yang and Wang,\n Yu and Pan,\n Yingwei and Yao,\n Ting and Tian,\n Xinmei and Mei,\n Tao\n},\n title = {\n A Style and Semantic Memory Mechanism for Domain Generalization\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9164-9173\n} \n}" }, { "title": "A Unified 3D Human Motion Synthesis Model via Conditional Variational Auto-Encoder", @@ -1058,6 +1130,7 @@ "status": "Poster", "track": "main", "pid": 3567, + "author_site": "Yujun Cai; Yiwei Wang; Yiheng Zhu; Tat-Jen Cham; Jianfei Cai; Junsong Yuan; Jun Liu; Chuanxia Zheng; Sijie Yan; Henghui Ding; Xiaohui Shen; Ding Liu; Nadia Magnenat Thalmann", "author": "Yujun Cai; Yiwei Wang; Yiheng Zhu; Tat-Jen Cham; Jianfei Cai; Junsong Yuan; Jun Liu; Chuanxia Zheng; Sijie Yan; Henghui Ding; Xiaohui Shen; Ding Liu; Nadia Magnenat Thalmann", "abstract": "We present a unified and flexible framework to address the generalized problem of 3D motion synthesis that covers the tasks of motion prediction, completion, interpolation, and spatial-temporal recovery. Since these tasks have different input constraints and various fidelity and diversity requirements, most existing approaches only cater to a specific task or use different architectures to address various tasks. Here we propose a unified framework based on Conditional Variational Auto-Encoder (CVAE), where we treat any arbitrary input as a masked motion series. Notably, by considering this problem as a conditional generation process, we estimate a parametric distribution of the missing regions based on the input conditions, from which to sample and synthesize the full motion series. To further allow the flexibility of manipulating the motion style of the generated series, we design an Action-Adaptive Modulation (AAM) to propagate the given semantic guidance through the whole sequence. We also introduce a cross-attention mechanism to exploit distant relations among decoder and encoder features for better realism and global consistency. We conducted extensive experiments on Human 3.6M and CMU-Mocap. 
The results show that our method produces coherent and realistic results for various motion synthesis tasks, with the synthesized motions distinctly adapted by the given action labels.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cai_A_Unified_3D_Human_Motion_Synthesis_Model_via_Conditional_Variational_ICCV_2021_paper.pdf", @@ -1074,14 +1147,15 @@ "author_num": 13, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cai_A_Unified_3D_Human_Motion_Synthesis_Model_via_Conditional_Variational_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;0;3;4;5;0;6;0;2;2;7", - "aff_unique_norm": "Nanyang Technological University;National University of Singapore;ByteDance;Monash University;State University of New York at Buffalo;Singapore University of Technology and Design;Chinese University of Hong Kong;University of Geneva", + "aff_unique_norm": "Nanyang Technological University;National University of Singapore;ByteDance;Monash University;State University of New York at Buffalo;Singapore University of Technology and Design;The Chinese University of Hong Kong;University of Geneva", "aff_unique_dep": ";;Research;;;;;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.nus.edu.sg;https://www.bytedance.com;https://www.monash.edu;https://www.buffalo.edu;https://www.sutd.edu.sg;https://www.cuhk.edu.hk;https://www.unige.ch", "aff_unique_abbr": "NTU;NUS;ByteDance;Monash;SUNY Buffalo;SUTD;CUHK;UNIGE", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Buffalo;Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;2;3;0;0;1;0;1;1;4", - "aff_country_unique": "Singapore;China;Australia;United States;Switzerland" + "aff_country_unique": "Singapore;China;Australia;United States;Switzerland", + "bibtex": "@InProceedings{Cai_2021_ICCV,\n \n author = {\n Cai,\n Yujun and Wang,\n Yiwei and Zhu,\n Yiheng and Cham,\n Tat-Jen and Cai,\n Jianfei and Yuan,\n Junsong and Liu,\n Jun and Zheng,\n Chuanxia and Yan,\n Sijie and Ding,\n Henghui and Shen,\n Xiaohui and 
Liu,\n Ding and Thalmann,\n Nadia Magnenat\n},\n title = {\n A Unified 3D Human Motion Synthesis Model via Conditional Variational Auto-Encoder\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11645-11655\n} \n}" }, { "title": "A Unified Objective for Novel Class Discovery", @@ -1089,10 +1163,11 @@ "status": "Poster", "track": "main", "pid": 3938, - "author": "Enrico Fini; Enver Sangineto; St\u00e9phane Lathuili\u00e8re; Zhun Zhong; Moin Nabi; Elisa Ricci", + "author_site": "Enrico Fini; Enver Sangineto; Stéphane Lathuilière; Zhun Zhong; Moin Nabi; Elisa Ricci", + "author": "Enrico Fini; Enver Sangineto; Stéphane Lathuilière; Zhun Zhong; Moin Nabi; Elisa Ricci", "abstract": "In this paper, we study the problem of Novel Class Discovery (NCD). NCD aims at inferring novel object categories in an unlabeled set by leveraging from prior knowledge of a labeled set containing different, but related classes. Existing approaches tackle this problem by considering multiple objective functions, usually involving specialized loss terms for the labeled and the unlabeled samples respectively, and often requiring auxiliary regularization terms. In this paper, we depart from this traditional scheme and introduce a UNified Objective function (UNO) for discovering novel classes, with the explicit purpose of favoring synergy between supervised and unsupervised learning. Using a multi-view self-labeling strategy, we generate pseudo-labels that can be treated homogeneously with ground truth labels. This leads to a single classification objective operating on both known and unknown classes. Despite its simplicity, UNO outperforms the state of the art by a significant margin on several benchmarks (approximately +10% on CIFAR-100 and +8% on ImageNet). Our source code will be publicly available. 
The project page is available at: https://ncd-uno.github.io.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fini_A_Unified_Objective_for_Novel_Class_Discovery_ICCV_2021_paper.pdf", - "aff": "University of Trento, Trento, Italy; University of Trento, Trento, Italy; LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris, France; University of Trento, Trento, Italy + Fondazione Bruno Kessler, Trento, Italy; SAP AI Research, Berlin, Germany; University of Trento, Trento, Italy + Fondazione Bruno Kessler, Trento, Italy", + "aff": "University of Trento, Trento, Italy; University of Trento, Trento, Italy; LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris, France; University of Trento, Trento, Italy + Fondazione Bruno Kessler, Trento, Italy; SAP AI Research, Berlin, Germany; University of Trento, Trento, Italy + Fondazione Bruno Kessler, Trento, Italy", "project": "https://ncd-uno.github.io", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Fini_A_Unified_Objective_ICCV_2021_supplemental.pdf", @@ -1105,14 +1180,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fini_A_Unified_Objective_for_Novel_Class_Discovery_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0+2;3;0+2", - "aff_unique_norm": "University of Trento;T\u00e9l\u00e9com Paris;Fondazione Bruno Kessler;SAP AI Research", + "aff_unique_norm": "University of Trento;Télécom Paris;Fondazione Bruno Kessler;SAP AI Research", "aff_unique_dep": ";LTCI;;AI Research", "aff_unique_url": "https://www.unitn.it;https://www.telecom-paris.fr;https://www.fbk.eu;https://www.sap.com", - "aff_unique_abbr": "UniTN;T\u00e9l\u00e9com Paris;FBK;SAP", + "aff_unique_abbr": "UniTN;Télécom Paris;FBK;SAP", "aff_campus_unique_index": "0;0;0+0;2;0+0", "aff_campus_unique": "Trento;;Berlin", "aff_country_unique_index": "0;0;1;0+0;2;0+0", - "aff_country_unique": "Italy;France;Germany" + "aff_country_unique": "Italy;France;Germany", + "bibtex": 
"@InProceedings{Fini_2021_ICCV,\n \n author = {\n Fini,\n Enrico and Sangineto,\n Enver and Lathuili\\`ere,\n St\\'ephane and Zhong,\n Zhun and Nabi,\n Moin and Ricci,\n Elisa\n},\n title = {\n A Unified Objective for Novel Class Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9284-9292\n} \n}" }, { "title": "A Weakly Supervised Amodal Segmenter With Boundary Uncertainty Estimation", @@ -1120,6 +1196,7 @@ "status": "Poster", "track": "main", "pid": 4365, + "author_site": "Khoi Nguyen; Sinisa Todorovic", "author": "Khoi Nguyen; Sinisa Todorovic", "abstract": "This paper addresses weakly supervised amodal instance segmentation, where the goal is to segment both visible and occluded (amodal) object parts, while training provides only ground-truth visible (modal) segmentations. Following prior work, we use data manipulation to generate occlusions in training images and thus train a segmenter to predict amodal segmentations of the manipulated data. The resulting predictions on training images are taken as the pseudo-ground truth for the standard training of Mask-RCNN, which we use for amodal instance segmentation of test images. For generating the pseudo-ground truth, we specify a new Amodal Segmenter based on Boundary Uncertainty estimation (ASBU) and make two contributions. First, while prior work uses the occluder's mask, our ASBU uses the occlusion boundary as input. Second, ASBU estimates an uncertainty map of the prediction. The estimated uncertainty regularizes learning such that lower segmentation loss is incurred on regions with high uncertainty. 
ASBU achieves significant performance improvement relative to the state of the art on the COCOA and KINS datasets in three tasks: amodal instance segmentation, amodal completion, and ordering recovery.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nguyen_A_Weakly_Supervised_Amodal_Segmenter_With_Boundary_Uncertainty_Estimation_ICCV_2021_paper.pdf", @@ -1134,7 +1211,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Nguyen_A_Weakly_Supervised_Amodal_Segmenter_With_Boundary_Uncertainty_Estimation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Nguyen_A_Weakly_Supervised_Amodal_Segmenter_With_Boundary_Uncertainty_Estimation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Nguyen_2021_ICCV,\n \n author = {\n Nguyen,\n Khoi and Todorovic,\n Sinisa\n},\n title = {\n A Weakly Supervised Amodal Segmenter With Boundary Uncertainty Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7396-7405\n} \n}" }, { "title": "A-SDF: Learning Disentangled Signed Distance Functions for Articulated Shape Representation", @@ -1142,6 +1220,7 @@ "status": "Poster", "track": "main", "pid": 7079, + "author_site": "Jiteng Mu; Weichao Qiu; Adam Kortylewski; Alan Yuille; Nuno Vasconcelos; Xiaolong Wang", "author": "Jiteng Mu; Weichao Qiu; Adam Kortylewski; Alan Yuille; Nuno Vasconcelos; Xiaolong Wang", "abstract": "Recent work has made significant progress on using implicit functions, as a continuous representation for 3D rigid object shape reconstruction. However, much less effort has been devoted to modeling general articulated objects. Compared to rigid objects, articulated objects have higher degrees of freedom, which makes it hard to generalize to unseen shapes. 
To deal with the large shape variance, we introduce Articulated Signed Distance Functions (A-SDF) to represent articulated shapes with a disentangled latent space, where we have separate codes for encoding shape and articulation. With this disentangled continuous representation, we demonstrate that we can control the articulation input and animate unseen instances with unseen joint angles. Furthermore, we propose a Test-Time Adaptation inference algorithm to adjust our model during inference. We demonstrate our model generalize well to out-of-distribution and unseen data, e.g., partial point clouds and real-world depth images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mu_A-SDF_Learning_Disentangled_Signed_Distance_Functions_for_Articulated_Shape_Representation_ICCV_2021_paper.pdf", @@ -1156,7 +1235,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mu_A-SDF_Learning_Disentangled_Signed_Distance_Functions_for_Articulated_Shape_Representation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mu_A-SDF_Learning_Disentangled_Signed_Distance_Functions_for_Articulated_Shape_Representation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Mu_2021_ICCV,\n \n author = {\n Mu,\n Jiteng and Qiu,\n Weichao and Kortylewski,\n Adam and Yuille,\n Alan and Vasconcelos,\n Nuno and Wang,\n Xiaolong\n},\n title = {\n A-SDF: Learning Disentangled Signed Distance Functions for Articulated Shape Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13001-13011\n} \n}" }, { "title": "AA-RMVSNet: Adaptive Aggregation Recurrent Multi-View Stereo Network", @@ -1164,6 +1244,7 @@ "status": "Poster", "track": "main", "pid": 8900, + "author_site": "Zizhuang Wei; Qingtian Zhu; Chen Min; Yisong Chen; Guoping Wang", "author": "Zizhuang Wei; 
Qingtian Zhu; Chen Min; Yisong Chen; Guoping Wang", "abstract": "In this paper, we present a novel recurrent multi-view stereo network based on long short-term memory (LSTM) with adaptive aggregation, namely AA-RMVSNet. We firstly introduce an intra-view aggregation module to adaptively extract image features by using context-aware convolution and multi-scale aggregation, which efficiently improves the performance on challenging regions, such as thin objects and large low-textured surfaces. To overcome the difficulty of varying occlusion in complex scenes, we propose an inter-view cost volume aggregation module for adaptive pixel-wise view aggregation, which is able to preserve better-matched pairs among all views. The two proposed adaptive aggregation modules are lightweight, effective and complementary regarding improving the accuracy and completeness of 3D reconstruction. Instead of conventional 3D CNNs, we utilize a hybrid network with recurrent structure for cost volume regularization, which allows high-resolution reconstruction and finer hypothetical plane sweep. The proposed network is trained end-to-end and achieves excellent performance on various datasets. It ranks 1st among all submissions on Tanks and Temples benchmark and achieves competitive results on DTU dataset, which exhibits strong generalizability and robustness. 
Implementation of our method is available at https://github.com/QT-Zhu/AA-RMVSNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wei_AA-RMVSNet_Adaptive_Aggregation_Recurrent_Multi-View_Stereo_Network_ICCV_2021_paper.pdf", @@ -1187,7 +1268,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2021_ICCV,\n \n author = {\n Wei,\n Zizhuang and Zhu,\n Qingtian and Min,\n Chen and Chen,\n Yisong and Wang,\n Guoping\n},\n title = {\n AA-RMVSNet: Adaptive Aggregation Recurrent Multi-View Stereo Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6187-6196\n} \n}" }, { "title": "ACAV100M: Automatic Curation of Large-Scale Datasets for Audio-Visual Video Representation Learning", @@ -1195,6 +1277,7 @@ "status": "Poster", "track": "main", "pid": 7671, + "author_site": "Sangho Lee; Jiwan Chung; Youngjae Yu; Gunhee Kim; Thomas Breuel; Gal Chechik; Yale Song", "author": "Sangho Lee; Jiwan Chung; Youngjae Yu; Gunhee Kim; Thomas Breuel; Gal Chechik; Yale Song", "abstract": "The natural association between visual observations and their corresponding sound provides powerful self-supervisory signals for learning video representations, which makes the ever-growing amount of online videos an attractive source of training data. However, large portions of online videos contain irrelevant audio-visual signals because of edited/overdubbed audio, and models trained on such uncurated videos have shown to learn suboptimal representations. Therefore, existing self-supervised approaches rely on datasets with predetermined taxonomies of semantic concepts, where there is a high chance of audio-visual correspondence. 
Unfortunately, constructing such datasets require labor intensive manual annotation and/or verification, which severely limits the utility of online videos for large-scale learning. In this work, we present an automatic dataset curation approach based on subset optimization where the objective is to maximize the mutual information between audio and visual channels in videos. We demonstrate that our approach finds videos with high audio-visual correspondence and show that self-supervised models trained on our data achieve competitive performances compared to models trained on existing manually curated datasets. The most significant benefit of our approach is scalability: We release ACAV100M that contains 100 million videos with high audio-visual correspondence, ideal for self-supervised video representation learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_ACAV100M_Automatic_Curation_of_Large-Scale_Datasets_for_Audio-Visual_Video_Representation_ICCV_2021_paper.pdf", @@ -1211,14 +1294,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_ACAV100M_Automatic_Curation_of_Large-Scale_Datasets_for_Audio-Visual_Video_Representation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;1;2", - "aff_unique_norm": "Seoul National University;NVIDIA;Microsoft", + "aff_unique_norm": "Seoul National University;NVIDIA Corporation;Microsoft Corporation", "aff_unique_dep": ";NVIDIA Research;Microsoft Research", "aff_unique_url": "https://www.snu.ac.kr;https://www.nvidia.com/research;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "SNU;NVIDIA;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;1;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Sangho and Chung,\n Jiwan and Yu,\n Youngjae and Kim,\n Gunhee and Breuel,\n Thomas and 
Chechik,\n Gal and Song,\n Yale\n},\n title = {\n ACAV100M: Automatic Curation of Large-Scale Datasets for Audio-Visual Video Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10274-10284\n} \n}" }, { "title": "ACDC: The Adverse Conditions Dataset With Correspondences for Semantic Driving Scene Understanding", @@ -1226,6 +1310,7 @@ "status": "Poster", "track": "main", "pid": 1741, + "author_site": "Christos Sakaridis; Dengxin Dai; Luc Van Gool", "author": "Christos Sakaridis; Dengxin Dai; Luc Van Gool", "abstract": "Level 5 autonomy for self-driving cars requires a robust visual perception system that can parse input images under any visual condition. However, existing semantic segmentation datasets are either dominated by images captured under normal conditions or are small in scale. To address this, we introduce ACDC, the Adverse Conditions Dataset with Correspondences for training and testing semantic segmentation methods on adverse visual conditions. ACDC consists of a large set of 4006 images which are equally distributed between four common adverse conditions: fog, nighttime, rain, and snow. Each adverse-condition image comes with a high-quality fine pixel-level semantic annotation, a corresponding image of the same scene taken under normal conditions, and a binary mask that distinguishes between intra-image regions of clear and uncertain semantic content. Thus, ACDC supports both standard semantic segmentation and the newly introduced uncertainty-aware semantic segmentation. A detailed empirical study demonstrates the challenges that the adverse domains of ACDC pose to state-of-the-art supervised and unsupervised approaches and indicates the value of our dataset in steering future progress in the field. 
Our dataset and benchmark are publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sakaridis_ACDC_The_Adverse_Conditions_Dataset_With_Correspondences_for_Semantic_Driving_ICCV_2021_paper.pdf", @@ -1240,7 +1325,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sakaridis_ACDC_The_Adverse_Conditions_Dataset_With_Correspondences_for_Semantic_Driving_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sakaridis_ACDC_The_Adverse_Conditions_Dataset_With_Correspondences_for_Semantic_Driving_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sakaridis_2021_ICCV,\n \n author = {\n Sakaridis,\n Christos and Dai,\n Dengxin and Van Gool,\n Luc\n},\n title = {\n ACDC: The Adverse Conditions Dataset With Correspondences for Semantic Driving Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10765-10775\n} \n}" }, { "title": "ACE: Ally Complementary Experts for Solving Long-Tailed Recognition in One-Shot", @@ -1248,6 +1334,7 @@ "status": "Poster", "track": "main", "pid": 2705, + "author_site": "Jiarui Cai; Yizhou Wang; Jenq-Neng Hwang", "author": "Jiarui Cai; Yizhou Wang; Jenq-Neng Hwang", "abstract": "One-stage long-tailed recognition methods improve the overall performance in a \"seesaw\" manner, i.e., either sacrifice the head's accuracy for better tail classification or elevate the head's accuracy even higher but ignore the tail. Existing algorithms bypass such trade-off by a multi-stage training process: pre-training on imbalanced set and fine-tuning on balanced set. 
Though achieving promising performance, not only are they sensitive to the generalizability of the pre-trained model, but also not easily integrated into other computer vision tasks like detection and segmentation, where pre-training of classifier solely is not applicable. In this paper, we propose a one-stage long-tailed recognition scheme, ally complementary experts (ACE), where the expert is the most knowledgeable specialist in a sub-set that dominates its training, and is complementary to other experts in the less-seen categories without disturbed by what it has never seen. We design a distribution-adaptive optimizer to adjust the learning pace of each expert to avoid over-fitting. Without special bells and whistles, the vanilla ACE outperforms the current one-stage SOTA method by 3 10% on CIFAR10-LT, CIFAR100-LT, ImageNet-LT and iNaturalist datasets. It is also shown to be the first one to break the \"seesaw\" trade-off by improving the accuracy of the majority and minority categories simultaneously in only one stage.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cai_ACE_Ally_Complementary_Experts_for_Solving_Long-Tailed_Recognition_in_One-Shot_ICCV_2021_paper.pdf", @@ -1271,7 +1358,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cai_2021_ICCV,\n \n author = {\n Cai,\n Jiarui and Wang,\n Yizhou and Hwang,\n Jenq-Neng\n},\n title = {\n ACE: Ally Complementary Experts for Solving Long-Tailed Recognition in One-Shot\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 112-121\n} \n}" }, { "title": "AD-NeRF: Audio Driven Neural Radiance Fields for Talking Head Synthesis", @@ -1279,6 +1367,7 @@ "status": "Poster", "track": "main", "pid": 6986, + "author_site": "Yudong Guo; Keyu 
Chen; Sen Liang; Yong-Jin Liu; Hujun Bao; Juyong Zhang", "author": "Yudong Guo; Keyu Chen; Sen Liang; Yong-Jin Liu; Hujun Bao; Juyong Zhang", "abstract": "Generating high-fidelity talking head video by fitting with the input audio sequence is a challenging problem that receives considerable attentions recently. In this paper, we address this problem with the aid of neural scene representation networks. Our method is completely different from existing methods that rely on intermediate representations like 2D landmarks or 3D face models to bridge the gap between audio input and video output. Specifically, the feature of input audio signal is directly fed into a conditional implicit function to generate a dynamic neural radiance field, from which a high-fidelity talking-head video corresponding to the audio signal is synthesized using volume rendering. Another advantage of our framework is that not only the head (with hair) region is synthesized as previous methods did, but also the upper body is generated via two individual neural radiance fields. Experimental results demonstrate that our novel framework can (1) produce high-fidelity and natural results, and (2) support free adjustment of audio signals, viewing directions, and background images. 
Code is available at https://github.com/YudongGuo/AD-NeRF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_AD-NeRF_Audio_Driven_Neural_Radiance_Fields_for_Talking_Head_Synthesis_ICCV_2021_paper.pdf", @@ -1302,7 +1391,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Yudong and Chen,\n Keyu and Liang,\n Sen and Liu,\n Yong-Jin and Bao,\n Hujun and Zhang,\n Juyong\n},\n title = {\n AD-NeRF: Audio Driven Neural Radiance Fields for Talking Head Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5784-5794\n} \n}" }, { "title": "ADNet: Leveraging Error-Bias Towards Normal Direction in Face Alignment", @@ -1310,6 +1400,7 @@ "status": "Poster", "track": "main", "pid": 7732, + "author_site": "Yangyu Huang; Hao Yang; Chong Li; Jongyoo Kim; Fangyun Wei", "author": "Yangyu Huang; Hao Yang; Chong Li; Jongyoo Kim; Fangyun Wei", "abstract": "The recent progress of CNN has dramatically improved face alignment performance. However, few works have paid attention to the error-bias with respect to error distribution of facial landmarks. In this paper, we investigate the error-bias issue in face alignment, where the distributions of landmark errors tend to spread along the tangent line to landmark curves. This error-bias is not trivial since it is closely connected to the ambiguous landmark labeling task. Inspired by this observation, we seek a way to leverage the error-bias property for better convergence of CNN model. To this end, we propose anisotropic direction loss (ADL) and anisotropic attention module (AAM) for coordinate and heatmap regression, respectively. 
ADL imposes strong binding force in normal direction for each landmark point on facial boundaries. On the other hand, AAM is an attention module which can get anisotropic attention mask focusing on the region of point and its local edge connected by adjacent points, it has a stronger response in tangent than in normal, which means relaxed constraints in the tangent. These two methods work in a complementary manner to learn both facial structures and texture details. Finally, we integrate them into an optimized end-to-end training pipeline named ADNet. Our ADNet achieves state-of-the-art results on 300W, WFLW and COFW datasets, which demonstrates the effectiveness and robustness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_ADNet_Leveraging_Error-Bias_Towards_Normal_Direction_in_Face_Alignment_ICCV_2021_paper.pdf", @@ -1326,14 +1417,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_ADNet_Leveraging_Error-Bias_Towards_Normal_Direction_in_Face_Alignment_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Research", "aff_unique_dep": "Research", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "MSR Asia", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Asia", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Yangyu and Yang,\n Hao and Li,\n Chong and Kim,\n Jongyoo and Wei,\n Fangyun\n},\n title = {\n ADNet: Leveraging Error-Bias Towards Normal Direction in Face Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3080-3090\n} \n}" }, { "title": "AESOP: Abstract Encoding of Stories, Objects, and Pictures", @@ -1341,6 +1433,7 @@ 
"status": "Poster", "track": "main", "pid": 10863, + "author_site": "Hareesh Ravi; Kushal Kafle; Scott Cohen; Jonathan Brandt; Mubbasir Kapadia", "author": "Hareesh Ravi; Kushal Kafle; Scott Cohen; Jonathan Brandt; Mubbasir Kapadia", "abstract": "Visual storytelling and story comprehension are uniquely human skills that play a central role in how we learn about and experience the world. Despite remarkable progress in recent years in synthesis of visual and textual content in isolation and learning effective joint visual-linguistic representations, existing systems still operate only at a superficial, factual level. With the goal of developing systems that are able to comprehend rich human-generated narratives, and co-create new stories, we introduce AESOP: a new dataset that captures the creative process associated with visual storytelling. Visual panels are composed of clip-art objects with specific attributes enabling a broad range of creative expression. Using AESOP, we propose foundational storytelling tasks that are generative variants of story cloze tests, to better measure the creative and causal reasoning ability required for visual storytelling. We further develop a generalized story completion framework that models stories as the co-evolution of visual and textual concepts. We benchmark the proposed approach with human baselines and evaluate using comprehensive qualitative and quantitative metrics. 
Our results highlight key insights related to the dataset, modelling and evaluation of visual storytelling for future research in this promising field of study.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ravi_AESOP_Abstract_Encoding_of_Stories_Objects_and_Pictures_ICCV_2021_paper.pdf", @@ -1364,7 +1457,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ravi_2021_ICCV,\n \n author = {\n Ravi,\n Hareesh and Kafle,\n Kushal and Cohen,\n Scott and Brandt,\n Jonathan and Kapadia,\n Mubbasir\n},\n title = {\n AESOP: Abstract Encoding of Stories,\n Objects,\n and Pictures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2052-2063\n} \n}" }, { "title": "AGKD-BML: Defense Against Adversarial Attack by Attention Guided Knowledge Distillation and Bi-Directional Metric Learning", @@ -1372,6 +1466,7 @@ "status": "Poster", "track": "main", "pid": 8066, + "author_site": "Hong Wang; Yuefan Deng; Shinjae Yoo; Haibin Ling; Yuewei Lin", "author": "Hong Wang; Yuefan Deng; Shinjae Yoo; Haibin Ling; Yuewei Lin", "abstract": "While deep neural networks have shown impressive performance in many tasks, they are fragile to carefully designed adversarial attacks. We propose a novel adversarial training-based model by Attention Guided Knowledge Distillation and Bi-directional Metric Learning (AGKD-BML). The attention knowledge is obtained from a weight-fixed model trained on a clean dataset, referred to as a teacher model, and transferred to a model that is under training on adversarial examples (AEs), referred to as a student model. 
In this way, the student model is able to focus on the correct region, as well as correcting the intermediate features corrupted by AEs to eventually improve the model accuracy. Moreover, to efficiently regularize the representation in feature space, we propose a bidirectional metric learning. Specifically, given a clean image, it is first attacked to its most confusing class to get the forward AE. A clean image in the most confusing class is then randomly picked and attacked back to the original class to get the backward AE. A triplet loss is then used to shorten the representation distance between original image and its AE, while enlarge that between the forward and backward AEs. We conduct extensive adversarial robustness experiments on two widely used datasets with different attacks. Our proposed AGKD-BML model consistently outperforms the state-of-the-art approaches. The code of AGKD-BML will be available at: https://github.com/hongw579/AGKD-BML.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_AGKD-BML_Defense_Against_Adversarial_Attack_by_Attention_Guided_Knowledge_Distillation_ICCV_2021_paper.pdf", @@ -1395,7 +1490,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Hong and Deng,\n Yuefan and Yoo,\n Shinjae and Ling,\n Haibin and Lin,\n Yuewei\n},\n title = {\n AGKD-BML: Defense Against Adversarial Attack by Attention Guided Knowledge Distillation and Bi-Directional Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7658-7667\n} \n}" }, { "title": "AI Choreographer: Music Conditioned 3D Dance Generation With AIST++", @@ -1403,6 +1499,7 @@ "status": "Poster", "track": "main", "pid": 8465, + "author_site": 
"Ruilong Li; Shan Yang; David A. Ross; Angjoo Kanazawa", "author": "Ruilong Li; Shan Yang; David A. Ross; Angjoo Kanazawa", "abstract": "We present AIST++, a new multi-modal dataset of 3D dance motion and music, along with FACT, a Full-Attention Cross-modal Transformer network for generating 3D dance motion conditioned on music. The proposed AIST++ dataset contains 1.1M frames of 3D dance motion in 1408 sequences, covering 10 dance genres with multi-view videos with known camera poses---the largest dataset of this kind to our knowledge. We show that naively applying sequence models such as transformers to this dataset for the task of music conditioned 3D motion generation does not produce satisfactory 3D motion that is well correlated with the input music. We overcome these shortcomings by introducing key changes in its architecture design and supervision: FACT model involves a deep cross-modal transformer block with full-attention that is trained to predict N future motions. We empirically show that these changes are key factors in generating long sequences of realistic dance motion that are well-attuned to the input music. We conduct extensive experiments on AIST++ with user studies, where our method outperforms recent state-of-the-art methods both qualitatively and quantitatively. 
The code and the dataset can be found at: https://google.github.io/aichoreographer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_AI_Choreographer_Music_Conditioned_3D_Dance_Generation_With_AIST_ICCV_2021_paper.pdf", @@ -1417,7 +1514,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_AI_Choreographer_Music_Conditioned_3D_Dance_Generation_With_AIST_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_AI_Choreographer_Music_Conditioned_3D_Dance_Generation_With_AIST_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Ruilong and Yang,\n Shan and Ross,\n David A. and Kanazawa,\n Angjoo\n},\n title = {\n AI Choreographer: Music Conditioned 3D Dance Generation With AIST++\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13401-13412\n} \n}" }, { "title": "AINet: Association Implantation for Superpixel Segmentation", @@ -1425,10 +1523,11 @@ "status": "Poster", "track": "main", "pid": 1940, + "author_site": "Yaxiong Wang; Yunchao Wei; Xueming Qian; Li Zhu; Yi Yang", "author": "Yaxiong Wang; Yunchao Wei; Xueming Qian; Li Zhu; Yi Yang", "abstract": "Recently, some approaches are proposed to harness deep convolutional networks to facilitate superpixel segmentation. The common practice is to first evenly divide the image into a pre-defined number of grids and then learn to associate each pixel with its surrounding grids. However, simply applying a series of convolution operations with limited receptive fields can only implicitly perceive the relations between the pixel and its surrounding grids. Consequently, existing methods often fail to provide an effective context when inferring the association map. 
To remedy this issue, we propose a novel Association Implantation (AI) module to enable the network to explicitly capture the relations between the pixel and its surrounding grids. The proposed AI module directly implants the features of grid cells to the surrounding of its corresponding central pixel, and conducts convolution on the padded window to adaptively transfer knowledge between them. With such an implantation operation, the network could explicitly harvest the pixel-grid level context, which is more in line with the target of superpixel segmentation comparing to the pixel-wise relation. Furthermore, to pursue better boundary precision, we design a boundary-perceiving loss to help the network discriminate the pixels around boundaries in hidden feature level, which could benefit the subsequent inferring modules to accurately identify more boundary pixels. Extensive experiments on BSDS500 and NYUv2 datasets show that our method could not only achieve state-of-the-art performance but maintain satisfactory inference efficiency. 
Code and pre-trained model are available at https://github.com/wangyxxjtu/AINet-ICCV2021.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_AINet_Association_Implantation_for_Superpixel_Segmentation_ICCV_2021_paper.pdf", - "aff": "Xi\u2019an Jiaotong University+Baidu Research; Beijing Jiaotong University; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; Zhejiang University", + "aff": "Xi’an Jiaotong University+Baidu Research; Beijing Jiaotong University; Xi’an Jiaotong University; Xi’an Jiaotong University; Zhejiang University", "project": "", "github": "https://github.com/wangyxxjtu/AINet-ICCV2021", "supp": "", @@ -1441,14 +1540,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_AINet_Association_Implantation_for_Superpixel_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;0;0;3", - "aff_unique_norm": "Xi'an Jiao Tong University;Baidu;Beijing Jiao Tong University;Zhejiang University", + "aff_unique_norm": "Xi'an Jiaotong University;Baidu;Beijing Jiaotong University;Zhejiang University", "aff_unique_dep": ";Baidu Research;;", "aff_unique_url": "https://www.xjtu.edu.cn;https://research.baidu.com;http://www.njtu.edu.cn/en;https://www.zju.edu.cn", "aff_unique_abbr": "XJTU;Baidu;BJTU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yaxiong and Wei,\n Yunchao and Qian,\n Xueming and Zhu,\n Li and Yang,\n Yi\n},\n title = {\n AINet: Association Implantation for Superpixel Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7078-7087\n} \n}" }, { "title": "ALADIN: All Layer Adaptive Instance Normalization for Fine-Grained Style Similarity", @@ -1456,6 +1556,7 @@ "status": 
"Poster", "track": "main", "pid": 4339, + "author_site": "Dan Ruta; Saeid Motiian; Baldo Faieta; Zhe Lin; Hailin Jin; Alex Filipkowski; Andrew Gilbert; John Collomosse", "author": "Dan Ruta; Saeid Motiian; Baldo Faieta; Zhe Lin; Hailin Jin; Alex Filipkowski; Andrew Gilbert; John Collomosse", "abstract": "We present ALADIN (All Layer AdaIN); a novel architecture for searching images based on the similarity of their artistic style. Representation learning is critical to visual search, where distance in the learned search embedding reflects image similarity. Learning an embedding that discriminates fine-grained variations in style is hard, due to the difficulty of defining and labelling style. ALADIN takes a weakly supervised approach to learning a representation for fine-grained style similarity of digital artworks, leveraging BAM-FG, a novel large-scale dataset of user generated content groupings gathered from the web. ALADIN sets a new state of the art accuracy for style-based visual search over both coarse labelled style data (BAM) and BAM-FG; a new 2.62 million image dataset of 310,000 fine-grained style groupings also contributed by this work.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ruta_ALADIN_All_Layer_Adaptive_Instance_Normalization_for_Fine-Grained_Style_Similarity_ICCV_2021_paper.pdf", @@ -1470,7 +1571,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ruta_ALADIN_All_Layer_Adaptive_Instance_Normalization_for_Fine-Grained_Style_Similarity_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ruta_ALADIN_All_Layer_Adaptive_Instance_Normalization_for_Fine-Grained_Style_Similarity_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ruta_2021_ICCV,\n \n author = {\n Ruta,\n Dan and Motiian,\n Saeid and Faieta,\n Baldo and Lin,\n Zhe and Jin,\n Hailin and Filipkowski,\n Alex and Gilbert,\n Andrew and Collomosse,\n John\n},\n title = 
{\n ALADIN: All Layer Adaptive Instance Normalization for Fine-Grained Style Similarity\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11926-11935\n} \n}" }, { "title": "ALL Snow Removed: Single Image Desnowing Algorithm Using Hierarchical Dual-Tree Complex Wavelet Representation and Contradict Channel Loss", @@ -1478,6 +1580,7 @@ "status": "Poster", "track": "main", "pid": 8547, + "author_site": "Wei-Ting Chen; Hao-Yu Fang; Cheng-Lin Hsieh; Cheng-Che Tsai; I-Hsiang Chen; Jian-Jiun Ding; Sy-Yen Kuo", "author": "Wei-Ting Chen; Hao-Yu Fang; Cheng-Lin Hsieh; Cheng-Che Tsai; I-Hsiang Chen; Jian-Jiun Ding; Sy-Yen Kuo", "abstract": "Snow is a highly complicated atmospheric phenomenon that usually contains snowflake, snow streak, and veiling effect (similar to the haze or the mist). In this literature, we propose a single image desnowing algorithm to address the diversity of snow particles in shape and size. First, to better represent the complex snow shape, we apply the dual-tree wavelet transform and propose a complex wavelet loss in the network. Second, we propose a hierarchical decomposition paradigm in our network for better understanding the different sizes of snow particles. Last, we propose a novel feature called the contradict channel (CC) for the snow scenes. We find that the regions containing the snow particles tend to have higher intensity in the CC than that in the snow-free regions. We leverage this discriminative feature to construct the contradict channel loss for improving the performance of snow removal. Moreover, due to the limitation of existing snow datasets, to simulate the snow scenarios comprehensively, we propose a large-scale dataset called Comprehensive Snow Dataset (CSD). Experimental results show that the proposed method can favorably outperform existing methods in three synthetic datasets and real-world datasets. 
The code and dataset are released in https://github.com/weitingchen83/ICCV2021-Single-Image-Desnowing-HDCWNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_ALL_Snow_Removed_Single_Image_Desnowing_Algorithm_Using_Hierarchical_Dual-Tree_ICCV_2021_paper.pdf", @@ -1496,12 +1599,13 @@ "aff_unique_index": "0+1;0;0;0;0;0;0", "aff_unique_norm": "National Taiwan University;ASUS Intelligent Cloud Services", "aff_unique_dep": "Graduate Institute of Electronics Engineering;", - "aff_unique_url": "https://www.ntu.edu.tw;https://www.asus.com/", + "aff_unique_url": "https://www.ntu.edu.tw;https://www.asus.com", "aff_unique_abbr": "NTU;ASUS", "aff_campus_unique_index": "0+0;0;0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Wei-Ting and Fang,\n Hao-Yu and Hsieh,\n Cheng-Lin and Tsai,\n Cheng-Che and Chen,\n I-Hsiang and Ding,\n Jian-Jiun and Kuo,\n Sy-Yen\n},\n title = {\n ALL Snow Removed: Single Image Desnowing Algorithm Using Hierarchical Dual-Tree Complex Wavelet Representation and Contradict Channel Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4196-4205\n} \n}" }, { "title": "ARAPReg: An As-Rigid-As Possible Regularization Loss for Learning Deformable Shape Generators", @@ -1509,6 +1613,7 @@ "status": "Poster", "track": "main", "pid": 8223, + "author_site": "Qixing Huang; Xiangru Huang; Bo Sun; Zaiwei Zhang; Junfeng Jiang; Chandrajit Bajaj", "author": "Qixing Huang; Xiangru Huang; Bo Sun; Zaiwei Zhang; Junfeng Jiang; Chandrajit Bajaj", "abstract": "This paper introduces an unsupervised loss for training parametric deformation shape generators. The key idea is to enforce the preservation of local rigidity among the generated shapes. 
Our approach builds on a local approximation of the as-rigid-as possible (or ARAP) deformation energy. We show how to develop the unsupervised loss via a spectral decomposition of the Hessian of the ARAP loss. Our loss nicely decouples pose and shape variations through a robust norm. The loss admits simple closed-form expressions. It is easy to train and can be plugged into any standard generation models, e.g., VAE and GAN. Experimental results show that our approach outperforms existing shape generation approaches considerably across various datasets such as DFAUST, Animal, and Bone.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_ARAPReg_An_As-Rigid-As_Possible_Regularization_Loss_for_Learning_Deformable_Shape_ICCV_2021_paper.pdf", @@ -1532,7 +1637,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0+0;0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Qixing and Huang,\n Xiangru and Sun,\n Bo and Zhang,\n Zaiwei and Jiang,\n Junfeng and Bajaj,\n Chandrajit\n},\n title = {\n ARAPReg: An As-Rigid-As Possible Regularization Loss for Learning Deformable Shape Generators\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5815-5825\n} \n}" }, { "title": "ARCH++: Animation-Ready Clothed Human Reconstruction Revisited", @@ -1540,6 +1646,7 @@ "status": "Poster", "track": "main", "pid": 5420, + "author_site": "Tong He; Yuanlu Xu; Shunsuke Saito; Stefano Soatto; Tony Tung", "author": "Tong He; Yuanlu Xu; Shunsuke Saito; Stefano Soatto; Tony Tung", "abstract": "We present ARCH++, an image-based method to reconstruct 3D avatars with arbitrary clothing styles. 
Our reconstructed avatars are animation-ready and highly realistic, in both the visible regions from input views and the unseen regions. While prior work shows great promise of reconstructing animatable clothed humans with various topologies, we observe that there exist fundamental limitations resulting in sub-optimal reconstruction quality. In this paper, we revisit the major steps of image-based avatar reconstruction and address the limitations with ARCH++. First, we introduce an end-to-end point based geometry encoder to better describe the semantics of the underlying 3D human body, in replacement of previous hand-crafted features. Second, in order to address the occupancy ambiguity caused by topological changes of clothed humans in the canonical pose, we propose a co-supervising framework with cross-space consistency to jointly estimate the occupancy in both the posed and canonical spaces. Last, we use image-to-image translation networks to further refine detailed geometry and texture on the reconstructed surface, which improves the fidelity and consistency across arbitrary viewpoints. 
In the experiments, we demonstrate improvements over the state of the art on both public benchmarks and user studies in reconstruction quality and realism.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_ARCH_Animation-Ready_Clothed_Human_Reconstruction_Revisited_ICCV_2021_paper.pdf", @@ -1556,14 +1663,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_ARCH_Animation-Ready_Clothed_Human_Reconstruction_Revisited_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0;1;0", - "aff_unique_norm": "Meta;University of California, Los Angeles", + "aff_unique_norm": "Facebook Reality Labs Research;University of California, Los Angeles", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.facebook.com/realitylabs;https://www.ucla.edu", "aff_unique_abbr": "FRL;UCLA", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+0;0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Tong and Xu,\n Yuanlu and Saito,\n Shunsuke and Soatto,\n Stefano and Tung,\n Tony\n},\n title = {\n ARCH++: Animation-Ready Clothed Human Reconstruction Revisited\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11046-11056\n} \n}" }, { "title": "ASCNet: Self-Supervised Video Representation Learning With Appearance-Speed Consistency", @@ -1571,6 +1679,7 @@ "status": "Poster", "track": "main", "pid": 3647, + "author_site": "Deng Huang; Wenhao Wu; Weiwen Hu; Xu Liu; Dongliang He; Zhihua Wu; Xiangmiao Wu; Mingkui Tan; Errui Ding", "author": "Deng Huang; Wenhao Wu; Weiwen Hu; Xu Liu; Dongliang He; Zhihua Wu; Xiangmiao Wu; Mingkui Tan; Errui Ding", "abstract": "We study self-supervised video representation learning, which is a challenging task due to 1) sufficient labels for 
supervision; 2) unstructured and noisy visual information. Existing methods mainly use contrastive loss with video clips as the instances and learn visual representation by discriminating instances from each other, but they need a careful treatment of negative pairs by either relying on large batch sizes, memory banks, extra modalities or customized mining strategies, which inevitably includes noisy data. In this paper, we observe that the consistency between positive samples is the key to learn robust video representation. Specifically, we propose two tasks to learn appearance and speed consistency, respectively. The appearance consistency task aims to maximize the similarity between two clips of the same video with different playback speeds. The speed consistency task aims to maximize the similarity between two clips with the same playback speed but different appearance information. We show that optimizing the two tasks jointly consistently improves the performance on downstream tasks, e.g., action recognition and video retrieval. Remarkably, for action recognition on the UCF-101 dataset, we achieve 90.8% accuracy without using any extra modalities or negative pairs for unsupervised pre-training, which outperforms the ImageNet supervised pre-trained model. 
Codes and models will be available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_ASCNet_Self-Supervised_Video_Representation_Learning_With_Appearance-Speed_Consistency_ICCV_2021_paper.pdf", @@ -1587,14 +1696,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_ASCNet_Self-Supervised_Video_Representation_Learning_With_Appearance-Speed_Consistency_ICCV_2021_paper.html", "aff_unique_index": "0+1;2+1;0;0;2;2;0;0+3;2", - "aff_unique_norm": "South China University of Technology;Pazhou Laboratory;Baidu;Ministry of Education", - "aff_unique_dep": ";;Baidu Inc.;Key Laboratory of Big Data and Intelligent Robot", + "aff_unique_norm": "South China University of Technology;Pazhou Laboratory;Baidu Inc.;Ministry of Education", + "aff_unique_dep": ";;;Key Laboratory of Big Data and Intelligent Robot", "aff_unique_url": "https://www.scut.edu.cn;;https://www.baidu.com;", "aff_unique_abbr": "SCUT;;Baidu;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Deng and Wu,\n Wenhao and Hu,\n Weiwen and Liu,\n Xu and He,\n Dongliang and Wu,\n Zhihua and Wu,\n Xiangmiao and Tan,\n Mingkui and Ding,\n Errui\n},\n title = {\n ASCNet: Self-Supervised Video Representation Learning With Appearance-Speed Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8096-8105\n} \n}" }, { "title": "ASMR: Learning Attribute-Based Person Search With Adaptive Semantic Margin Regularizer", @@ -1602,6 +1712,7 @@ "status": "Poster", "track": "main", "pid": 3640, + "author_site": "Boseung Jeong; Jicheol Park; Suha Kwak", "author": "Boseung Jeong; Jicheol Park; Suha Kwak", "abstract": "Attribute-based person search is the task of 
finding person images that are best matched with a set of text attributes given as query. The main challenge of this task is the large modality gap between attributes and images. To reduce the gap, we present a new loss for learning cross-modal embeddings in the context of attribute-based person search. We regard a set of attributes as a category of people sharing the same traits. In a joint embedding space of the two modalities, our loss pulls images close to their person categories for modality alignment. More importantly, it pushes apart a pair of person categories by a margin determined adaptively by their semantic distance, where the distance metric is learned end-to-end so that the loss considers importance of each attribute when relating person categories. Our loss guided by the adaptive semantic margin leads to more discriminative and semantically well-arranged distributions of person images. As a consequence, it enables a simple embedding model to achieve state-of-the-art records on public benchmarks without bells and whistles.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jeong_ASMR_Learning_Attribute-Based_Person_Search_With_Adaptive_Semantic_Margin_Regularizer_ICCV_2021_paper.pdf", @@ -1625,7 +1736,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jeong_2021_ICCV,\n \n author = {\n Jeong,\n Boseung and Park,\n Jicheol and Kwak,\n Suha\n},\n title = {\n ASMR: Learning Attribute-Based Person Search With Adaptive Semantic Margin Regularizer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12016-12025\n} \n}" }, { "title": "Accelerating Atmospheric Turbulence Simulation via Learned Phase-to-Space Transform", @@ -1633,6 +1745,7 @@ "status": "Poster", "track": "main", 
"pid": 4332, + "author_site": "Zhiyuan Mao; Nicholas Chimitt; Stanley H. Chan", "author": "Zhiyuan Mao; Nicholas Chimitt; Stanley H. Chan", "abstract": "Fast and accurate simulation of imaging through atmospheric turbulence is essential for developing turbulence mitigation algorithms. Recognizing the limitations of previous approaches, we introduce a new concept known as the phase-to-space (P2S) transform to significantly speed up the simulation. P2S is built upon three ideas: (1) reformulating the spatially varying convolution as a set of invariant convolutions with basis functions, (2) learning the basis function via the known turbulence statistics models, (3) implementing the P2S transform via a light-weight network that directly converts the phase representation to spatial representation. The new simulator offers 300x - 1000x speed up compared to the mainstream split-step simulators while preserving the essential turbulence statistics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mao_Accelerating_Atmospheric_Turbulence_Simulation_via_Learned_Phase-to-Space_Transform_ICCV_2021_paper.pdf", @@ -1656,7 +1769,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "West Lafayette", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mao_2021_ICCV,\n \n author = {\n Mao,\n Zhiyuan and Chimitt,\n Nicholas and Chan,\n Stanley H.\n},\n title = {\n Accelerating Atmospheric Turbulence Simulation via Learned Phase-to-Space Transform\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14759-14768\n} \n}" }, { "title": "Achieving On-Mobile Real-Time Super-Resolution With Neural Architecture and Pruning Search", @@ -1664,6 +1778,7 @@ "status": "Poster", "track": "main", "pid": 10890, + "author_site": "Zheng Zhan; Yifan Gong; Pu Zhao; Geng Yuan; 
Wei Niu; Yushu Wu; Tianyun Zhang; Malith Jayaweera; David Kaeli; Bin Ren; Xue Lin; Yanzhi Wang", "author": "Zheng Zhan; Yifan Gong; Pu Zhao; Geng Yuan; Wei Niu; Yushu Wu; Tianyun Zhang; Malith Jayaweera; David Kaeli; Bin Ren; Xue Lin; Yanzhi Wang", "abstract": "Though recent years have witnessed remarkable progress in single image super-resolution (SISR) tasks with the prosperous development of deep neural networks (DNNs), the deep learning methods are confronted with the computation and memory consumption issues in practice, especially for resource-limited platforms such as mobile devices. To overcome the challenge and facilitate the real-time deployment of SISR tasks on mobile, we combine neural architecture search with pruning search and propose an automatic search framework that derives sparse super-resolution (SR) models with high image quality while satisfying the real-time inference requirement. To decrease the search cost, we leverage the weight sharing strategy by introducing a supernet and decouple the search problem into three stages, including supernet construction, compiler-aware architecture and pruning search, and compiler-aware pruning ratio search. 
With the proposed framework, we are the first to achieve real-time SR inference (with only tens of milliseconds per frame) for implementing 720p resolution with competitive image quality (in terms of PSNR and SSIM) on mobile platforms (Samsung Galaxy S20).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhan_Achieving_On-Mobile_Real-Time_Super-Resolution_With_Neural_Architecture_and_Pruning_Search_ICCV_2021_paper.pdf", @@ -1687,7 +1802,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhan_2021_ICCV,\n \n author = {\n Zhan,\n Zheng and Gong,\n Yifan and Zhao,\n Pu and Yuan,\n Geng and Niu,\n Wei and Wu,\n Yushu and Zhang,\n Tianyun and Jayaweera,\n Malith and Kaeli,\n David and Ren,\n Bin and Lin,\n Xue and Wang,\n Yanzhi\n},\n title = {\n Achieving On-Mobile Real-Time Super-Resolution With Neural Architecture and Pruning Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4821-4831\n} \n}" }, { "title": "Act the Part: Learning Interaction Strategies for Articulated Object Part Discovery", @@ -1695,6 +1811,7 @@ "status": "Poster", "track": "main", "pid": 4383, + "author_site": "Samir Yitzhak Gadre; Kiana Ehsani; Shuran Song", "author": "Samir Yitzhak Gadre; Kiana Ehsani; Shuran Song", "abstract": "People often use physical intuition when manipulating articulated objects, irrespective of object semantics. Motivated by this observation, we identify an important embodied task where an agent must play with objects to recover their parts. To this end, we introduce Act the Part (AtP) to learn how to interact with articulated objects to discover and segment their pieces. 
By coupling action selection and motion segmentation, AtP is able to isolate structures to make perceptual part recovery possible without semantic labels. Our experiments show AtP learns efficient strategies for part discovery, can generalize to unseen categories, and is capable of conditional reasoning for the task. Although trained in simulation, we show convincing transfer to real world data with no fine-tuning. A summery video, interactive demo, and code will be available at atp.cs.columbia.edu.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gadre_Act_the_Part_Learning_Interaction_Strategies_for_Articulated_Object_Part_ICCV_2021_paper.pdf", @@ -1718,7 +1835,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gadre_2021_ICCV,\n \n author = {\n Gadre,\n Samir Yitzhak and Ehsani,\n Kiana and Song,\n Shuran\n},\n title = {\n Act the Part: Learning Interaction Strategies for Articulated Object Part Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15752-15761\n} \n}" }, { "title": "Action-Conditioned 3D Human Motion Synthesis With Transformer VAE", @@ -1726,11 +1844,12 @@ "status": "Poster", "track": "main", "pid": 2200, - "author": "Mathis Petrovich; Michael J. Black; G\u00fcl Varol", + "author_site": "Mathis Petrovich; Michael J. Black; Gül Varol", + "author": "Mathis Petrovich; Michael J. Black; Gül Varol", "abstract": "We tackle the problem of action-conditioned generation of realistic and diverse human motion sequences. In contrast to methods that complete, or extend, motion sequences, this task does not require an initial pose or sequence. Here we learn an action-aware latent representation for human motions by training a generative variational autoencoder (VAE). 
By sampling from this latent space and querying a certain duration through a series of positional encodings, we synthesize variable-length motion sequences conditioned on a categorical action. Specifically, we design a Transformer-based architecture, ACTOR, for encoding and decoding a sequence of parametric SMPL human body models estimated from action recognition datasets. We evaluate our approach on the NTU RGB+D, HumanAct12 and UESTC datasets and show improvements over the state of the art. Furthermore, we present two use cases: improving action recognition through adding our synthesized data to training, and motion denoising. Code and models are available on our project page.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Petrovich_Action-Conditioned_3D_Human_Motion_Synthesis_With_Transformer_VAE_ICCV_2021_paper.pdf", - "aff": "LIGM, \u00b4Ecole des Ponts, Univ Gustave Eiffel, CNRS, France; Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany; LIGM, \u00b4Ecole des Ponts, Univ Gustave Eiffel, CNRS, France", - "project": "https://imagine.enpc.fr/\u02dcpetrovim/actor", + "aff": "LIGM, ´Ecole des Ponts, Univ Gustave Eiffel, CNRS, France; Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany; LIGM, ´Ecole des Ponts, Univ Gustave Eiffel, CNRS, France", + "project": "https://imagine.enpc.fr/˜petrovim/actor", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Petrovich_Action-Conditioned_3D_Human_ICCV_2021_supplemental.zip", "arxiv": "2104.05670", @@ -1747,9 +1866,10 @@ "aff_unique_url": "https://www.ponts.org;https://www.mpi-is.mpg.de", "aff_unique_abbr": "ENPC;MPI-IS", "aff_campus_unique_index": "1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;Germany" + "aff_country_unique": "France;Germany", + "bibtex": "@InProceedings{Petrovich_2021_ICCV,\n \n author = {\n Petrovich,\n Mathis and 
Black,\n Michael J. and Varol,\n G\\"ul\n},\n title = {\n Action-Conditioned 3D Human Motion Synthesis With Transformer VAE\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10985-10995\n} \n}" }, { "title": "Active Domain Adaptation via Clustering Uncertainty-Weighted Embeddings", @@ -1757,10 +1877,11 @@ "status": "Poster", "track": "main", "pid": 1712, + "author_site": "Viraj Prabhu; Arjun Chandrasekaran; Kate Saenko; Judy Hoffman", "author": "Viraj Prabhu; Arjun Chandrasekaran; Kate Saenko; Judy Hoffman", "abstract": "Generalizing deep neural networks to new target domains is critical to their real-world utility. In practice, it may be feasible to get some target data labeled, but to be cost-effective it is desirable to select a maximally-informative subset via active learning (AL). We study the problem of AL under a domain shift, called Active Domain Adaptation (Active DA). We demonstrate how existing AL approaches based solely on model uncertainty or diversity sampling are less effective for Active DA. We propose Clustering Uncertainty-weighted Embeddings (CLUE), a novel label acquisition strategy for Active DA that performs uncertainty-weighted clustering to identify target instances for labeling that are both uncertain under the model and diverse in feature space. 
CLUE consistently outperforms competing label acquisition strategies for Active DA and AL across learning settings on 6 diverse domain shifts for image classification.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Prabhu_Active_Domain_Adaptation_via_Clustering_Uncertainty-Weighted_Embeddings_ICCV_2021_paper.pdf", - "aff": "Georgia Tech; Max Planck Institute for Intelligent Systems, T\u00fcbingen + Georgia Tech; Boston University; Georgia Tech", + "aff": "Georgia Tech; Max Planck Institute for Intelligent Systems, Tübingen + Georgia Tech; Boston University; Georgia Tech", "project": "", "github": "https://github.com/virajprabhu/CLUE", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Prabhu_Active_Domain_Adaptation_ICCV_2021_supplemental.pdf", @@ -1778,9 +1899,10 @@ "aff_unique_url": "https://www.gatech.edu;https://www.mpi-is.mpg.de;https://www.bu.edu", "aff_unique_abbr": "Georgia Tech;MPI-IS;BU", "aff_campus_unique_index": "1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;1+0;0;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Prabhu_2021_ICCV,\n \n author = {\n Prabhu,\n Viraj and Chandrasekaran,\n Arjun and Saenko,\n Kate and Hoffman,\n Judy\n},\n title = {\n Active Domain Adaptation via Clustering Uncertainty-Weighted Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8505-8514\n} \n}" }, { "title": "Active Learning for Deep Object Detection via Probabilistic Modeling", @@ -1788,6 +1910,7 @@ "status": "Poster", "track": "main", "pid": 4237, + "author_site": "Jiwoong Choi; Ismail Elezi; Hyuk-Jae Lee; Clement Farabet; Jose M. Alvarez", "author": "Jiwoong Choi; Ismail Elezi; Hyuk-Jae Lee; Clement Farabet; Jose M. 
Alvarez", "abstract": "Active learning aims to reduce labeling costs by selecting only the most informative samples on a dataset. Few existing works have addressed active learning for object detection. Most of these methods are based on multiple models or are straightforward extensions of classification methods, hence estimate an image's informativeness using only the classification head. In this paper, we propose a novel deep active learning approach for object detection. Our approach relies on mixture density networks that estimate a probabilistic distribution for each localization and classification head's output. We explicitly estimate the aleatoric and epistemic uncertainty in a single forward pass of a single model. Our method uses a scoring function that aggregates these two types of uncertainties for both heads to obtain every image's informativeness score. We demonstrate the efficacy of our approach in PASCAL VOC and MS-COCO datasets. Our approach outperforms single-model based methods and performs on par with multi-model based methods at a fraction of the computing cost.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Choi_Active_Learning_for_Deep_Object_Detection_via_Probabilistic_Modeling_ICCV_2021_paper.pdf", @@ -1804,14 +1927,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choi_Active_Learning_for_Deep_Object_Detection_via_Probabilistic_Modeling_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;1+2;0;2;2", - "aff_unique_norm": "Seoul National University;Technical University of Munich;NVIDIA", - "aff_unique_dep": ";;NVIDIA Corporation", + "aff_unique_norm": "Seoul National University;Technical University of Munich;NVIDIA Corporation", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.snu.ac.kr;https://www.tum.de;https://www.nvidia.com", "aff_unique_abbr": "SNU;TUM;NVIDIA", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1+2;1+2;0;2;2", - 
"aff_country_unique": "South Korea;Germany;United States" + "aff_country_unique": "South Korea;Germany;United States", + "bibtex": "@InProceedings{Choi_2021_ICCV,\n \n author = {\n Choi,\n Jiwoong and Elezi,\n Ismail and Lee,\n Hyuk-Jae and Farabet,\n Clement and Alvarez,\n Jose M.\n},\n title = {\n Active Learning for Deep Object Detection via Probabilistic Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10264-10273\n} \n}" }, { "title": "Active Learning for Lane Detection: A Knowledge Distillation Approach", @@ -1819,10 +1943,11 @@ "status": "Poster", "track": "main", "pid": 7268, + "author_site": "Fengchao Peng; Chao Wang; Jianzhuang Liu; Zhen Yang", "author": "Fengchao Peng; Chao Wang; Jianzhuang Liu; Zhen Yang", "abstract": "Lane detection is a key task for autonomous driving vehicles. Currently, lane detection relies on a huge amount of annotated images, which is a heavy burden. Active learning has been proposed to reduce annotation in many computer vision tasks, but no effort has been made for lane detection. Through experiments, we find that existing active learning methods perform poorly for lane detection, and the reasons are twofold. On one hand, most methods evaluate data uncertainties based on entropy, which is undesirable in lane detection because it encourages to select images with very few lanes or even no lane at all. On the other hand, existing methods are not aware of the noise of lane annotations, which is caused by heavy occlusion and unclear lane marks. In this paper, we build a novel knowledge distillation framework and evaluate the uncertainty of images based on the knowledge learnt by the student model. We show that the proposed uncertainty metric overcomes the above two problems. To reduce data redundancy, we explore the influence sets of image samples, and propose a new diversity metric for data selection. 
Finally we incorporate the uncertainty and diversity metrics, and develop a greedy algorithm for data selection. The experiments show that our method achieves new state-of-the-art on the lane detection benchmarks. In addition, we extend this method to common 2D object detection and the results show that it is also effective.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Peng_Active_Learning_for_Lane_Detection_A_Knowledge_Distillation_Approach_ICCV_2021_paper.pdf", - "aff": "Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Peng_Active_Learning_for_ICCV_2021_supplemental.pdf", @@ -1835,14 +1960,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Peng_Active_Learning_for_Lane_Detection_A_Knowledge_Distillation_Approach_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Huawei", - "aff_unique_dep": "Noah\u2019s Ark Lab", + "aff_unique_norm": "Huawei Technologies", + "aff_unique_dep": "Noah’s Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2021_ICCV,\n \n author = {\n Peng,\n Fengchao and Wang,\n Chao and Liu,\n Jianzhuang and Yang,\n Zhen\n},\n title = {\n Active Learning for Lane Detection: A Knowledge Distillation Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15152-15161\n} 
\n}" }, { "title": "Active Universal Domain Adaptation", @@ -1850,6 +1976,7 @@ "status": "Poster", "track": "main", "pid": 2262, + "author_site": "Xinhong Ma; Junyu Gao; Changsheng Xu", "author": "Xinhong Ma; Junyu Gao; Changsheng Xu", "abstract": "Most unsupervised domain adaptation methods rely on rich prior knowledge about the source-target label set relationship, and they cannot recognize categories beyond the source classes, which limits their applicability in practical scenarios. This paper proposes a new paradigm for unsupervised domain adaptation, termed as Active Universal Domain Adaptation (AUDA), which removes all label set assumptions and aims for not only recognizing target samples from source classes but also inferring those from target-private classes by using active learning to annotate a small budget of target data. For AUDA, it is challenging to jointly adapt the model to the target domain and select informative target samples for annotations under a large domain gap and significant semantic shift. To address the problems, we propose an Active Universal Adaptation Network (AUAN). Specifically, we first introduce Adversarial and Diverse Curriculum Learning (ADCL), which progressively aligns source and target domains to classify whether target samples are from source classes. Then, we propose a Clustering Non-transferable Gradient Embedding (CNTGE) strategy, which utilizes the clues of transferability, diversity, and uncertainty to annotate target informative sample, making it possible to infer labels for target samples of target-private classes. Finally, we propose to jointly train ADCL and CNTGE with target supervision to promote domain adaptation and target-private class recognition. 
Extensive experiments demonstrate that the proposed AUDA model equipped with ADCL and CNTGE achieves significant results on four popular benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ma_Active_Universal_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -1866,14 +1993,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ma_Active_Universal_Domain_Adaptation_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1+2", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Automation;School of Artificial Intelligence;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Automation;School of Artificial Intelligence;", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;", "aff_unique_abbr": "CASIA;UCAS;", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2021_ICCV,\n \n author = {\n Ma,\n Xinhong and Gao,\n Junyu and Xu,\n Changsheng\n},\n title = {\n Active Universal Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8968-8977\n} \n}" }, { "title": "AdaAttN: Revisit Attention Mechanism in Arbitrary Neural Style Transfer", @@ -1881,6 +2009,7 @@ "status": "Poster", "track": "main", "pid": 7254, + "author_site": "Songhua Liu; Tianwei Lin; Dongliang He; Fu Li; Meiling Wang; Xin Li; Zhengxing Sun; Qian Li; Errui Ding", "author": "Songhua Liu; Tianwei Lin; Dongliang He; Fu Li; Meiling Wang; Xin Li; Zhengxing Sun; Qian Li; Errui Ding", "abstract": "Fast arbitrary neural style transfer has attracted widespread 
attention from academic, industrial and art communities due to its flexibility in enabling various applications. Existing solutions either attentively fuse deep style feature into deep content feature without considering feature distributions, or adaptively normalize deep content feature according to the style such that their global statistic information is matched. Although effective, leaving shallow feature unexplored or without locally considering feature statistics, they are prone to suffer from unnatural output with unpleasing local distortions. To alleviate this problem, in this paper, we propose a novel Adaptive Attention Normalization (AdaAttN) module to adaptively perform attentive normalization on per-point basis. Specifically, spatial attention score is learnt from both shallow and deep features of content and style images. Then per-point weighted statistics are calculated by regarding a style feature point as a distribution of attention-weighted output of all style feature points. Finally, the content feature is normalized so that they demonstrate the same local feature statistics as the calculated per-point weighted style feature statistics. Besides, a novel local feature loss is derived based on AdaAttN to enhance local visual quality. We also extend AdaAttN to be ready for video style transfer with slight modifications. Extensive experiments demonstrate that our method achieves state-of-the-art arbitrary image/video style transfer. 
Codes and models will be available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_AdaAttN_Revisit_Attention_Mechanism_in_Arbitrary_Neural_Style_Transfer_ICCV_2021_paper.pdf", @@ -1895,7 +2024,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_AdaAttN_Revisit_Attention_Mechanism_in_Arbitrary_Neural_Style_Transfer_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_AdaAttN_Revisit_Attention_Mechanism_in_Arbitrary_Neural_Style_Transfer_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Songhua and Lin,\n Tianwei and He,\n Dongliang and Li,\n Fu and Wang,\n Meiling and Li,\n Xin and Sun,\n Zhengxing and Li,\n Qian and Ding,\n Errui\n},\n title = {\n AdaAttN: Revisit Attention Mechanism in Arbitrary Neural Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6649-6658\n} \n}" }, { "title": "AdaFit: Rethinking Learning-Based Normal Estimation on Point Clouds", @@ -1903,6 +2033,7 @@ "status": "Poster", "track": "main", "pid": 6738, + "author_site": "Runsong Zhu; Yuan Liu; Zhen Dong; Yuan Wang; Tengping Jiang; Wenping Wang; Bisheng Yang", "author": "Runsong Zhu; Yuan Liu; Zhen Dong; Yuan Wang; Tengping Jiang; Wenping Wang; Bisheng Yang", "abstract": "This paper presents a neural network for robust normal estimation on point clouds, named AdaFit, that can deal with point clouds with noise and density variations. Existing works use a network to learn point-wise weights for weighted least squares surface fitting to estimate the normals, which has difficulty in finding accurate normals in complex regions or containing noisy points. 
By analyzing the step of weighted least squares surface fitting, we find that it is hard to determine the polynomial order of the fitting surface and the fitting surface is sensitive to outliers. To address these problems, we propose a simple yet effective solution that adds an additional offset prediction to improve the quality of normal estimation. Furthermore, in order to take advantage of points from different neighborhood sizes, a novel Cascaded Scale Aggregation layer is proposed to help the network predict more accurate point-wise offsets and weights. Extensive experiments demonstrate that AdaFit achieves state-of-the-art performance on both the synthetic PCPNet dataset and the real-word SceneNN dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_AdaFit_Rethinking_Learning-Based_Normal_Estimation_on_Point_Clouds_ICCV_2021_paper.pdf", @@ -1919,14 +2050,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_AdaFit_Rethinking_Learning-Based_Normal_Estimation_on_Point_Clouds_ICCV_2021_paper.html", "aff_unique_index": "0;1;0+2;0;0;1+2;0", - "aff_unique_norm": "Wuhan University;University of Hong Kong;Texas A&M University", + "aff_unique_norm": "Wuhan University;The University of Hong Kong;Texas A&M University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.whu.edu.cn/;https://www.hku.hk;https://www.tamu.edu", "aff_unique_abbr": "WHU;HKU;TAMU", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+1;0;0;0+1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Runsong and Liu,\n Yuan and Dong,\n Zhen and Wang,\n Yuan and Jiang,\n Tengping and Wang,\n Wenping and Yang,\n Bisheng\n},\n title = {\n AdaFit: Rethinking Learning-Based Normal Estimation on Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6118-6127\n} \n}" }, { "title": "AdaMML: Adaptive Multi-Modal Learning for Efficient Video Recognition", @@ -1934,6 +2066,7 @@ "status": "Poster", "track": "main", "pid": 2915, + "author_site": "Rameswar Panda; Chun-Fu (Richard) Chen; Quanfu Fan; Ximeng Sun; Kate Saenko; Aude Oliva; Rogerio Feris", "author": "Rameswar Panda; Chun-Fu (Richard) Chen; Quanfu Fan; Ximeng Sun; Kate Saenko; Aude Oliva; Rogerio Feris", "abstract": "Multi-modal learning, which focuses on utilizing various modalities to improve the performance of a model, is widely used in video recognition. While traditional multi-modal learning offers excellent recognition results, its computational expense limits its impact for many real-world applications. In this paper, we propose an adaptive multi-modal learning framework, called AdaMML, that selects on-the-fly the optimal modalities for each segment conditioned on the input for efficient video recognition. Specifically, given a video segment, a multi-modal policy network is used to decide what modalities should be used for processing by the recognition model, with the goal of improving both accuracy and efficiency. We efficiently train the policy network jointly with the recognition model using standard back-propagation. Extensive experiments on four challenging diverse datasets demonstrate that our proposed adaptive approach yields 35%-55% reduction in computation when compared to the traditional baseline that simply uses all the modalities irrespective of the input, while also achieving consistent improvements in accuracy over the state-of-the-art methods. 
Project page: https://rpand002.github.io/adamml.html.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Panda_AdaMML_Adaptive_Multi-Modal_Learning_for_Efficient_Video_Recognition_ICCV_2021_paper.pdf", @@ -1948,7 +2081,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Panda_AdaMML_Adaptive_Multi-Modal_Learning_for_Efficient_Video_Recognition_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Panda_AdaMML_Adaptive_Multi-Modal_Learning_for_Efficient_Video_Recognition_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Panda_2021_ICCV,\n \n author = {\n Panda,\n Rameswar and Chen,\n Chun-Fu (Richard) and Fan,\n Quanfu and Sun,\n Ximeng and Saenko,\n Kate and Oliva,\n Aude and Feris,\n Rogerio\n},\n title = {\n AdaMML: Adaptive Multi-Modal Learning for Efficient Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7576-7585\n} \n}" }, { "title": "AdaSGN: Adapting Joint Number and Model Size for Efficient Skeleton-Based Action Recognition", @@ -1956,6 +2090,7 @@ "status": "Poster", "track": "main", "pid": 2315, + "author_site": "Lei Shi; Yifan Zhang; Jian Cheng; Hanqing Lu", "author": "Lei Shi; Yifan Zhang; Jian Cheng; Hanqing Lu", "abstract": "Existing methods for skeleton-based action recognition mainly focus on improving the recognition accuracy, whereas the efficiency of the model is rarely considered. Recently, there are some works trying to speed up the skeleton modeling by designing light-weight modules. 
However, in addition to the model size, the amount of the data involved in the calculation is also an important factor for the running speed, especially for the skeleton data where most of the joints are redundant or non-informative to identify a specific skeleton.Besides, previous works usually employ one fix-sized model for all the samples regardless of the difficulty of recognition, which wastes computations for easy samples.To address these limitations, a novel approach, called AdaSGN, is proposed in this paper, which can reduce the computational cost of the inference process by adaptively controlling the input number of the joints of the skeleton on-the-fly. Moreover, it can also adaptively select the optimal model size for each sample to achieve a better trade-off between the accuracy and the efficiency. We conduct extensive experiments on three challenging datasets, namely, NTU-60, NTU-120 and SHREC, to verify the superiority of the proposed approach, where AdaSGN achieves comparable or even higher performance with much lower GFLOPs compared with the baseline method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shi_AdaSGN_Adapting_Joint_Number_and_Model_Size_for_Efficient_Skeleton-Based_ICCV_2021_paper.pdf", @@ -1979,7 +2114,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2021_ICCV,\n \n author = {\n Shi,\n Lei and Zhang,\n Yifan and Cheng,\n Jian and Lu,\n Hanqing\n},\n title = {\n AdaSGN: Adapting Joint Number and Model Size for Efficient Skeleton-Based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13413-13422\n} \n}" }, { "title": "Adaptive Adversarial Network for Source-Free Domain Adaptation", @@ -1987,6 +2123,7 @@ "status": 
"Poster", "track": "main", "pid": 6209, + "author_site": "Haifeng Xia; Handong Zhao; Zhengming Ding", "author": "Haifeng Xia; Handong Zhao; Zhengming Ding", "abstract": "Unsupervised Domain Adaptation solves knowledge transfer along with the coexistence of well-annotated source domain and unlabeled target instances. However, the source domain in many practical applications is not always accessible due to data privacy or the insufficient memory storage for small devices. This scenario defined as Source-free Domain Adaptation only allows accessing the well-trained source model for target learning. To address the challenge of source data unavailability, we develop an Adaptive Adversarial Network (A2Net) including three components. Specifically, the first one named Adaptive Adversarial Inference seeks a target-specific classifier to advance the recognition of samples which the provided source-specific classifier difficultly identifies. Then, the Contrastive Category-wise Matching module exploits the positive relation of every two target images to enforce the compactness of subspace for each category. Thirdly, Self-Supervised Rotation facilitates the model to learn additional semantics from target images by themselves. 
Extensive experiments on the popular cross-domain benchmarks verify the effectiveness of our proposed model on solving adaptation task without any source data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xia_Adaptive_Adversarial_Network_for_Source-Free_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -2003,14 +2140,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xia_Adaptive_Adversarial_Network_for_Source-Free_Domain_Adaptation_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Tulane University;Adobe", - "aff_unique_dep": "Department of Computer Science;Adobe Research", + "aff_unique_norm": "Tulane University;Adobe Research", + "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.tulane.edu;https://research.adobe.com", "aff_unique_abbr": "Tulane;Adobe", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Jose", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xia_2021_ICCV,\n \n author = {\n Xia,\n Haifeng and Zhao,\n Handong and Ding,\n Zhengming\n},\n title = {\n Adaptive Adversarial Network for Source-Free Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9010-9019\n} \n}" }, { "title": "Adaptive Boundary Proposal Network for Arbitrary Shape Text Detection", @@ -2018,6 +2156,7 @@ "status": "Poster", "track": "main", "pid": 6487, + "author_site": "Shi-Xue Zhang; Xiaobin Zhu; Chun Yang; Hongfa Wang; Xu-Cheng Yin", "author": "Shi-Xue Zhang; Xiaobin Zhu; Chun Yang; Hongfa Wang; Xu-Cheng Yin", "abstract": "Arbitrary shape text detection is a challenging task due to the high complexity and variety of scene texts. 
In this work, we propose a novel adaptive boundary proposal network for arbitrary shape text detection, which can learn to directly produce accurate boundary for arbitrary shape text without any post-processing. Our method mainly consists of a boundary proposal model and an innovative adaptive boundary deformation model. The boundary proposal model constructed by multi-layer dilated convolutions is adopted to produce prior information (including classification map, distance field, and direction field) and coarse boundary proposals. The adaptive boundary deformation model is an encoder-decoder network, in which the encoder mainly consists of a Graph Convolutional Network (GCN) and a Recurrent Neural Network (RNN). It aims to perform boundary deformation in an iterative way for obtaining text instance shape guided by prior information from the boundary proposal model. In this way, our method can directly and efficiently generate accurate text boundaries without complex post-processing. Extensive experiments on publicly available datasets demonstrate the state-of-the-art performance of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Adaptive_Boundary_Proposal_Network_for_Arbitrary_Shape_Text_Detection_ICCV_2021_paper.pdf", @@ -2034,14 +2173,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Adaptive_Boundary_Proposal_Network_for_Arbitrary_Shape_Text_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0+2+0", - "aff_unique_norm": "University of Science and Technology Beijing;Tencent;USTB-EEasyTech Joint Lab of Artificial Intelligence", - "aff_unique_dep": "School of Computer and Communication Engineering;Tencent Technology;Artificial Intelligence", - "aff_unique_url": "https://www.ustb.edu.cn;https://www.tencent.com;", + "aff_unique_norm": "University of Science and Technology Beijing;Tencent Technology;USTB-EEasyTech Joint Lab of Artificial Intelligence", + "aff_unique_dep": "School of 
Computer and Communication Engineering;;Artificial Intelligence", + "aff_unique_url": "http://www.ustb.edu.cn;https://www.tencent.com;", "aff_unique_abbr": "USTB;Tencent;", - "aff_campus_unique_index": "0;0;0;1;0", - "aff_campus_unique": "Beijing;Shenzhen;", + "aff_campus_unique_index": "1;", + "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Shi-Xue and Zhu,\n Xiaobin and Yang,\n Chun and Wang,\n Hongfa and Yin,\n Xu-Cheng\n},\n title = {\n Adaptive Boundary Proposal Network for Arbitrary Shape Text Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1305-1314\n} \n}" }, { "title": "Adaptive Confidence Thresholding for Monocular Depth Estimation", @@ -2049,6 +2189,7 @@ "status": "Poster", "track": "main", "pid": 7776, + "author_site": "Hyesong Choi; Hunsang Lee; Sunkyung Kim; Sunok Kim; Seungryong Kim; Kwanghoon Sohn; Dongbo Min", "author": "Hyesong Choi; Hunsang Lee; Sunkyung Kim; Sunok Kim; Seungryong Kim; Kwanghoon Sohn; Dongbo Min", "abstract": "Self-supervised monocular depth estimation has become an appealing solution to the lack of ground truth labels, but its reconstruction loss often produces over-smoothed results across object boundaries and is incapable of handling occlusion explicitly. In this paper, we propose a new approach to leverage pseudo ground truth depth maps of stereo images generated from self-supervised stereo matching methods. The confidence map of the pseudo ground truth depth map is estimated to mitigate performance degeneration by inaccurate pseudo depth maps. To cope with the prediction error of the confidence map itself, we also leverage the threshold network that learns the threshold dynamically conditioned on the pseudo depth maps. 
The pseudo depth labels filtered out by the thresholded confidence map are used to supervise the monocular depth network. Furthermore, we propose the probabilistic framework that refines the monocular depth map with the help of its uncertainty map through the pixel-adaptive convolution (PAC) layer. Experimental results demonstrate superior performance to state-of-the-art monocular depth estimation methods. Lastly, we exhibit that the proposed threshold learning can also be used to improve the performance of existing confidence estimation approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Choi_Adaptive_Confidence_Thresholding_for_Monocular_Depth_Estimation_ICCV_2021_paper.pdf", @@ -2072,7 +2213,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choi_2021_ICCV,\n \n author = {\n Choi,\n Hyesong and Lee,\n Hunsang and Kim,\n Sunkyung and Kim,\n Sunok and Kim,\n Seungryong and Sohn,\n Kwanghoon and Min,\n Dongbo\n},\n title = {\n Adaptive Confidence Thresholding for Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12808-12818\n} \n}" }, { "title": "Adaptive Convolutions With Per-Pixel Dynamic Filter Atom", @@ -2080,6 +2222,7 @@ "status": "Poster", "track": "main", "pid": 7514, + "author_site": "Ze Wang; Zichen Miao; Jun Hu; Qiang Qiu", "author": "Ze Wang; Zichen Miao; Jun Hu; Qiang Qiu", "abstract": "Applying feature dependent network weights have been proved to be effective in many fields. However, in practice, restricted by the enormous size of model parameters and memory footprints, scalable and versatile dynamic convolutions with per-pixel adapted filters are yet to be fully explored. 
In this paper, we address this challenge by decomposing filters, adapted to each spatial position, over dynamic filter atoms generated by a light-weight network from local features. Adaptive receptive fields can be supported by further representing each filter atom over sets of pre-fixed multi-scale bases. As plug-and-play replacements to convolutional layers, the introduced adaptive convolutions with per-pixel dynamic atoms enable explicit modeling of intra-image variance, while avoiding heavy computation, parameters, and memory cost. Our method preserves the appealing properties of conventional convolutions as being translation-equivariant and parametrically efficient. We present experiments to show that, the proposed method delivers comparable or even better performance across tasks, and are particularly effective on handling tasks with significant intra-image variance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Adaptive_Convolutions_With_Per-Pixel_Dynamic_Filter_Atom_ICCV_2021_paper.pdf", @@ -2096,14 +2239,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Adaptive_Convolutions_With_Per-Pixel_Dynamic_Filter_Atom_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Purdue University;Meta", - "aff_unique_dep": ";Facebook, Inc.", + "aff_unique_norm": "Purdue University;Facebook, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www.facebook.com", "aff_unique_abbr": "Purdue;FB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Ze and Miao,\n Zichen and Hu,\n Jun and Qiu,\n Qiang\n},\n title = {\n Adaptive Convolutions With Per-Pixel Dynamic Filter Atom\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12302-12311\n} \n}" }, { "title": "Adaptive Curriculum Learning", @@ -2111,6 +2255,7 @@ "status": "Poster", "track": "main", "pid": 7157, + "author_site": "Yajing Kong; Liu Liu; Jun Wang; Dacheng Tao", "author": "Yajing Kong; Liu Liu; Jun Wang; Dacheng Tao", "abstract": "Inspired by the human learning principle that learning easier concepts first and then gradually paying more attention to harder ones, curriculum learning uses the non-uniform sampling of mini-batches according to the order of examples' difficulty. Just as a teacher adjusts the curriculum according to the learning progress of each student, a proper curriculum should be adapted to the current state of the model. Therefore, in contrast to recent works using a fixed curriculum, we devise a new curriculum learning method, Adaptive Curriculum Learning (Adaptive CL), adapting the difficulty of examples to the current state of the model. Specifically, we make use of the loss of the current model to adjust the difficulty score while retaining previous useful learned knowledge by KL divergence. Moreover, under a non-linear model and binary classification, we theoretically prove that the expected convergence rate of curriculum learning monotonically decreases with respect to the loss of a point regarding the optimal hypothesis, and monotonically increases with respect to the loss of a point regarding the current hypothesis. The analyses indicate that Adaptive CL could improve the convergence properties during the early stages of learning. 
Extensive experimental results demonstrate the superiority of the proposed approach over existing competitive curriculum learning methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kong_Adaptive_Curriculum_Learning_ICCV_2021_paper.pdf", @@ -2127,14 +2272,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kong_Adaptive_Curriculum_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2+0", - "aff_unique_norm": "University of Sydney;City University of Hong Kong;JD", - "aff_unique_dep": ";;JD Explore Academy", + "aff_unique_norm": "The University of Sydney;City University of Hong Kong;JD Explore Academy", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.sydney.edu.au;https://www.cityu.edu.hk;", "aff_unique_abbr": "USYD;CityU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1+0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Kong_2021_ICCV,\n \n author = {\n Kong,\n Yajing and Liu,\n Liu and Wang,\n Jun and Tao,\n Dacheng\n},\n title = {\n Adaptive Curriculum Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5067-5076\n} \n}" }, { "title": "Adaptive Focus for Efficient Video Recognition", @@ -2142,6 +2288,7 @@ "status": "Poster", "track": "main", "pid": 10048, + "author_site": "Yulin Wang; Zhaoxi Chen; Haojun Jiang; Shiji Song; Yizeng Han; Gao Huang", "author": "Yulin Wang; Zhaoxi Chen; Haojun Jiang; Shiji Song; Yizeng Han; Gao Huang", "abstract": "In this paper, we explore the spatial redundancy in video recognition with the aim to improve the computational efficiency. It is observed that the most informative region in each frame of a video is usually a small image patch, which shifts smoothly across frames. 
Therefore, we model the patch localization problem as a sequential decision task, and propose a reinforcement learning based approach for efficient spatially adaptive video recognition (AdaFocus). In specific, a light-weighted ConvNet is first adopted to quickly process the full video sequence, whose features are used by a recurrent policy network to localize the most task-relevant regions. Then the selected patches are inferred by a high-capacity network for the final prediction. During offline inference, once the informative patch sequence has been generated, the bulk of computation can be done in parallel, and is efficient on modern GPU devices. In addition, we demonstrate that the proposed method can be easily extended by further considering the temporal redundancy, e.g., dynamically skipping less valuable frames. Extensive experiments on five benchmark datasets, i.e., ActivityNet, FCVID, Mini-Kinetics, Something-Something V1&V2, demonstrate that our method is significantly more efficient than the competitive baselines. Code is available at https://github. 
com/blackfeather-wang/AdaFocus.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Adaptive_Focus_for_Efficient_Video_Recognition_ICCV_2021_paper.pdf", @@ -2165,7 +2312,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yulin and Chen,\n Zhaoxi and Jiang,\n Haojun and Song,\n Shiji and Han,\n Yizeng and Huang,\n Gao\n},\n title = {\n Adaptive Focus for Efficient Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16249-16258\n} \n}" }, { "title": "Adaptive Graph Convolution for Point Cloud Analysis", @@ -2173,6 +2321,7 @@ "status": "Poster", "track": "main", "pid": 6349, + "author_site": "Haoran Zhou; Yidan Feng; Mingsheng Fang; Mingqiang Wei; Jing Qin; Tong Lu", "author": "Haoran Zhou; Yidan Feng; Mingsheng Fang; Mingqiang Wei; Jing Qin; Tong Lu", "abstract": "Convolution on 3D point clouds that generalized from 2D grid-like domains is widely researched yet far from perfect. The standard convolution characterises feature correspondences indistinguishably among 3D points, presenting an intrinsic limitation of poor distinctive feature learning. In this paper, we propose Adaptive Graph Convolution (AdaptConv) which generates adaptive kernels for points according to their dynamically learned features. Compared with using a fixed/isotropic kernel, AdaptConv improves the flexibility of point cloud convolutions, effectively and precisely capturing the diverse relations between points from different semantic parts. Unlike popular attentional weight schemes, the proposed AdaptConv implements the adaptiveness inside the convolution operation instead of simply assigning different weights to the neighboring points. 
Extensive qualitative and quantitative evaluations show that our method outperforms state-of-the-art point cloud classification and segmentation approaches on several benchmark datasets. Our code is available at https://github.com/hrzhou2/AdaptConv-master.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Adaptive_Graph_Convolution_for_Point_Cloud_Analysis_ICCV_2021_paper.pdf", @@ -2189,14 +2338,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Adaptive_Graph_Convolution_for_Point_Cloud_Analysis_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;1;2;0", - "aff_unique_norm": "Nanjing University;Nanjing University of Aeronautics and Astronautics;Hong Kong Polytechnic University", + "aff_unique_norm": "Nanjing University;Nanjing University of Aeronautics and Astronautics;The Hong Kong Polytechnic University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nju.edu.cn;http://www.nuaa.edu.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "Nanjing U;NUAA;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Haoran and Feng,\n Yidan and Fang,\n Mingsheng and Wei,\n Mingqiang and Qin,\n Jing and Lu,\n Tong\n},\n title = {\n Adaptive Graph Convolution for Point Cloud Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4965-4974\n} \n}" }, { "title": "Adaptive Hierarchical Graph Reasoning With Semantic Coherence for Video-and-Language Inference", @@ -2204,10 +2354,11 @@ "status": "Poster", "track": "main", "pid": 3076, + "author_site": "Juncheng Li; Siliang Tang; Linchao Zhu; Haochen Shi; Xuanwen Huang; Fei Wu; Yi Yang; Yueting Zhuang", "author": "Juncheng Li; Siliang Tang; Linchao Zhu; Haochen 
Shi; Xuanwen Huang; Fei Wu; Yi Yang; Yueting Zhuang", "abstract": "Video-and-Language Inference is a recently proposed task for joint video-and-language understanding. This new task requires a model to draw inference on whether a natural language statement entails or contradicts a given video clip. In this paper, we study how to address three critical challenges for this task: judging the global correctness of the statement involved multiple semantic meanings, joint reasoning over video and subtitles, and modeling long-range relationships and complex social interactions. First, we propose an adaptive hierarchical graph network that achieves in-depth understanding of the video over complex interactions. Specifically, it performs joint reasoning over video and subtitles in three hierarchies, where the graph structure is adaptively adjusted according to the semantic structures of the statement. Secondly, we introduce semantic coherence learning to explicitly encourage the semantic coherence of the adaptive hierarchical graph network from three hierarchies. The semantic coherence learning can further improve the alignment between vision and linguistics, and the coherence across a sequence of video segments. 
Experimental results show that our method significantly outperforms the baseline by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Adaptive_Hierarchical_Graph_Reasoning_With_Semantic_Coherence_for_Video-and-Language_Inference_ICCV_2021_paper.pdf", - "aff": "Zhejiang University; Zhejiang University; ReLER, University of Technology Sydney; Universit \u00b4e de Montr \u00b4eal; Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University", + "aff": "Zhejiang University; Zhejiang University; ReLER, University of Technology Sydney; Universit ´e de Montr ´eal; Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University", "project": "", "github": "", "supp": "", @@ -2220,14 +2371,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Adaptive_Hierarchical_Graph_Reasoning_With_Semantic_Coherence_for_Video-and-Language_Inference_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0;0;0;0", - "aff_unique_norm": "Zhejiang University;University of Technology Sydney;Universit\u00e9 de Montr\u00e9al", + "aff_unique_norm": "Zhejiang University;University of Technology Sydney;Université de Montréal", "aff_unique_dep": ";ReLER;", "aff_unique_url": "https://www.zju.edu.cn;https://www.uts.edu.au;https://www.umontreal.ca", "aff_unique_abbr": "ZJU;UTS;UdeM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0;1;2;0;0;0;0", - "aff_country_unique": "China;Australia;Canada" + "aff_country_unique": "China;Australia;Canada", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Juncheng and Tang,\n Siliang and Zhu,\n Linchao and Shi,\n Haochen and Huang,\n Xuanwen and Wu,\n Fei and Yang,\n Yi and Zhuang,\n Yueting\n},\n title = {\n Adaptive Hierarchical Graph Reasoning With Semantic Coherence for Video-and-Language Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1867-1877\n} \n}" }, { "title": "Adaptive Label Noise Cleaning With Meta-Supervision for Deep Face Recognition", @@ -2235,6 +2387,7 @@ "status": "Poster", "track": "main", "pid": 11436, + "author_site": "Yaobin Zhang; Weihong Deng; Yaoyao Zhong; Jiani Hu; Xian Li; Dongyue Zhao; Dongchao Wen", "author": "Yaobin Zhang; Weihong Deng; Yaoyao Zhong; Jiani Hu; Xian Li; Dongyue Zhao; Dongchao Wen", "abstract": "The training of a deep face recognition system usually faces the interference of label noise in the training data. However, it is difficult to obtain a high-precision cleaning model to remove these noises. In this paper, we propose an adaptive label noise cleaning algorithm based on meta-learning for face recognition datasets, which can learn the distribution of the data to be cleaned and make automatic adjustments based on class differences. It first learns reliable cleaning knowledge from well-labeled noisy data, then gradually transfers it to the target data with meta-supervision to improve performance. A threshold adapter module is also proposed to address the drift problem in transfer learning methods. 
Extensive experiments clean two noisy in-the-wild face recognition datasets and show the effectiveness of the proposed method to reach state-of-the-art performance on the IJB-C face recognition benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Adaptive_Label_Noise_Cleaning_With_Meta-Supervision_for_Deep_Face_Recognition_ICCV_2021_paper.pdf", @@ -2258,7 +2411,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yaobin and Deng,\n Weihong and Zhong,\n Yaoyao and Hu,\n Jiani and Li,\n Xian and Zhao,\n Dongyue and Wen,\n Dongchao\n},\n title = {\n Adaptive Label Noise Cleaning With Meta-Supervision for Deep Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15065-15075\n} \n}" }, { "title": "Adaptive Surface Normal Constraint for Depth Estimation", @@ -2266,6 +2420,7 @@ "status": "Poster", "track": "main", "pid": 6781, + "author_site": "Xiaoxiao Long; Cheng Lin; Lingjie Liu; Wei Li; Christian Theobalt; Ruigang Yang; Wenping Wang", "author": "Xiaoxiao Long; Cheng Lin; Lingjie Liu; Wei Li; Christian Theobalt; Ruigang Yang; Wenping Wang", "abstract": "We present a novel method for single image depth estimation using surface normal constraints. Existing depth estimation methods either suffer from the lack of geometric constraints, or are limited to the difficulty of reliably capturing geometric context, which leads to a bottleneck of depth estimation quality. We therefore introduce a simple yet effective method, named Adaptive Surface Normal (ASN) constraint, to effectively correlate the depth estimation with geometric consistency. 
Our key idea is to adaptively determine the reliable local geometry from a set of randomly sampled candidates to derive surface normal constraint, for which we measure the consistency of the geometric contextual features. As a result, our method can faithfully reconstruct the 3D geometry and is robust to local shape variations, such as boundaries, sharp corners and noises. We conduct extensive evaluations and comparisons using public datasets. The experimental results demonstrate our method outperforms the state-of-the-art methods and has superior efficiency and robustness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Long_Adaptive_Surface_Normal_Constraint_for_Depth_Estimation_ICCV_2021_paper.pdf", @@ -2280,7 +2435,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Long_Adaptive_Surface_Normal_Constraint_for_Depth_Estimation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Long_Adaptive_Surface_Normal_Constraint_for_Depth_Estimation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Long_2021_ICCV,\n \n author = {\n Long,\n Xiaoxiao and Lin,\n Cheng and Liu,\n Lingjie and Li,\n Wei and Theobalt,\n Christian and Yang,\n Ruigang and Wang,\n Wenping\n},\n title = {\n Adaptive Surface Normal Constraint for Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12849-12858\n} \n}" }, { "title": "Adaptive Surface Reconstruction With Multiscale Convolutional Kernels", @@ -2288,6 +2444,7 @@ "status": "Poster", "track": "main", "pid": 2378, + "author_site": "Benjamin Ummenhofer; Vladlen Koltun", "author": "Benjamin Ummenhofer; Vladlen Koltun", "abstract": "We propose generalized convolutional kernels for 3D reconstruction with ConvNets from point clouds. 
Our method uses multiscale convolutional kernels that can be applied to adaptive grids as generated with octrees. In addition to standard kernels in which each element has a distinct spatial location relative to the center, our elements have a distinct relative location as well as a relative scale level. Making our kernels span multiple resolutions allows us to apply ConvNets to adaptive grids for large problem sizes where the input data is sparse but the entire domain needs to be processed. Our ConvNet architecture can predict the signed and unsigned distance fields for large data sets with millions of input points and is faster and more accurate than classic energy minimization or recent learning approaches. We demonstrate this in a zero-shot setting where we only train on synthetic data and evaluate on the Tanks and Temples dataset of real-world large-scale 3D scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ummenhofer_Adaptive_Surface_Reconstruction_With_Multiscale_Convolutional_Kernels_ICCV_2021_paper.pdf", @@ -2304,14 +2461,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ummenhofer_Adaptive_Surface_Reconstruction_With_Multiscale_Convolutional_Kernels_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Intel", + "aff_unique_norm": "Intel Corporation", "aff_unique_dep": "Intel Labs", "aff_unique_url": "https://www.intel.com", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ummenhofer_2021_ICCV,\n \n author = {\n Ummenhofer,\n Benjamin and Koltun,\n Vladlen\n},\n title = {\n Adaptive Surface Reconstruction With Multiscale Convolutional Kernels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 
5651-5660\n} \n}" }, { "title": "Adaptive Unfolding Total Variation Network for Low-Light Image Enhancement", @@ -2319,6 +2477,7 @@ "status": "Poster", "track": "main", "pid": 2818, + "author_site": "Chuanjun Zheng; Daming Shi; Wentian Shi", "author": "Chuanjun Zheng; Daming Shi; Wentian Shi", "abstract": "Real-world low-light images suffer from two main degradations, namely, inevitable noise and poor visibility. Since the noise exhibits different levels, its estimation has been implemented in recent works when enhancing low-light images from raw Bayer space. When it comes to sRGB color space, the noise estimation becomes more complicated due to the effect of the image processing pipeline. Nevertheless, most existing enhancing algorithms in sRGB space only focus on the low visibility problem or suppress the noise under a hypothetical noise level, leading them impractical due to the lack of robustness. To address this issue, we propose an adaptive unfolding total variation network (UTVNet), which approximates the noise level from the real sRGB low-light image by learning the balancing parameter in the model-based denoising method with total variation regularization. Meanwhile, we learn the noise level map by unrolling the corresponding minimization process for providing the inferences of smoothness and fidelity constraints. Guided by the noise level map, our UTVNet can recover finer details and is more capable to suppress noise in real captured low-light scenes. 
Extensive experiments on real-world low-light images clearly demonstrate the superior performance of UTVNet over state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Adaptive_Unfolding_Total_Variation_Network_for_Low-Light_Image_Enhancement_ICCV_2021_paper.pdf", @@ -2342,7 +2501,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Chuanjun and Shi,\n Daming and Shi,\n Wentian\n},\n title = {\n Adaptive Unfolding Total Variation Network for Low-Light Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4439-4448\n} \n}" }, { "title": "Admix: Enhancing the Transferability of Adversarial Attacks", @@ -2350,6 +2510,7 @@ "status": "Poster", "track": "main", "pid": 1643, + "author_site": "Xiaosen Wang; Xuanran He; Jingdong Wang; Kun He", "author": "Xiaosen Wang; Xuanran He; Jingdong Wang; Kun He", "abstract": "Deep neural networks are known to be extremely vulnerable to adversarial examples under white-box setting. Moreover, the malicious adversaries crafted on the surrogate (source) model often exhibit black-box transferability on other models with the same learning task but having different architectures. Recently, various methods are proposed to boost the adversarial transferability, among which the input transformation is one of the most effective approaches. We investigate in this direction and observe that existing transformations are all applied on a single image, which might limit the adversarial transferability. To this end, we propose a new input transformation based attack method called Admix that considers the input image and a set of images randomly sampled from other categories. 
Instead of directly calculating the gradient on the original input, Admix calculates the gradient on the input image admixed with a small portion of each add-in image while using the original label of the input to craft more transferable adversaries.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Admix_Enhancing_the_Transferability_of_Adversarial_Attacks_ICCV_2021_paper.pdf", @@ -2366,14 +2527,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Admix_Enhancing_the_Transferability_of_Adversarial_Attacks_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Huazhong University of Science and Technology;Nanyang Technological University;Microsoft", + "aff_unique_norm": "Huazhong University of Science and Technology;Nanyang Technological University;Microsoft Research", "aff_unique_dep": "School of Computer Science and Technology;Wee Kim Wee School of Communication and Information;Research", "aff_unique_url": "http://www.hust.edu.cn;https://www.ntu.edu.sg;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "HUST;NTU;MSR Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Xiaosen and He,\n Xuanran and Wang,\n Jingdong and He,\n Kun\n},\n title = {\n Admix: Enhancing the Transferability of Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16158-16167\n} \n}" }, { "title": "AdvDrop: Adversarial Attack to DNNs by Dropping Information", @@ -2381,6 +2543,7 @@ "status": "Poster", "track": "main", "pid": 9497, + "author_site": "Ranjie Duan; Yuefeng Chen; Dantong Niu; Yun Yang; A. K. 
Qin; Yuan He", "author": "Ranjie Duan; Yuefeng Chen; Dantong Niu; Yun Yang; A. K. Qin; Yuan He", "abstract": "Human can easily recognize visual objects with lost information: even losing most details with only contour reserved, e.g. cartoon. However, in terms of visual perception of Deep Neural Networks (DNNs), the ability for recognizing abstract objects (visual objects with lost information) is still a challenge. In this work, we investigate this issue from an adversarial viewpoint: will the performance of DNNs decrease even for the images only losing a little information? Towards this end, we propose a novel adversarial attack, named AdvDrop, which crafts adversarial examples by dropping existing information of images. Previously, most adversarial attacks add extra disturbing information on clean images explicitly. Opposite to previous works, our proposed work explores the adversarial robustness of DNN models in a novel perspective by dropping imperceptible details to craft adversarial examples. We demonstrate the effectiveness of AdvDrop by extensive experiments, and show that this new type of adversarial examples is more difficult to be defended by current defense systems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Duan_AdvDrop_Adversarial_Attack_to_DNNs_by_Dropping_Information_ICCV_2021_paper.pdf", @@ -2404,7 +2567,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+1;1;2;0;0;1", - "aff_country_unique": "Australia;China;United States" + "aff_country_unique": "Australia;China;United States", + "bibtex": "@InProceedings{Duan_2021_ICCV,\n \n author = {\n Duan,\n Ranjie and Chen,\n Yuefeng and Niu,\n Dantong and Yang,\n Yun and Qin,\n A. K. 
and He,\n Yuan\n},\n title = {\n AdvDrop: Adversarial Attack to DNNs by Dropping Information\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7506-7515\n} \n}" }, { "title": "AdvRush: Searching for Adversarially Robust Neural Architectures", @@ -2412,6 +2576,7 @@ "status": "Poster", "track": "main", "pid": 1990, + "author_site": "Jisoo Mok; Byunggook Na; Hyeokjun Choe; Sungroh Yoon", "author": "Jisoo Mok; Byunggook Na; Hyeokjun Choe; Sungroh Yoon", "abstract": "Deep neural networks continue to awe the world with their remarkable performance. Their predictions, however, are prone to be corrupted by adversarial examples that are imperceptible to humans. Current efforts to improve the robustness of neural networks against adversarial examples are focused on developing robust training methods, which update the weights of a neural network in a more robust direction. In this work, we take a step beyond training of the weight parameters and consider the problem of designing an adversarially robust neural architecture with high intrinsic robustness. We propose AdvRush, a novel adversarial robustness-aware neural architecture search algorithm, based upon a finding that independent of the training method, the intrinsic robustness of a neural network can be represented with the smoothness of its input loss landscape. Through a regularizer that favors a candidate architecture with a smoother input loss landscape, AdvRush successfully discovers an adversarially robust neural architecture. Along with a comprehensive theoretical motivation for AdvRush, we conduct an extensive amount of experiments to demonstrate the efficacy of AdvRush on various benchmark datasets. 
Notably, on CIFAR-10, AdvRush achieves 55.91% robust accuracy under FGSM attack after standard training and 50.04% robust accuracy under AutoAttack after 7-step PGD adversarial training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mok_AdvRush_Searching_for_Adversarially_Robust_Neural_Architectures_ICCV_2021_paper.pdf", @@ -2435,7 +2600,8 @@ "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Mok_2021_ICCV,\n \n author = {\n Mok,\n Jisoo and Na,\n Byunggook and Choe,\n Hyeokjun and Yoon,\n Sungroh\n},\n title = {\n AdvRush: Searching for Adversarially Robust Neural Architectures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12322-12332\n} \n}" }, { "title": "Adversarial Attack on Deep Cross-Modal Hamming Retrieval", @@ -2443,6 +2609,7 @@ "status": "Poster", "track": "main", "pid": 10865, + "author_site": "Chao Li; Shangqian Gao; Cheng Deng; Wei Liu; Heng Huang", "author": "Chao Li; Shangqian Gao; Cheng Deng; Wei Liu; Heng Huang", "abstract": "Recently, Cross-Modal Hamming space Retrieval (CMHR) regains ever-increasing attention, mainly benefiting from the excellent representation capability of deep neural networks. On the other hand, the vulnerability of deep networks exposes a deep cross-modal retrieval system to various safety risks (e.g., adversarial attack). However, attacking deep cross-modal Hamming retrieval remains underexplored. In this paper, we propose an effective Adversarial Attack on Deep Cross-Modal Hamming Retrieval, dubbed AACH, which fools a target deep CMHR model in a black-box setting. 
Specifically, given a target model, we first construct its substitute model to exploit cross-modal correlations within hamming space, with which we create adversarial examples by limitedly querying from a target model. Furthermore, to enhance the efficiency of adversarial attacks, we design a triplet construction module to exploit cross-modal positive and negative instances. In this way, perturbations can be learned to fool the target model through pulling perturbed examples far away from the positive instances whereas pushing them close to the negative ones. Extensive experiments on three widely used cross-modal (image and text) retrieval benchmarks demonstrate the superiority of the proposed AACH. We find that AACH can successfully attack a given target deep CMHR model with fewer interactions, and that its performance is on par with previous state-of-the-art attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Adversarial_Attack_on_Deep_Cross-Modal_Hamming_Retrieval_ICCV_2021_paper.pdf", @@ -2459,14 +2626,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Adversarial_Attack_on_Deep_Cross-Modal_Hamming_Retrieval_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2;1+3", - "aff_unique_norm": "Xidian University;University of Pittsburgh;Tencent;JD", - "aff_unique_dep": ";;Data Platform;JD Explore Academy", + "aff_unique_norm": "Xidian University;University of Pittsburgh;Tencent;JD Explore Academy", + "aff_unique_dep": ";;Data Platform;", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.pitt.edu;https://www.tencent.com;", "aff_unique_abbr": "Xidian;Pitt;Tencent;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "China;United States;" + "aff_country_unique": "China;United States;", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Chao and Gao,\n Shangqian and Deng,\n Cheng and Liu,\n Wei and Huang,\n Heng\n},\n title 
= {\n Adversarial Attack on Deep Cross-Modal Hamming Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2218-2227\n} \n}" }, { "title": "Adversarial Attacks Are Reversible With Natural Supervision", @@ -2474,6 +2642,7 @@ "status": "Poster", "track": "main", "pid": 2870, + "author_site": "Chengzhi Mao; Mia Chiquier; Hao Wang; Junfeng Yang; Carl Vondrick", "author": "Chengzhi Mao; Mia Chiquier; Hao Wang; Junfeng Yang; Carl Vondrick", "abstract": "We find that images contain intrinsic structure that enables the reversal of many adversarial attacks. Attack vectors cause not only image classifiers to fail, but also collaterally disrupt incidental structure in the image. We demonstrate that modifying the attacked image to restore the natural structure will reverse many types of attacks, providing a defense. Experiments demonstrate significantly improved robustness for several state-of-the-art models across the CIFAR-10, CIFAR-100, SVHN, and ImageNet datasets. Our results show that our defense is still effective even if the attacker is aware of the defense mechanism. Since our defense is deployed during inference instead of training, it is compatible with pre-trained networks as well as most other defenses. 
Our results suggest deep networks are vulnerable to adversarial examples partly because their representations do not enforce the natural structure of images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mao_Adversarial_Attacks_Are_Reversible_With_Natural_Supervision_ICCV_2021_paper.pdf", @@ -2497,7 +2666,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mao_2021_ICCV,\n \n author = {\n Mao,\n Chengzhi and Chiquier,\n Mia and Wang,\n Hao and Yang,\n Junfeng and Vondrick,\n Carl\n},\n title = {\n Adversarial Attacks Are Reversible With Natural Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 661-671\n} \n}" }, { "title": "Adversarial Attacks on Multi-Agent Communication", @@ -2505,6 +2675,7 @@ "status": "Poster", "track": "main", "pid": 9139, + "author_site": "James Tu; Tsunhsuan Wang; Jingkang Wang; Sivabalan Manivasagam; Mengye Ren; Raquel Urtasun", "author": "James Tu; Tsunhsuan Wang; Jingkang Wang; Sivabalan Manivasagam; Mengye Ren; Raquel Urtasun", "abstract": "Growing at a fast pace, modern autonomous systems will soon be deployed at scale, opening up the possibility for cooperative multi-agent systems. Sharing information and distributing workloads allow autonomous agents to better perform tasks and increase computation efficiency. However, shared information can be modified to execute adversarial attacks on deep learning models that are widely employed in modern systems. Thus, we aim to study the robustness of such systems and focus on exploring adversarial attacks in a novel multi-agent setting where communication is done through sharing learned intermediate representations of neural networks. 
We observe that an indistinguishable adversarial message can severely degrade performance, but becomes weaker as the number of benign agents increases. Furthermore, we show that black-box transfer attacks are more difficult in this setting when compared to directly perturbing the inputs, as it is necessary to align the distribution of learned representations with domain adaptation. Our work studies robustness at the neural network level to contribute an additional layer of fault tolerance to modern security protocols for more secure multi-agent systems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tu_Adversarial_Attacks_on_Multi-Agent_Communication_ICCV_2021_paper.pdf", @@ -2528,7 +2699,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "1;2;1;1;1;1", - "aff_country_unique": ";Canada;United States" + "aff_country_unique": ";Canada;United States", + "bibtex": "@InProceedings{Tu_2021_ICCV,\n \n author = {\n Tu,\n James and Wang,\n Tsunhsuan and Wang,\n Jingkang and Manivasagam,\n Sivabalan and Ren,\n Mengye and Urtasun,\n Raquel\n},\n title = {\n Adversarial Attacks on Multi-Agent Communication\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7768-7777\n} \n}" }, { "title": "Adversarial Example Detection Using Latent Neighborhood Graph", @@ -2536,6 +2708,7 @@ "status": "Poster", "track": "main", "pid": 6309, + "author_site": "Ahmed Abusnaina; Yuhang Wu; Sunpreet Arora; Yizhen Wang; Fei Wang; Hao Yang; David Mohaisen", "author": "Ahmed Abusnaina; Yuhang Wu; Sunpreet Arora; Yizhen Wang; Fei Wang; Hao Yang; David Mohaisen", "abstract": "Detection of adversarial examples with high accuracy is critical for the security of deployed deep neural network-based models. 
We present the first graph-based adversarial detection method that constructs a Latent Neighborhood Graph (LNG) around an input example to determine if the input example is adversarial. Given an input example, selected reference adversarial and benign examples are used to capture the local manifold in the vicinity of the input example. The LNG node connectivity parameters are optimized jointly with the parameters of a graph attention network in an end-to-end manner to determine the optimal graph topology for adversarial example detection. The graph attention network is used to determine if the LNG is derived from an adversarial or benign input example. Experimental evaluations on CIFAR-10, STL-10, and ImageNet datasets, using six adversarial attack methods, demonstrate that the proposed method outperforms state-of-the-art adversarial detection methods in white-box and gray-box settings. The proposed method is able to successfully detect adversarial examples crafted with small perturbations using unseen attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Abusnaina_Adversarial_Example_Detection_Using_Latent_Neighborhood_Graph_ICCV_2021_paper.pdf", @@ -2559,7 +2732,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Abusnaina_2021_ICCV,\n \n author = {\n Abusnaina,\n Ahmed and Wu,\n Yuhang and Arora,\n Sunpreet and Wang,\n Yizhen and Wang,\n Fei and Yang,\n Hao and Mohaisen,\n David\n},\n title = {\n Adversarial Example Detection Using Latent Neighborhood Graph\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7687-7696\n} \n}" }, { "title": "Adversarial Robustness for Unsupervised Domain Adaptation", @@ -2567,10 +2741,11 @@ "status": "Poster", "track": "main", "pid": 
4222, + "author_site": "Muhammad Awais; Fengwei Zhou; Hang Xu; Lanqing Hong; Ping Luo; Sung-Ho Bae; Zhenguo Li", "author": "Muhammad Awais; Fengwei Zhou; Hang Xu; Lanqing Hong; Ping Luo; Sung-Ho Bae; Zhenguo Li", "abstract": "Extensive Unsupervised Domain Adaptation (UDA) studies have shown great success in practice by learning transferable representations across a labeled source domain and an unlabeled target domain with deep models. However, current work focuses on improving the generalization ability of UDA models on clean examples without considering the adversarial robustness, which is crucial in real-world applications. Conventional adversarial training methods are not suitable for the adversarial robustness on the unlabeled target domain of UDA since they train models with adversarial examples generated by the supervised loss function. In this work, we propose to leverage intermediate representations learned by robust ImageNet models to improve the robustness of UDA models. Our method works by aligning the features of the UDA model with the robust features learned by ImageNet pre-trained models along with domain adaptation training. It utilizes both labeled and unlabeled domains and instills robustness without any adversarial intervention or label requirement during domain adaptation training. Our experimental results show that our method significantly improves adversarial robustness compared to the baseline while keeping clean accuracy on various UDA benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Awais_Adversarial_Robustness_for_Unsupervised_Domain_Adaptation_ICCV_2021_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab + Dept. of Computer Science, Kyung-Hee University, South Korea; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Dept. of Computer Science, The University of Hong Kong; Dept. 
of Computer Science, Kyung-Hee University, South Korea; Huawei Noah\u2019s Ark Lab", + "aff": "Huawei Noah’s Ark Lab + Dept. of Computer Science, Kyung-Hee University, South Korea; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Dept. of Computer Science, The University of Hong Kong; Dept. of Computer Science, Kyung-Hee University, South Korea; Huawei Noah’s Ark Lab", "project": "awaisrauf.github.io/robust_uda", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Awais_Adversarial_Robustness_for_ICCV_2021_supplemental.pdf", @@ -2583,14 +2758,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Awais_Adversarial_Robustness_for_Unsupervised_Domain_Adaptation_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;0;2;1;0", - "aff_unique_norm": "Huawei;Kyung-Hee University;University of Hong Kong", - "aff_unique_dep": "Noah\u2019s Ark Lab;Dept. of Computer Science;Dept. of Computer Science", + "aff_unique_norm": "Huawei;Kyung-Hee University;The University of Hong Kong", + "aff_unique_dep": "Noah’s Ark Lab;Dept. of Computer Science;Dept. 
of Computer Science", "aff_unique_url": "https://www.huawei.com;http://www.khu.ac.kr;https://www.hku.hk", "aff_unique_abbr": "Huawei;KHU;HKU", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+1;0;0;0;0;1;0", - "aff_country_unique": "China;South Korea" + "aff_country_unique": "China;South Korea", + "bibtex": "@InProceedings{Awais_2021_ICCV,\n \n author = {\n Awais,\n Muhammad and Zhou,\n Fengwei and Xu,\n Hang and Hong,\n Lanqing and Luo,\n Ping and Bae,\n Sung-Ho and Li,\n Zhenguo\n},\n title = {\n Adversarial Robustness for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8568-8577\n} \n}" }, { "title": "Adversarial Unsupervised Domain Adaptation With Conditional and Label Shift: Infer, Align and Iterate", @@ -2598,6 +2774,7 @@ "status": "Poster", "track": "main", "pid": 9313, + "author_site": "Xiaofeng Liu; Zhenhua Guo; Site Li; Fangxu Xing; Jane You; C.-C. Jay Kuo; Georges El Fakhri; Jonghye Woo", "author": "Xiaofeng Liu; Zhenhua Guo; Site Li; Fangxu Xing; Jane You; C.-C. Jay Kuo; Georges El Fakhri; Jonghye Woo", "abstract": "In this work, we propose an adversarial unsupervised domain adaptation (UDA) approach with the inherent conditional and label shifts, in which we aim to align the distributions w.r.t. both p(x|y) and p(y). Since the label is inaccessible in the target domain, the conventional adversarial UDA assumes p(y) is invariant across domains, and relies on aligning p(x) as an alternative to the p(x|y) alignment. To address this, we provide a thorough theoretical and empirical analysis of the conventional adversarial UDA methods under both conditional and label shifts, and propose a novel and practical alternative optimization scheme for adversarial UDA. 
Specifically, we infer the marginal p(y) and align p(x|y) iteratively in the training, and precisely align the posterior p(y|x) in testing. Our experimental results demonstrate its effectiveness on both classification and segmentation UDA, and partial UDA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Adversarial_Unsupervised_Domain_Adaptation_With_Conditional_and_Label_Shift_Infer_ICCV_2021_paper.pdf", @@ -2612,7 +2789,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Adversarial_Unsupervised_Domain_Adaptation_With_Conditional_and_Label_Shift_Infer_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Adversarial_Unsupervised_Domain_Adaptation_With_Conditional_and_Label_Shift_Infer_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Xiaofeng and Guo,\n Zhenhua and Li,\n Site and Xing,\n Fangxu and You,\n Jane and Kuo,\n C.-C. Jay and El Fakhri,\n Georges and Woo,\n Jonghye\n},\n title = {\n Adversarial Unsupervised Domain Adaptation With Conditional and Label Shift: Infer,\n Align and Iterate\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10367-10376\n} \n}" }, { "title": "Adversarial VQA: A New Benchmark for Evaluating the Robustness of VQA Models", @@ -2620,6 +2798,7 @@ "status": "Poster", "track": "main", "pid": 9311, + "author_site": "Linjie Li; Jie Lei; Zhe Gan; Jingjing Liu", "author": "Linjie Li; Jie Lei; Zhe Gan; Jingjing Liu", "abstract": "Benefiting from large-scale pre-training, we have witnessed significant performance boost on the popular Visual Question Answering (VQA) task. Despite rapid progress, it remains unclear whether these state-of-the-art (SOTA) models are robust when encountering examples in the wild. 
To study this, we introduce Adversarial VQA, a new large-scale VQA benchmark, collected iteratively via an adversarial human-and-model-in-the-loop procedure. Through this new benchmark, we discover several interesting findings. (i) Surprisingly, we find that during dataset collection, non-expert annotators can easily attack SOTA VQA models successfully. (ii) Both large-scale pre-trained models and adversarial training methods achieve far worse performance on the new benchmark than over standard VQA v2 dataset, revealing the fragility of these models while demonstrating the effectiveness of our adversarial dataset. (iii) When used for data augmentation, our dataset can effectively boost model performance on other robust VQA benchmarks. We hope our Adversarial VQA dataset can shed new light on robustness study in the community and serve as a valuable benchmark for future work.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Adversarial_VQA_A_New_Benchmark_for_Evaluating_the_Robustness_of_ICCV_2021_paper.pdf", @@ -2636,14 +2815,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Adversarial_VQA_A_New_Benchmark_for_Evaluating_the_Robustness_of_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2", - "aff_unique_norm": "Microsoft;University of North Carolina at Chapel Hill;Tsinghua University", - "aff_unique_dep": "Microsoft Corporation;;", + "aff_unique_norm": "Microsoft Corporation;University of North Carolina at Chapel Hill;Tsinghua University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.microsoft.com;https://www.unc.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Microsoft;UNC;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Linjie and Lei,\n Jie and Gan,\n Zhe and Liu,\n 
Jingjing\n},\n title = {\n Adversarial VQA: A New Benchmark for Evaluating the Robustness of VQA Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2042-2051\n} \n}" }, { "title": "AgentFormer: Agent-Aware Transformers for Socio-Temporal Multi-Agent Forecasting", @@ -2651,6 +2831,7 @@ "status": "Poster", "track": "main", "pid": 1012, + "author_site": "Ye Yuan; Xinshuo Weng; Yanglan Ou; Kris M. Kitani", "author": "Ye Yuan; Xinshuo Weng; Yanglan Ou; Kris M. Kitani", "abstract": "Predicting accurate future trajectories of multiple agents is essential for autonomous systems but is challenging due to the complex interaction between agents and the uncertainty in each agent's future behavior. Forecasting multi-agent trajectories requires modeling two key dimensions: (1) time dimension, where we model the influence of past agent states over future states; (2) social dimension, where we model how the state of each agent affects others. Most prior methods model these two dimensions separately, e.g., first using a temporal model to summarize features over time for each agent independently and then modeling the interaction of the summarized features with a social model. This approach is suboptimal since independent feature encoding over either the time or social dimension can result in a loss of information. Instead, we would prefer a method that allows an agent's state at one time to directly affect another agent's state at a future time. To this end, we propose a new Transformer, termed AgentFormer, that simultaneously models the time and social dimensions. The model leverages a sequence representation of multi-agent trajectories by flattening trajectory features across time and agents. 
Since standard attention operations disregard the agent identity of each element in the sequence, AgentFormer uses a novel agent-aware attention mechanism that preserves agent identities by attending to elements of the same agent differently than elements of other agents. Based on AgentFormer, we propose a stochastic multi-agent trajectory prediction model that can attend to features of any agent at any previous timestep when inferring an agent's future position. The latent intent of all agents is also jointly modeled, allowing the stochasticity in one agent's behavior to affect other agents. Extensive experiments show that our method significantly improves the state of the art on well-established pedestrian and autonomous driving datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_AgentFormer_Agent-Aware_Transformers_for_Socio-Temporal_Multi-Agent_Forecasting_ICCV_2021_paper.pdf", @@ -2665,7 +2846,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_AgentFormer_Agent-Aware_Transformers_for_Socio-Temporal_Multi-Agent_Forecasting_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_AgentFormer_Agent-Aware_Transformers_for_Socio-Temporal_Multi-Agent_Forecasting_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Ye and Weng,\n Xinshuo and Ou,\n Yanglan and Kitani,\n Kris M.\n},\n title = {\n AgentFormer: Agent-Aware Transformers for Socio-Temporal Multi-Agent Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9813-9823\n} \n}" }, { "title": "Aggregation With Feature Detection", @@ -2673,6 +2855,7 @@ "status": "Poster", "track": "main", "pid": 7952, + "author_site": "Shuyang Sun; Xiaoyu Yue; Xiaojuan Qi; Wanli Ouyang; Victor Adrian Prisacariu; Philip H.S. 
Torr", "author": "Shuyang Sun; Xiaoyu Yue; Xiaojuan Qi; Wanli Ouyang; Victor Adrian Prisacariu; Philip H.S. Torr", "abstract": "Aggregating features from different depths of a network is widely adopted to improve the network capability. Lots of modern architectures are equipped with skip connections, which actually makes the feature aggregation happen in all these networks. Since different features tell different semantic meanings, there are inconsistencies and incompatibilities to be solved. However, existing works naively blend deep features via element-wise summation or concatenation with a convolution behind. Better feature aggregation method beyond summation or concatenation is rarely explored. In this paper, given two layers of features to be aggregated together, we first detect and identify where and what needs to be updated in one layer, then replace the feature at the identified location with the information of the other layer. This process, which we call DEtect-rePLAce (DEPLA), enables us to avoid inconsistent patterns while keeping useful information in the merged outputs. Experimental results demonstrate our method largely boosts multiple baselines e.g. 
ResNet, FishNet and FPN on three major vision tasks including ImageNet classification, MS COCO object detection and instance segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Aggregation_With_Feature_Detection_ICCV_2021_paper.pdf", @@ -2687,7 +2870,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sun_Aggregation_With_Feature_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sun_Aggregation_With_Feature_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Shuyang and Yue,\n Xiaoyu and Qi,\n Xiaojuan and Ouyang,\n Wanli and Prisacariu,\n Victor Adrian and Torr,\n Philip H.S.\n},\n title = {\n Aggregation With Feature Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 527-536\n} \n}" }, { "title": "Aha! Adaptive History-Driven Attack for Decision-Based Black-Box Models", @@ -2695,10 +2879,11 @@ "status": "Poster", "track": "main", "pid": 2086, + "author_site": "Jie Li; Rongrong Ji; Peixian Chen; Baochang Zhang; Xiaopeng Hong; Ruixin Zhang; Shaoxin Li; Jilin Li; Feiyue Huang; Yongjian Wu", "author": "Jie Li; Rongrong Ji; Peixian Chen; Baochang Zhang; Xiaopeng Hong; Ruixin Zhang; Shaoxin Li; Jilin Li; Feiyue Huang; Yongjian Wu", "abstract": "The decision-based black-box attack means to craft adversarial examples with only the top-1 label of the victim model available. A common practice is to start from a large perturbation and then iteratively reduce it with a deterministic direction and a random one while keeping it adversarial. The limited information obtained from each query and inefficient direction sampling impede attack efficiency, making it hard to obtain a small enough perturbation within a limited number of queries. 
To tackle this problem, we propose a novel attack method termed Adaptive History-driven Attack (AHA) which gathers information from all historical queries as the prior for current sampling. Moreover, to balance between the deterministic direction and the random one, we dynamically adjust the coefficient according to the ratio of the actual magnitude reduction to the expected one. Such a strategy improves the success rate of queries during optimization, letting adversarial examples move swiftly along the decision boundary. Our method can also integrate with subspace optimization like dimension reduction to further improve efficiency. Extensive experiments on both ImageNet and CelebA datasets demonstrate that our method achieves at least 24.3% lower magnitude of perturbation on average with the same number of queries. Finally, we prove the practical potential of our method by evaluating it on popular defense methods and a real-world system provided by MEGVII Face++.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Aha_Adaptive_History-Driven_Attack_for_Decision-Based_Black-Box_Models_ICCV_2021_paper.pdf", - "aff": "MAC Lab, School of Informatics, Xiamen University+Peng Cheng Lab+Institute of Arti\ufb01cial Intelligence, Xiamen University; MAC Lab, School of Informatics, Xiamen University+Peng Cheng Lab+Institute of Arti\ufb01cial Intelligence, Xiamen University; MAC Lab, School of Informatics, Xiamen University+Youtu Lab, Tencent; Beihang University; Xi\u2019an Jiaotong University; Youtu Lab, Tencent; Youtu Lab, Tencent; Youtu Lab, Tencent; Youtu Lab, Tencent; Youtu Lab, Tencent", + "aff": "MAC Lab, School of Informatics, Xiamen University+Peng Cheng Lab+Institute of Artificial Intelligence, Xiamen University; MAC Lab, School of Informatics, Xiamen University+Peng Cheng Lab+Institute of Artificial Intelligence, Xiamen University; MAC Lab, School of Informatics, Xiamen University+Youtu Lab, Tencent; Beihang University; Xi’an Jiaotong University; 
Youtu Lab, Tencent; Youtu Lab, Tencent; Youtu Lab, Tencent; Youtu Lab, Tencent; Youtu Lab, Tencent", "project": "", "github": "", "supp": "", @@ -2711,14 +2896,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Aha_Adaptive_History-Driven_Attack_for_Decision-Based_Black-Box_Models_ICCV_2021_paper.html", "aff_unique_index": "0+1+0;0+1+0;0+2;3;4;2;2;2;2;2", - "aff_unique_norm": "Xiamen University;Pengcheng Laboratory;Tencent;Beihang University;Xi'an Jiao Tong University", - "aff_unique_dep": "School of Informatics;Peng Cheng Lab;Youtu Lab;;", + "aff_unique_norm": "Xiamen University;Peng Cheng Lab;Tencent;Beihang University;Xi'an Jiaotong University", + "aff_unique_dep": "School of Informatics;;Youtu Lab;;", "aff_unique_url": "https://www.xmu.edu.cn;;https://www.tencent.com;http://www.buaa.edu.cn/;https://www.xjtu.edu.cn", "aff_unique_abbr": "XMU;;Tencent;BUAA;XJTU", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Jie and Ji,\n Rongrong and Chen,\n Peixian and Zhang,\n Baochang and Hong,\n Xiaopeng and Zhang,\n Ruixin and Li,\n Shaoxin and Li,\n Jilin and Huang,\n Feiyue and Wu,\n Yongjian\n},\n title = {\n Aha! 
Adaptive History-Driven Attack for Decision-Based Black-Box Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16168-16177\n} \n}" }, { "title": "Airbert: In-Domain Pretraining for Vision-and-Language Navigation", @@ -2726,10 +2912,11 @@ "status": "Poster", "track": "main", "pid": 2609, + "author_site": "Pierre-Louis Guhur; Makarand Tapaswi; Shizhe Chen; Ivan Laptev; Cordelia Schmid", "author": "Pierre-Louis Guhur; Makarand Tapaswi; Shizhe Chen; Ivan Laptev; Cordelia Schmid", "abstract": "Vision-and-language navigation (VLN) aims to enable embodied agents to navigate in realistic environments using natural language instructions. Given the scarcity of domain-specific training data and the high diversity of image and language inputs, the generalization of VLN agents to unseen environments remains challenging. Recent methods explore pretraining to improve generalization, however, the use of generic image-caption datasets or existing small-scale VLN environments is suboptimal and results in limited improvements. In this work, we introduce BnB, a large-scale and diverse in-domain VLN dataset. We first collect image-caption (IC) pairs from hundreds of thousands of listings from online rental marketplaces. Using IC pairs we next propose automatic strategies to generate millions of VLN path-instruction (PI) pairs. We further propose a shuffling loss that improves the learning of temporal order inside PI pairs. We use BnB to pretrain our Airbert model that can be adapted to discriminative and generative settings and show that it outperforms state of the art for Room-to-Room (R2R) navigation and Remote Referring Expression (REVERIE) benchmarks. 
Moreover, our in-domain pretraining significantly increases performance on a challenging few-shot VLN evaluation, where we train the model only on VLN instructions from a few houses.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guhur_Airbert_In-Domain_Pretraining_for_Vision-and-Language_Navigation_ICCV_2021_paper.pdf", - "aff": "Inria, \u00b4Ecole normale sup\u00e9rieure, CNRS, PSL Research University, Paris, France; IIIT Hyderabad, India; Inria, \u00b4Ecole normale sup\u00e9rieure, CNRS, PSL Research University, Paris, France; Inria, \u00b4Ecole normale sup\u00e9rieure, CNRS, PSL Research University, Paris, France; Inria, \u00b4Ecole normale sup\u00e9rieure, CNRS, PSL Research University, Paris, France", + "aff": "Inria, ´Ecole normale supérieure, CNRS, PSL Research University, Paris, France; IIIT Hyderabad, India; Inria, ´Ecole normale supérieure, CNRS, PSL Research University, Paris, France; Inria, ´Ecole normale supérieure, CNRS, PSL Research University, Paris, France; Inria, ´Ecole normale supérieure, CNRS, PSL Research University, Paris, France", "project": "https://airbert-vln.github.io", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Guhur_Airbert_In-Domain_Pretraining_ICCV_2021_supplemental.pdf", @@ -2742,14 +2929,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guhur_Airbert_In-Domain_Pretraining_for_Vision-and-Language_Navigation_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;0", - "aff_unique_norm": "INRIA;International Institute of Information Technology, Hyderabad", + "aff_unique_norm": "Inria;International Institute of Information Technology, Hyderabad", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://iiit Hyderabad.ac.in", "aff_unique_abbr": "Inria;IIIT Hyderabad", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hyderabad", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "France;India" + "aff_country_unique": 
"France;India", + "bibtex": "@InProceedings{Guhur_2021_ICCV,\n \n author = {\n Guhur,\n Pierre-Louis and Tapaswi,\n Makarand and Chen,\n Shizhe and Laptev,\n Ivan and Schmid,\n Cordelia\n},\n title = {\n Airbert: In-Domain Pretraining for Vision-and-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1634-1643\n} \n}" }, { "title": "Aligning Latent and Image Spaces To Connect the Unconnectable", @@ -2757,6 +2945,7 @@ "status": "Poster", "track": "main", "pid": 9236, + "author_site": "Ivan Skorokhodov; Grigorii Sotnikov; Mohamed Elhoseiny", "author": "Ivan Skorokhodov; Grigorii Sotnikov; Mohamed Elhoseiny", "abstract": "In this work, we develop a method to generate infinite high-resolution images with diverse and complex content. It is based on a perfectly equivariant patch-wise generator with synchronous interpolations in the image and latent spaces. Latent codes, when sampled, are positioned on the coordinate grid, and each pixel is computed from an interpolation of the neighboring codes. We modify the AdaIN mechanism to work in such a setup and train a GAN model to generate images positioned between any two latent vectors. At test time, this allows for generating infinitely large images of diverse scenes that transition naturally from one into another. Apart from that, we introduce LHQ: a new dataset of 90k high-resolution nature landscapes. We test the approach on LHQ, LSUN Tower and LSUN Bridge and outperform the baselines by at least 4 times in terms of quality and diversity of the produced infinite images. 
The project website is located at https://universome.github.io/alis.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Skorokhodov_Aligning_Latent_and_Image_Spaces_To_Connect_the_Unconnectable_ICCV_2021_paper.pdf", @@ -2780,7 +2969,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2+2;0", - "aff_country_unique": "Saudi Arabia;United States;Russian Federation" + "aff_country_unique": "Saudi Arabia;United States;Russia", + "bibtex": "@InProceedings{Skorokhodov_2021_ICCV,\n \n author = {\n Skorokhodov,\n Ivan and Sotnikov,\n Grigorii and Elhoseiny,\n Mohamed\n},\n title = {\n Aligning Latent and Image Spaces To Connect the Unconnectable\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14144-14153\n} \n}" }, { "title": "Aligning Subtitles in Sign Language Videos", @@ -2788,7 +2978,8 @@ "status": "Poster", "track": "main", "pid": 3906, - "author": "Hannah Bull; Triantafyllos Afouras; G\u00fcl Varol; Samuel Albanie; Liliane Momeni; Andrew Zisserman", + "author_site": "Hannah Bull; Triantafyllos Afouras; Gül Varol; Samuel Albanie; Liliane Momeni; Andrew Zisserman", + "author": "Hannah Bull; Triantafyllos Afouras; Gül Varol; Samuel Albanie; Liliane Momeni; Andrew Zisserman", "abstract": "The goal of this work is to temporally align asynchronous subtitles in sign language videos. In particular, we focus on sign-language interpreted TV broadcast data comprising (i) a video of continuous signing, and (ii) subtitles corresponding to the audio content. Previous work exploiting such weakly-aligned data only considered finding keyword-sign correspondences, whereas we aim to localise a complete subtitle text in continuous signing. We propose a Transformer architecture tailored for this task, which we train on manually annotated alignments covering over 15K subtitles that span 17.7 hours of video. 
We use BERT subtitle embeddings and CNN video representations learned for sign recognition to encode the two signals, which interact through a series of attention layers. Our model outputs frame-level predictions, i.e., for each video frame, whether it belongs to the queried subtitle or not. Through extensive evaluations, we show substantial improvements over existing alignment baselines that do not make use of subtitle text embeddings for learning. Our automatic alignment model opens up possibilities for advancing machine translation of sign languages via providing continuously synchronized video-text data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bull_Aligning_Subtitles_in_Sign_Language_Videos_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -2802,7 +2993,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bull_Aligning_Subtitles_in_Sign_Language_Videos_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bull_Aligning_Subtitles_in_Sign_Language_Videos_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Bull_2021_ICCV,\n \n author = {\n Bull,\n Hannah and Afouras,\n Triantafyllos and Varol,\n G\\\"ul and Albanie,\n Samuel and Momeni,\n Liliane and Zisserman,\n Andrew\n},\n title = {\n Aligning Subtitles in Sign Language Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11552-11561\n} \n}" }, { "title": "Always Be Dreaming: A New Approach for Data-Free Class-Incremental Learning", @@ -2810,6 +3002,7 @@ "status": "Poster", "track": "main", "pid": 4340, + "author_site": "James Smith; Yen-Chang Hsu; Jonathan Balloch; Yilin Shen; Hongxia Jin; Zsolt Kira", "author": "James Smith; Yen-Chang Hsu; Jonathan Balloch; Yilin Shen; Hongxia Jin; Zsolt Kira", "abstract": "Modern computer vision applications suffer from catastrophic forgetting
when incrementally learning new concepts over time. The most successful approaches to alleviate this forgetting require extensive replay of previously seen data, which is problematic when memory constraints or data legality concerns exist. In this work, we consider the high-impact problem of Data-Free Class-Incremental Learning (DFCIL), where an incremental learning agent must learn new concepts over time without storing generators or training data from past tasks. One approach for DFCIL is to replay synthetic images produced by inverting a frozen copy of the learner's classification model, but we show this approach fails for common class-incremental benchmarks when using standard distillation strategies. We diagnose the cause of this failure and propose a novel incremental distillation strategy for DFCIL, contributing a modified cross-entropy training and importance-weighted feature distillation, and show that our method results in up to a 25.1% increase in final task accuracy (absolute difference) compared to SOTA DFCIL methods for common class-incremental benchmarks. 
Our method even outperforms several standard replay based methods which store a coreset of images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Smith_Always_Be_Dreaming_A_New_Approach_for_Data-Free_Class-Incremental_Learning_ICCV_2021_paper.pdf", @@ -2826,14 +3019,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Smith_Always_Be_Dreaming_A_New_Approach_for_Data-Free_Class-Incremental_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;1;1;0", - "aff_unique_norm": "Georgia Institute of Technology;Samsung", - "aff_unique_dep": ";Samsung Research America", + "aff_unique_norm": "Georgia Institute of Technology;Samsung Research America", + "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.samsung.com/us/careers/research/", "aff_unique_abbr": "Georgia Tech;SRA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Smith_2021_ICCV,\n \n author = {\n Smith,\n James and Hsu,\n Yen-Chang and Balloch,\n Jonathan and Shen,\n Yilin and Jin,\n Hongxia and Kira,\n Zsolt\n},\n title = {\n Always Be Dreaming: A New Approach for Data-Free Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9374-9384\n} \n}" }, { "title": "Amplitude-Phase Recombination: Rethinking Robustness of Convolutional Neural Networks in Frequency Domain", @@ -2841,6 +3035,7 @@ "status": "Poster", "track": "main", "pid": 2522, + "author_site": "Guangyao Chen; Peixi Peng; Li Ma; Jia Li; Lin Du; Yonghong Tian", "author": "Guangyao Chen; Peixi Peng; Li Ma; Jia Li; Lin Du; Yonghong Tian", "abstract": "Recently, the generalization behavior of Convolutional Neural Networks (CNN) is gradually transparent through explanation techniques with the 
frequency components decomposition. However, the importance of the phase spectrum of the image for a robust vision system is still ignored. In this paper, we notice that the CNN tends to converge at the local optimum which is closely related to the high-frequency components of the training images, while the amplitude spectrum is easily disturbed such as noises or common corruptions. In contrast, more empirical studies found that humans rely on more phase components to achieve robust recognition. This observation leads to more explanations of the CNN's generalization behaviors in both robustness to common perturbations and out-of-distribution detection, and motivates a new perspective on data augmentation designed by re-combing the phase spectrum of the current image and the amplitude spectrum of the distracter image. That is, the generated samples force the CNN to pay more attention to the structured information from phase components and keep robust to the variation of the amplitude. Experiments on several image datasets indicate that the proposed method achieves state-of-the-art performances on multiple generalizations and calibration tasks, including adaptability for common corruptions and surface variations, out-of-distribution detection, and adversarial attack.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Amplitude-Phase_Recombination_Rethinking_Robustness_of_Convolutional_Neural_Networks_in_Frequency_ICCV_2021_paper.pdf", @@ -2857,14 +3052,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Amplitude-Phase_Recombination_Rethinking_Robustness_of_Convolutional_Neural_Networks_in_Frequency_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1;2+1;3;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Beihang University;Huawei", - "aff_unique_dep": "Department of Computer Science and Technology;Peng Cheng Laboratory;State Key Laboratory of Virtual Reality Technology and Systems, SCSE;AI 
Application Research Center", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Beihang University;Huawei", + "aff_unique_dep": "Department of Computer Science and Technology;;State Key Laboratory of Virtual Reality Technology and Systems, SCSE;AI Application Research Center", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn;http://www.buaa.edu.cn;https://www.huawei.com", "aff_unique_abbr": "Peking U;;Beihang;Huawei", "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Guangyao and Peng,\n Peixi and Ma,\n Li and Li,\n Jia and Du,\n Lin and Tian,\n Yonghong\n},\n title = {\n Amplitude-Phase Recombination: Rethinking Robustness of Convolutional Neural Networks in Frequency Domain\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 458-467\n} \n}" }, { "title": "An Asynchronous Kalman Filter for Hybrid Event Cameras", @@ -2872,6 +3068,7 @@ "status": "Poster", "track": "main", "pid": 6755, + "author_site": "Ziwei Wang; Yonhon Ng; Cedric Scheerlinck; Robert Mahony", "author": "Ziwei Wang; Yonhon Ng; Cedric Scheerlinck; Robert Mahony", "abstract": "Event cameras are ideally suited to capture HDR visual information without blur but perform poorly on static or slowly changing scenes. Conversely, conventional image sensors measure absolute intensity of slowly changing scenes effectively but do poorly on high dynamic range or quickly changing scenes. In this paper, we present an event-based video reconstruction pipeline for High Dynamic Range (HDR) scenarios. The proposed algorithm includes a frame augmentation pre-processing step that deblurs and temporally interpolates frame data using events. 
The augmented frame and event data are then fused using a novel asynchronous Kalman filter under a unifying uncertainty model for both sensors. Our experimental results are evaluated on both publicly available datasets with challenging lighting conditions and fast motions and our new dataset with HDR reference. The proposed algorithm outperforms state-of-the-art methods in both absolute intensity error (48% reduction) and image similarity indexes (average 11% improvement).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_An_Asynchronous_Kalman_Filter_for_Hybrid_Event_Cameras_ICCV_2021_paper.pdf", @@ -2895,7 +3092,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Ziwei and Ng,\n Yonhon and Scheerlinck,\n Cedric and Mahony,\n Robert\n},\n title = {\n An Asynchronous Kalman Filter for Hybrid Event Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 448-457\n} \n}" }, { "title": "An Elastica Geodesic Approach With Convexity Shape Prior", @@ -2903,10 +3101,11 @@ "status": "Poster", "track": "main", "pid": 8098, + "author_site": "Da Chen; Laurent D. Cohen; Jean-Marie Mirebeau; Xue-Cheng Tai", "author": "Da Chen; Laurent D. Cohen; Jean-Marie Mirebeau; Xue-Cheng Tai", "abstract": "The minimal geodesic models based on the Eikonal equations are capable of finding suitable solutions in various image segmentation scenarios. Existing geodesic-based segmentation approaches usually exploit the image features in conjunction with geometric regularization terms (such as curve length or elastica length) for computing geodesic paths. 
In this paper, we consider a more complicated problem: finding simple and closed geodesic curves which are imposed a convexity shape prior. The proposed approach relies on an orientation-lifting strategy, by which a planar curve can be mapped to an high-dimensional orientation space. The convexity shape prior serves as a constraint for the construction of local metrics. The geodesic curves in the lifted space then can be efficiently computed through the fast marching method. In addition, we introduce a way to incorporate region-based homogeneity features into the proposed geodesic model so as to solve the region-based segmentation issues with shape prior constraints.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_An_Elastica_Geodesic_Approach_With_Convexity_Shape_Prior_ICCV_2021_paper.pdf", - "aff": "Shandong Artificial Intelligence Institute, Qilu University of Technology (Shandong Academy of Sciences), China; University Paris Dauphine, PSL Research University, CNRS, UMR 7534, CEREMADE, Paris, France; Laboratoire de math\u00e9matiques d\u2019Orsay, CNRS, Universit\u00e9 Paris-Sud, Universit\u00e9 Paris-Saclay, 91405 ORSAY, France; Department of Mathematics, Hong Kong Baptist University, Hong Kong", + "aff": "Shandong Artificial Intelligence Institute, Qilu University of Technology (Shandong Academy of Sciences), China; University Paris Dauphine, PSL Research University, CNRS, UMR 7534, CEREMADE, Paris, France; Laboratoire de mathématiques d’Orsay, CNRS, Université Paris-Sud, Université Paris-Saclay, 91405 ORSAY, France; Department of Mathematics, Hong Kong Baptist University, Hong Kong", "project": "", "github": "", "supp": "", @@ -2919,14 +3118,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_An_Elastica_Geodesic_Approach_With_Convexity_Shape_Prior_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3", - "aff_unique_norm": "Qilu University of Technology;University Paris Dauphine;Universit\u00e9 
Paris-Sud;Hong Kong Baptist University", - "aff_unique_dep": "Shandong Artificial Intelligence Institute;CEREMADE;Laboratoire de math\u00e9matiques d\u2019Orsay;Department of Mathematics", + "aff_unique_norm": "Qilu University of Technology;University Paris Dauphine;Université Paris-Sud;Hong Kong Baptist University", + "aff_unique_dep": "Shandong Artificial Intelligence Institute;CEREMADE;Laboratoire de mathématiques d’Orsay;Department of Mathematics", "aff_unique_url": ";https://www.univ-paris-dauphine.fr;https://www.universite-paris-sud.fr;https://www.hkbu.edu.hk", "aff_unique_abbr": ";UPD;UPS;HKBU", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Paris;Orsay;Hong Kong SAR", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Da and Cohen,\n Laurent D. and Mirebeau,\n Jean-Marie and Tai,\n Xue-Cheng\n},\n title = {\n An Elastica Geodesic Approach With Convexity Shape Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6900-6909\n} \n}" }, { "title": "An Empirical Study of Training Self-Supervised Vision Transformers", @@ -2934,6 +3134,7 @@ "status": "Poster", "track": "main", "pid": 2611, + "author_site": "Xinlei Chen; Saining Xie; Kaiming He", "author": "Xinlei Chen; Saining Xie; Kaiming He", "abstract": "This paper does not describe a novel method. Instead, it studies a straightforward, incremental, yet must-know baseline given the recent progress in computer vision: self-supervised learning for Vision Transformers (ViT). While the training recipes for standard convolutional networks have been highly mature and robust, the recipes for ViT are yet to be built, especially in the self-supervised scenarios where training becomes more challenging. 
In this work, we go back to basics and investigate the effects of several fundamental components for training self-supervised ViT. We observe that instability is a major issue that degrades accuracy, and it can be hidden by apparently good results. We reveal that these results are indeed partial failure, and they can be improved when training is made more stable. We benchmark ViT results in MoCo v3 and several other self-supervised frameworks, with ablations in various aspects. We discuss the currently positive evidence as well as challenges and open questions. We hope that this work will provide useful data points and experience for future research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_An_Empirical_Study_of_Training_Self-Supervised_Vision_Transformers_ICCV_2021_paper.pdf", @@ -2948,7 +3149,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_An_Empirical_Study_of_Training_Self-Supervised_Vision_Transformers_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_An_Empirical_Study_of_Training_Self-Supervised_Vision_Transformers_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Xinlei and Xie,\n Saining and He,\n Kaiming\n},\n title = {\n An Empirical Study of Training Self-Supervised Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9640-9649\n} \n}" }, { "title": "An Empirical Study of the Collapsing Problem in Semi-Supervised 2D Human Pose Estimation", @@ -2956,6 +3158,7 @@ "status": "Poster", "track": "main", "pid": 3910, + "author_site": "Rongchang Xie; Chunyu Wang; Wenjun Zeng; Yizhou Wang", "author": "Rongchang Xie; Chunyu Wang; Wenjun Zeng; Yizhou Wang", "abstract": "The state-of-the-art semi-supervised learning models are consistency-based which 
learn about unlabeled images by maximizing the similarity between different augmentations of an image. But when we apply the methods to human pose estimation which has extremely imbalanced class distribution, the models often collapse and predict every pixel in unlabeled images as background. This is because the decision boundary may pass through the high-density area of the minor class so more and more pixels are gradually mis-classified as the background class. In this work, we present a surprisingly simple approach to drive the model to learn in the correct direction. For each image, it composes a pair of easy and hard augmentations and uses the more accurate predictions on the easy image to teach the network to learn about the hard one. The accuracy superiority of teaching signals allows the network to be \"monotonically\" improved which effectively avoids collapsing. We apply our method to recent pose estimators and find that they achieve significantly better performances than their supervised counterparts on three public datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_An_Empirical_Study_of_the_Collapsing_Problem_in_Semi-Supervised_2D_ICCV_2021_paper.pdf", @@ -2972,14 +3175,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xie_An_Empirical_Study_of_the_Collapsing_Problem_in_Semi-Supervised_2D_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "Peking University;Microsoft", + "aff_unique_norm": "Peking University;Microsoft Research", "aff_unique_dep": "Center for Data Science;Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "PKU;MSR Asia", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Beijing;Asia;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Rongchang and Wang,\n 
Chunyu and Zeng,\n Wenjun and Wang,\n Yizhou\n},\n title = {\n An Empirical Study of the Collapsing Problem in Semi-Supervised 2D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11240-11249\n} \n}" }, { "title": "An End-to-End Transformer Model for 3D Object Detection", @@ -2987,6 +3191,7 @@ "status": "Poster", "track": "main", "pid": 1901, + "author_site": "Ishan Misra; Rohit Girdhar; Armand Joulin", "author": "Ishan Misra; Rohit Girdhar; Armand Joulin", "abstract": "We propose 3DETR, an end-to-end Transformer based object detection model for 3D point clouds. Compared to existing detection methods that employ a number of 3D-specific inductive biases, 3DETR requires minimal modifications to the vanilla Transformer block. Specifically, we find that a standard Transformer with non-parametric queries and Fourier positional embeddings is competitive with specialized architectures that employ libraries of 3D-specific operators with hand-tuned hyperparameters. Nevertheless, 3DETR is conceptually simple and easy to implement, enabling further improvements by incorporating 3D domain knowledge. Through extensive experiments, we show 3DETR outperforms the well-established and highly optimized VoteNet baselines on the challenging ScanNetV2 dataset by 9.5%. 
Furthermore, we show 3DETR is applicable to 3D tasks beyond detection, and can serve as a building block for future research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Misra_An_End-to-End_Transformer_Model_for_3D_Object_Detection_ICCV_2021_paper.pdf", @@ -3001,7 +3206,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Misra_An_End-to-End_Transformer_Model_for_3D_Object_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Misra_An_End-to-End_Transformer_Model_for_3D_Object_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Misra_2021_ICCV,\n \n author = {\n Misra,\n Ishan and Girdhar,\n Rohit and Joulin,\n Armand\n},\n title = {\n An End-to-End Transformer Model for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2906-2917\n} \n}" }, { "title": "Animatable Neural Radiance Fields for Modeling Dynamic Human Bodies", @@ -3009,6 +3215,7 @@ "status": "Poster", "track": "main", "pid": 2681, + "author_site": "Sida Peng; Junting Dong; Qianqian Wang; Shangzhan Zhang; Qing Shuai; Xiaowei Zhou; Hujun Bao", "author": "Sida Peng; Junting Dong; Qianqian Wang; Shangzhan Zhang; Qing Shuai; Xiaowei Zhou; Hujun Bao", "abstract": "This paper addresses the challenge of reconstructing an animatable human model from a multi-view video. Some recent works have proposed to decompose a non-rigidly deforming scene into a canonical neural radiance field and a set of deformation fields that map observation-space points to the canonical space, thereby enabling them to learn the dynamic scene from images. However, they represent the deformation field as translational vector field or SE(3) field, which makes the optimization highly under-constrained. 
Moreover, these representations cannot be explicitly controlled by input motions. Instead, we introduce neural blend weight fields to produce the deformation fields. Based on the skeleton-driven deformation, blend weight fields are used with 3D human skeletons to generate observation-to-canonical and canonical-to-observation correspondences. Since 3D human skeletons are more observable, they can regularize the learning of deformation fields. Moreover, the learned blend weight fields can be combined with input skeletal motions to generate new deformation fields to animate the human model. Experiments show that our approach significantly outperforms recent human synthesis methods. The code and supplementary materials are available at \\href https://zju3dv.github.io/animatable_nerf/ https://zju3dv.github.io/animatable_nerf/ .", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Peng_Animatable_Neural_Radiance_Fields_for_Modeling_Dynamic_Human_Bodies_ICCV_2021_paper.pdf", @@ -3023,7 +3230,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Peng_Animatable_Neural_Radiance_Fields_for_Modeling_Dynamic_Human_Bodies_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Peng_Animatable_Neural_Radiance_Fields_for_Modeling_Dynamic_Human_Bodies_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Peng_2021_ICCV,\n \n author = {\n Peng,\n Sida and Dong,\n Junting and Wang,\n Qianqian and Zhang,\n Shangzhan and Shuai,\n Qing and Zhou,\n Xiaowei and Bao,\n Hujun\n},\n title = {\n Animatable Neural Radiance Fields for Modeling Dynamic Human Bodies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14314-14323\n} \n}" }, { "title": "Anonymizing Egocentric Videos", @@ -3031,6 +3239,7 @@ "status": "Poster", "track": "main", "pid": 10113, + "author_site": "Daksh 
Thapar; Aditya Nigam; Chetan Arora", "author": "Daksh Thapar; Aditya Nigam; Chetan Arora", "abstract": "In egocentric videos, the face of a wearer capturing the video is never captured. This gives a false sense of security that the wearer's privacy is preserved while sharing such videos. However, egocentric cameras are typically harnessed to wearer's head, and hence, also capture wearer's gait. Recent works have shown that wearer gait signatures can be extracted from egocentric videos, which can be used to determine if two egocentric videos have the same wearer. In a more damaging scenario, one can even recognize a wearer using hand gestures from egocentric videos, or identify a wearer in third person videos such as from a surveillance camera. We believe, this could be a death knell in sharing of egocentric videos, and fatal for egocentric vision research. In this work, we suggest a novel technique to anonymize egocentric videos, which create carefully crafted, but small, and imperceptible optical flow perturbations in an egocentric video's frames. Importantly, these perturbations do not affect object detection or action/activity recognition from egocentric videos but are strong enough to dis-balance the gait recovery process. In our experiments on benchmark \\epic dataset, the proposed perturbation degrades the wearer recognition performance of [??], from 66.3% to 13.4%, while preserving the activity recognition performance of [??] from 89.6% to 87.4%. To test our anonymization with more wearer recognition techniques, we also developed a stronger, and more generalizable wearer recognition method based on camera egomotion cues. The approach achieves state-of-the-art (SOTA) performance of 59.67% on \\epicns, compared to 55.06% by [??]. 
However, the accuracy of our recognition technique also drops to 12% using the proposed anonymizing perturbations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Thapar_Anonymizing_Egocentric_Videos_ICCV_2021_paper.pdf", @@ -3054,7 +3263,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Mandi;Delhi", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Thapar_2021_ICCV,\n \n author = {\n Thapar,\n Daksh and Nigam,\n Aditya and Arora,\n Chetan\n},\n title = {\n Anonymizing Egocentric Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2320-2329\n} \n}" }, { "title": "Anticipative Video Transformer", @@ -3062,6 +3272,7 @@ "status": "Poster", "track": "main", "pid": 1123, + "author_site": "Rohit Girdhar; Kristen Grauman", "author": "Rohit Girdhar; Kristen Grauman", "abstract": "We propose Anticipative Video Transformer (AVT), an end-to-end attention-based video modeling architecture that attends to the previously observed video in order to anticipate future actions. We train the model jointly to predict the next action in a video sequence, while also learning frame feature encoders that are predictive of successive future frames' features. Compared to existing temporal aggregation strategies, AVT has the advantage of both maintaining the sequential progression of observed actions while still capturing long-range dependencies--both critical for the anticipation task. 
Through extensive experiments, we show that AVT obtains the best reported performance on four popular action anticipation benchmarks: EpicKitchens-55, EpicKitchens-100, EGTEA Gaze+, and 50-Salads; and it wins first place in the EpicKitchens-100 CVPR'21 challenge.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Girdhar_Anticipative_Video_Transformer_ICCV_2021_paper.pdf", @@ -3078,14 +3289,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Girdhar_Anticipative_Video_Transformer_ICCV_2021_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "Meta;University of Texas at Austin", + "aff_unique_norm": "Facebook;University of Texas at Austin", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.utexas.edu", "aff_unique_abbr": "FAIR;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Girdhar_2021_ICCV,\n \n author = {\n Girdhar,\n Rohit and Grauman,\n Kristen\n},\n title = {\n Anticipative Video Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13505-13515\n} \n}" }, { "title": "Architecture Disentanglement for Deep Neural Networks", @@ -3093,10 +3305,11 @@ "status": "Poster", "track": "main", "pid": 4038, + "author_site": "Jie Hu; Liujuan Cao; Tong Tong; Qixiang Ye; Shengchuan Zhang; Ke Li; Feiyue Huang; Ling Shao; Rongrong Ji", "author": "Jie Hu; Liujuan Cao; Tong Tong; Qixiang Ye; Shengchuan Zhang; Ke Li; Feiyue Huang; Ling Shao; Rongrong Ji", "abstract": "Understanding the inner workings of deep neural networks (DNNs) is essential to provide trustworthy artificial intelligence techniques for practical applications. 
Existing studies typically involve linking semantic concepts to units or layers of DNNs, but fail to explain the inference process. In this paper, we introduce neural architecture disentanglement (NAD) to fill the gap. Specifically, NAD learns to disentangle a pre-trained DNN into sub-architectures according to independent tasks, forming information flows that describe the inference processes. We investigate whether, where, and how the disentanglement occurs through experiments conducted with handcrafted and automatically-searched network architectures, on both object-based and scene-based datasets. Based on the experimental results, we present three new findings that provide fresh insights into the inner logic of DNNs. First, DNNs can be divided into sub-architectures for independent tasks. Second, deeper layers do not always correspond to higher semantics. Third, the connection type in a DNN affects how the information flows across layers, leading to different disentanglement behaviors. With NAD, we further explain why DNNs sometimes give wrong predictions. Experimental results show that misclassified images have a high probability of being assigned to task sub-architectures similar to the correct ones. 
Our code is available at https://github.com/hujiecpp/NAD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_Architecture_Disentanglement_for_Deep_Neural_Networks_ICCV_2021_paper.pdf", - "aff": "MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; University of Chinese Academy of Sciences; MAC Lab, School of Informatics, Xiamen University; Tencent Youtu Lab; Tencent Youtu Lab; Inception Institute of Arti\ufb01cial Intelligence; Institute of Arti\ufb01cial Intelligence, Xiamen University+Peng Cheng Lab", + "aff": "MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; University of Chinese Academy of Sciences; MAC Lab, School of Informatics, Xiamen University; Tencent Youtu Lab; Tencent Youtu Lab; Inception Institute of Artificial Intelligence; Institute of Artificial Intelligence, Xiamen University+Peng Cheng Lab", "project": "", "github": "https://github.com/hujiecpp/NAD", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Hu_Architecture_Disentanglement_for_ICCV_2021_supplemental.pdf", @@ -3109,14 +3322,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_Architecture_Disentanglement_for_Deep_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0;2;2;3;0+4", - "aff_unique_norm": "Xiamen University;University of Chinese Academy of Sciences;Tencent;Inception Institute of Artificial Intelligence;Pengcheng Laboratory", - "aff_unique_dep": "School of Informatics;;Youtu Lab;;Peng Cheng Lab", + "aff_unique_norm": "Xiamen University;University of Chinese Academy of Sciences;Tencent;Inception Institute of Artificial Intelligence;Peng Cheng Lab", + "aff_unique_dep": "School of Informatics;;Youtu Lab;;", "aff_unique_url": 
"https://www.xmu.edu.cn;http://www.ucas.ac.cn;https://www.tencent.com;https://www.inceptionai.org;", "aff_unique_abbr": "XMU;UCAS;Tencent;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Jie and Cao,\n Liujuan and Tong,\n Tong and Ye,\n Qixiang and Zhang,\n Shengchuan and Li,\n Ke and Huang,\n Feiyue and Shao,\n Ling and Ji,\n Rongrong\n},\n title = {\n Architecture Disentanglement for Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 672-681\n} \n}" }, { "title": "Are We Missing Confidence in Pseudo-LiDAR Methods for Monocular 3D Object Detection?", @@ -3124,7 +3338,8 @@ "status": "Poster", "track": "main", "pid": 3721, - "author": "Andrea Simonelli; Samuel Rota Bul\u00f2; Lorenzo Porzi; Peter Kontschieder; Elisa Ricci", + "author_site": "Andrea Simonelli; Samuel Rota Bulò; Lorenzo Porzi; Peter Kontschieder; Elisa Ricci", + "author": "Andrea Simonelli; Samuel Rota Bulò; Lorenzo Porzi; Peter Kontschieder; Elisa Ricci", "abstract": "Pseudo-LiDAR-based methods for monocular 3D object detection have received considerable attention in the community due to the performance gains exhibited on the KITTI3D benchmark, in particular on the commonly reported validation split. This generated a distorted impression about the superiority of Pseudo-LiDAR-based (PL-based) approaches over methods working with RGB images only. Our first contribution consists in rectifying this view by pointing out and showing experimentally that the validation results published by PL-based methods are substantially biased. 
The source of the bias resides in an overlap between the KITTI3D object detection validation set and the training/validation sets used to train depth predictors feeding PL-based methods. Surprisingly, the bias remains also after geographically removing the overlap. This leaves the test set as the only reliable set for comparison, where published PL-based methods do not excel. Our second contribution brings PL-based methods back up in the ranking with the design of a novel deep architecture which introduces a 3D confidence prediction module. We show that 3D confidence estimation techniques derived from RGB-only 3D detection approaches can be successfully integrated into our framework and, more importantly, that improved performance can be obtained with a newly designed 3D confidence measure, leading to state-of-the-art performance on the KITTI3D benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Simonelli_Are_We_Missing_Confidence_in_Pseudo-LiDAR_Methods_for_Monocular_3D_ICCV_2021_paper.pdf", "aff": "University of Trento, Fondazione Bruno Kessler; Facebook Reality Labs; Facebook Reality Labs; Facebook Reality Labs; University of Trento, Fondazione Bruno Kessler", @@ -3140,14 +3355,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Simonelli_Are_We_Missing_Confidence_in_Pseudo-LiDAR_Methods_for_Monocular_3D_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0", - "aff_unique_norm": "University of Trento;Meta", + "aff_unique_norm": "University of Trento;Facebook Reality Labs", "aff_unique_dep": ";Facebook Reality Labs", "aff_unique_url": "https://www.unitn.it;https://www.facebook.com/realitylabs", "aff_unique_abbr": "UniTN;FRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "Italy;United States" + "aff_country_unique": "Italy;United States", + "bibtex": "@InProceedings{Simonelli_2021_ICCV,\n \n author = {\n Simonelli,\n Andrea and 
Bul\\`o,\n Samuel Rota and Porzi,\n Lorenzo and Kontschieder,\n Peter and Ricci,\n Elisa\n},\n title = {\n Are We Missing Confidence in Pseudo-LiDAR Methods for Monocular 3D Object Detection?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3225-3233\n} \n}" }, { "title": "Artificial Fingerprinting for Generative Models: Rooting Deepfake Attribution in Training Data", @@ -3155,6 +3371,7 @@ "status": "Poster", "track": "main", "pid": 8062, + "author_site": "Ning Yu; Vladislav Skripniuk; Sahar Abdelnabi; Mario Fritz", "author": "Ning Yu; Vladislav Skripniuk; Sahar Abdelnabi; Mario Fritz", "abstract": "Photorealistic image generation has reached a new level of quality due to the breakthroughs of generative adversarial networks (GANs). Yet, the dark side of such deepfakes, the malicious use of generated media, raises concerns about visual misinformation. While existing research work on deepfake detection demonstrates high accuracy, it is subject to advances in generation techniques and adversarial iterations on detection countermeasure techniques. Thus, we seek a proactive and sustainable solution on deepfake detection, that is agnostic to the evolution of generative models, by introducing artificial fingerprints into the models. Our approach is simple and effective. We first embed artificial fingerprints into training data, then validate a surprising discovery on the transferability of such fingerprints from training data to generative models, which in turn appears in the generated deepfakes. 
Experiments show that our fingerprinting solution (1) holds for a variety of cutting-edge generative models, (2) leads to a negligible side effect on generation quality, (3) stays robust against image-level and model-level perturbations, (4) stays hard to be detected by adversaries, and (5) converts deepfake detection and attribution into trivial tasks and outperforms the recent state-of-the-art baselines. Our solution closes the responsibility loop between publishing pre-trained generative model inventions and their possible misuses, which makes it independent of the current arms race.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Artificial_Fingerprinting_for_Generative_Models_Rooting_Deepfake_Attribution_in_Training_ICCV_2021_paper.pdf", @@ -3178,7 +3395,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Ning and Skripniuk,\n Vladislav and Abdelnabi,\n Sahar and Fritz,\n Mario\n},\n title = {\n Artificial Fingerprinting for Generative Models: Rooting Deepfake Attribution in Training Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14448-14457\n} \n}" }, { "title": "Ask&Confirm: Active Detail Enriching for Cross-Modal Retrieval With Partial Query", @@ -3186,6 +3404,7 @@ "status": "Poster", "track": "main", "pid": 3273, + "author_site": "Guanyu Cai; Jun Zhang; Xinyang Jiang; Yifei Gong; Lianghua He; Fufu Yu; Pai Peng; Xiaowei Guo; Feiyue Huang; Xing Sun", "author": "Guanyu Cai; Jun Zhang; Xinyang Jiang; Yifei Gong; Lianghua He; Fufu Yu; Pai Peng; Xiaowei Guo; Feiyue Huang; Xing Sun", "abstract": "Text-based image retrieval has seen considerable progress in recent years. 
However, the performance of existing methods suffers in real life since the user is likely to provide an incomplete description of an image, which often leads to results filled with false positives that fit the incomplete description. In this work, we introduce the partial-query problem and extensively analyze its influence on text-based image retrieval. Previous interactive methods tackle the problem by passively receiving users' feedback to supplement the incomplete query iteratively, which is time-consuming and requires heavy user effort. Instead, we propose a novel retrieval framework that conducts the interactive process in an Ask-and-Confirm fashion, where AI actively searches for discriminative details missing in the current query, and users only need to confirm AI's proposal. Specifically, we propose an object-based interaction to make the interactive retrieval more user-friendly and present a reinforcement-learning-based policy to search for discriminative objects. Furthermore, since fully-supervised training is often infeasible due to the difficulty of obtaining human-machine dialog data, we present a weakly-supervised training strategy that needs no human-annotated dialogs other than a text-image dataset. Experiments show that our framework significantly improves the performance of text-based image retrieval. 
Code is available at https://github.com/CuthbertCai/Ask-Confirm.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cai_AskConfirm_Active_Detail_Enriching_for_Cross-Modal_Retrieval_With_Partial_Query_ICCV_2021_paper.pdf", @@ -3202,14 +3421,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cai_AskConfirm_Active_Detail_Enriching_for_Cross-Modal_Retrieval_With_Partial_Query_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;2+1;1;0+1;1;1;1;1;2+1", - "aff_unique_norm": "Tongji University;Tencent;Microsoft", - "aff_unique_dep": ";Youtu Lab;Microsoft Research", + "aff_unique_norm": "Tongji University;Tencent;Microsoft Research", + "aff_unique_dep": ";Youtu Lab;", "aff_unique_url": "https://www.tongji.edu.cn;https://www.tencent.com;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Tongji;Tencent Youtu Lab;MSR", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;1+0;0;0+0;0;0;0;0;1+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Cai_2021_ICCV,\n \n author = {\n Cai,\n Guanyu and Zhang,\n Jun and Jiang,\n Xinyang and Gong,\n Yifei and He,\n Lianghua and Yu,\n Fufu and Peng,\n Pai and Guo,\n Xiaowei and Huang,\n Feiyue and Sun,\n Xing\n},\n title = {\n Ask\\&Confirm: Active Detail Enriching for Cross-Modal Retrieval With Partial Query\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1835-1844\n} \n}" }, { "title": "Assignment-Space-Based Multi-Object Tracking and Segmentation", @@ -3217,6 +3437,7 @@ "status": "Poster", "track": "main", "pid": 8053, + "author_site": "Anwesa Choudhuri; Girish Chowdhary; Alexander G. Schwing", "author": "Anwesa Choudhuri; Girish Chowdhary; Alexander G. 
Schwing", "abstract": "Multi-object tracking and segmentation (MOTS) is important for understanding dynamic scenes in video data. Existing methods perform well on multi-object detection and segmentation for independent video frames, but tracking of objects over time remains a challenge. MOTS methods formulate tracking locally, i.e., frame-by-frame, leading to sub-optimal results. Classical global methods on tracking operate directly on object detections, which leads to a combinatorial growth in the detection space. In contrast, we formulate a global method for MOTS over the space of assignments rather than detections: First, we find all top-k assignments of objects detected and segmented between any two consecutive frames and develop a structured prediction formulation to score assignment sequences across any number of consecutive frames. We use dynamic programming to find the global optimizer of this formulation in polynomial time. Second, we connect objects which reappear after having been out of view for some time. For this we formulate an assignment problem. 
On the challenging KITTI-MOTS and MOTSChallenge datasets, this achieves state-of-the-art results among methods which don't use depth data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Choudhuri_Assignment-Space-Based_Multi-Object_Tracking_and_Segmentation_ICCV_2021_paper.pdf", @@ -3233,14 +3454,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choudhuri_Assignment-Space-Based_Multi-Object_Tracking_and_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Choudhuri_2021_ICCV,\n \n author = {\n Choudhuri,\n Anwesa and Chowdhary,\n Girish and Schwing,\n Alexander G.\n},\n title = {\n Assignment-Space-Based Multi-Object Tracking and Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13598-13607\n} \n}" }, { "title": "Asymmetric Bilateral Motion Estimation for Video Frame Interpolation", @@ -3248,6 +3470,7 @@ "status": "Poster", "track": "main", "pid": 6085, + "author_site": "Junheum Park; Chul Lee; Chang-Su Kim", "author": "Junheum Park; Chul Lee; Chang-Su Kim", "abstract": "We propose a novel video frame interpolation algorithm based on asymmetric bilateral motion estimation (ABME), which synthesizes an intermediate frame between two input frames. First, we predict symmetric bilateral motion fields to interpolate an anchor frame. Second, we estimate asymmetric bilateral motions fields from the anchor frame to the input frames. 
Third, we use the asymmetric fields to warp the input frames backward and reconstruct the intermediate frame. Last, to refine the intermediate frame, we develop a new synthesis network that generates a set of dynamic filters and a residual frame using local and global information. Experimental results show that the proposed algorithm achieves excellent performance on various datasets. The source codes and pretrained models are available at https://github.com/JunHeum/ABME.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Asymmetric_Bilateral_Motion_Estimation_for_Video_Frame_Interpolation_ICCV_2021_paper.pdf", @@ -3271,7 +3494,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Junheum and Lee,\n Chul and Kim,\n Chang-Su\n},\n title = {\n Asymmetric Bilateral Motion Estimation for Video Frame Interpolation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14539-14548\n} \n}" }, { "title": "Asymmetric Loss for Multi-Label Classification", @@ -3279,6 +3503,7 @@ "status": "Poster", "track": "main", "pid": 7868, + "author_site": "Tal Ridnik; Emanuel Ben-Baruch; Nadav Zamir; Asaf Noy; Itamar Friedman; Matan Protter; Lihi Zelnik-Manor", "author": "Tal Ridnik; Emanuel Ben-Baruch; Nadav Zamir; Asaf Noy; Itamar Friedman; Matan Protter; Lihi Zelnik-Manor", "abstract": "In a typical multi-label setting, a picture contains on average few positive labels, and many negative ones. This positive-negative imbalance dominates the optimization process, and can lead to under-emphasizing gradients from positive labels during training, resulting in poor accuracy. 
In this paper, we introduce a novel asymmetric loss (\"\"ASL\"\"), which operates differently on positive and negative samples. The loss enables to dynamically down-weights and hard-thresholds easy negative samples, while also discarding possibly mislabeled samples. We demonstrate how ASL can balance the probabilities of different samples, and how this balancing is translated to better mAP scores. With ASL, we reach state-of-the-art results on multiple popular multi-label datasets: MS-COCO, Pascal-VOC, NUS-WIDE and Open Images. We also demonstrate ASL applicability for other tasks, such as single-label classification and object detection. ASL is effective, easy to implement, and does not increase the training time or complexity. Implementation is available at: https://github.com/Alibaba-MIIL/ASL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ridnik_Asymmetric_Loss_for_Multi-Label_Classification_ICCV_2021_paper.pdf", @@ -3302,7 +3527,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ridnik_2021_ICCV,\n \n author = {\n Ridnik,\n Tal and Ben-Baruch,\n Emanuel and Zamir,\n Nadav and Noy,\n Asaf and Friedman,\n Itamar and Protter,\n Matan and Zelnik-Manor,\n Lihi\n},\n title = {\n Asymmetric Loss for Multi-Label Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 82-91\n} \n}" }, { "title": "Attack As the Best Defense: Nullifying Image-to-Image Translation GANs via Limit-Aware Adversarial Attack", @@ -3310,6 +3536,7 @@ "status": "Poster", "track": "main", "pid": 5602, + "author_site": "Chin-Yuan Yeh; Hsi-Wen Chen; Hong-Han Shuai; De-Nian Yang; Ming-Syan Chen", "author": "Chin-Yuan Yeh; Hsi-Wen Chen; Hong-Han Shuai; De-Nian Yang; Ming-Syan Chen", "abstract": "Due to the great 
success of image-to-image (Img2Img) translation GANs, many applications with ethics issues arise, e.g., DeepFake and DeepNude, presenting a challenging problem to prevent the misuse of these techniques. In this work, we tackle the problem by a new adversarial attack scheme, namely the Nullifying Attack, which cancels the image translation process and proposes a corresponding framework, the Limit-Aware Self-Guiding Gradient Sliding Attack (LaS-GSA) under a black-box setting. In other words, by processing the image with the proposed LaS-GSA before publishing, any image translation functions can be nullified, which prevents the images from malicious manipulations. First, we introduce the limit-aware RGF and gradient sliding mechanism to estimate the gradient that adheres to the adversarial limit, i.e., the pixel value limitations of the adversarial example. We theoretically prove that our model is able to avoid the error caused by the projection operation in both the direction and the length. Then, an effective self-guiding prior is extracted solely from the threat model and the target image to efficiently leverage the prior information and guide the gradient estimation process. 
Extensive experiments demonstrate that LaS-GSA requires fewer queries to nullify the image translation process with higher success rates than 4 state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yeh_Attack_As_the_Best_Defense_Nullifying_Image-to-Image_Translation_GANs_via_ICCV_2021_paper.pdf", @@ -3326,14 +3553,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yeh_Attack_As_the_Best_Defense_Nullifying_Image-to-Image_Translation_GANs_via_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "National Taiwan University;National Yangming Jiao Tong University;Academia Sinica", + "aff_unique_norm": "National Taiwan University;National Yangming Jiaotong University;Academia Sinica", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.tw;https://www.nycu.edu.tw;https://www.sinica.edu.tw", "aff_unique_abbr": "NTU;NYCU;Academia Sinica", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yeh_2021_ICCV,\n \n author = {\n Yeh,\n Chin-Yuan and Chen,\n Hsi-Wen and Shuai,\n Hong-Han and Yang,\n De-Nian and Chen,\n Ming-Syan\n},\n title = {\n Attack As the Best Defense: Nullifying Image-to-Image Translation GANs via Limit-Aware Adversarial Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16188-16197\n} \n}" }, { "title": "Attack-Guided Perceptual Data Generation for Real-World Re-Identification", @@ -3341,6 +3569,7 @@ "status": "Poster", "track": "main", "pid": 3180, + "author_site": "Yukun Huang; Xueyang Fu; Zheng-Jun Zha", "author": "Yukun Huang; Xueyang Fu; Zheng-Jun Zha", "abstract": "In unconstrained real-world surveillance scenarios, person re-identification (Re-ID) models usually suffer from 
different low-level perceptual variations, e.g., cross-resolution and insufficient lighting. Due to the limited variation range of training data, existing models are difficult to generalize to scenes with unknown perceptual interference types. To address the above problem, in this paper, we propose two disjoint data-generation ways to complement existing training samples to improve the robustness of Re-ID models. Firstly, considering the sparsity and imbalance of samples in the perceptual space, a dense resampling method from the estimated perceptual distribution is performed. Secondly, to dig more representative generated samples for identity representation learning, we introduce a graph-based white-box attacker to guide the data generation process with intra-batch ranking and discriminate attention. In addition, two synthetic-to-real feature constraints are introduced into the Re-ID training to prevent the generated data from bringing domain bias. Our method is effective, easy-to-implement, and independent of the specific network architecture. 
Applying our approach to a ResNet-50 baseline can already achieve competitive results, surpassing state-of-the-art methods by +1.2% at Rank-1 on the MLR-CUHK03 dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Attack-Guided_Perceptual_Data_Generation_for_Real-World_Re-Identification_ICCV_2021_paper.pdf", @@ -3364,7 +3593,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Yukun and Fu,\n Xueyang and Zha,\n Zheng-Jun\n},\n title = {\n Attack-Guided Perceptual Data Generation for Real-World Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 215-224\n} \n}" }, { "title": "Attention Is Not Enough: Mitigating the Distribution Discrepancy in Asynchronous Multimodal Sequence Fusion", @@ -3372,6 +3602,7 @@ "status": "Poster", "track": "main", "pid": 4269, + "author_site": "Tao Liang; Guosheng Lin; Lei Feng; Yan Zhang; Fengmao Lv", "author": "Tao Liang; Guosheng Lin; Lei Feng; Yan Zhang; Fengmao Lv", "abstract": "Videos flow as the mixture of language, acoustic, and vision modalities. A thorough video understanding needs to fuse time-series data of different modalities for prediction. Due to the variable receiving frequency for sequences from each modality, there usually exists inherent asynchrony across the collected multimodal streams. Towards an efficient multimodal fusion from asynchronous multimodal streams, we need to model the correlations between elements from different modalities. The recent Multimodal Transformer (MulT) approach extends the self-attention mechanism of the original Transformer network to learn the crossmodal dependencies between elements. 
However, the direct replication of self-attention will suffer from the distribution mismatch across different modality features. As a result, the learnt crossmodal dependencies can be unreliable. Motivated by this observation, this work proposes the Modality-Invariant Crossmodal Attention (MICA) approach towards learning crossmodal interactions over modality-invariant space in which the distribution mismatch between different modalities is well bridged. To this end, both the marginal distribution and the elements with high-confidence correlations are aligned over the common space of the query and key vectors which are computed from different modalities. Experiments on three standard benchmarks of multimodal video understanding clearly validate the superiority of our approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Attention_Is_Not_Enough_Mitigating_the_Distribution_Discrepancy_in_Asynchronous_ICCV_2021_paper.pdf", @@ -3388,14 +3619,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liang_Attention_Is_Not_Enough_Mitigating_the_Distribution_Discrepancy_in_Asynchronous_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;2;3;0+4", - "aff_unique_norm": "Southwest Jiao Tong University;ByteDance;Nanyang Technological University;University of Electronic Science and Technology of China;Southwestern University of Finance and Economics", + "aff_unique_norm": "Southwest Jiaotong University;Bytedance;Nanyang Technological University;University of Electronic Science and Technology of China;Southwestern University of Finance and Economics", "aff_unique_dep": ";Engineering Productivity & Quality Assurance;;;Center of Statistical Research", "aff_unique_url": "https://www.swjtu.edu.cn;https://www.bytedance.com;https://www.ntu.edu.sg;https://www.uestc.edu.cn;http://www.swufe.edu.cn", "aff_unique_abbr": "SWJTU;Bytedance;NTU;UESTC;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": 
"0+0;1;1;0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Tao and Lin,\n Guosheng and Feng,\n Lei and Zhang,\n Yan and Lv,\n Fengmao\n},\n title = {\n Attention Is Not Enough: Mitigating the Distribution Discrepancy in Asynchronous Multimodal Sequence Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8148-8156\n} \n}" }, { "title": "Attention-Based Multi-Reference Learning for Image Super-Resolution", @@ -3403,6 +3635,7 @@ "status": "Poster", "track": "main", "pid": 10357, + "author_site": "Marco Pesavento; Marco Volino; Adrian Hilton", "author": "Marco Pesavento; Marco Volino; Adrian Hilton", "abstract": "This paper proposes a novel Attention-based Multi-Reference Super-resolution network (AMRSR) that, given a low-resolution image, learns to adaptively transfer the most similar texture from multiple reference images to the super-resolution output whilst maintaining spatial coherence. The use of multiple reference images together with attention-based sampling is demonstrated to achieve significantly improved performance over state-of-the-art reference super-resolution approaches on multiple benchmark datasets. Reference super-resolution approaches have recently been proposed to overcome the ill-posed problem of image super-resolution by providing additional information from a high-resolution reference image. Multi-reference super-resolution extends this approach by providing a more diverse pool of image features to overcome the inherent information deficit whilst maintaining memory efficiency. A novel hierarchical attention-based sampling approach is introduced to learn the similarity between low-resolution image features and multiple reference images based on a perceptual loss. 
Ablation demonstrates the contribution of both multi-reference and hierarchical attention-based sampling to overall performance. Perceptual and quantitative ground-truth evaluation demonstrates significant improvement in performance even when the reference images deviate significantly from the target image. The project website can be found at https://marcopesavento.github.io/AMRSR/", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pesavento_Attention-Based_Multi-Reference_Learning_for_Image_Super-Resolution_ICCV_2021_paper.pdf", @@ -3426,7 +3659,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Pesavento_2021_ICCV,\n \n author = {\n Pesavento,\n Marco and Volino,\n Marco and Hilton,\n Adrian\n},\n title = {\n Attention-Based Multi-Reference Learning for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14697-14706\n} \n}" }, { "title": "Attentional Pyramid Pooling of Salient Visual Residuals for Place Recognition", @@ -3434,6 +3668,7 @@ "status": "Poster", "track": "main", "pid": 10433, + "author_site": "Guohao Peng; Jun Zhang; Heshan Li; Danwei Wang", "author": "Guohao Peng; Jun Zhang; Heshan Li; Danwei Wang", "abstract": "The core of visual place recognition (VPR) lies in how to identify task-relevant visual cues and embed them into discriminative representations. Focusing on these two points, we propose a novel encoding strategy named Attentional Pyramid Pooling of Salient Visual Residuals (APPSVR). It incorporates three types of attention modules to model the saliency of local features in individual, spatial and cluster dimensions respectively. 
(1) To inhibit task-irrelevant local features, a semantic-reinforced local weighting scheme is employed for local feature refinement; (2) To leverage the spatial context, an attentional pyramid structure is constructed to adaptively encode regional features according to their relative spatial saliency; (3) To distinguish the different importance of visual clusters to the task, a parametric normalization is proposed to adjust their contribution to image descriptor generation. Experiments demonstrate APPSVR outperforms the existing techniques and achieves a new state-of-the-art performance on VPR benchmark datasets. The visualization shows the saliency map learned in a weakly supervised manner is largely consistent with human cognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Peng_Attentional_Pyramid_Pooling_of_Salient_Visual_Residuals_for_Place_Recognition_ICCV_2021_paper.pdf", @@ -3457,7 +3692,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Peng_2021_ICCV,\n \n author = {\n Peng,\n Guohao and Zhang,\n Jun and Li,\n Heshan and Wang,\n Danwei\n},\n title = {\n Attentional Pyramid Pooling of Salient Visual Residuals for Place Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 885-894\n} \n}" }, { "title": "Attentive and Contrastive Learning for Joint Depth and Motion Field Estimation", @@ -3465,6 +3701,7 @@ "status": "Poster", "track": "main", "pid": 3833, + "author_site": "Seokju Lee; Francois Rameau; Fei Pan; In So Kweon", "author": "Seokju Lee; Francois Rameau; Fei Pan; In So Kweon", "abstract": "Estimating the motion of the camera together with the 3D structure of the scene from a monocular vision system is a complex task that often relies on the so-called scene 
rigidity assumption. When observing a dynamic environment, this assumption is violated which leads to an ambiguity between the ego-motion of the camera and the motion of the objects. To solve this problem, we present a self-supervised learning framework for 3D object motion field estimation from monocular videos. Our contributions are two-fold. First, we propose a two-stage projection pipeline to explicitly disentangle the camera ego-motion and the object motions with dynamics attention module, called DAM. Specifically, we design an integrated motion model that estimates the motion of the camera and object in the first and second warping stages, respectively, controlled by the attention module through a shared motion encoder. Second, we propose an object motion field estimation through contrastive sample consensus, called CSAC, taking advantage of weak semantic prior (bounding box from an object detector) and geometric constraints (each object respects the rigid body motion model). Experiments on KITTI, Cityscapes, and Waymo Open Dataset demonstrate the relevance of our approach and show that our method outperforms state-of-the-art algorithms for the tasks of self-supervised monocular depth estimation, object motion segmentation, monocular scene flow estimation, and visual odometry.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Attentive_and_Contrastive_Learning_for_Joint_Depth_and_Motion_Field_ICCV_2021_paper.pdf", @@ -3488,7 +3725,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Seokju and Rameau,\n Francois and Pan,\n Fei and Kweon,\n In So\n},\n title = {\n Attentive and Contrastive Learning for Joint Depth and Motion Field Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2021\n},\n pages = {\n 4862-4871\n} \n}" }, { "title": "Audio-Visual Floorplan Reconstruction", @@ -3496,7 +3734,8 @@ "status": "Poster", "track": "main", "pid": 5897, - "author": "Senthil Purushwalkam; Sebasti\u00e0 Vicenc Amengual Gar\u00ed; Vamsi Krishna Ithapu; Carl Schissler; Philip Robinson; Abhinav Gupta; Kristen Grauman", + "author_site": "Senthil Purushwalkam; Sebastià Vicenc Amengual Garí; Vamsi Krishna Ithapu; Carl Schissler; Philip Robinson; Abhinav Gupta; Kristen Grauman", + "author": "Senthil Purushwalkam; Sebastià Vicenc Amengual Garí; Vamsi Krishna Ithapu; Carl Schissler; Philip Robinson; Abhinav Gupta; Kristen Grauman", "abstract": "Given only a few glimpses of an environment, how much can we infer about its entire floorplan? Existing methods can map only what is visible or immediately apparent from context, and thus require substantial movements through a space to fully map it. We explore how both audio and visual sensing together can provide rapid floorplan reconstruction from limited viewpoints. Audio not only helps sense geometry outside the camera's field of view, but it also reveals the existence of distant freespace (e.g., a dog barking in another room) and suggests the presence of rooms not visible to the camera (e.g., a dishwasher humming in what must be the kitchen to the left). We introduce AV-Map, a novel multi-modal encoder-decoder framework that reasons jointly about audio and vision to reconstruct a floorplan from a short input video sequence. We train our model to predict both the interior structure of the environment and the associated rooms' semantic labels. 
Our results on 85 large real-world environments show the impact: with just a few glimpses spanning 26% of an area, we can estimate the whole area with 66% accuracy---substantially better than the state of the art approach for extrapolating visual maps.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Purushwalkam_Audio-Visual_Floorplan_Reconstruction_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -3510,7 +3749,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Purushwalkam_Audio-Visual_Floorplan_Reconstruction_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Purushwalkam_Audio-Visual_Floorplan_Reconstruction_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Purushwalkam_2021_ICCV,\n \n author = {\n Purushwalkam,\n Senthil and Gar{\\'\\i\n},\n Sebasti\\`a Vicenc Amengual and Ithapu,\n Vamsi Krishna and Schissler,\n Carl and Robinson,\n Philip and Gupta,\n Abhinav and Grauman,\n Kristen\n},\n title = {\n Audio-Visual Floorplan Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1183-1192\n} \n}" }, { "title": "Audio2Gestures: Generating Diverse Gestures From Speech Audio With Conditional Variational Autoencoders", @@ -3518,6 +3758,7 @@ "status": "Poster", "track": "main", "pid": 3681, + "author_site": "Jing Li; Di Kang; Wenjie Pei; Xuefei Zhe; Ying Zhang; Zhenyu He; Linchao Bao", "author": "Jing Li; Di Kang; Wenjie Pei; Xuefei Zhe; Ying Zhang; Zhenyu He; Linchao Bao", "abstract": "Generating conversational gestures from speech audio is challenging due to the inherent one-to-many mapping between audio and body motions. Conventional CNNs/RNNs assume one-to-one mapping, and thus tend to predict the average of all possible target motions, resulting in plain/boring motions during inference. 
In order to overcome this problem, we propose a novel conditional variational autoencoder (VAE) that explicitly models one-to-many audio-to-motion mapping by splitting the cross-modal latent code into shared code and motion-specific code. The shared code mainly models the strong correlation between audio and motion (such as the synchronized audio and motion beats), while the motion-specific code captures diverse motion information independent of the audio. However, splitting the latent code into two parts poses training difficulties for the VAE model. A mapping network facilitating random sampling along with other techniques including relaxed motion loss, bicycle constraint, and diversity loss are designed to better train the VAE. Experiments on both 3D and 2D motion datasets verify that our method generates more realistic and diverse motions than state-of-the-art methods, quantitatively and qualitatively. Finally, we demonstrate that our method can be readily used to generate motion sequences with user-specified motion clips on the timeline. 
Code and more results are at https://jingli513.github.io/audio2gestures.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Audio2Gestures_Generating_Diverse_Gestures_From_Speech_Audio_With_Conditional_Variational_ICCV_2021_paper.pdf", @@ -3541,7 +3782,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Jing and Kang,\n Di and Pei,\n Wenjie and Zhe,\n Xuefei and Zhang,\n Ying and He,\n Zhenyu and Bao,\n Linchao\n},\n title = {\n Audio2Gestures: Generating Diverse Gestures From Speech Audio With Conditional Variational Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11293-11302\n} \n}" }, { "title": "Augmented Lagrangian Adversarial Attacks", @@ -3549,10 +3791,11 @@ "status": "Poster", "track": "main", "pid": 6379, - "author": "J\u00e9r\u00f4me Rony; Eric Granger; Marco Pedersoli; Ismail Ben Ayed", + "author_site": "Jérôme Rony; Eric Granger; Marco Pedersoli; Ismail Ben Ayed", + "author": "Jérôme Rony; Eric Granger; Marco Pedersoli; Ismail Ben Ayed", "abstract": "Adversarial attack algorithms are dominated by penalty methods, which are slow in practice, or more efficient distance-customized methods, which are heavily tailored to the properties of the considered distance. We propose a white-box attack algorithm to generate minimally perturbed adversarial examples based on Augmented Lagrangian principles. We bring several algorithmic modifications, which have a crucial effect on performance. Our attack enjoys the generality of penalty methods and the computational efficiency of distance-customized algorithms, and can be readily used for a wide set of distances. 
We compare our attack to state-of-the-art methods on three datasets and several models, and consistently obtain competitive performances with similar or lower computational complexity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rony_Augmented_Lagrangian_Adversarial_Attacks_ICCV_2021_paper.pdf", - "aff": "\u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada", + "aff": "´ETS Montreal, Canada; ´ETS Montreal, Canada; ´ETS Montreal, Canada; ´ETS Montreal, Canada", "project": "", "github": "https://github.com/jeromerony/augmented_lagrangian_adversarial_attacks", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Rony_Augmented_Lagrangian_Adversarial_ICCV_2021_supplemental.pdf", @@ -3565,14 +3808,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rony_Augmented_Lagrangian_Adversarial_Attacks_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "\u00c9cole de technologie sup\u00e9rieure", + "aff_unique_norm": "École de technologie supérieure", "aff_unique_dep": "", "aff_unique_url": "https://www.etsmtl.ca", "aff_unique_abbr": "ETS", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Rony_2021_ICCV,\n \n author = {\n Rony,\n J\\'er\\^ome and Granger,\n Eric and Pedersoli,\n Marco and Ben Ayed,\n Ismail\n},\n title = {\n Augmented Lagrangian Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7738-7747\n} \n}" }, { "title": "Augmenting Depth Estimation With Geospatial Context", @@ -3580,6 +3824,7 @@ "status": "Poster", "track": "main", "pid": 5871, + "author_site": "Scott Workman; Hunter Blanton", "author": "Scott Workman; Hunter Blanton", 
"abstract": "Modern cameras are equipped with a wide array of sensors that enable recording the geospatial context of an image. Taking advantage of this, we explore depth estimation under the assumption that the camera is geocalibrated, a problem we refer to as geo-enabled depth estimation. Our key insight is that if capture location is known, the corresponding overhead viewpoint offers a valuable resource for understanding the scale of the scene. We propose an end-to-end architecture for depth estimation that uses geospatial context to infer a synthetic ground-level depth map from a co-located overhead image, then fuses it inside of an encoder/decoder style segmentation network. To support evaluation of our methods, we extend a recently released dataset with overhead imagery and corresponding height maps. Results demonstrate that integrating geospatial context significantly reduces error compared to baselines, both at close ranges and when evaluating at much larger distances than existing benchmarks consider.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Workman_Augmenting_Depth_Estimation_With_Geospatial_Context_ICCV_2021_paper.pdf", @@ -3603,7 +3848,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Workman_2021_ICCV,\n \n author = {\n Workman,\n Scott and Blanton,\n Hunter\n},\n title = {\n Augmenting Depth Estimation With Geospatial Context\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4562-4571\n} \n}" }, { "title": "Auto Graph Encoder-Decoder for Neural Network Pruning", @@ -3611,6 +3857,7 @@ "status": "Poster", "track": "main", "pid": 6859, + "author_site": "Sixing Yu; Arya Mazaheri; Ali Jannesari", "author": "Sixing Yu; Arya Mazaheri; Ali Jannesari", "abstract": "Model 
compression aims to deploy deep neural networks (DNN) on mobile devices with limited computing and storage resources. However, most of the existing model compression methods rely on manually defined rules, which require domain expertise. DNNs are essentially computational graphs, which contain rich structural information. In this paper, we aim to find a suitable compression policy from DNNs' structural information. We propose an automatic graph encoder-decoder model compression (AGMC) method combined with graph neural networks (GNN) and reinforcement learning (RL). We model the target DNN as a graph and use GNN to learn the DNN's embeddings automatically. We compared our method with rule-based DNN embedding model compression methods to show the effectiveness of our method. Results show that our learning-based DNN embedding achieves better performance and a higher compression ratio with fewer search steps. We evaluated our method on over-parameterized and mobile-friendly DNNs and compared our method with handcrafted and learning-based model compression approaches. On over parameterized DNNs, such as ResNet-56, our method outperformed handcrafted and learning-based methods with 4.36% and 2.56% higher accuracy, respectively. 
Furthermore, on MobileNet-v2, we achieved a higher compression ratio than state-of-the-art methods with just 0.93% accuracy loss.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Auto_Graph_Encoder-Decoder_for_Neural_Network_Pruning_ICCV_2021_paper.pdf", @@ -3634,7 +3881,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Sixing and Mazaheri,\n Arya and Jannesari,\n Ali\n},\n title = {\n Auto Graph Encoder-Decoder for Neural Network Pruning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6362-6372\n} \n}" }, { "title": "Auto-Parsing Network for Image Captioning and Visual Question Answering", @@ -3642,6 +3890,7 @@ "status": "Poster", "track": "main", "pid": 8190, + "author_site": "Xu Yang; Chongyang Gao; Hanwang Zhang; Jianfei Cai", "author": "Xu Yang; Chongyang Gao; Hanwang Zhang; Jianfei Cai", "abstract": "We propose an Auto-Parsing Network (APN) to discover and exploit the input data's hidden tree structures for improving the effectiveness of the Transformer-based vision-language systems. Specifically, we impose a Probabilistic Graphical Model (PGM) parameterized by the attention operations on each self-attention layer to incorporate sparse assumption. We use this PGM to softly segment an input sequence into a few clusters where each cluster can be treated as the parent of the inside entities. By stacking these PGM constrained self-attention layers, the clusters in a lower layer compose into a new sequence, and the PGM in a higher layer will further segment this sequence. 
Iteratively, a sparse tree can be implicitly parsed, and this tree's hierarchical knowledge is incorporated into the transformed embeddings, which can be used for solving the target vision-language tasks. Specifically, we showcase that our APN can strengthen Transformer based networks in two major vision-language tasks: Captioning and Visual Question Answering. Also, a PGM probability-based parsing algorithm is developed by which we can discover what the hidden structure of input is during the inference.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Auto-Parsing_Network_for_Image_Captioning_and_Visual_Question_Answering_ICCV_2021_paper.pdf", @@ -3665,7 +3914,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3", - "aff_country_unique": "China;United States;Singapore;Australia" + "aff_country_unique": "China;United States;Singapore;Australia", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Xu and Gao,\n Chongyang and Zhang,\n Hanwang and Cai,\n Jianfei\n},\n title = {\n Auto-Parsing Network for Image Captioning and Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2197-2207\n} \n}" }, { "title": "AutoFormer: Searching Transformers for Visual Recognition", @@ -3673,6 +3923,7 @@ "status": "Poster", "track": "main", "pid": 3160, + "author_site": "Minghao Chen; Houwen Peng; Jianlong Fu; Haibin Ling", "author": "Minghao Chen; Houwen Peng; Jianlong Fu; Haibin Ling", "abstract": "Recently, pure transformer-based models have shown great potentials for vision tasks such as image classification and detection. However, the design of transformer networks is challenging. It has been observed that the depth, embedding dimension, and number of heads can largely affect the performance of vision transformers. 
Previous models configure these dimensions based upon manual crafting. In this work, we propose a new one-shot architecture search framework, namely AutoFormer, dedicated to vision transformer search. AutoFormer entangles the weights of different blocks in the same layers during supernet training. Benefiting from the strategy, the trained supernet allows thousands of subnets to be very well-trained. Specifically, the performance of these subnets with weights inherited from the supernet is comparable to those retrained from scratch. Besides, the searched models, which we refer to AutoFormers, surpass the recent state-of-the-arts such as ViT and DeiT. In particular, AutoFormer-tiny/small/base achieve 74.7%/81.7%/82.4% top-1 accuracy on ImageNet with 5.7M/22.9M/53.7M parameters, respectively. Lastly, we verify the transferability of AutoFormer by providing the performance on downstream benchmarks and distillation experiments. Code and models are available at https://github.com/microsoft/Cream.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_AutoFormer_Searching_Transformers_for_Visual_Recognition_ICCV_2021_paper.pdf", @@ -3689,14 +3940,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_AutoFormer_Searching_Transformers_for_Visual_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;1+0;1;0", - "aff_unique_norm": "Stony Brook University;Microsoft", + "aff_unique_norm": "Stony Brook University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.stonybrook.edu;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "SBU;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1+0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Minghao and Peng,\n Houwen and Fu,\n Jianlong and Ling,\n Haibin\n},\n 
title = {\n AutoFormer: Searching Transformers for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12270-12280\n} \n}" }, { "title": "AutoShape: Real-Time Shape-Aware Monocular 3D Object Detection", @@ -3704,6 +3956,7 @@ "status": "Poster", "track": "main", "pid": 8478, + "author_site": "Zongdai Liu; Dingfu Zhou; Feixiang Lu; Jin Fang; Liangjun Zhang", "author": "Zongdai Liu; Dingfu Zhou; Feixiang Lu; Jin Fang; Liangjun Zhang", "abstract": "Existing deep learning-based approaches for monocular 3D object detection in autonomous driving often model the object as a rotated 3D cuboid while the object's geometric shape has been ignored. In this work, we propose an approach for incorporating the shape-aware 2D/3D constraints into the 3D detection framework. Specifically, we employ the deep neural network to learn distinguished 2D keypoints in the 2D image domain and regress their corresponding 3D coordinates in the local 3D object coordinate first. Then the 2D/3D geometric constraints are built by these correspondences for each object to boost the detection performance. For generating the ground truth of 2D/3D keypoints, an automatic model-fitting approach has been proposed by fitting the deformed 3D object model and the object mask in the 2D image. The proposed framework has been verified on the public KITTI dataset and the experimental results demonstrate that by using additional geometrical constraints the detection performance has been significantly improved as compared to the baseline method. More importantly, the proposed framework achieves state-of-the-art performance with real time. 
Data and code will be available at https://github.com/zongdai/AutoShape", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_AutoShape_Real-Time_Shape-Aware_Monocular_3D_Object_Detection_ICCV_2021_paper.pdf", @@ -3718,7 +3971,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_AutoShape_Real-Time_Shape-Aware_Monocular_3D_Object_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_AutoShape_Real-Time_Shape-Aware_Monocular_3D_Object_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zongdai and Zhou,\n Dingfu and Lu,\n Feixiang and Fang,\n Jin and Zhang,\n Liangjun\n},\n title = {\n AutoShape: Real-Time Shape-Aware Monocular 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15641-15650\n} \n}" }, { "title": "AutoSpace: Neural Architecture Search With Less Human Interference", @@ -3726,6 +3980,7 @@ "status": "Poster", "track": "main", "pid": 4044, + "author_site": "Daquan Zhou; Xiaojie Jin; Xiaochen Lian; Linjie Yang; Yujing Xue; Qibin Hou; Jiashi Feng", "author": "Daquan Zhou; Xiaojie Jin; Xiaochen Lian; Linjie Yang; Yujing Xue; Qibin Hou; Jiashi Feng", "abstract": "Current neural architecture search (NAS) algorithms still require expert knowledge and effort to design a search space for network construction. In this paper, we consider automating the search space design to minimize human interference, which however faces two challenges: the explosive complexity of the exploration space and the expensive computation cost to evaluate the quality of different search spaces. 
To solve them, we propose a novel differentiable evolutionary framework named AutoSpace, which evolves the search space to an optimal one with following novel techniques: a differentiable fitness scoring function to efficiently evaluate the performance of cells and a reference architecture to speedup the evolution procedure and avoid falling into sub-optimal solutions. The framework is generic and compatible with additional computational constraints, making it feasible to learn specialized search spaces that fit different computational budgets. With the learned search space, the performance of recent NAS algorithms can be improved significantly compared with using manually de-signed spaces. Remarkably, the models generated from the new search space achieve 77.8% top-1 accuracy on ImageNet under the mobile setting (MAdds<=500M), outperforming previous SOTA EfficientNet-B0 by 0.7%. https://github.com/zhoudaquan/AutoSpace.git", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_AutoSpace_Neural_Architecture_Search_With_Less_Human_Interference_ICCV_2021_paper.pdf", @@ -3749,7 +4004,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";United States", "aff_country_unique_index": "0;1;1;1;0;0;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Daquan and Jin,\n Xiaojie and Lian,\n Xiaochen and Yang,\n Linjie and Xue,\n Yujing and Hou,\n Qibin and Feng,\n Jiashi\n},\n title = {\n AutoSpace: Neural Architecture Search With Less Human Interference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 337-346\n} \n}" }, { "title": "Auxiliary Tasks and Exploration Enable ObjectGoal Navigation", @@ -3757,6 +4013,7 @@ "status": "Poster", "track": "main", "pid": 7270, + "author_site": "Joel Ye; Dhruv Batra; Abhishek Das; 
Erik Wijmans", "author": "Joel Ye; Dhruv Batra; Abhishek Das; Erik Wijmans", "abstract": "ObjectGoal Navigation (ObjectNav) is an embodied task wherein agents are to navigate to an object instance in an unseen environment. Prior works have shown that end-to-end ObjectNav agents that use vanilla visual and recurrent modules, e.g. a CNN+RNN, perform poorly due to overfitting and sample inefficiency. This has motivated current state-of-the-art methods to mix analytic and learned components and operate on explicit spatial maps of the environment. We instead re-enable a generic learned agent by adding auxiliary learning tasks and an exploration reward. Our agents achieve 24.5% success and 8.1% SPL, a 37% and 8% relative improvement over prior state-of-the-art, respectively, on the Habitat ObjectNav Challenge. From our analysis, we propose that agents will act to simplify their visual inputs so as to smooth their RNN dynamics, and that auxiliary tasks reduce overfitting by minimizing effective RNN dimensionality; i.e. 
a performant ObjectNav agent that must maintain coherent plans over long horizons does so by learning smooth, low-dimensional recurrent dynamics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_Auxiliary_Tasks_and_Exploration_Enable_ObjectGoal_Navigation_ICCV_2021_paper.pdf", @@ -3773,14 +4030,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_Auxiliary_Tasks_and_Exploration_Enable_ObjectGoal_Navigation_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;1;0", - "aff_unique_norm": "Georgia Institute of Technology;Meta", + "aff_unique_norm": "Georgia Institute of Technology;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "Georgia Tech;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Joel and Batra,\n Dhruv and Das,\n Abhishek and Wijmans,\n Erik\n},\n title = {\n Auxiliary Tasks and Exploration Enable ObjectGoal Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16117-16126\n} \n}" }, { "title": "BAPA-Net: Boundary Adaptation and Prototype Alignment for Cross-Domain Semantic Segmentation", @@ -3788,6 +4046,7 @@ "status": "Poster", "track": "main", "pid": 9911, + "author_site": "Yahao Liu; Jinhong Deng; Xinchen Gao; Wen Li; Lixin Duan", "author": "Yahao Liu; Jinhong Deng; Xinchen Gao; Wen Li; Lixin Duan", "abstract": "Existing cross-domain semantic segmentation methods usually focus on the overall segmentation results of whole objects but neglect the importance of object boundaries. 
In this work, we find that the segmentation performance can be considerably boosted if we treat object boundaries properly. For that, we propose a novel method called BAPA-Net, which is based on a convolutional neural network via Boundary Adaptation and Prototype Alignment, under the unsupervised domain adaptation setting. Specifically, we first construct additional images by pasting objects from source images to target images, and we develop a so-called boundary adaptation module to weigh each pixel based on its distance to the nearest boundary pixel of those pasted source objects. Moreover, we pro- pose another prototype alignment module to reduce the domain mismatch by minimizing distances between the class prototypes of the source and target domains, where boundaries are removed to avoid domain confusion during prototype calculation. By integrating the boundary adaptation and prototype alignment, we are able to train a discriminative and domain-invariant model for cross-domain semantic segmentation. We conduct extensive experiments on the benchmark datasets of urban scenes (i.e., GTA5->Cityscapes and SYNTHIA->Cityscapes). And the promising results clearly show the effectiveness of our BAPA-Net method over existing state-of-the-art for cross-domain semantic segmentation. 
Our implementation is available at https://github.com/manmanjun/BAPA-Net.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_BAPA-Net_Boundary_Adaptation_and_Prototype_Alignment_for_Cross-Domain_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -3811,7 +4070,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yahao and Deng,\n Jinhong and Gao,\n Xinchen and Li,\n Wen and Duan,\n Lixin\n},\n title = {\n BAPA-Net: Boundary Adaptation and Prototype Alignment for Cross-Domain Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8801-8811\n} \n}" }, { "title": "BARF: Bundle-Adjusting Neural Radiance Fields", @@ -3819,6 +4079,7 @@ "status": "Poster", "track": "main", "pid": 2202, + "author_site": "Chen-Hsuan Lin; Wei-Chiu Ma; Antonio Torralba; Simon Lucey", "author": "Chen-Hsuan Lin; Wei-Chiu Ma; Antonio Torralba; Simon Lucey", "abstract": "Neural Radiance Fields (NeRF) have recently gained a surge of interest within the computer vision community for its power to synthesize photorealistic novel views of real-world scenes. One limitation of NeRF, however, is its requirement of known camera poses to learn the scene representations. In this paper, we propose Bundle-Adjusting Neural Radiance Fields (BARF) for training NeRF from imperfect camera poses -- the joint problem of learning neural 3D representations and registering camera frames. We establish a theoretical connection to classical planar image registration and show that coarse-to-fine registration is also applicable to NeRF. Furthermore, we demonstrate mathematically that positional encoding has a direct impact on the basin of attraction for registration with a synthesis-based objective. 
Experiments on synthetic and real-world data show that BARF can effectively optimize the neural scene representations and resolve large camera pose misalignment at the same time. This enables applications of view synthesis and localization of video sequences from unknown camera poses, opening up new avenues for visual localization systems (e.g. SLAM) towards sequential registration with NeRF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_BARF_Bundle-Adjusting_Neural_Radiance_Fields_ICCV_2021_paper.pdf", @@ -3833,7 +4094,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_BARF_Bundle-Adjusting_Neural_Radiance_Fields_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_BARF_Bundle-Adjusting_Neural_Radiance_Fields_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Chen-Hsuan and Ma,\n Wei-Chiu and Torralba,\n Antonio and Lucey,\n Simon\n},\n title = {\n BARF: Bundle-Adjusting Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5741-5751\n} \n}" }, { "title": "BEV-Net: Assessing Social Distancing Compliance by Joint People Localization and Geometric Reasoning", @@ -3841,6 +4103,7 @@ "status": "Poster", "track": "main", "pid": 3985, + "author_site": "Zhirui Dai; Yuepeng Jiang; Yi Li; Bo Liu; Antoni B. Chan; Nuno Vasconcelos", "author": "Zhirui Dai; Yuepeng Jiang; Yi Li; Bo Liu; Antoni B. Chan; Nuno Vasconcelos", "abstract": "ocial distancing, an essential public health measure to limit the spread of contagious diseases, has gained significant attention since the outbreak of the COVID-19 pandemic. In this work, the problem of visual social distancing compliance assessment in busy public areas, with wide field-of-view cameras, is considered. 
A dataset of crowd scenes with people annotations under a bird's eye view (BEV) and ground truth for metric distances is introduced, and several measures for the evaluation of social distance detection systems are proposed. A multi-branch network, BEV-Net, is proposed to localize individuals in world coordinates and identify high-risk regions where social distancing is violated. BEV-Net combines detection of head and feet locations, camera pose estimation, a differentiable homography module to map image into BEV coordinates, and geometric reasoning to produce a BEV map of the people locations in the scene. Experiments on complex crowded scenes demonstrate the power of the approach and show superior performance over baselines derived from methods in the literature. Applications of interest for public health decision makers are finally discussed. Datasets, code and pretrained models are publicly available at GitHub.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dai_BEV-Net_Assessing_Social_Distancing_Compliance_by_Joint_People_Localization_and_ICCV_2021_paper.pdf", @@ -3864,7 +4127,8 @@ "aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "San Diego;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Dai_2021_ICCV,\n \n author = {\n Dai,\n Zhirui and Jiang,\n Yuepeng and Li,\n Yi and Liu,\n Bo and Chan,\n Antoni B. 
and Vasconcelos,\n Nuno\n},\n title = {\n BEV-Net: Assessing Social Distancing Compliance by Joint People Localization and Geometric Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5401-5411\n} \n}" }, { "title": "BN-NAS: Neural Architecture Search With Batch Normalization", @@ -3872,6 +4136,7 @@ "status": "Poster", "track": "main", "pid": 2040, + "author_site": "Boyu Chen; Peixia Li; Baopu Li; Chen Lin; Chuming Li; Ming Sun; Junjie Yan; Wanli Ouyang", "author": "Boyu Chen; Peixia Li; Baopu Li; Chen Lin; Chuming Li; Ming Sun; Junjie Yan; Wanli Ouyang", "abstract": "Model training and evaluation are two main time-consuming processes during neural architecture search (NAS). Although weight-sharing based methods have been proposed to reduce the number of trained networks, these methods still need to train the supernet for hundreds of epochs and evaluate thousands of subnets to find the optimal network architecture. In this paper, we propose NAS with Batch Normalization (BN), which we refer to as BN-NAS, to accelerate both the evaluation and training process. For fast evaluation, we propose a novel BN-based indicator that predicts subnet performance at a very early training stage. We further improve the training efficiency by only training the BN parameters during the supernet training. This is based on our observation that training the whole supernet is not necessary while training only BN parameters accelerates network convergence for network architecture search. 
Extensive experiments show that our method can significantly shorten the time of training supernet by more than 10 times and evaluating subnets by more than 600,000 times without losing accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_BN-NAS_Neural_Architecture_Search_With_Batch_Normalization_ICCV_2021_paper.pdf", @@ -3889,13 +4154,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_BN-NAS_Neural_Architecture_Search_With_Batch_Normalization_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;0;3;3;0", "aff_unique_norm": "University of Sydney;Baidu;University of Oxford;SenseTime Group Limited", - "aff_unique_dep": ";Baidu;;", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.baidu.com;https://www.ox.ac.uk;https://www.sensetime.com", "aff_unique_abbr": "USYD;Baidu;Oxford;SenseTime", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3;0;3;3;0", - "aff_country_unique": "Australia;United States;United Kingdom;China" + "aff_country_unique": "Australia;United States;United Kingdom;China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Boyu and Li,\n Peixia and Li,\n Baopu and Lin,\n Chen and Li,\n Chuming and Sun,\n Ming and Yan,\n Junjie and Ouyang,\n Wanli\n},\n title = {\n BN-NAS: Neural Architecture Search With Batch Normalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 307-316\n} \n}" }, { "title": "BV-Person: A Large-Scale Dataset for Bird-View Person Re-Identification", @@ -3903,6 +4169,7 @@ "status": "Poster", "track": "main", "pid": 6890, + "author_site": "Cheng Yan; Guansong Pang; Lei Wang; Jile Jiao; Xuetao Feng; Chunhua Shen; Jingjing Li", "author": "Cheng Yan; Guansong Pang; Lei Wang; Jile Jiao; Xuetao Feng; Chunhua Shen; Jingjing Li", "abstract": "Person Re-IDentification (ReID) aims at 
re-identifying persons from non-overlapping cameras. Existing person ReID studies focus on horizontal-view ReID tasks, in which the person images are captured by the cameras from a (nearly) horizontal view. In this work we introduce a new ReID task, bird-view person ReID, which aims at searching for a person in a gallery of horizontal-view images with the query images taken from a bird's-eye view, i.e., an elevated view of an object from above. The task is important because there are a large number of video surveillance cameras capturing persons from such an elevated view at public places. However, it is a challenging task in that the images from the bird view (i) provide limited person appearance information and (ii) have a large discrepancy compared to the persons in the horizontal view. We aim to facilitate the development of person ReID from this line by introducing a large-scale real-world dataset for this task. The proposed dataset, named BV-Person, contains 114k images of 18k identities in which nearly 20k images of 7.4k identities are taken from the bird's-eye view. We further introduce a novel model for this new ReID task. Large-scale experiments are performed to evaluate our model and 11 current state-of-the-art ReID models on BV-Person to establish performance benchmarks from multiple perspectives. The empirical results show that our model consistently and substantially outperforms the state-of-the-arts on all five datasets derived from BV-Person. Our model also achieves state-of-the-art performance on two general ReID datasets. 
Our code and dataset will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yan_BV-Person_A_Large-Scale_Dataset_for_Bird-View_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -3917,7 +4184,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yan_BV-Person_A_Large-Scale_Dataset_for_Bird-View_Person_Re-Identification_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yan_BV-Person_A_Large-Scale_Dataset_for_Bird-View_Person_Re-Identification_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yan_2021_ICCV,\n \n author = {\n Yan,\n Cheng and Pang,\n Guansong and Wang,\n Lei and Jiao,\n Jile and Feng,\n Xuetao and Shen,\n Chunhua and Li,\n Jingjing\n},\n title = {\n BV-Person: A Large-Scale Dataset for Bird-View Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10943-10952\n} \n}" }, { "title": "BabelCalib: A Universal Approach to Calibrating Central Cameras", @@ -3925,6 +4193,7 @@ "status": "Poster", "track": "main", "pid": 3863, + "author_site": "Yaroslava Lochman; Kostiantyn Liepieshov; Jianhui Chen; Michal Perdoch; Christopher Zach; James Pritts", "author": "Yaroslava Lochman; Kostiantyn Liepieshov; Jianhui Chen; Michal Perdoch; Christopher Zach; James Pritts", "abstract": "Existing calibration methods occasionally fail for large field-of-view cameras due to the non-linearity of the underlying problem and the lack of good initial values for all parameters of the used camera model. This might occur because a simpler projection model is assumed in an initial step, or a poor initial guess for the internal parameters is pre-defined. A lot of the difficulties of general camera calibration lie in the use of a forward projection model. 
We side-step these challenges by first proposing a solver to calibrate the parameters in terms of a back-projection model and then regress the parameters for a target forward model. These steps are incorporated in a robust estimation framework to cope with outlying detections. Extensive experiments demonstrate that our approach is very reliable and returns the most accurate calibration parameters as measured on the downstream task of absolute pose estimation on test sets. The code is released at https://github.com/ylochman/babelcalib", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lochman_BabelCalib_A_Universal_Approach_to_Calibrating_Central_Cameras_ICCV_2021_paper.pdf", @@ -3941,14 +4210,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lochman_BabelCalib_A_Universal_Approach_to_Calibrating_Central_Cameras_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2;0;0", - "aff_unique_norm": "Chalmers University of Technology;Ukrainian Catholic University;Meta", + "aff_unique_norm": "Chalmers University of Technology;Ukrainian Catholic University;Facebook Reality Labs", "aff_unique_dep": ";;Facebook Reality Labs", "aff_unique_url": "https://www.chalmers.se;https://ucu.edu.ua;https://www.facebook.com/realitylabs", "aff_unique_abbr": "Chalmers;UCU;FRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;0;0", - "aff_country_unique": "Sweden;Ukraine;United States" + "aff_country_unique": "Sweden;Ukraine;United States", + "bibtex": "@InProceedings{Lochman_2021_ICCV,\n \n author = {\n Lochman,\n Yaroslava and Liepieshov,\n Kostiantyn and Chen,\n Jianhui and Perdoch,\n Michal and Zach,\n Christopher and Pritts,\n James\n},\n title = {\n BabelCalib: A Universal Approach to Calibrating Central Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15253-15262\n} \n}" }, { 
"title": "Baking Neural Radiance Fields for Real-Time View Synthesis", @@ -3956,6 +4226,7 @@ "status": "Poster", "track": "main", "pid": 8765, + "author_site": "Peter Hedman; Pratul P. Srinivasan; Ben Mildenhall; Jonathan T. Barron; Paul Debevec", "author": "Peter Hedman; Pratul P. Srinivasan; Ben Mildenhall; Jonathan T. Barron; Paul Debevec", "abstract": "Neural volumetric representations such as Neural Radiance Fields (NeRF) have emerged as a compelling technique for learning to represent 3D scenes from images with the goal of rendering photorealistic images of the scene from unobserved viewpoints. However, NeRF's computational requirements are prohibitive for real-time applications: rendering views from a trained NeRF requires querying a multilayer perceptron (MLP) hundreds of times per ray. We present a method to train a NeRF, then precompute and store (i.e. \"\"bake\"\") it as a novel representation called a Sparse Neural Radiance Grid (SNeRG) that enables real-time rendering on commodity hardware. To achieve this, we introduce 1) a reformulation of NeRF's architecture, and 2) a sparse voxel grid representation with learned feature vectors. The resulting scene representation retains NeRF's ability to render fine geometric details and view-dependent appearance, is compact (averaging less than 90 MB per scene), and can be rendered in real-time (higher than 30 frames per second on a laptop GPU). Actual screen captures are shown in our video.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hedman_Baking_Neural_Radiance_Fields_for_Real-Time_View_Synthesis_ICCV_2021_paper.pdf", @@ -3979,7 +4250,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hedman_2021_ICCV,\n \n author = {\n Hedman,\n Peter and Srinivasan,\n Pratul P. and Mildenhall,\n Ben and Barron,\n Jonathan T. 
and Debevec,\n Paul\n},\n title = {\n Baking Neural Radiance Fields for Real-Time View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5875-5884\n} \n}" }, { "title": "Batch Normalization Increases Adversarial Vulnerability and Decreases Adversarial Transferability: A Non-Robust Feature Perspective", @@ -3987,6 +4259,7 @@ "status": "Poster", "track": "main", "pid": 10212, + "author_site": "Philipp Benz; Chaoning Zhang; In So Kweon", "author": "Philipp Benz; Chaoning Zhang; In So Kweon", "abstract": "Batch normalization (BN) has been widely used in modern deep neural networks (DNNs) due to improved convergence. BN is observed to increase the model accuracy while at the cost of adversarial robustness. There is an increasing interest in the ML community to understand the impact of BN on DNNs, especially related to the model robustness. This work attempts to understand the impact of BN on DNNs from a non-robust feature perspective. Straightforwardly, the improved accuracy can be attributed to the better utilization of useful features. It remains unclear whether BN mainly favors learning robust features (RFs) or non-robust features (NRFs). Our work presents empirical evidence that supports that BN shifts a model towards being more dependent on NRFs. To facilitate the analysis of such a feature robustness shift, we propose a framework for disentangling robust usefulness into robustness and usefulness. Extensive analysis under the proposed framework yields valuable insight on the DNN behavior regarding robustness, e.g. DNNs first mainly learn RFs and then NRFs. 
The insight that RFs transfer better than NRFs, further inspires simple techniques to strengthen transfer-based black-box attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Benz_Batch_Normalization_Increases_Adversarial_Vulnerability_and_Decreases_Adversarial_Transferability_A_ICCV_2021_paper.pdf", @@ -4010,7 +4283,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Benz_2021_ICCV,\n \n author = {\n Benz,\n Philipp and Zhang,\n Chaoning and Kweon,\n In So\n},\n title = {\n Batch Normalization Increases Adversarial Vulnerability and Decreases Adversarial Transferability: A Non-Robust Feature Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7818-7827\n} \n}" }, { "title": "Bayesian Deep Basis Fitting for Depth Completion With Uncertainty", @@ -4018,6 +4292,7 @@ "status": "Poster", "track": "main", "pid": 8200, + "author_site": "Chao Qu; Wenxin Liu; Camillo J. Taylor", "author": "Chao Qu; Wenxin Liu; Camillo J. Taylor", "abstract": "In this work we investigate the problem of uncertainty estimation for image-guided depth completion. We extend Deep Basis Fitting (DBF) for depth completion within a Bayesian evidence framework to provide calibrated per-pixel variance. The DBF approach frames the depth completion problem in terms of a network that produces a set of low-dimensional depth bases and a differentiable least-squares fitting module that computes the basis weights using the sparse depths. By adopting a Bayesian treatment, our Bayesian Deep Basis Fitting (BDBF) approach is able to 1) predict high-quality uncertainty estimates and 2) enable depth completion with few or no sparse measurements. 
We conduct controlled experiments to compare BDBF against commonly used techniques for uncertainty estimation under various scenarios. Results show that our method produces better uncertainty estimates with accurate depth prediction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qu_Bayesian_Deep_Basis_Fitting_for_Depth_Completion_With_Uncertainty_ICCV_2021_paper.pdf", @@ -4041,7 +4316,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Qu_2021_ICCV,\n \n author = {\n Qu,\n Chao and Liu,\n Wenxin and Taylor,\n Camillo J.\n},\n title = {\n Bayesian Deep Basis Fitting for Depth Completion With Uncertainty\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16147-16157\n} \n}" }, { "title": "Bayesian Triplet Loss: Uncertainty Quantification in Image Retrieval", @@ -4049,7 +4325,8 @@ "status": "Poster", "track": "main", "pid": 8001, - "author": "Frederik Warburg; Martin J\u00f8rgensen; Javier Civera; S\u00f8ren Hauberg", + "author_site": "Frederik Warburg; Martin Jørgensen; Javier Civera; Søren Hauberg", + "author": "Frederik Warburg; Martin Jørgensen; Javier Civera; Søren Hauberg", "abstract": "Uncertainty quantification in image retrieval is crucial for downstream decisions, yet it remains a challenging and largely unexplored problem. Current methods for estimating uncertainties are poorly calibrated, computationally expensive, or based on heuristics. We present a new method that views image embeddings as stochastic features rather than deterministic features. 
Our two main contributions are (1) a likelihood that matches the triplet constraint and that evaluates the probability of an anchor being closer to a positive than a negative; and (2) a prior over the feature space that justifies the conventional l2 normalization. To ensure computational efficiency, we derive a variational approximation of the posterior, called the Bayesian triplet loss, that produces state-of-the-art uncertainty estimates and matches the predictive performance of current state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Warburg_Bayesian_Triplet_Loss_Uncertainty_Quantification_in_Image_Retrieval_ICCV_2021_paper.pdf", "aff": "Technical University of Denmark; University of Oxford; University of Zaragoza; Technical University of Denmark", @@ -4072,7 +4349,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", - "aff_country_unique": "Denmark;United Kingdom;Spain" + "aff_country_unique": "Denmark;United Kingdom;Spain", + "bibtex": "@InProceedings{Warburg_2021_ICCV,\n \n author = {\n Warburg,\n Frederik and J{\\o\n}rgensen,\n Martin and Civera,\n Javier and Hauberg,\n S{\\o\n}ren\n},\n title = {\n Bayesian Triplet Loss: Uncertainty Quantification in Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12158-12168\n} \n}" }, { "title": "Benchmark Platform for Ultra-Fine-Grained Visual Categorization Beyond Human Performance", @@ -4080,6 +4358,7 @@ "status": "Poster", "track": "main", "pid": 8127, + "author_site": "Xiaohan Yu; Yang Zhao; Yongsheng Gao; Xiaohui Yuan; Shengwu Xiong", "author": "Xiaohan Yu; Yang Zhao; Yongsheng Gao; Xiaohui Yuan; Shengwu Xiong", "abstract": "Deep learning methods have achieved remarkable success in fine-grained visual categorization. 
Such successful categorization at sub-ordinate level, e.g., different animal or plant species, however relies heavily on the visual differences that human can observe and the ground-truths are labelled on the basis of such human visual observation. In contrast, few research has been done for visual categorization at the ultra-fine-grained level, i.e., a granularity where even human experts can hardly identify the visual differences or are not yet able to give affirmative labels by inferring observed pattern differences. This paper reports our efforts towards mitigating this research gap. We introduce the ultra-fine-grained (UFG) image dataset, a large collection of 47,114 images from 3,526 categories. All the images in the proposed UFG image dataset are grouped into categories with different confirmed cultivar names. In addition, we perform an extensive evaluation of state-of-the-art fine-grained classification methods on the proposed UFG image dataset as comparative baselines. The proposed UFG image dataset and evaluation protocols is intended to serve as a benchmark platform that can advance research of visual classification from approaching human performance to beyond human ability, via facilitating benchmark data of artificial intelligence (AI) not to be limited by the labels of human intelligence (HI). 
The dataset is available online at https://github.com/XiaohanYu-GU/Ultra-FGVC.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Benchmark_Platform_for_Ultra-Fine-Grained_Visual_Categorization_Beyond_Human_Performance_ICCV_2021_paper.pdf", @@ -4094,7 +4373,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_Benchmark_Platform_for_Ultra-Fine-Grained_Visual_Categorization_Beyond_Human_Performance_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_Benchmark_Platform_for_Ultra-Fine-Grained_Visual_Categorization_Beyond_Human_Performance_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Xiaohan and Zhao,\n Yang and Gao,\n Yongsheng and Yuan,\n Xiaohui and Xiong,\n Shengwu\n},\n title = {\n Benchmark Platform for Ultra-Fine-Grained Visual Categorization Beyond Human Performance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10285-10295\n} \n}" }, { "title": "Benchmarking Ultra-High-Definition Image Super-Resolution", @@ -4102,7 +4382,8 @@ "status": "Poster", "track": "main", "pid": 5467, - "author": "Kaihao Zhang; Dongxu Li; Wenhan Luo; Wenqi Ren; Bj\u00f6rn Stenger; Wei Liu; Hongdong Li; Ming-Hsuan Yang", + "author_site": "Kaihao Zhang; Dongxu Li; Wenhan Luo; Wenqi Ren; Björn Stenger; Wei Liu; Hongdong Li; Ming-Hsuan Yang", + "author": "Kaihao Zhang; Dongxu Li; Wenhan Luo; Wenqi Ren; Björn Stenger; Wei Liu; Hongdong Li; Ming-Hsuan Yang", "abstract": "Increasingly, modern mobile devices allow capturing images at Ultra-High-Definition (UHD) resolution, which includes 4K and 8K images. However, current single image super-resolution (SISR) methods focus on super-resolving images to ones with resolution up to high definition (HD) and ignore higher-resolution UHD images. 
To explore their performance on UHD images, in this paper, we first introduce two large-scale image datasets, UHDSR4K and UHDSR8K, to benchmark existing SISR methods. With 70,000 V100 GPU hours of training, we benchmark these methods on 4K and 8K resolution images under seven different settings to provide a set of baseline models. Moreover, we propose a baseline model, called Mesh Attention Network (MANet) for SISR. The MANet applies the attention mechanism in both different depths (horizontal) and different levels of receptive field (vertical). In this way, correlations among feature maps are learned, enabling the network to focus on more important features.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Benchmarking_Ultra-High-Definition_Image_Super-Resolution_ICCV_2021_paper.pdf", "aff": ";;;;;;;", @@ -4116,7 +4397,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Benchmarking_Ultra-High-Definition_Image_Super-Resolution_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Benchmarking_Ultra-High-Definition_Image_Super-Resolution_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Kaihao and Li,\n Dongxu and Luo,\n Wenhan and Ren,\n Wenqi and Stenger,\n Bj\\\"orn and Liu,\n Wei and Li,\n Hongdong and Yang,\n Ming-Hsuan\n},\n title = {\n Benchmarking Ultra-High-Definition Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14769-14778\n} \n}" }, { "title": "Better Aggregation in Test-Time Augmentation", @@ -4124,6 +4406,7 @@ "status": "Poster", "track": "main", "pid": 9233, + "author_site": "Divya Shanmugam; Davis Blalock; Guha Balakrishnan; John Guttag", "author": "Divya Shanmugam; Davis Blalock; Guha Balakrishnan; John Guttag", "abstract": 
"Test-time augmentation---the aggregation of predictions across transformed versions of a test input---is a common practice in image classification. Traditionally, predictions are combined using a simple average. In this paper, we present 1) experimental analyses that shed light on cases in which the simple average is suboptimal and 2) a method to address these shortcomings. A key finding is that even when test-time augmentation produces a net improvement in accuracy, it can change many correct predictions into incorrect predictions. We delve into when and why test-time augmentation changes a prediction from being correct to incorrect and vice versa. Building on these insights, we present a learning-based method for aggregating test-time augmentations. Experiments across a diverse set of models, datasets, and augmentations show that our method delivers consistent improvements over existing approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shanmugam_Better_Aggregation_in_Test-Time_Augmentation_ICCV_2021_paper.pdf", @@ -4147,7 +4430,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shanmugam_2021_ICCV,\n \n author = {\n Shanmugam,\n Divya and Blalock,\n Davis and Balakrishnan,\n Guha and Guttag,\n John\n},\n title = {\n Better Aggregation in Test-Time Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1214-1223\n} \n}" }, { "title": "Beyond Question-Based Biases: Assessing Multimodal Shortcut Learning in Visual Question Answering", @@ -4155,10 +4439,11 @@ "status": "Poster", "track": "main", "pid": 1960, - "author": "Corentin Dancette; R\u00e9mi Cad\u00e8ne; Damien Teney; Matthieu Cord", + "author_site": "Corentin Dancette; Rémi Cadène; Damien 
Teney; Matthieu Cord", + "author": "Corentin Dancette; Rémi Cadène; Damien Teney; Matthieu Cord", "abstract": "We introduce an evaluation methodology for visual question answering (VQA) to better diagnose cases of shortcut learning. These cases happen when a model exploits spurious statistical regularities to produce correct answers but does not actually deploy the desired behavior. There is a need to identify possible shortcuts in a dataset and assess their use before deploying a model in the real world. The research community in VQA has focused exclusively on question-based shortcuts, where a model might, for example, answer \"What is the color of the sky\" with \"blue\" by relying mostly on the question-conditional training prior and give little weight to visual evidence. We go a step further and consider multimodal shortcuts that involve both questions and images. We first identify potential shortcuts in the popular VQA v2 training set by mining trivial predictive rules such as co-occurrences of words and visual elements. We then introduce VQA-CounterExamples (VQA-CE), an evaluation protocol based on our subset of CounterExamples i.e. image-question-answer triplets where our rules lead to incorrect answers. We use this new evaluation in a large-scale study of existing approaches for VQA. We demonstrate that even state-of-the-art models perform poorly and that existing techniques to reduce biases are largely ineffective in this context. Our findings suggest that past work on question-based biases in VQA has only addressed one facet of a complex issue. 
The code for our method is available at \\url https://github.com/cdancette/detect-shortcuts", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dancette_Beyond_Question-Based_Biases_Assessing_Multimodal_Shortcut_Learning_in_Visual_Question_ICCV_2021_paper.pdf", - "aff": "Sorbonne Universit \u00b4e, CNRS, LIP6; Sorbonne Universit \u00b4e, CNRS, LIP6 + Carney Institute for Brain Science, Brown University, USA + Idiap Research Institute; Idiap Research Institute + Australian Institute for Machine Learning, University of Adelaide; Sorbonne Universit \u00b4e, CNRS, LIP6 + Valeo.ai", + "aff": "Sorbonne Universit ´e, CNRS, LIP6; Sorbonne Universit ´e, CNRS, LIP6 + Carney Institute for Brain Science, Brown University, USA + Idiap Research Institute; Idiap Research Institute + Australian Institute for Machine Learning, University of Adelaide; Sorbonne Universit ´e, CNRS, LIP6 + Valeo.ai", "project": "", "github": "https://github.com/cdancette/detect-shortcuts", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Dancette_Beyond_Question-Based_Biases_ICCV_2021_supplemental.pdf", @@ -4171,14 +4456,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dancette_Beyond_Question-Based_Biases_Assessing_Multimodal_Shortcut_Learning_in_Visual_Question_ICCV_2021_paper.html", "aff_unique_index": "0;0+1+2;2+3;0+4", - "aff_unique_norm": "Sorbonne Universit\u00e9;Brown University;Idiap Research Institute;University of Adelaide;Valeo", + "aff_unique_norm": "Sorbonne Université;Brown University;Idiap Research Institute;University of Adelaide;Valeo", "aff_unique_dep": "LIP6;Carney Institute for Brain Science;;Australian Institute for Machine Learning;Valeo.ai", "aff_unique_url": "https://www.sorbonne-universite.fr;https://www.brown.edu;https://www.idiap.ch;https://www.adelaide.edu.au;https://www.valeo.com", "aff_unique_abbr": "SU;Brown;Idiap;Adelaide;Valeo", "aff_campus_unique_index": ";;", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0+1+2;2+3;0+0", - "aff_country_unique": "France;United States;Switzerland;Australia" + "aff_country_unique": "France;United States;Switzerland;Australia", + "bibtex": "@InProceedings{Dancette_2021_ICCV,\n \n author = {\n Dancette,\n Corentin and Cad\\`ene,\n R\\'emi and Teney,\n Damien and Cord,\n Matthieu\n},\n title = {\n Beyond Question-Based Biases: Assessing Multimodal Shortcut Learning in Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1574-1583\n} \n}" }, { "title": "Beyond Road Extraction: A Dataset for Map Update Using Aerial Images", @@ -4186,6 +4472,7 @@ "status": "Poster", "track": "main", "pid": 10436, + "author_site": "Favyen Bastani; Samuel Madden", "author": "Favyen Bastani; Samuel Madden", "abstract": "The increasing availability of satellite and aerial imagery has sparked substantial interest in automatically updating street maps by processing aerial images. Until now, the community has largely focused on road extraction, where road networks are inferred from scratch from an aerial image. However, given that relatively high-quality maps exist in most parts of the world, in practice, inference approaches must be applied to update existing maps rather than infer new ones. With recent road extraction methods showing high accuracy, we argue that it is time to transition to the more practical map update task, where an existing map is updated by adding, removing, and shifting roads, without introducing errors in parts of the existing map that remain up-to-date. In this paper, we develop a new dataset called MUNO21 for the map update task, and show that it poses several new and interesting research challenges. 
We evaluate several state-of-the-art road extraction methods on MUNO21, and find that substantial further improvements in accuracy will be needed to realize automatic map update.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bastani_Beyond_Road_Extraction_A_Dataset_for_Map_Update_Using_Aerial_ICCV_2021_paper.pdf", @@ -4209,7 +4496,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bastani_2021_ICCV,\n \n author = {\n Bastani,\n Favyen and Madden,\n Samuel\n},\n title = {\n Beyond Road Extraction: A Dataset for Map Update Using Aerial Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11905-11914\n} \n}" }, { "title": "Beyond Trivial Counterfactual Explanations With Diverse Valuable Explanations", @@ -4217,7 +4505,8 @@ "status": "Poster", "track": "main", "pid": 7903, - "author": "Pau Rodr\u00edguez; Massimo Caccia; Alexandre Lacoste; Lee Zamparo; Issam Laradji; Laurent Charlin; David Vazquez", + "author_site": "Pau Rodríguez; Massimo Caccia; Alexandre Lacoste; Lee Zamparo; Issam Laradji; Laurent Charlin; David Vazquez", + "author": "Pau Rodríguez; Massimo Caccia; Alexandre Lacoste; Lee Zamparo; Issam Laradji; Laurent Charlin; David Vazquez", "abstract": "Explainability for machine learning models has gained considerable attention within the research community given the importance of deploying more reliable machine-learning systems. In computer vision applications, generative counterfactual methods indicate how to perturb a model's input to change its prediction, providing details about the model's decision-making. 
Current methods tend to generate trivial counterfactuals about a model's decisions, as they often suggest to exaggerate or remove the presence of the attribute being classified. For the machine learning practitioner, these types of counterfactuals offer little value, since they provide no new information about undesired model or data biases. In this work, we identify the problem of trivial counterfactual generation and we propose DiVE to alleviate it. DiVE learns a perturbation in a disentangled latent space that is constrained using a diversity-enforcing loss to uncover multiple valuable explanations about the model's prediction. Further, we introduce a mechanism to prevent the model from producing trivial explanations. Experiments on CelebA and Synbols demonstrate that our model improves the success rate of producing high-quality valuable explanations when compared to previous state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rodriguez_Beyond_Trivial_Counterfactual_Explanations_With_Diverse_Valuable_Explanations_ICCV_2021_paper.pdf", "aff": "Element AI+MILA+UdeM; Element AI+MILA+UdeM+McGill University; Element AI; Element AI; Element AI+MILA+McGill University; MILA+HEC Montreal+CIFAR AI Chair; Element AI", @@ -4233,14 +4522,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rodriguez_Beyond_Trivial_Counterfactual_Explanations_With_Diverse_Valuable_Explanations_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;0+1+2+3;0;0;0+1+3;1+4+5;0", - "aff_unique_norm": "Element AI;Mila;Universit\u00e9 de Montr\u00e9al;McGill University;HEC Montreal;CIFAR", + "aff_unique_norm": "Element AI;MILA;Université de Montréal;McGill University;HEC Montreal;CIFAR", "aff_unique_dep": ";;;;;AI Chair", - "aff_unique_url": "https://www.elementai.com;https://mila.quebec;https://www.udem\u8499\u7279\u5229\u5c14\u5927\u5b66.ca;https://www.mcgill.ca;https://www.hec.ca;https://www.cifar.ca", + "aff_unique_url": 
"https://www.elementai.com;https://mila.quebec;https://www.udem蒙特利尔大学.ca;https://www.mcgill.ca;https://www.hec.ca;https://www.cifar.ca", "aff_unique_abbr": "Element AI;MILA;UdeM;McGill;HEC;CIFAR", "aff_campus_unique_index": ";;;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0+0+0;0+0+0+0;0;0;0+0+0;0+0+0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Rodriguez_2021_ICCV,\n \n author = {\n Rodr{\\'\\i\n}guez,\n Pau and Caccia,\n Massimo and Lacoste,\n Alexandre and Zamparo,\n Lee and Laradji,\n Issam and Charlin,\n Laurent and Vazquez,\n David\n},\n title = {\n Beyond Trivial Counterfactual Explanations With Diverse Valuable Explanations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1056-1065\n} \n}" }, { "title": "BiMaL: Bijective Maximum Likelihood Approach to Domain Adaptation in Semantic Scene Segmentation", @@ -4248,6 +4538,7 @@ "status": "Poster", "track": "main", "pid": 10868, + "author_site": "Thanh-Dat Truong; Chi Nhan Duong; Ngan Le; Son Lam Phung; Chase Rainwater; Khoa Luu", "author": "Thanh-Dat Truong; Chi Nhan Duong; Ngan Le; Son Lam Phung; Chase Rainwater; Khoa Luu", "abstract": "Semantic segmentation aims to predict pixel-level labels. It has become a popular task in various computer vision applications. While fully supervised segmentation methods have achieved high accuracy on large-scale vision datasets, they are unable to generalize on a new test environment or a new domain well. In this work, we first introduce a new Un- aligned Domain Score to measure the efficiency of a learned model on a new target domain in unsupervised manner. Then, we present the new Bijective Maximum Likelihood (BiMaL) loss that is a generalized form of the Adversarial Entropy Minimization without any assumption about pixel independence. 
We have evaluated the proposed BiMaL on two domains. The proposed BiMaL approach consistently outperforms the SOTA methods on empirical experiments on \"SYNTHIA to Cityscapes\", \"GTA5 to Cityscapes\", and \"SYNTHIA to Vistas\".", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Truong_BiMaL_Bijective_Maximum_Likelihood_Approach_to_Domain_Adaptation_in_Semantic_ICCV_2021_paper.pdf", @@ -4271,7 +4562,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0;0", - "aff_country_unique": "United States;Canada;Australia" + "aff_country_unique": "United States;Canada;Australia", + "bibtex": "@InProceedings{Truong_2021_ICCV,\n \n author = {\n Truong,\n Thanh-Dat and Duong,\n Chi Nhan and Le,\n Ngan and Phung,\n Son Lam and Rainwater,\n Chase and Luu,\n Khoa\n},\n title = {\n BiMaL: Bijective Maximum Likelihood Approach to Domain Adaptation in Semantic Scene Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8548-8557\n} \n}" }, { "title": "BiaSwap: Removing Dataset Bias With Bias-Tailored Swapping Augmentation", @@ -4279,6 +4571,7 @@ "status": "Poster", "track": "main", "pid": 7421, + "author_site": "Eungyeup Kim; Jihyeon Lee; Jaegul Choo", "author": "Eungyeup Kim; Jihyeon Lee; Jaegul Choo", "abstract": "Deep neural networks often make decisions based on the spurious correlations inherent in the dataset, failing to generalize in an unbiased data distribution. Although previous approaches pre-define the type of dataset bias to prevent the network from learning it, recognizing the bias type in the real dataset is often prohibitive. This paper proposes a novel bias-tailored augmentation-based approach, BiaSwap, for learning debiased representation without requiring supervision on the bias type. 
Motivated by the phenomenon that the bias corresponds to the attributes the model learns as a shortcut, we utilize an image-to-image translation model optimized to transfer the attributes that the classifier often learns easily. As a prerequisite, we sort the training samples based on how much a biased model exploits them as a shortcut and divide them into bias-guiding and bias-contrary samples in an unsupervised manner. Afterwards, we utilize the CAM of GCE-trained classifier in the patch cooccurrence discriminator in order to focus on translating the bias attributes. Therefore, given the pair of bias-guiding and bias-contrary, the model generates the augmented bias-swapped image which contains the bias attributes from the bias-contrary images, while preserving bias-irrelevant ones in the bias-guiding images. We demonstrate the superiority of our approach against the baselines over both synthetic and real-world datasets. Even without careful supervision on the bias, BiaSwap achieves a remarkable performance on both unbiased and bias-guiding samples, implying the improved generalization capability of the model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_BiaSwap_Removing_Dataset_Bias_With_Bias-Tailored_Swapping_Augmentation_ICCV_2021_paper.pdf", @@ -4302,7 +4595,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Eungyeup and Lee,\n Jihyeon and Choo,\n Jaegul\n},\n title = {\n BiaSwap: Removing Dataset Bias With Bias-Tailored Swapping Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14992-15001\n} \n}" }, { "title": "Bias Loss for Mobile Neural Networks", @@ -4310,6 +4604,7 @@ "status": "Poster", "track": "main", "pid": 3796, 
+ "author_site": "Lusine Abrahamyan; Valentin Ziatchin; Yiming Chen; Nikos Deligiannis", "author": "Lusine Abrahamyan; Valentin Ziatchin; Yiming Chen; Nikos Deligiannis", "abstract": "Compact convolutional neural networks (CNNs) have witnessed exceptional improvements in performance in recent years. However, they still fail to provide the same predictive power as CNNs with a large number of parameters. The diverse and even abundant features captured by the layers is an important characteristic of these successful CNNs. However, differences in this characteristic between large CNNs and their compact counterparts have rarely been investigated. In compact CNNs, due to the limited number of parameters, abundant features are unlikely to be obtained, and feature diversity becomes an essential characteristic. Diverse features present in the activation maps derived from a data point during model inference may indicate the presence of a set of unique descriptors necessary to distinguish between objects of different classes. In contrast, data points with low feature diversity may not provide a sufficient amount of unique descriptors to make a valid prediction; we refer to them as random predictions. Random predictions can negatively impact the optimization process and harm the final performance. This paper proposes addressing the problem raised by random predictions by reshaping the standard cross-entropy to make it biased toward data points with a limited number of unique descriptive features. Our novel Bias Loss focuses the training on a set of valuable data points and prevents the vast number of samples with poor learning features from misleading the optimization process. Furthermore, to show the importance of diversity, we present a family of SkipblockNet models whose architectures are brought to boost the number of unique descriptors in the last layers. 
Experiments conducted on benchmark datasets demonstrate the superiority of the proposed loss function over the cross-entropy loss. Moreover, our SkipblockNet-M can achieve 1% higher classification accuracy than MobileNetV3 Large with similar computational cost on the ImageNet ILSVRC-2012 classification dataset. The code is available on the link - https://github.com/lusinlu/biasloss_skipblocknet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Abrahamyan_Bias_Loss_for_Mobile_Neural_Networks_ICCV_2021_paper.pdf", @@ -4326,14 +4621,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Abrahamyan_Bias_Loss_for_Mobile_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;0+1;0+1", - "aff_unique_norm": "Vrije Universiteit Brussel;IMEC;PicsArt Inc.", + "aff_unique_norm": "Vrije Universiteit Brussel;imec;PicsArt Inc.", "aff_unique_dep": "ETRO Department;;", "aff_unique_url": "https://www.vub.be;https://www.imec-int.com;https://picsart.com", "aff_unique_abbr": "VUB;imec;PicsArt", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Brussels;;San Francisco", "aff_country_unique_index": "0+0;1;0+0;0+0", - "aff_country_unique": "Belgium;United States" + "aff_country_unique": "Belgium;United States", + "bibtex": "@InProceedings{Abrahamyan_2021_ICCV,\n \n author = {\n Abrahamyan,\n Lusine and Ziatchin,\n Valentin and Chen,\n Yiming and Deligiannis,\n Nikos\n},\n title = {\n Bias Loss for Mobile Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6556-6566\n} \n}" }, { "title": "Bifold and Semantic Reasoning for Pedestrian Behavior Prediction", @@ -4341,6 +4637,7 @@ "status": "Poster", "track": "main", "pid": 4350, + "author_site": "Amir Rasouli; Mohsen Rohani; Jun Luo", "author": "Amir Rasouli; Mohsen Rohani; Jun Luo", "abstract": "Pedestrian behavior prediction is one of the major 
challenges for intelligent driving systems. Pedestrians often exhibit complex behaviors influenced by various contextual elements. To address this problem, we propose BiPed, a multitask learning framework that simultaneously predicts trajectories and actions of pedestrians by relying on multimodal data. Our method benefits from 1) a bifold encoding approach where different data modalities are processed independently allowing them to develop their own representations, and jointly to produce a representation for all modalities using shared parameters; 2) a novel interaction modeling technique that relies on categorical semantic parsing of the scenes to capture interactions between target pedestrians and their surroundings; and 3) a bifold prediction mechanism that uses both independent and shared decoding of multimodal representations. Using public pedestrian behavior benchmark datasets for driving, PIE and JAAD, we highlight the benefits of the proposed method for behavior prediction and show that our model achieves state-of-the-art performance and improves trajectory and action prediction by up to 22% and 9% respectively. 
We further investigate the contributions of the proposed reasoning techniques via extensive ablation studies.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rasouli_Bifold_and_Semantic_Reasoning_for_Pedestrian_Behavior_Prediction_ICCV_2021_paper.pdf", @@ -4357,14 +4654,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rasouli_Bifold_and_Semantic_Reasoning_for_Pedestrian_Behavior_Prediction_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Huawei", - "aff_unique_dep": "Huawei Technologies", + "aff_unique_norm": "Huawei Technologies", + "aff_unique_dep": "", "aff_unique_url": "https://www.huawei.com/ca-en/", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Rasouli_2021_ICCV,\n \n author = {\n Rasouli,\n Amir and Rohani,\n Mohsen and Luo,\n Jun\n},\n title = {\n Bifold and Semantic Reasoning for Pedestrian Behavior Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15600-15610\n} \n}" }, { "title": "Big Self-Supervised Models Advance Medical Image Classification", @@ -4372,6 +4670,7 @@ "status": "Poster", "track": "main", "pid": 5551, + "author_site": "Shekoofeh Azizi; Basil Mustafa; Fiona Ryan; Zachary Beaver; Jan Freyberg; Jonathan Deaton; Aaron Loh; Alan Karthikesalingam; Simon Kornblith; Ting Chen; Vivek Natarajan; Mohammad Norouzi", "author": "Shekoofeh Azizi; Basil Mustafa; Fiona Ryan; Zachary Beaver; Jan Freyberg; Jonathan Deaton; Aaron Loh; Alan Karthikesalingam; Simon Kornblith; Ting Chen; Vivek Natarajan; Mohammad Norouzi", "abstract": "Self-supervised pretraining followed by supervised fine-tuning has seen success in image recognition, especially when labeled examples are scarce, but has received 
limited attention in medical image analysis. This paper studies the effectiveness of self-supervised learning as a pretraining strategy for medical image classification. We conduct experiments on two distinct tasks: dermatology condition classification from digital camera images and multi-label chest X-ray classification, and demonstrate that self-supervised learning on ImageNet, followed by additional self-supervised learning on unlabeled domain-specific medical images significantly improves the accuracy of medical image classifiers.We introduce a novel Multi-Instance Contrastive Learning (MICLe) method that uses multiple images of the underlying pathology per patient case, when available, to construct more informative positive pairs for self-supervised learning. Combining our contributions, we achieve an improvement of 6.7% in top-1 accuracy and an improvement of 1.1% in mean AUC on dermatology and chest X-ray classification respectively, outperforming strong supervised baselines pretrained on ImageNet. 
In addition, we show that big self-supervised models are robust to distribution shift and can learn efficiently with a small number of labeled medical images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Azizi_Big_Self-Supervised_Models_Advance_Medical_Image_Classification_ICCV_2021_paper.pdf", @@ -4395,7 +4694,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Azizi_2021_ICCV,\n \n author = {\n Azizi,\n Shekoofeh and Mustafa,\n Basil and Ryan,\n Fiona and Beaver,\n Zachary and Freyberg,\n Jan and Deaton,\n Jonathan and Loh,\n Aaron and Karthikesalingam,\n Alan and Kornblith,\n Simon and Chen,\n Ting and Natarajan,\n Vivek and Norouzi,\n Mohammad\n},\n title = {\n Big Self-Supervised Models Advance Medical Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3478-3488\n} \n}" }, { "title": "Binocular Mutual Learning for Improving Few-Shot Classification", @@ -4403,6 +4703,7 @@ "status": "Poster", "track": "main", "pid": 1314, + "author_site": "Ziqi Zhou; Xi Qiu; Jiangtao Xie; Jianan Wu; Chi Zhang", "author": "Ziqi Zhou; Xi Qiu; Jiangtao Xie; Jianan Wu; Chi Zhang", "abstract": "Most of the few-shot learning methods learn to transfer knowledge from datasets with abundant labeled data (i.e., the base set). From the perspective of class space on base set, existing methods either focus on utilizing all classes under a global view by normal pretraining, or pay more attention to adopt an episodic manner to train meta-tasks within few classes in a local view. However, the interaction of the two views is rarely explored. 
As the two views capture complementary information, we naturally think of the compatibility of them for achieving further performance gains. Inspired by the mutual learning paradigm and binocular parallax, we propose a unified framework, namely Binocular Mutual Learning (BML), which achieves the compatibility of the global view and the local view through both intra-view and cross-view modeling. Concretely, the global view learns in the whole class space to capture rich inter-class relationships. Meanwhile, the local view learns in the local class space within each episode, focusing on matching positive pairs correctly. In addition, cross-view mutual interaction further promotes the collaborative learning and the implicit exploration of useful knowledge from each other. During meta-test, binocular embeddings are aggregated together to support decision-making, which greatly improve the accuracy of classification. Extensive experiments conducted on multiple benchmarks including cross-domain validation confirm the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Binocular_Mutual_Learning_for_Improving_Few-Shot_Classification_ICCV_2021_paper.pdf", @@ -4426,7 +4727,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Ziqi and Qiu,\n Xi and Xie,\n Jiangtao and Wu,\n Jianan and Zhang,\n Chi\n},\n title = {\n Binocular Mutual Learning for Improving Few-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8402-8411\n} \n}" }, { "title": "BioFors: A Large Biomedical Image Forensics Dataset", @@ -4434,6 +4736,7 @@ "status": "Poster", "track": "main", "pid": 10057, + "author_site": "Ekraam Sabir; Soumyaroop Nandi; 
Wael Abd-Almageed; Prem Natarajan", "author": "Ekraam Sabir; Soumyaroop Nandi; Wael Abd-Almageed; Prem Natarajan", "abstract": "Research in media forensics has gained traction to combat the spread of misinformation. However, most of this research has been directed towards content generated on social media. Biomedical image forensics is a related problem, where manipulation or misuse of images reported in biomedical research documents is of serious concern. The problem has failed to gain momentum beyond an academic discussion due to an absence of benchmark datasets and standardized tasks. In this paper we present BioFors -- the first dataset for benchmarking common biomedical image manipulations. BioFors comprises 47,805 images extracted from 1,031 open-source research papers. Images in BioFors are divided into four categories -- Microscopy, Blot/Gel, FACS and Macroscopy. We also propose three tasks for forensic analysis -- external duplication detection, internal duplication detection and cut/sharp-transition detection. We benchmark BioFors on all tasks with suitable state-of-the-art algorithms. 
Our results and analysis show that existing algorithms developed on common computer vision datasets are not robust when applied to biomedical images, validating that more research is required to address the unique challenges of biomedical image forensics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sabir_BioFors_A_Large_Biomedical_Image_Forensics_Dataset_ICCV_2021_paper.pdf", @@ -4457,7 +4760,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Marina del Rey", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sabir_2021_ICCV,\n \n author = {\n Sabir,\n Ekraam and Nandi,\n Soumyaroop and Abd-Almageed,\n Wael and Natarajan,\n Prem\n},\n title = {\n BioFors: A Large Biomedical Image Forensics Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10963-10973\n} \n}" }, { "title": "Bit-Mixer: Mixed-Precision Networks With Runtime Bit-Width Selection", @@ -4465,6 +4769,7 @@ "status": "Poster", "track": "main", "pid": 1967, + "author_site": "Adrian Bulat; Georgios Tzimiropoulos", "author": "Adrian Bulat; Georgios Tzimiropoulos", "abstract": "Mixed-precision networks allow for a variable bit-width quantization for every layer in the network. A major limitation of existing work is that the bit-width for each layer must be predefined during training time. This allows little flexibility if the characteristics of the device on which the network is deployed change during runtime. In this work, we propose Bit-Mixer, the very first method to train a meta-quantized network where during test time any layer can change its bit-width without affecting at all the overall network's ability for highly accurate inference. 
To this end, we make 2 key contributions: (a) Transitional Batch-Norms, and (b) a 3-stage optimization process which is shown capable of training such a network. We show that our method can result in mixed precision networks that exhibit the desirable flexibility properties for on-device deployment without compromising accuracy. Code will be made available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bulat_Bit-Mixer_Mixed-Precision_Networks_With_Runtime_Bit-Width_Selection_ICCV_2021_paper.pdf", @@ -4481,14 +4786,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bulat_Bit-Mixer_Mixed-Precision_Networks_With_Runtime_Bit-Width_Selection_ICCV_2021_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "Samsung;Queen Mary University of London", + "aff_unique_norm": "Samsung AI;Queen Mary University of London", "aff_unique_dep": "Samsung AI;", "aff_unique_url": "https://www.samsung.com/uk;https://www.qmul.ac.uk", "aff_unique_abbr": "Samsung AI;QMUL", "aff_campus_unique_index": "0;0+1", "aff_campus_unique": "Cambridge;London", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bulat_2021_ICCV,\n \n author = {\n Bulat,\n Adrian and Tzimiropoulos,\n Georgios\n},\n title = {\n Bit-Mixer: Mixed-Precision Networks With Runtime Bit-Width Selection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5188-5197\n} \n}" }, { "title": "Black-Box Detection of Backdoor Attacks With Limited Information and Data", @@ -4496,6 +4802,7 @@ "status": "Poster", "track": "main", "pid": 6098, + "author_site": "Yinpeng Dong; Xiao Yang; Zhijie Deng; Tianyu Pang; Zihao Xiao; Hang Su; Jun Zhu", "author": "Yinpeng Dong; Xiao Yang; Zhijie Deng; Tianyu Pang; Zihao Xiao; Hang Su; Jun Zhu", "abstract": "Although deep neural networks 
(DNNs) have made rapid progress in recent years, they are vulnerable in adversarial environments. A malicious backdoor could be embedded in a model by poisoning the training dataset, whose intention is to make the infected model give wrong predictions during inference when the specific trigger appears. To mitigate the potential threats of backdoor attacks, various backdoor detection and defense methods have been proposed. However, the existing techniques usually require the poisoned training data or access to the white-box model, which is commonly unavailable in practice. In this paper, we propose a black-box backdoor detection (B3D) method to identify backdoor attacks with only query access to the model. We introduce a gradient-free optimization algorithm to reverse-engineer the potential trigger for each class, which helps to reveal the existence of backdoor attacks. In addition to backdoor detection, we also propose a simple strategy for reliable predictions using the identified backdoored models. 
Extensive experiments on hundreds of DNN models trained on several datasets corroborate the effectiveness of our method under the black-box setting against various backdoor attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dong_Black-Box_Detection_of_Backdoor_Attacks_With_Limited_Information_and_Data_ICCV_2021_paper.pdf", @@ -4519,7 +4826,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dong_2021_ICCV,\n \n author = {\n Dong,\n Yinpeng and Yang,\n Xiao and Deng,\n Zhijie and Pang,\n Tianyu and Xiao,\n Zihao and Su,\n Hang and Zhu,\n Jun\n},\n title = {\n Black-Box Detection of Backdoor Attacks With Limited Information and Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16482-16491\n} \n}" }, { "title": "BlockCopy: High-Resolution Video Processing With Block-Sparse Feature Propagation and Online Policies", @@ -4527,6 +4835,7 @@ "status": "Poster", "track": "main", "pid": 4264, + "author_site": "Thomas Verelst; Tinne Tuytelaars", "author": "Thomas Verelst; Tinne Tuytelaars", "abstract": "In this paper we propose BlockCopy, a scheme that accelerates pretrained frame-based CNNs to process video more efficiently, compared to standard frame-by-frame processing. To this end, a lightweight policy network determines important regions in an image, and operations are applied on selected regions only, using custom block-sparse convolutions. Features of non-selected regions are simply copied from the preceding frame, reducing the number of computations and latency. The execution policy is trained using reinforcement learning in an online fashion without requiring ground truth annotations. 
Our universal framework is demonstrated on dense prediction tasks such as pedestrian detection, instance segmentation and semantic segmentation, using both state of the art (Center and Scale Predictor, MGAN, SwiftNet) and standard baseline networks (Mask-RCNN, DeepLabV3+). BlockCopy achieves significant FLOPS savings and inference speedup with minimal impact on accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Verelst_BlockCopy_High-Resolution_Video_Processing_With_Block-Sparse_Feature_Propagation_and_Online_ICCV_2021_paper.pdf", @@ -4541,7 +4850,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Verelst_BlockCopy_High-Resolution_Video_Processing_With_Block-Sparse_Feature_Propagation_and_Online_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Verelst_BlockCopy_High-Resolution_Video_Processing_With_Block-Sparse_Feature_Propagation_and_Online_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Verelst_2021_ICCV,\n \n author = {\n Verelst,\n Thomas and Tuytelaars,\n Tinne\n},\n title = {\n BlockCopy: High-Resolution Video Processing With Block-Sparse Feature Propagation and Online Policies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5158-5167\n} \n}" }, { "title": "BlockPlanner: City Block Generation With Vectorized Graph Representation", @@ -4549,6 +4859,7 @@ "status": "Poster", "track": "main", "pid": 3893, + "author_site": "Linning Xu; Yuanbo Xiangli; Anyi Rao; Nanxuan Zhao; Bo Dai; Ziwei Liu; Dahua Lin", "author": "Linning Xu; Yuanbo Xiangli; Anyi Rao; Nanxuan Zhao; Bo Dai; Ziwei Liu; Dahua Lin", "abstract": "City modeling is the foundation for computational urban planning, navigation, and entertainment. 
In this work, we present the first generative model of city blocks named BlockPlanner, and showcase its ability to synthesize valid city blocks with varying land lots configurations. We propose a novel vectorized city block representation utilizing a ring topology and a two-tier graph to capture the global and local structures of a city block. Each land lot is abstracted into a vector representation covering both its 3D geometry and land use semantics. Such vectorized representation enables us to deploy a lightweight network to capture the underlying distribution of land lots configuration in a city block. To enforce intrinsic spatial constraints of a valid city block, a set of effective loss functions are imposed to shape rational results. We contribute a pilot city block dataset to demonstrate the effectiveness and efficiency of our representation and framework over the state-of-the-art. Notably, our BlockPlanner is also able to edit and manipulate city blocks, enabling several useful applications, e.g., topology refinement and footprint generation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_BlockPlanner_City_Block_Generation_With_Vectorized_Graph_Representation_ICCV_2021_paper.pdf", @@ -4565,14 +4876,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_BlockPlanner_City_Block_Generation_With_Vectorized_Graph_Representation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0+1+2;3;3;0+1+2", - "aff_unique_norm": "Chinese University of Hong Kong;Centre of Perceptual and Interactive Intelligence;Shanghai AI Laboratory;Nanyang Technological University", + "aff_unique_norm": "The Chinese University of Hong Kong;Centre of Perceptual and Interactive Intelligence;Shanghai AI Laboratory;Nanyang Technological University", "aff_unique_dep": ";;;S-Lab", "aff_unique_url": "https://www.cuhk.edu.hk;;https://www.shanghai-ai-lab.com;https://www.ntu.edu.sg", "aff_unique_abbr": "CUHK;;SAIL;NTU", "aff_campus_unique_index": 
"0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0+0;2;2;0+0", - "aff_country_unique": "China;;Singapore" + "aff_country_unique": "China;;Singapore", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Linning and Xiangli,\n Yuanbo and Rao,\n Anyi and Zhao,\n Nanxuan and Dai,\n Bo and Liu,\n Ziwei and Lin,\n Dahua\n},\n title = {\n BlockPlanner: City Block Generation With Vectorized Graph Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5077-5086\n} \n}" }, { "title": "Body-Face Joint Detection via Embedding and Head Hook", @@ -4580,6 +4892,7 @@ "status": "Poster", "track": "main", "pid": 1989, + "author_site": "Junfeng Wan; Jiangfan Deng; Xiaosong Qiu; Feng Zhou", "author": "Junfeng Wan; Jiangfan Deng; Xiaosong Qiu; Feng Zhou", "abstract": "Detecting pedestrians and their associated faces jointly is a challenging task.On one hand, body or face could be absent because of occlusion or non-frontal human pose.On the other hand, the association becomes difficult or even miss-leading in crowded scenes due to the lack of strong correlational evidence. This paper proposes Body-Face Joint (BFJ) detector, a novel framework for detecting bodies and their faces with accurate correspondance. We follow the classical multi-class detector design by detecting body and face in parallel but with two key contributions. First, we propose an Embedding Matching Loss (EML) to learn an associative embedding for matching body and face of the same person. Second, we introduce a novel concept, \"head hook\", to bridge the gap of matching body and faces spatially. With the new semantical and geometrical sources of information, BFJ greatly reduces the difficulty of detecting body and face in pairs. 
Since the problem is unexplored yet, we design a new metric named log-average miss matching rate (mMR^ -2 ) to evaluate the association performance and extend the CrowdHuman and CityPersons benchmarks by annotating each face box. Experiments show that our BFJ detector can maintain state-of-the-art performance in pedestrian detection on both one-stage and two-stage structures while greatly outperform various body-face association strategies. Code and datasets will be released soon.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wan_Body-Face_Joint_Detection_via_Embedding_and_Head_Hook_ICCV_2021_paper.pdf", @@ -4594,7 +4907,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wan_Body-Face_Joint_Detection_via_Embedding_and_Head_Hook_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wan_Body-Face_Joint_Detection_via_Embedding_and_Head_Hook_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wan_2021_ICCV,\n \n author = {\n Wan,\n Junfeng and Deng,\n Jiangfan and Qiu,\n Xiaosong and Zhou,\n Feng\n},\n title = {\n Body-Face Joint Detection via Embedding and Head Hook\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2959-2968\n} \n}" }, { "title": "Boosting Monocular Depth Estimation With Lightweight 3D Point Fusion", @@ -4602,7 +4916,8 @@ "status": "Poster", "track": "main", "pid": 8661, - "author": "Lam Huynh; Phong Nguyen; Ji\u0159\u00ed Matas; Esa Rahtu; Janne Heikkil\u00e4", + "author_site": "Lam Huynh; Phong Nguyen; Jiří Matas; Esa Rahtu; Janne Heikkilä", + "author": "Lam Huynh; Phong Nguyen; Jiří Matas; Esa Rahtu; Janne Heikkilä", "abstract": "In this paper, we propose enhancing monocular depth estimation by adding 3D points as depth guidance. 
Unlike existing depth completion methods, our approach performs well on extremely sparse and unevenly distributed point clouds, which makes it agnostic to the source of the 3D points. We achieve this by introducing a novel multi-scale 3D point fusion network that is both lightweight and efficient. We demonstrate its versatility on two different depth estimation problems where the 3D points have been acquired with conventional structure-from-motion and LiDAR. In both cases, our network performs on par with state-of-the-art depth completion methods and achieves significantly higher accuracy when only a small number of points is used while being more compact in terms of the number of parameters. We show that our method outperforms some contemporary deep learning based multi-view stereo and structure-from-motion methods both in accuracy and in compactness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huynh_Boosting_Monocular_Depth_Estimation_With_Lightweight_3D_Point_Fusion_ICCV_2021_paper.pdf", "aff": "University of Oulu; Czech Technical University in Prague; Tampere University; Tampere University; University of Oulu", @@ -4625,7 +4940,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "Finland;Czech Republic" + "aff_country_unique": "Finland;Czech Republic", + "bibtex": "@InProceedings{Huynh_2021_ICCV,\n \n author = {\n Huynh,\n Lam and Nguyen,\n Phong and Matas,\n Ji\\v{r\n}{\\'\\i\n} and Rahtu,\n Esa and Heikkil\\"a,\n Janne\n},\n title = {\n Boosting Monocular Depth Estimation With Lightweight 3D Point Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12767-12776\n} \n}" }, { "title": "Boosting Weakly Supervised Object Detection via Learning Bounding Box Adjusters", @@ -4633,6 +4949,7 @@ "status": "Poster", "track": "main", "pid": 4167, + 
"author_site": "Bowen Dong; Zitong Huang; Yuelin Guo; Qilong Wang; Zhenxing Niu; Wangmeng Zuo", "author": "Bowen Dong; Zitong Huang; Yuelin Guo; Qilong Wang; Zhenxing Niu; Wangmeng Zuo", "abstract": "Weakly-supervised object detection (WSOD) has emerged as an inspiring recent topic to avoid expensive instance-level object annotations. However, the bounding boxes of most existing WSOD methods are mainly determined by precomputed proposals, thereby being limited in precise object localization. In this paper, we defend the problem setting for improving localization performance by leveraging the bounding box regression knowledge from a well-annotated auxiliary dataset. First, we use the well-annotated auxiliary dataset to explore a series of learnable bounding box adjusters (LBBAs) in a multi-stage training manner, which is class-agnostic. Then, only LBBAs and a weakly-annotated dataset with non-overlapped classes are used for training LBBA-boosted WSOD. As such, our LBBAs are practically more convenient and economical to implement while avoiding the leakage of the auxiliary well-annotated dataset. In particular, we formulate learning bounding box adjusters as a bi-level optimization problem and suggest an EM-like multi-stage training algorithm. Then, a multi-stage scheme is further presented for LBBA-boosted WSOD. Additionally, a masking strategy is adopted to improve proposal classification. Experimental results verify the effectiveness of our method. Our method performs favorably against state-of-the-art WSOD methods and knowledge transfer model with similar problem setting. 
Code is publicly available at https://github.com/DongSky/lbba_boosted_wsod.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dong_Boosting_Weakly_Supervised_Object_Detection_via_Learning_Bounding_Box_Adjusters_ICCV_2021_paper.pdf", @@ -4656,7 +4973,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Dong_2021_ICCV,\n \n author = {\n Dong,\n Bowen and Huang,\n Zitong and Guo,\n Yuelin and Wang,\n Qilong and Niu,\n Zhenxing and Zuo,\n Wangmeng\n},\n title = {\n Boosting Weakly Supervised Object Detection via Learning Bounding Box Adjusters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2876-2885\n} \n}" }, { "title": "Boosting the Generalization Capability in Cross-Domain Few-Shot Learning via Noise-Enhanced Supervised Autoencoder", @@ -4664,6 +4982,7 @@ "status": "Poster", "track": "main", "pid": 6226, + "author_site": "Hanwen Liang; Qiong Zhang; Peng Dai; Juwei Lu", "author": "Hanwen Liang; Qiong Zhang; Peng Dai; Juwei Lu", "abstract": "State of the art (SOTA) few-shot learning (FSL) methods suffer significant performance drop in the presence of domain differences between source and target datasets. The strong discrimination ability on the source dataset does not necessarily translate to high classification accuracy on the target dataset. In this work, we address this cross-domain few-shot learning (CDFSL) problem by boosting the generalization capability of the model. Specifically, we teach the model to capture broader variations of the feature distributions with a novel noise-enhanced supervised autoencoder (NSAE). NSAE trains the model by jointly reconstructing inputs and predicting the labels of inputs as well as their reconstructed pairs. 
Theoretical analysis based on intra-class correlation (ICC) shows that the feature embeddings learned from NSAE have stronger discrimination and generalization abilities in the target domain. We also take advantage of NSAE structure and propose a two-step fine-tuning procedure that achieves better adaption and improves classification performance in the target domain. Extensive experiments and ablation studies are conducted to demonstrate the effectiveness of the proposed method. Experimental results show that our proposed method consistently outperforms SOTA methods under various conditions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Boosting_the_Generalization_Capability_in_Cross-Domain_Few-Shot_Learning_via_Noise-Enhanced_ICCV_2021_paper.pdf", @@ -4678,7 +4997,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liang_Boosting_the_Generalization_Capability_in_Cross-Domain_Few-Shot_Learning_via_Noise-Enhanced_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liang_Boosting_the_Generalization_Capability_in_Cross-Domain_Few-Shot_Learning_via_Noise-Enhanced_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Hanwen and Zhang,\n Qiong and Dai,\n Peng and Lu,\n Juwei\n},\n title = {\n Boosting the Generalization Capability in Cross-Domain Few-Shot Learning via Noise-Enhanced Supervised Autoencoder\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9424-9434\n} \n}" }, { "title": "Bootstrap Your Own Correspondences", @@ -4686,6 +5006,7 @@ "status": "Poster", "track": "main", "pid": 5855, + "author_site": "Mohamed El Banani; Justin Johnson", "author": "Mohamed El Banani; Justin Johnson", "abstract": "Geometric feature extraction is a crucial component of point cloud registration pipelines. 
Recent work has demonstrated how supervised learning can be leveraged to learn better and more compact 3D features. However, those approaches' reliance on ground-truth annotation limits their scalability. We propose BYOC: a self-supervised approach that learns visual and geometric features from RGB-D video without relying on ground-truth pose or correspondence. Our key observation is that randomly-initialized CNNs readily provide us with good correspondences; allowing us to bootstrap the learning of both visual and geometric features. Our approach combines classic ideas from point cloud registration with more recent representation learning approaches. We evaluate our approach on indoor scene datasets and find that our method outperforms traditional and learned descriptors, while being competitive with current state-of-the-art supervised approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Banani_Bootstrap_Your_Own_Correspondences_ICCV_2021_paper.pdf", @@ -4709,7 +5030,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{El_Banani_2021_ICCV,\n \n author = {\n El Banani,\n Mohamed and Johnson,\n Justin\n},\n title = {\n Bootstrap Your Own Correspondences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6433-6442\n} \n}" }, { "title": "BossNAS: Exploring Hybrid CNN-Transformers With Block-Wisely Self-Supervised Neural Architecture Search", @@ -4717,6 +5039,7 @@ "status": "Poster", "track": "main", "pid": 4179, + "author_site": "Changlin Li; Tao Tang; Guangrun Wang; Jiefeng Peng; Bing Wang; Xiaodan Liang; Xiaojun Chang", "author": "Changlin Li; Tao Tang; Guangrun Wang; Jiefeng Peng; Bing Wang; Xiaodan Liang; Xiaojun Chang", "abstract": "A myriad of recent breakthroughs in 
hand-crafted neural architectures for visual recognition have highlighted the urgent need to explore hybrid architectures consisting of diversified building blocks. Meanwhile, neural architecture search methods are surging with an expectation to reduce human efforts. However, whether NAS methods can efficiently and effectively handle diversified search spaces with disparate candidates (e.g. CNNs and transformers) is still an open question. In this work, we present Block-wisely Self-supervised Neural Architecture Search (BossNAS), an unsupervised NAS method that addresses the problem of inaccurate architecture rating caused by large weight-sharing space and biased supervision in previous methods. More specifically, we factorize the search space into blocks and utilize a novel self-supervised training scheme, named ensemble bootstrapping, to train each block separately before searching them as a whole towards the population center. Additionally, we present HyTra search space, a fabric-like hybrid CNN-transformer search space with searchable down-sampling positions. On this challenging search space, our searched model, BossNet-T, achieves up to 82.5% accuracy on ImageNet, surpassing EfficientNet by 2.4% with comparable compute time. 
Moreover, our method achieves superior architecture rating accuracy with 0.78 and 0.76 Spearman correlation on the canonical MBConv search space with ImageNet and on NATS-Bench size search space with CIFAR-100, respectively, surpassing state-of-the-art NAS methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_BossNAS_Exploring_Hybrid_CNN-Transformers_With_Block-Wisely_Self-Supervised_Neural_Architecture_Search_ICCV_2021_paper.pdf", @@ -4731,7 +5054,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_BossNAS_Exploring_Hybrid_CNN-Transformers_With_Block-Wisely_Self-Supervised_Neural_Architecture_Search_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_BossNAS_Exploring_Hybrid_CNN-Transformers_With_Block-Wisely_Self-Supervised_Neural_Architecture_Search_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Changlin and Tang,\n Tao and Wang,\n Guangrun and Peng,\n Jiefeng and Wang,\n Bing and Liang,\n Xiaodan and Chang,\n Xiaojun\n},\n title = {\n BossNAS: Exploring Hybrid CNN-Transformers With Block-Wisely Self-Supervised Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12281-12291\n} \n}" }, { "title": "Boundary-Sensitive Pre-Training for Temporal Localization in Videos", @@ -4739,7 +5063,8 @@ "status": "Poster", "track": "main", "pid": 4213, - "author": "Mengmeng Xu; Juan-Manuel P\u00e9rez-R\u00faa; Victor Escorcia; Brais Mart\u00ednez; Xiatian Zhu; Li Zhang; Bernard Ghanem; Tao Xiang", + "author_site": "Mengmeng Xu; Juan-Manuel Pérez-Rúa; Victor Escorcia; Brais Martínez; Xiatian Zhu; Li Zhang; Bernard Ghanem; Tao Xiang", + "author": "Mengmeng Xu; Juan-Manuel Pérez-Rúa; Victor Escorcia; Brais Martínez; Xiatian Zhu; Li Zhang; Bernard Ghanem; Tao Xiang", 
"abstract": "Many video analysis tasks require temporal localization for the detection of content changes. However, most existing models developed for these tasks are pre-trained on general video action classification tasks. This is due to large scale annotation of temporal boundaries in untrimmed videos being expensive. Therefore, no suitable datasets exist that enable pre-training in a manner sensitive to temporal boundaries. In this paper for the first time, we investigate model pre-training for temporal localization by introducing a novel boundary-sensitive pretext (BSP) task. Instead of relying on costly manual annotations of temporal boundaries, we propose to synthesize temporal boundaries in existing video action classification datasets. By defining different ways of synthesizing boundaries, BSP can then be simply conducted in a self-supervised manner via the classification of the boundary types. This enables the learning of video representations that are much more transferable to downstream temporal localization tasks. Extensive experiments show that the proposed BSP is superior and complementary to the existing action classification-based pre-training counterpart, and achieves new state-of-the-art performance on several temporal localization tasks. 
Please visit our website for more details https://frostinassiky.github.io/bsp.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Boundary-Sensitive_Pre-Training_for_Temporal_Localization_in_Videos_ICCV_2021_paper.pdf", "aff": ";;;;;;;", @@ -4753,7 +5078,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Boundary-Sensitive_Pre-Training_for_Temporal_Localization_in_Videos_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Boundary-Sensitive_Pre-Training_for_Temporal_Localization_in_Videos_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Mengmeng and P\\'erez-R\\'ua,\n Juan-Manuel and Escorcia,\n Victor and Mart{\\'\\i\n}nez,\n Brais and Zhu,\n Xiatian and Zhang,\n Li and Ghanem,\n Bernard and Xiang,\n Tao\n},\n title = {\n Boundary-Sensitive Pre-Training for Temporal Localization in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7220-7230\n} \n}" }, { "title": "Box-Aware Feature Enhancement for Single Object Tracking on Point Clouds", @@ -4761,6 +5087,7 @@ "status": "Poster", "track": "main", "pid": 6811, + "author_site": "Chaoda Zheng; Xu Yan; Jiantao Gao; Weibing Zhao; Wei Zhang; Zhen Li; Shuguang Cui", "author": "Chaoda Zheng; Xu Yan; Jiantao Gao; Weibing Zhao; Wei Zhang; Zhen Li; Shuguang Cui", "abstract": "Current 3D single object tracking approaches track the target based on a feature comparison between the target template and the search area. However, due to the common occlusion in LiDAR scans, it is non-trivial to conduct accurate feature comparisons on severe sparse and incomplete shapes. 
In this work, we exploit the ground truth bounding box given in the first frame as a strong cue to enhance the feature description of the target object, enabling a more accurate feature comparison in a simple yet effective way. In particular, we first propose the BoxCloud, an informative and robust representation, to depict an object using the point-to-box relation. We further design an efficient box-aware feature fusion module, which leverages the aforementioned BoxCloud for reliable feature matching and embedding. Integrating the proposed general components into an existing model P2B, we construct a superior box-aware tracker (BAT). Experiments confirm that our proposed BAT outperforms the previous state-of-the-art by a large margin on both KITTI and NuScenes benchmarks, achieving a 12.8% improvement in terms of precision while running 20% faster.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Box-Aware_Feature_Enhancement_for_Single_Object_Tracking_on_Point_Clouds_ICCV_2021_paper.pdf", @@ -4777,14 +5104,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Box-Aware_Feature_Enhancement_for_Single_Object_Tracking_on_Point_Clouds_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;2;0;0", - "aff_unique_norm": "Chinese University of Hong Kong (Shenzhen);Shanghai University;Baidu", - "aff_unique_dep": ";Research Institute of USV Engineering;Baidu", + "aff_unique_norm": "The Chinese University of Hong Kong (Shenzhen);Shanghai University;Baidu", + "aff_unique_dep": ";Research Institute of USV Engineering;", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.shu.edu.cn;https://www.baidu.com", "aff_unique_abbr": "CUHK(SZ);;Baidu", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Chaoda and Yan,\n Xu and 
Gao,\n Jiantao and Zhao,\n Weibing and Zhang,\n Wei and Li,\n Zhen and Cui,\n Shuguang\n},\n title = {\n Box-Aware Feature Enhancement for Single Object Tracking on Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13199-13208\n} \n}" }, { "title": "Bridging Unsupervised and Supervised Depth From Focus via All-in-Focus Supervision", @@ -4792,6 +5120,7 @@ "status": "Poster", "track": "main", "pid": 1939, + "author_site": "Ning-Hsu Wang; Ren Wang; Yu-Lun Liu; Yu-Hao Huang; Yu-Lin Chang; Chia-Ping Chen; Kevin Jou", "author": "Ning-Hsu Wang; Ren Wang; Yu-Lun Liu; Yu-Hao Huang; Yu-Lin Chang; Chia-Ping Chen; Kevin Jou", "abstract": "Depth estimation is a long-lasting yet important task in computer vision. Most of the previous works try to estimate depth from input images and assume images are all-in-focus (AiF), which is less common in real-world applications. On the other hand, a few works take defocus blur into account and consider it as another cue for depth estimation. In this paper, we propose a method to estimate not only a depth map but an AiF image from a set of images with different focus positions (known as a focal stack). We design a shared architecture to exploit the relationship between depth and AiF estimation. As a result, the proposed method can be trained either supervisedly with ground truth depth, or unsupervisedly with AiF images as supervisory signals. 
We show in various experiments that our method outperforms the state-of-the-art methods both quantitatively and qualitatively, and also has higher efficiency in inference time.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Bridging_Unsupervised_and_Supervised_Depth_From_Focus_via_All-in-Focus_Supervision_ICCV_2021_paper.pdf", @@ -4815,7 +5144,8 @@ "aff_campus_unique_index": "0+0;0;0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Ning-Hsu and Wang,\n Ren and Liu,\n Yu-Lun and Huang,\n Yu-Hao and Chang,\n Yu-Lin and Chen,\n Chia-Ping and Jou,\n Kevin\n},\n title = {\n Bridging Unsupervised and Supervised Depth From Focus via All-in-Focus Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12621-12631\n} \n}" }, { "title": "Bridging the Gap Between Label- and Reference-Based Synthesis in Multi-Attribute Image-to-Image Translation", @@ -4823,6 +5153,7 @@ "status": "Poster", "track": "main", "pid": 8297, + "author_site": "Qiusheng Huang; Zhilin Zheng; Xueqi Hu; Li Sun; Qingli Li", "author": "Qiusheng Huang; Zhilin Zheng; Xueqi Hu; Li Sun; Qingli Li", "abstract": "The image-to-image translation (I2IT) model takes a target label or a reference image as the input, and changes a source into the specified target domain. The two types of synthesis, either label- or reference-based, have substantial differences. Particularly, the label-based synthesis reflects the common characteristics of the target domain, and the reference-based shows the specific style similar to the reference. This paper intends to bridge the gap between them in the task of multi-attribute I2IT. 
We design the label- and reference-based encoding modules (LEM and REM) to compare the domain differences. They first transfer the source image and target label (or reference) into a common embedding space, by providing the opposite directions through the attribute difference vector. Then the two embeddings are simply fused together to form the latent code S_ rand (or S_ ref ), reflecting the domain style differences, which is injected into each layer of the generator by SPADE. To link LEM and REM, so that two types of results benefit each other, we encourage the two latent codes to be close, and set up the cycle consistency between the forward and backward translations on them. Moreover, the interpolation between the S_ rand and S_ ref is also used to synthesize an extra image. Experiments show that label- and reference-based synthesis are indeed mutually promoted, so that we can have the diverse results from LEM, and high quality results with the similar style of the reference.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Bridging_the_Gap_Between_Label-_and_Reference-Based_Synthesis_in_Multi-Attribute_ICCV_2021_paper.pdf", @@ -4837,7 +5168,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Bridging_the_Gap_Between_Label-_and_Reference-Based_Synthesis_in_Multi-Attribute_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Bridging_the_Gap_Between_Label-_and_Reference-Based_Synthesis_in_Multi-Attribute_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Qiusheng and Zheng,\n Zhilin and Hu,\n Xueqi and Sun,\n Li and Li,\n Qingli\n},\n title = {\n Bridging the Gap Between Label- and Reference-Based Synthesis in Multi-Attribute Image-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year 
= {\n 2021\n},\n pages = {\n 14628-14637\n} \n}" }, { "title": "Bringing Events Into Video Deblurring With Non-Consecutively Blurry Frames", @@ -4845,6 +5177,7 @@ "status": "Poster", "track": "main", "pid": 2893, + "author_site": "Wei Shang; Dongwei Ren; Dongqing Zou; Jimmy S. Ren; Ping Luo; Wangmeng Zuo", "author": "Wei Shang; Dongwei Ren; Dongqing Zou; Jimmy S. Ren; Ping Luo; Wangmeng Zuo", "abstract": "Recently, video deblurring has attracted considerable research attention, and several works suggest that events at high time rate can benefit deblurring. In this paper, we develop a principled framework D2Nets for video deblurring to exploit non-consecutively blurry frames, and propose a flexible event fusion module (EFM) to bridge the gap between event-driven and video deblurring. In D2Nets, we propose to first detect nearest sharp frames (NSFs) using a bidirectional LSTM detector, and then perform deblurring guided by NSFs. Furthermore, the proposed EFM is flexible to be incorporated into D2Nets, in which events can be leveraged to notably boost the deblurring performance. EFM can also be easily incorporated into existing deblurring networks, making event-driven deblurring task benefit from state-of-the-art deblurring methods. 
On synthetic and real-world blurry datasets, our methods achieve better results than competing methods, and EFM not only benefits D2Nets but also significantly improves the competing deblurring networks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shang_Bringing_Events_Into_Video_Deblurring_With_Non-Consecutively_Blurry_Frames_ICCV_2021_paper.pdf", @@ -4861,14 +5194,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shang_Bringing_Events_Into_Video_Deblurring_With_Non-Consecutively_Blurry_Frames_ICCV_2021_paper.html", "aff_unique_index": "0;0;1+2;1+2;3;4", - "aff_unique_norm": "Harbin Institute of Technology;SenseTime;Shanghai Jiao Tong University;University of Hong Kong;Pengcheng Laboratory", - "aff_unique_dep": "School of Computer Science and Technology;SenseTime Research;Qing Yuan Research Institute;;Peng Cheng Laboratory", + "aff_unique_norm": "Harbin Institute of Technology;SenseTime;Shanghai Jiao Tong University;The University of Hong Kong;Peng Cheng Laboratory", + "aff_unique_dep": "School of Computer Science and Technology;SenseTime Research;Qing Yuan Research Institute;;", "aff_unique_url": "http://www.hit.edu.cn/;https://www.sensetime.com;https://www.sjtu.edu.cn;https://www.hku.hk;", "aff_unique_abbr": "HIT;SenseTime;SJTU;HKU;", - "aff_campus_unique_index": "0;0;2;2;3;4", - "aff_campus_unique": "Harbin;;Shanghai;Hong Kong SAR;Shenzhen", + "aff_campus_unique_index": "0;0;2;2;3", + "aff_campus_unique": "Harbin;;Shanghai;Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shang_2021_ICCV,\n \n author = {\n Shang,\n Wei and Ren,\n Dongwei and Zou,\n Dongqing and Ren,\n Jimmy S. 
and Luo,\n Ping and Zuo,\n Wangmeng\n},\n title = {\n Bringing Events Into Video Deblurring With Non-Consecutively Blurry Frames\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4531-4540\n} \n}" }, { "title": "Broaden Your Views for Self-Supervised Video Learning", @@ -4876,7 +5210,8 @@ "status": "Poster", "track": "main", "pid": 2642, - "author": "Adri\u00e0 Recasens; Pauline Luc; Jean-Baptiste Alayrac; Luyu Wang; Florian Strub; Corentin Tallec; Mateusz Malinowski; Viorica P\u0103tr\u0103ucean; Florent Altch\u00e9; Michal Valko; Jean-Bastien Grill; A\u00e4ron van den Oord; Andrew Zisserman", + "author_site": "Adrià Recasens; Pauline Luc; Jean-Baptiste Alayrac; Luyu Wang; Florian Strub; Corentin Tallec; Mateusz Malinowski; Viorica Pătrăucean; Florent Altché; Michal Valko; Jean-Bastien Grill; Aäron van den Oord; Andrew Zisserman", + "author": "Adrià Recasens; Pauline Luc; Jean-Baptiste Alayrac; Luyu Wang; Florian Strub; Corentin Tallec; Mateusz Malinowski; Viorica Pătrăucean; Florent Altché; Michal Valko; Jean-Bastien Grill; Aäron van den Oord; Andrew Zisserman", "abstract": "Most successful self-supervised learning methods are trained to align the representations of two independent views from the data. State-of-the-art methods in video are inspired by image techniques, where these two views are similarly extracted by cropping and augmenting the resulting crop. However, these methods miss a crucial element in the video domain: time. We introduce BraVe, a self-supervised learning framework for video. In BraVe, one of the views has access to a narrow temporal window of the video while the other view has a broad access to the video content. Our models learn to generalise from the narrow view to the general content of the video. 
Furthermore, BraVe processes the views with different backbones, enabling the use of alternative augmentations or modalities into the broad view such as optical flow, randomly convolved RGB frames, audio or their combinations. We demonstrate that BraVe achieves state-of-the-art results in self-supervised representation learning on standard video and audio classification benchmarks including UCF101, HMDB51, Kinetics, ESC-50 and AudioSet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Recasens_Broaden_Your_Views_for_Self-Supervised_Video_Learning_ICCV_2021_paper.pdf", "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind + VGG, Dept. of Engineering Science, University of Oxford", @@ -4899,7 +5234,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Recasens_2021_ICCV,\n \n author = {\n Recasens,\n Adri\\`a and Luc,\n Pauline and Alayrac,\n Jean-Baptiste and Wang,\n Luyu and Strub,\n Florian and Tallec,\n Corentin and Malinowski,\n Mateusz and P\\u{a\n}tr\\u{a\n}ucean,\n Viorica and Altch\\'e,\n Florent and Valko,\n Michal and Grill,\n Jean-Bastien and van den Oord,\n A\\"aron and Zisserman,\n Andrew\n},\n title = {\n Broaden Your Views for Self-Supervised Video Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1255-1265\n} \n}" }, { "title": "Building-GAN: Graph-Conditioned Architectural Volumetric Design Generation", @@ -4907,6 +5243,7 @@ "status": "Poster", "track": "main", "pid": 10895, + "author_site": "Kai-Hung Chang; Chin-Yi Cheng; Jieliang Luo; Shingo Murata; Mehdi Nourbakhsh; Yoshito Tsuji", "author": "Kai-Hung Chang; Chin-Yi Cheng; Jieliang Luo; Shingo 
Murata; Mehdi Nourbakhsh; Yoshito Tsuji", "abstract": "Volumetric design is the first and critical step for professional building design, where architects not only depict the rough 3D geometry of the building but also specify the programs to form a 2D layout on each floor. Though 2D layout generation for a single story has been widely studied, there is no developed method for multi-story buildings. This paper focuses on volumetric design generation conditioned on an input program graph. Instead of outputting dense 3D voxels, we propose a new 3D representation named voxel graph that is both compact and expressive for building geometries. Our generator is a cross-modal graph neural network that uses a pointer mechanism to connect the input program graph and the output voxel graph, and the whole pipeline is trained using the adversarial framework. The generated designs are evaluated qualitatively by a user study and quantitatively using three metrics: quality, diversity, and connectivity accuracy. 
We show that our model generates realistic 3D volumetric designs and outperforms previous methods and baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chang_Building-GAN_Graph-Conditioned_Architectural_Volumetric_Design_Generation_ICCV_2021_paper.pdf", @@ -4930,7 +5267,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;1", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Chang_2021_ICCV,\n \n author = {\n Chang,\n Kai-Hung and Cheng,\n Chin-Yi and Luo,\n Jieliang and Murata,\n Shingo and Nourbakhsh,\n Mehdi and Tsuji,\n Yoshito\n},\n title = {\n Building-GAN: Graph-Conditioned Architectural Volumetric Design Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11956-11965\n} \n}" }, { "title": "BuildingNet: Learning To Label 3D Buildings", @@ -4938,6 +5276,7 @@ "status": "Poster", "track": "main", "pid": 3162, + "author_site": "Pratheba Selvaraju; Mohamed Nabail; Marios Loizou; Maria Maslioukova; Melinos Averkiou; Andreas Andreou; Siddhartha Chaudhuri; Evangelos Kalogerakis", "author": "Pratheba Selvaraju; Mohamed Nabail; Marios Loizou; Maria Maslioukova; Melinos Averkiou; Andreas Andreou; Siddhartha Chaudhuri; Evangelos Kalogerakis", "abstract": "We introduce BuildingNet: (a) a large-scale dataset of 3D building models whose exteriors are consistently labeled, and (b) a graph neural network that labels building meshes by analyzing spatial and structural relations of their geometric primitives. To create our dataset, we used crowdsourcing combined with expert guidance, resulting in 513K annotated mesh primitives, grouped into 292K semantic part components across 2K building models. 
The dataset covers several building categories, such as houses, churches, skyscrapers, town halls, libraries, and castles. We include a benchmark for evaluating mesh and point cloud labeling. Buildings have more challenging structural complexity compared to objects in existing benchmarks (e.g., ShapeNet, PartNet), thus, we hope that our dataset can nurture the development of algorithms that are able to cope with such large-scale geometric data for both vision and graphics tasks e.g., 3D semantic segmentation, part-based generative models, correspondences, texturing, and analysis of point cloud data acquired from real-world buildings. Finally, we show that our mesh-based graph neural network significantly improves performance over several baselines for labeling 3D meshes. Our project page www.buildingnet.org includes our dataset and code.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Selvaraju_BuildingNet_Learning_To_Label_3D_Buildings_ICCV_2021_paper.pdf", @@ -4952,7 +5291,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Selvaraju_BuildingNet_Learning_To_Label_3D_Buildings_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Selvaraju_BuildingNet_Learning_To_Label_3D_Buildings_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Selvaraju_2021_ICCV,\n \n author = {\n Selvaraju,\n Pratheba and Nabail,\n Mohamed and Loizou,\n Marios and Maslioukova,\n Maria and Averkiou,\n Melinos and Andreou,\n Andreas and Chaudhuri,\n Siddhartha and Kalogerakis,\n Evangelos\n},\n title = {\n BuildingNet: Learning To Label 3D Buildings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10397-10407\n} \n}" }, { "title": "C2N: Practical Generative Noise Modeling for Real-World Denoising", @@ -4960,6 +5300,7 @@ "status": "Poster", "track": "main", 
"pid": 6678, + "author_site": "Geonwoon Jang; Wooseok Lee; Sanghyun Son; Kyoung Mu Lee", "author": "Geonwoon Jang; Wooseok Lee; Sanghyun Son; Kyoung Mu Lee", "abstract": "Learning-based image denoising methods have been bounded to situations where well-aligned noisy and clean images are given, or samples are synthesized from predetermined noise models, e.g., Gaussian. While recent generative noise modeling methods aim to simulate the unknown distribution of real-world noise, several limitations still exist. In a practical scenario, a noise generator should learn to simulate the general and complex noise distribution without using paired noisy and clean images. However, since existing methods are constructed on the unrealistic assumption of real-world noise, they tend to generate implausible patterns and cannot express complicated noise maps. Therefore, we introduce a Clean-to-Noisy image generation framework, namely C2N, to imitate complex real-world noise without using any paired examples. We construct the noise generator in C2N accordingly with each component of real-world noise characteristics to express a wide range of noise accurately. 
Combined with our C2N, conventional denoising CNNs can be trained to outperform existing unsupervised methods on challenging real-world benchmarks by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jang_C2N_Practical_Generative_Noise_Modeling_for_Real-World_Denoising_ICCV_2021_paper.pdf", @@ -4983,7 +5324,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jang_2021_ICCV,\n \n author = {\n Jang,\n Geonwoon and Lee,\n Wooseok and Son,\n Sanghyun and Lee,\n Kyoung Mu\n},\n title = {\n C2N: Practical Generative Noise Modeling for Real-World Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2350-2359\n} \n}" }, { "title": "C3-SemiSeg: Contrastive Semi-Supervised Segmentation via Cross-Set Learning and Dynamic Class-Balancing", @@ -4991,10 +5333,11 @@ "status": "Poster", "track": "main", "pid": 2140, + "author_site": "Yanning Zhou; Hang Xu; Wei Zhang; Bin Gao; Pheng-Ann Heng", "author": "Yanning Zhou; Hang Xu; Wei Zhang; Bin Gao; Pheng-Ann Heng", "abstract": "The semi-supervised semantic segmentation methods utilize the unlabeled data to increase the feature discriminative ability to alleviate the burden of the annotated data. However, the dominant consistency learning diagram is limited by a) the misalignment between features from labeled and unlabeled data; b) treating each image and region separately without considering crucial semantic dependencies among classes. In this work, we introduce a novel C^3-SemiSeg to improve consistency-based semi-supervised learning by exploiting better feature alignment under perturbations and enhancing discriminative of the inter-class features cross images. 
Specifically, we first introduce a cross-set region-level data augmentation strategy to reduce the feature discrepancy between labeled data and unlabeled data. Cross-set pixel-wise contrastive learning is further integrated into the pipeline to facilitate discriminative and consistent intra-class features in a `compared to learn' way. To stabilize training from the noisy label, we propose a dynamic confidence region selection strategy to focus on the high confidence region for loss calculation. We validate the proposed approach on Cityscapes and BDD100K dataset, which significantly outperforms other state-of-the-art semi-supervised semantic segmentation methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_C3-SemiSeg_Contrastive_Semi-Supervised_Segmentation_via_Cross-Set_Learning_and_Dynamic_Class-Balancing_ICCV_2021_paper.pdf", - "aff": "CUHK; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; CUHK", + "aff": "CUHK; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; CUHK", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhou_C3-SemiSeg_Contrastive_Semi-Supervised_ICCV_2021_supplemental.pdf", @@ -5007,14 +5350,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_C3-SemiSeg_Contrastive_Semi-Supervised_Segmentation_via_Cross-Set_Learning_and_Dynamic_Class-Balancing_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_norm": "The Chinese University of Hong Kong;Huawei", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.huawei.com", "aff_unique_abbr": "CUHK;Huawei", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + 
"bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Yanning and Xu,\n Hang and Zhang,\n Wei and Gao,\n Bin and Heng,\n Pheng-Ann\n},\n title = {\n C3-SemiSeg: Contrastive Semi-Supervised Segmentation via Cross-Set Learning and Dynamic Class-Balancing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7036-7045\n} \n}" }, { "title": "CAG-QIL: Context-Aware Actionness Grouping via Q Imitation Learning for Online Temporal Action Localization", @@ -5022,6 +5366,7 @@ "status": "Poster", "track": "main", "pid": 7634, + "author_site": "Hyolim Kang; Kyungmin Kim; Yumin Ko; Seon Joo Kim", "author": "Hyolim Kang; Kyungmin Kim; Yumin Ko; Seon Joo Kim", "abstract": "Temporal action localization has been one of the most popular tasks in video understanding, due to the importance of detecting action instances in videos. However, not much progress has been made on extending it to work in an online fashion, although many video related tasks can benefit by going online with the growing video streaming services. To this end, we introduce a new task called Online Temporal Action Localization (On-TAL), in which the goal is to immediately detect action instances from an untrimmed streaming video. The online setting makes the new task very challenging as the actionness decision for every frame has to be made without access to future frames and also because post-processing methods cannot be used to modify past action proposals. We propose a novel framework, Context-Aware Actionness Grouping (CAG) as a solution for On-TAL and train it with the imitation learning algorithm, which allows us to avoid sophisticated reward engineering. Evaluation of our work on THUMOS14 and Activitynet1.3 shows significant improvement over non-naive baselines, demonstrating the effectiveness of our approach. 
As a by-product, our method can also be used for the Online Detection of Action Start (ODAS), in which our method also outperforms previous state-of-the-art models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kang_CAG-QIL_Context-Aware_Actionness_Grouping_via_Q_Imitation_Learning_for_Online_ICCV_2021_paper.pdf", @@ -5045,7 +5390,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kang_2021_ICCV,\n \n author = {\n Kang,\n Hyolim and Kim,\n Kyungmin and Ko,\n Yumin and Kim,\n Seon Joo\n},\n title = {\n CAG-QIL: Context-Aware Actionness Grouping via Q Imitation Learning for Online Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13729-13738\n} \n}" }, { "title": "CANet: A Context-Aware Network for Shadow Removal", @@ -5053,6 +5399,7 @@ "status": "Poster", "track": "main", "pid": 7182, + "author_site": "Zipei Chen; Chengjiang Long; Ling Zhang; Chunxia Xiao", "author": "Zipei Chen; Chengjiang Long; Ling Zhang; Chunxia Xiao", "abstract": "In this paper, we propose a novel two-stage context-aware network named CANet for shadow removal, in which the contextual information from non-shadow regions is transferred to shadow regions at the embedded feature spaces. At Stage-I, we propose a contextual patch matching module to generate a set of potential matching pairs of shadow and non-shadow patches. Combined with the potential contextual relationships between shadow and non-shadow regions, our well-designed contextual feature transfer (CFT) mechanism can transfer contextual information from non-shadow to shadow regions at different scales. With the reconstructed feature maps, we remove shadows at L and A/B channels separately. 
At Stage-II, we use an encoder-decoder to refine current results and generate the final shadow removal results. We evaluate our proposed CANet on two benchmark datasets and some real-world shadow images with complex scenes. Extensive experiment results strongly demonstrate the efficacy of our proposed CANet and exhibit superior performance to state-of-the-arts.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_CANet_A_Context-Aware_Network_for_Shadow_Removal_ICCV_2021_paper.pdf", @@ -5069,14 +5416,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_CANet_A_Context-Aware_Network_for_Shadow_Removal_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Wuhan University;JD;Wuhan University of Science and Technology", - "aff_unique_dep": "School of Computer Science;JD Finance America Corporation;", + "aff_unique_norm": "Wuhan University;JD Finance America Corporation;Wuhan University of Science and Technology", + "aff_unique_dep": "School of Computer Science;;", "aff_unique_url": "http://www.whu.edu.cn;;http://www.wust.edu.cn", "aff_unique_abbr": "WHU;;WUST", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Wuhan;Mountain View", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Zipei and Long,\n Chengjiang and Zhang,\n Ling and Xiao,\n Chunxia\n},\n title = {\n CANet: A Context-Aware Network for Shadow Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4743-4752\n} \n}" }, { "title": "CAPTRA: CAtegory-Level Pose Tracking for Rigid and Articulated Objects From Point Clouds", @@ -5084,6 +5432,7 @@ "status": "Poster", "track": "main", "pid": 8434, + "author_site": "Yijia Weng; He Wang; Qiang Zhou; Yuzhe Qin; Yueqi 
Duan; Qingnan Fan; Baoquan Chen; Hao Su; Leonidas J. Guibas", "author": "Yijia Weng; He Wang; Qiang Zhou; Yuzhe Qin; Yueqi Duan; Qingnan Fan; Baoquan Chen; Hao Su; Leonidas J. Guibas", "abstract": "In this work, we tackle the problem of category-level online pose tracking for objects from point cloud sequences. For the first time, we propose a unified framework that can handle 9DoF object pose tracking for novel rigid object instances as well as per-part pose tracking for articulated objects from known categories. Here the 9DoF pose, comprising 6D pose and 3D size, is equivalent to a 3D amodal bounding box representation with free 6D pose. Given the depth point cloud at the current frame and the estimated pose from the last frame, our novel end-to-end pipeline learns to accurately update the pose. Our pipeline is composed of three modules: 1) a pose canonicalization module that normalizes the pose of the input depth point cloud; 2) RotationNet, a module that directly regresses small interframe delta rotations; and 3) CoordinateNet, a module that predicts the normalized coordinates and segmentation, enabling analytical computation of the 3D size and translation. Leveraging the small pose regime in the pose-canonicalized point clouds, our method integrates the best of both worlds by combining dense coordinate prediction and direct rotation regression, thus yielding an end-to-end differentiable pipeline optimized for 9DoF pose accuracy (without using non-differentiable RANSAC). 
Our extensive experiments demonstrate that our method achieves new state-of-the-art performance on category-level rigid object pose and articulated object pose benchmarks at the fastest FPS 12.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Weng_CAPTRA_CAtegory-Level_Pose_Tracking_for_Rigid_and_Articulated_Objects_From_ICCV_2021_paper.pdf", @@ -5105,9 +5454,10 @@ "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu;http://www general-ai.cn;http://www.sdu.edu.cn;https://ucsd.edu;https://ai.tencent.com", "aff_unique_abbr": "PKU;Stanford;;SDU;UCSD;Tencent AI Lab", "aff_campus_unique_index": "1;2;1;1;2;1", - "aff_campus_unique": ";Stanford;La Jolla", + "aff_campus_unique": ";Stanford;San Diego", "aff_country_unique_index": "0;0+1+0;0;1;1;1+0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Weng_2021_ICCV,\n \n author = {\n Weng,\n Yijia and Wang,\n He and Zhou,\n Qiang and Qin,\n Yuzhe and Duan,\n Yueqi and Fan,\n Qingnan and Chen,\n Baoquan and Su,\n Hao and Guibas,\n Leonidas J.\n},\n title = {\n CAPTRA: CAtegory-Level Pose Tracking for Rigid and Articulated Objects From Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13209-13218\n} \n}" }, { "title": "CCT-Net: Category-Invariant Cross-Domain Transfer for Medical Single-to-Multiple Disease Diagnosis", @@ -5115,10 +5465,11 @@ "status": "Poster", "track": "main", "pid": 1930, + "author_site": "Yi Zhou; Lei Huang; Tao Zhou; Ling Shao", "author": "Yi Zhou; Lei Huang; Tao Zhou; Ling Shao", "abstract": "A medical imaging model is usually explored for the diagnosis of a single disease. However, with the expanding demand for multi-disease diagnosis in clinical applications, multi-function solutions need to be investigated. 
Previous works proposed to either exploit different disease labels to conduct transfer learning through fine-tuning, or transfer knowledge across different domains with similar diseases. However, these methods still cannot address the real clinical challenge - a multi-disease model is required but annotations for each disease are not always available. In this paper, we introduce the task of transferring knowledge from single-disease diagnosis (source domain) to enhance multi-disease diagnosis (target domain). A category-invariant cross-domain transfer (CCT) method is proposed to address this single-to-multiple extension. First, for domain-specific task learning, we present a confidence weighted pooling (CWP) to obtain coarse heatmaps for different disease categories. Then, conditioned on these heatmaps, category-invariant feature refinement (CIFR) blocks are proposed to better localize discriminative semantic regions related to the corresponding diseases. The category-invariant characteristic enables transferability from the source domain to the target domain. 
We validate our method in two popular areas: extending diabetic retinopathy to identifying multiple ocular diseases, and extending glioma identification to the diagnosis of other brain tumors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_CCT-Net_Category-Invariant_Cross-Domain_Transfer_for_Medical_Single-to-Multiple_Disease_Diagnosis_ICCV_2021_paper.pdf", - "aff": "School of Computer Science and Engineering, Southeast University, Nanjing, China; SKLSDE, Institute of Arti\ufb01cial Intelligence, Beihang University, Beijing, China; School of Computer Science and Technology, Nanjing University of Science and Technology, China; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE", + "aff": "School of Computer Science and Engineering, Southeast University, Nanjing, China; SKLSDE, Institute of Artificial Intelligence, Beihang University, Beijing, China; School of Computer Science and Technology, Nanjing University of Science and Technology, China; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhou_CCT-Net_Category-Invariant_Cross-Domain_ICCV_2021_supplemental.pdf", @@ -5132,13 +5483,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_CCT-Net_Category-Invariant_Cross-Domain_Transfer_for_Medical_Single-to-Multiple_Disease_Diagnosis_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Southeast University;Beihang University;Nanjing University of Science and Technology;Inception Institute of Artificial Intelligence", - "aff_unique_dep": "School of Computer Science and Engineering;Institute of Arti\ufb01cial Intelligence;School of Computer Science and Technology;", - "aff_unique_url": "https://www.seu.edu.cn/;http://www.buaa.edu.cn;http://www.nust.edu.cn;", - "aff_unique_abbr": "SEU;BUAA;NUST;", + "aff_unique_dep": "School of Computer Science and Engineering;Institute of Artificial 
Intelligence;School of Computer Science and Technology;", + "aff_unique_url": "https://www.seu.edu.cn/;http://www.buaa.edu.cn;;", + "aff_unique_abbr": "SEU;BUAA;;", "aff_campus_unique_index": "0;1;3", "aff_campus_unique": "Nanjing;Beijing;;Abu Dhabi", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Yi and Huang,\n Lei and Zhou,\n Tao and Shao,\n Ling\n},\n title = {\n CCT-Net: Category-Invariant Cross-Domain Transfer for Medical Single-to-Multiple Disease Diagnosis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8260-8270\n} \n}" }, { "title": "CDNet: Centripetal Direction Network for Nuclear Instance Segmentation", @@ -5146,6 +5498,7 @@ "status": "Poster", "track": "main", "pid": 4177, + "author_site": "Hongliang He; Zhongyi Huang; Yao Ding; Guoli Song; Lin Wang; Qian Ren; Pengxu Wei; Zhiqiang Gao; Jie Chen", "author": "Hongliang He; Zhongyi Huang; Yao Ding; Guoli Song; Lin Wang; Qian Ren; Pengxu Wei; Zhiqiang Gao; Jie Chen", "abstract": "Nuclear instance segmentation is a challenging task due to a large number of touching and overlapping nuclei in pathological images. Existing methods cannot effectively recognize the accurate boundary owing to neglecting the relationship between pixels (e.g., direction information). In this paper, we propose a novel Centripetal Direction Network (CDNet) for nuclear instance segmentation. Specifically, we define the centripetal direction feature as a class of adjacent directions pointing to the nuclear center to represent the spatial relationship between pixels within the nucleus. These direction features are then used to construct a direction difference map to represent the similarity within instances and the differences between instances. 
Finally, we propose a direction-guided refinement module, which acts as a plug-and-play module to effectively integrate auxiliary tasks and aggregate the features of different branches. Experiments on MoNuSeg and CPM17 datasets show that CDNet is significantly better than the other methods and achieves the state-of-the-art performance. The code is available at https://github.com/honglianghe/CDNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_CDNet_Centripetal_Direction_Network_for_Nuclear_Instance_Segmentation_ICCV_2021_paper.pdf", @@ -5162,14 +5515,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_CDNet_Centripetal_Direction_Network_for_Nuclear_Instance_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;2;1;0+1;0+1;3;1;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;University of Chinese Academy of Sciences;Sun Yat-sen University", - "aff_unique_dep": "School of Electronic and Computer Engineering;Peng Cheng Laboratory;;", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;University of Chinese Academy of Sciences;Sun Yat-sen University", + "aff_unique_dep": "School of Electronic and Computer Engineering;;;", "aff_unique_url": "http://www.pku.edu.cn;;http://www.ucas.ac.cn;http://www.sysu.edu.cn/", "aff_unique_abbr": "PKU;;UCAS;SYSU", "aff_campus_unique_index": "0+0;0;1;0;0+0;0+0;2;0;0+0", "aff_campus_unique": "Shenzhen;Beijing;Guangzhou", "aff_country_unique_index": "0+0;0;0;0;0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Hongliang and Huang,\n Zhongyi and Ding,\n Yao and Song,\n Guoli and Wang,\n Lin and Ren,\n Qian and Wei,\n Pengxu and Gao,\n Zhiqiang and Chen,\n Jie\n},\n title = {\n CDNet: Centripetal Direction Network for Nuclear Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4026-4035\n} \n}" }, { "title": "CDS: Cross-Domain Self-Supervised Pre-Training", @@ -5177,6 +5531,7 @@ "status": "Poster", "track": "main", "pid": 3048, + "author_site": "Donghyun Kim; Kuniaki Saito; Tae-Hyun Oh; Bryan A. Plummer; Stan Sclaroff; Kate Saenko", "author": "Donghyun Kim; Kuniaki Saito; Tae-Hyun Oh; Bryan A. Plummer; Stan Sclaroff; Kate Saenko", "abstract": "We present a two-stage pre-training approach that improves the generalization ability of standard single-domain pre-training. While standard pre-training on a single large dataset (such as ImageNet) can provide a good initial representation for transfer learning tasks, this approach may result in biased representations that impact the success of learning with new multi-domain data (e.g., different artistic styles) via methods like domain adaptation. We propose a novel pre-training approach called Cross-Domain Self-supervision (CDS), which directly employs unlabeled multi-domain data for downstream domain transfer tasks. Our approach uses self-supervision not only within a single domain but also across domains. In-domain instance discrimination is used to learn discriminative features on new data in a domain-adaptive manner, while cross-domain matching is used to learn domain-invariant features. 
We apply our method as a second pre-training step (after ImageNet pre-training), resulting in a significant target accuracy boost to diverse domain transfer tasks compared to standard one-stage pre-training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_CDS_Cross-Domain_Self-Supervised_Pre-Training_ICCV_2021_paper.pdf", @@ -5200,7 +5555,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;1;0;0;0+0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Donghyun and Saito,\n Kuniaki and Oh,\n Tae-Hyun and Plummer,\n Bryan A. and Sclaroff,\n Stan and Saenko,\n Kate\n},\n title = {\n CDS: Cross-Domain Self-Supervised Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9123-9132\n} \n}" }, { "title": "CLEAR: Clean-Up Sample-Targeted Backdoor in Neural Networks", @@ -5208,6 +5564,7 @@ "status": "Poster", "track": "main", "pid": 3446, + "author_site": "Liuwan Zhu; Rui Ning; Chunsheng Xin; Chonggang Wang; Hongyi Wu", "author": "Liuwan Zhu; Rui Ning; Chunsheng Xin; Chonggang Wang; Hongyi Wu", "abstract": "The data poisoning attack has raised serious security concerns on the safety of deep neural networks since it can lead to neural backdoor that misclassifies certain inputs crafted by an attacker. In particular, the sample-targeted backdoor attack is a new challenge. It targets at one or a few specific samples, called target samples, to misclassify them to a target class. Without a trigger planted in the backdoor model, the existing backdoor detection schemes fail to detect the sample-targeted backdoor as they depend on reverse-engineering the trigger or strong features of the trigger. 
In this paper, we propose a novel scheme to detect and mitigate sample-targeted backdoor attacks. We discover and demonstrate a unique property of the sample-targeted backdoor, which forces a boundary change such that small \"pockets\" are formed around the target sample. Based on this observation, we propose a novel defense mechanism to pinpoint a malicious pocket by \"wrapping\" them into a tight convex hull in the feature space. We design an effective algorithm to search for such a convex hull and remove the backdoor by fine-tuning the model using the identified malicious samples with the corrected label according to the convex hull. The experiments show that the proposed approach is highly efficient for detecting and mitigating a wide range of sample-targeted backdoor attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_CLEAR_Clean-Up_Sample-Targeted_Backdoor_in_Neural_Networks_ICCV_2021_paper.pdf", @@ -5227,11 +5584,12 @@ "aff_unique_norm": "Old Dominion University;InterDigital Communications, Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.odu.edu;https://www.interdigital.com", - "aff_unique_abbr": "ODU;InterDigital", + "aff_unique_abbr": "ODU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Liuwan and Ning,\n Rui and Xin,\n Chunsheng and Wang,\n Chonggang and Wu,\n Hongyi\n},\n title = {\n CLEAR: Clean-Up Sample-Targeted Backdoor in Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16453-16462\n} \n}" }, { "title": "CM-NAS: Cross-Modality Neural Architecture Search for Visible-Infrared Person Re-Identification", @@ -5239,10 +5597,11 @@ "status": "Poster", "track": "main", "pid": 6034, + 
"author_site": "Chaoyou Fu; Yibo Hu; Xiang Wu; Hailin Shi; Tao Mei; Ran He", "author": "Chaoyou Fu; Yibo Hu; Xiang Wu; Hailin Shi; Tao Mei; Ran He", "abstract": "Visible-Infrared person re-identification (VI-ReID) aims to match cross-modality pedestrian images, breaking through the limitation of single-modality person ReID in dark environment. In order to mitigate the impact of large modality discrepancy, existing works manually design various two-stream architectures to separately learn modality-specific and modality-sharable representations. Such a manual design routine, however, highly depends on massive experiments and empirical practice, which is time consuming and labor intensive. In this paper, we systematically study the manually designed architectures, and identify that appropriately separating Batch Normalization (BN) layers is the key to bring a great boost towards cross-modality matching. Based on this observation, the essential objective is to find the optimal separation scheme for each BN layer. To this end, we propose a novel method, named Cross-Modality Neural Architecture Search (CM-NAS). It consists of a BN-oriented search space in which the standard optimization can be fulfilled subject to the cross-modality task. Equipped with the searched architecture, our method outperforms state-of-the-art counterparts in both two benchmarks, improving the Rank-1/mAP by 6.70%/6.13% on SYSU-MM01 and by 12.17%/11.23% on RegDB. 
Code is released at https://github.com/JDAI-CV/CM-NAS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fu_CM-NAS_Cross-Modality_Neural_Architecture_Search_for_Visible-Infrared_Person_Re-Identification_ICCV_2021_paper.pdf", - "aff": "School of Arti\ufb01cial Intelligence, University of Chinese Academy of Sciences+NLPR & CEBSIT & CRIPAC, CASIA; JD AI Research; NLPR & CEBSIT & CRIPAC, CASIA; JD AI Research; JD AI Research; School of Arti\ufb01cial Intelligence, University of Chinese Academy of Sciences+NLPR & CEBSIT & CRIPAC, CASIA", + "aff": "School of Artificial Intelligence, University of Chinese Academy of Sciences+NLPR & CEBSIT & CRIPAC, CASIA; JD AI Research; NLPR & CEBSIT & CRIPAC, CASIA; JD AI Research; JD AI Research; School of Artificial Intelligence, University of Chinese Academy of Sciences+NLPR & CEBSIT & CRIPAC, CASIA", "project": "", "github": "https://github.com/JDAI-CV/CM-NAS", "supp": "", @@ -5255,14 +5614,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fu_CM-NAS_Cross-Modality_Neural_Architecture_Search_for_Visible-Infrared_Person_Re-Identification_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;1;2;2;0+1", - "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;JD", - "aff_unique_dep": "School of Arti\ufb01cial Intelligence;NLPR, CEBSIT, CRIPAC;JD AI Research", - "aff_unique_url": "http://www.ucas.ac.cn;http://www.casia.ac.cn;https://www.jd.com", + "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences Institute of Automation;JD AI Research", + "aff_unique_dep": "School of Artificial Intelligence;NLPR, CEBSIT, CRIPAC;", + "aff_unique_url": "http://www.ucas.ac.cn;http://www.ia.cas.cn;https://www.jd.com", "aff_unique_abbr": "UCAS;CASIA;JD AI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Fu_2021_ICCV,\n \n author = {\n Fu,\n Chaoyou and Hu,\n Yibo and Wu,\n Xiang and Shi,\n Hailin and Mei,\n Tao and He,\n Ran\n},\n title = {\n CM-NAS: Cross-Modality Neural Architecture Search for Visible-Infrared Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11823-11832\n} \n}" }, { "title": "CODEs: Chamfer Out-of-Distribution Examples Against Overconfidence Issue", @@ -5270,6 +5630,7 @@ "status": "Poster", "track": "main", "pid": 2367, + "author_site": "Keke Tang; Dingruibo Miao; Weilong Peng; Jianpeng Wu; Yawen Shi; Zhaoquan Gu; Zhihong Tian; Wenping Wang", "author": "Keke Tang; Dingruibo Miao; Weilong Peng; Jianpeng Wu; Yawen Shi; Zhaoquan Gu; Zhihong Tian; Wenping Wang", "abstract": "Overconfident predictions on out-of-distribution (OOD) samples is a thorny issue for deep neural networks. The key to resolve the OOD overconfidence issue inherently is to build a subset of OOD samples and then suppress predictions on them. This paper proposes the Chamfer OOD examples (CODEs), whose distribution is close to that of in-distribution samples, and thus could be utilized to alleviate the OOD overconfidence issue effectively by suppressing predictions on them. To obtain CODEs, we first generate seed OOD examples via slicing&splicing operations on in-distribution samples from different categories, and then feed them to the Chamfer generative adversarial network for distribution transformation, without accessing to any extra data. Training with suppressing predictions on CODEs is validated to alleviate the OOD overconfidence issue largely without hurting classification accuracy, and outperform the state-of-the-art methods. 
Besides, we demonstrate CODEs are useful for improving OOD detection and classification.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tang_CODEs_Chamfer_Out-of-Distribution_Examples_Against_Overconfidence_Issue_ICCV_2021_paper.pdf", @@ -5286,14 +5647,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tang_CODEs_Chamfer_Out-of-Distribution_Examples_Against_Overconfidence_Issue_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0;0;0;0;0;2+3", - "aff_unique_norm": "Guangzhou University;Pengcheng Laboratory;Texas A&M University;University of Hong Kong", - "aff_unique_dep": ";Peng Cheng Laboratory;;", + "aff_unique_norm": "Guangzhou University;Peng Cheng Laboratory;Texas A&M University;The University of Hong Kong", + "aff_unique_dep": ";;;", "aff_unique_url": "http://www.gzhu.edu.cn;http://www.pcl.ac.cn;https://www.tamu.edu;https://www.hku.hk", "aff_unique_abbr": "GU;PCL;TAMU;HKU", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0;1+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tang_2021_ICCV,\n \n author = {\n Tang,\n Keke and Miao,\n Dingruibo and Peng,\n Weilong and Wu,\n Jianpeng and Shi,\n Yawen and Gu,\n Zhaoquan and Tian,\n Zhihong and Wang,\n Wenping\n},\n title = {\n CODEs: Chamfer Out-of-Distribution Examples Against Overconfidence Issue\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1153-1162\n} \n}" }, { "title": "COMISR: Compression-Informed Video Super-Resolution", @@ -5301,6 +5663,7 @@ "status": "Poster", "track": "main", "pid": 10703, + "author_site": "Yinxiao Li; Pengchong Jin; Feng Yang; Ce Liu; Ming-Hsuan Yang; Peyman Milanfar", "author": "Yinxiao Li; Pengchong Jin; Feng Yang; Ce Liu; Ming-Hsuan Yang; Peyman Milanfar", "abstract": "Most 
video super-resolution methods focus on restoring high-resolution video frames from low-resolution videos without taking into account compression. However, most videos on the web or mobile devices are compressed, and the compression can be severe when the bandwidth is limited. In this paper, we propose a new compression-informed video super-resolution model to restore high-resolution content without introducing artifacts caused by compression. The proposed model consists of three modules for video super-resolution: bi-directional recurrent warping, detail-preserving flow estimation, and Laplacian enhancement. All these three modules are used to deal with compression properties such as the location of the intra-frames in the input and smoothness in the output frames. For thorough performance evaluation, we conducted extensive experiments on standard datasets with a wide range of compression rates, covering many real video use cases. We showed that our method not only recovers high-resolution content on uncompressed frames from the widely-used benchmark datasets, but also achieves state-of-the-art performance in super-resolving compressed videos based on numerous quantitative metrics. We also evaluated the proposed method by simulating streaming from YouTube to demonstrate its effectiveness and robustness. 
The source codes and trained models are available at https://github.com/google-research/google-research/tree/master/comisr.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_COMISR_Compression-Informed_Video_Super-Resolution_ICCV_2021_paper.pdf", @@ -5318,13 +5681,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_COMISR_Compression-Informed_Video_Super-Resolution_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Google", - "aff_unique_dep": "Google", + "aff_unique_dep": "", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yinxiao and Jin,\n Pengchong and Yang,\n Feng and Liu,\n Ce and Yang,\n Ming-Hsuan and Milanfar,\n Peyman\n},\n title = {\n COMISR: Compression-Informed Video Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2543-2552\n} \n}" }, { "title": "COOKIE: Contrastive Cross-Modal Knowledge Sharing Pre-Training for Vision-Language Representation", @@ -5332,6 +5696,7 @@ "status": "Poster", "track": "main", "pid": 8814, + "author_site": "Keyu Wen; Jin Xia; Yuanyuan Huang; Linyang Li; Jiayan Xu; Jie Shao", "author": "Keyu Wen; Jin Xia; Yuanyuan Huang; Linyang Li; Jiayan Xu; Jie Shao", "abstract": "There has been a recent surge of interest in cross-modal pre-training. However, existed approaches pre-train a one-stream model to learn joint vision-language representation, which suffers from calculation explosion when conducting cross-modal retrieval. 
In this work, we propose the Contrastive Cross-Modal Knowledge Sharing Pre-training (COOKIE) method to learn universal text-image representations. There are two key designs in it, one is the weight-sharing transformer on top of the visual and textual encoders to align text and image semantically, the other is three kinds of contrastive learning designed for sharing knowledge between different modalities. Cross-modal knowledge sharing greatly promotes the learning of unimodal representation. Experiments on multi-modal matching tasks including cross-modal retrieval, text matching, and image retrieval show the effectiveness and efficiency of our pre-training framework. Our COOKIE fine-tuned on cross-modal datasets MSCOCO, Flickr30K, and MSRVTT achieves new state-of-the-art results while using only 3/1000 inference time comparing to one-stream models. There are also 5.7 and 3.9 improvements in the task of image retrieval and text matching. Source code will be made public.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wen_COOKIE_Contrastive_Cross-Modal_Knowledge_Sharing_Pre-Training_for_Vision-Language_Representation_ICCV_2021_paper.pdf", @@ -5355,7 +5720,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wen_2021_ICCV,\n \n author = {\n Wen,\n Keyu and Xia,\n Jin and Huang,\n Yuanyuan and Li,\n Linyang and Xu,\n Jiayan and Shao,\n Jie\n},\n title = {\n COOKIE: Contrastive Cross-Modal Knowledge Sharing Pre-Training for Vision-Language Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2208-2217\n} \n}" }, { "title": "COTR: Correspondence Transformer for Matching Across Images", @@ -5363,6 +5729,7 @@ "status": "Poster", "track": "main", "pid": 3943, + "author_site": "Wei Jiang; 
Eduard Trulls; Jan Hosang; Andrea Tagliasacchi; Kwang Moo Yi", "author": "Wei Jiang; Eduard Trulls; Jan Hosang; Andrea Tagliasacchi; Kwang Moo Yi", "abstract": "We propose a novel framework for finding correspondences in images based on a deep neural network that, given two images and a query point in one of them, finds its correspondence in the other. By doing so, one has the option to query only the points of interest and retrieve sparse correspondences, or to query all points in an image and obtain dense mappings. Importantly, in order to capture both local and global priors, and to let our model relate between image regions using the most relevant among said priors, we realize our network using a transformer. At inference time, we apply our correspondence network by recursively zooming in around the estimates, yielding a multi-scale pipeline able to provide highly-accurate correspondences. Our method significantly outperforms the state-of-the-art on both sparse and dense correspondence problems on multiple datasets and tasks, ranging from wide-baseline stereo to optical flow, without any retraining for a specific dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_COTR_Correspondence_Transformer_for_Matching_Across_Images_ICCV_2021_paper.pdf", @@ -5386,7 +5753,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1+0;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Wei and Trulls,\n Eduard and Hosang,\n Jan and Tagliasacchi,\n Andrea and Yi,\n Kwang Moo\n},\n title = {\n COTR: Correspondence Transformer for Matching Across Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6207-6217\n} \n}" }, { "title": "CPF: Learning a Contact 
Potential Field To Model the Hand-Object Interaction", @@ -5394,6 +5762,7 @@ "status": "Poster", "track": "main", "pid": 2919, + "author_site": "Lixin Yang; Xinyu Zhan; Kailin Li; Wenqiang Xu; Jiefeng Li; Cewu Lu", "author": "Lixin Yang; Xinyu Zhan; Kailin Li; Wenqiang Xu; Jiefeng Li; Cewu Lu", "abstract": "Modeling the hand-object (HO) interaction not only requires estimation of the HO pose, but also pays attention to the contact due to their interaction. Significant progress has been made in estimating hand and object separately with deep learning methods, simultaneous HO pose estimation and contact modeling has not yet been fully explored. In this paper, we present an explicit contact representation namely Contact Potential Field (CPF), and a learning-fitting hybrid framework namely MIHO to Modeling the Interaction of Hand and Object. In CPF, we treat each contacting HO vertex pair as a spring-mass system. Hence the whole system forms a potential field with minimal elastic energy at the grasp position. Extensive experiments on the two commonly used benchmarks have demonstrated that our method can achieve state-of-the-art in several reconstruction metrics, and allow us to produce more physically plausible HO pose even when the ground-truth exhibits severe interpenetration or disjointedness. 
Our code is available at https://github.com/lixiny/CPF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_CPF_Learning_a_Contact_Potential_Field_To_Model_the_Hand-Object_ICCV_2021_paper.pdf", @@ -5417,7 +5786,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Lixin and Zhan,\n Xinyu and Li,\n Kailin and Xu,\n Wenqiang and Li,\n Jiefeng and Lu,\n Cewu\n},\n title = {\n CPF: Learning a Contact Potential Field To Model the Hand-Object Interaction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11097-11106\n} \n}" }, { "title": "CPFN: Cascaded Primitive Fitting Networks for High-Resolution Point Clouds", @@ -5425,7 +5795,8 @@ "status": "Poster", "track": "main", "pid": 6977, - "author": "Eric-Tuan L\u00ea; Minhyuk Sung; Duygu Ceylan; Radomir Mech; Tamy Boubekeur; Niloy J. Mitra", + "author_site": "Eric-Tuan Lê; Minhyuk Sung; Duygu Ceylan; Radomir Mech; Tamy Boubekeur; Niloy J. Mitra", + "author": "Eric-Tuan Lê; Minhyuk Sung; Duygu Ceylan; Radomir Mech; Tamy Boubekeur; Niloy J. Mitra", "abstract": "Representing human-made objects as a collection of base primitives has a long history in computer vision and reverse engineering. In the case of high-resolution point cloud scans, the challenge is to be able to detect both large primitives as well as those explaining the detailed parts. While the classical RANSAC approach requires case-specific parameter tuning, state-of-the-art networks are limited by memory consumption of their backbone modules such as PointNet++, and hence fail to detect the fine-scale primitives. 
We present Cascaded Primitive Fitting Networks (CPFN) that relies on an adaptive patch sampling network to assemble detection results of global and local primitive detection networks. As a key enabler, we present a merging formulation that dynamically aggregates the primitives across global and local scales. Our evaluation demonstrates that CPFN improves the state-of-the-art SPFN performance by 13-14% on high-resolution point cloud datasets and specifically improves the detection of fine-scale primitives by 20-22%. Our code is available at: https://github.com/erictuanle/CPFN", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Le_CPFN_Cascaded_Primitive_Fitting_Networks_for_High-Resolution_Point_Clouds_ICCV_2021_paper.pdf", "aff": "University College London; KAIST; Adobe Research; Adobe Research; Adobe Research; University College London+Adobe Research", @@ -5448,7 +5819,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;2;0+2", - "aff_country_unique": "United Kingdom;South Korea;United States" + "aff_country_unique": "United Kingdom;South Korea;United States", + "bibtex": "@InProceedings{Le_2021_ICCV,\n \n author = {\n L\\^e,\n Eric-Tuan and Sung,\n Minhyuk and Ceylan,\n Duygu and Mech,\n Radomir and Boubekeur,\n Tamy and Mitra,\n Niloy J.\n},\n title = {\n CPFN: Cascaded Primitive Fitting Networks for High-Resolution Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7457-7466\n} \n}" }, { "title": "CR-Fill: Generative Image Inpainting With Auxiliary Contextual Reconstruction", @@ -5456,6 +5828,7 @@ "status": "Poster", "track": "main", "pid": 3561, + "author_site": "Yu Zeng; Zhe Lin; Huchuan Lu; Vishal M. Patel", "author": "Yu Zeng; Zhe Lin; Huchuan Lu; Vishal M. 
Patel", "abstract": "Recent deep generative inpainting methods use attention layers to allow the generator to explicitly borrow feature patches from the known region to complete a missing region. Due to the lack of supervision signals for the correspondence between missing regions and known regions, it may fail to find proper reference features, which often leads to artifacts in the results. Also, it computes pair-wise similarity across the entire feature map during inference bringing a significant computational overhead. To address this issue, we propose to teach such patch-borrowing behavior to an attention-free generator by joint training of an auxiliary contextual reconstruction task, which encourages the generated output to be plausible even when reconstructed by surrounding regions. The auxiliary branch can be seen as a learnable loss function, i.e. named as contextual reconstruction (CR) loss, where query-reference feature similarity and reference-based reconstructor are jointly optimized with the inpainting generator. The auxiliary branch (i.e. CR loss) is required only during training, and only the inpainting generator is required during the inference. Experimental results demonstrate that the proposed inpainting model compares favourably against the state-of-the-art in terms of quantitative and visual performance. 
Code is available at https://github.com/zengxianyu/crfill.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zeng_CR-Fill_Generative_Image_Inpainting_With_Auxiliary_Contextual_Reconstruction_ICCV_2021_paper.pdf", @@ -5470,7 +5843,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zeng_CR-Fill_Generative_Image_Inpainting_With_Auxiliary_Contextual_Reconstruction_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zeng_CR-Fill_Generative_Image_Inpainting_With_Auxiliary_Contextual_Reconstruction_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zeng_2021_ICCV,\n \n author = {\n Zeng,\n Yu and Lin,\n Zhe and Lu,\n Huchuan and Patel,\n Vishal M.\n},\n title = {\n CR-Fill: Generative Image Inpainting With Auxiliary Contextual Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14164-14173\n} \n}" }, { "title": "CSG-Stump: A Learning Friendly CSG-Like Representation for Interpretable Shape Parsing", @@ -5478,6 +5852,7 @@ "status": "Poster", "track": "main", "pid": 2600, + "author_site": "Daxuan Ren; Jianmin Zheng; Jianfei Cai; Jiatong Li; Haiyong Jiang; Zhongang Cai; Junzhe Zhang; Liang Pan; Mingyuan Zhang; Haiyu Zhao; Shuai Yi", "author": "Daxuan Ren; Jianmin Zheng; Jianfei Cai; Jiatong Li; Haiyong Jiang; Zhongang Cai; Junzhe Zhang; Liang Pan; Mingyuan Zhang; Haiyu Zhao; Shuai Yi", "abstract": "Generating an interpretable and compact representation of 3D shapes from point clouds is an important and challenging problem. This paper presents CSG-Stump Net, an unsupervised end-to-end network for learning shapes from point clouds and discovering the underlying constituent modeling primitives and operations as well. 
At the core is a three-level structure called CSG-Stump , consisting of a complement layer at the bottom, an intersection layer in the middle, and a union layer at the top. CSG-Stump is proven to be equivalent to CSG in terms of representation, therefore inheriting the interpretable, compact and editable nature of CSG while freeing from CSG's complex tree structures. Particularly, the CSG-Stump has a simple and regular structure, allowing neural networks to give outputs of a constant dimensionality, which makes itself deep-learning friendly. Due to these characteristics of CSG-Stump, CSG-Stump Net achieves superior results compared to previous CSG-based methods and generates much more appealing shapes, as confirmed by extensive experiment", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ren_CSG-Stump_A_Learning_Friendly_CSG-Like_Representation_for_Interpretable_Shape_Parsing_ICCV_2021_paper.pdf", @@ -5501,7 +5876,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;2;0+1;1;0+0;0+0;0;0+0;0;1", - "aff_country_unique": "Singapore;China;Australia" + "aff_country_unique": "Singapore;China;Australia", + "bibtex": "@InProceedings{Ren_2021_ICCV,\n \n author = {\n Ren,\n Daxuan and Zheng,\n Jianmin and Cai,\n Jianfei and Li,\n Jiatong and Jiang,\n Haiyong and Cai,\n Zhongang and Zhang,\n Junzhe and Pan,\n Liang and Zhang,\n Mingyuan and Zhao,\n Haiyu and Yi,\n Shuai\n},\n title = {\n CSG-Stump: A Learning Friendly CSG-Like Representation for Interpretable Shape Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12478-12487\n} \n}" }, { "title": "CTRL-C: Camera Calibration TRansformer With Line-Classification", @@ -5509,6 +5885,7 @@ "status": "Poster", "track": "main", "pid": 7206, + "author_site": "Jinwoo Lee; Hyunsung Go; Hyunjoon Lee; Sunghyun Cho; Minhyuk Sung; Junho Kim", "author": 
"Jinwoo Lee; Hyunsung Go; Hyunjoon Lee; Sunghyun Cho; Minhyuk Sung; Junho Kim", "abstract": "Single image camera calibration is the task of estimating the camera parameters from a single input image, such as the vanishing points, focal length, and horizon line. In this work, we propose Camera calibration TRansformer with Line-Classification (CTRL-C), an end-to-end neural network-based approach to single image camera calibration, which directly estimates the camera parameters from an image and a set of line segments. Our network adopts the transformer architecture to capture the global structure of an image with multi-modal inputs in an end-to-end manner. We also propose an auxiliary task of line classification to train the network to extract the global geometric information from lines effectively. Our experiments demonstrate that CTRL-C outperforms the previous state-of-the-art methods on the Google Street View and SUN360 benchmark datasets. Code is available at https://github.com/jwlee-vcl/CTRL-C.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_CTRL-C_Camera_Calibration_TRansformer_With_Line-Classification_ICCV_2021_paper.pdf", @@ -5532,7 +5909,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Jinwoo and Go,\n Hyunsung and Lee,\n Hyunjoon and Cho,\n Sunghyun and Sung,\n Minhyuk and Kim,\n Junho\n},\n title = {\n CTRL-C: Camera Calibration TRansformer With Line-Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16228-16237\n} \n}" }, { "title": "CaT: Weakly Supervised Object Detection With Category Transfer", @@ -5540,6 +5918,7 @@ "status": "Poster", "track": "main", "pid": 11454, + "author_site": "Tianyue Cao; 
Lianyu Du; Xiaoyun Zhang; Siheng Chen; Ya Zhang; Yan-Feng Wang", "author": "Tianyue Cao; Lianyu Du; Xiaoyun Zhang; Siheng Chen; Ya Zhang; Yan-Feng Wang", "abstract": "A large gap exists between fully-supervised object detection and weakly-supervised object detection. To narrow this gap, some methods consider knowledge transfer from additional fully-supervised dataset. But these methods do not fully exploit discriminative category information in the fully-supervised dataset, thus causing low mAP. To solve this issue, we propose a novel category transfer framework for weakly supervised object detection. The intuition is to fully leverage both visually-discriminative and semantically-correlated category information in the fully-supervised dataset to enhance the object-classification ability of a weakly-supervised detector. To handle overlapping category transfer, we propose a double-supervision mean teacher to gather common category information and bridge the domain gap between two datasets. To handle non-overlapping category transfer, we propose a semantic graph convolutional network to promote the aggregation of semantic features between correlated categories. Experiments are conducted with Pascal VOC 2007 as the target weakly-supervised dataset and COCO as the source fully-supervised dataset. Our category transfer framework achieves 63.5% mAP and 80.3% CorLoc with 5 overlapping categories between two datasets, which outperforms the state-of-the-art methods. 
Codes are avaliable at https://github.com/MediaBrain-SJTU/CaT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_CaT_Weakly_Supervised_Object_Detection_With_Category_Transfer_ICCV_2021_paper.pdf", @@ -5563,7 +5942,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Tianyue and Du,\n Lianyu and Zhang,\n Xiaoyun and Chen,\n Siheng and Zhang,\n Ya and Wang,\n Yan-Feng\n},\n title = {\n CaT: Weakly Supervised Object Detection With Category Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3070-3079\n} \n}" }, { "title": "Calibrated Adversarial Refinement for Stochastic Semantic Segmentation", @@ -5571,6 +5951,7 @@ "status": "Poster", "track": "main", "pid": 4372, + "author_site": "Elias Kassapis; Georgi Dikov; Deepak K. Gupta; Cedric Nugteren", "author": "Elias Kassapis; Georgi Dikov; Deepak K. Gupta; Cedric Nugteren", "abstract": "In semantic segmentation tasks, input images can often have more than one plausible interpretation, thus allowing for multiple valid labels. To capture such ambiguities, recent work has explored the use of probabilistic networks that can learn a distribution over predictions. However, these do not necessarily represent the empirical distribution accurately. In this work, we present a strategy for learning a calibrated predictive distribution over semantic maps, where the probability associated with each prediction reflects its ground truth correctness likelihood. 
To this end, we propose a novel two-stage, cascaded approach for calibrated adversarial refinement: (i) a standard segmentation network is trained with categorical cross-entropy to predict a pixelwise probability distribution over semantic classes and (ii) an adversarially trained stochastic network is used to model the inter-pixel correlations to refine the output of the first network into coherent samples. Importantly, to calibrate the refinement network and prevent mode collapse, the expectation of the samples in the second stage is matched to the probabilities predicted in the first. We demonstrate the versatility and robustness of the approach by achieving state-of-the-art results on the multigrader LIDC dataset and on a modified Cityscapes dataset with injected ambiguities. In addition, we show that the core design can be adapted to other tasks requiring learning a calibrated predictive distribution by experimenting on a toy regression dataset. We provide an open source implementation of our method at https://github.com/EliasKassapis/CARSSS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kassapis_Calibrated_Adversarial_Refinement_for_Stochastic_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -5594,7 +5975,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Amsterdam", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Kassapis_2021_ICCV,\n \n author = {\n Kassapis,\n Elias and Dikov,\n Georgi and Gupta,\n Deepak K. 
and Nugteren,\n Cedric\n},\n title = {\n Calibrated Adversarial Refinement for Stochastic Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7057-7067\n} \n}" }, { "title": "Calibrated and Partially Calibrated Semi-Generalized Homographies", @@ -5602,7 +5984,8 @@ "status": "Poster", "track": "main", "pid": 9025, - "author": "Snehal Bhayani; Torsten Sattler; Daniel Barath; Patrik Beliansky; Janne Heikkil\u00e4; Zuzana Kukelova", + "author_site": "Snehal Bhayani; Torsten Sattler; Daniel Barath; Patrik Beliansky; Janne Heikkilä; Zuzana Kukelova", + "author": "Snehal Bhayani; Torsten Sattler; Daniel Barath; Patrik Beliansky; Janne Heikkilä; Zuzana Kukelova", "abstract": "In this paper, we propose the first minimal solutions for estimating the semi-generalized homography given a perspective and a generalized camera. The proposed solvers use five 2D-2D image point correspondences induced by a scene plane. One group of solvers assumes the perspective camera to be fully calibrated, while the other estimates the unknown focal length together with the absolute pose parameters. This setup is particularly important in structure-from-motion and visual localization pipelines, where a new camera is localized in each step with respect to a set of known cameras and 2D-3D correspondences might not be available. Thanks to a clever parametrization and the elimination ideal method, our solvers only need to solve a univariate polynomial of degree five or three, respectively a system of polynomial equations in two variables. 
All proposed solvers are stable and efficient as demonstrated by a number of synthetic and real-world experiments.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhayani_Calibrated_and_Partially_Calibrated_Semi-Generalized_Homographies_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -5616,7 +5999,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bhayani_Calibrated_and_Partially_Calibrated_Semi-Generalized_Homographies_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bhayani_Calibrated_and_Partially_Calibrated_Semi-Generalized_Homographies_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Bhayani_2021_ICCV,\n \n author = {\n Bhayani,\n Snehal and Sattler,\n Torsten and Barath,\n Daniel and Beliansky,\n Patrik and Heikkil\\\"a,\n Janne and Kukelova,\n Zuzana\n},\n title = {\n Calibrated and Partially Calibrated Semi-Generalized Homographies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5936-5945\n} \n}" }, { "title": "Calibrating Concepts and Operations: Towards Symbolic Reasoning on Real Images", @@ -5624,6 +6008,7 @@ "status": "Poster", "track": "main", "pid": 6608, + "author_site": "Zhuowan Li; Elias Stengel-Eskin; Yixiao Zhang; Cihang Xie; Quan Hung Tran; Benjamin Van Durme; Alan Yuille", "author": "Zhuowan Li; Elias Stengel-Eskin; Yixiao Zhang; Cihang Xie; Quan Hung Tran; Benjamin Van Durme; Alan Yuille", "abstract": "While neural symbolic methods demonstrate impressive performance in visual question answering on synthetic images, their performance suffers on real images. We identify that the long-tail distribution of visual concepts and unequal importance of reasoning steps in real data are the two key obstacles that limit the models' real-world potentials.
To address these challenges, we propose a new paradigm, Calibrating Concepts and Operations (CCO), which enables neural symbolic models to capture underlying data characteristics and to reason with hierarchical importance. Specifically, we introduce an executor with learnable concept embedding magnitudes for handling distribution imbalance, and an operation calibrator for highlighting important operations and suppressing redundant ones. Our experiments show CCO substantially boosts the performance of neural symbolic methods on real images. By evaluating models on the real world dataset GQA, CCO helps the neural symbolic method NSCL outperforms its vanilla counterpart by 9.1% (from 47.0% to 56.1%); this result also largely reduces the performance gap between symbolic and non-symbolic methods. Additionally, we create a perturbed test set for better understanding and analyzing model performance on real images. Code is available at https://lizw14.github.io/project/ccosr.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Calibrating_Concepts_and_Operations_Towards_Symbolic_Reasoning_on_Real_Images_ICCV_2021_paper.pdf", @@ -5638,7 +6023,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Calibrating_Concepts_and_Operations_Towards_Symbolic_Reasoning_on_Real_Images_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Calibrating_Concepts_and_Operations_Towards_Symbolic_Reasoning_on_Real_Images_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Zhuowan and Stengel-Eskin,\n Elias and Zhang,\n Yixiao and Xie,\n Cihang and Tran,\n Quan Hung and Van Durme,\n Benjamin and Yuille,\n Alan\n},\n title = {\n Calibrating Concepts and Operations: Towards Symbolic Reasoning on Real Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2021\n},\n pages = {\n 14910-14919\n} \n}" }, { "title": "Camera Distortion-Aware 3D Human Pose Estimation in Video With Optimization-Based Meta-Learning", @@ -5646,6 +6032,7 @@ "status": "Poster", "track": "main", "pid": 7688, + "author_site": "Hanbyel Cho; Yooshin Cho; Jaemyung Yu; Junmo Kim", "author": "Hanbyel Cho; Yooshin Cho; Jaemyung Yu; Junmo Kim", "abstract": "Existing 3D human pose estimation algorithms trained on distortion-free datasets suffer performance drop when applied to new scenarios with a specific camera distortion. In this paper, we propose a simple yet effective model for 3D human pose estimation in video that can quickly adapt to any distortion environment by utilizing MAML, a representative optimization-based meta-learning algorithm. We consider a sequence of 2D keypoints in a particular distortion as a single task of MAML. However, due to the absence of a large-scale dataset in a distorted environment, we propose an efficient method to generate synthetic distorted data from undistorted 2D keypoints. For the evaluation, we assume two practical testing situations depending on whether a motion capture sensor is available or not. In particular, we propose Inference Stage Optimization using bone-length symmetry and consistency. Extensive evaluation shows that our proposed method successfully adapts to various degrees of distortion in the testing phase and outperforms the existing state-of-the-art approaches. 
The proposed method is useful in practice because it does not require camera calibration and additional computations in a testing set-up.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cho_Camera_Distortion-Aware_3D_Human_Pose_Estimation_in_Video_With_Optimization-Based_ICCV_2021_paper.pdf", @@ -5669,7 +6056,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cho_2021_ICCV,\n \n author = {\n Cho,\n Hanbyel and Cho,\n Yooshin and Yu,\n Jaemyung and Kim,\n Junmo\n},\n title = {\n Camera Distortion-Aware 3D Human Pose Estimation in Video With Optimization-Based Meta-Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11169-11178\n} \n}" }, { "title": "Can Scale-Consistent Monocular Depth Be Learned in a Self-Supervised Scale-Invariant Manner?", @@ -5677,6 +6065,7 @@ "status": "Poster", "track": "main", "pid": 2514, + "author_site": "Lijun Wang; Yifan Wang; Linzhao Wang; Yunlong Zhan; Ying Wang; Huchuan Lu", "author": "Lijun Wang; Yifan Wang; Linzhao Wang; Yunlong Zhan; Ying Wang; Huchuan Lu", "abstract": "Geometric constraints are shown to enforce scale consistency and remedy the scale ambiguity issue in self-supervised monocular depth estimation. Meanwhile, scale-invariant losses focus on learning relative depth, leading to accurate relative depth prediction. To combine the best of both worlds, we learn scale-consistent self-supervised depth in a scale-invariant manner. Towards this goal, we present a scale-aware geometric (SAG) loss, which enforces scale consistency through point cloud alignment. 
Compared to prior arts, SAG loss takes relative scale into consideration during relative motion estimation, enabling more precise alignment and explicit supervision for scale inference. In addition, a novel two-stream architecture for depth estimation is designed, which disentangles scale from depth estimation and allows depth to be learned in a scale-invariant manner. The integration of SAG loss and two-stream network enables more consistent scale inference and more accurate relative depth estimation. Our method achieves state-of-the-art performance under both scale-invariant and scale-dependent evaluation settings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Can_Scale-Consistent_Monocular_Depth_Be_Learned_in_a_Self-Supervised_Scale-Invariant_ICCV_2021_paper.pdf", @@ -5693,14 +6082,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Can_Scale-Consistent_Monocular_Depth_Be_Learned_in_a_Self-Supervised_Scale-Invariant_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;1;0+2", - "aff_unique_norm": "Dalian University of Technology;Huawei;Pengcheng Laboratory", - "aff_unique_dep": ";Huawei Technologies;Peng Cheng Lab", + "aff_unique_norm": "Dalian University of Technology;Huawei Technologies;Peng Cheng Lab", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.dlut.edu.cn/;https://www.huawei.com;", "aff_unique_abbr": "DUT;Huawei;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Lijun and Wang,\n Yifan and Wang,\n Linzhao and Zhan,\n Yunlong and Wang,\n Ying and Lu,\n Huchuan\n},\n title = {\n Can Scale-Consistent Monocular Depth Be Learned in a Self-Supervised Scale-Invariant Manner?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2021\n},\n pages = {\n 12727-12736\n} \n}" }, { "title": "Can Shape Structure Features Improve Model Robustness Under Diverse Adversarial Settings?", @@ -5708,6 +6098,7 @@ "status": "Poster", "track": "main", "pid": 11187, + "author_site": "Mingjie Sun; Zichao Li; Chaowei Xiao; Haonan Qiu; Bhavya Kailkhura; Mingyan Liu; Bo Li", "author": "Mingjie Sun; Zichao Li; Chaowei Xiao; Haonan Qiu; Bhavya Kailkhura; Mingyan Liu; Bo Li", "abstract": "Recent studies show that convolutional neural networks (CNNs) are vulnerable under various settings, including adversarial attacks, common corruptions, and backdoor attacks. Motivated by the findings that human visual system pays more attention to global structure (e.g., shapes) for recognition while CNNs are biased towards local texture features in images, in this work we aim to analyze whether \"edge features\" could improve the recognition robustness in these scenarios, and if so, to what extent? To answer these questions and systematically evaluate the global structure features, we focus on shape features and pro-pose two edge-enabled pipelines EdgeNetRob and Edge-GANRob, forcing the CNNs to rely more on edge features. Specifically, EdgeNetRob and EdgeGANRob first explicitly extract shape structure features from a given image via an edge detection algorithm. Then EdgeNetRob trains down-stream learning tasks directly on the extracted edge features, while EdgeGANRob reconstructs a new image by re-filling the texture information with a trained generative adversarial network (GANs). To reduce the sensitivity of edge detection algorithms to perturbations, we additionally propose a robust edge detection approach Robust Canny based on vanilla Canny. Based on our evaluation, we find that EdgeNetRob can help boost model robustness under differ-ent attack scenarios at the cost of the clean model accuracy. EdgeGANRob, on the other hand, is able to improve the clean model accuracy compared to EdgeNetRob while preserving robustness. 
This shows that given such edge features, how to leverage them matters for robustness, and it also depends on data properties. Our systematic studies on edge structure features under different settings will shed light on future robust feature exploration and optimization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Can_Shape_Structure_Features_Improve_Model_Robustness_Under_Diverse_Adversarial_ICCV_2021_paper.pdf", @@ -5724,14 +6115,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sun_Can_Shape_Structure_Features_Improve_Model_Robustness_Under_Diverse_Adversarial_ICCV_2021_paper.html", "aff_unique_index": "0;1;2+3;4;5;6;7", - "aff_unique_norm": "Carnegie Mellon University;University of California, Santa Cruz;NVIDIA;Arizona State University;Nanyang Technological University;Lawrence Livermore National Laboratory;University of Michigan;University of Illinois Urbana-Champaign", - "aff_unique_dep": ";;NVIDIA Corporation;;;;;", + "aff_unique_norm": "Carnegie Mellon University;University of California, Santa Cruz;NVIDIA Corporation;Arizona State University;Nanyang Technological University;Lawrence Livermore National Laboratory;University of Michigan;University of Illinois at Urbana-Champaign", + "aff_unique_dep": ";;;;;;;", "aff_unique_url": "https://www.cmu.edu;https://www.ucsc.edu;https://www.nvidia.com;https://www.asu.edu;https://www.ntu.edu.sg;https://www.llnl.gov;https://www.umich.edu;https://www illinois.edu", "aff_unique_abbr": "CMU;UCSC;NVIDIA;ASU;NTU;LLNL;UM;UIUC", "aff_campus_unique_index": "1;;2;3", "aff_campus_unique": ";Santa Cruz;Ann Arbor;Urbana-Champaign", "aff_country_unique_index": "0;0;0+0;1;0;0;0", - "aff_country_unique": "United States;Singapore" + "aff_country_unique": "United States;Singapore", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Mingjie and Li,\n Zichao and Xiao,\n Chaowei and Qiu,\n Haonan and Kailkhura,\n Bhavya and Liu,\n Mingyan and Li,\n Bo\n},\n title = 
{\n Can Shape Structure Features Improve Model Robustness Under Diverse Adversarial Settings?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7526-7535\n} \n}" }, { "title": "CanvasVAE: Learning To Generate Vector Graphic Documents", @@ -5739,6 +6131,7 @@ "status": "Poster", "track": "main", "pid": 8643, + "author_site": "Kota Yamaguchi", "author": "Kota Yamaguchi", "abstract": "Vector graphic documents present visual elements in a resolution free, compact format and are often seen in creative applications. In this work, we attempt to learn a generative model of vector graphic documents. We define vector graphic documents by a multi-modal set of attributes associated to a canvas and a sequence of visual elements such as shapes, images, or texts, and train variational auto-encoders to learn the representation of the documents. We collect a new dataset of design templates from an online service that features complete document structure including occluded elements. 
In experiments, we show that our model, named CanvasVAE, constitutes a strong baseline for generative modeling of vector graphic documents.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yamaguchi_CanvasVAE_Learning_To_Generate_Vector_Graphic_Documents_ICCV_2021_paper.pdf", @@ -5760,7 +6153,8 @@ "aff_unique_url": "https://www.cyberagent.co.jp", "aff_unique_abbr": "CA", "aff_country_unique_index": "0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Yamaguchi_2021_ICCV,\n \n author = {\n Yamaguchi,\n Kota\n},\n title = {\n CanvasVAE: Learning To Generate Vector Graphic Documents\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5481-5489\n} \n}" }, { "title": "Cascade Image Matting With Deformable Graph Refinement", @@ -5768,6 +6162,7 @@ "status": "Poster", "track": "main", "pid": 8508, + "author_site": "Zijian Yu; Xuhui Li; Huijuan Huang; Wen Zheng; Li Chen", "author": "Zijian Yu; Xuhui Li; Huijuan Huang; Wen Zheng; Li Chen", "abstract": "Image matting refers to the estimation of the opacity of foreground objects. It requires correct contours and fine details of foreground objects for the matting results. To better accomplish human image matting tasks, we propose the Cascade Image Matting Network with Deformable Graph Refinement(CasDGR), which can automatically predict precise alpha mattes from single human images without any additional inputs. We adopt a network cascade architecture to perform matting from low-to-high resolution, which corresponds to coarse-to-fine optimization. We also introduce the Deformable Graph Refinement (DGR) module based on graph neural networks (GNNs) to overcome the limitations of convolutional neural networks (CNNs). The DGR module can effectively capture long-range relations and obtain more global and local information to help produce finer alpha mattes. 
We also reduce the computation complexity of the DGR module by dynamically predicting the neighbors and apply DGR module to higher-resolution features. Experimental results demonstrate the ability of our CasDGR to achieve state-of-the-art performance on synthetic datasets and produce good results on real human images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Cascade_Image_Matting_With_Deformable_Graph_Refinement_ICCV_2021_paper.pdf", @@ -5791,7 +6186,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Zijian and Li,\n Xuhui and Huang,\n Huijuan and Zheng,\n Wen and Chen,\n Li\n},\n title = {\n Cascade Image Matting With Deformable Graph Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7167-7176\n} \n}" }, { "title": "Causal Attention for Unbiased Visual Recognition", @@ -5799,6 +6195,7 @@ "status": "Poster", "track": "main", "pid": 2534, + "author_site": "Tan Wang; Chang Zhou; Qianru Sun; Hanwang Zhang", "author": "Tan Wang; Chang Zhou; Qianru Sun; Hanwang Zhang", "abstract": "Attention module does not always help deep models learn causal features that are robust in any confounding context, e.g., a foreground object feature is invariant to different backgrounds. This is because the confounders trick the attention to capture spurious correlations that benefit the prediction when the training and testing data are IID (identical & independent distribution); while harm the prediction when the data are OOD (out-of-distribution). 
The sole fundamental solution to learn causal attention is by causal intervention, which requires additional annotations of the confounders, e.g., a \"dog\" model is learned within \"grass+dog\" and \"road+dog\" respectively, so the \"grass\" and \"road\" contexts will no longer confound the \"dog\" recognition. However, such annotation is not only prohibitively expensive, but also inherently problematic, as the confounders are elusive in nature. In this paper, we propose a causal attention module (CaaM) that self-annotates the confounders in unsupervised fashion. In particular, multiple CaaMs can be stacked and integrated in conventional attention CNN and self-attention Vision Transformer. In OOD settings, deep models with CaaM outperform those without it significantly; even in IID settings, the attention localization is also improved by CaaM, showing a great potential in applications that require robust visual saliency. Codes are available at https://github.com/Wangt-CN/CaaM.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Causal_Attention_for_Unbiased_Visual_Recognition_ICCV_2021_paper.pdf", @@ -5822,7 +6219,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Tan and Zhou,\n Chang and Sun,\n Qianru and Zhang,\n Hanwang\n},\n title = {\n Causal Attention for Unbiased Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3091-3100\n} \n}" }, { "title": "Change Is Everywhere: Single-Temporal Supervised Object Change Detection in Remote Sensing Imagery", @@ -5830,6 +6228,7 @@ "status": "Poster", "track": "main", "pid": 2467, + "author_site": "Zhuo Zheng; Ailong Ma; Liangpei Zhang; Yanfei Zhong", "author": "Zhuo 
Zheng; Ailong Ma; Liangpei Zhang; Yanfei Zhong", "abstract": "For high spatial resolution (HSR) remote sensing images, bitemporal supervised learning always dominates change detection using many pairwise labeled bitemporal images. However, it is very expensive and time-consuming to pairwise label large-scale bitemporal HSR remote sensing images. In this paper, we propose single-temporal supervised learning (STAR) for change detection from a new perspective of exploiting object changes in unpaired images as supervisory signals. STAR enables us to train a high-accuracy change detector only using unpaired labeled images and generalize to real-world bitemporal images. To evaluate the effectiveness of STAR, we design a simple yet effective change detector called ChangeStar, which can reuse any deep semantic segmentation architecture by the ChangeMixin module. The comprehensive experimental results show that ChangeStar outperforms the baseline with a large margin under single-temporal supervision and achieves superior performance under bitemporal supervision. 
Code is available at https://github.com/Z-Zheng/ChangeStar.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Change_Is_Everywhere_Single-Temporal_Supervised_Object_Change_Detection_in_Remote_ICCV_2021_paper.pdf", @@ -5853,7 +6252,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Zhuo and Ma,\n Ailong and Zhang,\n Liangpei and Zhong,\n Yanfei\n},\n title = {\n Change Is Everywhere: Single-Temporal Supervised Object Change Detection in Remote Sensing Imagery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15193-15202\n} \n}" }, { "title": "Channel Augmented Joint Learning for Visible-Infrared Recognition", @@ -5861,6 +6261,7 @@ "status": "Poster", "track": "main", "pid": 3219, + "author_site": "Mang Ye; Weijian Ruan; Bo Du; Mike Zheng Shou", "author": "Mang Ye; Weijian Ruan; Bo Du; Mike Zheng Shou", "abstract": "This paper introduces a powerful channel augmented joint learning strategy for the visible-infrared recognition problem. For data augmentation, most existing methods directly adopt the standard operations designed for single-modality visible images, and thus do not fully consider the imagery properties in visible to infrared matching. Our basic idea is to homogenously generate color-irrelevant images by randomly exchanging the color channels. It can be seamlessly integrated into existing augmentation operations without modifying the network, consistently improving the robustness against color variations. Incorporated with a random erasing strategy, it further greatly enriches the diversity by simulating random occlusions. 
For cross-modality metric learning, we design an enhanced channel-mixed learning strategy to simultaneously handle the intra- and cross-modality variations with squared difference for stronger discriminability. Besides, a channel-augmented joint learning strategy is further developed to explicitly optimize the outputs of augmented images. Extensive experiments with insightful analysis on two visible-infrared recognition tasks show that the proposed strategies consistently improve the accuracy. Without auxiliary information, it improves the state-of-the-art Rank-1/mAP by 14.59%/13.00% on the large-scale SYSU-MM01 dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_Channel_Augmented_Joint_Learning_for_Visible-Infrared_Recognition_ICCV_2021_paper.pdf", @@ -5884,7 +6285,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Mang and Ruan,\n Weijian and Du,\n Bo and Shou,\n Mike Zheng\n},\n title = {\n Channel Augmented Joint Learning for Visible-Infrared Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13567-13576\n} \n}" }, { "title": "Channel-Wise Knowledge Distillation for Dense Prediction", @@ -5892,6 +6294,7 @@ "status": "Poster", "track": "main", "pid": 6529, + "author_site": "Changyong Shu; Yifan Liu; Jianfei Gao; Zheng Yan; Chunhua Shen", "author": "Changyong Shu; Yifan Liu; Jianfei Gao; Zheng Yan; Chunhua Shen", "abstract": "Knowledge distillation (KD) has been proven a simple and effective tool for training compact dense prediction models. Lightweight student networks are trained by extra supervision transferred from large teacher networks. 
Most previous KD variants for dense prediction tasks align the activation maps from the student and teacher network in the spatial domain, typically by normalizing the activation values on each spatial location and minimizing point-wise and/or pair-wise discrepancy. Different from the previous methods, here we propose to normalize the activation map of each channel to obtain a soft probability map. By simply minimizing the Kullback--Leibler (KL) divergence between the channel-wise probability map of the two networks, the distillation process pays more attention to the most salient regions of each channel, which are valuable for dense prediction tasks. We conduct experiments on a few dense prediction tasks, including semantic segmentation and object detection. Experiments demonstrate that our proposed method outperforms state-of-the-art distillation methods considerably, and can require less computational cost during training. In particular, we improve the RetinaNet detector (ResNet50backbone) by3.4%in mAP on the COCO dataset and spent (ResNet18 backbone) by5.81%in mIoU on the cityscapes dataset. 
Code is available at: https://git.io/Distiller.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shu_Channel-Wise_Knowledge_Distillation_for_Dense_Prediction_ICCV_2021_paper.pdf", @@ -5906,7 +6309,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shu_Channel-Wise_Knowledge_Distillation_for_Dense_Prediction_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shu_Channel-Wise_Knowledge_Distillation_for_Dense_Prediction_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shu_2021_ICCV,\n \n author = {\n Shu,\n Changyong and Liu,\n Yifan and Gao,\n Jianfei and Yan,\n Zheng and Shen,\n Chunhua\n},\n title = {\n Channel-Wise Knowledge Distillation for Dense Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5311-5320\n} \n}" }, { "title": "Channel-Wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition", @@ -5914,6 +6318,7 @@ "status": "Poster", "track": "main", "pid": 3649, + "author_site": "Yuxin Chen; Ziqi Zhang; Chunfeng Yuan; Bing Li; Ying Deng; Weiming Hu", "author": "Yuxin Chen; Ziqi Zhang; Chunfeng Yuan; Bing Li; Ying Deng; Weiming Hu", "abstract": "Graph convolutional networks (GCNs) have been widely used and achieved remarkable results in skeleton-based action recognition. In GCNs, graph topology dominates feature aggregation and therefore is the key to extracting representative features. In this work, we propose a novel Channel-wise Topology Refinement Graph Convolution (CTR-GC) to dynamically learn different topologies and effectively aggregate joint features in different channels for skeleton-based action recognition. 
The proposed CTR-GC models channel-wise topologies through learning a shared topology as a generic prior for all channels and refining it with channel-specific correlations for each channel. Our refinement method introduces few extra parameters and significantly reduces the difficulty of modeling channel-wise topologies. Furthermore, via reformulating graph convolutions into a unified form, we find that CTR-GC relaxes strict constraints of graph convolutions, leading to stronger representation capability. Combining CTR-GC with temporal modeling modules, we develop a powerful graph convolutional network named CTR-GCN which notably outperforms state-of-the-art methods on the NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Channel-Wise_Topology_Refinement_Graph_Convolution_for_Skeleton-Based_Action_Recognition_ICCV_2021_paper.pdf", @@ -5937,7 +6342,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Yuxin and Zhang,\n Ziqi and Yuan,\n Chunfeng and Li,\n Bing and Deng,\n Ying and Hu,\n Weiming\n},\n title = {\n Channel-Wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13359-13368\n} \n}" }, { "title": "Cherry-Picking Gradients: Learning Low-Rank Embeddings of Visual Data via Differentiable Cross-Approximation", @@ -5945,6 +6351,7 @@ "status": "Poster", "track": "main", "pid": 3486, + "author_site": "Mikhail Usvyatsov; Anastasia Makarova; Rafael Ballester-Ripoll; Maxim Rakhuba; Andreas Krause; Konrad Schindler", "author": "Mikhail Usvyatsov; Anastasia Makarova; Rafael Ballester-Ripoll; Maxim Rakhuba; Andreas Krause; 
Konrad Schindler", "abstract": "We propose an end-to-end trainable framework that processes large-scale visual data tensors by looking at a fraction of their entries only. Our method combines a neural network encoder with a tensor train decomposition to learn a low-rank latent encoding, coupled with cross-approximation (CA) to learn the representation through a subset of the original samples. CA is an adaptive sampling algorithm that is native to tensor decompositions and avoids working with the full high-resolution data explicitly. Instead, it actively selects local representative samples that we fetch out-of-core and on demand. The required number of samples grows only logarithmically with the size of the input. Our implicit representation of the tensor in the network enables processing large grids that could not be otherwise tractable in their uncompressed form. The proposed approach is particularly useful for large-scale multidimensional grid data (e.g., 3D tomography), and for tasks that require context over a large receptive field (e.g., predicting the medical condition of entire organs). 
The code is available at https://github.com/aelphy/c-pic.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Usvyatsov_Cherry-Picking_Gradients_Learning_Low-Rank_Embeddings_of_Visual_Data_via_Differentiable_ICCV_2021_paper.pdf", @@ -5959,7 +6366,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Usvyatsov_Cherry-Picking_Gradients_Learning_Low-Rank_Embeddings_of_Visual_Data_via_Differentiable_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Usvyatsov_Cherry-Picking_Gradients_Learning_Low-Rank_Embeddings_of_Visual_Data_via_Differentiable_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Usvyatsov_2021_ICCV,\n \n author = {\n Usvyatsov,\n Mikhail and Makarova,\n Anastasia and Ballester-Ripoll,\n Rafael and Rakhuba,\n Maxim and Krause,\n Andreas and Schindler,\n Konrad\n},\n title = {\n Cherry-Picking Gradients: Learning Low-Rank Embeddings of Visual Data via Differentiable Cross-Approximation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11426-11435\n} \n}" }, { "title": "Class Semantics-Based Attention for Action Detection", @@ -5967,10 +6375,11 @@ "status": "Poster", "track": "main", "pid": 9109, + "author_site": "Deepak Sridhar; Niamul Quader; Srikanth Muralidharan; Yaoxin Li; Peng Dai; Juwei Lu", "author": "Deepak Sridhar; Niamul Quader; Srikanth Muralidharan; Yaoxin Li; Peng Dai; Juwei Lu", "abstract": "Action localization networks are often structured as a feature encoder sub-network and a localization sub-network, where the feature encoder learns to transform an input video to features that are useful for the localization sub-network to generate reliable action proposals. 
While some of the encoded features may be more useful for generating action proposals, prior action localization approaches do not include any attention mechanism that enables the localization sub-network to attend more to the more important features. In this paper, we propose a novel attention mechanism, the Class Semantics-based Attention (CSA), that learns from the temporal distribution of semantics of action classes present in an input video to find the importance scores of the encoded features, which are used to provide attention to the more useful encoded features. We demonstrate on two popular action detection datasets that incorporating our novel attention mechanism provides considerable performance gains on competitive action detection models (e.g., around 6.2% improvement over BMN action detection baseline to obtain 47.5% mAP on the THUMOS-14 dataset), and a new state-of-the-art of 36.25% mAP on the ActivityNet v1.3 dataset. Further, the CSA localization model family which includes BMN-CSA, was part of the second-placed submission at the 2021 ActivityNet action localization challenge. Our attention mechanism outperforms prior self-attention modules such as the squeeze-and-excitation in action detection task. 
We also observe that our attention mechanism is complementary to such self-attention modules in that performance improvements are seen when both are used together.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sridhar_Class_Semantics-Based_Attention_for_Action_Detection_ICCV_2021_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab, Canada; Huawei Noah\u2019s Ark Lab, Canada; Huawei Noah\u2019s Ark Lab, Canada; Huawei Noah\u2019s Ark Lab, Canada+University of Waterloo; Huawei Noah\u2019s Ark Lab, Canada; Huawei Noah\u2019s Ark Lab, Canada", + "aff": "Huawei Noah’s Ark Lab, Canada; Huawei Noah’s Ark Lab, Canada; Huawei Noah’s Ark Lab, Canada; Huawei Noah’s Ark Lab, Canada+University of Waterloo; Huawei Noah’s Ark Lab, Canada; Huawei Noah’s Ark Lab, Canada", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Sridhar_Class_Semantics-Based_Attention_ICCV_2021_supplemental.pdf", @@ -5983,14 +6392,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sridhar_Class_Semantics-Based_Attention_for_Action_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0+1;0;0", - "aff_unique_norm": "Huawei;University of Waterloo", - "aff_unique_dep": "Huawei Noah\u2019s Ark Lab;", + "aff_unique_norm": "Huawei Noah’s Ark Lab;University of Waterloo", + "aff_unique_dep": ";", "aff_unique_url": "https://www.huawei.com/en/ai/noahs-ark-lab;https://uwaterloo.ca", "aff_unique_abbr": "HNAL;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Sridhar_2021_ICCV,\n \n author = {\n Sridhar,\n Deepak and Quader,\n Niamul and Muralidharan,\n Srikanth and Li,\n Yaoxin and Dai,\n Peng and Lu,\n Juwei\n},\n title = {\n Class Semantics-Based Attention for Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13739-13748\n} \n}" }, { "title": "Class-Incremental Learning for Action Recognition in Videos", @@ -5998,6 +6408,7 @@ "status": "Poster", "track": "main", "pid": 6577, + "author_site": "Jaeyoo Park; Minsoo Kang; Bohyung Han", "author": "Jaeyoo Park; Minsoo Kang; Bohyung Han", "abstract": "We tackle catastrophic forgetting problem in the context of class-incremental learning for video recognition, which has not been explored actively despite the popularity of continual learning. Our framework addresses this challenging task by introducing time-channel importance maps and exploiting the importance maps for learning the representations of incoming examples via knowledge distillation. We also incorporate a regularization scheme in our objective function, which encourages individual features obtained from different time steps in a video to be uncorrelated and eventually improves accuracy by alleviating catastrophic forgetting. 
We evaluate the proposed approach on brand-new splits of class-incremental action recognition benchmarks constructed upon the UCF101, HMDB51, and Something-Something V2 datasets, and demonstrate the effectiveness of our algorithm in comparison to the existing continual learning methods that are originally designed for image data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Class-Incremental_Learning_for_Action_Recognition_in_Videos_ICCV_2021_paper.pdf", @@ -6021,7 +6432,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Jaeyoo and Kang,\n Minsoo and Han,\n Bohyung\n},\n title = {\n Class-Incremental Learning for Action Recognition in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13698-13707\n} \n}" }, { "title": "Click To Move: Controlling Video Generation With Sparse Motion", @@ -6029,10 +6441,11 @@ "status": "Poster", "track": "main", "pid": 10116, - "author": "Pierfrancesco Ardino; Marco De Nadai; Bruno Lepri; Elisa Ricci; St\u00e9phane Lathuili\u00e8re", + "author_site": "Pierfrancesco Ardino; Marco De Nadai; Bruno Lepri; Elisa Ricci; Stéphane Lathuilière", + "author": "Pierfrancesco Ardino; Marco De Nadai; Bruno Lepri; Elisa Ricci; Stéphane Lathuilière", "abstract": "This paper introduces Click to Move (C2M), a novel framework for video generation where the user can control the motion of the synthesized video through mouse clicks specifying simple object trajectories of the key objects in the scene. Our model receives as input an initial frame, its corresponding segmentation map and the sparse motion vectors encoding the input provided by the user. 
It outputs a plausible video sequence starting from the given frame and with a motion that is consistent with user input. Notably, our proposed deep architecture incorporates a Graph Convolution Network (GCN) modelling the movements of all the objects in the scene in a holistic manner and effectively combining the sparse user motion information and image features. Experimental results show that C2M outperforms existing methods on two publicly available datasets, thus demonstrating the effectiveness of our GCN framework at modelling object interactions. The source code is publicly available at https://github.com/PierfrancescoArdino/C2M.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ardino_Click_To_Move_Controlling_Video_Generation_With_Sparse_Motion_ICCV_2021_paper.pdf", - "aff": "University of Trento + Fondazione Bruno Kessler; Fondazione Bruno Kessler; Fondazione Bruno Kessler; University of Trento + Fondazione Bruno Kessler; LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris", + "aff": "University of Trento + Fondazione Bruno Kessler; Fondazione Bruno Kessler; Fondazione Bruno Kessler; University of Trento + Fondazione Bruno Kessler; LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris", "project": "", "github": "https://github.com/PierfrancescoArdino/C2M", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ardino_Click_To_Move_ICCV_2021_supplemental.zip", @@ -6045,14 +6458,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ardino_Click_To_Move_Controlling_Video_Generation_With_Sparse_Motion_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;1;0+1;2", - "aff_unique_norm": "University of Trento;Fondazione Bruno Kessler;T\u00e9l\u00e9com Paris", + "aff_unique_norm": "University of Trento;Fondazione Bruno Kessler;Télécom Paris", "aff_unique_dep": ";;LTCI", "aff_unique_url": "https://www.unitn.it;https://www.fbk.eu;https://www.telecom-paris.fr", - "aff_unique_abbr": 
"UniTN;FBK;T\u00e9l\u00e9com Paris", + "aff_unique_abbr": "UniTN;FBK;Télécom Paris", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0+0;0;0;0+0;1", - "aff_country_unique": "Italy;France" + "aff_country_unique": "Italy;France", + "bibtex": "@InProceedings{Ardino_2021_ICCV,\n \n author = {\n Ardino,\n Pierfrancesco and De Nadai,\n Marco and Lepri,\n Bruno and Ricci,\n Elisa and Lathuili\\`ere,\n St\\'ephane\n},\n title = {\n Click To Move: Controlling Video Generation With Sparse Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14749-14758\n} \n}" }, { "title": "Clothing Status Awareness for Long-Term Person Re-Identification", @@ -6060,6 +6474,7 @@ "status": "Poster", "track": "main", "pid": 9554, + "author_site": "Yan Huang; Qiang Wu; JingSong Xu; Yi Zhong; ZhaoXiang Zhang", "author": "Yan Huang; Qiang Wu; JingSong Xu; Yi Zhong; ZhaoXiang Zhang", "abstract": "Long-Term person re-identification (LT-reID) exposes extreme challenges because of the longer time gaps between two recording footages where a person is likely to change clothing. There are two types of approaches for LT-reID: biometrics-based approach and data adaptation based approach. The former one is to seek clothing irrelevant biometric features. However, seeking high quality biometric feature is the main concern. The latter one adopts fine-tuning strategy by using data with significant clothing change. However, the performance is compromised when it is applied to cases without clothing change. This work argues that these approaches in fact are not aware of clothing status (i.e., change or no-change) of a pedestrian. Instead, they blindly assume all footages of a pedestrian have different clothes. 
To tackle this issue, a Regularization via Clothing Status Awareness Network (RCSANet) is proposed to regularize descriptions of a pedestrian by embedding the clothing status awareness. Consequently, the description can be enhanced to maintain the best ID discriminative feature while improving its robustness to real-world LT-reID where both clothing-change case and no-clothing-change case exist. Experiments show that RCSANet performs reasonably well on three LT-reID datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Clothing_Status_Awareness_for_Long-Term_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -6083,7 +6498,8 @@ "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Sydney;;Beijing", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Yan and Wu,\n Qiang and Xu,\n JingSong and Zhong,\n Yi and Zhang,\n ZhaoXiang\n},\n title = {\n Clothing Status Awareness for Long-Term Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11895-11904\n} \n}" }, { "title": "Cloud Transformers: A Universal Approach to Point Cloud Processing Tasks", @@ -6091,6 +6507,7 @@ "status": "Poster", "track": "main", "pid": 8848, + "author_site": "Kirill Mazur; Victor Lempitsky", "author": "Kirill Mazur; Victor Lempitsky", "abstract": "We present a new versatile building block for deep point cloud processing architectures that is equally suited for diverse tasks. This building block combines the ideas of spatial transformers and multi-view convolutional networks with the efficiency of standard convolutional layers in two and three-dimensional dense grids. 
The new block operates via multiple parallel heads, whereas each head differentiably rasterizes feature representations of individual points into a low-dimensional space, and then uses dense convolution to propagate information across points. The results of the processing of individual heads are then combined together resulting in the update of point features. Using the new block, we build architectures for both discriminative (point cloud segmentation, point cloud classification) and generative (point cloud inpainting and image-based point cloud reconstruction) tasks. The resulting architectures achieve state-of-the-art performance for these tasks, demonstrating the versatility of the new block for point cloud processing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mazur_Cloud_Transformers_A_Universal_Approach_to_Point_Cloud_Processing_Tasks_ICCV_2021_paper.pdf", @@ -6107,14 +6524,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mazur_Cloud_Transformers_A_Universal_Approach_to_Point_Cloud_Processing_Tasks_ICCV_2021_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "Samsung;Skolkovo Institute of Science and Technology", + "aff_unique_norm": "Samsung AI Center;Skolkovo Institute of Science and Technology", "aff_unique_dep": "AI Center;", "aff_unique_url": "https://www.samsung.com/global/innovation/ai-research/;https://www.skoltech.ru", "aff_unique_abbr": "Samsung AI;Skoltech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Moscow;", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Mazur_2021_ICCV,\n \n author = {\n Mazur,\n Kirill and Lempitsky,\n Victor\n},\n title = {\n Cloud Transformers: A Universal Approach to Point Cloud Processing Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n 
pages = {\n 10715-10724\n} \n}" }, { "title": "Cluster-Promoting Quantization With Bit-Drop for Minimizing Network Quantization Loss", @@ -6122,6 +6540,7 @@ "status": "Poster", "track": "main", "pid": 7526, + "author_site": "Jung Hyun Lee; Jihun Yun; Sung Ju Hwang; Eunho Yang", "author": "Jung Hyun Lee; Jihun Yun; Sung Ju Hwang; Eunho Yang", "abstract": "Network quantization, which aims to reduce the bit-lengths of the network weights and activations, has emerged for their deployments to resource-limited devices. Although recent studies have successfully discretized a full-precision network, they still incur large quantization errors after training, thus giving rise to a significant performance gap between a full-precision network and its quantized counterpart. In this work, we propose a novel quantization method for neural networks, Cluster-Promoting Quantization (CPQ) that finds the optimal quantization grids while naturally encouraging the underlying full-precision weights to gather around those quantization grids cohesively during training. This property of CPQ is thanks to our two main ingredients that enable differentiable quantization: i) the use of the categorical distribution designed by a specific probabilistic parametrization in the forward pass and ii) our proposed multi-class straight-through estimator (STE) in the backward pass. Since our second component, multi-class STE, is intrinsically biased, we additionally propose a new bit-drop technique, DropBits, that revises the standard dropout regularization to randomly drop bits instead of neurons. As a natural extension of DropBits, we further introduce the way of learning heterogeneous quantization levels to find proper bit-length for each layer by imposing an additional regularization on DropBits. 
We experimentally validate our method on various benchmark datasets and network architectures, and also support a new hypothesis for quantization: learning heterogeneous quantization levels outperforms the case using the same but fixed quantization levels from scratch.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Cluster-Promoting_Quantization_With_Bit-Drop_for_Minimizing_Network_Quantization_Loss_ICCV_2021_paper.pdf", @@ -6145,7 +6564,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Jung Hyun and Yun,\n Jihun and Hwang,\n Sung Ju and Yang,\n Eunho\n},\n title = {\n Cluster-Promoting Quantization With Bit-Drop for Minimizing Network Quantization Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5370-5379\n} \n}" }, { "title": "Clustering by Maximizing Mutual Information Across Views", @@ -6153,10 +6573,11 @@ "status": "Poster", "track": "main", "pid": 9567, + "author_site": "Kien Do; Truyen Tran; Svetha Venkatesh", "author": "Kien Do; Truyen Tran; Svetha Venkatesh", "abstract": "We propose a novel framework for image clustering that incorporates joint representation learning and clustering. Our method consists of two heads that share the same backbone network - a \"representation learning\" head and a \"clustering\" head. The \"representation learning\" head captures fine-grained patterns of objects at the instance level which serve as clues for the \"clustering\" head to extract coarse-grain information that separates objects into clusters. The whole model is trained in an end-to-end manner by minimizing the weighted sum of two sample-oriented contrastive losses applied to the outputs of the two heads. 
To ensure that the contrastive loss corresponding to the \"clustering\" head is optimal, we introduce a novel critic function called \"log-of-dot-product\". Extensive experimental results demonstrate that our method significantly outperforms state-of-the-art single-stage clustering methods across a variety of image datasets, improving over the best baseline by about 5-7% in accuracy on CIFAR10/20, STL10, and ImageNet-Dogs. Further, the \"two-stage\" variant of our method also achieves better results than baselines on three challenging ImageNet subsets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Do_Clustering_by_Maximizing_Mutual_Information_Across_Views_ICCV_2021_paper.pdf", - "aff": "Applied Arti\ufb01cial Intelligence Institute (A2I2), Deakin University, Geelong, Australia; Applied Arti\ufb01cial Intelligence Institute (A2I2), Deakin University, Geelong, Australia; Applied Arti\ufb01cial Intelligence Institute (A2I2), Deakin University, Geelong, Australia", + "aff": "Applied Artificial Intelligence Institute (A2I2), Deakin University, Geelong, Australia; Applied Artificial Intelligence Institute (A2I2), Deakin University, Geelong, Australia; Applied Artificial Intelligence Institute (A2I2), Deakin University, Geelong, Australia", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Do_Clustering_by_Maximizing_ICCV_2021_supplemental.pdf", @@ -6170,13 +6591,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Do_Clustering_by_Maximizing_Mutual_Information_Across_Views_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Deakin University", - "aff_unique_dep": "Applied Arti\ufb01cial Intelligence Institute (A2I2)", + "aff_unique_dep": "Applied Artificial Intelligence Institute (A2I2)", "aff_unique_url": "https://www.deakin.edu.au", "aff_unique_abbr": "Deakin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Geelong", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Do_2021_ICCV,\n \n author = {\n Do,\n Kien and Tran,\n Truyen and Venkatesh,\n Svetha\n},\n title = {\n Clustering by Maximizing Mutual Information Across Views\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9928-9938\n} \n}" }, { "title": "Co-Scale Conv-Attentional Image Transformers", @@ -6184,6 +6606,7 @@ "status": "Poster", "track": "main", "pid": 9887, + "author_site": "Weijian Xu; Yifan Xu; Tyler Chang; Zhuowen Tu", "author": "Weijian Xu; Yifan Xu; Tyler Chang; Zhuowen Tu", "abstract": "In this paper, we present Co-scale conv-attentional image Transformers (CoaT), a Transformer-based image classifier equipped with co-scale and conv-attentional mechanisms. First, the co-scale mechanism maintains the integrity of Transformers' encoder branches at individual scales, while allowing representations learned at different scales to effectively communicate with each other; we design a series of serial and parallel blocks to realize the co-scale mechanism. Second, we devise a conv-attentional mechanism by realizing a relative position embedding formulation in the factorized attention module with an efficient convolution-like implementation. CoaT empowers image Transformers with enriched multi-scale and contextual modeling capabilities. On ImageNet, relatively small CoaT models attain superior classification results compared with similar-sized convolutional neural networks and image/vision Transformers. 
The effectiveness of CoaT's backbone is also illustrated on object detection and instance segmentation, demonstrating its applicability to downstream computer vision tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Co-Scale_Conv-Attentional_Image_Transformers_ICCV_2021_paper.pdf", @@ -6207,7 +6630,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Weijian and Xu,\n Yifan and Chang,\n Tyler and Tu,\n Zhuowen\n},\n title = {\n Co-Scale Conv-Attentional Image Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9981-9990\n} \n}" }, { "title": "Co2L: Contrastive Continual Learning", @@ -6215,6 +6639,7 @@ "status": "Poster", "track": "main", "pid": 9812, + "author_site": "Hyuntak Cha; Jaeho Lee; Jinwoo Shin", "author": "Hyuntak Cha; Jaeho Lee; Jinwoo Shin", "abstract": "Recent breakthroughs in self-supervised learning show that such algorithms learn visual representations that can be transferred better to unseen tasks than cross-entropy based methods which rely on task-specific supervision. In this paper, we found that the similar holds in the continual learning context: contrastively learned representations are more robust against the catastrophic forgetting than ones trained with the cross-entropy objective. Based on this novel observation, we propose a rehearsal-based continual learning algorithm that focuses on continually learning and maintaining transferable representations. More specifically, the proposed scheme (1) learns representations using the contrastive learning objective, and (2) preserves learned representations using a self-supervised distillation step. 
We conduct extensive experimental validations under popular benchmark image classification datasets, where our method sets the new state-of-the-art performance. Source code is available at https://github.com/chaht01/Co2L.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cha_Co2L_Contrastive_Continual_Learning_ICCV_2021_paper.pdf", @@ -6238,7 +6663,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cha_2021_ICCV,\n \n author = {\n Cha,\n Hyuntak and Lee,\n Jaeho and Shin,\n Jinwoo\n},\n title = {\n Co2L: Contrastive Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9516-9525\n} \n}" }, { "title": "CoMatch: Semi-Supervised Learning With Contrastive Graph Regularization", @@ -6246,6 +6672,7 @@ "status": "Poster", "track": "main", "pid": 6465, + "author_site": "Junnan Li; Caiming Xiong; Steven C.H. Hoi", "author": "Junnan Li; Caiming Xiong; Steven C.H. Hoi", "abstract": "Semi-supervised learning has been an effective paradigm for leveraging unlabeled data to reduce the reliance on labeled data. We propose CoMatch, a new semi-supervised learning method that unifies dominant approaches and addresses their limitations. CoMatch jointly learns two representations of the training data, their class probabilities and low-dimensional embeddings. The two representations interact with each other to jointly evolve. The embeddings impose a smoothness constraint on the class probabilities to improve the pseudo-labels, whereas the pseudo-labels regularize the structure of the embeddings through graph-based contrastive learning. CoMatch achieves state-of-the-art performance on multiple datasets. It achieves substantial accuracy improvements on the label-scarce CIFAR-10 and STL-10. 
On ImageNet with 1% labels, CoMatch achieves a top-1 accuracy of 66.0%, outperforming FixMatch by 12.6%. Furthermore, CoMatch achieves better representation learning performance on downstream tasks, outperforming both supervised learning and self-supervised learning. Code and pre-trained models are available at https://github.com/salesforce/CoMatch/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_CoMatch_Semi-Supervised_Learning_With_Contrastive_Graph_Regularization_ICCV_2021_paper.pdf", @@ -6269,7 +6696,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Junnan and Xiong,\n Caiming and Hoi,\n Steven C.H.\n},\n title = {\n CoMatch: Semi-Supervised Learning With Contrastive Graph Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9475-9484\n} \n}" }, { "title": "Coarsely-Labeled Data for Better Few-Shot Transfer", @@ -6277,6 +6705,7 @@ "status": "Poster", "track": "main", "pid": 6170, + "author_site": "Cheng Perng Phoo; Bharath Hariharan", "author": "Cheng Perng Phoo; Bharath Hariharan", "abstract": "Few-shot learning is based on the premise that labels are expensive, especially when they are fine-grained and require expertise. But coarse labels might be easy to acquire and thus abundant. We present a representation learning approach - PAS that allows few-shot learners to leverage coarsely-labeled data available before evaluation. Inspired by self-training, we label the additional data using a teacher trained on the base dataset and filter the teacher's prediction based on the coarse labels; a new student representation is then trained on the base dataset and the pseudo-labeled dataset. 
PAS is able to produce a representation that consistently and significantly outperforms the baselines in 3 different datasets. Code is available at https://github.com/cpphoo/PAS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Phoo_Coarsely-Labeled_Data_for_Better_Few-Shot_Transfer_ICCV_2021_paper.pdf", @@ -6300,7 +6729,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Phoo_2021_ICCV,\n \n author = {\n Phoo,\n Cheng Perng and Hariharan,\n Bharath\n},\n title = {\n Coarsely-Labeled Data for Better Few-Shot Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9052-9061\n} \n}" }, { "title": "CodeNeRF: Disentangled Neural Radiance Fields for Object Categories", @@ -6308,6 +6738,7 @@ "status": "Poster", "track": "main", "pid": 3951, + "author_site": "Wonbong Jang; Lourdes Agapito", "author": "Wonbong Jang; Lourdes Agapito", "abstract": "CodeNeRF is an implicit 3D neural representation that learns the variation of object shapes and textures across a category and can be trained, from a set of posed images, to synthesize novel views of unseen objects. Unlike the original NeRF, which is scene specific, CodeNeRF learns to disentangle shape and texture by learning separate embeddings. At test time, given a single unposed image of an unseen object, CodeNeRF jointly estimates camera viewpoint, and shape and appearance codes via optimization. Unseen objects can be reconstructed from a single image, and then rendered from new viewpoints or their shape and texture edited by varying the latent codes. We conduct experiments on the SRN benchmark, which show that CodeNeRF generalises well to unseen objects and achieves on-par performance with methods that require known camera pose at test time. 
Our results on real-world images demonstrate that CodeNeRF can bridge the sim-to-real gap. Project page: https://github.com/wayne1123/code-nerf", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jang_CodeNeRF_Disentangled_Neural_Radiance_Fields_for_Object_Categories_ICCV_2021_paper.pdf", @@ -6331,7 +6762,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Jang_2021_ICCV,\n \n author = {\n Jang,\n Wonbong and Agapito,\n Lourdes\n},\n title = {\n CodeNeRF: Disentangled Neural Radiance Fields for Object Categories\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12949-12958\n} \n}" }, { "title": "Collaborative Learning With Disentangled Features for Zero-Shot Domain Adaptation", @@ -6339,6 +6771,7 @@ "status": "Poster", "track": "main", "pid": 9927, + "author_site": "Won Young Jhoo; Jae-Pil Heo", "author": "Won Young Jhoo; Jae-Pil Heo", "abstract": "Typical domain adaptation techniques aim to transfer label information from a label-rich source domain to a label-scarce target domain in the same label space. However, it is often hard to get even the unlabeled target domain data of a task of interest. In such a case, we can capture the domain shift between the source domain and target domain from an unseen task and transfer it to the task of interest, which is known as zero-shot domain adaptation (ZSDA). Existing state-of-the-art methods for ZSDA attempted to generate target domain data. However, training such generative models causes significant computational overhead and is hardly optimized. 
In this paper, we propose a novel ZSDA method that learns a task-agnostic domain shift by collaborative training of domain-invariant semantic features and task-invariant domain features via adversarial learning. Meanwhile, the spatial attention map is learned from disentangled feature representations to selectively emphasize the domain-specific salient parts of the domain-invariant features. Experimental results show that our ZSDA method achieves state-of-the-art performance on several benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jhoo_Collaborative_Learning_With_Disentangled_Features_for_Zero-Shot_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -6362,7 +6795,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jhoo_2021_ICCV,\n \n author = {\n Jhoo,\n Won Young and Heo,\n Jae-Pil\n},\n title = {\n Collaborative Learning With Disentangled Features for Zero-Shot Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8896-8905\n} \n}" }, { "title": "Collaborative Optimization and Aggregation for Decentralized Domain Generalization and Adaptation", @@ -6370,6 +6804,7 @@ "status": "Poster", "track": "main", "pid": 9123, + "author_site": "Guile Wu; Shaogang Gong", "author": "Guile Wu; Shaogang Gong", "abstract": "Contemporary domain generalization (DG) and multi-source unsupervised domain adaptation (UDA) methods mostly collect data from multiple domains together for joint optimization. However, this centralized training paradigm poses a threat to data privacy and is not applicable when data are non-shared across domains. 
In this work, we propose a new approach called Collaborative Optimization and Aggregation (COPA), which aims at optimizing a generalized target model for decentralized DG and UDA, where data from different domains are non-shared and private. Our base model consists of a domain-invariant feature extractor and an ensemble of domain-specific classifiers. In an iterative learning process, we optimize a local model for each domain, and then centrally aggregate local feature extractors and assemble domain-specific classifiers to construct a generalized global model, without sharing data from different domains. To improve generalization of feature extractors, we employ hybrid batch-instance normalization and collaboration of frozen classifiers. For better decentralized UDA, we further introduce a prediction agreement mechanism to overcome local disparities towards central model aggregation. Extensive experiments on five DG and UDA benchmark datasets show that COPA is capable of achieving comparable performance against the state-of-the-art DG and UDA methods without the need for centralized data collection in model training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Collaborative_Optimization_and_Aggregation_for_Decentralized_Domain_Generalization_and_Adaptation_ICCV_2021_paper.pdf", @@ -6393,7 +6828,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Guile and Gong,\n Shaogang\n},\n title = {\n Collaborative Optimization and Aggregation for Decentralized Domain Generalization and Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6484-6493\n} \n}" }, { "title": "Collaborative Unsupervised Visual Representation Learning From 
Decentralized Data", @@ -6401,6 +6837,7 @@ "status": "Poster", "track": "main", "pid": 1976, + "author_site": "Weiming Zhuang; Xin Gan; Yonggang Wen; Shuai Zhang; Shuai Yi", "author": "Weiming Zhuang; Xin Gan; Yonggang Wen; Shuai Zhang; Shuai Yi", "abstract": "Unsupervised representation learning has achieved outstanding performances using centralized data available on the Internet. However, the increasing awareness of privacy protection limits sharing of decentralized unlabeled image data that grows explosively in multiple parties (e.g. mobile phones and cameras). As such, a natural problem is how to leverage these data to learn visual representations for downstream tasks while preserving data privacy. To address this problem, we propose a novel federated unsupervised learning framework, FedU. In this framework, each party trains models from unlabeled data independently using contrastive learning with an online network and a target network. Then, a central server aggregates trained models and updates clients' models with the aggregated global model. It preserves data privacy as each party only has access to its raw data. Decentralized data among multiple parties is normally non-independent and identically distributed (non-IID), which leads to performance degradation. To tackle this challenge, we propose two simple but effective methods: (1) We design the communication protocol to upload only the encoders of online networks for server aggregation and update them with the aggregated encoder. (2) We introduce a new module to dynamically decide how to update the predictors based on the degree of divergence caused by non-IID. The predictor is the other component of the online network. Extensive experiments and ablations demonstrate the effectiveness and significance of FedU. 
It outperforms training with only one party by over 5% and other methods by over 14% in linear and semi-supervised evaluation on non-IID data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhuang_Collaborative_Unsupervised_Visual_Representation_Learning_From_Decentralized_Data_ICCV_2021_paper.pdf", @@ -6424,7 +6861,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;1;1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Zhuang_2021_ICCV,\n \n author = {\n Zhuang,\n Weiming and Gan,\n Xin and Wen,\n Yonggang and Zhang,\n Shuai and Yi,\n Shuai\n},\n title = {\n Collaborative Unsupervised Visual Representation Learning From Decentralized Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4912-4921\n} \n}" }, { "title": "Collaborative and Adversarial Learning of Focused and Dispersive Representations for Semi-Supervised Polyp Segmentation", @@ -6432,6 +6870,7 @@ "status": "Poster", "track": "main", "pid": 6815, + "author_site": "Huisi Wu; Guilian Chen; Zhenkun Wen; Jing Qin", "author": "Huisi Wu; Guilian Chen; Zhenkun Wen; Jing Qin", "abstract": "Automatic polyp segmentation from colonoscopy images is an essential step in computer aided diagnosis for colorectal cancer. Most of polyp segmentation methods reported in recent years are based on fully supervised deep learning. However, annotation for polyp images by physicians during the diagnosis is time-consuming and costly. In this paper, we present a novel semi-supervised polyp segmentation via collaborative and adversarial learning of focused and dispersive representations learning model, where focused and dispersive extraction module are used to deal with the diversity of location and shape of polyps. 
In addition, confidence maps produced by a discriminator in an adversarial training framework shows the effectiveness of leveraging unlabeled data and improving the performance of segmentation network. Consistent regularization is further employed to optimize the segmentation networks to strengthen the representation of the outputs of focused and dispersive extraction module. We also propose an auxiliary adversarial learning method to better leverage unlabeled examples to further improve semantic segmentation accuracy. We conduct extensive experiments on two famous polyp datasets: Kvasir-SEG and CVC-Clinic DB. Experimental results demonstrate the effectiveness of the proposed model, consistently outperforming state-of-the-art semi-supervised segmentation models based on adversarial training and even some advanced fully supervised models. Codes will be released upon publication.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Collaborative_and_Adversarial_Learning_of_Focused_and_Dispersive_Representations_for_ICCV_2021_paper.pdf", @@ -6446,7 +6885,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Collaborative_and_Adversarial_Learning_of_Focused_and_Dispersive_Representations_for_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Collaborative_and_Adversarial_Learning_of_Focused_and_Dispersive_Representations_for_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Huisi and Chen,\n Guilian and Wen,\n Zhenkun and Qin,\n Jing\n},\n title = {\n Collaborative and Adversarial Learning of Focused and Dispersive Representations for Semi-Supervised Polyp Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3489-3498\n} \n}" }, { "title": "Collaging Class-Specific GANs for Semantic Image 
Synthesis", @@ -6454,6 +6894,7 @@ "status": "Poster", "track": "main", "pid": 6297, + "author_site": "Yuheng Li; Yijun Li; Jingwan Lu; Eli Shechtman; Yong Jae Lee; Krishna Kumar Singh", "author": "Yuheng Li; Yijun Li; Jingwan Lu; Eli Shechtman; Yong Jae Lee; Krishna Kumar Singh", "abstract": "We propose a new approach for high resolution semantic image synthesis. It consists of one base image generator and multiple class-specific generators. The base generator generates high quality images based on a segmentation map. To further improve the quality of different objects, we create a bank of Generative Adversarial Networks (GANs) by separately training class-specific models. This has several benefits including -- dedicated weights for each class; centrally aligned data for each model; additional training data from other sources, potential of higher resolution and quality; and easy manipulation of a specific object in the scene. Experiments show that our approach can generate high quality images in high resolution while having flexibility of object-level control by using class-specific generators.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Collaging_Class-Specific_GANs_for_Semantic_Image_Synthesis_ICCV_2021_paper.pdf", @@ -6468,7 +6909,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Collaging_Class-Specific_GANs_for_Semantic_Image_Synthesis_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Collaging_Class-Specific_GANs_for_Semantic_Image_Synthesis_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yuheng and Li,\n Yijun and Lu,\n Jingwan and Shechtman,\n Eli and Lee,\n Yong Jae and Singh,\n Krishna Kumar\n},\n title = {\n Collaging Class-Specific GANs for Semantic Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2021\n},\n pages = {\n 14418-14427\n} \n}" }, { "title": "Common Objects in 3D: Large-Scale Learning and Evaluation of Real-Life 3D Category Reconstruction", @@ -6476,6 +6918,7 @@ "status": "Poster", "track": "main", "pid": 6639, + "author_site": "Jeremy Reizenstein; Roman Shapovalov; Philipp Henzler; Luca Sbordone; Patrick Labatut; David Novotny", "author": "Jeremy Reizenstein; Roman Shapovalov; Philipp Henzler; Luca Sbordone; Patrick Labatut; David Novotny", "abstract": "Traditional approaches for learning 3D object categories have been predominantly trained and evaluated on synthetic datasets due to the unavailability of real 3D-annotated category-centric data. Our main goal is to facilitate advances in this field by collecting real-world data in a magnitude similar to the existing synthetic counterparts. The principal contribution of this work is thus a large-scale dataset, called Common Objects in 3D, with real multi-view images of object categories annotated with camera poses and ground truth 3D point clouds. The dataset contains a total of 1.5 million frames from nearly 19,000 videos capturing objects from 50 MS-COCO categories and, as such, it is significantly larger than alternatives both in terms of the number of categories and objects. We exploit this new dataset to conduct one of the first large-scale \"in-the-wild\" evaluations of several new-view-synthesis and category-centric 3D reconstruction methods. 
Finally, we contribute NerFormer - a novel neural rendering method that leverages the powerful Transformer to reconstruct an object given a small number of its views.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Reizenstein_Common_Objects_in_3D_Large-Scale_Learning_and_Evaluation_of_Real-Life_ICCV_2021_paper.pdf", @@ -6492,14 +6935,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Reizenstein_Common_Objects_in_3D_Large-Scale_Learning_and_Evaluation_of_Real-Life_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0;0", - "aff_unique_norm": "Meta;University College London", + "aff_unique_norm": "Facebook;University College London", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.ucl.ac.uk", "aff_unique_abbr": "FAIR;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Reizenstein_2021_ICCV,\n \n author = {\n Reizenstein,\n Jeremy and Shapovalov,\n Roman and Henzler,\n Philipp and Sbordone,\n Luca and Labatut,\n Patrick and Novotny,\n David\n},\n title = {\n Common Objects in 3D: Large-Scale Learning and Evaluation of Real-Life 3D Category Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10901-10911\n} \n}" }, { "title": "Complementary Patch for Weakly Supervised Semantic Segmentation", @@ -6507,6 +6951,7 @@ "status": "Poster", "track": "main", "pid": 2701, + "author_site": "Fei Zhang; Chaochen Gu; Chenyue Zhang; Yuchao Dai", "author": "Fei Zhang; Chaochen Gu; Chenyue Zhang; Yuchao Dai", "abstract": "Weakly Supervised Semantic Segmentation (WSSS) based on image-level labels has been greatly advanced by exploiting the outputs of Class Activation 
Map (CAM) to generate the pseudo labels for semantic segmentation. However, CAM merely discovers seeds from a small number of regions, which may be insufficient to serve as pseudo masks for semantic segmentation. In this paper, we formulate the expansion of object regions in CAM as an increase in information. From the perspective of information theory, we propose a novel Complementary Patch (CP) Representation and prove that the information of the sum of the CAMs by a pair of input images with complementary hidden (patched) parts, namely CP Pair, is greater than or equal to the information of the baseline CAM. Therefore, a CAM with more information related to object seeds can be obtained by narrowing down the gap between the sum of CAMs generated by the CP Pair and the original CAM. We propose a CP Network (CPN) implemented by a triplet network and three regularization functions. To further improve the quality of the CAMs, we propose a Pixel-Region Correlation Module (PRCM) to augment the contextual information by using object-region relations between the feature maps and the CAMs. 
Experimental results on the PASCAL VOC 2012 datasets show that our proposed method achieves a new state-of-the-art in WSSS, validating the effectiveness of our CP Representation and CPN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Complementary_Patch_for_Weakly_Supervised_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -6530,7 +6975,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Fei and Gu,\n Chaochen and Zhang,\n Chenyue and Dai,\n Yuchao\n},\n title = {\n Complementary Patch for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7242-7251\n} \n}" }, { "title": "Composable Augmentation Encoding for Video Representation Learning", @@ -6538,6 +6984,7 @@ "status": "Poster", "track": "main", "pid": 6269, + "author_site": "Chen Sun; Arsha Nagrani; Yonglong Tian; Cordelia Schmid", "author": "Chen Sun; Arsha Nagrani; Yonglong Tian; Cordelia Schmid", "abstract": "We focus on contrastive methods for self-supervised video representation learning. A common paradigm in contrastive learning is to construct positive pairs by sampling different data views for the same instance, with different data instances as negatives. These methods implicitly assume a set of representational invariances to the view selection mechanism (e.g., sampling frames with temporal shifts), which may lead to poor performance on downstream tasks which violate these invariances (fine-grained video action recognition that would benefit from temporal information). 
To overcome this limitation, we propose an `augmentation aware' contrastive learning framework, where we explicitly provide a sequence of augmentation parameterisations (such as the values of the time shifts used to create data views) as composable augmentation encodings (CATE) to our model when projecting the video representations for contrastive learning. We show that representations learned by our method encode valuable information about specified spatial or temporal augmentation, and in doing so also achieve state-of-the-art performance on a number of video benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Composable_Augmentation_Encoding_for_Video_Representation_Learning_ICCV_2021_paper.pdf", @@ -6561,7 +7008,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Chen and Nagrani,\n Arsha and Tian,\n Yonglong and Schmid,\n Cordelia\n},\n title = {\n Composable Augmentation Encoding for Video Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8834-8844\n} \n}" }, { "title": "Compressing Visual-Linguistic Model via Knowledge Distillation", @@ -6569,6 +7017,7 @@ "status": "Poster", "track": "main", "pid": 1601, + "author_site": "Zhiyuan Fang; Jianfeng Wang; Xiaowei Hu; Lijuan Wang; Yezhou Yang; Zicheng Liu", "author": "Zhiyuan Fang; Jianfeng Wang; Xiaowei Hu; Lijuan Wang; Yezhou Yang; Zicheng Liu", "abstract": "Despite exciting progress in pre-training for visual-linguistic (VL) representations, very few aspire to a small VL model. In this paper, we study knowledge distillation(KD) to effectively compress a transformer-based large VL model into a small VL model. 
The major challenge arises from the inconsistent regional visual tokens extracted from different detectors of Teacher and Student, resulting in the misalignment of hidden representations and attention distributions. To address the problem, we retrain and adapt the Teacher by using the same region proposals from Student's detector while the features are from Teacher's own object detector. With aligned network inputs, the adapted Teacher is capable of transferring the knowledge through the intermediate representations. Specifically, we use the mean square error loss to mimic the attention distribution inside the transformer block and present a token-wise noise contrastive loss to align the hidden state by contrasting with negative representations stored in a sample queue. To this end, we show that our proposed distillation significantly improves the performance of small VL models on image captioning and visual question answering tasks. It reaches 120.8 in CIDEr score on COCO captioning, an improvement of 5.1 over its non-distilled counterpart; and an accuracy of 69.8 on VQA 2.0, a 0.8 gain from the baseline. 
Our extensive experiments and ablations confirm the effective-ness of VL distillation in both pre-training and fine-tuning stages.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Compressing_Visual-Linguistic_Model_via_Knowledge_Distillation_ICCV_2021_paper.pdf", @@ -6585,14 +7034,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fang_Compressing_Visual-Linguistic_Model_via_Knowledge_Distillation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0;1", - "aff_unique_norm": "Arizona State University;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "Arizona State University;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.microsoft.com", "aff_unique_abbr": "ASU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fang_2021_ICCV,\n \n author = {\n Fang,\n Zhiyuan and Wang,\n Jianfeng and Hu,\n Xiaowei and Wang,\n Lijuan and Yang,\n Yezhou and Liu,\n Zicheng\n},\n title = {\n Compressing Visual-Linguistic Model via Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1428-1438\n} \n}" }, { "title": "Concept Generalization in Visual Representation Learning", @@ -6600,6 +7050,7 @@ "status": "Poster", "track": "main", "pid": 1015, + "author_site": "Mert Bulent Sariyildiz; Yannis Kalantidis; Diane Larlus; Karteek Alahari", "author": "Mert Bulent Sariyildiz; Yannis Kalantidis; Diane Larlus; Karteek Alahari", "abstract": "Measuring concept generalization, i.e., the extent to which models trained on a set of (seen) visual concepts can be leveraged to recognize a new set of (unseen) concepts, is a popular way of evaluating visual representations, 
especially in a self-supervised learning framework. Nonetheless, the choice of unseen concepts for such an evaluation is usually made arbitrarily, and independently from the seen concepts used to train representations, thus ignoring any semantic relationships between the two. In this paper, we argue that the semantic relationships between seen and unseen concepts affect generalization performance and propose ImageNet-CoG, a novel benchmark on the ImageNet-21K (IN-21K) dataset that enables measuring concept generalization in a principled way. Our benchmark leverages expert knowledge that comes from WordNet in order to define a sequence of unseen IN-21K concept sets that are semantically more and more distant from the ImageNet-1K (IN-1K) subset, a ubiquitous training set. This allows us to benchmark visual representations learned on IN-1K out-of-the box. We conduct a large-scale study encompassing 31 convolution and transformer-based models and show how different architectures, levels of supervision, regularization techniques and use of web data impact the concept generalization performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sariyildiz_Concept_Generalization_in_Visual_Representation_Learning_ICCV_2021_paper.pdf", @@ -6614,7 +7065,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sariyildiz_Concept_Generalization_in_Visual_Representation_Learning_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sariyildiz_Concept_Generalization_in_Visual_Representation_Learning_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sariyildiz_2021_ICCV,\n \n author = {\n Sariyildiz,\n Mert Bulent and Kalantidis,\n Yannis and Larlus,\n Diane and Alahari,\n Karteek\n},\n title = {\n Concept Generalization in Visual Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2021\n},\n pages = {\n 9629-9639\n} \n}" }, { "title": "CondLaneNet: A Top-To-Down Lane Detection Framework Based on Conditional Convolution", @@ -6622,6 +7074,7 @@ "status": "Poster", "track": "main", "pid": 7698, + "author_site": "Lizhe Liu; Xiaohao Chen; Siyu Zhu; Ping Tan", "author": "Lizhe Liu; Xiaohao Chen; Siyu Zhu; Ping Tan", "abstract": "Modern deep-learning-based lane detection methods are successful in most scenarios but struggling for lane lines with complex topologies. In this work, we propose CondLaneNet, a novel top-to-down lane detection framework that detects the lane instances first and then dynamically predicts the line shape for each instance. Aiming to resolve lane instance-level discrimination problem, we introduce a conditional lane detection strategy based on conditional convolution and row-wise formulation. Further, we design the Recurrent Instance Module(RIM) to overcome the problem of detecting lane lines with complex topologies such as dense lines and fork lines. Benefit from the end-to-end pipeline which requires little post-process, our method has real-time efficiency. We extensively evaluate our method on three benchmarks of lane detection. Results show that our method achieves state-of-the-art performance on all three benchmark datasets. Moreover, our method has the coexistence of accuracy and efficiency, e.g. a 78.14 F1 score and 220 FPS on CULane. 
Our code is available at https://github.com/aliyun/ conditional-lane-detection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_CondLaneNet_A_Top-To-Down_Lane_Detection_Framework_Based_on_Conditional_Convolution_ICCV_2021_paper.pdf", @@ -6636,7 +7089,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_CondLaneNet_A_Top-To-Down_Lane_Detection_Framework_Based_on_Conditional_Convolution_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_CondLaneNet_A_Top-To-Down_Lane_Detection_Framework_Based_on_Conditional_Convolution_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Lizhe and Chen,\n Xiaohao and Zhu,\n Siyu and Tan,\n Ping\n},\n title = {\n CondLaneNet: A Top-To-Down Lane Detection Framework Based on Conditional Convolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3773-3782\n} \n}" }, { "title": "Condensing a Sequence to One Informative Frame for Video Recognition", @@ -6644,6 +7098,7 @@ "status": "Poster", "track": "main", "pid": 10460, + "author_site": "Zhaofan Qiu; Ting Yao; Yan Shu; Chong-Wah Ngo; Tao Mei", "author": "Zhaofan Qiu; Ting Yao; Yan Shu; Chong-Wah Ngo; Tao Mei", "abstract": "Video is complex due to large variations in motion and rich content in fine-grained visual details. Abstracting useful information from such information-intensive media requires exhaustive computing resources. This paper studies a two-step alternative that first condenses the video sequence to an informative \"frame\" and then exploits off-the-shelf image recognition system on the synthetic frame. A valid question is how to define \"useful information\" and then distill it from a video sequence down to one synthetic frame. 
This paper presents a novel Informative Frame Synthesis (IFS) architecture that incorporates three objective tasks, i.e., appearance reconstruction, video categorization, motion estimation, and two regularizers, i.e., adversarial learning, color consistency. Each task equips the synthetic frame with one ability, while each regularizer enhances its visual quality. With these, by jointly learning the frame synthesis in an end-to-end manner, the generated frame is expected to encapsulate the required spatio-temporal information useful for video analysis. Extensive experiments are conducted on the large-scale Kinetics dataset. When comparing to baseline methods that map video sequence to a single image, IFS shows superior performance. More remarkably, IFS consistently demonstrates evident improvements on image-based 2D networks and clip-based 3D networks, and achieves comparable performance with the state-of-the-art methods with less computational cost.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qiu_Condensing_a_Sequence_to_One_Informative_Frame_for_Video_Recognition_ICCV_2021_paper.pdf", @@ -6660,14 +7115,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Qiu_Condensing_a_Sequence_to_One_Informative_Frame_for_Video_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "JD;University of Science and Technology of China;Singapore Management University", - "aff_unique_dep": "JD AI Research;;", + "aff_unique_norm": "JD AI Research;University of Science and Technology of China;Singapore Management University", + "aff_unique_dep": ";;", "aff_unique_url": ";http://www.ustc.edu.cn;https://www.smu.edu.sg", "aff_unique_abbr": ";USTC;SMU", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Beijing;Hefei;", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Qiu_2021_ICCV,\n \n author = 
{\n Qiu,\n Zhaofan and Yao,\n Ting and Shu,\n Yan and Ngo,\n Chong-Wah and Mei,\n Tao\n},\n title = {\n Condensing a Sequence to One Informative Frame for Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16311-16320\n} \n}" }, { "title": "Conditional DETR for Fast Training Convergence", @@ -6675,6 +7131,7 @@ "status": "Poster", "track": "main", "pid": 8703, + "author_site": "Depu Meng; Xiaokang Chen; Zejia Fan; Gang Zeng; Houqiang Li; Yuhui Yuan; Lei Sun; Jingdong Wang", "author": "Depu Meng; Xiaokang Chen; Zejia Fan; Gang Zeng; Houqiang Li; Yuhui Yuan; Lei Sun; Jingdong Wang", "abstract": "The recently-developed DETR approach applies the transformer encoder and decoder architecture to object detection and achieves promising performance. In this paper, we handle the critical issue, slow training convergence, and present a conditional cross-attention mechanism for fast DETR training. Our approach is motivated by that the cross-attention in DETR relies highly on the content embeddings for localizing the four extremities and predicting the box, which increases the need for high-quality content embeddings and thus the training difficulty. Our approach, named conditional DETR, learns a conditional spatial query from the decoder embedding for decoder multi-head cross-attention. The benefit is that through the conditional spatial query, each cross-attention head is able to attend to a band containing a distinct region, e.g., one object extremity or a region inside the object box. This narrows down the spatial range for localizing the distinct regions for object classification and box regression, thus relaxing the dependence on the content embeddings and easing the training. Empirical results show that conditional DETR converges 6.7x faster for the backbones R50 and R101 and 10x faster for stronger backbones DC5-R50 and DC5-R101. 
Code is available at https://github.com/Atten4Vis/ConditionalDETR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meng_Conditional_DETR_for_Fast_Training_Convergence_ICCV_2021_paper.pdf", @@ -6691,14 +7148,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meng_Conditional_DETR_for_Fast_Training_Convergence_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0;2;2;2", - "aff_unique_norm": "University of Science and Technology of China;Peking University;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Peking University;Microsoft Research", "aff_unique_dep": ";;Research", "aff_unique_url": "http://www.ustc.edu.cn;http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "USTC;Peking U;MSR Asia", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Meng_2021_ICCV,\n \n author = {\n Meng,\n Depu and Chen,\n Xiaokang and Fan,\n Zejia and Zeng,\n Gang and Li,\n Houqiang and Yuan,\n Yuhui and Sun,\n Lei and Wang,\n Jingdong\n},\n title = {\n Conditional DETR for Fast Training Convergence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3651-3660\n} \n}" }, { "title": "Conditional Diffusion for Interactive Segmentation", @@ -6706,6 +7164,7 @@ "status": "Poster", "track": "main", "pid": 5593, + "author_site": "Xi Chen; Zhiyan Zhao; Feiwu Yu; Yilei Zhang; Manni Duan", "author": "Xi Chen; Zhiyan Zhao; Feiwu Yu; Yilei Zhang; Manni Duan", "abstract": "In click-based interactive segmentation, the mask extraction process is dictated by positive/negative user clicks; however, most existing methods do not fully exploit the user cues, requiring excessive numbers of clicks for satisfactory results. 
We propose Conditional Diffusion Network(CDNet), which propagates labeled representations from clicks to conditioned destinations with two levels of affinities: Feature Diffusion Module (FDM) spreads features from clicks to potential target regions with global similarity; Pixel Diffusion Module (PDM) diffuses the predicted logits of clicks within locally connected regions. Thus, the information inferred by user clicks could be generalized to proper destinations. In addition, we put forward Diversified Training(DT), which reduces the optimization ambiguity caused by click simulation. With FDM,PDM and DT, CDNet could better understand user's intentions and make better predictions with limited interactions. CDNet achieves state-of-the-art performance on several benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Conditional_Diffusion_for_Interactive_Segmentation_ICCV_2021_paper.pdf", @@ -6729,7 +7188,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Xi and Zhao,\n Zhiyan and Yu,\n Feiwu and Zhang,\n Yilei and Duan,\n Manni\n},\n title = {\n Conditional Diffusion for Interactive Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7345-7354\n} \n}" }, { "title": "Conditional Variational Capsule Network for Open Set Recognition", @@ -6737,10 +7197,11 @@ "status": "Poster", "track": "main", "pid": 1444, + "author_site": "Yunrui Guo; Guglielmo Camporese; Wenjing Yang; Alessandro Sperduti; Lamberto Ballan", "author": "Yunrui Guo; Guglielmo Camporese; Wenjing Yang; Alessandro Sperduti; Lamberto Ballan", "abstract": "In open set recognition, a classifier has to detect unknown classes that are not known at training time. 
In order to recognize new categories, the classifier has to project the input samples of known classes in very compact and separated regions of the features space for discriminating samples of unknown classes. Recently proposed Capsule Networks have shown to outperform alternatives in many fields, particularly in image recognition, however they have not been fully applied yet to open-set recognition. In capsule networks, scalar neurons are replaced by capsule vectors or matrices, whose entries represent different properties of objects. In our proposal, during training, capsules features of the same known class are encouraged to match a pre-defined gaussian, one for each class. To this end, we use the variational autoencoder framework, with a set of gaussian priors as the approximation for the posterior distribution. In this way, we are able to control the compactness of the features of the same class around the center of the gaussians, thus controlling the ability of the classifier in detecting samples from unknown classes. 
We conducted several experiments and ablation of our model, obtaining state of the art results on different datasets in the open set recognition and unknown detection tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Conditional_Variational_Capsule_Network_for_Open_Set_Recognition_ICCV_2021_paper.pdf", - "aff": "National University of Defense Technology, China+Department of Mathematics \u201cTullio Levi-Civita\u201d, University of Padova, Italy; Department of Mathematics \u201cTullio Levi-Civita\u201d, University of Padova, Italy; National University of Defense Technology, China; Department of Mathematics \u201cTullio Levi-Civita\u201d, University of Padova, Italy; Department of Mathematics \u201cTullio Levi-Civita\u201d, University of Padova, Italy", + "aff": "National University of Defense Technology, China+Department of Mathematics “Tullio Levi-Civita”, University of Padova, Italy; Department of Mathematics “Tullio Levi-Civita”, University of Padova, Italy; National University of Defense Technology, China; Department of Mathematics “Tullio Levi-Civita”, University of Padova, Italy; Department of Mathematics “Tullio Levi-Civita”, University of Padova, Italy", "project": "", "github": "", "supp": "", @@ -6760,7 +7221,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;1;1", - "aff_country_unique": "China;Italy" + "aff_country_unique": "China;Italy", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Yunrui and Camporese,\n Guglielmo and Yang,\n Wenjing and Sperduti,\n Alessandro and Ballan,\n Lamberto\n},\n title = {\n Conditional Variational Capsule Network for Open Set Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 103-111\n} \n}" }, { "title": "Confidence Calibration for Domain Generalization Under Covariate Shift", @@ -6768,6 +7230,7 @@ 
"status": "Poster", "track": "main", "pid": 10502, + "author_site": "Yunye Gong; Xiao Lin; Yi Yao; Thomas G. Dietterich; Ajay Divakaran; Melinda Gervasio", "author": "Yunye Gong; Xiao Lin; Yi Yao; Thomas G. Dietterich; Ajay Divakaran; Melinda Gervasio", "abstract": "Existing calibration algorithms address the problem of covariate shift via unsupervised domain adaptation. However, these methods suffer from the following limitations: 1) they require unlabeled data from the target domain, which may not be available at the stage of calibration in real-world applications and 2) their performance depends heavily on the disparity between the distributions of the source and target domains. To address these two limitations, we present novel calibration solutions via domain generalization. Our core idea is to leverage multiple calibration domains to reduce the effective distribution disparity between the target and calibration domains for improved calibration transfer without needing any data from the target domain. We provide theoretical justification and empirical experimental results to demonstrate the effectiveness of our proposed algorithms. Compared against state-of-the-art calibration methods designed for domain adaptation, we observe a decrease of 8.86 percentage points in expected calibration error or, equivalently, an increase of 35 percentage points in improvement ratio for multi-class classification on the Office-Home dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gong_Confidence_Calibration_for_Domain_Generalization_Under_Covariate_Shift_ICCV_2021_paper.pdf", @@ -6791,7 +7254,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Corvallis", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gong_2021_ICCV,\n \n author = {\n Gong,\n Yunye and Lin,\n Xiao and Yao,\n Yi and Dietterich,\n Thomas G. 
and Divakaran,\n Ajay and Gervasio,\n Melinda\n},\n title = {\n Confidence Calibration for Domain Generalization Under Covariate Shift\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8958-8967\n} \n}" }, { "title": "Conformer: Local Features Coupling Global Representations for Visual Recognition", @@ -6799,6 +7263,7 @@ "status": "Poster", "track": "main", "pid": 2302, + "author_site": "Zhiliang Peng; Wei Huang; Shanzhi Gu; Lingxi Xie; Yaowei Wang; Jianbin Jiao; Qixiang Ye", "author": "Zhiliang Peng; Wei Huang; Shanzhi Gu; Lingxi Xie; Yaowei Wang; Jianbin Jiao; Qixiang Ye", "abstract": "Within Convolutional Neural Network (CNN), the convolution operations are good at extracting local features but experience difficulty to capture global representations. Within visual transformer, the cascaded self-attention modules can capture long-distance feature dependencies but unfortunately deteriorate local feature details. In this paper, we propose a hybrid network structure, termed Conformer, to take advantage of convolutional operations and self-attention mechanisms for enhanced representation learning. Conformer roots in the Feature Coupling Unit (FCU), which fuses local features and global representations under different resolutions in an interactive fashion. Conformer adopts a concurrent structure so that local features and global representations are retained to the maximum extent. Experiments show that Conformer, under the comparable parameter complexity, outperforms the visual transformer (DeiT-B) by 2.3% on ImageNet. On MSCOCO, it outperforms ResNet-101 by 3.7% and 3.6% mAPs for object detection and instance segmentation, respectively, demonstrating the great potential to be a general backbone network. 
Code is available at github.com/pengzhiliang/Conformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Peng_Conformer_Local_Features_Coupling_Global_Representations_for_Visual_Recognition_ICCV_2021_paper.pdf", @@ -6815,14 +7280,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Peng_Conformer_Local_Features_Coupling_Global_Representations_for_Visual_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;1;0;0+1", - "aff_unique_norm": "University of Chinese Academy of Sciences;Pengcheng Laboratory;Huawei", - "aff_unique_dep": ";Peng Cheng Laboratory;Huawei", + "aff_unique_norm": "University of Chinese Academy of Sciences;Peng Cheng Laboratory;Huawei", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.pcl.ac.cn;https://www.huawei.com", "aff_unique_abbr": "UCAS;PCL;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2021_ICCV,\n \n author = {\n Peng,\n Zhiliang and Huang,\n Wei and Gu,\n Shanzhi and Xie,\n Lingxi and Wang,\n Yaowei and Jiao,\n Jianbin and Ye,\n Qixiang\n},\n title = {\n Conformer: Local Features Coupling Global Representations for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 367-376\n} \n}" }, { "title": "Consistency-Aware Graph Network for Human Interaction Understanding", @@ -6830,6 +7296,7 @@ "status": "Poster", "track": "main", "pid": 8222, + "author_site": "Zhenhua Wang; Jiajun Meng; Dongyan Guo; Jianhua Zhang; Javen Qinfeng Shi; Shengyong Chen", "author": "Zhenhua Wang; Jiajun Meng; Dongyan Guo; Jianhua Zhang; Javen Qinfeng Shi; Shengyong Chen", "abstract": "Compared with the progress made on human activity classification, much less success has been achieved on human 
interaction understanding (HIU). Apart from the latter task is much more challenging, the main cause is that recent approaches learn human interactive relations via shallow graphical models, which is inadequate to model complicated human interactions. In this paper, we propose a consistency-aware graph network, which combines the representative ability of graph network and the consistency-aware reasoning to facilitate the HIU task. Our network consists of three components, a backbone CNN to extract image features, a factor graph network to learn third-order interactive relations among participants, and a consistency-aware reasoning module to enforce labeling and grouping consistencies. Our key observation is that the consistency-aware-reasoning bias for HIU can be embedded into an energy function, minimizing which delivers consistent predictions. An efficient mean-field inference algorithm is proposed, such that all modules of our network could be trained jointly in an end-to-end manner. Experimental results show that our approach achieves leading performance on three benchmarks. 
Code will be publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Consistency-Aware_Graph_Network_for_Human_Interaction_Understanding_ICCV_2021_paper.pdf", @@ -6853,7 +7320,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zhenhua and Meng,\n Jiajun and Guo,\n Dongyan and Zhang,\n Jianhua and Shi,\n Javen Qinfeng and Chen,\n Shengyong\n},\n title = {\n Consistency-Aware Graph Network for Human Interaction Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13369-13378\n} \n}" }, { "title": "Consistency-Sensitivity Guided Ensemble Black-Box Adversarial Attacks in Low-Dimensional Spaces", @@ -6861,6 +7329,7 @@ "status": "Poster", "track": "main", "pid": 9336, + "author_site": "Jianhe Yuan; Zhihai He", "author": "Jianhe Yuan; Zhihai He", "abstract": "Black-box attacks aim to generate adversarial noise tofail the victim deep neural network in the black box. Thecentral task in black-box attack method design is to estimateand characterize the victim model in the high-dimensionalmodel space based on feedback results of queries submittedto the victim network. The central performance goal is tominimize the number of queries needed for successful at-tack. Existing attack methods directly search and refine theadversarial noise in an extremely high-dimensional space,requiring hundreds or even thousands queries to the victimnetwork. To address this challenge, we propose to explore aconsistency and sensitivity guided ensemble attack (CSEA)method in a low-dimensional space. 
Specifically, we esti-mate the victim model in the black box using a learned lin-ear composition of an ensemble of surrogate models withdiversified network structures. Using random block maskson the input image, these surrogate models jointly constructand submit randomized and sparsified queries to the victimmodel. Based on these query results and guided by a con-sistency constraint, the surrogate models can be trained us-ing a very small number of queries such that their learnedcomposition is able to accurately approximate the victimmodel in the high-dimensional space. The randomized andsparsified queries also provide important information for usto construct an attack sensitivity map for the input image,with which the adversarial attack can be locally refined tofurther increase its success rate. Our extensive experimen-tal results demonstrate that our proposed approach signifi-cantly reduces the number of queries to the victim networkwhile maintaining very high success rates, outperformingexisting black-box attack methods by large margins.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_Consistency-Sensitivity_Guided_Ensemble_Black-Box_Adversarial_Attacks_in_Low-Dimensional_Spaces_ICCV_2021_paper.pdf", @@ -6884,7 +7353,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Columbia", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Jianhe and He,\n Zhihai\n},\n title = {\n Consistency-Sensitivity Guided Ensemble Black-Box Adversarial Attacks in Low-Dimensional Spaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7778-7786\n} \n}" }, { "title": "Contact-Aware Retargeting of Skinned Motion", @@ -6892,6 +7362,7 @@ "status": "Poster", "track": "main", "pid": 3452, + "author_site": "Ruben 
Villegas; Duygu Ceylan; Aaron Hertzmann; Jimei Yang; Jun Saito", "author": "Ruben Villegas; Duygu Ceylan; Aaron Hertzmann; Jimei Yang; Jun Saito", "abstract": "This paper introduces a motion retargeting method that preserves self-contacts and prevents inter-penetration. Self-contacts, such as when hands touch each other or the torso or the head, are important attributes of human body language and dynamics, yet existing methods do not model or preserve these contacts. Likewise, self-penetrations, such as a hand passing into the torso, are a typical artifact of motion estimation methods. The input to our method is a human motion sequence and a target skeleton and character geometry. The method identifies self-contacts and ground contacts in the input motion, and optimizes the motion to apply to the output skeleton, while preserving these contacts and reducing self-penetrations. We introduce a novel geometry-conditioned recurrent network with an encoder-space optimization strategy that achieves efficient retargeting while satisfying contact constraints. In experiments, our results quantitatively outperform previous methods and in the user study our retargeted motions are rated as higher-quality than those produced by recent works. 
We also show our method generalizes to motion estimated from human videos where we improve over previous works that produce noticeable interpenetration.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Villegas_Contact-Aware_Retargeting_of_Skinned_Motion_ICCV_2021_paper.pdf", @@ -6915,7 +7386,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Villegas_2021_ICCV,\n \n author = {\n Villegas,\n Ruben and Ceylan,\n Duygu and Hertzmann,\n Aaron and Yang,\n Jimei and Saito,\n Jun\n},\n title = {\n Contact-Aware Retargeting of Skinned Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9720-9729\n} \n}" }, { "title": "Context Decoupling Augmentation for Weakly Supervised Semantic Segmentation", @@ -6923,6 +7395,7 @@ "status": "Poster", "track": "main", "pid": 8294, + "author_site": "Yukun Su; Ruizhou Sun; Guosheng Lin; Qingyao Wu", "author": "Yukun Su; Ruizhou Sun; Guosheng Lin; Qingyao Wu", "abstract": "Data augmentation is vital for deep learning neural networks. By providing massive training samples, it helps to improve the generalization ability of the model. Weakly supervised semantic segmentation (WSSS) is a challenging problem that has been deeply studied in recent years, conventional data augmentation approaches for WSSS usually employ geometrical transformations, random cropping, and color jittering. However, merely increasing the same contextual semantic data does not bring much gain to the networks to distinguish the objects, e.g., the correct image-level classification of \"aeroplane\" may be not only due to the recognition of the object itself but also its co-occurrence context like \"sky\", which will cause the model to focus less on the object features. 
To this end, we present a Context Decoupling Augmentation (CDA) method, to change the inherent context in which the objects appear and thus drive the network to remove the dependence between object instances and contextual information. To validate the effectiveness of the proposed method, extensive experiments on PASCAL VOC 2012 dataset with several alternative network architectures demonstrate that CDA can boost various popular WSSS methods to the new state-of-the-art by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Su_Context_Decoupling_Augmentation_for_Weakly_Supervised_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -6946,7 +7419,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Su_2021_ICCV,\n \n author = {\n Su,\n Yukun and Sun,\n Ruizhou and Lin,\n Guosheng and Wu,\n Qingyao\n},\n title = {\n Context Decoupling Augmentation for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7004-7014\n} \n}" }, { "title": "Context Reasoning Attention Network for Image Super-Resolution", @@ -6954,6 +7428,7 @@ "status": "Poster", "track": "main", "pid": 1957, + "author_site": "Yulun Zhang; Donglai Wei; Can Qin; Huan Wang; Hanspeter Pfister; Yun Fu", "author": "Yulun Zhang; Donglai Wei; Can Qin; Huan Wang; Hanspeter Pfister; Yun Fu", "abstract": "Deep convolutional neural networks (CNNs) are achieving great successes for image super-resolution (SR), where global context is crucial for accurate restoration. However, the basic convolutional layer in CNNs is designed to extract local patterns, lacking the ability to model global context. 
Many efforts have been devoted to augmenting SR networks with the global context information, especially by global feature interaction methods. These works incorporate the global context into local feature representation. However, recent advances in neuroscience show that it is necessary for the neurons to dynamically modulate their functions according to context, which is neglected in most CNN based SR methods. Motivated by those observations and analyses, we propose context reasoning attention network (CRAN) to adaptively modulate the convolution kernel according to the global context. Specifically, we extract global context descriptors, which are further enhanced with semantic reasoning. Channel and spatial interactions are then proposed to generate context reasoning attention mask, which is applied to modify the convolution kernel adaptively. Such a modulated convolution layer is utilized as basic component to build the network blocks and itself. Extensive experiments on benchmark datasets with multiple degradation models show that our CRAN achieves superior SR results and favourable efficiency trade-off.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Context_Reasoning_Attention_Network_for_Image_Super-Resolution_ICCV_2021_paper.pdf", @@ -6977,7 +7452,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yulun and Wei,\n Donglai and Qin,\n Can and Wang,\n Huan and Pfister,\n Hanspeter and Fu,\n Yun\n},\n title = {\n Context Reasoning Attention Network for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4278-4287\n} \n}" }, { "title": "Context-Aware Scene Graph Generation With Seq2Seq Transformers", 
@@ -6985,6 +7461,7 @@ "status": "Poster", "track": "main", "pid": 11117, + "author_site": "Yichao Lu; Himanshu Rai; Jason Chang; Boris Knyazev; Guangwei Yu; Shashank Shekhar; Graham W. Taylor; Maksims Volkovs", "author": "Yichao Lu; Himanshu Rai; Jason Chang; Boris Knyazev; Guangwei Yu; Shashank Shekhar; Graham W. Taylor; Maksims Volkovs", "abstract": "Scene graph generation is an important task in computer vision aimed at improving the semantic understand- ing of the visual world. In this task, the model needs to detect objects and predict visual relationships between them. Most of the existing models predict relationships in parallel assuming their independence. While there are differ- ent ways to capture these dependencies, we explore a conditional approach motivated by the sequence-to-sequence (Seq2Seq) formalism. Different from the previous research, our proposed model predicts visual relationships one at a time in an autoregressive manner by explicitly conditioning on the already predicted relationships. Drawing from translation models in NLP, we propose an encoder- decoder model built using Transformers where the encoder captures global context and long range interactions. The decoder then makes sequential predictions by conditioning on the scene graph constructed so far. In addition, we introduce a novel reinforcement learning-based training strategy tailored to Seq2Seq scene graph generation. By using a self-critical policy gradient training approach with Monte Carlo search we directly optimize for the (mean) recall metrics and bridge the gap between training and evaluation. Experimental results on two public benchmark datasets demonstrate that our Seq2Seq learning approach achieves strong empirical performance, out- performing previous state-of-the-art, while remaining efficient in terms of training and inference time. 
Full code for this work is available here: https://github.com/ layer6ai-labs/SGG-Seq2Seq.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lu_Context-Aware_Scene_Graph_Generation_With_Seq2Seq_Transformers_ICCV_2021_paper.pdf", @@ -7008,7 +7485,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0;0+0;0+0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Lu_2021_ICCV,\n \n author = {\n Lu,\n Yichao and Rai,\n Himanshu and Chang,\n Jason and Knyazev,\n Boris and Yu,\n Guangwei and Shekhar,\n Shashank and Taylor,\n Graham W. and Volkovs,\n Maksims\n},\n title = {\n Context-Aware Scene Graph Generation With Seq2Seq Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15931-15941\n} \n}" }, { "title": "Context-Sensitive Temporal Feature Learning for Gait Recognition", @@ -7016,6 +7494,7 @@ "status": "Poster", "track": "main", "pid": 6829, + "author_site": "Xiaohu Huang; Duowang Zhu; Hao Wang; Xinggang Wang; Bo Yang; Botao He; Wenyu Liu; Bin Feng", "author": "Xiaohu Huang; Duowang Zhu; Hao Wang; Xinggang Wang; Bo Yang; Botao He; Wenyu Liu; Bin Feng", "abstract": "Although gait recognition has drawn increasing research attention recently, it remains challenging to learn discriminative temporal representation since the silhouette differences are quite subtle in spatial domain. Inspired by the observation that humans can distinguish gaits of different subjects by adaptively focusing on temporal sequences with different time scales, we propose a context-sensitive temporal feature learning (CSTL) network in this paper, which aggregates temporal features in three scales to obtain motion representation according to the temporal contextual information. 
Specifically, CSTL introduces relation modeling among multi-scale features to evaluate feature importances, based on which network adaptively enhances more important scale and suppresses less important scale. Besides that, we propose a salient spatial feature learning (SSFL) module to tackle the misalignment problem caused by temporal operation, e.g., temporal convolution. SSFL recombines a frame of salient spatial features by extracting the most discriminative parts across the whole sequence. In this way, we achieve adaptive temporal learning and salient spatial mining simultaneously. Extensive experiments conducted on two datasets demonstrate the state-of-the-art performance. On CASIA-B dataset, we achieve rank-1 accuracies of 98.0%, 95.4% and 87.0% under normal walking, bag-carrying and coat-wearing conditions. On OU-MVLP dataset, we achieve rank-1 accuracy of 90.2%. The source code will be published at https://github.com/OliverHxh/CSTL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Context-Sensitive_Temporal_Feature_Learning_for_Gait_Recognition_ICCV_2021_paper.pdf", @@ -7039,7 +7518,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Xiaohu and Zhu,\n Duowang and Wang,\n Hao and Wang,\n Xinggang and Yang,\n Bo and He,\n Botao and Liu,\n Wenyu and Feng,\n Bin\n},\n title = {\n Context-Sensitive Temporal Feature Learning for Gait Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12909-12918\n} \n}" }, { "title": "Contextually Plausible and Diverse 3D Human Motion Prediction", @@ -7047,6 +7527,7 @@ "status": "Poster", "track": "main", "pid": 1968, + "author_site": "Sadegh Aliakbarian; Fatemeh Saleh; Lars Petersson; Stephen 
Gould; Mathieu Salzmann", "author": "Sadegh Aliakbarian; Fatemeh Saleh; Lars Petersson; Stephen Gould; Mathieu Salzmann", "abstract": "We tackle the task of diverse 3D human motion prediction, that is, forecasting multiple plausible future 3D poses given a sequence of observed 3D poses. In this context, a popular approach consists of using a Conditional Variational Autoencoder (CVAE). However, existing approaches that do so either fail to capture the diversity in human motion, or generate diverse but semantically implausible continuations of the observed motion. In this paper, we address both of these problems by developing a new variational framework that accounts for both diversity and context of the generated future motion. To this end, and in contrast to existing approaches, we condition the sampling of the latent variable that acts as source of diversity on the representation of the past observation, thus encouraging it to carry relevant information. Our experiments demonstrate that our approach yields motions not only of higher quality while retaining diversity, but also that preserve the contextual information contained in the observed motion.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Aliakbarian_Contextually_Plausible_and_Diverse_3D_Human_Motion_Prediction_ICCV_2021_paper.pdf", @@ -7063,14 +7544,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Aliakbarian_Contextually_Plausible_and_Diverse_3D_Human_Motion_Prediction_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;1;3", - "aff_unique_norm": "Microsoft;Australian National University;CSIRO;EPFL", - "aff_unique_dep": "Microsoft Corporation;Australian Centre for Robotic Vision;Data61;CVLab", + "aff_unique_norm": "Microsoft Corporation;Australian National University;CSIRO;École Polytechnique Fédérale de Lausanne", + "aff_unique_dep": ";Australian Centre for Robotic Vision;Data61;CVLab", "aff_unique_url": 
"https://www.microsoft.com;https://www.anu.edu.au;https://www.csiro.au;https://cvlab.epfl.ch", "aff_unique_abbr": "Microsoft;ANU;CSIRO;EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;2", - "aff_country_unique": "United States;Australia;Switzerland" + "aff_country_unique": "United States;Australia;Switzerland", + "bibtex": "@InProceedings{Aliakbarian_2021_ICCV,\n \n author = {\n Aliakbarian,\n Sadegh and Saleh,\n Fatemeh and Petersson,\n Lars and Gould,\n Stephen and Salzmann,\n Mathieu\n},\n title = {\n Contextually Plausible and Diverse 3D Human Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11333-11342\n} \n}" }, { "title": "Continual Learning for Image-Based Camera Localization", @@ -7078,6 +7560,7 @@ "status": "Poster", "track": "main", "pid": 3464, + "author_site": "Shuzhe Wang; Zakaria Laskar; Iaroslav Melekhov; Xiaotian Li; Juho Kannala", "author": "Shuzhe Wang; Zakaria Laskar; Iaroslav Melekhov; Xiaotian Li; Juho Kannala", "abstract": "For several emerging technologies such as augmented reality, autonomous driving and robotics, visual localization is a critical component. Directly regressing camera pose/3D scene coordinates from the input image using deep neural networks has shown great potential. However, such methods assume a stationary data distribution with all scenes simultaneously available during training. In this paper, we approach the problem of visual localization in a continual learning setup -- whereby the model is trained on scenes in an incremental manner. Our results show that similar to the classification domain, non-stationary data induces catastrophic forgetting in deep networks for visual localization. To address this issue, a strong baseline based on storing and replaying images from a fixed buffer is proposed. 
Furthermore, we propose a new sampling method based on coverage score (Buff-CS) that adapts the existing sampling strategies in the buffering process to the problem of visual localization. Results demonstrate consistent improvements over standard buffering methods on two challenging datasets -- 7Scenes, 12Scenes, and also 19Scenes by combining the former scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Continual_Learning_for_Image-Based_Camera_Localization_ICCV_2021_paper.pdf", @@ -7101,7 +7584,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Finland" + "aff_country_unique": "Finland", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Shuzhe and Laskar,\n Zakaria and Melekhov,\n Iaroslav and Li,\n Xiaotian and Kannala,\n Juho\n},\n title = {\n Continual Learning for Image-Based Camera Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3252-3262\n} \n}" }, { "title": "Continual Learning on Noisy Data Streams via Self-Purified Replay", @@ -7109,6 +7593,7 @@ "status": "Poster", "track": "main", "pid": 9539, + "author_site": "Chris Dongjoo Kim; Jinseo Jeong; Sangwoo Moon; Gunhee Kim", "author": "Chris Dongjoo Kim; Jinseo Jeong; Sangwoo Moon; Gunhee Kim", "abstract": "Continually learning in the real world must overcome many challenges, among which noisy labels are a common and inevitable issue. In this work, we present a replay-based continual learning framework that simultaneously addresses both catastrophic forgetting and noisy labels for the first time. Our solution is based on two observations; (i) forgetting can be mitigated even with noisy labels via self-supervised learning, and (ii) the purity of the replay buffer is crucial. 
Building on this regard, we propose two key components of our method: (i) a self-supervised replay technique named Self-Replay, which can circumvent erroneous training signals arising from noisy labeled data, and (ii) the Self-Centered filter that maintains a purified replay buffer via centrality-based stochastic graph ensembles. The empirical results on MNIST, CIFAR-10, CIFAR-100, and WebVision with real-world noise demonstrate that our framework can maintain a highly pure replay buffer amidst noisy streamed data while greatly outperforming the combinations of the state-of-the-art continual learning and noisy label learning methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Continual_Learning_on_Noisy_Data_Streams_via_Self-Purified_Replay_ICCV_2021_paper.pdf", @@ -7132,7 +7617,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Chris Dongjoo and Jeong,\n Jinseo and Moon,\n Sangwoo and Kim,\n Gunhee\n},\n title = {\n Continual Learning on Noisy Data Streams via Self-Purified Replay\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 537-547\n} \n}" }, { "title": "Continual Neural Mapping: Learning an Implicit Scene Representation From Sequential Observations", @@ -7140,6 +7626,7 @@ "status": "Poster", "track": "main", "pid": 2844, + "author_site": "Zike Yan; Yuxin Tian; Xuesong Shi; Ping Guo; Peng Wang; Hongbin Zha", "author": "Zike Yan; Yuxin Tian; Xuesong Shi; Ping Guo; Peng Wang; Hongbin Zha", "abstract": "Recent advances have enabled a single neural network to serve as an implicit scene representation, establishing the mapping function between spatial coordinates and scene properties. 
In this paper, we make a further step towards continual learning of the implicit scene representation directly from sequential observations, namely Continual Neural Mapping. The proposed problem setting bridges the gap between batch-trained implicit neural representations and commonly used streaming data in robotics and vision communities. We introduce an experience replay approach to tackle an exemplary task of continual neural mapping: approximating a continuous signed distance function (SDF) from sequential depth images as a scene geometry representation. We show for the first time that a single network can represent scene geometry over time continually without catastrophic forgetting, while achieving promising trade-offs between accuracy and efficiency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yan_Continual_Neural_Mapping_Learning_an_Implicit_Scene_Representation_From_Sequential_ICCV_2021_paper.pdf", @@ -7156,14 +7643,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yan_Continual_Neural_Mapping_Learning_an_Implicit_Scene_Representation_From_Sequential_ICCV_2021_paper.html", "aff_unique_index": "0+0;1;2;2;2;0+0", - "aff_unique_norm": "Peking University;Beihang University;Intel", + "aff_unique_norm": "Peking University;Beihang University;Intel Corporation", "aff_unique_dep": "School of EECS;School of Automation Science and Electrical Engineering;Intel Labs", "aff_unique_url": "http://www.pku.edu.cn;http://www.buaa.edu.cn;https://www.intel.cn", "aff_unique_abbr": "PKU;BUAA;Intel", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2021_ICCV,\n \n author = {\n Yan,\n Zike and Tian,\n Yuxin and Shi,\n Xuesong and Guo,\n Ping and Wang,\n Peng and Zha,\n Hongbin\n},\n title = {\n Continual Neural Mapping: Learning an Implicit Scene Representation From Sequential 
Observations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15782-15792\n} \n}" }, { "title": "Continual Prototype Evolution: Learning Online From Non-Stationary Data Streams", @@ -7171,6 +7659,7 @@ "status": "Poster", "track": "main", "pid": 1108, + "author_site": "Matthias De Lange; Tinne Tuytelaars", "author": "Matthias De Lange; Tinne Tuytelaars", "abstract": "Attaining prototypical features to represent class distributions is well established in representation learning. However, learning prototypes online from streaming data proves a challenging endeavor as they rapidly become outdated, caused by an ever-changing parameter space during the learning process. Additionally, continual learning does not assume the data stream to be stationary, typically resulting in catastrophic forgetting of previous knowledge. As a first, we introduce a system addressing both problems, where prototypes evolve continually in a shared latent space, enabling learning and prediction at any point in time. To facilitate learning, a novel objective function synchronizes the latent space with the continually evolving prototypes. In contrast to the major body of work in continual learning, data streams are processed in an online fashion without task information and can be highly imbalanced, for which we propose an efficient memory scheme. As an additional contribution, we propose the learner-evaluator framework that i) generalizes existing paradigms in continual learning, ii) introduces data incremental learning, and iii) models the bridge between continual learning and concept drift. We obtain state-of-the-art performance by a significant margin on eight benchmarks, including three highly imbalanced data streams. 
Code is publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/De_Lange_Continual_Prototype_Evolution_Learning_Online_From_Non-Stationary_Data_Streams_ICCV_2021_paper.pdf", @@ -7194,7 +7683,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{De_Lange_2021_ICCV,\n \n author = {\n De Lange,\n Matthias and Tuytelaars,\n Tinne\n},\n title = {\n Continual Prototype Evolution: Learning Online From Non-Stationary Data Streams\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8250-8259\n} \n}" }, { "title": "Continuous Copy-Paste for One-Stage Multi-Object Tracking and Segmentation", @@ -7202,6 +7692,7 @@ "status": "Poster", "track": "main", "pid": 1926, + "author_site": "Zhenbo Xu; Ajin Meng; Zhenbo Shi; Wei Yang; Zhi Chen; Liusheng Huang", "author": "Zhenbo Xu; Ajin Meng; Zhenbo Shi; Wei Yang; Zhi Chen; Liusheng Huang", "abstract": "Current one-step multi-object tracking and segmentation (MOTS) methods lag behind recent two-step methods. By separating the instance segmentation stage from the tracking stage, two-step methods can exploit non-video datasets as extra data for training instance segmentation. Moreover, instances belonging to different IDs on different frames, rather than limited numbers of instances in raw consecutive frames, can be gathered to allow more effective hard example mining in the training of trackers. In this paper, we bridge this gap by presenting a novel data augmentation strategy named continuous copy-paste (CCP). Our intuition behind CCP is to fully exploit the pixel-wise annotations provided by MOTS to actively increase the number of instances as well as unique instance IDs in training. 
Without any modifications to frameworks, current MOTS methods achieve significant performance gains when trained with CCP. Based on CCP, we propose the first effective one-stage online MOTS method named CCPNet, which generates instance masks as well as the tracking results in one shot. Our CCPNet surpasses all state-of-the-art methods by large margins (3.8% higher sMOTSA and 4.1% higher MOTSA for pedestrians on the KITTI MOTS Validation) and ranks 1st on the KITTI MOTS leaderboard. Evaluations across three datasets also demonstrate the effectiveness of both CCP and CCPNet. Our codes are publicly available at: https://github.com/detectRecog/CCP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Continuous_Copy-Paste_for_One-Stage_Multi-Object_Tracking_and_Segmentation_ICCV_2021_paper.pdf", @@ -7225,7 +7716,8 @@ "aff_campus_unique_index": "1+2+2", "aff_campus_unique": ";Hangzhou;Beijing", "aff_country_unique_index": "0+0+0+0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Zhenbo and Meng,\n Ajin and Shi,\n Zhenbo and Yang,\n Wei and Chen,\n Zhi and Huang,\n Liusheng\n},\n title = {\n Continuous Copy-Paste for One-Stage Multi-Object Tracking and Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15323-15332\n} \n}" }, { "title": "Contrast and Classify: Training Robust VQA Models", @@ -7233,6 +7725,7 @@ "status": "Poster", "track": "main", "pid": 6026, + "author_site": "Yash Kant; Abhinav Moudgil; Dhruv Batra; Devi Parikh; Harsh Agrawal", "author": "Yash Kant; Abhinav Moudgil; Dhruv Batra; Devi Parikh; Harsh Agrawal", "abstract": "Recent Visual Question Answering (VQA) models have shown impressive performance on the VQA benchmark but remain sensitive to small linguistic variations in input questions. 
Existing approaches address this by augmenting the dataset with question paraphrases from visual question generation models or adversarial perturbations. These approaches use the combined data to learn an answer classifier by minimizing the standard cross-entropy loss. To more effectively leverage augmented data, we build on the recent success in contrastive learning. We propose a novel training paradigm (ConClaT) that optimizes both cross-entropy and contrastive losses. The contrastive loss encourages representations to be robust to linguistic variations in questions while the cross-entropy loss preserves the discriminative power of representations for answer prediction. We find that optimizing both losses -- either alternately or jointly -- is key to effective training. On the VQA-Rephrasings benchmark, which measures the VQA model's answer consistency across human paraphrases of a question, ConClaT improves Consensus Score by 1.63% over an improved baseline. In addition, on the standard VQA 2.0 benchmark, we improve the VQA accuracy by 0.78% overall. 
We also show that ConClaT is agnostic to the type of data-augmentation strategy used.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kant_Contrast_and_Classify_Training_Robust_VQA_Models_ICCV_2021_paper.pdf", @@ -7249,14 +7742,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kant_Contrast_and_Classify_Training_Robust_VQA_Models_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;0+1;0", - "aff_unique_norm": "Georgia Institute of Technology;Meta", + "aff_unique_norm": "Georgia Institute of Technology;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "Georgia Tech;FAIR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kant_2021_ICCV,\n \n author = {\n Kant,\n Yash and Moudgil,\n Abhinav and Batra,\n Dhruv and Parikh,\n Devi and Agrawal,\n Harsh\n},\n title = {\n Contrast and Classify: Training Robust VQA Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1604-1613\n} \n}" }, { "title": "Contrast and Order Representations for Video Self-Supervised Learning", @@ -7264,6 +7758,7 @@ "status": "Poster", "track": "main", "pid": 1214, + "author_site": "Kai Hu; Jie Shao; Yuan Liu; Bhiksha Raj; Marios Savvides; Zhiqiang Shen", "author": "Kai Hu; Jie Shao; Yuan Liu; Bhiksha Raj; Marios Savvides; Zhiqiang Shen", "abstract": "This paper studies the problem of learning self-supervised representations on videos. In contrast to image modality that only requires appearance information on objects or scenes, video needs to further explore the relations between multiple frames/clips along the temporal dimension. 
However, the recent proposed contrastive-based self-supervised frameworks do not grasp such relations explicitly since they simply utilize two augmented clips from the same video and compare their distance without referring to their temporal relation. To address this, we present a contrast-and-order representation (CORP) framework for learning self-supervised video representations that can automatically capture both the appearance information within each frame and temporal information across different frames. In particular, given two video clips, our model first predicts whether they come from the same input video, and then predict the temporal ordering of the clips if they come from the same video. We also propose a novel decoupling attention method to learn symmetric similarity (contrast) and anti-symmetric patterns (order). Such design involves neither extra parameters nor computation, but can speed up the learning process and improve accuracy compared to the vanilla multi-head attention. We extensively validate the representation ability of our learned video features for the downstream action recognition task on Kinetics-400 and Something-something V2. 
Our method outperforms previous state-of-the-arts by a significant margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_Contrast_and_Order_Representations_for_Video_Self-Supervised_Learning_ICCV_2021_paper.pdf", @@ -7287,7 +7782,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1+1;1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Kai and Shao,\n Jie and Liu,\n Yuan and Raj,\n Bhiksha and Savvides,\n Marios and Shen,\n Zhiqiang\n},\n title = {\n Contrast and Order Representations for Video Self-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7939-7949\n} \n}" }, { "title": "Contrasting Contrastive Self-Supervised Representation Learning Pipelines", @@ -7295,6 +7791,7 @@ "status": "Poster", "track": "main", "pid": 3635, + "author_site": "Klemen Kotar; Gabriel Ilharco; Ludwig Schmidt; Kiana Ehsani; Roozbeh Mottaghi", "author": "Klemen Kotar; Gabriel Ilharco; Ludwig Schmidt; Kiana Ehsani; Roozbeh Mottaghi", "abstract": "In the past few years, we have witnessed remarkable breakthroughs in self-supervised representation learning. Despite the success and adoption of representations learned through this paradigm, much is yet to be understood about how different training methods and datasets influence performance on downstream tasks. In this paper, we analyze contrastive approaches as one of the most successful and popular variants of self-supervised representation learning. We perform this analysis from the perspective of the training algorithms, pre-training datasets and end tasks. We examine over 700 training experiments including 30 encoders, 4 pre-training datasets and 20 diverse downstream tasks. 
Our experiments address various questions regarding the performance of self-supervised models compared to their supervised counterparts, current benchmarks used for evaluation, and the effect of the pre-training data on end task performance. We hope the insights and empirical evidence provided by this work will help future research in learning better visual representations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kotar_Contrasting_Contrastive_Self-Supervised_Representation_Learning_Pipelines_ICCV_2021_paper.pdf", @@ -7318,7 +7815,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kotar_2021_ICCV,\n \n author = {\n Kotar,\n Klemen and Ilharco,\n Gabriel and Schmidt,\n Ludwig and Ehsani,\n Kiana and Mottaghi,\n Roozbeh\n},\n title = {\n Contrasting Contrastive Self-Supervised Representation Learning Pipelines\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9949-9959\n} \n}" }, { "title": "Contrastive Attention Maps for Self-Supervised Co-Localization", @@ -7326,6 +7824,7 @@ "status": "Poster", "track": "main", "pid": 1645, + "author_site": "Minsong Ki; Youngjung Uh; Junsuk Choe; Hyeran Byun", "author": "Minsong Ki; Youngjung Uh; Junsuk Choe; Hyeran Byun", "abstract": "The goal of unsupervised co-localization is to locate the object in a scene under the assumptions that 1) the dataset consists of only one superclass, e.g., birds, and 2) there are no human-annotated labels in the dataset. The most recent method achieves impressive co-localization performance by employing self-supervised representation learning approaches such as predicting rotation. 
In this paper, we introduce a new contrastive objective directly on the attention maps to enhance co-localization performance. Our contrastive loss function exploits rich information of location, which induces the model to activate the extent of the object effectively. In addition, we propose a pixel-wise attention pooling that selectively aggregates the feature map regarding their magnitudes across channels. Our methods are simple and shown effective by extensive qualitative and quantitative evaluation, achieving state-of-the-art co-localization performances by large margins on four datasets: CUB-200-2011, Stanford Cars, FGVC-Aircraft, and Stanford Dogs. Our code will be publicly available online for the research community.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ki_Contrastive_Attention_Maps_for_Self-Supervised_Co-Localization_ICCV_2021_paper.pdf", @@ -7340,7 +7839,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ki_Contrastive_Attention_Maps_for_Self-Supervised_Co-Localization_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ki_Contrastive_Attention_Maps_for_Self-Supervised_Co-Localization_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ki_2021_ICCV,\n \n author = {\n Ki,\n Minsong and Uh,\n Youngjung and Choe,\n Junsuk and Byun,\n Hyeran\n},\n title = {\n Contrastive Attention Maps for Self-Supervised Co-Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2803-2812\n} \n}" }, { "title": "Contrastive Coding for Active Learning Under Class Distribution Mismatch", @@ -7348,6 +7848,7 @@ "status": "Poster", "track": "main", "pid": 8143, + "author_site": "Pan Du; Suyun Zhao; Hui Chen; Shuwen Chai; Hong Chen; Cuiping Li", "author": "Pan Du; Suyun Zhao; Hui Chen; Shuwen Chai; Hong Chen; Cuiping Li", "abstract": 
"Active learning (AL) is successful based on the assumption that labeled and unlabeled data are obtained from the same class distribution. However, its performance deteriorates under class distribution mismatch, wherein the unlabeled data contain many samples out of the class distribution of labeled data. To effectively handle the problems under class distribution mismatch, we propose a contrastive coding based AL framework named CCAL. Unlike the existing AL methods that focus on selecting the most informative samples for annotating, CCAL extracts both semantic and distinctive features by contrastive learning and combines them in a query strategy to choose the most informative unlabeled samples with matched categories. Theoretically, we prove that the AL error of CCAL has a tight upper bound. Experimentally, we evaluate its performance on CIFAR10, CIFAR100, and an artificial cross-dataset that consists of five datasets; consequently, CCAL achieves state-of-the-art performance by a large margin with remarkably lower annotation cost. 
To the best of our knowledge, CCAL is the first work related to AL for class distribution mismatch.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Du_Contrastive_Coding_for_Active_Learning_Under_Class_Distribution_Mismatch_ICCV_2021_paper.pdf", @@ -7371,7 +7872,8 @@ "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Du_2021_ICCV,\n \n author = {\n Du,\n Pan and Zhao,\n Suyun and Chen,\n Hui and Chai,\n Shuwen and Chen,\n Hong and Li,\n Cuiping\n},\n title = {\n Contrastive Coding for Active Learning Under Class Distribution Mismatch\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8927-8936\n} \n}" }, { "title": "Contrastive Learning for Label Efficient Semantic Segmentation", @@ -7379,6 +7881,7 @@ "status": "Poster", "track": "main", "pid": 9250, + "author_site": "Xiangyun Zhao; Raviteja Vemulapalli; Philip Andrew Mansfield; Boqing Gong; Bradley Green; Lior Shapira; Ying Wu", "author": "Xiangyun Zhao; Raviteja Vemulapalli; Philip Andrew Mansfield; Boqing Gong; Bradley Green; Lior Shapira; Ying Wu", "abstract": "Collecting labeled data for the task of semantic segmentation is expensive and time-consuming, as it requires dense pixel-level annotations. While recent Convolutional Neural Network (CNN) based semantic segmentation approaches have achieved impressive results by using large amounts of labeled training data, their performance drops significantly as the amount of labeled data decreases. This happens because deep CNNs trained with the de facto cross-entropy loss can easily overfit to small amounts of labeled data. 
To address this issue, we propose a simple and effective contrastive learning-based training strategy in which we first pretrain the network using a pixel-wise, label-based contrastive loss, and then fine-tune it using the cross-entropy loss. This approach increases intra-class compactness and inter-class separability, thereby resulting in a better pixel classifier. We demonstrate the effectiveness of the proposed training strategy using the Cityscapes and PASCAL VOC 2012 segmentation datasets. Our results show that pretraining with the proposed contrastive loss results in large performance gains (more than 20% absolute improvement in some settings) when the amount of labeled data is limited. In many settings, the proposed contrastive pretraining strategy, which does not use any additional data, is able to match or outperform the widely-used ImageNet pretraining strategy that uses more than a million additional labeled images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Contrastive_Learning_for_Label_Efficient_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -7402,7 +7905,8 @@ "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Xiangyun and Vemulapalli,\n Raviteja and Mansfield,\n Philip Andrew and Gong,\n Boqing and Green,\n Bradley and Shapira,\n Lior and Wu,\n Ying\n},\n title = {\n Contrastive Learning for Label Efficient Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10623-10633\n} \n}" }, { "title": "Contrastive Learning of Image Representations With Cross-Video Cycle-Consistency", @@ -7410,6 +7914,7 @@ "status": "Poster", "track": "main", "pid": 2880, + "author_site": 
"Haiping Wu; Xiaolong Wang", "author": "Haiping Wu; Xiaolong Wang", "abstract": "Recent works have advanced the performance of self-supervised representation learning by a large margin. The core among these methods is intra-image invariance learning. Two different transformations of one image instance are considered as a positive sample pair, where various tasks are designed to learn invariant representations by comparing the pair. Analogically, for video data, representations of frames from the same video are trained to be closer than frames from other videos, i.e. intra-video invariance. However, cross-video relation has barely been explored for visual representation learning. Unlike intra-video invariance, ground-truth labels of cross-video relation is usually unavailable without human labors. In this paper, we propose a novel contrastive learning method which explores the cross-video relation by using cycle-consistency for general image representation learning. This allows to collect positive sample pairs across different video instances, which we hypothesize will lead to higher-level semantics. We validate our method by transferring our image representation to multiple downstream tasks including visual object tracking, image classification, and action recognition. We show significant improvement over state-of-the-art contrastive learning methods. 
Project page is available at https://happywu.github.io/cycle_contrast_video.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Contrastive_Learning_of_Image_Representations_With_Cross-Video_Cycle-Consistency_ICCV_2021_paper.pdf", @@ -7433,7 +7938,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Haiping and Wang,\n Xiaolong\n},\n title = {\n Contrastive Learning of Image Representations With Cross-Video Cycle-Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10149-10159\n} \n}" }, { "title": "Contrastive Multimodal Fusion With TupleInfoNCE", @@ -7441,6 +7947,7 @@ "status": "Poster", "track": "main", "pid": 6784, + "author_site": "Yunze Liu; Qingnan Fan; Shanghang Zhang; Hao Dong; Thomas Funkhouser; Li Yi", "author": "Yunze Liu; Qingnan Fan; Shanghang Zhang; Hao Dong; Thomas Funkhouser; Li Yi", "abstract": "This paper proposes a method for representation learning of multimodal data using contrastive losses. A traditional approach is to contrast different modalities to learn the information shared between them. However, that approach could fail to learn the complementary synergies between modalities that might be useful for downstream tasks. Another approach is to concatenate all the modalities into a tuple and then contrast positive and negative tuple correspondences. However, that approach could consider only the stronger modalities while ignoring the weaker ones. To address these issues, we propose a novel contrastive learning objective, TupleInfoNCE. 
It contrasts tuples based not only on positive and negative correspondences, but also by composing new negative tuples using modalities describing different scenes. Training with these additional negatives encourages the learning model to examine the correspondences among modalities in the same tuple, ensuring that weak modalities are not ignored. We provide a theoretical justification based on mutual-information for why this approach works, and we propose a sample optimization algorithm to generate positive and negative samples to maximize training efficacy. We find that TupleInfoNCE significantly outperforms previous state of the arts on three different downstream tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Contrastive_Multimodal_Fusion_With_TupleInfoNCE_ICCV_2021_paper.pdf", @@ -7464,7 +7971,8 @@ "aff_campus_unique_index": "1;2;3;4;4", "aff_campus_unique": ";Stanford;Berkeley;Beijing;Mountain View", "aff_country_unique_index": "0;1;1;0+0;1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yunze and Fan,\n Qingnan and Zhang,\n Shanghang and Dong,\n Hao and Funkhouser,\n Thomas and Yi,\n Li\n},\n title = {\n Contrastive Multimodal Fusion With TupleInfoNCE\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 754-763\n} \n}" }, { "title": "Cortical Surface Shape Analysis Based on Alexandrov Polyhedra", @@ -7472,6 +7980,7 @@ "status": "Poster", "track": "main", "pid": 10627, + "author_site": "Min Zhang; Yang Guo; Na Lei; Zhou Zhao; Jianfeng Wu; Xiaoyin Xu; Yalin Wang; Xianfeng Gu", "author": "Min Zhang; Yang Guo; Na Lei; Zhou Zhao; Jianfeng Wu; Xiaoyin Xu; Yalin Wang; Xianfeng Gu", "abstract": "Shape analysis has been playing an important role in early diagnosis and prognosis of neurodegenerative diseases such 
as Alzheimer's diseases (AD). However, obtaining effective shape representations remains challenging. This paper proposes to use the Alexandrov polyhedra as surface-based shape signatures for cortical morphometry analysis. Given a closed genus-0 surface, its Alexandrov polyhedron is a convex representation that encodes its intrinsic geometry information. We propose to compute the polyhedra via a novel spherical optimal transport (OT) computation. In our experiments, we observe that the Alexandrov polyhedra of cortical surfaces between pathology-confirmed AD and cognitively unimpaired individuals are significantly different. Moreover, we propose a visualization method by comparing local geometry differences across cortical surfaces. We show that the proposed method is effective in pinpointing regional cortical structural changes impacted by AD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Cortical_Surface_Shape_Analysis_Based_on_Alexandrov_Polyhedra_ICCV_2021_paper.pdf", @@ -7486,7 +7995,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Cortical_Surface_Shape_Analysis_Based_on_Alexandrov_Polyhedra_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Cortical_Surface_Shape_Analysis_Based_on_Alexandrov_Polyhedra_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Min and Guo,\n Yang and Lei,\n Na and Zhao,\n Zhou and Wu,\n Jianfeng and Xu,\n Xiaoyin and Wang,\n Yalin and Gu,\n Xianfeng\n},\n title = {\n Cortical Surface Shape Analysis Based on Alexandrov Polyhedra\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14244-14252\n} \n}" }, { "title": "Counterfactual Attention Learning for Fine-Grained Visual Categorization and Re-Identification", @@ -7494,6 +8004,7 @@ 
"status": "Poster", "track": "main", "pid": 7385, + "author_site": "Yongming Rao; Guangyi Chen; Jiwen Lu; Jie Zhou", "author": "Yongming Rao; Guangyi Chen; Jiwen Lu; Jie Zhou", "abstract": "Attention mechanism has demonstrated great potential in fine-grained visual recognition tasks. In this paper, we present a counterfactual attention learning method to learn more effective attention based on causal inference. Unlike most existing methods that learn visual attention based on conventional likelihood, we propose to learn the attention with counterfactual causality, which provides a tool to measure the attention quality and a powerful supervisory signal to guide the learning process. Specifically, we analyze the effect of the learned visual attention on network prediction through counterfactual intervention and maximize the effect to encourage the network to learn more useful attention for fine-grained image recognition. Empirically, we evaluate our method on a wide range of fine-grained visual recognition tasks where attention plays a crucial role, including fine-grained image categorization, person re-identification, and vehicle re-identification. 
The consistent improvement on all benchmarks demonstrates the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rao_Counterfactual_Attention_Learning_for_Fine-Grained_Visual_Categorization_and_Re-Identification_ICCV_2021_paper.pdf", @@ -7517,7 +8028,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Rao_2021_ICCV,\n \n author = {\n Rao,\n Yongming and Chen,\n Guangyi and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Counterfactual Attention Learning for Fine-Grained Visual Categorization and Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1025-1034\n} \n}" }, { "title": "CrackFormer: Transformer Network for Fine-Grained Crack Detection", @@ -7525,6 +8037,7 @@ "status": "Poster", "track": "main", "pid": 8719, + "author_site": "Huajun Liu; Xiangyu Miao; Christoph Mertz; Chengzhong Xu; Hui Kong", "author": "Huajun Liu; Xiangyu Miao; Christoph Mertz; Chengzhong Xu; Hui Kong", "abstract": "Cracks are irregular line structures that are of interest in many computer vision applications. Crack detection (e.g., from pavement images) is a challenging task due to intensity in-homogeneity, topology complexity, low contrast and noisy background. The overall crack detection accuracy can be significantly affected by the detection performance on fine-grained cracks. In this work, we propose a Crack Transformer network (CrackFormer) for fine-grained crack detection. The CrackFormer is composed of novel attention modules in a SegNet-like encoder-decoder architecture. 
Specifically, it consists of novel self-attention modules with 1x1 convolutional kernels for efficient contextual information extraction across feature-channels, and efficient positional embedding to capture large receptive field contextual information for long range interactions. It also introduces new scaling-attention modules to combine outputs from the corresponding encoder and decoder blocks to suppress non-semantic features and sharpen semantic cracks. The CrackFormer is trained and evaluated on three classical crack datasets. The experimental results show that CrackFormer achieves ODS values of 0.871, 0.877 and 0.881, respectively, on the three datasets and outperforms the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_CrackFormer_Transformer_Network_for_Fine-Grained_Crack_Detection_ICCV_2021_paper.pdf", @@ -7548,7 +8061,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Macau SAR", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Huajun and Miao,\n Xiangyu and Mertz,\n Christoph and Xu,\n Chengzhong and Kong,\n Hui\n},\n title = {\n CrackFormer: Transformer Network for Fine-Grained Crack Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3783-3792\n} \n}" }, { "title": "Cross-Camera Convolutional Color Constancy", @@ -7556,6 +8070,7 @@ "status": "Poster", "track": "main", "pid": 1277, + "author_site": "Mahmoud Afifi; Jonathan T. Barron; Chloe LeGendre; Yun-Ta Tsai; Francois Bleibel", "author": "Mahmoud Afifi; Jonathan T. 
Barron; Chloe LeGendre; Yun-Ta Tsai; Francois Bleibel", "abstract": "We present \"Cross-Camera Convolutional Color Constancy\" (C5), a learning-based method, trained on images from multiple cameras, that accurately estimates a scene's illuminant color from raw images captured by a new camera previously unseen during training. C5 is a hypernetwork-like extension of the convolutional color constancy (CCC) approach: C5 learns to generate the weights of a CCC model that is then evaluated on the input image, with the CCC weights dynamically adapted to different input content. Unlike prior cross-camera color constancy models, which are usually designed to be agnostic to the spectral properties of test-set images from unobserved cameras, C5 approaches this problem through the lens of transductive inference: additional unlabeled images are provided as input to the model at test time, which allows the model to calibrate itself to the spectral properties of the test-set camera during inference. C5 achieves state-of-the-art accuracy for cross-camera color constancy on several datasets, is fast to evaluate ( 7 and 90 ms per image on a GPU or CPU, respectively), and requires little memory ( 2 MB), and thus is a practical solution to the problem of calibration-free automatic white balance for mobile photography.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Afifi_Cross-Camera_Convolutional_Color_Constancy_ICCV_2021_paper.pdf", @@ -7579,7 +8094,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;0;0;0;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Afifi_2021_ICCV,\n \n author = {\n Afifi,\n Mahmoud and Barron,\n Jonathan T. 
and LeGendre,\n Chloe and Tsai,\n Yun-Ta and Bleibel,\n Francois\n},\n title = {\n Cross-Camera Convolutional Color Constancy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1981-1990\n} \n}" }, { "title": "Cross-Category Video Highlight Detection via Set-Based Learning", @@ -7587,6 +8103,7 @@ "status": "Poster", "track": "main", "pid": 2902, + "author_site": "Minghao Xu; Hang Wang; Bingbing Ni; Riheng Zhu; Zhenbang Sun; Changhu Wang", "author": "Minghao Xu; Hang Wang; Bingbing Ni; Riheng Zhu; Zhenbang Sun; Changhu Wang", "abstract": "Autonomous highlight detection is crucial for enhancing the efficiency of video browsing on social media platforms. To attain this goal in a data-driven way, one may often face the situation where highlight annotations are not available on the target video category used in practice, while the supervision on another video category (named as source video category) is achievable. In such a situation, one can derive an effective highlight detector on target video category by transferring the highlight knowledge acquired from source video category to the target one. We call this problem cross-category video highlight detection, which has been rarely studied in previous works. For tackling such practical problem, we propose a Dual-Learner-based Video Highlight Detection (DL-VHD) framework. Under this framework, we first design a Set-based Learning module (SL-module) to improve the conventional pair-based learning by assessing the highlight extent of a video segment under a broader context. Based on such learning manner, we introduce two different learners to acquire the basic distinction of target category videos and the characteristics of highlight moments on source video category, respectively. These two types of highlight knowledge are further consolidated via knowledge distillation. 
Extensive experiments on three benchmark datasets demonstrate the superiority of the proposed SL-module, and the DL-VHD method outperforms five typical Unsupervised Domain Adaptation (UDA) algorithms on various cross-category highlight detection tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Cross-Category_Video_Highlight_Detection_via_Set-Based_Learning_ICCV_2021_paper.pdf", @@ -7610,7 +8127,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Minghao and Wang,\n Hang and Ni,\n Bingbing and Zhu,\n Riheng and Sun,\n Zhenbang and Wang,\n Changhu\n},\n title = {\n Cross-Category Video Highlight Detection via Set-Based Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7970-7979\n} \n}" }, { "title": "Cross-Descriptor Visual Localization and Mapping", @@ -7618,7 +8136,8 @@ "status": "Poster", "track": "main", "pid": 3246, - "author": "Mihai Dusmanu; Ondrej Miksik; Johannes L. Sch\u00f6nberger; Marc Pollefeys", + "author_site": "Mihai Dusmanu; Ondrej Miksik; Johannes L. Schönberger; Marc Pollefeys", + "author": "Mihai Dusmanu; Ondrej Miksik; Johannes L. Schönberger; Marc Pollefeys", "abstract": "Visual localization and mapping is the key technology underlying the majority of mixed reality and robotics systems. Most state-of-the-art approaches rely on local features to establish correspondences between images. In this paper, we present three novel scenarios for localization and mapping which require the continuous update of feature representations and the ability to match across different feature types. 
While localization and mapping is a fundamental computer vision problem, the traditional setup supposes the same local features are used throughout the evolution of a map. Thus, whenever the underlying features are changed, the whole process is repeated from scratch. However, this is typically impossible in practice, because raw images are often not stored and re-building the maps could lead to loss of the attached digital content. To overcome the limitations of current approaches, we present the first principled solution to cross-descriptor localization and mapping. Our data-driven approach is agnostic to the feature descriptor type, has low computational requirements, and scales linearly with the number of description algorithms. Extensive experiments demonstrate the effectiveness of our approach on state-of-the-art benchmarks for a variety of handcrafted and learned features.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dusmanu_Cross-Descriptor_Visual_Localization_and_Mapping_ICCV_2021_paper.pdf", "aff": ";;;", @@ -7632,7 +8151,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dusmanu_Cross-Descriptor_Visual_Localization_and_Mapping_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dusmanu_Cross-Descriptor_Visual_Localization_and_Mapping_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Dusmanu_2021_ICCV,\n \n author = {\n Dusmanu,\n Mihai and Miksik,\n Ondrej and Sch\\"onberger,\n Johannes L. 
and Pollefeys,\n Marc\n},\n title = {\n Cross-Descriptor Visual Localization and Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6058-6067\n} \n}" }, { "title": "Cross-Encoder for Unsupervised Gaze Representation Learning", @@ -7640,6 +8160,7 @@ "status": "Poster", "track": "main", "pid": 9541, + "author_site": "Yunjia Sun; Jiabei Zeng; Shiguang Shan; Xilin Chen", "author": "Yunjia Sun; Jiabei Zeng; Shiguang Shan; Xilin Chen", "abstract": "In order to train 3D gaze estimators without too many annotations, we propose an unsupervised learning framework, Cross-Encoder, to leverage the unlabeled data to learn suitable representation for gaze estimation. To address the issue that the feature of gaze is always intertwined with the appearance of the eye, Cross-Encoder disentangles the features using a latent-code-swapping mechanism on eye-consistent image pairs and gaze-similar ones. Specifically, each image is encoded as a gaze feature and an eye feature. Cross-Encoder is trained to reconstruct each image in the eye-consistent pair according to its gaze feature and the other's eye feature, but to reconstruct each image in the gaze-similar pair according to its eye feature and the other's gaze feature. Experimental results show the validity of our work. First, using the Cross-Encoder-learned gaze representation, the gaze estimator trained with very few samples outperforms the ones using other unsupervised learning methods, under both within-dataset and cross-dataset protocol. Second, ResNet18 pretrained by Cross-Encoder is competitive with state-of-the-art gaze estimation methods. 
Third, ablation study shows that Cross-Encoder disentangles the gaze feature and eye feature.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Cross-Encoder_for_Unsupervised_Gaze_Representation_Learning_ICCV_2021_paper.pdf", @@ -7663,7 +8184,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Yunjia and Zeng,\n Jiabei and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n Cross-Encoder for Unsupervised Gaze Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3702-3711\n} \n}" }, { "title": "Cross-Modality Person Re-Identification via Modality Confusion and Center Aggregation", @@ -7671,6 +8193,7 @@ "status": "Poster", "track": "main", "pid": 4170, + "author_site": "Xin Hao; Sanyuan Zhao; Mang Ye; Jianbing Shen", "author": "Xin Hao; Sanyuan Zhao; Mang Ye; Jianbing Shen", "abstract": "Cross-modality person re-identification is a challenging task due to large cross-modality discrepancy and intra-modality variations. Currently, most existing methods focus on learning modality-specific or modality-shareable features by using the identity supervision or modality label. Different from existing methods, this paper presents a novel Modality Confusion Learning Network (MCLNet). Its basic idea is to confuse two modalities, ensuring that the optimization is explicitly concentrated on the modality-irrelevant perspective. Specifically, MCLNet is designed to learn modality-invariant features by simultaneously minimizing inter-modality discrepancy while maximizing cross-modality similarity among instances in a single framework. 
Furthermore, an identity-aware marginal center aggregation strategy is introduced to extract the centralization features, while keeping diversity with a marginal constraint. Finally, we design a camera-aware learning scheme to enrich the discriminability. Extensive experiments on SYSU-MM01 and RegDB datasets show that MCLNet outperforms the state-of-the-art by a large margin. On the large-scale SYSU-MM01 dataset, our model can achieve 65.40% and 61.98% in terms of Rank-1 accuracy and mAP value.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hao_Cross-Modality_Person_Re-Identification_via_Modality_Confusion_and_Center_Aggregation_ICCV_2021_paper.pdf", @@ -7694,7 +8217,8 @@ "aff_campus_unique_index": "0;0;1;2", "aff_campus_unique": "Beijing;Wuhan;Macau", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hao_2021_ICCV,\n \n author = {\n Hao,\n Xin and Zhao,\n Sanyuan and Ye,\n Mang and Shen,\n Jianbing\n},\n title = {\n Cross-Modality Person Re-Identification via Modality Confusion and Center Aggregation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16403-16412\n} \n}" }, { "title": "Cross-Patch Graph Convolutional Network for Image Denoising", @@ -7702,6 +8226,7 @@ "status": "Poster", "track": "main", "pid": 10041, + "author_site": "Yao Li; Xueyang Fu; Zheng-Jun Zha", "author": "Yao Li; Xueyang Fu; Zheng-Jun Zha", "abstract": "Recently, deep learning-based image denoising methods have achieved significant improvements over traditional methods. Due to the hardware limitation, most deep learning-based image denoising methods utilize cropped small patches to train a convolutional neural network to infer the clean images. 
However, the real noisy images in practical are mostly of high resolution rather than the cropped small patches and the vanilla training strategies ignore the cross-patch contextual dependency in the whole image. In this paper, we propose Cross-Patch Net (CPNet), which is the first deep- learning-based real image denoising method for HR (high resolution) input. Furthermore, we design a novel loss guided by the noise level map to obtain better performance. Compared with the vanilla patch-based training strategies, our approach effectively exploits the cross-patch contextual dependency. effective method to generate realistic sRGB noisy images from their corresponding clean sRGB images for denoiser training. Denoising experiments on real-world sRGB images show the effectiveness of the proposed method. More importantly, our method achieves state-of-the-art performance on practical sRGB noisy image denoising.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Cross-Patch_Graph_Convolutional_Network_for_Image_Denoising_ICCV_2021_paper.pdf", @@ -7725,7 +8250,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yao and Fu,\n Xueyang and Zha,\n Zheng-Jun\n},\n title = {\n Cross-Patch Graph Convolutional Network for Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4651-4660\n} \n}" }, { "title": "Cross-Sentence Temporal and Semantic Relations in Video Activity Localisation", @@ -7733,6 +8259,7 @@ "status": "Poster", "track": "main", "pid": 1623, + "author_site": "Jiabo Huang; Yang Liu; Shaogang Gong; Hailin Jin", "author": "Jiabo Huang; Yang Liu; Shaogang Gong; Hailin Jin", "abstract": "Video activity localisation has recently attained increasing 
attention due to its practical values in automatically localising the most salient visual segments corresponding to their language descriptions (sentences) from untrimmed and unstructured videos. For supervised model training, a temporal annotation of both the start and end time index of each video segment for a sentence (a video moment) must be given. This is not only very expensive but also sensitive to ambiguity and subjective annotation bias, a much harder task than image labelling. In this work, we develop a more accurate weakly-supervised solution by introducing Cross-Sentence Relations Mining (CRM) in video moment proposal generation and matching when only a paragraph description of activities without per-sentence temporal annotation is available. Specifically, we explore two cross-sentence relational constraints: (1) Temporal ordering and (2) semantic consistency among sentences in a paragraph description of video activities. Existing weakly-supervised techniques only consider within-sentence video segment correlations in training without considering cross-sentence paragraph context. This can mislead due to ambiguous expressions of individual sentences with visually indiscriminate video moment proposals in isolation. 
Experiments on two publicly available activity localisation datasets show the advantages of our approach over the state-of-the-art weakly supervised methods, especially so when the video activity descriptions become more complex.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Cross-Sentence_Temporal_and_Semantic_Relations_in_Video_Activity_Localisation_ICCV_2021_paper.pdf", @@ -7756,7 +8283,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1;0;2", - "aff_country_unique": "United Kingdom;China;United States" + "aff_country_unique": "United Kingdom;China;United States", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Jiabo and Liu,\n Yang and Gong,\n Shaogang and Jin,\n Hailin\n},\n title = {\n Cross-Sentence Temporal and Semantic Relations in Video Activity Localisation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7199-7208\n} \n}" }, { "title": "CrossCLR: Cross-Modal Contrastive Learning for Multi-Modal Video Representations", @@ -7764,6 +8292,7 @@ "status": "Poster", "track": "main", "pid": 10625, + "author_site": "Mohammadreza Zolfaghari; Yi Zhu; Peter Gehler; Thomas Brox", "author": "Mohammadreza Zolfaghari; Yi Zhu; Peter Gehler; Thomas Brox", "abstract": "Contrastive learning allows us to flexibly define powerful losses by contrasting positive pairs from sets of negative samples. Recently, the principle has also been used to learn cross-modal embeddings for video and text, yet without exploiting its full potential. In particular, previous losses do not take the intra-modality similarities into account, which leads to inefficient embeddings, as the same content is mapped to multiple points in the embedding space. With CrossCLR, we present a contrastive loss that fixes this issue. 
Moreover, we define sets of highly related samples in terms of their input embeddings and exclude them from the negative samples to avoid issues with false negatives. We show that these principles consistently improve the quality of the learned embeddings. The joint embeddings learned with CrossCLR extend the state of the art in video-text retrieval on Youcook2 and LSMDC datasets and in video captioning on the Youcook2 dataset by a large margin. We also demonstrate the generality of the concept by learning improved joint embeddings for other pairs of modalities.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zolfaghari_CrossCLR_Cross-Modal_Contrastive_Learning_for_Multi-Modal_Video_Representations_ICCV_2021_paper.pdf", @@ -7780,14 +8309,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zolfaghari_CrossCLR_Cross-Modal_Contrastive_Learning_for_Multi-Modal_Video_Representations_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "University of Freiburg;Amazon", - "aff_unique_dep": ";Amazon.com, Inc.", + "aff_unique_norm": "University of Freiburg;Amazon.com, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-freiburg.de;https://www.amazon.com", "aff_unique_abbr": "UoF;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Zolfaghari_2021_ICCV,\n \n author = {\n Zolfaghari,\n Mohammadreza and Zhu,\n Yi and Gehler,\n Peter and Brox,\n Thomas\n},\n title = {\n CrossCLR: Cross-Modal Contrastive Learning for Multi-Modal Video Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1450-1459\n} \n}" }, { "title": "CrossDet: Crossline Representation for Object Detection", @@ -7795,6 +8325,7 @@ 
"status": "Poster", "track": "main", "pid": 7772, + "author_site": "Heqian Qiu; Hongliang Li; Qingbo Wu; Jianhua Cui; Zichen Song; Lanxiao Wang; Minjian Zhang", "author": "Heqian Qiu; Hongliang Li; Qingbo Wu; Jianhua Cui; Zichen Song; Lanxiao Wang; Minjian Zhang", "abstract": "Object detection aims to accurately locate and classify objects in an image, which requires precise object representations. Existing methods usually use rectangular anchor boxes or a set of points to represent objects. However, these methods either introduce background noise or miss the continuous appearance information inside the object, and thus cause incorrect detection results. In this paper, we propose a novel anchor-free object detection network, called CrossDet, which uses a set of growing cross lines along horizontal and vertical axes as object representations. An object can be flexibly represented as cross lines in different combinations. It not only can effectively reduce the interference of noise, but also takes into account the continuous object information, which is useful to enhance the discriminability of object features and find the object boundaries. Based on the learned cross lines, we propose a crossline extraction module to adaptively capture features of cross lines. Furthermore, we design a decoupled regression mechanism to regress the localization along the horizontal and vertical directions respectively, which helps to decrease the optimization difficulty because the optimization space is limited to a specific direction. Our method achieves consistently improvement on the PASCAL VOC and MS-COCO datasets. 
The experiment results demonstrate the effectiveness of our proposed method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qiu_CrossDet_Crossline_Representation_for_Object_Detection_ICCV_2021_paper.pdf", @@ -7818,7 +8349,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qiu_2021_ICCV,\n \n author = {\n Qiu,\n Heqian and Li,\n Hongliang and Wu,\n Qingbo and Cui,\n Jianhua and Song,\n Zichen and Wang,\n Lanxiao and Zhang,\n Minjian\n},\n title = {\n CrossDet: Crossline Representation for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3195-3204\n} \n}" }, { "title": "CrossNorm and SelfNorm for Generalization Under Distribution Shifts", @@ -7826,6 +8358,7 @@ "status": "Poster", "track": "main", "pid": 1276, + "author_site": "Zhiqiang Tang; Yunhe Gao; Yi Zhu; Zhi Zhang; Mu Li; Dimitris N. Metaxas", "author": "Zhiqiang Tang; Yunhe Gao; Yi Zhu; Zhi Zhang; Mu Li; Dimitris N. Metaxas", "abstract": "Traditional normalization techniques (e.g., Batch Normalization and Instance Normalization) generally and simplistically assume that training and test data follow the same distribution. As distribution shifts are inevitable in real-world applications, well-trained models with previous normalization methods can perform badly in new environments. Can we develop new normalization methods to improve generalization robustness under distribution shifts? In this paper, we answer the question by proposing CrossNorm and SelfNorm. CrossNorm exchanges channel-wise mean and variance between feature maps to enlarge training distribution, while SelfNorm uses attention to recalibrate the statistics to bridge gaps between training and test distributions. 
CrossNorm and SelfNorm can complement each other, though exploring different directions in statistics usage. Extensive experiments on different fields (vision and language), tasks (classification and segmentation), settings (supervised and semi-supervised), and distribution shift types (synthetic and natural) show the effectiveness. Code is available at https://github.com/amazon-research/crossnorm-selfnorm", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tang_CrossNorm_and_SelfNorm_for_Generalization_Under_Distribution_Shifts_ICCV_2021_paper.pdf", @@ -7842,14 +8375,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tang_CrossNorm_and_SelfNorm_for_Generalization_Under_Distribution_Shifts_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;0;1", - "aff_unique_norm": "Amazon;Rutgers University", - "aff_unique_dep": "Amazon Web Services;", + "aff_unique_norm": "Amazon Web Services;Rutgers University", + "aff_unique_dep": ";", "aff_unique_url": "https://aws.amazon.com;https://www.rutgers.edu", "aff_unique_abbr": "AWS;Rutgers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tang_2021_ICCV,\n \n author = {\n Tang,\n Zhiqiang and Gao,\n Yunhe and Zhu,\n Yi and Zhang,\n Zhi and Li,\n Mu and Metaxas,\n Dimitris N.\n},\n title = {\n CrossNorm and SelfNorm for Generalization Under Distribution Shifts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 52-61\n} \n}" }, { "title": "CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification", @@ -7857,6 +8391,7 @@ "status": "Poster", "track": "main", "pid": 2112, + "author_site": "Chun-Fu (Richard) Chen; Quanfu Fan; Rameswar Panda", "author": "Chun-Fu (Richard) Chen; Quanfu Fan; Rameswar 
Panda", "abstract": "The recently developed vision transformer (ViT) has achieved promising results on image classification compared to convolutional neural networks. Inspired by this, in this paper, we study how to learn multi-scale feature representations in transformer models for image classification. To this end, we propose a dual-branch transformer to combine image patches (i.e., tokens in a transformer) of different sizes to produce stronger image features. Our approach processes small-patch and large-patch tokens with two separate branches of different computational complexity and these tokens are then fused purely by attention multiple times to complement each other. Furthermore, to reduce computation, we develop a simple yet effective token fusion module based on cross attention, which uses a single token for each branch as a query to exchange information with other branches. Our proposed cross-attention only requires linear time for both computational and memory complexity instead of quadratic time otherwise. Extensive experiments demonstrate that our approach performs better than or on par with several concurrent works on vision transformer, in addition to efficient CNN models. For example, on the ImageNet1K dataset, with some architectural changes, our approach outperforms the recent DeiT by a large margin of 2% with a small to moderate increase in FLOPs and model parameters. 
Our source codes and models are available at https://github.com/IBM/CrossViT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_CrossViT_Cross-Attention_Multi-Scale_Vision_Transformer_for_Image_Classification_ICCV_2021_paper.pdf", @@ -7880,7 +8415,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Chun-Fu (Richard) and Fan,\n Quanfu and Panda,\n Rameswar\n},\n title = {\n CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 357-366\n} \n}" }, { "title": "Crossover Learning for Fast Online Video Instance Segmentation", @@ -7888,6 +8424,7 @@ "status": "Poster", "track": "main", "pid": 1345, + "author_site": "Shusheng Yang; Yuxin Fang; Xinggang Wang; Yu Li; Chen Fang; Ying Shan; Bin Feng; Wenyu Liu", "author": "Shusheng Yang; Yuxin Fang; Xinggang Wang; Yu Li; Chen Fang; Ying Shan; Bin Feng; Wenyu Liu", "abstract": "Modeling temporal visual context across frames is critical for video instance segmentation (VIS) and other video understanding tasks. In this paper, we propose a fast online VIS model termed CrossVIS. For temporal information modeling in VIS, we present a novel crossover learning scheme that uses the instance feature in the current frame to pixel-wisely localize the same instance in other frames. Different from previous schemes, crossover learning does not require any additional network parameters for feature enhancement. By integrating with the instance segmentation loss, crossover learning enables efficient cross-frame instance-to-pixel relation learning and brings cost-free improvement during inference. 
Besides, a global balanced instance embedding branch is proposed for better and more stable online instance association. We conduct extensive experiments on three challenging VIS benchmarks, i.e., YouTube-VIS-2019, OVIS, and YouTube-VIS-2021 to evaluate our methods. CrossVIS achieves state-of-the-art online VIS performance and shows a decent trade-off between latency and accuracy. Code is available at https://github.com/hustvl/CrossVIS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Crossover_Learning_for_Fast_Online_Video_Instance_Segmentation_ICCV_2021_paper.pdf", @@ -7903,15 +8440,16 @@ "email": "hust.edu.cn;hust.edu.cn;hust.edu.cn;tencent.com;tencent.com;tencent.com;hust.edu.cn;hust.edu.cn", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Crossover_Learning_for_Fast_Online_Video_Instance_Segmentation_ICCV_2021_paper.html", - "aff_unique_index": "0+1;0;0;1;1;1;0;0", - "aff_unique_norm": "Huazhong University of Science & Technology;Tencent", - "aff_unique_dep": "School of EIC;Applied Research Center (ARC)", - "aff_unique_url": "http://www.hust.edu.cn;https://www.tencent.com", - "aff_unique_abbr": "HUST;Tencent ARC", + "aff_unique_index": "0+1;0;0;1;2;1;0;0", + "aff_unique_norm": "Huazhong University of Science & Technology;Tencent;Tencent Holdings Limited", + "aff_unique_dep": "School of EIC;Applied Research Center (ARC);", + "aff_unique_url": "http://www.hust.edu.cn;https://www.tencent.com;https://www.tencent.com", + "aff_unique_abbr": "HUST;Tencent ARC;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Shusheng and Fang,\n Yuxin and Wang,\n Xinggang and Li,\n Yu and Fang,\n Chen and Shan,\n Ying and Feng,\n Bin and Liu,\n Wenyu\n},\n title = {\n Crossover Learning for Fast Online Video Instance 
Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8043-8052\n} \n}" }, { "title": "Crowd Counting With Partial Annotations in an Image", @@ -7919,10 +8457,11 @@ "status": "Poster", "track": "main", "pid": 8883, + "author_site": "Yanyu Xu; Ziming Zhong; Dongze Lian; Jing Li; Zhengxin Li; Xinxing Xu; Shenghua Gao", "author": "Yanyu Xu; Ziming Zhong; Dongze Lian; Jing Li; Zhengxin Li; Xinxing Xu; Shenghua Gao", "abstract": "To fully leverage the data captured from different scenes with different view angles while reducing the annotation cost, this paper studies a novel crowd counting setting, i.e. only using partial annotations in each image as training data. Inspired by the repetitive patterns in the annotated and unannotated regions as well as the ones between them, we design a network with three components to tackle those unannotated regions: i) in an Unannotated Regions Characterization (URC) module, we employ a memory bank to only store the annotated features, which could help the visual features extracted from these annotated regions flow to these unannotated regions; ii) For each image, Feature Distribution Consistency (FDC) regularizes the feature distributions of annotated head and unannotated head regions to be consistent; iii) a Cross-regressor Consistency Regularization (CCR) module is designed to learn the visual features of unannotated regions in a self-supervised style. The experimental results validate the effectiveness of our proposed model under the partial annotation setting for several datasets, such as ShanghaiTech, UCF-CC-50, UCF-QNRF, NWPU-Crowd, and JHU-CROWD++. With only 10% annotated regions in each image, our proposed model achieves better performance than the recent methods and baselines under semi-supervised or active learning settings on all datasets. 
The code is https://github.com/svip-lab/CrwodCountingPAL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Crowd_Counting_With_Partial_Annotations_in_an_Image_ICCV_2021_paper.pdf", - "aff": "IHPC, A*STAR, Singapore; ShanghaiTech University, China; ShanghaiTech University, China; ShanghaiTech University, China; ShanghaiTech University, China; IHPC, A*STAR, Singapore; ShanghaiTech University, China+Shanghai Engineering Research Center of Intelligent Vision and Imaging, China+Shanghai Engineering Research Center of Energy Ef\ufb01cient and Custom AI IC, China", + "aff": "IHPC, A*STAR, Singapore; ShanghaiTech University, China; ShanghaiTech University, China; ShanghaiTech University, China; ShanghaiTech University, China; IHPC, A*STAR, Singapore; ShanghaiTech University, China+Shanghai Engineering Research Center of Intelligent Vision and Imaging, China+Shanghai Engineering Research Center of Energy Efficient and Custom AI IC, China", "project": "", "github": "https://github.com/svip-lab/CrwodCountingPAL", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Xu_Crowd_Counting_With_ICCV_2021_supplemental.pdf", @@ -7942,7 +8481,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;1+1+1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Yanyu and Zhong,\n Ziming and Lian,\n Dongze and Li,\n Jing and Li,\n Zhengxin and Xu,\n Xinxing and Gao,\n Shenghua\n},\n title = {\n Crowd Counting With Partial Annotations in an Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15570-15579\n} \n}" }, { "title": "CrowdDriven: A New Challenging Dataset for Outdoor Visual Localization", @@ -7950,7 +8490,8 @@ "status": "Poster", "track": "main", "pid": 9272, - "author": "Ara Jafarzadeh; 
Manuel L\u00f3pez Antequera; Pau Gargallo; Yubin Kuang; Carl Toft; Fredrik Kahl; Torsten Sattler", + "author_site": "Ara Jafarzadeh; Manuel López Antequera; Pau Gargallo; Yubin Kuang; Carl Toft; Fredrik Kahl; Torsten Sattler", + "author": "Ara Jafarzadeh; Manuel López Antequera; Pau Gargallo; Yubin Kuang; Carl Toft; Fredrik Kahl; Torsten Sattler", "abstract": "Visual localization is the problem of estimating the position and orientation from which a given image (or a sequence of images) is taken in a known scene. It is an important part of a wide range of computer vision and robotics applications, from self-driving cars to augmented/virtual reality systems. Visual localization techniques should work reliably and robustly under a wide range of conditions, including seasonal, weather, illumination and man-made changes. Recent benchmarking efforts model this by providing images under different conditions, and the community has made rapid progress on these datasets since their inception. However, they are limited to a few geographical regions and often recorded with a single device. We propose a new benchmark for visual localization in outdoor scenes, using crowd-sourced data to cover a wide range of geographical regions and camera devices with a focus on the failure cases of current algorithms. Experiments with state-of-the-art localization approaches show that our dataset is very challenging, with all evaluated methods failing on its hardest parts. 
As part of the dataset release, we provide the tooling used to generate it, enabling efficient and effective 2D correspondence annotation to obtain reference poses.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jafarzadeh_CrowdDriven_A_New_Challenging_Dataset_for_Outdoor_Visual_Localization_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -7964,7 +8505,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jafarzadeh_CrowdDriven_A_New_Challenging_Dataset_for_Outdoor_Visual_Localization_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jafarzadeh_CrowdDriven_A_New_Challenging_Dataset_for_Outdoor_Visual_Localization_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Jafarzadeh_2021_ICCV,\n \n author = {\n Jafarzadeh,\n Ara and Antequera,\n Manuel L\\'opez and Gargallo,\n Pau and Kuang,\n Yubin and Toft,\n Carl and Kahl,\n Fredrik and Sattler,\n Torsten\n},\n title = {\n CrowdDriven: A New Challenging Dataset for Outdoor Visual Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9845-9855\n} \n}" }, { "title": "CryoDRGN2: Ab Initio Neural Reconstruction of 3D Protein Structures From Real Cryo-EM Images", @@ -7972,6 +8514,7 @@ "status": "Poster", "track": "main", "pid": 10940, + "author_site": "Ellen D. Zhong; Adam Lerer; Joseph H. Davis; Bonnie Berger", "author": "Ellen D. Zhong; Adam Lerer; Joseph H. Davis; Bonnie Berger", "abstract": "Protein structure determination from cryo-EM data requires reconstructing a 3D volume (or distribution of volumes) from many noisy and randomly oriented 2D projection images. 
While the standard homogeneous reconstruction task aims to recover a single static structure, recently-proposed neural and non-neural methods can reconstruct distributions of structures, thereby enabling the study of protein complexes that possess intrinsic structural or conformational heterogeneity. These heterogeneous reconstruction methods, however, require fixed image poses, which are typically estimated from an upstream homogeneous reconstruction and are not guaranteed to be accurate under highly heterogeneous conditions. In this work we describe cryoDRGN2, an ab initio reconstruction algorithm, which can jointly estimate image poses and learn a neural model of a distribution of 3D structures on real heterogeneous cryo-EM data. To achieve this, we adapt search algorithms from the traditional cryo-EM literature, and describe the optimizations and design choices required to make such a search procedure computationally tractable in the neural model setting. We show that cryoDRGN2 is robust to the high noise levels of real cryo-EM images, trains faster than earlier neural methods, and achieves state-of-the-art performance on real cryo-EM datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhong_CryoDRGN2_Ab_Initio_Neural_Reconstruction_of_3D_Protein_Structures_From_ICCV_2021_paper.pdf", @@ -7988,14 +8531,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhong_CryoDRGN2_Ab_Initio_Neural_Reconstruction_of_3D_Protein_Structures_From_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "Massachusetts Institute of Technology;Meta", + "aff_unique_norm": "Massachusetts Institute of Technology;Facebook", "aff_unique_dep": ";Facebook AI", "aff_unique_url": "https://web.mit.edu;https://www.facebook.com", "aff_unique_abbr": "MIT;Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": 
"United States", + "bibtex": "@InProceedings{Zhong_2021_ICCV,\n \n author = {\n Zhong,\n Ellen D. and Lerer,\n Adam and Davis,\n Joseph H. and Berger,\n Bonnie\n},\n title = {\n CryoDRGN2: Ab Initio Neural Reconstruction of 3D Protein Structures From Real Cryo-EM Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4066-4075\n} \n}" }, { "title": "Curious Representation Learning for Embodied Intelligence", @@ -8003,6 +8547,7 @@ "status": "Poster", "track": "main", "pid": 3922, + "author_site": "Yilun Du; Chuang Gan; Phillip Isola", "author": "Yilun Du; Chuang Gan; Phillip Isola", "abstract": "Self-supervised visual representation learning has achieved remarkable success in recent years. By subverting the need for supervised labels, such approaches are able to utilize the numerous unlabeled images that exist on the Internet and in photographic datasets. Yet to build truly intelligent agents, we must construct representation learning algorithms that can learn not only from datasets but also learn in environments. An agent in a natural environment will not typically be fed curated data. Instead, it must explore its environment to acquire the data it will learn from. We propose a framework, curious representation learning (CRL), which jointly learns a reinforcement learning policy and a visual representation model. The policy is trained to maximize the error of the representation learner, and in doing so is incentivized to explore its environment. At the same time, the learned representation becomes stronger and stronger as the policy feeds it ever harder data to learn from. Our learned embodied representations enable promising transfer to downstream embodied semantic and language-guided navigation, performing better or comparable to ImageNet pretraining without using any supervision at all. 
In addition, despite being trained in simulation, our learned representations can obtain interpretable results on real images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Du_Curious_Representation_Learning_for_Embodied_Intelligence_ICCV_2021_paper.pdf", @@ -8026,7 +8571,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Du_2021_ICCV,\n \n author = {\n Du,\n Yilun and Gan,\n Chuang and Isola,\n Phillip\n},\n title = {\n Curious Representation Learning for Embodied Intelligence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10408-10417\n} \n}" }, { "title": "Curvature Generation in Curved Spaces for Few-Shot Learning", @@ -8034,6 +8580,7 @@ "status": "Poster", "track": "main", "pid": 6460, + "author_site": "Zhi Gao; Yuwei Wu; Yunde Jia; Mehrtash Harandi", "author": "Zhi Gao; Yuwei Wu; Yunde Jia; Mehrtash Harandi", "abstract": "Few-shot learning describes the challenging problem of recognizing samples from unseen classes given very few labeled examples. In many cases, few-shot learning is cast as learning an embedding space that assigns test samples to their corresponding class prototypes. Previous methods assume that data of all few-shot learning tasks comply with a fixed geometrical structure, mostly a Euclidean structure. Questioning this assumption that is clearly difficult to hold in real-world scenarios and incurs distortions to data, we propose to learn a task-aware curved embedding space by making use of the hyperbolic geometry. As a result, task-specific embedding spaces where suitable curvatures are generated to match the characteristics of data are constructed, leading to more generic embedding spaces. 
We then leverage on intra-class and inter-class context information in the embedding space to generate class prototypes for discriminative classification. We conduct a comprehensive set of experiments on inductive and transductive few-shot learning, demonstrating the benefits of our proposed method over existing embedding methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Curvature_Generation_in_Curved_Spaces_for_Few-Shot_Learning_ICCV_2021_paper.pdf", @@ -8057,7 +8604,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Zhi and Wu,\n Yuwei and Jia,\n Yunde and Harandi,\n Mehrtash\n},\n title = {\n Curvature Generation in Curved Spaces for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8691-8700\n} \n}" }, { "title": "CvT: Introducing Convolutions to Vision Transformers", @@ -8065,6 +8613,7 @@ "status": "Poster", "track": "main", "pid": 2295, + "author_site": "Haiping Wu; Bin Xiao; Noel Codella; Mengchen Liu; Xiyang Dai; Lu Yuan; Lei Zhang", "author": "Haiping Wu; Bin Xiao; Noel Codella; Mengchen Liu; Xiyang Dai; Lu Yuan; Lei Zhang", "abstract": "We present in this paper a new architecture, named Convolutional vision Transformer (CvT), that improves Vision Transformer (ViT) in performance and efficiency by introducing convolutions into ViT to yield the best of both designs. This is accomplished through two primary modifications: a hierarchy of Transformers containing a new convolutional token embedding, and a convolutional Transformer block leveraging a convolutional projection. These changes introduce desirable properties of convolutional neural networks (CNNs) to the ViT architecture (i.e.
shift, scale, and distortion invariance) while maintaining the merits of Transformers (i.e. dynamic attention, global context, and better generalization). We validate CvT by conducting extensive experiments, showing that this approach achieves state-of-the-art performance over other Vision Transformers and ResNets on ImageNet-1k, with less parameters and lower FLOPs. In addition, performance gains are maintained when pretrained on larger datasets (e.g. ImageNet-22k) and fine-tuned to downstream tasks. Finally, our results show that the positional encoding, a crucial component in existing Vision Transformers, can be safely removed in our model, simplifying the design for higher resolution vision tasks. Code will be released at https://github.com/microsoft/CvT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_CvT_Introducing_Convolutions_to_Vision_Transformers_ICCV_2021_paper.pdf", @@ -8081,14 +8630,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_CvT_Introducing_Convolutions_to_Vision_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;1+2;1+2;1+2;1+2;1+2;1+2", - "aff_unique_norm": "McGill University;Microsoft;AI", + "aff_unique_norm": "McGill University;Microsoft Corporation;AI", "aff_unique_dep": ";Cloud Computing;", "aff_unique_url": "https://www.mcgill.ca;https://www.microsoft.com/en-us/cloud;", "aff_unique_abbr": "McGill;Microsoft;", "aff_campus_unique_index": ";;;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;1;1;1", - "aff_country_unique": "Canada;United States;" + "aff_country_unique": "Canada;United States;", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Haiping and Xiao,\n Bin and Codella,\n Noel and Liu,\n Mengchen and Dai,\n Xiyang and Yuan,\n Lu and Zhang,\n Lei\n},\n title = {\n CvT: Introducing Convolutions to Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2021\n},\n pages = {\n 22-31\n} \n}" }, { "title": "D2-Net: Weakly-Supervised Action Localization via Discriminative Embeddings and Denoised Activations", @@ -8096,10 +8646,11 @@ "status": "Poster", "track": "main", "pid": 3339, + "author_site": "Sanath Narayan; Hisham Cholakkal; Munawar Hayat; Fahad Shahbaz Khan; Ming-Hsuan Yang; Ling Shao", "author": "Sanath Narayan; Hisham Cholakkal; Munawar Hayat; Fahad Shahbaz Khan; Ming-Hsuan Yang; Ling Shao", "abstract": "This work proposes a weakly-supervised temporal action localization framework, called D2-Net, which strives to temporally localize actions using video-level supervision. Our main contribution is the introduction of a novel loss formulation, which jointly enhances the discriminability of latent embeddings and robustness of the output temporal class activations with respect to foreground-background noise caused by weak supervision. The proposed formulation comprises a discriminative and a denoising loss term for enhancing temporal action localization. The discriminative term incorporates a classification loss and utilizes a top-down attention mechanism to enhance the separability of latent foreground-background embeddings. The denoising loss term explicitly addresses the foreground-background noise in class activations by simultaneously maximizing intra-video and inter-video mutual information using a bottom-up attention mechanism. As a result, activations in the foreground regions are emphasized whereas those in the background regions are suppressed, thereby leading to more robust predictions. Comprehensive experiments are performed on multiple benchmarks, including THUMOS14 and ActivityNet1.2. Our D2-Net performs favorably in comparison to the existing methods on all datasets, achieving gains as high as 2.3% in terms of mAP at IoU=0.5 on THUMOS14. 
Source code is available at https://github.com/naraysa/D2-Net.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Narayan_D2-Net_Weakly-Supervised_Action_Localization_via_Discriminative_Embeddings_and_Denoised_Activations_ICCV_2021_paper.pdf", - "aff": "Inception Institute of Artificial Intelligence; Mohamed Bin Zayed University of AI; Monash University; Link\u00f6ping University; University of California, Merced+Google Research+Yonsei University; Inception Institute of Artificial Intelligence", + "aff": "Inception Institute of Artificial Intelligence; Mohamed Bin Zayed University of AI; Monash University; Linköping University; University of California, Merced+Google Research+Yonsei University; Inception Institute of Artificial Intelligence", "project": "", "github": "https://github.com/naraysa/D2-Net", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Narayan_D2-Net_Weakly-Supervised_Action_ICCV_2021_supplemental.zip", @@ -8112,14 +8663,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Narayan_D2-Net_Weakly-Supervised_Action_Localization_via_Discriminative_Embeddings_and_Denoised_Activations_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4+5+6;0", - "aff_unique_norm": "Inception Institute of Artificial Intelligence;Mohamed bin Zayed University of Artificial Intelligence;Monash University;Link\u00f6ping University;University of California, Merced;Google;Yonsei University", + "aff_unique_norm": "Inception Institute of Artificial Intelligence;Mohamed Bin Zayed University of Artificial Intelligence;Monash University;Linköping University;University of California, Merced;Google;Yonsei University", "aff_unique_dep": ";;;;;Google Research;", "aff_unique_url": "https://www.inceptioniai.org;https://mbzuai.ac.ae;https://www.monash.edu;https://www.liu.se;https://www.ucmerced.edu;https://research.google;https://www.yonsei.ac.kr", "aff_unique_abbr": ";MBZUAI;Monash;LiU;UC Merced;Google Research;Yonsei", 
"aff_campus_unique_index": "1+2", "aff_campus_unique": ";Merced;Mountain View", "aff_country_unique_index": "0;0;1;2;3+3+4;0", - "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States;South Korea" + "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States;South Korea", + "bibtex": "@InProceedings{Narayan_2021_ICCV,\n \n author = {\n Narayan,\n Sanath and Cholakkal,\n Hisham and Hayat,\n Munawar and Khan,\n Fahad Shahbaz and Yang,\n Ming-Hsuan and Shao,\n Ling\n},\n title = {\n D2-Net: Weakly-Supervised Action Localization via Discriminative Embeddings and Denoised Activations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13608-13617\n} \n}" }, { "title": "DAE-GAN: Dynamic Aspect-Aware GAN for Text-to-Image Synthesis", @@ -8127,6 +8679,7 @@ "status": "Poster", "track": "main", "pid": 4049, + "author_site": "Shulan Ruan; Yong Zhang; Kun Zhang; Yanbo Fan; Fan Tang; Qi Liu; Enhong Chen", "author": "Shulan Ruan; Yong Zhang; Kun Zhang; Yanbo Fan; Fan Tang; Qi Liu; Enhong Chen", "abstract": "Text-to-image synthesis refers to generating an image from a given text description, the key goal of which lies in photo realism and semantic consistency. Previous methods usually generate an initial image with sentence embedding and then refine it with fine-grained word embedding. Despite the significant progress, the 'aspect' information (e.g., red eyes) contained in the text, referring to several words rather than a word that depicts 'a particular part or feature of something', is often ignored, which is highly helpful for synthesizing image details. How to make better utilization of aspect information in text-to-image synthesis still remains an unresolved challenge. 
To address this problem, in this paper, we propose a Dynamic Aspect-awarE GAN (DAE-GAN) that represents text information comprehensively from multiple granularities, including sentence-level, word-level, and aspect-level. Moreover, inspired by human learning behaviors, we develop a novel Aspect-aware Dynamic Re-drawer (ADR) for image refinement, in which an Attended Global Refinement (AGR) module and an Aspect-aware Local Refinement (ALR) module are alternately employed. AGR utilizes word-level embedding to globally enhance the previously generated image, while ALR dynamically employs aspect-level embedding to refine image details from a local perspective. Finally, a corresponding matching loss function is designed to ensure the text-image semantic consistency at different levels. Extensive experiments on two well-studied and publicly available datasets (i.e., CUB-200 and COCO) demonstrate the superiority and rationality of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ruan_DAE-GAN_Dynamic_Aspect-Aware_GAN_for_Text-to-Image_Synthesis_ICCV_2021_paper.pdf", @@ -8145,12 +8698,13 @@ "aff_unique_index": "0;1;2;1;3;0;0", "aff_unique_norm": "University of Science and Technology of China;Tencent;Hefei University of Technology;Jilin University", "aff_unique_dep": "School of Computer Science and Technology;Tencent AI Lab;;", - "aff_unique_url": "http://www.ustc.edu.cn;https://ai.tencent.com;http://www.hfut.edu.cn/;http://www.jlu.edu.cn", + "aff_unique_url": "http://www.ustc.edu.cn;https://ai.tencent.com;http://www.hfut.edu.cn;http://www.jlu.edu.cn", "aff_unique_abbr": "USTC;Tencent AI Lab;HUT;JLU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ruan_2021_ICCV,\n \n author = {\n Ruan,\n Shulan and Zhang,\n Yong and Zhang,\n Kun and Fan,\n Yanbo and Tang,\n Fan and Liu,\n Qi and Chen,\n Enhong\n},\n 
title = {\n DAE-GAN: Dynamic Aspect-Aware GAN for Text-to-Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13960-13969\n} \n}" }, { "title": "DAM: Discrepancy Alignment Metric for Face Recognition", @@ -8158,6 +8712,7 @@ "status": "Poster", "track": "main", "pid": 2944, + "author_site": "Jiaheng Liu; Yudong Wu; Yichao Wu; Chuming Li; Xiaolin Hu; Ding Liang; Mengyu Wang", "author": "Jiaheng Liu; Yudong Wu; Yichao Wu; Chuming Li; Xiaolin Hu; Ding Liang; Mengyu Wang", "abstract": "The field of face recognition (FR) has witnessed remarkable progress with the surge of deep learning. The effective loss functions play an important role for FR. In this paper, we observe that a majority of loss functions, including the widespread triplet loss and softmax-based cross-entropy loss, embed inter-class (negative) similarity s_n and intra-class (positive) similarity s_p into similarity pairs and optimize to reduce (s_n - s_p) in the training process. However, in the verification process, existing metrics directly take the absolute similarity between two features as the confidence of belonging to the same identity, which inevitably causes a gap between the training and verification process. To bridge the gap, we propose a new metric called Discrepancy Alignment Metric (DAM) for verification, which introduces the Local Inter-class Discrepancy (LID) for each face image to normalize the absolute similarity score. To estimate the LID of each face image in the verification process, we propose two types of LID Estimation (LIDE) methods, which are reference-based and learning-based estimation methods, respectively. The proposed DAM is plug-and-play and can be easily applied to the most existing methods. 
Extensive experiments on multiple popular face recognition benchmark datasets demonstrate the effectiveness of our proposed method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_DAM_Discrepancy_Alignment_Metric_for_Face_Recognition_ICCV_2021_paper.pdf", @@ -8181,7 +8736,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Jiaheng and Wu,\n Yudong and Wu,\n Yichao and Li,\n Chuming and Hu,\n Xiaolin and Liang,\n Ding and Wang,\n Mengyu\n},\n title = {\n DAM: Discrepancy Alignment Metric for Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3814-3823\n} \n}" }, { "title": "DC-ShadowNet: Single-Image Hard and Soft Shadow Removal Using Unsupervised Domain-Classifier Guided Network", @@ -8189,6 +8745,7 @@ "status": "Poster", "track": "main", "pid": 3793, + "author_site": "Yeying Jin; Aashish Sharma; Robby T. Tan", "author": "Yeying Jin; Aashish Sharma; Robby T. Tan", "abstract": "Shadow removal from a single image is generally still an open problem. Most existing learning-based methods use supervised learning and require a large number of paired images (shadow and corresponding non-shadow images) for training. A recent unsupervised method, Mask-ShadowGAN, addresses this limitation. However, it requires a binary mask to represent shadow regions, making it inapplicable to soft shadows. To address the problem, in this paper, we propose an unsupervised domain-classifier guided shadow removal network, DC-ShadowNet. Specifically, we propose to integrate a shadow/shadow-free domain classifier into a generator and its discriminator, enabling them to focus on shadow regions. 
To train our network, we introduce novel losses based on physics-based shadow-free chromaticity, shadow-robust perceptual features, and boundary smoothness. Moreover, we show that our network being unsupervised can be used for test-time training that further improves the results. Our experiments show that all these novel components allow our method to handle soft shadows, and also to perform better on hard shadows both quantitatively and qualitatively than the existing state-of-the-art shadow removal methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jin_DC-ShadowNet_Single-Image_Hard_and_Soft_Shadow_Removal_Using_Unsupervised_Domain-Classifier_ICCV_2021_paper.pdf", @@ -8203,7 +8760,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jin_DC-ShadowNet_Single-Image_Hard_and_Soft_Shadow_Removal_Using_Unsupervised_Domain-Classifier_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jin_DC-ShadowNet_Single-Image_Hard_and_Soft_Shadow_Removal_Using_Unsupervised_Domain-Classifier_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Jin_2021_ICCV,\n \n author = {\n Jin,\n Yeying and Sharma,\n Aashish and Tan,\n Robby T.\n},\n title = {\n DC-ShadowNet: Single-Image Hard and Soft Shadow Removal Using Unsupervised Domain-Classifier Guided Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5027-5036\n} \n}" }, { "title": "DCT-SNN: Using DCT To Distribute Spatial Information Over Time for Low-Latency Spiking Neural Networks", @@ -8211,6 +8769,7 @@ "status": "Poster", "track": "main", "pid": 9512, + "author_site": "Isha Garg; Sayeed Shafayet Chowdhury; Kaushik Roy", "author": "Isha Garg; Sayeed Shafayet Chowdhury; Kaushik Roy", "abstract": "Spiking Neural Networks (SNNs) offer a promising alternative to traditional deep learning frameworks, 
since they provide higher computational efficiency due to event-driven information processing. SNNs distribute the analog values of pixel intensities into binary spikes over time. However, the most widely used input coding schemes, such as Poisson based rate-coding, do not leverage the additional temporal learning capability of SNNs effectively. Moreover, these SNNs suffer from high inference latency which is a major bottleneck to their deployment. To overcome this, we propose a time-based encoding scheme that utilizes the Discrete Cosine Transform (DCT) to reduce the number of timesteps required for inference. DCT decomposes an image into a weighted sum of sinusoidal basis images. At each time step, a single frequency base, taken in order and modulated by its corresponding DCT coefficient, is input to an accumulator that generates spikes upon crossing a threshold. We use the proposed scheme to learn DCT-SNN, a low-latency deep SNN with leaky-integrate-and-fire neurons, trained using surrogate gradient descent based backpropagation. We achieve top-1 accuracy of 89.94%, 68.3% and 52.43% on CIFAR-10, CIFAR-100 and TinyImageNet, respectively using VGG architectures. Notably, DCT-SNN performs inference with 2-14X reduced latency compared to other state-of-the-art SNNs, while achieving comparable accuracy to their standard deep learning counterparts. The dimension of the transform allows us to control the number of timesteps required for inference. Additionally, we can trade-off accuracy with latency in a principled manner by dropping the highest frequency components during inference. 
The code is publicly available*.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Garg_DCT-SNN_Using_DCT_To_Distribute_Spatial_Information_Over_Time_for_ICCV_2021_paper.pdf", @@ -8234,7 +8793,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "West Lafayette", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Garg_2021_ICCV,\n \n author = {\n Garg,\n Isha and Chowdhury,\n Sayeed Shafayet and Roy,\n Kaushik\n},\n title = {\n DCT-SNN: Using DCT To Distribute Spatial Information Over Time for Low-Latency Spiking Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4671-4680\n} \n}" }, { "title": "DECA: Deep Viewpoint-Equivariant Human Pose Estimation Using Capsule Autoencoders", @@ -8242,7 +8802,8 @@ "status": "Poster", "track": "main", "pid": 8622, - "author": "Nicola Garau; Niccol\u00f2 Bisagno; Piotr Br\u00f3dka; Nicola Conci", + "author_site": "Nicola Garau; Niccolò Bisagno; Piotr Bródka; Nicola Conci", + "author": "Nicola Garau; Niccolò Bisagno; Piotr Bródka; Nicola Conci", "abstract": "Human Pose Estimation (HPE) aims at retrieving the 3D position of human joints from images or videos. We show that current 3D HPE methods suffer a lack of viewpoint equivariance, namely they tend to fail or perform poorly when dealing with viewpoints unseen at training time. Deep learning methods often rely on either scale-invariant, translation-invariant, or rotation-invariant operations, such as max-pooling. However, the adoption of such procedures does not necessarily improve viewpoint generalization, rather leading to more data-dependent methods. To tackle this issue, we propose a novel capsule autoencoder network with fast Variational Bayes capsule routing, named DECA. 
By modeling each joint as a capsule entity, combined with the routing algorithm, our approach can preserve the joints' hierarchical and geometrical structure in the feature space, independently from the viewpoint. By achieving viewpoint equivariance, we drastically reduce the network data dependency at training time, resulting in an improved ability to generalize for unseen viewpoints. In the experimental validation, we outperform other methods on depth images from both seen and unseen viewpoints, both top-view, and front-view. In the RGB domain, the same network gives state-of-the-art results on the challenging viewpoint transfer task, also establishing a new framework for top-view HPE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Garau_DECA_Deep_Viewpoint-Equivariant_Human_Pose_Estimation_Using_Capsule_Autoencoders_ICCV_2021_paper.pdf", "aff": "University of Trento; University of Trento; University of Trento; University of Trento", @@ -8265,7 +8826,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Garau_2021_ICCV,\n \n author = {\n Garau,\n Nicola and Bisagno,\n Niccol\\`o and Br\\'odka,\n Piotr and Conci,\n Nicola\n},\n title = {\n DECA: Deep Viewpoint-Equivariant Human Pose Estimation Using Capsule Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11677-11686\n} \n}" }, { "title": "DOLG: Single-Stage Image Retrieval With Deep Orthogonal Fusion of Local and Global Features", @@ -8273,6 +8835,7 @@ "status": "Poster", "track": "main", "pid": 7256, + "author_site": "Min Yang; Dongliang He; Miao Fan; Baorong Shi; Xuetong Xue; Fu Li; Errui Ding; Jizhou Huang", "author": "Min Yang; Dongliang He; Miao Fan; Baorong Shi; Xuetong Xue; Fu Li; Errui Ding; Jizhou Huang", "abstract": 
"Image Retrieval is a fundamental task of obtaining images similar to the query one from a database. A common image retrieval practice is to firstly retrieve candidate images via similarity search using global image features and then re-rank the candidates by leveraging their local features. Previous learning-based studies mainly focus on either global or local image representation learning to tackle the retrieval task. In this paper, we abandon the two-stage paradigm and seek to design an effective single-stage solution by integrating local and global information inside images into compact image representations. Specifically, we propose a Deep Orthogonal Local and Global (DOLG) information fusion framework for end-to-end image retrieval. It attentively extracts representative local information with multi-atrous convolutions and self-attention at first. Components orthogonal to the global image representation are then extracted from the local information. At last, the orthogonal components are concatenated with the global representation as a complementary, and then aggregation is performed to generate the final representation. The whole framework is end-to-end differentiable and can be trained with image-level labels. 
Extensive experimental results validate the effectiveness of our solution and show that our model achieves state-of-the-art image retrieval performances on Revisited Oxford and Paris datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_DOLG_Single-Stage_Image_Retrieval_With_Deep_Orthogonal_Fusion_of_Local_ICCV_2021_paper.pdf", @@ -8289,14 +8852,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_DOLG_Single-Stage_Image_Retrieval_With_Deep_Orthogonal_Fusion_of_Local_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0", - "aff_unique_norm": "Baidu", - "aff_unique_dep": "Baidu Inc.", + "aff_unique_norm": "Baidu Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Min and He,\n Dongliang and Fan,\n Miao and Shi,\n Baorong and Xue,\n Xuetong and Li,\n Fu and Ding,\n Errui and Huang,\n Jizhou\n},\n title = {\n DOLG: Single-Stage Image Retrieval With Deep Orthogonal Fusion of Local and Global Features\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11772-11781\n} \n}" }, { "title": "DRAEM - A Discriminatively Trained Reconstruction Embedding for Surface Anomaly Detection", @@ -8304,7 +8868,8 @@ "status": "Poster", "track": "main", "pid": 10474, - "author": "Vitjan Zavrtanik; Matej Kristan; Danijel Sko\u010daj", + "author_site": "Vitjan Zavrtanik; Matej Kristan; Danijel Skočaj", + "author": "Vitjan Zavrtanik; Matej Kristan; Danijel Skočaj", "abstract": "Visual surface anomaly detection aims to detect local image regions that significantly deviate from normal appearance. 
Recent surface anomaly detection methods rely on generative models to accurately reconstruct the normal areas and to fail on anomalies. These methods are trained only on anomaly-free images, and often require hand-crafted post-processing steps to localize the anomalies, which prohibits optimizing the feature extraction for maximal detection capability. In addition to reconstructive approach, we cast surface anomaly detection primarily as a discriminative problem and propose a discriminatively trained reconstruction anomaly embedding model (DRAEM). The proposed method learns a joint representation of an anomalous image and its anomaly-free reconstruction, while simultaneously learning a decision boundary between normal and anomalous examples. The method enables direct anomaly localization without the need for additional complicated post-processing of the network output and can be trained using simple and general anomaly simulations. On the challenging MVTec anomaly detection dataset, DRAEM outperforms the current state-of-the-art unsupervised methods by a large margin and even delivers detection performance close to the fully-supervised methods on the widely used DAGM surface-defect detection dataset, while substantially outperforming them in localization accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zavrtanik_DRAEM_-_A_Discriminatively_Trained_Reconstruction_Embedding_for_Surface_Anomaly_ICCV_2021_paper.pdf", "aff": "University of Ljubljana, Faculty of Computer and Information Science; University of Ljubljana, Faculty of Computer and Information Science; University of Ljubljana, Faculty of Computer and Information Science", @@ -8327,7 +8892,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Slovenia" + "aff_country_unique": "Slovenia", + "bibtex": "@InProceedings{Zavrtanik_2021_ICCV,\n \n author = {\n Zavrtanik,\n Vitjan and Kristan,\n Matej and Sko\\v{c}aj,\n 
Danijel\n},\n title = {\n DRAEM - A Discriminatively Trained Reconstruction Embedding for Surface Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8330-8339\n} \n}" }, { "title": "DRB-GAN: A Dynamic ResBlock Generative Adversarial Network for Artistic Style Transfer", @@ -8335,6 +8901,7 @@ "status": "Poster", "track": "main", "pid": 6374, + "author_site": "Wenju Xu; Chengjiang Long; Ruisheng Wang; Guanghui Wang", "author": "Wenju Xu; Chengjiang Long; Ruisheng Wang; Guanghui Wang", "abstract": "In this work, we propose a Dynamic ResBlock Generative Adversarial Network (DRB-GAN) for artistic style transfer. The style code is modeled as the shared parameters for Dynamic ResBlocks connecting both the style encoding network and the style transfer network. In the style encoding network, a style class-aware attention mechanism is used to attend the style feature represent for generating the style codes. In the style transfer network, multiple Dynamic ResBlocks are designed to integrate the style code and the extracted CNN semantic feature and and then feed into the spatial window Layer-Instance Normalization (SW-LIN) decoder, which enables high-quality synthetic images with artistic style transfer. Moreover, the style collection conditional discriminator is designed to ensure our DRB-GAN model to equip with abilities for both arbitrary style transfer and collection style transfer during the training stage. 
No matter for arbitrary style transfer or collection style transfer, extensive experimental results strongly demonstrate that our proposed DRB-GAN beats state-of-the-art methods and exhibits its superior performance in terms of visual quality and efficiency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_DRB-GAN_A_Dynamic_ResBlock_Generative_Adversarial_Network_for_Artistic_Style_ICCV_2021_paper.pdf", @@ -8351,14 +8918,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_DRB-GAN_A_Dynamic_ResBlock_Generative_Adversarial_Network_for_Artistic_Style_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3", - "aff_unique_norm": "OPPO US Research Center;JD;University of Calgary;Ryerson University", - "aff_unique_dep": ";JD Finance America Corporation;Department of Geomatics Engineering;Department of Computer Science", + "aff_unique_norm": "OPPO US Research Center;JD Finance America Corporation;University of Calgary;Ryerson University", + "aff_unique_dep": ";;Department of Geomatics Engineering;Department of Computer Science", "aff_unique_url": ";;https://www.ucalgary.ca;https://www.ryerson.ca", "aff_unique_abbr": ";;U of C;Ryerson", "aff_campus_unique_index": "0;1;2;3", "aff_campus_unique": "Palo Alto;Mountain View;Calgary;Toronto", "aff_country_unique_index": "0;0;1;1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Wenju and Long,\n Chengjiang and Wang,\n Ruisheng and Wang,\n Guanghui\n},\n title = {\n DRB-GAN: A Dynamic ResBlock Generative Adversarial Network for Artistic Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6383-6392\n} \n}" }, { "title": "DRINet: A Dual-Representation Iterative Learning Network for Point Cloud Segmentation", @@ -8366,6 +8934,7 @@ "status": 
"Poster", "track": "main", "pid": 5691, + "author_site": "Maosheng Ye; Shuangjie Xu; Tongyi Cao; Qifeng Chen", "author": "Maosheng Ye; Shuangjie Xu; Tongyi Cao; Qifeng Chen", "abstract": "We present a novel and flexible architecture for point cloud segmentation with dual-representation iterative learning. In point cloud processing, different representations have their own pros and cons. Thus, finding suitable ways to represent point cloud data structure while keeping its own internal physical property such as permutation and scale-invariant is a fundamental problem. Therefore, we propose our work, DRINet, which serves as the basic network structure for dual-representation learning with great flexibility at feature transferring and less computation cost, especially for large-scale point clouds. DRINet mainly consists of two modules called Sparse Point-Voxel Feature Extraction and Sparse Voxel-Point Feature Extraction. By utilizing these two modules iteratively, features can be propagated between two different representations. We further propose a novel multi-scale pooling layer for pointwise locality learning to improve context information propagation. Our network achieves state-of-the-art results for point cloud classification and segmentation tasks on several datasets while maintaining high runtime efficiency. 
For large-scale outdoor scenarios, our method outperforms state-of-the-art methods with a real-time inference speed of 62ms per frame.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_DRINet_A_Dual-Representation_Iterative_Learning_Network_for_Point_Cloud_Segmentation_ICCV_2021_paper.pdf", @@ -8389,7 +8958,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Maosheng and Xu,\n Shuangjie and Cao,\n Tongyi and Chen,\n Qifeng\n},\n title = {\n DRINet: A Dual-Representation Iterative Learning Network for Point Cloud Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7447-7456\n} \n}" }, { "title": "DRIVE: Deep Reinforced Accident Anticipation With Visual Explanation", @@ -8397,6 +8967,7 @@ "status": "Poster", "track": "main", "pid": 3110, + "author_site": "Wentao Bao; Qi Yu; Yu Kong", "author": "Wentao Bao; Qi Yu; Yu Kong", "abstract": "Traffic accident anticipation aims to accurately and promptly predict the occurrence of a future accident from dashcam videos, which is vital for a safety-guaranteed self-driving system. To encourage an early and accurate decision, existing approaches typically focus on capturing the cues of spatial and temporal context before a future accident occurs. However, their decision-making lacks visual explanation and ignores the dynamic interaction with the environment. In this paper, we propose Deep ReInforced accident anticipation with Visual Explanation, named DRIVE. The method simulates both the bottom-up and top-down visual attention mechanism in a dashcam observation environment so that the decision from the proposed stochastic multi-task agent can be visually explained by attentive regions. 
Moreover, the proposed dense anticipation reward and sparse fixation reward are effective in training the DRIVE model with our improved reinforcement learning algorithm. Experimental results show that the DRIVE model achieves state-of-the-art performance on multiple real-world traffic accident datasets. Code and pre-trained models are available at https://www.rit.edu/actionlab/drive.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bao_DRIVE_Deep_Reinforced_Accident_Anticipation_With_Visual_Explanation_ICCV_2021_paper.pdf", @@ -8420,7 +8991,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Rochester", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bao_2021_ICCV,\n \n author = {\n Bao,\n Wentao and Yu,\n Qi and Kong,\n Yu\n},\n title = {\n DRIVE: Deep Reinforced Accident Anticipation With Visual Explanation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7619-7628\n} \n}" }, { "title": "DTMNet: A Discrete Tchebichef Moments-Based Deep Neural Network for Multi-Focus Image Fusion", @@ -8428,6 +9000,7 @@ "status": "Poster", "track": "main", "pid": 11079, + "author_site": "Bin Xiao; Haifeng Wu; Xiuli Bi", "author": "Bin Xiao; Haifeng Wu; Xiuli Bi", "abstract": "Compared with traditional methods, the deep learning-based multi-focus image fusion methods can effectively improve the performance of image fusion tasks. However, the existing deep learning-based methods encounter a common issue of a large number of parameters, which leads to the deep learning models with high time complexity and low fusion efficiency. To address this issue, we propose a novel discrete Tchebichef moment-based Deep neural network, termed as DTMNet, for multi-focus image fusion. 
The proposed DTMNet is an end-to-end deep neural network with only one convolutional layer and three fully connected layers. The convolutional layer is fixed with DTM coefficients (DTMConv) to extract high/low-frequency information without learning parameters effectively. The three fully connected layers have learnable parameters for feature classification. Therefore, the proposed DTMNet for multi-focus image fusion has a small number of parameters (0.01M paras vs. 4.93M paras of regular CNN) and high computational efficiency (0.32s vs. 79.09s by regular CNN to fuse an image). In addition, a large-scale multi-focus image dataset is synthesized for training and verifying the deep learning model. Experimental results on three public datasets demonstrate that the proposed method is competitive with or even outperforms the state-of-the-art multi-focus image fusion methods in terms of subjective visual perception and objective evaluation metrics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiao_DTMNet_A_Discrete_Tchebichef_Moments-Based_Deep_Neural_Network_for_Multi-Focus_ICCV_2021_paper.pdf", @@ -8451,7 +9024,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiao_2021_ICCV,\n \n author = {\n Xiao,\n Bin and Wu,\n Haifeng and Bi,\n Xiuli\n},\n title = {\n DTMNet: A Discrete Tchebichef Moments-Based Deep Neural Network for Multi-Focus Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 43-51\n} \n}" }, { "title": "DWKS: A Local Descriptor of Deformations Between Meshes and Point Clouds", @@ -8459,10 +9033,11 @@ "status": "Poster", "track": "main", "pid": 10814, + "author_site": "Robin Magnet; Maks Ovsjanikov", "author": "Robin Magnet; Maks Ovsjanikov", "abstract": "We propose a novel 
pointwise descriptor, called DWKS, aimed at finding correspondences across two deformable shape collections. Unlike the majority of existing descriptors, rather than capturing local geometry, DWKS captures the deformation around a point within a collection in a multi-scale and informative manner. This, in turn, allows to compute inter-collection correspondences without using landmarks. To this end, we build upon the successful spectral WKS descriptors, but rather than using the Laplace-Beltrami operator, show that a similar construction can be performed on shape difference operators, that capture differences or distortion within a collection. By leveraging the collection information our descriptor facilitates difficult non-rigid shape matching tasks, even in the presence of strong partiality and significant deformations. We demonstrate the utility of our approach across a range of challenging matching problems on both meshes and point clouds. The code for this paper can be found at https://github.com/RobinMagnet/DWKS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Magnet_DWKS_A_Local_Descriptor_of_Deformations_Between_Meshes_and_Point_ICCV_2021_paper.pdf", - "aff": "LIX, \u00b4Ecole Polytechnique; LIX, \u00b4Ecole Polytechnique", + "aff": "LIX, École Polytechnique; LIX, École Polytechnique", "project": "", "github": "https://github.com/RobinMagnet/DWKS", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Magnet_DWKS_A_Local_ICCV_2021_supplemental.pdf", @@ -8482,7 +9057,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Magnet_2021_ICCV,\n \n author = {\n Magnet,\n Robin and Ovsjanikov,\n Maks\n},\n title = {\n DWKS: A Local Descriptor of Deformations Between Meshes and Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3793-3802\n} \n}" }, { "title": "Dance With Self-Attention: A New Look of Conditional Random Fields on Anomaly Detection in Videos", @@ -8490,6 +9066,7 @@ "status": "Poster", "track": "main", "pid": 2433, + "author_site": "Didik Purwanto; Yie-Tarng Chen; Wen-Hsien Fang", "author": "Didik Purwanto; Yie-Tarng Chen; Wen-Hsien Fang", "abstract": "This paper proposes a novel weakly supervised approach for anomaly detection, which begins with a relation-aware feature extractor to capture the multi-scale convolutional neural network (CNN) features from a video. Afterwards, self-attention is integrated with conditional random fields (CRFs), the core of the network, to make use of the ability of self-attention in capturing the short-range correlations of the features and the ability of CRFs in learning the inter-dependencies of these features. Such a framework can learn not only the spatio-temporal interactions among the actors which are important for detecting complex movements, but also their short- and long-term dependencies across frames. Also, to deal with both local and non-local relationships of the features, a new variant of self-attention is developed by taking into consideration a set of cliques with different temporal localities. Moreover, a contrastive multi-instance learning scheme is considered to broaden the gap between the normal and abnormal instances, resulting in more accurate abnormal discrimination. 
Simulations reveal that the new method provides superior performance to the state-of-the-art works on the widespread UCF-Crime and ShanghaiTech datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Purwanto_Dance_With_Self-Attention_A_New_Look_of_Conditional_Random_Fields_ICCV_2021_paper.pdf", @@ -8504,7 +9081,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Purwanto_Dance_With_Self-Attention_A_New_Look_of_Conditional_Random_Fields_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Purwanto_Dance_With_Self-Attention_A_New_Look_of_Conditional_Random_Fields_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Purwanto_2021_ICCV,\n \n author = {\n Purwanto,\n Didik and Chen,\n Yie-Tarng and Fang,\n Wen-Hsien\n},\n title = {\n Dance With Self-Attention: A New Look of Conditional Random Fields on Anomaly Detection in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 173-183\n} \n}" }, { "title": "Data-Free Universal Adversarial Perturbation and Black-Box Attack", @@ -8512,6 +9090,7 @@ "status": "Poster", "track": "main", "pid": 10960, + "author_site": "Chaoning Zhang; Philipp Benz; Adil Karjauv; In So Kweon", "author": "Chaoning Zhang; Philipp Benz; Adil Karjauv; In So Kweon", "abstract": "Universal adversarial perturbation (UAP), i.e. a single perturbation to fool the network for most images, is widely recognized as a more practical attack because the UAP can be generated beforehand and applied directly during the attack stage. One intriguing phenomenon regarding untargeted UAP is that most images are misclassified to a dominant label. This phenomenon has been reported in previous works while lacking a justified explanation, for which our work attempts to provide an alternative explanation. 
For a more practical universal attack, our investigation of untargeted UAP focuses on alleviating the dependence on the original training samples, from removing the need for sample labels to limiting the sample size. Towards strictly data-free untargeted UAP, our work proposes to exploit artificial Jigsaw images as the training samples, demonstrating competitive performance. We further investigate the possibility of exploiting the UAP for a data-free black-box attack which is arguably the most practical yet challenging threat model. We demonstrate that there exists optimization-free repetitive patterns which can successfully attack deep models. Code is available at https://bit.ly/3y0ZTIC.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Data-Free_Universal_Adversarial_Perturbation_and_Black-Box_Attack_ICCV_2021_paper.pdf", @@ -8535,7 +9114,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Chaoning and Benz,\n Philipp and Karjauv,\n Adil and Kweon,\n In So\n},\n title = {\n Data-Free Universal Adversarial Perturbation and Black-Box Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7868-7877\n} \n}" }, { "title": "De-Rendering Stylized Texts", @@ -8543,6 +9123,7 @@ "status": "Poster", "track": "main", "pid": 8578, + "author_site": "Wataru Shimoda; Daichi Haraguchi; Seiichi Uchida; Kota Yamaguchi", "author": "Wataru Shimoda; Daichi Haraguchi; Seiichi Uchida; Kota Yamaguchi", "abstract": "Editing raster text is a promising but challenging task. We propose to apply text vectorization for the task of raster text editing in display media, such as posters, web pages, or advertisements. 
In our approach, instead of applying image transformation or generation in the raster domain, we learn a text vectorization model to parse all the rendering parameters including text, location, size, font, style, effects, and hidden background, then utilize those parameters for reconstruction and any editing task. Our text vectorization takes advantage of differentiable text rendering to accurately reproduce the input raster text in a resolution-free parametric format. We show in the experiments that our approach can successfully parse text, styling, and background information in the unified model, and produces artifact-free text editing compared to a raster baseline.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shimoda_De-Rendering_Stylized_Texts_ICCV_2021_paper.pdf", @@ -8557,7 +9138,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shimoda_De-Rendering_Stylized_Texts_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shimoda_De-Rendering_Stylized_Texts_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shimoda_2021_ICCV,\n \n author = {\n Shimoda,\n Wataru and Haraguchi,\n Daichi and Uchida,\n Seiichi and Yamaguchi,\n Kota\n},\n title = {\n De-Rendering Stylized Texts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1076-1085\n} \n}" }, { "title": "DeFRCN: Decoupled Faster R-CNN for Few-Shot Object Detection", @@ -8565,6 +9147,7 @@ "status": "Poster", "track": "main", "pid": 6196, + "author_site": "Limeng Qiao; Yuxuan Zhao; Zhiyuan Li; Xi Qiu; Jianan Wu; Chi Zhang", "author": "Limeng Qiao; Yuxuan Zhao; Zhiyuan Li; Xi Qiu; Jianan Wu; Chi Zhang", "abstract": "Few-shot object detection, which aims at detecting novel objects rapidly from extremely few annotated examples of previously unseen classes, has attracted significant 
research interest in the community. Most existing approaches employ the Faster R-CNN as basic detection framework, yet, due to the lack of tailored considerations for data-scarce scenario, their performance is often not satisfactory. In this paper, we look closely into the conventional Faster R-CNN and analyze its contradictions from two orthogonal perspectives, namely multi-stage (RPN vs. RCNN) and multi-task (classification vs. localization). To resolve these issues, we propose a simple yet effective architecture, named Decoupled Faster R-CNN (DeFRCN). To be concrete, we extend Faster R-CNN by introducing Gradient Decoupled Layer for multi-stage decoupling and Prototypical Calibration Block for multi-task decoupling. The former is a novel deep layer with redefining the feature-forward operation and gradient-backward operation for decoupling its subsequent layer and preceding layer, and the latter is an offline prototype-based classification model with taking the proposals from detector as input and boosting the original classification scores with additional pairwise scores for calibration. 
Extensive experiments on multiple benchmarks show our framework is remarkably superior to other existing approaches and establishes a new state-of-the-art in few-shot literature.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qiao_DeFRCN_Decoupled_Faster_R-CNN_for_Few-Shot_Object_Detection_ICCV_2021_paper.pdf", @@ -8588,7 +9171,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qiao_2021_ICCV,\n \n author = {\n Qiao,\n Limeng and Zhao,\n Yuxuan and Li,\n Zhiyuan and Qiu,\n Xi and Wu,\n Jianan and Zhang,\n Chi\n},\n title = {\n DeFRCN: Decoupled Faster R-CNN for Few-Shot Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8681-8690\n} \n}" }, { "title": "DecentLaM: Decentralized Momentum SGD for Large-Batch Deep Training", @@ -8596,6 +9180,7 @@ "status": "Poster", "track": "main", "pid": 2327, + "author_site": "Kun Yuan; Yiming Chen; Xinmeng Huang; Yingya Zhang; Pan Pan; Yinghui Xu; Wotao Yin", "author": "Kun Yuan; Yiming Chen; Xinmeng Huang; Yingya Zhang; Pan Pan; Yinghui Xu; Wotao Yin", "abstract": "The scale of deep learning nowadays calls for efficient distributed training algorithms. Decentralized momentum SGD (DmSGD), in which each node averages only with its neighbors, is more communication efficient than vanilla Parallel momentum SGD that incurs global average across all computing nodes. On the other hand, the large-batch training has been demonstrated critical to achieve runtime speedup. This motivates us to investigate how DmSGD performs in the large-batch scenario. In this work, we find the momentum term can amplify the inconsistency bias in DmSGD. Such bias becomes more evident as batch-size grows large and hence results in severe performance degradation. 
We next propose DecentLaM, a novel decentralized large-batch momentum SGD to remove the momentum-incurred bias. The convergence rate for both strongly convex and non-convex scenarios is established. Our theoretical results justify the superiority of DecentLaM to DmSGD especially in the large-batch scenario. Experimental results on a a variety of computer vision tasks and models show that DecentLaM promises both efficient and high-quality training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_DecentLaM_Decentralized_Momentum_SGD_for_Large-Batch_Deep_Training_ICCV_2021_paper.pdf", @@ -8619,7 +9204,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Kun and Chen,\n Yiming and Huang,\n Xinmeng and Zhang,\n Yingya and Pan,\n Pan and Xu,\n Yinghui and Yin,\n Wotao\n},\n title = {\n DecentLaM: Decentralized Momentum SGD for Large-Batch Deep Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3029-3039\n} \n}" }, { "title": "DeePSD: Automatic Deep Skinning and Pose Space Deformation for 3D Garment Animation", @@ -8627,6 +9213,7 @@ "status": "Poster", "track": "main", "pid": 7975, + "author_site": "Hugo Bertiche; Meysam Madadi; Emilio Tylson; Sergio Escalera", "author": "Hugo Bertiche; Meysam Madadi; Emilio Tylson; Sergio Escalera", "abstract": "We present a novel solution to the garment animation problem through deep learning. Our contribution allows animating any template outfit with arbitrary topology and geometric complexity. Recent works develop models for garment edition, resizing and animation at the same time by leveraging the support body model (encoding garments as body homotopies). 
This leads to complex engineering solutions that suffer from scalability, applicability and compatibility. By limiting our scope to garment animation only, we are able to propose a simple model that can animate any outfit, independently of its topology, vertex order or connectivity. Our proposed architecture maps outfits to animated 3D models into the standard format for 3D animation (blend weights and blend shapes matrices), automatically providing of compatibility with any graphics engine. We also propose a methodology to complement supervised learning with an unsupervised physically based learning that implicitly solves collisions and enhances cloth quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bertiche_DeePSD_Automatic_Deep_Skinning_and_Pose_Space_Deformation_for_3D_ICCV_2021_paper.pdf", @@ -8643,14 +9230,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bertiche_DeePSD_Automatic_Deep_Skinning_and_Pose_Space_Deformation_for_3D_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0;0+1", - "aff_unique_norm": "University of Barcelona;Computer Vision Center", + "aff_unique_norm": "Universitat de Barcelona;Computer Vision Center", "aff_unique_dep": ";", "aff_unique_url": "https://www.ub.edu;https://www.cvc.uab.cat/", "aff_unique_abbr": "UB;CVC", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Bertiche_2021_ICCV,\n \n author = {\n Bertiche,\n Hugo and Madadi,\n Meysam and Tylson,\n Emilio and Escalera,\n Sergio\n},\n title = {\n DeePSD: Automatic Deep Skinning and Pose Space Deformation for 3D Garment Animation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5471-5480\n} \n}" }, { "title": "Deep 3D Mask Volume for View Synthesis of Dynamic 
Scenes", @@ -8658,6 +9246,7 @@ "status": "Poster", "track": "main", "pid": 9328, + "author_site": "Kai-En Lin; Lei Xiao; Feng Liu; Guowei Yang; Ravi Ramamoorthi", "author": "Kai-En Lin; Lei Xiao; Feng Liu; Guowei Yang; Ravi Ramamoorthi", "abstract": "Image view synthesis has seen great success in reconstructing photorealistic visuals, thanks to deep learning and various novel representations. The next key step in immersive virtual experiences is view synthesis of dynamic scenes. However, several challenges exist due to the lack of high-quality training datasets, and the additional time dimension for videos of dynamic scenes. To address this issue, we introduce a multi-view video dataset, captured with a custom 10-camera rig in 120FPS. The dataset contains 96 high-quality scenes showing various visual effects and human interactions in outdoor scenes. We develop a new algorithm, Deep 3D Mask Volume, which enables temporally-stable view extrapolation from binocular videos of dynamic scenes, captured by static cameras. Our algorithm addresses the temporal inconsistency of disocclusions by identifying the error-prone areas with a 3D mask volume, and replaces them with static background observed throughout the video. Our method enables manipulation in 3D space as opposed to simple 2D masks. We demonstrate better temporal stability than frame-by-frame static view synthesis methods, or those that use 2D masks. 
The resulting view synthesis videos show minimal flickering artifacts and allow for larger translational movements.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Deep_3D_Mask_Volume_for_View_Synthesis_of_Dynamic_Scenes_ICCV_2021_paper.pdf", @@ -8674,14 +9263,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_Deep_3D_Mask_Volume_for_View_Synthesis_of_Dynamic_Scenes_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "University of California, San Diego;Meta", + "aff_unique_norm": "University of California, San Diego;Facebook Reality Labs", "aff_unique_dep": ";Facebook Reality Labs", "aff_unique_url": "https://www.ucsd.edu;https://www.facebook.com/realitylabs", "aff_unique_abbr": "UCSD;FRL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Kai-En and Xiao,\n Lei and Liu,\n Feng and Yang,\n Guowei and Ramamoorthi,\n Ravi\n},\n title = {\n Deep 3D Mask Volume for View Synthesis of Dynamic Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1749-1758\n} \n}" }, { "title": "Deep Blind Video Super-Resolution", @@ -8689,6 +9279,7 @@ "status": "Poster", "track": "main", "pid": 7151, + "author_site": "Jinshan Pan; Haoran Bai; Jiangxin Dong; Jiawei Zhang; Jinhui Tang", "author": "Jinshan Pan; Haoran Bai; Jiangxin Dong; Jiawei Zhang; Jinhui Tang", "abstract": "Existing video super-resolution (SR) algorithms usually assume that the blur kernels in the degradation process are known and do not model the blur kernels in the restoration. However, this assumption does not hold for blind video SR and usually leads to over-smoothed super-resolved frames. 
In this paper, we propose an effective blind video SR algorithm based on deep convolutional neural networks (CNNs). Our algorithm first estimates blur kernels from low-resolution (LR) input videos. Then, with the estimated blur kernels, we develop an effective image deconvolution method based on the image formation model of blind video SR to generate intermediate latent frames so that sharp image contents can be restored well. To effectively explore the information from adjacent frames, we estimate the motion fields from LR input videos, extract features from LR videos by a feature extraction network, and warp the extracted features from LR inputs based on the motion fields. Moreover, we develop an effective sharp feature exploration method which first extracts sharp features from restored intermediate latent frames and then uses a transformation operation based on the extracted sharp features and warped features from LR inputs to generate better features for HR video restoration. We formulate the proposed algorithm into an end-to-end trainable framework and show that it performs favorably against state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pan_Deep_Blind_Video_Super-Resolution_ICCV_2021_paper.pdf", @@ -8712,7 +9303,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pan_2021_ICCV,\n \n author = {\n Pan,\n Jinshan and Bai,\n Haoran and Dong,\n Jiangxin and Zhang,\n Jiawei and Tang,\n Jinhui\n},\n title = {\n Deep Blind Video Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4811-4820\n} \n}" }, { "title": "Deep Co-Training With Task Decomposition for Semi-Supervised Domain Adaptation", @@ -8720,6 +9312,7 @@ "status": "Poster", "track": "main", 
"pid": 10085, + "author_site": "Luyu Yang; Yan Wang; Mingfei Gao; Abhinav Shrivastava; Kilian Q. Weinberger; Wei-Lun Chao; Ser-Nam Lim", "author": "Luyu Yang; Yan Wang; Mingfei Gao; Abhinav Shrivastava; Kilian Q. Weinberger; Wei-Lun Chao; Ser-Nam Lim", "abstract": "Semi-supervised domain adaptation (SSDA) aims to adapt models trained from a labeled source domain to a different but related target domain, from which unlabeled data and a small set of labeled data are provided. Current methods that treat source and target supervision without distinction overlook their inherent discrepancy, resulting in a source-dominated model that has not effectively use the target supervision. In this paper, we argue that the labeled target data needs to be distinguished for effective SSDA, and propose to explicitly decompose the SSDA task into two sub-tasks: a semi-supervised learning (SSL) task in the target domain and an unsupervised domain adaptation (UDA) task across domains. By doing so, the two sub-tasks can better leverage the corresponding supervision and thus yield very different classifiers. To integrate the strengths of the two classifiers, we apply the well established co-training framework, in which the two classifiers exchange their high confident predictions to iteratively \"teach each other\" so that both classifiers can excel in the target domain. We call our approach Deep Co-training with Task decomposition (DeCoTa). DeCoTa requires no adversarial training and is easy to implement. Moreover, DeCoTa is well founded on the theoretical condition of when co-training would succeed. As a result, DeCoTa achieves state-of-the-art results on several SSDA datasets, outperforming the prior art by a notable 4% margin on DomainNet. 
Code is available at https://github.com/LoyoYang/DeCoTa.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Deep_Co-Training_With_Task_Decomposition_for_Semi-Supervised_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -8734,7 +9327,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Deep_Co-Training_With_Task_Decomposition_for_Semi-Supervised_Domain_Adaptation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Deep_Co-Training_With_Task_Decomposition_for_Semi-Supervised_Domain_Adaptation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Luyu and Wang,\n Yan and Gao,\n Mingfei and Shrivastava,\n Abhinav and Weinberger,\n Kilian Q. and Chao,\n Wei-Lun and Lim,\n Ser-Nam\n},\n title = {\n Deep Co-Training With Task Decomposition for Semi-Supervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8906-8916\n} \n}" }, { "title": "Deep Edge-Aware Interactive Colorization Against Color-Bleeding Effects", @@ -8742,6 +9336,7 @@ "status": "Poster", "track": "main", "pid": 6958, + "author_site": "Eungyeup Kim; Sanghyeon Lee; Jeonghoon Park; Somi Choi; Choonghyun Seo; Jaegul Choo", "author": "Eungyeup Kim; Sanghyeon Lee; Jeonghoon Park; Somi Choi; Choonghyun Seo; Jaegul Choo", "abstract": "Deep neural networks for automatic image colorization often suffer from the color-bleeding artifact, a problematic color spreading near the boundaries between adjacent objects. Such color-bleeding artifacts debase the reality of generated outputs, limiting the applicability of colorization models in practice. 
Although previous approaches have attempted to address this problem in an automatic manner, they tend to work only in limited cases where a high contrast of gray-scale values are given in an input image. Alternatively, leveraging user interactions would be a promising approach for solving this color-breeding artifacts. In this paper, we propose a novel edge-enhancing network for the regions of interest via simple user scribbles indicating where to enhance. In addition, our method requires a minimal amount of effort from users for their satisfactory enhancement. Experimental results demonstrate that our interactive edge-enhancing approach effectively improves the color-bleeding artifacts compared to the existing baselines across various datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Deep_Edge-Aware_Interactive_Colorization_Against_Color-Bleeding_Effects_ICCV_2021_paper.pdf", @@ -8765,7 +9360,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Eungyeup and Lee,\n Sanghyeon and Park,\n Jeonghoon and Choi,\n Somi and Seo,\n Choonghyun and Choo,\n Jaegul\n},\n title = {\n Deep Edge-Aware Interactive Colorization Against Color-Bleeding Effects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14667-14676\n} \n}" }, { "title": "Deep Halftoning With Reversible Binary Pattern", @@ -8773,6 +9369,7 @@ "status": "Poster", "track": "main", "pid": 1528, + "author_site": "Menghan Xia; Wenbo Hu; Xueting Liu; Tien-Tsin Wong", "author": "Menghan Xia; Wenbo Hu; Xueting Liu; Tien-Tsin Wong", "abstract": "Existing halftoning algorithms usually drop colors and fine details when dithering color images with binary dot patterns, which makes it 
extremely difficult to recover the original information. To dispense the recovery trouble in future, we propose a novel halftoning technique that converts a color image into binary halftone with full restorability to the original version. The key idea is to implicitly embed those previously dropped information into the halftone patterns. So, the halftone pattern not only serves to reproduce the image tone, maintain the blue-noise randomness, but also represents the color information and fine details. To this end, we exploit two collaborative convolutional neural networks (CNNs) to learn the dithering scheme, under a non-trivial self-supervision formulation. To tackle the flatness degradation issue of CNNs, we propose a novel noise incentive block (NIB) that can serve as a generic CNN plug-in for performance promotion. At last, we tailor a guiding-aware training scheme that secures the convergence direction as regulated. We evaluate the invertible halftones in multiple aspects, which evidences the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xia_Deep_Halftoning_With_Reversible_Binary_Pattern_ICCV_2021_paper.pdf", @@ -8789,14 +9386,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xia_Deep_Halftoning_With_Reversible_Binary_Pattern_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;2;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Chinese Academy of Sciences;Caritas Institute of Higher Education", + "aff_unique_norm": "The Chinese University of Hong Kong;Chinese Academy of Sciences;Caritas Institute of Higher Education", "aff_unique_dep": ";Guangdong-Hong Kong-Macao Joint Laboratory of Human-Machine Intelligence-Synergy Systems;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.cas.cn;https://www.caritas.edu.hk", "aff_unique_abbr": "CUHK;CAS;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0+0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xia_2021_ICCV,\n \n author = {\n Xia,\n Menghan and Hu,\n Wenbo and Liu,\n Xueting and Wong,\n Tien-Tsin\n},\n title = {\n Deep Halftoning With Reversible Binary Pattern\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14000-14009\n} \n}" }, { "title": "Deep Hough Voting for Robust Global Registration", @@ -8804,6 +9402,7 @@ "status": "Poster", "track": "main", "pid": 4141, + "author_site": "Junha Lee; Seungwook Kim; Minsu Cho; Jaesik Park", "author": "Junha Lee; Seungwook Kim; Minsu Cho; Jaesik Park", "abstract": "Point cloud registration is the task of estimating the rigid transformation that aligns a pair of point cloud fragments. We present an efficient and robust framework for pairwise registration of real-world 3D scans, leveraging Hough voting in the 6D transformation parameter space. First, deep geometric features are extracted from a point cloud pair to compute putative correspondences. We then construct a set of triplets of correspondences to cast votes on the 6D Hough space, which represents the transformation parameters in the form of sparse tensors. Next, a fully convolutional refinement module is applied to refine the noisy votes. Finally, we identify the consensus among the correspondences from the Hough space, which we use to predict our final transformation parameters. Our method outperforms state-of-the-art methods on the 3DMatch and 3DLoMatch benchmarks while achieving comparable performance on the KITTI odometry dataset. 
We further demonstrate the generalizability of our approach by setting a new state-of-the-art on the ICL-NUIM dataset, where we integrate our module into a multi-way registration pipeline.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Deep_Hough_Voting_for_Robust_Global_Registration_ICCV_2021_paper.pdf", @@ -8818,7 +9417,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_Deep_Hough_Voting_for_Robust_Global_Registration_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_Deep_Hough_Voting_for_Robust_Global_Registration_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Junha and Kim,\n Seungwook and Cho,\n Minsu and Park,\n Jaesik\n},\n title = {\n Deep Hough Voting for Robust Global Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15994-16003\n} \n}" }, { "title": "Deep Hybrid Self-Prior for Full 3D Mesh Generation", @@ -8826,6 +9426,7 @@ "status": "Poster", "track": "main", "pid": 3738, + "author_site": "Xingkui Wei; Zhengqing Chen; Yanwei Fu; Zhaopeng Cui; Yinda Zhang", "author": "Xingkui Wei; Zhengqing Chen; Yanwei Fu; Zhaopeng Cui; Yinda Zhang", "abstract": "We present a deep learning pipeline that leverages network self-prior to recover a full 3D model consisting of both a triangular mesh and a texture map from the colored 3D point cloud. Different from previous methods either exploiting 2D self-prior for image editing or 3D self-prior for pure surface reconstruction, we propose to exploit a novel hybrid 2D-3D self-prior in deep neural networks to significantly improve the geometry quality and produce a high-resolution texture map, which is typically missing from the output of commodity-level 3D scanners. 
In particular, we first generate an initial mesh using a 3D convolutional neural network with 3D self-prior, and then encode both 3D information and color information in the 2D UV atlas, which is further refined by 2D convolutional neural networks with the self-prior. In this way, both 2D and 3D self-priors are utilized for the mesh and texture recovery. Experiments show that, without the need of any additional training data, our method recovers the 3D textured mesh model of high quality from sparse input, and outperforms the state-of-the-art methods in terms of both the geometry and texture quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wei_Deep_Hybrid_Self-Prior_for_Full_3D_Mesh_Generation_ICCV_2021_paper.pdf", @@ -8843,13 +9444,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wei_Deep_Hybrid_Self-Prior_for_Full_3D_Mesh_Generation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;2;1", "aff_unique_norm": "Fudan University;Google;Zhejiang University", - "aff_unique_dep": ";Google;", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.fudan.edu.cn;https://www.google.com;https://www.zju.edu.cn", "aff_unique_abbr": "Fudan;Google;ZJU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0+1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wei_2021_ICCV,\n \n author = {\n Wei,\n Xingkui and Chen,\n Zhengqing and Fu,\n Yanwei and Cui,\n Zhaopeng and Zhang,\n Yinda\n},\n title = {\n Deep Hybrid Self-Prior for Full 3D Mesh Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5805-5814\n} \n}" }, { "title": "Deep Implicit Surface Point Prediction Networks", @@ -8857,7 +9459,8 @@ "status": "Poster", "track": "main", "pid": 9228, - "author": "Rahul Venkatesh; Tejan Karmali; 
Sarthak Sharma; Aurobrata Ghosh; R. Venkatesh Babu; L\u00e1szl\u00f3 A. Jeni; Maneesh Singh", + "author_site": "Rahul Venkatesh; Tejan Karmali; Sarthak Sharma; Aurobrata Ghosh; R. Venkatesh Babu; László A. Jeni; Maneesh Singh", + "author": "Rahul Venkatesh; Tejan Karmali; Sarthak Sharma; Aurobrata Ghosh; R. Venkatesh Babu; László A. Jeni; Maneesh Singh", "abstract": "Deep neural representations of 3D shapes as implicit functions have been shown to produce high fidelity models surpassing the resolution-memory trade-off faced by the explicit representations using meshes and point clouds. However, most such approaches focus on representing closed shapes. Unsigned distance function (UDF) based approaches have been proposed recently as a promising alternative to represent both open and closed shapes. However, since the gradients of UDFs vanish on the surface, it is challenging to estimate local (differential) geometric properties like the normals and tangent planes which are needed for many downstream applications in vision and graphics. There are additional challenges in computing these properties efficiently with a low-memory footprint. This paper presents a novel approach that models such surfaces using a new class of implicit representations called the closest surface-point CSP representation. We show that CSP allows us to represent complex surfaces of any topology (open or closed) with high fidelity. It also allows for accurate and efficient computation of local geometric properties. We further demonstrate that it leads to efficient implementation of downstream algorithms like sphere-tracing for rendering the 3D surface as well as to create explicit mesh-based representations. Extensive experimental evaluation on the ShapeNet dataset validate the above contributions with results surpassing the state-of-the-art. 
Code and data are available at https://sites.google.com/view/cspnet", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Venkatesh_Deep_Implicit_Surface_Point_Prediction_Networks_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -8871,7 +9474,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Venkatesh_Deep_Implicit_Surface_Point_Prediction_Networks_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Venkatesh_Deep_Implicit_Surface_Point_Prediction_Networks_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Venkatesh_2021_ICCV,\n \n author = {\n Venkatesh,\n Rahul and Karmali,\n Tejan and Sharma,\n Sarthak and Ghosh,\n Aurobrata and Babu,\n R. Venkatesh and Jeni,\n L\\'aszl\\'o A. and Singh,\n Maneesh\n},\n title = {\n Deep Implicit Surface Point Prediction Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12653-12662\n} \n}" }, { "title": "Deep Matching Prior: Test-Time Optimization for Dense Correspondence", @@ -8879,6 +9483,7 @@ "status": "Poster", "track": "main", "pid": 7528, + "author_site": "Sunghwan Hong; Seungryong Kim", "author": "Sunghwan Hong; Seungryong Kim", "abstract": "Conventional techniques to establish dense correspondences across visually or semantically similar images focused on designing a task-specific matching prior, which is difficult to model in general. To overcome this, recent learning-based methods have attempted to learn a good matching prior within a model itself on large training data. The performance improvement was apparent, but the need for sufficient training data and intensive learning hinders their applicability. 
Moreover, using the fixed model at test time does not account for the fact that a pair of images may require their own prior, thus providing limited performance and poor generalization to unseen images. In this paper, we show that an image pair-specific prior can be captured by solely optimizing the untrained matching networks on an input pair of images. Tailored for such test-time optimization for dense correspondence, we present a residual matching network and a confidence-aware contrastive loss to guarantee a meaningful convergence. Experiments demonstrate that our framework, dubbed Deep Matching Prior (DMP), is competitive, or even outperforms, against the latest learning-based methods on several benchmarks for geometric matching and semantic matching, even though it requires neither large training data nor intensive learning. With the networks pre-trained, DMP attains state-of-the-art performance on all benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hong_Deep_Matching_Prior_Test-Time_Optimization_for_Dense_Correspondence_ICCV_2021_paper.pdf", @@ -8902,7 +9507,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Hong_2021_ICCV,\n \n author = {\n Hong,\n Sunghwan and Kim,\n Seungryong\n},\n title = {\n Deep Matching Prior: Test-Time Optimization for Dense Correspondence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9907-9917\n} \n}" }, { "title": "Deep Metric Learning for Open World Semantic Segmentation", @@ -8910,6 +9516,7 @@ "status": "Poster", "track": "main", "pid": 3871, + "author_site": "Jun Cen; Peng Yun; Junhao Cai; Michael Yu Wang; Ming Liu", "author": "Jun Cen; Peng Yun; Junhao Cai; Michael Yu Wang; Ming Liu", "abstract": "Classical close-set semantic 
segmentation networks have limited ability to detect out-of-distribution (OOD) objects, which is important for safety-critical applications such as autonomous driving. Incrementally learning these OOD objects with few annotations is an ideal way to enlarge the knowledge base of the deep learning models. In this paper, we propose an open world semantic segmentation system that includes two modules: (1) an open-set semantic segmentation module to detect both in-distribution and OOD objects. (2) an incremental few-shot learning module to gradually incorporate those OOD objects into its existing knowledge base. This open world semantic segmentation system behaves like a human being, which is able to identify OOD objects and gradually learn them with corresponding supervision. We adopt the Deep Metric Learning Network (DMLNet) with contrastive clustering to implement open-set semantic segmentation. Compared to other open-set semantic segmentation methods, our DMLNet achieves state-of-the-art performance on three challenging open-set semantic segmentation datasets without using additional data or generative models. 
On this basis, two incremental few-shot learning methods are further proposed to progressively improve the DMLNet with the annotations of OOD objects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cen_Deep_Metric_Learning_for_Open_World_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -8933,7 +9540,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cen_2021_ICCV,\n \n author = {\n Cen,\n Jun and Yun,\n Peng and Cai,\n Junhao and Wang,\n Michael Yu and Liu,\n Ming\n},\n title = {\n Deep Metric Learning for Open World Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15333-15342\n} \n}" }, { "title": "Deep Permutation Equivariant Structure From Motion", @@ -8941,6 +9549,7 @@ "status": "Poster", "track": "main", "pid": 9310, + "author_site": "Dror Moran; Hodaya Koslowsky; Yoni Kasten; Haggai Maron; Meirav Galun; Ronen Basri", "author": "Dror Moran; Hodaya Koslowsky; Yoni Kasten; Haggai Maron; Meirav Galun; Ronen Basri", "abstract": "Existing deep methods produce highly accurate 3D reconstructions in stereo and multiview stereo settings, i.e., when cameras are both internally and externally calibrated. Nevertheless, the challenge of simultaneous recovery of camera poses and 3D scene structure in multiview settings with deep networks is still outstanding. Inspired by projective factorization for Structure from Motion (SFM) and by deep matrix completion techniques, we propose a neural network architecture that, given a set of point tracks in multiple images of a static scene, recovers both the camera parameters and a (sparse) scene structure by minimizing an unsupervised reprojection loss. 
Our network architecture is designed to respect the structure of the problem: the sought output is equivariant to permutations of both cameras and scene points. Notably, our method does not require initialization of camera parameters or 3D point locations. We test our architecture in two setups: (1) single scene reconstruction and (2) learning from multiple scenes. Our experiments, conducted on a variety of datasets in both internally calibrated and uncalibrated settings, indicate that our method accurately recovers pose and structure, on par with classical state of the art methods. Additionally, we show that a pre-trained network can be used to reconstruct novel scenes using inexpensive fine-tuning with no loss of accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Moran_Deep_Permutation_Equivariant_Structure_From_Motion_ICCV_2021_paper.pdf", @@ -8957,14 +9566,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Moran_Deep_Permutation_Equivariant_Structure_From_Motion_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "Weizmann Institute of Science;NVIDIA", + "aff_unique_norm": "Weizmann Institute of Science;NVIDIA Corporation", "aff_unique_dep": ";NVIDIA Research", "aff_unique_url": "https://www.weizmann.org.il;https://www.nvidia.com/research", "aff_unique_abbr": "Weizmann;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Moran_2021_ICCV,\n \n author = {\n Moran,\n Dror and Koslowsky,\n Hodaya and Kasten,\n Yoni and Maron,\n Haggai and Galun,\n Meirav and Basri,\n Ronen\n},\n title = {\n Deep Permutation Equivariant Structure From Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 
5976-5986\n} \n}" }, { "title": "Deep Relational Metric Learning", @@ -8972,6 +9582,7 @@ "status": "Poster", "track": "main", "pid": 2964, + "author_site": "Wenzhao Zheng; Borui Zhang; Jiwen Lu; Jie Zhou", "author": "Wenzhao Zheng; Borui Zhang; Jiwen Lu; Jie Zhou", "abstract": "This paper presents a deep relational metric learning (DRML) framework for image clustering and retrieval. Most existing deep metric learning methods learn an embedding space with a general objective of increasing interclass distances and decreasing intraclass distances. However, the conventional losses of metric learning usually suppress intraclass variations which might be helpful to identify samples of unseen classes. To address this problem, we propose to adaptively learn an ensemble of features that characterizes an image from different aspects to model both interclass and intraclass distributions. We further employ a relational module to capture the correlations among each feature in the ensemble and construct a graph to represent an image. We then perform relational inference on the graph to integrate the ensemble and obtain a relation-aware embedding to measure the similarities. 
Extensive experiments on the widely-used CUB-200-2011, Cars196, and Stanford Online Products datasets demonstrate that our framework improves existing deep metric learning methods and achieves very competitive results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Deep_Relational_Metric_Learning_ICCV_2021_paper.pdf", @@ -8995,7 +9606,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Wenzhao and Zhang,\n Borui and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Deep Relational Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12065-12074\n} \n}" }, { "title": "Deep Reparametrization of Multi-Frame Super-Resolution and Denoising", @@ -9003,6 +9615,7 @@ "status": "Poster", "track": "main", "pid": 5884, + "author_site": "Goutam Bhat; Martin Danelljan; Fisher Yu; Luc Van Gool; Radu Timofte", "author": "Goutam Bhat; Martin Danelljan; Fisher Yu; Luc Van Gool; Radu Timofte", "abstract": "We propose a deep reparametrization of the maximum a posteriori formulation commonly employed in multi-frame image restoration tasks. Our approach is derived by introducing a learned error metric and a latent representation of the target image, which transforms the MAP objective to a deep feature space. The deep reparametrization allows us to directly model the image formation process in the latent space, and to integrate learned image priors into the prediction. Our approach thereby leverages the advantages of deep learning, while also benefiting from the principled multi-frame fusion provided by the classical MAP formulation. We validate our approach through comprehensive experiments on burst denoising and burst super-resolution datasets. 
Our approach sets a new state-of-the-art for both tasks, demonstrating the generality and effectiveness of the proposed formulation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhat_Deep_Reparametrization_of_Multi-Frame_Super-Resolution_and_Denoising_ICCV_2021_paper.pdf", @@ -9026,7 +9639,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Bhat_2021_ICCV,\n \n author = {\n Bhat,\n Goutam and Danelljan,\n Martin and Yu,\n Fisher and Van Gool,\n Luc and Timofte,\n Radu\n},\n title = {\n Deep Reparametrization of Multi-Frame Super-Resolution and Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2460-2470\n} \n}" }, { "title": "Deep Structured Instance Graph for Distilling Object Detectors", @@ -9034,6 +9648,7 @@ "status": "Poster", "track": "main", "pid": 6152, + "author_site": "Yixin Chen; Pengguang Chen; Shu Liu; Liwei Wang; Jiaya Jia", "author": "Yixin Chen; Pengguang Chen; Shu Liu; Liwei Wang; Jiaya Jia", "abstract": "Effectively structuring deep knowledge plays a pivotal role in transfer from teacher to student, especially in semantic vision tasks. In this paper, we present a simple knowledge structure to exploit and encode information inside the detection system to facilitate detector knowledge distillation. Specifically, aiming at solving the feature imbalance problem while further excavating the missing relation inside semantic instances, we design a graph whose nodes correspond to instance proposal-level features and edges represent the relation between nodes. To further refine this graph, we design an adaptive background loss weight to reduce node noise and background samples mining to prune trivial edges. 
We transfer the entire graph as encoded knowledge representation from teacher to student, capturing local and global information simultaneously. We achieve new state-of-the-art results on the challenging COCO object detection task with diverse student-teacher pairs on both one- and two-stage detectors. We also experiment with instance segmentation to demonstrate robustness of our method. It is notable that distilled Faster R-CNN with ResNet18-FPN and ResNet50-FPN yields 38.68 and 41.82 Box AP respectively on the COCO benchmark, Faster R-CNN with ResNet101-FPN significantly achieves 43.38 AP, which outperforms ResNet152-FPN teacher about 0.7 AP. Code: https://github.com/dvlab-research/Dsig.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Deep_Structured_Instance_Graph_for_Distilling_Object_Detectors_ICCV_2021_paper.pdf", @@ -9050,14 +9665,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Deep_Structured_Instance_Graph_for_Distilling_Object_Detectors_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Yixin and Chen,\n Pengguang and Liu,\n Shu and Wang,\n Liwei and Jia,\n Jiaya\n},\n title = {\n Deep Structured Instance Graph for Distilling Object Detectors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4359-4368\n} \n}" }, { "title": "Deep Survival Analysis With Longitudinal X-Rays for COVID-19", @@ -9065,6 
+9681,7 @@ "status": "Poster", "track": "main", "pid": 7091, + "author_site": "Michelle Shu; Richard Strong Bowen; Charles Herrmann; Gengmo Qi; Michele Santacatterina; Ramin Zabih", "author": "Michelle Shu; Richard Strong Bowen; Charles Herrmann; Gengmo Qi; Michele Santacatterina; Ramin Zabih", "abstract": "Time-to-event analysis is an important statistical tool for allocating clinical resources such as ICU beds. However, classical techniques like the Cox model cannot directly incorporate images due to their high dimensionality. We propose a deep learning approach that naturally incorporates multiple, time-dependent imaging studies as well as non-imaging data into time-to-event analysis. Our techniques are benchmarked on a clinical dataset of 1,894 COVID-19 patients, and show that image sequences significantly improve predictions. For example, classical time-to-event methods produce a concordance error of around 30-40% for predicting hospital admission, while our error is 25% without images and 20% with multiple X-rays included. Ablation studies suggest that our models are not learning spurious features such as scanner artifacts. 
While our focus and evaluation is on COVID-19, the methods we develop are broadly applicable.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shu_Deep_Survival_Analysis_With_Longitudinal_X-Rays_for_COVID-19_ICCV_2021_paper.pdf", @@ -9088,7 +9705,8 @@ "aff_campus_unique_index": "0;0;0;0;0+2", "aff_campus_unique": "New York City;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shu_2021_ICCV,\n \n author = {\n Shu,\n Michelle and Bowen,\n Richard Strong and Herrmann,\n Charles and Qi,\n Gengmo and Santacatterina,\n Michele and Zabih,\n Ramin\n},\n title = {\n Deep Survival Analysis With Longitudinal X-Rays for COVID-19\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4046-4055\n} \n}" }, { "title": "Deep Symmetric Network for Underexposed Image Enhancement With Recurrent Attentional Learning", @@ -9096,6 +9714,7 @@ "status": "Poster", "track": "main", "pid": 5482, + "author_site": "Lin Zhao; Shao-Ping Lu; Tao Chen; Zhenglu Yang; Ariel Shamir", "author": "Lin Zhao; Shao-Ping Lu; Tao Chen; Zhenglu Yang; Ariel Shamir", "abstract": "Underexposed image enhancement is of importance in many research domains. In this paper, we take this problem as image feature transformation between the underexposed image and its paired enhanced version, and we propose a deep symmetric network for the issue. Our symmetric network adapts invertible neural networks (INN) for bidirectional feature learning between images, and to ensure the mutual propagation invertible we specifically construct two pairs of encoder-decoder with the same pretrained parameters. This invertible mechanism with bidirectional feature transformations enable us to both avoid colour bias and recover the content effectively for image enhancement. 
In addition, we propose a new recurrent residual-attention module (RRAM), where the recurrent learning network is designed to gradually perform the desired colour adjustments. Ablation experiments are executed to show the role of each component of our new architecture. We conduct a large number of experiments on two datasets to demonstrate that our method achieves the state-of-the-art effect in underexposed image enhancement. Code is available at https://www.shaopinglu.net/proj-iccv21/ImageEnhancement.html", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Deep_Symmetric_Network_for_Underexposed_Image_Enhancement_With_Recurrent_Attentional_ICCV_2021_paper.pdf", @@ -9112,14 +9731,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Deep_Symmetric_Network_for_Underexposed_Image_Enhancement_With_Recurrent_Attentional_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;2", - "aff_unique_norm": "Nankai University;Elephant Technologies;Interdisciplinary Center", + "aff_unique_norm": "Nankai University;Elephant Technologies;The Interdisciplinary Center", "aff_unique_dep": "Computer Science;;", "aff_unique_url": ";;", "aff_unique_abbr": ";;", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Tianjin;;Herzliya", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;Israel" + "aff_country_unique": "China;Israel", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Lin and Lu,\n Shao-Ping and Chen,\n Tao and Yang,\n Zhenglu and Shamir,\n Ariel\n},\n title = {\n Deep Symmetric Network for Underexposed Image Enhancement With Recurrent Attentional Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12075-12084\n} \n}" }, { "title": "Deep Transport Network for Unsupervised Video Object Segmentation", @@ -9127,6 +9747,7 @@ "status": "Poster", "track": "main", 
"pid": 2928, + "author_site": "Kaihua Zhang; Zicheng Zhao; Dong Liu; Qingshan Liu; Bo Liu", "author": "Kaihua Zhang; Zicheng Zhao; Dong Liu; Qingshan Liu; Bo Liu", "abstract": "The popular unsupervised video object segmentation methods fuse the RGB frame and optical flow via a two-stream network. However, they cannot handle the distracting noises in each input modality, which may vastly deteriorate the model performance. We propose to establish the correspondence between the input modalities while suppressing the distracting signals via optimal structural matching. Given a video frame, we extract the dense local features from the RGB image and optical flow, and treat them as two complex structured representations. The Wasserstein distance is then employed to compute the global optimal flows to transport the features in one modality to the other, where the magnitude of each flow measures the extent of the alignment between two local features. To plug the structural matching into a two-stream network for end-to-end training, we factorize the input cost matrix into small spatial blocks and design a differentiable long-short Sinkhorn module consisting of a long-distant Sinkhorn layer and a short-distant Sinkhorn layer. We integrate the module into a dedicated two-stream network and dub our model TransportNet. 
Our experiments show that aligning motion-appearance yields the state-of-the-art results on the popular video object segmentation datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Deep_Transport_Network_for_Unsupervised_Video_Object_Segmentation_ICCV_2021_paper.pdf", @@ -9143,14 +9764,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Deep_Transport_Network_for_Unsupervised_Video_Object_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;2", - "aff_unique_norm": "Nanjing University of Information Science and Technology;Netflix Inc.;JD", - "aff_unique_dep": "School of Computing and Software;;JD Finance America Corporation", + "aff_unique_norm": "Nanjing University of Information Science and Technology;Netflix Inc.;JD Finance America Corporation", + "aff_unique_dep": "School of Computing and Software;;", "aff_unique_url": "http://www.nuist.edu.cn;https://www.netflix.com;", "aff_unique_abbr": ";Netflix;", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Los Gatos;Mountain View", "aff_country_unique_index": "0;0;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Kaihua and Zhao,\n Zicheng and Liu,\n Dong and Liu,\n Qingshan and Liu,\n Bo\n},\n title = {\n Deep Transport Network for Unsupervised Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8781-8790\n} \n}" }, { "title": "Deep Virtual Markers for Articulated 3D Shapes", @@ -9158,6 +9780,7 @@ "status": "Poster", "track": "main", "pid": 5772, + "author_site": "Hyomin Kim; Jungeon Kim; Jaewon Kam; Jaesik Park; Seungyong Lee", "author": "Hyomin Kim; Jungeon Kim; Jaewon Kam; Jaesik Park; Seungyong Lee", "abstract": "We propose deep virtual markers, a framework for 
estimating dense and accurate positional information for various types of 3D data. We design a concept and construct a framework that maps 3D points of 3D articulated models, like humans, into virtual marker labels. To realize the framework, we adopt a sparse convolutional neural network and classify 3D points of an articulated model into virtual marker labels. We propose to use soft labels for the classifier to learn rich and dense interclass relationships based on geodesic distance. To measure the localization accuracy of the virtual markers, we test FAUST challenge, and our result outperforms the state-of-the-art. We also observe outstanding performance on the generalizability test, unseen data evaluation, and different 3D data types (meshes and depth maps). We show additional applications using the estimated virtual markers, such as non-rigid registration, texture transfer, and realtime dense marker prediction from depth maps.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Deep_Virtual_Markers_for_Articulated_3D_Shapes_ICCV_2021_paper.pdf", @@ -9181,7 +9804,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Hyomin and Kim,\n Jungeon and Kam,\n Jaewon and Park,\n Jaesik and Lee,\n Seungyong\n},\n title = {\n Deep Virtual Markers for Articulated 3D Shapes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11615-11625\n} \n}" }, { "title": "DeepCAD: A Deep Generative Network for Computer-Aided Design Models", @@ -9189,6 +9813,7 @@ "status": "Poster", "track": "main", "pid": 6743, + "author_site": "Rundi Wu; Chang Xiao; Changxi Zheng", "author": "Rundi Wu; Chang Xiao; Changxi Zheng", "abstract": "Deep generative models of 
3D shapes have received a great deal of research interest. Yet, almost all of them generate discrete shape representations, such as voxels, point clouds, and polygon meshes. We present the first 3D generative model for a drastically different shape representation --- describing a shape as a sequence of computer-aided design (CAD) operations. Unlike meshes and point clouds, CAD models encode the user creation process of 3D shapes, widely used in numerous industrial and engineering design tasks. However, the sequential and irregular structure of CAD operations poses significant challenges for existing 3D generative models. Drawing an analogy between CAD operations and natural language, we propose a CAD generative network based on the Transformer. We demonstrate the performance of our model for both shape autoencoding and random shape generation. To train our network, we create a new CAD dataset consisting of 178,238 models and their CAD construction sequences. We have made this dataset publicly available to promote future research on this topic.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_DeepCAD_A_Deep_Generative_Network_for_Computer-Aided_Design_Models_ICCV_2021_paper.pdf", @@ -9212,7 +9837,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Rundi and Xiao,\n Chang and Zheng,\n Changxi\n},\n title = {\n DeepCAD: A Deep Generative Network for Computer-Aided Design Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6772-6782\n} \n}" }, { "title": "DeepGaze IIE: Calibrated Prediction in and Out-of-Domain for State-of-the-Art Saliency Modeling", @@ -9220,10 +9846,11 @@ "status": "Poster", "track": "main", "pid": 7559, - "author": "Akis 
Linardos; Matthias K\u00fcmmerer; Ori Press; Matthias Bethge", + "author_site": "Akis Linardos; Matthias Kümmerer; Ori Press; Matthias Bethge", + "author": "Akis Linardos; Matthias Kümmerer; Ori Press; Matthias Bethge", "abstract": "Since 2014 transfer learning has become the key driver for the improvement of spatial saliency prediction - however, with stagnant progress in the last 3-5 years. We conduct a large-scale transfer learning study which tests different ImageNet backbones, always using the same read out architecture and learning protocol adopted from DeepGaze II. By replacing the VGG19 backbone of DeepGaze II with ResNet50 features we improve the performance on saliency prediction from 78% to 85%. However, as we continue to test better ImageNet models as backbones - such as EfficientNetB5 - we observe no additional improvement on saliency prediction. By analyzing the backbones further, we find that generalization to other datasets differs substantially, with models being consistently overconfident in their fixation predictions. We show that by combining multiple backbones in a principled manner a good confidence calibration on unseen datasets can be achieved. 
This new model \"DeepGaze IIE\" yields a significant leap in benchmark performance in and out-of-domain with a 15 percent point improvement over DeepGaze II to 93% on MIT1003, marking a new state of the art on the MIT/Tuebingen Saliency Benchmark in all available metrics (AUC: 88.3%, sAUC: 79.4%, CC: 82.4%).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Linardos_DeepGaze_IIE_Calibrated_Prediction_in_and_Out-of-Domain_for_State-of-the-Art_Saliency_ICCV_2021_paper.pdf", - "aff": "University of Barcelona; University of T\u00fcbingen; University of T\u00fcbingen; University of T\u00fcbingen", + "aff": "University of Barcelona; University of Tübingen; University of Tübingen; University of Tübingen", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Linardos_DeepGaze_IIE_Calibrated_ICCV_2021_supplemental.pdf", @@ -9236,14 +9863,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Linardos_DeepGaze_IIE_Calibrated_Prediction_in_and_Out-of-Domain_for_State-of-the-Art_Saliency_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "University of Barcelona;University of T\u00fcbingen", + "aff_unique_norm": "University of Barcelona;University of Tübingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.ub.edu;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "UB;Uni T\u00fcbingen", + "aff_unique_abbr": "UB;Uni Tübingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Spain;Germany" + "aff_country_unique": "Spain;Germany", + "bibtex": "@InProceedings{Linardos_2021_ICCV,\n \n author = {\n Linardos,\n Akis and K\\\"ummerer,\n Matthias and Press,\n Ori and Bethge,\n Matthias\n},\n title = {\n DeepGaze IIE: Calibrated Prediction in and Out-of-Domain for State-of-the-Art Saliency Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12919-12928\n} \n}" }, { "title": "DeepMultiCap: Performance Capture of Multiple Characters Using Sparse Multiview Cameras", @@ -9251,6 +9879,7 @@ "status": "Poster", "track": "main", "pid": 8847, + "author_site": "Yang Zheng; Ruizhi Shao; Yuxiang Zhang; Tao Yu; Zerong Zheng; Qionghai Dai; Yebin Liu", "author": "Yang Zheng; Ruizhi Shao; Yuxiang Zhang; Tao Yu; Zerong Zheng; Qionghai Dai; Yebin Liu", "abstract": "We propose DeepMultiCap, a novel method for multi-person performance capture using sparse multi-view cameras. Our method can capture time varying surface details without the need of using pre-scanned template models. To tackle with the serious occlusion challenge for close interacting scenes, we combine a recently proposed pixel-aligned implicit function with parametric model for robust reconstruction of the invisible surface areas. An effective attention-aware module is designed to obtain the fine-grained geometry details from multi-view images, where high-fidelity results can be generated. In addition to the spatial attention method, for video inputs, we further propose a novel temporal fusion method to alleviate the noise and temporal inconsistencies for moving character reconstruction. For quantitative evaluation, we contribute a high quality multi-person dataset, MultiHuman, which consists of 150 static scenes with different levels of occlusions and ground truth 3D human models. 
Experimental results demonstrate the state-of-the-art performance of our method and the well generalization to real multiview video data, which outperforms the prior works by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_DeepMultiCap_Performance_Capture_of_Multiple_Characters_Using_Sparse_Multiview_Cameras_ICCV_2021_paper.pdf", @@ -9274,7 +9903,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Yang and Shao,\n Ruizhi and Zhang,\n Yuxiang and Yu,\n Tao and Zheng,\n Zerong and Dai,\n Qionghai and Liu,\n Yebin\n},\n title = {\n DeepMultiCap: Performance Capture of Multiple Characters Using Sparse Multiview Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6239-6249\n} \n}" }, { "title": "DeepPRO: Deep Partial Point Cloud Registration of Objects", @@ -9282,6 +9912,7 @@ "status": "Poster", "track": "main", "pid": 9934, + "author_site": "Donghoon Lee; Onur C. Hamsici; Steven Feng; Prachee Sharma; Thorsten Gernoth", "author": "Donghoon Lee; Onur C. Hamsici; Steven Feng; Prachee Sharma; Thorsten Gernoth", "abstract": "We consider the problem of online and real-time registration of partial point clouds obtained from an unseen real-world rigid object without knowing its 3D model. The point cloud is partial as it is obtained by a depth sensor capturing only the visible part of the object from a certain viewpoint. It introduces two main challenges: 1) two partial point clouds do not fully overlap and 2) keypoints tend to be less reliable when the visible part of the object does not have salient local structures. To address these issues, we propose DeepPRO, a keypoint-free and an end-to-end trainable deep neural network. 
Its core idea is inspired by how humans align two point clouds: we can imagine how two point clouds will look like after the registration based on their shape. To realize the idea, DeepPRO has inputs of two partial point clouds and directly predicts the point-wise location of the aligned point cloud. By preserving the ordering of points during the prediction, we enjoy dense correspondences between input and predicted point clouds when inferring rigid transform parameters. We conduct extensive experiments on the real-world Linemod and synthetic ModelNet40 datasets. In addition, we collect and evaluate on the PRO1k dataset, a large-scale version of Linemod meant to test generalization to real-world scans. Results show that DeepPRO achieves the best accuracy against thirteen strong baseline methods, e.g., 2.2mm ADD on the Linemod dataset, while running 50 fps on mobile devices.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_DeepPRO_Deep_Partial_Point_Cloud_Registration_of_Objects_ICCV_2021_paper.pdf", @@ -9298,14 +9929,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_DeepPRO_Deep_Partial_Point_Cloud_Registration_of_Objects_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Apple", - "aff_unique_dep": "Apple Inc.", + "aff_unique_norm": "Apple Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Donghoon and Hamsici,\n Onur C. 
and Feng,\n Steven and Sharma,\n Prachee and Gernoth,\n Thorsten\n},\n title = {\n DeepPRO: Deep Partial Point Cloud Registration of Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5683-5692\n} \n}" }, { "title": "DeepPanoContext: Panoramic 3D Scene Understanding With Holistic Scene Context Graph and Relation-Based Optimization", @@ -9313,6 +9945,7 @@ "status": "Poster", "track": "main", "pid": 7881, + "author_site": "Cheng Zhang; Zhaopeng Cui; Cai Chen; Shuaicheng Liu; Bing Zeng; Hujun Bao; Yinda Zhang", "author": "Cheng Zhang; Zhaopeng Cui; Cai Chen; Shuaicheng Liu; Bing Zeng; Hujun Bao; Yinda Zhang", "abstract": "Panorama images have a much larger field-of-view thus naturally encode enriched scene context information compared to standard perspective images, which however is not well exploited in the previous scene understanding methods. In this paper, we propose a novel method for panoramic 3D scene understanding which recovers the 3D room layout and the shape, pose, position, and semantic category for each object from a single full-view panorama image. In order to fully utilize the rich context information, we design a novel graph neural network based context model to predict the relationship among objects and room layout, and a differentiable relationship-based optimization module to optimize object arrangement with well-designed objective functions on-the-fly. Realizing the existing data are either with incomplete ground truth or overly-simplified scene, we present a new synthetic dataset with good diversity in room layout and furniture placement, and realistic image quality for total panoramic 3D scene understanding. Experiments demonstrate that our method outperforms existing methods on panoramic scene understanding in terms of both geometry accuracy and object arrangement. 
Code is available at https://chengzhag.github.io/publication/dpc.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_DeepPanoContext_Panoramic_3D_Scene_Understanding_With_Holistic_Scene_Context_Graph_ICCV_2021_paper.pdf", @@ -9330,13 +9963,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_DeepPanoContext_Panoramic_3D_Scene_Understanding_With_Holistic_Scene_Context_Graph_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;0;1;2", "aff_unique_norm": "University of Electronic Science and Technology of China;Zhejiang University;Google", - "aff_unique_dep": ";State Key Lab of CAD & CG;Google", + "aff_unique_dep": ";State Key Lab of CAD & CG;", "aff_unique_url": "https://www.uestc.edu.cn;http://www.zju.edu.cn;https://www.google.com", "aff_unique_abbr": "UESTC;ZJU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Cheng and Cui,\n Zhaopeng and Chen,\n Cai and Liu,\n Shuaicheng and Zeng,\n Bing and Bao,\n Hujun and Zhang,\n Yinda\n},\n title = {\n DeepPanoContext: Panoramic 3D Scene Understanding With Holistic Scene Context Graph and Relation-Based Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12632-12641\n} \n}" }, { "title": "Defending Against Universal Adversarial Patches by Clipping Feature Norms", @@ -9344,6 +9978,7 @@ "status": "Poster", "track": "main", "pid": 4001, + "author_site": "Cheng Yu; Jiansheng Chen; Youze Xue; Yuyang Liu; Weitao Wan; Jiayu Bao; Huimin Ma", "author": "Cheng Yu; Jiansheng Chen; Youze Xue; Yuyang Liu; Weitao Wan; Jiayu Bao; Huimin Ma", "abstract": "Physical-world adversarial attacks based on universal adversarial patches have been proved 
to be able to mislead deep convolutional neural networks (CNNs), exposing the vulnerability of real-world visual classification systems based on CNNs. In this paper, we empirically reveal and mathematically explain that the universal adversarial patches usually lead to deep feature vectors with very large norms in popular CNNs. Inspired by this, we propose a simple yet effective defending approach using a new feature norm clipping (FNC) layer which is a differentiable module that can be flexibly inserted in different CNNs to adaptively suppress the generation of large norm deep feature vectors. FNC introduces no trainable parameter and only very low computational overhead. However, experiments on multiple datasets validate that it can effectively improve the robustness of different CNNs towards white-box patch attacks while maintaining a satisfactory recognition accuracy for clean samples.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Defending_Against_Universal_Adversarial_Patches_by_Clipping_Feature_Norms_ICCV_2021_paper.pdf", @@ -9367,7 +10002,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Cheng and Chen,\n Jiansheng and Xue,\n Youze and Liu,\n Yuyang and Wan,\n Weitao and Bao,\n Jiayu and Ma,\n Huimin\n},\n title = {\n Defending Against Universal Adversarial Patches by Clipping Feature Norms\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16434-16442\n} \n}" }, { "title": "Defocus Map Estimation and Deblurring From a Single Dual-Pixel Image", @@ -9375,6 +10011,7 @@ "status": "Poster", "track": "main", "pid": 3469, + "author_site": "Shumian Xin; Neal Wadhwa; Tianfan Xue; Jonathan T. Barron; Pratul P. 
Srinivasan; Jiawen Chen; Ioannis Gkioulekas; Rahul Garg", "author": "Shumian Xin; Neal Wadhwa; Tianfan Xue; Jonathan T. Barron; Pratul P. Srinivasan; Jiawen Chen; Ioannis Gkioulekas; Rahul Garg", "abstract": "We present a method that takes as input a single dual-pixel image, and simultaneously estimates the image's defocus map---the amount of defocus blur at each pixel---and recovers an all-in-focus image. Our method is inspired from recent works that leverage the dual-pixel sensors available in many consumer cameras to assist with autofocus, and use them for recovery of defocus maps or all-in-focus images. These prior works have solved the two recovery problems independently of each other, and often require large labeled datasets for supervised training. By contrast, we show that it is beneficial to treat these two closely-connected problems simultaneously. To this end, we set up an optimization problem that, by carefully modeling the optics of dual-pixel images, jointly solves both problems. 
We use data captured with a consumer smartphone camera to demonstrate that, after a one-time calibration step, our approach improves upon prior works for both defocus map estimation and blur removal, despite being entirely unsupervised.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xin_Defocus_Map_Estimation_and_Deblurring_From_a_Single_Dual-Pixel_Image_ICCV_2021_paper.pdf", @@ -9391,14 +10028,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xin_Defocus_Map_Estimation_and_Deblurring_From_a_Single_Dual-Pixel_Image_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;1;2;0;1", - "aff_unique_norm": "Carnegie Mellon University;Google;Adobe", - "aff_unique_dep": ";Google Research;Adobe Inc.", + "aff_unique_norm": "Carnegie Mellon University;Google;Adobe Inc.", + "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.cmu.edu;https://research.google;https://www.adobe.com", "aff_unique_abbr": "CMU;Google Research;Adobe", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xin_2021_ICCV,\n \n author = {\n Xin,\n Shumian and Wadhwa,\n Neal and Xue,\n Tianfan and Barron,\n Jonathan T. and Srinivasan,\n Pratul P. 
and Chen,\n Jiawen and Gkioulekas,\n Ioannis and Garg,\n Rahul\n},\n title = {\n Defocus Map Estimation and Deblurring From a Single Dual-Pixel Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2228-2238\n} \n}" }, { "title": "Dense Deep Unfolding Network With 3D-CNN Prior for Snapshot Compressive Imaging", @@ -9406,6 +10044,7 @@ "status": "Poster", "track": "main", "pid": 6677, + "author_site": "Zhuoyuan Wu; Jian Zhang; Chong Mou", "author": "Zhuoyuan Wu; Jian Zhang; Chong Mou", "abstract": "Snapshot compressive imaging (SCI) aims to record three-dimensional signals via a two-dimensional camera. For the sake of building a fast and accurate SCI recovery algorithm, we incorporate the interpretability of model-based methods and the speed of learning-based ones and present a novel dense deep unfolding network (DUN) with 3D-CNN prior for SCI, where each phase is unrolled from an iteration of Half-Quadratic Splitting (HQS). To better exploit the spatial-temporal correlation among frames and address the problem of information loss between adjacent phases in existing DUNs, we propose to adopt the 3D-CNN prior in our proximal mapping module and develop a novel dense feature map (DFM) strategy, respectively. Besides, in order to promote network robustness, we further propose a dense feature map adaption (DFMA) module to allow inter-phase information to fuse adaptively. All the parameters are learned in an end-to-end fashion. Extensive experiments on simulation data and real data verify the superiority of our method. 
The source code is available at \\href https://github.com/jianzhangcs/SCI3D https://github.com/jianzhangcs/SCI3D .", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Dense_Deep_Unfolding_Network_With_3D-CNN_Prior_for_Snapshot_Compressive_ICCV_2021_paper.pdf", @@ -9420,7 +10059,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Dense_Deep_Unfolding_Network_With_3D-CNN_Prior_for_Snapshot_Compressive_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Dense_Deep_Unfolding_Network_With_3D-CNN_Prior_for_Snapshot_Compressive_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Zhuoyuan and Zhang,\n Jian and Mou,\n Chong\n},\n title = {\n Dense Deep Unfolding Network With 3D-CNN Prior for Snapshot Compressive Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4892-4901\n} \n}" }, { "title": "Dense Interaction Learning for Video-Based Person Re-Identification", @@ -9428,6 +10068,7 @@ "status": "Poster", "track": "main", "pid": 7184, + "author_site": "Tianyu He; Xin Jin; Xu Shen; Jianqiang Huang; Zhibo Chen; Xian-Sheng Hua", "author": "Tianyu He; Xin Jin; Xu Shen; Jianqiang Huang; Zhibo Chen; Xian-Sheng Hua", "abstract": "Video-based person re-identification (re-ID) aims at matching the same person across video clips. Efficiently exploiting multi-scale fine-grained features while building the structural interaction among them is pivotal for its success. In this paper, we propose a hybrid framework, Dense Interaction Learning (DenseIL), that takes the principal advantages of both CNN-based and Attention-based architectures to tackle video-based person re-ID difficulties. DenseIL contains a CNN encoder and a Dense Interaction (DI) decoder. 
The CNN encoder is responsible for efficiently extracting discriminative spatial features while the DI decoder is designed to densely model spatial-temporal inherent interaction across frames. Different from previous works, we additionally let the DI decoder densely attends to intermediate fine-grained CNN features and that naturally yields multi-grained spatial-temporal representation for each video clip. Moreover, we introduce Spatio-TEmporal Positional Embedding (STEP-Emb) into the DI decoder to investigate the positional relation among the spatial-temporal inputs. Our experiments consistently and significantly outperform all the state-of-the-art methods on multiple standard video-based person re-ID datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_Dense_Interaction_Learning_for_Video-Based_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -9451,7 +10092,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Tianyu and Jin,\n Xin and Shen,\n Xu and Huang,\n Jianqiang and Chen,\n Zhibo and Hua,\n Xian-Sheng\n},\n title = {\n Dense Interaction Learning for Video-Based Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1490-1501\n} \n}" }, { "title": "DensePose 3D: Lifting Canonical Surface Maps of Articulated Objects to the Third Dimension", @@ -9459,6 +10101,7 @@ "status": "Poster", "track": "main", "pid": 9147, + "author_site": "Roman Shapovalov; David Novotny; Benjamin Graham; Patrick Labatut; Andrea Vedaldi", "author": "Roman Shapovalov; David Novotny; Benjamin Graham; Patrick Labatut; Andrea Vedaldi", "abstract": "We tackle the problem of monocular 3D reconstruction of articulated objects like humans and animals. 
Our key contribution is DensePose 3D, a novel parametric model of an articulated mesh, which can be learned in a self-supervised fashion from 2D image annotations only. This is in stark contrast with previous human body reconstruction methods that utilize a parametric model like SMPL pre-trained on a large dataset of 3D body scans that had to be obtained in a controlled environment. DensePose 3D can thus be applied for modelling broad range of articulated categories such as animal species. In an end-to-end fashion, it automatically learns to softly assign each vertex of a category-specific 3D template mesh to one of the rigidly moving latent parts and trains a single-view network predicting rigid motions of the parts to deform the template so that it re-projects correctly to the dense 2D surface annotations of objects (such as DensePose). In order to prevent unrealistic template deformations, we further propose to align the motions of nearby mesh vertices by expressing the part assignment as a function of the smooth eigenfunctions of the Laplace--Beltrami operator computed on the template mesh. 
Our experiments demonstrate improvements over the state-of-the-art non-rigid structure-from-motion baselines on both synthetic and real data on categories of humans and animals.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shapovalov_DensePose_3D_Lifting_Canonical_Surface_Maps_of_Articulated_Objects_to_ICCV_2021_paper.pdf", @@ -9475,14 +10118,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shapovalov_DensePose_3D_Lifting_Canonical_Surface_Maps_of_Articulated_Objects_to_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shapovalov_2021_ICCV,\n \n author = {\n Shapovalov,\n Roman and Novotny,\n David and Graham,\n Benjamin and Labatut,\n Patrick and Vedaldi,\n Andrea\n},\n title = {\n DensePose 3D: Lifting Canonical Surface Maps of Articulated Objects to the Third Dimension\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11729-11739\n} \n}" }, { "title": "DenseTNT: End-to-End Trajectory Prediction From Dense Goal Sets", @@ -9490,6 +10134,7 @@ "status": "Poster", "track": "main", "pid": 7610, + "author_site": "Junru Gu; Chen Sun; Hang Zhao", "author": "Junru Gu; Chen Sun; Hang Zhao", "abstract": "Due to the stochasticity of human behaviors, predicting the future trajectories of road agents is challenging for autonomous driving. Recently, goal-based multi-trajectory prediction methods are proved to be effective, where they first score over-sampled goal candidates and then select a final set from them. 
However, these methods usually involve goal predictions based on sparse pre-defined anchors and heuristic goal selection algorithms. In this work, we propose an anchor-free and end-to-end trajectory prediction model, named DenseTNT, that directly outputs a set of trajectories from dense goal candidates. In addition, we introduce an offline optimization-based technique to provide multi-future pseudo-labels for our final online model. Experiments show that DenseTNT achieves state-of-the-art performance, ranking 1st on the Argoverse motion forecasting benchmark and being the 1st place winner of the 2021 Waymo Open Dataset Motion Prediction Challenge.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_DenseTNT_End-to-End_Trajectory_Prediction_From_Dense_Goal_Sets_ICCV_2021_paper.pdf", @@ -9513,7 +10158,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Gu_2021_ICCV,\n \n author = {\n Gu,\n Junru and Sun,\n Chen and Zhao,\n Hang\n},\n title = {\n DenseTNT: End-to-End Trajectory Prediction From Dense Goal Sets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15303-15312\n} \n}" }, { "title": "Densely Guided Knowledge Distillation Using Multiple Teacher Assistants", @@ -9521,6 +10167,7 @@ "status": "Poster", "track": "main", "pid": 6520, + "author_site": "Wonchul Son; Jaemin Na; Junyong Choi; Wonjun Hwang", "author": "Wonchul Son; Jaemin Na; Junyong Choi; Wonjun Hwang", "abstract": "With the success of deep neural networks, knowledge distillation which guides the learning of a small student network from a large teacher network is being actively studied for model compression and transfer learning. 
However, few studies have been performed to resolve the poor learning issue of the student network when the student and teacher model sizes significantly differ. In this paper, we propose a densely guided knowledge distillation using multiple teacher assistants that gradually decreases the model size to efficiently bridge the large gap between the teacher and student networks. To stimulate more efficient learning of the student network, we guide each teacher assistant to every other smaller teacher assistants iteratively. Specifically, when teaching a smaller teacher assistant at the next step, the existing larger teacher assistants from the previous step are used as well as the teacher network. Moreover, we design stochastic teaching where, for each mini-batch, a teacher or teacher assistants are randomly dropped. This acts as a regularizer to improve the efficiency of teaching of the student network. Thus, the student can always learn salient distilled knowledge from the multiple sources. We verified the effectiveness of the proposed method for a classification task using CIFAR-10, CIFAR-100, and ImageNet. 
We also achieved significant performance improvements with various backbone architectures such as ResNet, WideResNet, and VGG.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Son_Densely_Guided_Knowledge_Distillation_Using_Multiple_Teacher_Assistants_ICCV_2021_paper.pdf", @@ -9544,7 +10191,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Son_2021_ICCV,\n \n author = {\n Son,\n Wonchul and Na,\n Jaemin and Choi,\n Junyong and Hwang,\n Wonjun\n},\n title = {\n Densely Guided Knowledge Distillation Using Multiple Teacher Assistants\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9395-9404\n} \n}" }, { "title": "DepthInSpace: Exploitation and Fusion of Multiple Video Frames for Structured-Light Depth Estimation", @@ -9552,7 +10200,8 @@ "status": "Poster", "track": "main", "pid": 7984, - "author": "Mohammad Mahdi Johari; Camilla Carta; Fran\u00e7ois Fleuret", + "author_site": "Mohammad Mahdi Johari; Camilla Carta; François Fleuret", + "author": "Mohammad Mahdi Johari; Camilla Carta; François Fleuret", "abstract": "We present DepthInSpace, a self-supervised deep-learning method for depth estimation using a structured-light camera. The design of this method is motivated by the commercial use case of embedded depth sensors in nowadays smartphones. We first propose to use estimated optical flow from ambient information of multiple video frames as a complementary guide for training a single-frame depth estimation network, helping to preserve edges and reduce over-smoothing issues. Utilizing optical flow, we also propose to fuse the data of multiple video frames to get a more accurate depth map. 
In particular, fused depth maps are more robust in occluded areas and incur less in flying pixels artifacts. We finally demonstrate that these more precise fused depth maps can be used as self-supervision for fine-tuning a single-frame depth estimation network to improve its performance. Our models' effectiveness is evaluated and compared with state-of-the-art models on both synthetic and our newly introduced real datasets. The implementation code, training procedure, and both synthetic and captured real datasets are available at https://www.idiap.ch/paper/depthinspace.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Johari_DepthInSpace_Exploitation_and_Fusion_of_Multiple_Video_Frames_for_Structured-Light_ICCV_2021_paper.pdf", "aff": "Idiap Reserach Institute, EPFL; ams OSRAM; University of Geneva, EPFL", @@ -9575,7 +10224,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Switzerland;Austria" + "aff_country_unique": "Switzerland;Austria", + "bibtex": "@InProceedings{Johari_2021_ICCV,\n \n author = {\n Johari,\n Mohammad Mahdi and Carta,\n Camilla and Fleuret,\n Fran\\c{c\n}ois\n},\n title = {\n DepthInSpace: Exploitation and Fusion of Multiple Video Frames for Structured-Light Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6039-6048\n} \n}" }, { "title": "DepthTrack: Unveiling the Power of RGBD Tracking", @@ -9583,7 +10233,8 @@ "status": "Poster", "track": "main", "pid": 6872, - "author": "Song Yan; Jinyu Yang; Jani K\u00e4pyl\u00e4; Feng Zheng; Ale\u0161 Leonardis; Joni-Kristian K\u00e4m\u00e4r\u00e4inen", + "author_site": "Song Yan; Jinyu Yang; Jani Käpylä; Feng Zheng; Aleš Leonardis; Joni-Kristian Kämäräinen", + "author": "Song Yan; Jinyu Yang; Jani Käpylä; Feng Zheng; Aleš Leonardis; Joni-Kristian Kämäräinen", "abstract": "RGBD (RGB plus 
depth) object tracking is gaining momentum as RGBD sensors have become popular in many application fields such as robotics. However, the best RGBD trackers are extensions of the state-of-the-art deep RGB trackers. They are trained with RGB data and the depth channel is used as a sidekick for subtleties such as occlusion detection. This can be explained by the fact that there are no sufficiently large RGBD datasets to 1) train \"deep depth trackers\" and to 2) challenge RGB trackers with sequences for which the depth cue is essential. This work introduces a new RGBD tracking dataset - DepthTrack - that has twice as many sequences (200) and scene types (40) than in the largest existing dataset, and three times more objects (90). In addition, the average length of the sequences (1473), the number of deformable objects (16) and the number of annotated tracking attributes (15) have been increased. Furthermore, by running the SotA RGB and RGBD trackers on DepthTrack, we propose a new RGBD tracking baseline, namely DeT, which reveals that deep RGBD tracking indeed benefits from genuine training data. 
The code and dataset is available at https://github.com/xiaozai/DeT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yan_DepthTrack_Unveiling_the_Power_of_RGBD_Tracking_ICCV_2021_paper.pdf", "aff": "Tampere University; Southern University of Science and Technology+University of Birmingham; Tampere University; Southern University of Science and Technology; University of Birmingham; Tampere University", @@ -9606,7 +10257,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2;0;1;2;0", - "aff_country_unique": "Finland;China;United Kingdom" + "aff_country_unique": "Finland;China;United Kingdom", + "bibtex": "@InProceedings{Yan_2021_ICCV,\n \n author = {\n Yan,\n Song and Yang,\n Jinyu and K\\"apyl\\"a,\n Jani and Zheng,\n Feng and Leonardis,\n Ale\\v{s\n} and K\\"am\\"ar\\"ainen,\n Joni-Kristian\n},\n title = {\n DepthTrack: Unveiling the Power of RGBD Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10725-10733\n} \n}" }, { "title": "Describing and Localizing Multiple Changes With Transformers", @@ -9614,6 +10266,7 @@ "status": "Poster", "track": "main", "pid": 5437, + "author_site": "Yue Qiu; Shintaro Yamamoto; Kodai Nakashima; Ryota Suzuki; Kenji Iwata; Hirokatsu Kataoka; Yutaka Satoh", "author": "Yue Qiu; Shintaro Yamamoto; Kodai Nakashima; Ryota Suzuki; Kenji Iwata; Hirokatsu Kataoka; Yutaka Satoh", "abstract": "Existing change captioning studies have mainly focused on a single change. However, detecting and describing multiple changed parts in image pairs is essential for enhancing adaptability to complex scenarios. 
We solve the above issues from three aspects: (i) We propose a simulation-based multi-change captioning dataset; (ii) We benchmark existing state-of-the-art methods of single change captioning on multi-change captioning; (iii) We further propose Multi-Change Captioning transformers (MCCFormers) that identify change regions by densely correlating different regions in image pairs and dynamically determines the related change regions with words in sentences. The proposed method obtained the highest scores on four conventional change captioning evaluation metrics for multi-change captioning. Additionally, our proposed method can separate attention maps for each change and performs well with respect to change localization. Moreover, the proposed framework outperformed the previous state-of-the-art methods on an existing change captioning benchmark, CLEVR-Change, by a large margin (+6.1 on BLEU-4 and +9.7 on CIDEr scores), indicating its general ability in change captioning tasks. The code and dataset are available at the project page.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qiu_Describing_and_Localizing_Multiple_Changes_With_Transformers_ICCV_2021_paper.pdf", @@ -9637,7 +10290,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Qiu_2021_ICCV,\n \n author = {\n Qiu,\n Yue and Yamamoto,\n Shintaro and Nakashima,\n Kodai and Suzuki,\n Ryota and Iwata,\n Kenji and Kataoka,\n Hirokatsu and Satoh,\n Yutaka\n},\n title = {\n Describing and Localizing Multiple Changes With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1971-1980\n} \n}" }, { "title": "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution", @@ -9645,6 +10299,7 @@ "status": "Poster", 
"track": "main", "pid": 3968, + "author_site": "Kai Zhang; Jingyun Liang; Luc Van Gool; Radu Timofte", "author": "Kai Zhang; Jingyun Liang; Luc Van Gool; Radu Timofte", "abstract": "It is widely acknowledged that single image super-resolution (SISR) methods would not perform well if the assumed degradation model deviates from those in real images. Although several degradation models take additional factors into consideration, such as blur, they are still not effective enough to cover the diverse degradations of real images. To address this issue, this paper proposes to design a more complex but practical degradation model that consists of randomly shuffled blur, downsampling and noise degradations. Specifically, the blur is approximated by two convolutions with isotropic and anisotropic Gaussian kernels; the downsampling is randomly chosen from nearest, bilinear and bicubic interpolations; the noise is synthesized by adding Gaussian noise with different noise levels, adopting JPEG compression with different quality factors, and generating processed camera sensor noise via reverse-forward camera image signal processing (ISP) pipeline model and RAW image noise model. To verify the effectiveness of the new degradation model, we have trained a deep blind ESRGAN super-resolver and then applied it to super-resolve both synthetic and real images with diverse degradations. 
The experimental results demonstrate that the new degradation model can help to significantly improve the practicability of deep super-resolvers, thus providing a powerful alternative solution for real SISR applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Designing_a_Practical_Degradation_Model_for_Deep_Blind_Image_Super-Resolution_ICCV_2021_paper.pdf", @@ -9668,7 +10323,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0+1;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Kai and Liang,\n Jingyun and Van Gool,\n Luc and Timofte,\n Radu\n},\n title = {\n Designing a Practical Degradation Model for Deep Blind Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4791-4800\n} \n}" }, { "title": "DetCo: Unsupervised Contrastive Learning for Object Detection", @@ -9676,10 +10332,11 @@ "status": "Poster", "track": "main", "pid": 6486, + "author_site": "Enze Xie; Jian Ding; Wenhai Wang; Xiaohang Zhan; Hang Xu; Peize Sun; Zhenguo Li; Ping Luo", "author": "Enze Xie; Jian Ding; Wenhai Wang; Xiaohang Zhan; Hang Xu; Peize Sun; Zhenguo Li; Ping Luo", "abstract": "We present DetCo, a simple yet effective self-supervised approach for object detection. Unsupervised pre-training methods have been recently designed for object detection, but they are usually deficient in image classification, or the opposite. Unlike them, DetCo transfers well on downstream instance-level dense prediction tasks, while maintaining competitive image-level classification accuracy. The advantages are derived from (1) multi-level supervision to intermediate representations, (2) contrastive learning between global image and local patches. 
These two designs facilitate discriminative and consistent global and local representation at each level of feature pyramid, improving detection and classification, simultaneously. Extensive experiments on VOC, COCO, Cityscapes, and ImageNet demonstrate that DetCo not only outperforms recent methods on a series of 2D and 3D instance-level detection tasks, but also competitive on image classification. For example, on ImageNet classification, DetCo is 6.9% and 5.0% top-1 accuracy better than InsLoc and DenseCL, which are two contemporary works designed for object detection. Moreover, on COCO detection, DetCo is 6.9 AP better than SwAV with Mask R-CNN C4. Notably, DetCo largely boosts up Sparse R-CNN, a recent strong detector, from 45.0 AP to 46.5 AP (+1.5 AP), establishing a new SOTA on COCO. Code is available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_DetCo_Unsupervised_Contrastive_Learning_for_Object_Detection_ICCV_2021_paper.pdf", - "aff": "The University of Hong Kong; Wuhan University; Nanjing University; Chinese University of Hong Kong; Huawei Noah\u2019s Ark Lab; The University of Hong Kong; Huawei Noah\u2019s Ark Lab; The University of Hong Kong", + "aff": "The University of Hong Kong; Wuhan University; Nanjing University; Chinese University of Hong Kong; Huawei Noah’s Ark Lab; The University of Hong Kong; Huawei Noah’s Ark Lab; The University of Hong Kong", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Xie_DetCo_Unsupervised_Contrastive_ICCV_2021_supplemental.pdf", @@ -9692,14 +10349,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xie_DetCo_Unsupervised_Contrastive_Learning_for_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4;0;4;0", - "aff_unique_norm": "University of Hong Kong;Wuhan University;Nanjing University;Chinese University of Hong Kong;Huawei", - "aff_unique_dep": ";;;;Noah\u2019s Ark Lab", + "aff_unique_norm": "The 
University of Hong Kong;Wuhan University;Nanjing University;Chinese University of Hong Kong;Huawei", + "aff_unique_dep": ";;;;Noah’s Ark Lab", "aff_unique_url": "https://www.hku.hk;http://www.whu.edu.cn/;https://www.nju.edu.cn;https://www.cuhk.edu.hk;https://www.huawei.com", "aff_unique_abbr": "HKU;WHU;Nanjing U;CUHK;Huawei", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Enze and Ding,\n Jian and Wang,\n Wenhai and Zhan,\n Xiaohang and Xu,\n Hang and Sun,\n Peize and Li,\n Zhenguo and Luo,\n Ping\n},\n title = {\n DetCo: Unsupervised Contrastive Learning for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8392-8401\n} \n}" }, { "title": "Detail Me More: Improving GAN's Photo-Realism of Complex Scenes", @@ -9707,6 +10365,7 @@ "status": "Poster", "track": "main", "pid": 1454, + "author_site": "Raghudeep Gadde; Qianli Feng; Aleix M. Martinez", "author": "Raghudeep Gadde; Qianli Feng; Aleix M. Martinez", "abstract": "Generative models can synthesize photo-realistic images of a single object. For example, for human faces, algorithms learn to model the local shape and shading of the face components, i.e., changes in the brows, eyes, nose, mouth, jaw line, etc. This is possible because all faces have two brows, two eyes, a nose and a mouth, approximately in the same location. The modeling of complex scenes is however much more challenging because the scene components and their location vary from image to image. For example, living rooms contain a varying number of products belonging to many possible categories and locations, e.g., a lamp may or may not be present in an endless number of possible locations. 
In the present work, we propose to add a \"broker\" module in Generative Adversarial Networks (GAN) to solve this problem. The broker is tasked to mediate the use of multiple discriminators in the appropriate image locales. For example, if a lamp is detected or wanted in a specific area of the scene, the broker assigns a fine-grained lamp discriminator to that image patch. This allows the generator to learn the shape and shading models of the lamp. The resulting multi-fine-grained optimization problem is able to synthesize complex scenes with almost the same level of photo-realism as single object images. We demonstrate the generability of the proposed approach on several GAN algorithms (BigGAN, ProGAN, StyleGAN, StyleGAN2), image resolutions (256x256 to 1024x1024), and datasets. Our approach yields significant improvements over state-of-the-art GAN algorithms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gadde_Detail_Me_More_Improving_GANs_Photo-Realism_of_Complex_Scenes_ICCV_2021_paper.pdf", @@ -9721,7 +10380,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gadde_Detail_Me_More_Improving_GANs_Photo-Realism_of_Complex_Scenes_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gadde_Detail_Me_More_Improving_GANs_Photo-Realism_of_Complex_Scenes_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Gadde_2021_ICCV,\n \n author = {\n Gadde,\n Raghudeep and Feng,\n Qianli and Martinez,\n Aleix M.\n},\n title = {\n Detail Me More: Improving GAN's Photo-Realism of Complex Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13950-13959\n} \n}" }, { "title": "Detecting Human-Object Relationships in Videos", @@ -9729,6 +10389,7 @@ "status": "Poster", "track": "main", "pid": 3678, + "author_site": "Jingwei Ji; Rishi Desai; Juan Carlos 
Niebles", "author": "Jingwei Ji; Rishi Desai; Juan Carlos Niebles", "abstract": "We study a crucial problem in video analysis: human-object relationship detection. The majority of previous approaches are developed only for the static image scenario, without incorporating the temporal dynamics so vital to contextualizing human-object relationships. We propose a model with Intra- and Inter-Transformers, enabling joint spatial and temporal reasoning on multiple visual concepts of objects, relationships, and human poses. We find that applying attention mechanisms among features distributed spatio-temporally greatly improves our understanding of human-object relationships. Our method is validated on two datasets, Action Genome and CAD-120-EVAR, and achieves state-of-the-art performance on both of them.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ji_Detecting_Human-Object_Relationships_in_Videos_ICCV_2021_paper.pdf", @@ -9752,7 +10413,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ji_2021_ICCV,\n \n author = {\n Ji,\n Jingwei and Desai,\n Rishi and Niebles,\n Juan Carlos\n},\n title = {\n Detecting Human-Object Relationships in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8106-8116\n} \n}" }, { "title": "Detecting Invisible People", @@ -9760,6 +10422,7 @@ "status": "Poster", "track": "main", "pid": 7016, + "author_site": "Tarasha Khurana; Achal Dave; Deva Ramanan", "author": "Tarasha Khurana; Achal Dave; Deva Ramanan", "abstract": "Monocular object detection and tracking have improved drastically in recent years, but rely on a key assumption: that objects are visible to the camera. 
Many offline tracking approaches reason about occluded objects post-hoc, by linking together tracklets after the object re-appears, making use of reidentification (ReID). However, online tracking in embodied robotic agents (such as a self-driving vehicle) fundamentally requires object permanence, which is the ability to reason about occluded objects before they re-appear. In this work, we re-purpose tracking benchmarks and propose new metrics for the task of detecting invisible objects, focusing on the illustrative case of people. We demonstrate that current detection and tracking systems perform dramatically worse on this task. We introduce two key innovations to recover much of this performance drop. We treat occluded object detection in temporal sequences as a short-term forecasting challenge, bringing to bear tools from dynamic sequence prediction. Second, we build dynamic models that explicitly reason in 3D from monocular videos without calibration, using observations produced by monocular depth estimators. To our knowledge, ours is the first work to demonstrate the effectiveness of monocular depth estimation for the task of tracking and detecting occluded objects. 
Our approach strongly improves by 11.4% over the baseline in ablations and by 5.0% over the state-of-the-art in F1 score.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Khurana_Detecting_Invisible_People_ICCV_2021_paper.pdf", @@ -9783,7 +10446,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khurana_2021_ICCV,\n \n author = {\n Khurana,\n Tarasha and Dave,\n Achal and Ramanan,\n Deva\n},\n title = {\n Detecting Invisible People\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3174-3184\n} \n}" }, { "title": "Detecting Persuasive Atypicality by Modeling Contextual Compatibility", @@ -9791,6 +10455,7 @@ "status": "Poster", "track": "main", "pid": 5542, + "author_site": "Meiqi Guo; Rebecca Hwa; Adriana Kovashka", "author": "Meiqi Guo; Rebecca Hwa; Adriana Kovashka", "abstract": "We propose a new approach to detect atypicality in persuasive imagery. Unlike atypicality which has been studied in prior work, persuasive atypicality has a particular purpose to convey meaning, and relies on understanding the common-sense spatial relations of objects. We propose a self-supervised attention-based technique which captures contextual compatibility, and models spatial relations in a precise manner. We further experiment with capturing common sense through the semantics of co-occurring object classes. 
We verify our approach on a dataset of atypicality in visual advertisements, as well as a second dataset capturing atypicality that has no persuasive intent.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Detecting_Persuasive_Atypicality_by_Modeling_Contextual_Compatibility_ICCV_2021_paper.pdf", @@ -9814,7 +10479,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Meiqi and Hwa,\n Rebecca and Kovashka,\n Adriana\n},\n title = {\n Detecting Persuasive Atypicality by Modeling Contextual Compatibility\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 972-982\n} \n}" }, { "title": "Detection and Continual Learning of Novel Face Presentation Attacks", @@ -9822,6 +10488,7 @@ "status": "Poster", "track": "main", "pid": 5936, + "author_site": "Mohammad Rostami; Leonidas Spinoulas; Mohamed Hussein; Joe Mathai; Wael Abd-Almageed", "author": "Mohammad Rostami; Leonidas Spinoulas; Mohamed Hussein; Joe Mathai; Wael Abd-Almageed", "abstract": "Advances in deep learning, combined with availability of large datasets, have led to impressive improvements in face presentation attack detection research. However, state of the art face antispoofing systems are still vulnerable to novel types of attacks that are never seen during training. Moreover, even if such attacks are correctly detected, these systems lack the ability to adapt to newly encountered attacks. The post-training ability of continually detecting new types of attacks and self-adaptation to identify these attack types, after the initial detection phase, is highly appealing. 
In this paper, we enable a deep neural network to detect anomalies in the observed input data points as potential new types of attacks by suppressing the confidence-level of the network outside the training samples' distribution. We then use experience replay to update the model to incorporate knowledge about new types of attacks without forgetting the past learned attack types. Experimental results are provided to demonstrate the effectiveness of the proposed method on the OULU and Idiap datasets as well as a newly introduced dataset, all of which exhibit a variety of attack types.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rostami_Detection_and_Continual_Learning_of_Novel_Face_Presentation_Attacks_ICCV_2021_paper.pdf", @@ -9845,7 +10512,8 @@ "aff_campus_unique_index": "0;0;0+1;0;0", "aff_campus_unique": "Los Angeles;Alexandria", "aff_country_unique_index": "0;0;0+1;0;0", - "aff_country_unique": "United States;Egypt" + "aff_country_unique": "United States;Egypt", + "bibtex": "@InProceedings{Rostami_2021_ICCV,\n \n author = {\n Rostami,\n Mohammad and Spinoulas,\n Leonidas and Hussein,\n Mohamed and Mathai,\n Joe and Abd-Almageed,\n Wael\n},\n title = {\n Detection and Continual Learning of Novel Face Presentation Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14851-14860\n} \n}" }, { "title": "Detector-Free Weakly Supervised Grounding by Separation", @@ -9853,6 +10521,7 @@ "status": "Poster", "track": "main", "pid": 7037, + "author_site": "Assaf Arbelle; Sivan Doveh; Amit Alfassy; Joseph Shtok; Guy Lev; Eli Schwartz; Hilde Kuehne; Hila Barak Levi; Prasanna Sattigeri; Rameswar Panda; Chun-Fu (Richard) Chen; Alex Bronstein; Kate Saenko; Shimon Ullman; Raja Giryes; Rogerio Feris; Leonid Karlinsky", "author": "Assaf Arbelle; Sivan Doveh; Amit Alfassy; Joseph Shtok; Guy Lev; Eli Schwartz; Hilde Kuehne; Hila Barak Levi; 
Prasanna Sattigeri; Rameswar Panda; Chun-Fu (Richard) Chen; Alex Bronstein; Kate Saenko; Shimon Ullman; Raja Giryes; Rogerio Feris; Leonid Karlinsky", "abstract": "Nowadays, there is an abundance of data involving images and surrounding free-form text weakly corresponding to those images. Weakly Supervised phrase-Grounding (WSG) deals with the task of using this data to learn to localize (or to ground) arbitrary text phrases in images without any additional annotations. However, most recent SotA methods for WSG assume an existence of a pre-trained object detector, relying on it to produce the ROIs for localization. In this work, we focus on the task of Detector-Free WSG (DF-WSG) to solve WSG without relying on a pre-trained detector. We directly learn everything from the images and associated free-form text pairs, thus potentially gaining advantage on the categories unsupported by the detector. The key idea behind our proposed Grounding by Separation (GbS) method is synthesizing `text to image-regions' associations by random alpha-blending of arbitrary image pairs and using the corresponding texts of the pair as conditions to recover the alpha map from the blended image via a segmentation network. At test time, this allows using the query phrase as a condition for a non-blended query image, thus interpreting the test image as a composition of a region corresponding to the phrase and the complement region. 
Using this approach we demonstrate a significant accuracy improvement, up to 8.5% over previous DF-WSG SotA, for a range of benchmarks including Flickr30K, Visual Genome, and ReferIt, as well as a significant complementary improvement (above 7%) over the detector-based approaches for WSG.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Arbelle_Detector-Free_Weakly_Supervised_Grounding_by_Separation_ICCV_2021_paper.pdf", @@ -9867,7 +10536,8 @@ "aff_domain": ";;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;", "author_num": 17, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Arbelle_Detector-Free_Weakly_Supervised_Grounding_by_Separation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Arbelle_Detector-Free_Weakly_Supervised_Grounding_by_Separation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Arbelle_2021_ICCV,\n \n author = {\n Arbelle,\n Assaf and Doveh,\n Sivan and Alfassy,\n Amit and Shtok,\n Joseph and Lev,\n Guy and Schwartz,\n Eli and Kuehne,\n Hilde and Levi,\n Hila Barak and Sattigeri,\n Prasanna and Panda,\n Rameswar and Chen,\n Chun-Fu (Richard) and Bronstein,\n Alex and Saenko,\n Kate and Ullman,\n Shimon and Giryes,\n Raja and Feris,\n Rogerio and Karlinsky,\n Leonid\n},\n title = {\n Detector-Free Weakly Supervised Grounding by Separation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1801-1812\n} \n}" }, { "title": "DiagViB-6: A Diagnostic Benchmark Suite for Vision Models in the Presence of Shortcut and Generalization Opportunities", @@ -9875,6 +10545,7 @@ "status": "Poster", "track": "main", "pid": 7768, + "author_site": "Elias Eulig; Piyapat Saranrittichai; Chaithanya Kumar Mummadi; Kilian Rambach; William Beluch; Xiahan Shi; Volker Fischer", "author": "Elias Eulig; Piyapat Saranrittichai; Chaithanya Kumar Mummadi; Kilian Rambach; William Beluch; Xiahan Shi; Volker 
Fischer", "abstract": "Common deep neural networks (DNNs) for image classification have been shown to rely on shortcut opportunities (SO) in the form of predictive and easy-to-represent visual factors. This is known as shortcut learning and leads to impaired generalization. In this work, we show that common DNNs also suffer from shortcut learning when predicting only basic visual object factors of variation (FoV) such as shape, color, or texture. We argue that besides shortcut opportunities, generalization opportunities (GO) are also an inherent part of real-world vision data and arise from partial independence between predicted classes and FoVs. We also argue that it is necessary for DNNs to exploit GO to overcome shortcut learning. Our core contribution is to introduce the Diagnostic Vision Benchmark suite DiagViB-6, which includes datasets and metrics to study a network's shortcut vulnerability and generalization capability for six independent FoV. In particular, DiagViB-6 allows controlling the type and degree of SO and GO in a dataset. 
We benchmark a wide range of popular vision architectures and show that they can exploit GO only to a limited extent.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Eulig_DiagViB-6_A_Diagnostic_Benchmark_Suite_for_Vision_Models_in_the_ICCV_2021_paper.pdf", @@ -9898,7 +10569,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Eulig_2021_ICCV,\n \n author = {\n Eulig,\n Elias and Saranrittichai,\n Piyapat and Mummadi,\n Chaithanya Kumar and Rambach,\n Kilian and Beluch,\n William and Shi,\n Xiahan and Fischer,\n Volker\n},\n title = {\n DiagViB-6: A Diagnostic Benchmark Suite for Vision Models in the Presence of Shortcut and Generalization Opportunities\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10655-10664\n} \n}" }, { "title": "Diagonal Attention and Style-Based GAN for Content-Style Disentanglement in Image Generation and Translation", @@ -9906,6 +10578,7 @@ "status": "Poster", "track": "main", "pid": 7200, + "author_site": "Gihyun Kwon; Jong Chul Ye", "author": "Gihyun Kwon; Jong Chul Ye", "abstract": "One of the important research topics in image generative models is to disentangle the spatial contents and styles for their separate control. Although StyleGAN can generate content feature vectors from random noises, the resulting spatial content control is primarily intended for minor spatial variations, and the disentanglement of global content and styles is by no means complete. Inspired by a mathematical understanding of normalization and attention, here we present a novel hierarchical adaptive Diagonal spatial ATtention (DAT) layers to separately manipulate the spatial contents from styles in a hierarchical manner. 
Using DAT and AdaIN, our method enables coarse-to-fine level disentanglement of spatial contents and styles. In addition, our generator can be easily integrated into the GAN inversion framework so that the content and style of translated images from multi-domain image translation tasks can be flexibly controlled. By using various datasets, we confirm that the proposed method not only outperforms the existing models in disentanglement scores, but also provides more flexible control over spatial features in the generated images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kwon_Diagonal_Attention_and_Style-Based_GAN_for_Content-Style_Disentanglement_in_Image_ICCV_2021_paper.pdf", @@ -9929,7 +10602,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea;" + "aff_country_unique": "South Korea;", + "bibtex": "@InProceedings{Kwon_2021_ICCV,\n \n author = {\n Kwon,\n Gihyun and Ye,\n Jong Chul\n},\n title = {\n Diagonal Attention and Style-Based GAN for Content-Style Disentanglement in Image Generation and Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13980-13989\n} \n}" }, { "title": "Differentiable Convolution Search for Point Cloud Processing", @@ -9937,6 +10611,7 @@ "status": "Poster", "track": "main", "pid": 4008, + "author_site": "Xing Nie; Yongcheng Liu; Shaohong Chen; Jianlong Chang; Chunlei Huo; Gaofeng Meng; Qi Tian; Weiming Hu; Chunhong Pan", "author": "Xing Nie; Yongcheng Liu; Shaohong Chen; Jianlong Chang; Chunlei Huo; Gaofeng Meng; Qi Tian; Weiming Hu; Chunhong Pan", "abstract": "Exploiting convolutional neural networks for point cloud processing is quite challenging, due to the inherent irregular distribution and discrete shape representation of point clouds. 
To address these problems, many handcrafted convolution variants have sprung up in recent years. Though with elaborate design, these variants could be far from optimal in sufficiently capturing diverse shapes formed by discrete points. In this paper, we propose PointSeaConv, i.e., a novel differential convolution search paradigm on point clouds. It can work in a purely data-driven manner and thus is capable of auto-creating a group of suitable convolutions for geometric shape modeling. We also propose a joint optimization framework for simultaneous search of internal convolution and external architecture, and introduce epsilon-greedy algorithm to alleviate the effect of discretization error. As a result, PointSeaNet, a deep network that is sufficient to capture geometric shapes at both convolution level and architecture level, can be searched out for point cloud processing. Extensive experiments strongly evidence that our proposed PointSeaNet surpasses current handcrafted deep models on challenging benchmarks across multiple tasks with remarkable margins.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nie_Differentiable_Convolution_Search_for_Point_Cloud_Processing_ICCV_2021_paper.pdf", @@ -9956,11 +10631,12 @@ "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Xidian University;Huawei;Hong Kong Institute of Science and Innovation, Chinese Academy of Sciences", "aff_unique_dep": "Institute of Automation;School of Artificial Intelligence;;Cloud & AI;Centre for Artificial Intelligence and Robotics", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;http://www.xidian.edu.cn/;https://www.huawei.com/en/cloud;http://www.isi.cas.cn", - "aff_unique_abbr": "CAS;UCAS;Xidian;Huawei Cloud & AI;HKI of S&I, CAS", + "aff_unique_abbr": "CAS;UCAS;Xidian;Huawei Cloud & AI;HKISI", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0+0+0;0+0+0;0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Nie_2021_ICCV,\n \n author = {\n Nie,\n Xing and Liu,\n Yongcheng and Chen,\n Shaohong and Chang,\n Jianlong and Huo,\n Chunlei and Meng,\n Gaofeng and Tian,\n Qi and Hu,\n Weiming and Pan,\n Chunhong\n},\n title = {\n Differentiable Convolution Search for Point Cloud Processing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7437-7446\n} \n}" }, { "title": "Differentiable Dynamic Wirings for Neural Networks", @@ -9968,6 +10644,7 @@ "status": "Poster", "track": "main", "pid": 2594, + "author_site": "Kun Yuan; Quanquan Li; Shaopeng Guo; Dapeng Chen; Aojun Zhou; Fengwei Yu; Ziwei Liu", "author": "Kun Yuan; Quanquan Li; Shaopeng Guo; Dapeng Chen; Aojun Zhou; Fengwei Yu; Ziwei Liu", "abstract": "A standard practice of deploying deep neural networks is to apply the same architecture to all the input instances. However, a fixed architecture may not be suitable for different data with high diversity. To boost the model capacity, existing methods usually employ larger convolutional kernels or deeper network layers, which incurs prohibitive computational costs. In this paper, we address this issue by proposing Differentiable Dynamic Wirings (DDW), which learns the instance-aware connectivity that creates different wiring patterns for different instances. 1) Specifically, the network is initialized as a complete directed acyclic graph, where the nodes represent convolutional blocks and the edges represent the connection paths. 2) We generate edge weights by a learnable module, Router, and select the edges whose weights are larger than a threshold, to adjust the connectivity of the neural network structure. 3) Instead of using the same path of the network, DDW aggregates features dynamically in each node, which allows the network to have more representation power. 
To facilitate effective training, we further represent the network connectivity of each sample as an adjacency matrix. The matrix is updated to aggregate features in the forward pass, cached in the memory, and used for gradient computing in the backward pass. We validate the effectiveness of our approach with several mainstream architectures, including MobileNetV2, ResNet, ResNeXt, and RegNet. Extensive experiments are performed on ImageNet classification and COCO object detection, which demonstrates the effectiveness and generalization ability of our approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_Differentiable_Dynamic_Wirings_for_Neural_Networks_ICCV_2021_paper.pdf", @@ -9991,7 +10668,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Kun and Li,\n Quanquan and Guo,\n Shaopeng and Chen,\n Dapeng and Zhou,\n Aojun and Yu,\n Fengwei and Liu,\n Ziwei\n},\n title = {\n Differentiable Dynamic Wirings for Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 327-336\n} \n}" }, { "title": "Differentiable Surface Rendering via Non-Differentiable Sampling", @@ -9999,6 +10677,7 @@ "status": "Poster", "track": "main", "pid": 7968, + "author_site": "Forrester Cole; Kyle Genova; Avneesh Sud; Daniel Vlasic; Zhoutong Zhang", "author": "Forrester Cole; Kyle Genova; Avneesh Sud; Daniel Vlasic; Zhoutong Zhang", "abstract": "We present a method for differentiable rendering of 3D surfaces that supports both explicit and implicit representations, provides derivatives at occlusion boundaries, and is fast and simple to implement. 
The method first samples the surface using non-differentiable rasterization, then applies differentiable, depth-aware point splatting to produce the final image. Our approach requires no differentiable meshing or rasterization steps, making it efficient for large 3D models and applicable to isosurfaces extracted from implicit surface definitions. We demonstrate the effectiveness of our method for implicit-, mesh-, and parametric-surface-based inverse rendering and neural-network training applications. In particular, we show for the first time efficient, differentiable rendering of an isosurface extracted from a neural radiance field (NeRF), and demonstrate surface-based, rather than volume-based, rendering of a NeRF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cole_Differentiable_Surface_Rendering_via_Non-Differentiable_Sampling_ICCV_2021_paper.pdf", @@ -10022,7 +10701,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cole_2021_ICCV,\n \n author = {\n Cole,\n Forrester and Genova,\n Kyle and Sud,\n Avneesh and Vlasic,\n Daniel and Zhang,\n Zhoutong\n},\n title = {\n Differentiable Surface Rendering via Non-Differentiable Sampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6088-6097\n} \n}" }, { "title": "Digging Into Uncertainty in Self-Supervised Multi-View Stereo", @@ -10030,6 +10710,7 @@ "status": "Poster", "track": "main", "pid": 6956, + "author_site": "Hongbin Xu; Zhipeng Zhou; Yali Wang; Wenxiong Kang; Baigui Sun; Hao Li; Yu Qiao", "author": "Hongbin Xu; Zhipeng Zhou; Yali Wang; Wenxiong Kang; Baigui Sun; Hao Li; Yu Qiao", "abstract": "Self-supervised Multi-view stereo (MVS) with a pretext task of image reconstruction has achieved 
significant progress recently. However, previous methods are built upon intuitions, lacking comprehensive explanations about the effectiveness of the pretext task in self-supervised MVS. To this end, we propose to estimate epistemic uncertainty in self-supervised MVS, accounting for what the model ignores. Specially, the limitations can be resorted into two folds: ambiguious supervision in foreground and noisy disturbance in background. To address these issues, we propose a novel Uncertainty reduction Multi-view Stereo (U-MVS) framework for self-supervised learning. To alleviate ambiguous supervision in foreground, we involve extra correspondence prior with a flow-depth consistency loss. The dense 2D correspondence of optical flows is used to regularize the 3D stereo correspondence in MVS. To handle the noisy disturbance in background, we use Monte-Carlo Dropout to acquire the uncertainty map and further filter the unreliable supervision signals on invalid regions. Extensive experiments on DTU and Tank&Temples benchmark show that our U-MVS framework achieves the best performance among unsupervised MVS methods, with competitive performance with its supervised opponents.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Digging_Into_Uncertainty_in_Self-Supervised_Multi-View_Stereo_ICCV_2021_paper.pdf", @@ -10044,7 +10725,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Digging_Into_Uncertainty_in_Self-Supervised_Multi-View_Stereo_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Digging_Into_Uncertainty_in_Self-Supervised_Multi-View_Stereo_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Hongbin and Zhou,\n Zhipeng and Wang,\n Yali and Kang,\n Wenxiong and Sun,\n Baigui and Li,\n Hao and Qiao,\n Yu\n},\n title = {\n Digging Into Uncertainty in Self-Supervised Multi-View Stereo\n},\n booktitle = 
{\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6078-6087\n} \n}" }, { "title": "Direct Differentiable Augmentation Search", @@ -10052,10 +10734,11 @@ "status": "Poster", "track": "main", "pid": 3867, + "author_site": "Aoming Liu; Zehao Huang; Zhiwu Huang; Naiyan Wang", "author": "Aoming Liu; Zehao Huang; Zhiwu Huang; Naiyan Wang", "abstract": "Data augmentation has been an indispensable tool to improve the performance of deep neural networks, however the augmentation can hardly transfer among different tasks and datasets. Consequently, a recent trend is to adopt AutoML technique to learn proper augmentation policy without extensive hand-crafted tuning. In this paper, we propose an efficient differentiable search algorithm called Direct Differentiable Augmentation Search (DDAS). It utilizes meta-learning with one-step gradient update and continuous relaxation to the expected training loss for efficient search. Our DDAS could achieve efficient augmentation search without approximations such as Gumbel-Softmax or second order gradient approximation. To further reduce the adverse effect of improper augmentations, we organize the search space into a two level hierarchy, in which we first decide whether to apply augmentation, and then determine the specific augmentation policy. On standard image classification benchmarks, our DDAS achieves state-of-the-art performance and efficiency tradeoff while reducing the search cost dramatically, e.g. 0.15 GPU hours for CIFAR-10. In addition, we also use DDAS to search augmentation for object detection task and achieve comparable performance with AutoAugment, while being 1000x faster. 
Code will be released in https://github.com/zxcvfd13502/DDAS_code.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Direct_Differentiable_Augmentation_Search_ICCV_2021_paper.pdf", - "aff": "ETH Z\u00fcrich, Switzerland; TuSimple, Beijing; ETH Z\u00fcrich, Switzerland; TuSimple, Beijing", + "aff": "ETH Zürich, Switzerland; TuSimple, Beijing; ETH Zürich, Switzerland; TuSimple, Beijing", "project": "", "github": "https://github.com/zxcvfd13502/DDAS_code", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Liu_Direct_Differentiable_Augmentation_ICCV_2021_supplemental.pdf", @@ -10068,14 +10751,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Direct_Differentiable_Augmentation_Search_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;1", - "aff_unique_norm": "ETH Zurich;TuSimple", + "aff_unique_norm": "ETH Zürich;TuSimple", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.tusimple.com", "aff_unique_abbr": "ETHZ;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "Switzerland;China" + "aff_country_unique": "Switzerland;China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Aoming and Huang,\n Zehao and Huang,\n Zhiwu and Wang,\n Naiyan\n},\n title = {\n Direct Differentiable Augmentation Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12219-12228\n} \n}" }, { "title": "DisUnknown: Distilling Unknown Factors for Disentanglement Learning", @@ -10083,6 +10767,7 @@ "status": "Poster", "track": "main", "pid": 5580, + "author_site": "Sitao Xiang; Yuming Gu; Pengda Xiang; Menglei Chai; Hao Li; Yajie Zhao; Mingming He", "author": "Sitao Xiang; Yuming Gu; Pengda Xiang; Menglei Chai; Hao Li; Yajie Zhao; Mingming He", "abstract": "Disentangling data into 
interpretable and independent factors is critical for controllable generation tasks. With the availability of labeled data, supervision can help enforce the separation of specific factors as expected. However, it is often expensive or even impossible to label every single factor to achieve fully-supervised disentanglement. In this paper, we adopt a general setting where all factors that are hard to label or identify are encapsulated as a single unknown factor. Under this setting, we propose a flexible weakly-supervised multi-factor disentanglement framework DisUnknown, which Distills Unknown factors for enabling multi-conditional generation regarding both labeled and unknown factors. Specifically, a two-stage training approach is adopted to first disentangle the unknown factor with an effective and robust training method, and then train the final generator with the proper disentanglement of all labeled factors utilizing the unknown distillation. To demonstrate the generalization capacity and scalability of our method, we evaluate it on multiple benchmark datasets qualitatively and quantitatively and further apply it to various real-world applications on complicated datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiang_DisUnknown_Distilling_Unknown_Factors_for_Disentanglement_Learning_ICCV_2021_paper.pdf", @@ -10106,7 +10791,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiang_2021_ICCV,\n \n author = {\n Xiang,\n Sitao and Gu,\n Yuming and Xiang,\n Pengda and Chai,\n Menglei and Li,\n Hao and Zhao,\n Yajie and He,\n Mingming\n},\n title = {\n DisUnknown: Distilling Unknown Factors for Disentanglement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2021\n},\n pages = {\n 14810-14819\n} \n}" }, { "title": "DiscoBox: Weakly Supervised Instance Segmentation and Semantic Correspondence From Box Supervision", @@ -10114,6 +10800,7 @@ "status": "Poster", "track": "main", "pid": 3715, + "author_site": "Shiyi Lan; Zhiding Yu; Christopher Choy; Subhashree Radhakrishnan; Guilin Liu; Yuke Zhu; Larry S. Davis; Anima Anandkumar", "author": "Shiyi Lan; Zhiding Yu; Christopher Choy; Subhashree Radhakrishnan; Guilin Liu; Yuke Zhu; Larry S. Davis; Anima Anandkumar", "abstract": "We introduce DiscoBox, a novel framework that jointly learns instance segmentation and semantic correspondence using bounding box supervision. Specifically, we propose a self-ensembling framework where instance segmentation and semantic correspondence are jointly guided by a structured teacher in addition to the bounding box supervision. The teacher is a structured energy model incorporating a pairwise potential and a cross-image potential to model the pairwise pixel relationships both within and across the boxes. Minimizing the teacher energy simultaneously yields refined object masks and dense correspondences between intra-class objects, which are taken as pseudo-labels to supervise the task network and provide positive/negative correspondence pairs for dense contrastive learning. We show a symbiotic relationship where the two tasks mutually benefit from each other. Our best model achieves 37.9% AP on COCO instance segmentation, surpassing prior weakly supervised methods and is competitive to supervised methods. 
We also obtain state of the art weakly supervised results on PASCAL VOC12 and PF-PASCAL with real-time inference.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lan_DiscoBox_Weakly_Supervised_Instance_Segmentation_and_Semantic_Correspondence_From_Box_ICCV_2021_paper.pdf", @@ -10130,14 +10817,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lan_DiscoBox_Weakly_Supervised_Instance_Segmentation_and_Semantic_Correspondence_From_Box_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;1;1+2;0;1+3", - "aff_unique_norm": "University of Maryland;NVIDIA;University of Texas at Austin;California Institute of Technology", - "aff_unique_dep": ";NVIDIA Corporation;;", + "aff_unique_norm": "University of Maryland;NVIDIA Corporation;University of Texas at Austin;California Institute of Technology", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www/umd.edu;https://www.nvidia.com;https://www.utexas.edu;https://www.caltech.edu", "aff_unique_abbr": "UMD;NVIDIA;UT Austin;Caltech", "aff_campus_unique_index": "0;2;0;3", "aff_campus_unique": "College Park;;Austin;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0+0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lan_2021_ICCV,\n \n author = {\n Lan,\n Shiyi and Yu,\n Zhiding and Choy,\n Christopher and Radhakrishnan,\n Subhashree and Liu,\n Guilin and Zhu,\n Yuke and Davis,\n Larry S. 
and Anandkumar,\n Anima\n},\n title = {\n DiscoBox: Weakly Supervised Instance Segmentation and Semantic Correspondence From Box Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3406-3416\n} \n}" }, { "title": "Discover the Unknown Biased Attribute of an Image Classifier", @@ -10145,6 +10833,7 @@ "status": "Poster", "track": "main", "pid": 1494, + "author_site": "Zhiheng Li; Chenliang Xu", "author": "Zhiheng Li; Chenliang Xu", "abstract": "Recent works find that AI algorithms learn biases from data. Therefore, it is urgent and vital to identify biases in AI algorithms. However, the previous bias identification pipeline overly relies on human experts to conjecture potential biases (e.g., gender), which may neglect other underlying biases not realized by humans. To help human experts better find the AI algorithms' biases, we study a new problem in this work -- for a classifier that predicts a target attribute of the input image, discover its unknown biased attribute. To solve this challenging problem, we use a hyperplane in the generative model's latent space to represent an image attribute; thus, the original problem is transformed to optimizing the hyperplane's normal vector and offset. We propose a novel total-variation loss within this framework as the objective function and a new orthogonalization penalty as a constraint. The latter prevents trivial solutions in which the discovered biased attribute is identical with the target or one of the known-biased attributes. Extensive experiments on both disentanglement datasets and real-world datasets show that our method can discover biased attributes and achieve better disentanglement w.r.t. target attributes. 
Furthermore, the qualitative results show that our method can discover unnoticeable biased attributes for various object and scene classifiers, proving our method's generalizability for detecting biased attributes in diverse domains of images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Discover_the_Unknown_Biased_Attribute_of_an_Image_Classifier_ICCV_2021_paper.pdf", @@ -10168,7 +10857,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Zhiheng and Xu,\n Chenliang\n},\n title = {\n Discover the Unknown Biased Attribute of an Image Classifier\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14970-14979\n} \n}" }, { "title": "Discovering 3D Parts From Image Collections", @@ -10176,6 +10866,7 @@ "status": "Poster", "track": "main", "pid": 3448, + "author_site": "Chun-Han Yao; Wei-Chih Hung; Varun Jampani; Ming-Hsuan Yang", "author": "Chun-Han Yao; Wei-Chih Hung; Varun Jampani; Ming-Hsuan Yang", "abstract": "Reasoning 3D shapes from 2D images is an essential yet challenging task, especially when only single-view images are at our disposal. While an object can have a complicated shape, individual parts are usually close to geometric primitives and thus are easier to model. Furthermore, parts provide a mid-level representation that is robust to appearance variations across objects in a particular category. In this work, we tackle the problem of 3D part discovery from only 2D image collections. Instead of relying on manually annotated parts for supervision, we propose a self-supervised approach, latent part discovery (LPD). 
Our key insight is to learn a novel part shape prior that allows each part to fit an object shape faithfully while constrained to have simple geometry. Extensive experiments on the synthetic ShapeNet, PartNet, and real-world Pascal 3D+ datasets show that our method discovers consistent object parts and achieves favorable reconstruction accuracy compared to the existing methods with the same level of supervision. Our project page with code is at https://chhankyao.github.io/lpd/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yao_Discovering_3D_Parts_From_Image_Collections_ICCV_2021_paper.pdf", @@ -10190,7 +10881,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yao_Discovering_3D_Parts_From_Image_Collections_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yao_Discovering_3D_Parts_From_Image_Collections_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yao_2021_ICCV,\n \n author = {\n Yao,\n Chun-Han and Hung,\n Wei-Chih and Jampani,\n Varun and Yang,\n Ming-Hsuan\n},\n title = {\n Discovering 3D Parts From Image Collections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12981-12990\n} \n}" }, { "title": "Discovering Human Interactions With Large-Vocabulary Objects via Query and Multi-Scale Detection", @@ -10198,6 +10890,7 @@ "status": "Poster", "track": "main", "pid": 3604, + "author_site": "Suchen Wang; Kim-Hui Yap; Henghui Ding; Jiyan Wu; Junsong Yuan; Yap-Peng Tan", "author": "Suchen Wang; Kim-Hui Yap; Henghui Ding; Jiyan Wu; Junsong Yuan; Yap-Peng Tan", "abstract": "In this work, we study the problem of human-object interaction (HOI) detection with large vocabulary object categories. Previous HOI studies are mainly conducted in the regime of limit object categories (e.g., 80 categories). 
Their solutions may face new difficulties in both object detection and interaction classification due to the increasing diversity of objects (e.g., 1000 categories). Different from previous methods, we formulate the HOI detection as a query problem. We propose a unified model to jointly discover the target objects and predict the corresponding interactions based on the human queries, thereby eliminating the need of using generic object detectors, extra steps to associate human-object instances, and multi-stream interaction recognition. This is achieved by a repurposed Transformer unit and a novel cascade detection over multi-scale feature maps. We observe that such a highly-coupled solution brings benefits for both object detection and interaction classification in a large vocabulary setting. To study the new challenges of the large vocabulary HOI detection, we assemble two datasets from the publicly available SWiG and 100 Days of Hands datasets. Experiments on these datasets validate that our proposed method can achieve a notable mAP improvement on HOI detection with a faster inference speed than existing one-stage HOI detectors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Discovering_Human_Interactions_With_Large-Vocabulary_Objects_via_Query_and_Multi-Scale_ICCV_2021_paper.pdf", @@ -10221,7 +10914,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Suchen and Yap,\n Kim-Hui and Ding,\n Henghui and Wu,\n Jiyan and Yuan,\n Junsong and Tan,\n Yap-Peng\n},\n title = {\n Discovering Human Interactions With Large-Vocabulary Objects via Query and Multi-Scale Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2021\n},\n pages = {\n 13475-13484\n} \n}" }, { "title": "Discriminative Region-Based Multi-Label Zero-Shot Learning", @@ -10229,10 +10923,11 @@ "status": "Poster", "track": "main", "pid": 2617, + "author_site": "Sanath Narayan; Akshita Gupta; Salman Khan; Fahad Shahbaz Khan; Ling Shao; Mubarak Shah", "author": "Sanath Narayan; Akshita Gupta; Salman Khan; Fahad Shahbaz Khan; Ling Shao; Mubarak Shah", "abstract": "Multi-label zero-shot learning (ZSL) is a more realistic counter-part of standard single-label ZSL since several objects can co-exist in a natural image. However, the occurrence of multiple objects complicates the reasoning and requires region-specific processing of visual features to preserve their contextual cues. We note that the best existing multi-label ZSL method takes a shared approach towards attending to region features with a common set of attention maps for all the classes. Such shared maps lead to diffused attention, which does not discriminatively focus on relevant locations when the number of classes are large. Moreover, mapping spatially-pooled visual features to the class semantics leads to inter-class feature entanglement, thus hampering the classification. Here, we propose an alternate approach towards region-based discriminability-preserving multi-label zero-shot classification. Our approach maintains the spatial resolution to preserve region-level characteristics and utilizes a bi-level attention module (BiAM) to enrich the features by incorporating both region and scene context information. The enriched region-level features are then mapped to the class semantics and only their class predictions are spatially pooled to obtain image-level predictions, thereby keeping the multi-class features disentangled. Our approach sets a new state of the art on two large-scale multi-label zero-shot benchmarks: NUS-WIDE and Open Images. 
On NUS-WIDE, our approach achieves an absolute gain of 6.9% mAP for ZSL, compared to the best published results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Narayan_Discriminative_Region-Based_Multi-Label_Zero-Shot_Learning_ICCV_2021_paper.pdf", - "aff": "Inception Institute of Artificial Intelligence, UAE; Inception Institute of Artificial Intelligence, UAE; Mohamed Bin Zayed University of AI, UAE; Mohamed Bin Zayed University of AI, UAE + Link\u00f6ping University, Sweden; Inception Institute of Artificial Intelligence, UAE; University of Central Florida, USA", + "aff": "Inception Institute of Artificial Intelligence, UAE; Inception Institute of Artificial Intelligence, UAE; Mohamed Bin Zayed University of AI, UAE; Mohamed Bin Zayed University of AI, UAE + Linköping University, Sweden; Inception Institute of Artificial Intelligence, UAE; University of Central Florida, USA", "project": "", "github": "https://github.com/akshitac8/BiAM", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Narayan_Discriminative_Region-Based_Multi-Label_ICCV_2021_supplemental.pdf", @@ -10245,14 +10940,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Narayan_Discriminative_Region-Based_Multi-Label_Zero-Shot_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1+2;0;3", - "aff_unique_norm": "Inception Institute of Artificial Intelligence;Mohamed bin Zayed University of Artificial Intelligence;Link\u00f6ping University;University of Central Florida", + "aff_unique_norm": "Inception Institute of Artificial Intelligence;Mohamed Bin Zayed University of Artificial Intelligence;Linköping University;University of Central Florida", "aff_unique_dep": ";;;", "aff_unique_url": ";https://mbzuai.ac.ae;https://www.liu.se;https://www.ucf.edu", "aff_unique_abbr": ";MBZUAI;LiU;UCF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1;0;2", - "aff_country_unique": "United Arab 
Emirates;Sweden;United States" + "aff_country_unique": "United Arab Emirates;Sweden;United States", + "bibtex": "@InProceedings{Narayan_2021_ICCV,\n \n author = {\n Narayan,\n Sanath and Gupta,\n Akshita and Khan,\n Salman and Khan,\n Fahad Shahbaz and Shao,\n Ling and Shah,\n Mubarak\n},\n title = {\n Discriminative Region-Based Multi-Label Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8731-8740\n} \n}" }, { "title": "Disentangled High Quality Salient Object Detection", @@ -10260,6 +10956,7 @@ "status": "Poster", "track": "main", "pid": 8870, + "author_site": "Lv Tang; Bo Li; Yijie Zhong; Shouhong Ding; Mofei Song", "author": "Lv Tang; Bo Li; Yijie Zhong; Shouhong Ding; Mofei Song", "abstract": "Aiming at discovering and locating most distinctive objects from visual scenes, salient object detection (SOD) plays an essential role in various computer vision systems. Coming to the era of high resolution, SOD methods are facing new challenges. The major limitation of previous methods is that they try to identify the salient regions and estimate the accurate objects boundaries simultaneously with a single regression task at low-resolution. This practice ignores the inherent difference between the two difficult problems, resulting in poor detection quality. In this paper, we propose a novel deep learning framework for high-resolution SOD task, which disentangles the task into a low-resolution saliency classification network (LRSCN) and a high-resolution refinement network (HRRN). As a pixel-wise classification task, LRSCN is designed to capture sufficient semantics at low-resolution to identify the definite salient, background and uncertain image regions. 
HRRN is a regression task, which aims at accurately refining the saliency value of pixels in the uncertain region to preserve a clear object boundary at high-resolution with limited GPU memory. It is worth noting that by introducing uncertainty into the training process, our HRRN can well address the high-resolution refinement task without using any high-resolution training data. Extensive experiments on high-resolution saliency datasets as well as some widely used saliency benchmarks show that the proposed method achieves superior performance compared to the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tang_Disentangled_High_Quality_Salient_Object_Detection_ICCV_2021_paper.pdf", @@ -10274,7 +10971,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tang_Disentangled_High_Quality_Salient_Object_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tang_Disentangled_High_Quality_Salient_Object_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Tang_2021_ICCV,\n \n author = {\n Tang,\n Lv and Li,\n Bo and Zhong,\n Yijie and Ding,\n Shouhong and Song,\n Mofei\n},\n title = {\n Disentangled High Quality Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3580-3590\n} \n}" }, { "title": "Disentangled Lifespan Face Synthesis", @@ -10282,6 +10980,7 @@ "status": "Poster", "track": "main", "pid": 3330, + "author_site": "Sen He; Wentong Liao; Michael Ying Yang; Yi-Zhe Song; Bodo Rosenhahn; Tao Xiang", "author": "Sen He; Wentong Liao; Michael Ying Yang; Yi-Zhe Song; Bodo Rosenhahn; Tao Xiang", "abstract": "A lifespan face synthesis (LFS) model aims to generate a set of photo-realistic face images of a person's whole life, given only one snapshot as reference. 
The generated face image given a target age code is expected to be age-sensitive reflected by bio-plausible transformations of shape and texture, while being identity preserving. This is extremely challenging because the shape and texture characteristics of a face undergo separate and highly nonlinear transformations w.r.t. age. Most recent LFS models are based on generative adversarial networks (GANs) whereby age code conditional transformations are applied to a latent face representation. They benefit greatly from the recent advancements of GANs. However, without explicitly disentangling their latent representations into the texture, shape and identity factors, they are fundamentally limited in modeling the nonlinear age-related transformation on texture and shape whilst preserving identity. In this work, a novel LFS model is proposed to disentangle the key face characteristics including shape, texture and identity so that the unique shape and texture age transformations can be modeled effectively. This is achieved by extracting shape, texture and identity features separately from an encoder. Critically, two transformation modules, one conditional convolution based and the other channel attention based, are designed for modeling the nonlinear shape and texture feature transformations respectively. This is to accommodate their rather distinct aging processes and ensure that our synthesized images are both age-sensitive and identity preserving. 
Extensive experiments show that our LFS model is clearly superior to the state-of-the-art alternatives.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_Disentangled_Lifespan_Face_Synthesis_ICCV_2021_paper.pdf", @@ -10296,7 +10995,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_Disentangled_Lifespan_Face_Synthesis_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_Disentangled_Lifespan_Face_Synthesis_ICCV_2021_paper.html", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Sen and Liao,\n Wentong and Yang,\n Michael Ying and Song,\n Yi-Zhe and Rosenhahn,\n Bodo and Xiang,\n Tao\n},\n title = {\n Disentangled Lifespan Face Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3877-3886\n} \n}" }, { "title": "Disentangled Representation for Age-Invariant Face Recognition: A Mutual Information Minimization Perspective", @@ -10304,6 +11004,7 @@ "status": "Poster", "track": "main", "pid": 9473, + "author_site": "Xuege Hou; Yali Li; Shengjin Wang", "author": "Xuege Hou; Yali Li; Shengjin Wang", "abstract": "General face recognition has seen remarkable progress in recent years. However, large age gap still remains a big challenge due to significant alterations in facial appearance and bone structure. Disentanglement plays a key role in partitioning face representations into identity-dependent and age-dependent components for age-invariant face recognition (AIFR). In this paper we propose a multi-task learning framework based on mutual information minimization (MT-MIM), which casts the disentangled representation learning as an objective of information constraints. 
The method trains a disentanglement network to minimize mutual information between the identity component and age component of the face image from the same person, and reduce the effect of age variations during the identification process. For quantitative measure of the degree of disentanglement, we verify that mutual information can represent as metric. The resulting identity-dependent representations are used for age-invariant face recognition. We evaluate MT-MIM on popular public-domain face aging datasets (FG-NET, MORPH Album 2, CACD and AgeDB) and obtained significant improvements over previous state-of-the-art methods. Specifically, our method exceeds the baseline models by over 0.4% on MORPH Album 2, and over 0.7% on CACD subsets, which are impressive improvements at the high accuracy levels of above 99% and an average of 94%.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hou_Disentangled_Representation_for_Age-Invariant_Face_Recognition_A_Mutual_Information_Minimization_ICCV_2021_paper.pdf", @@ -10327,7 +11028,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hou_2021_ICCV,\n \n author = {\n Hou,\n Xuege and Li,\n Yali and Wang,\n Shengjin\n},\n title = {\n Disentangled Representation for Age-Invariant Face Recognition: A Mutual Information Minimization Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3692-3701\n} \n}" }, { "title": "Dissecting Image Crops", @@ -10335,6 +11037,7 @@ "status": "Poster", "track": "main", "pid": 7844, + "author_site": "Basile Van Hoorick; Carl Vondrick", "author": "Basile Van Hoorick; Carl Vondrick", "abstract": "The elementary operation of cropping underpins nearly every computer vision system, ranging from data augmentation and translation 
invariance to computational photography and representation learning. This paper investigates the subtle traces introduced by this operation. For example, despite refinements to camera optics, lenses will leave behind certain clues, notably chromatic aberration and vignetting. Photographers also leave behind other clues relating to image aesthetics and scene composition. We study how to detect these traces, and investigate the impact that cropping has on the image distribution. While our aim is to dissect the fundamental impact of spatial crops, there are also a number of practical implications to our work, such as revealing faulty photojournalism and equipping neural network researchers with a better understanding of shortcut learning. Code is available at https://github.com/basilevh/dissecting-image-crops.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Van_Hoorick_Dissecting_Image_Crops_ICCV_2021_paper.pdf", @@ -10358,7 +11061,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Van_Hoorick_2021_ICCV,\n \n author = {\n Van Hoorick,\n Basile and Vondrick,\n Carl\n},\n title = {\n Dissecting Image Crops\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9741-9750\n} \n}" }, { "title": "Distance-Aware Quantization", @@ -10366,6 +11070,7 @@ "status": "Poster", "track": "main", "pid": 8122, + "author_site": "Dohyung Kim; Junghyup Lee; Bumsub Ham", "author": "Dohyung Kim; Junghyup Lee; Bumsub Ham", "abstract": "We address the problem of network quantization, that is, reducing bit-widths of weights and/or activations to lighten network architectures. 
Quantization methods use a rounding function to map full-precision values to the nearest quantized ones, but this operation is not differentiable. There are mainly two approaches to training quantized networks with gradient-based optimizers. First, a straight-through estimator (STE) replaces the zero derivative of the rounding with that of an identity function, which causes a gradient mismatch problem. Second, soft quantizers approximate the rounding with continuous functions at training time, and exploit the rounding for quantization at test time. This alleviates the gradient mismatch, but causes a quantizer gap problem. We alleviate both problems in a unified framework. To this end, we introduce a novel quantizer, dubbed a distance-aware quantizer (DAQ), that mainly consists of a distance-aware soft rounding (DASR) and a temperature controller. To alleviate the gradient mismatch problem, DASR approximates the discrete rounding with the kernel soft argmax, which is based on our insight that the quantization can be formulated as a distance-based assignment problem between full-precision values and quantized ones. The controller adjusts the temperature parameter in DASR adaptively according to the input, addressing the quantizer gap problem. 
Experimental results on standard benchmarks show that DAQ outperforms the state of the art significantly for various bit-widths without bells and whistles.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Distance-Aware_Quantization_ICCV_2021_paper.pdf", @@ -10389,7 +11094,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Dohyung and Lee,\n Junghyup and Ham,\n Bumsub\n},\n title = {\n Distance-Aware Quantization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5271-5280\n} \n}" }, { "title": "Distillation-Guided Image Inpainting", @@ -10397,6 +11103,7 @@ "status": "Poster", "track": "main", "pid": 9003, + "author_site": "Maitreya Suin; Kuldeep Purohit; A. N. Rajagopalan", "author": "Maitreya Suin; Kuldeep Purohit; A. N. Rajagopalan", "abstract": "Image inpainting methods have shown significant improvements by using deep neural networks recently. However, many of these techniques often create distorted structures or blurry inconsistent textures. The problem is rooted in the encoder layers' ineffectiveness in building a complete and faithful embedding of the missing regions from scratch. Existing solutions like course-to-fine, progressive refinement, structural guidance, etc., suffer from huge computational overheads owing to multiple generator networks, limited ability of handcrafted features, and sub-optimal utilization of the information present in the ground truth. We propose a distillation-based approach for inpainting, where we provide direct feature-level supervision while training. We deploy cross and self-distillation techniques and design a dedicated completion-block in encoder to produce more accurate encoding of the holes. 
Next, we demonstrate how an inpainting network's attention module can improve by leveraging a distillation-based attention transfer technique and enhancing coherence by using a pixel-adaptive global-local feature fusion. We conduct extensive evaluations on multiple datasets to validate our method. Along with achieving significant improvements over previous SOTA methods, the proposed approach's effectiveness is also demonstrated through its ability to improve existing inpainting works.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Suin_Distillation-Guided_Image_Inpainting_ICCV_2021_paper.pdf", @@ -10420,7 +11127,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madras;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Suin_2021_ICCV,\n \n author = {\n Suin,\n Maitreya and Purohit,\n Kuldeep and Rajagopalan,\n A. N.\n},\n title = {\n Distillation-Guided Image Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2481-2490\n} \n}" }, { "title": "Distilling Global and Local Logits With Densely Connected Relations", @@ -10428,6 +11136,7 @@ "status": "Poster", "track": "main", "pid": 7543, + "author_site": "Youmin Kim; Jinbae Park; YounHo Jang; Muhammad Ali; Tae-Hyun Oh; Sung-Ho Bae", "author": "Youmin Kim; Jinbae Park; YounHo Jang; Muhammad Ali; Tae-Hyun Oh; Sung-Ho Bae", "abstract": "In prevalent knowledge distillation, logits in most image recognition models are computed by global average pooling, then used to learn to encode the high-level and task-relevant knowledge. In this work, we solve the limitation of this global logit transfer in this distillation context. 
We point out that it prevents the transfer of informative spatial information, which provides localized knowledge as well as rich relational information across contexts of an input scene. To exploit the rich spatial information, we propose a simple yet effective logit distillation approach. We add a local spatial pooling layer branch to the penultimate layer, thereby our method extends the standard logit distillation and enables learning of both finely-localized knowledge and holistic representation. Our proposed method shows favorable accuracy improvement against the state-of-the-art methods on several image classification datasets. We show that our distilled students trained on the image classification task can be successfully leveraged for object detection and semantic segmentation tasks; this result demonstrates our method's high transferability.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Distilling_Global_and_Local_Logits_With_Densely_Connected_Relations_ICCV_2021_paper.pdf", @@ -10451,7 +11160,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Youmin and Park,\n Jinbae and Jang,\n YounHo and Ali,\n Muhammad and Oh,\n Tae-Hyun and Bae,\n Sung-Ho\n},\n title = {\n Distilling Global and Local Logits With Densely Connected Relations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6290-6300\n} \n}" }, { "title": "Distilling Holistic Knowledge With Graph Neural Networks", @@ -10459,6 +11169,7 @@ "status": "Poster", "track": "main", "pid": 11261, + "author_site": "Sheng Zhou; Yucheng Wang; Defang Chen; Jiawei Chen; Xin Wang; Can Wang; Jiajun Bu", "author": "Sheng Zhou; Yucheng Wang; Defang Chen; Jiawei Chen; Xin Wang; 
Can Wang; Jiajun Bu", "abstract": "Knowledge Distillation (KD) aims at transferring knowledge from a larger well-optimized teacher network to a smaller learnable student network. Existing KD methods have mainly considered two types of knowledge, namely the individual knowledge and the relational knowledge. However, these two types of knowledge are usually modeled independently while the inherent correlations between them are largely ignored. It is critical for sufficient student network learning to integrate both individual knowledge and relational knowledge while reserving their inherent correlation. In this paper, we propose to distill the novel holistic knowledge based on an attributed graph constructed among instances. The holistic knowledge is represented as a unified graph-based embedding by aggregating individual knowledge from relational neighborhood samples with graph neural networks, the student network is learned by distilling the holistic knowledge in a contrastive manner. Extensive experiments and ablation studies are conducted on benchmark datasets, the results demonstrate the effectiveness of the proposed method. 
The code has been published in https://github.com/wyc-ruiker/HKD", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Distilling_Holistic_Knowledge_With_Graph_Neural_Networks_ICCV_2021_paper.pdf", @@ -10482,7 +11193,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Sheng and Wang,\n Yucheng and Chen,\n Defang and Chen,\n Jiawei and Wang,\n Xin and Wang,\n Can and Bu,\n Jiajun\n},\n title = {\n Distilling Holistic Knowledge With Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10387-10396\n} \n}" }, { "title": "Distilling Optimal Neural Networks: Rapid Search in Diverse Spaces", @@ -10490,6 +11202,7 @@ "status": "Poster", "track": "main", "pid": 3875, + "author_site": "Bert Moons; Parham Noorzad; Andrii Skliar; Giovanni Mariani; Dushyant Mehta; Chris Lott; Tijmen Blankevoort", "author": "Bert Moons; Parham Noorzad; Andrii Skliar; Giovanni Mariani; Dushyant Mehta; Chris Lott; Tijmen Blankevoort", "abstract": "Current state-of-the-art Neural Architecture Search (NAS) methods neither efficiently scale to many hardware platforms nor handle diverse architectural search-spaces. To remedy this, we present DONNA (Distilling Optimal Neural Network Architectures), a novel pipeline for rapid, scalable and diverse NAS, that scales to many user scenarios. DONNA consists of three phases. First, an accuracy predictor is built using blockwise knowledge distillation from a reference model. This predictor enables searching across diverse networks with varying macro-architectural parameters such as layer types and attention mechanisms, as well as across micro-architectural parameters such as block repeats and expansion rates. 
Second, a rapid evolutionary search finds a set of pareto-optimal architectures for any scenario using the accuracy predictor and on-device measurements. Third, optimal models are quickly finetuned to training-from-scratch accuracy. DONNA is up to 100x faster than MNasNet in finding state-of-the-art architectures on-device. Classifying ImageNet, DONNA architectures are 20% faster than EfficientNet-B0 and MobileNetV2 on a Nvidia V100 GPU and 10% faster with 0.5% higher accuracy than MobileNetV2-1.4x on a Samsung S20 smartphone. In addition to NAS, DONNA is used for search-space extension and exploration, as well as hardware-aware model compression.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Moons_Distilling_Optimal_Neural_Networks_Rapid_Search_in_Diverse_Spaces_ICCV_2021_paper.pdf", @@ -10504,7 +11217,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Moons_Distilling_Optimal_Neural_Networks_Rapid_Search_in_Diverse_Spaces_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Moons_Distilling_Optimal_Neural_Networks_Rapid_Search_in_Diverse_Spaces_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Moons_2021_ICCV,\n \n author = {\n Moons,\n Bert and Noorzad,\n Parham and Skliar,\n Andrii and Mariani,\n Giovanni and Mehta,\n Dushyant and Lott,\n Chris and Blankevoort,\n Tijmen\n},\n title = {\n Distilling Optimal Neural Networks: Rapid Search in Diverse Spaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12229-12238\n} \n}" }, { "title": "Distilling Virtual Examples for Long-Tailed Recognition", @@ -10512,6 +11226,7 @@ "status": "Poster", "track": "main", "pid": 7311, + "author_site": "Yin-Yin He; Jianxin Wu; Xiu-Shen Wei", "author": "Yin-Yin He; Jianxin Wu; Xiu-Shen Wei", "abstract": "We tackle the long-tailed visual 
recognition problem from the knowledge distillation perspective by proposing a Distill the Virtual Examples (DiVE) method. Specifically, by treating the predictions of a teacher model as virtual exam- ples, we prove that distilling from these virtual examples is equivalent to label distribution learning under certain con- straints. We show that when the virtual example distribu- tion becomes flatter than the original input distribution, the under-represented tail classes will receive significant im- provements, which is crucial in long-tailed recognition. The proposed DiVE method can explicitly tune the virtual exam- ple distribution to become flat. Extensive experiments on three benchmark datasets, including the large-scale iNat- uralist ones, justify that the proposed DiVE method can significantly outperform state-of-the-art methods. Further- more, additional analyses and experiments verify the virtual example interpretation, and demonstrate the effectiveness of tailored designs in DiVE for long-tailed problems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_Distilling_Virtual_Examples_for_Long-Tailed_Recognition_ICCV_2021_paper.pdf", @@ -10535,7 +11250,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Yin-Yin and Wu,\n Jianxin and Wei,\n Xiu-Shen\n},\n title = {\n Distilling Virtual Examples for Long-Tailed Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 235-244\n} \n}" }, { "title": "Distinctiveness Oriented Positional Equilibrium for Point Cloud Registration", @@ -10543,6 +11259,7 @@ "status": "Poster", "track": "main", "pid": 1135, + "author_site": "Taewon Min; Chonghyuk Song; Eunseok Kim; Inwook Shim", "author": "Taewon Min; 
Chonghyuk Song; Eunseok Kim; Inwook Shim", "abstract": "Recent state-of-the-art learning-based approaches to point cloud registration have largely been based on graph neural networks (GNN). However, these prominent GNN backbones suffer from the indistinguishable features problem associated with over-smoothing and structural ambiguity of the high-level features, a crucial bottleneck to point cloud registration that has evaded scrutiny in the recent relevant literature. To address this issue, we propose the Distinctiveness oriented Positional Equilibrium (DoPE) module, a novel positional embedding scheme that significantly improves the distinctiveness of the high-level features within both the source and target point clouds, resulting in superior point matching and hence registration accuracy. Specifically, we use the DoPE module in an iterative registration framework, whereby the two point clouds are gradually registered via rigid transformations that are computed from DoPE's position-aware features. With every successive iteration, the DoPE module feeds increasingly consistent positional information to would-be corresponding pairs, which in turn enhances the resulting point-to-point correspondence predictions used to estimate the rigid transformation. Within only a few iterations, the network converges to a desired equilibrium, where the positional embeddings given to matching pairs become essentially identical. 
We validate the effectiveness of DoPE through comprehensive experiments on various registration benchmarks, registration task settings, and prominent backbones, yielding unprecedented performance improvement across all combinations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Min_Distinctiveness_Oriented_Positional_Equilibrium_for_Point_Cloud_Registration_ICCV_2021_paper.pdf", @@ -10566,7 +11283,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "South Korea;United States;" + "aff_country_unique": "South Korea;United States;", + "bibtex": "@InProceedings{Min_2021_ICCV,\n \n author = {\n Min,\n Taewon and Song,\n Chonghyuk and Kim,\n Eunseok and Shim,\n Inwook\n},\n title = {\n Distinctiveness Oriented Positional Equilibrium for Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5490-5498\n} \n}" }, { "title": "Distributional Robustness Loss for Long-Tail Learning", @@ -10574,6 +11292,7 @@ "status": "Poster", "track": "main", "pid": 8563, + "author_site": "Dvir Samuel; Gal Chechik", "author": "Dvir Samuel; Gal Chechik", "abstract": "Real-world data is often unbalanced and long-tailed, but deep models struggle to recognize rare classes in the presence of frequent classes. To address unbalanced data, most studies try balancing the data, the loss, or the classifier to reduce classification bias towards head classes. Far less attention has been given to the latent representations learned with unbalanced data. We show that the feature extractor part of deep networks suffers greatly from this bias. We propose a new loss based on robustness theory, which encourages the model to learn high-quality representations for both head and tail classes. 
While the general form of the robustness loss may be hard to compute, we further derive an easy-to-compute upper bound that can be minimized efficiently. This procedure reduces representation bias towards head classes in the feature space and achieves new SOTA results on CIFAR100-LT, ImageNet-LT, and iNaturalist long-tail benchmarks. We find that training with robustness increases recognition accuracy of tail classes while largely maintaining the accuracy of head classes. The new robustness loss can be combined with various classifier balancing techniques and can be applied to representations at several layers of the deep model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Samuel_Distributional_Robustness_Loss_for_Long-Tail_Learning_ICCV_2021_paper.pdf", @@ -10590,14 +11309,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Samuel_Distributional_Robustness_Loss_for_Long-Tail_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "Bar-Ilan University;NVIDIA", + "aff_unique_norm": "Bar-Ilan University;NVIDIA Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.biu.ac.il;https://research.nvidia.com", "aff_unique_abbr": "BIU;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Samuel_2021_ICCV,\n \n author = {\n Samuel,\n Dvir and Chechik,\n Gal\n},\n title = {\n Distributional Robustness Loss for Long-Tail Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9495-9504\n} \n}" }, { "title": "DivAug: Plug-In Automated Data Augmentation With Explicit Diversity Maximization", @@ -10605,6 +11325,7 @@ "status": "Poster", "track": "main", "pid": 7108, + "author_site": "Zirui Liu; Haifeng Jin; Ting-Hsiang Wang; 
Kaixiong Zhou; Xia Hu", "author": "Zirui Liu; Haifeng Jin; Ting-Hsiang Wang; Kaixiong Zhou; Xia Hu", "abstract": "Human-designed data augmentation strategies havebeen replaced by automatically learned augmentation pol-icy in the past two years. Specifically, recent works haveexperimentally shown that the superior performance of theautomated methods stems from increasing the diversity ofaugmented data. However, two factors regard-ing the diversity of augmented data are still missing: 1)the explicit definition (and thus measurement) of diversityand 2) the quantifiable relationship between diversity andits regularization effects. To fill this gap, we propose a di-versity measure called \"Variance Diversity\" and theoreti-cally show that the regularization effect of data augmenta-tion is promised by Variance Diversity. We confirm in exper-iments that the relative gain from automated data augmen-tation in test accuracy of a given model is highly correlatedto Variance Diversity. To improve the search process ofautomated augmentation, an unsupervised sampling-basedframework,DivAug, is designed to directly optimize Vari-ance Diversity and hence strengthen the regularization ef-fect. Without requiring a separate search process, the per-formance gain from DivAug is comparable with state-of-the-art method with better efficiency. Moreover, under thesemi-supervised setting, our framework can further improvethe performance of semi-supervised learning algorithmsbased on RandAugment, making it highly applicable to real-world problems, where labeled data is scarce. 
The code is available at https://github.com/warai-0toko/DivAug.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_DivAug_Plug-In_Automated_Data_Augmentation_With_Explicit_Diversity_Maximization_ICCV_2021_paper.pdf", @@ -10628,7 +11349,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zirui and Jin,\n Haifeng and Wang,\n Ting-Hsiang and Zhou,\n Kaixiong and Hu,\n Xia\n},\n title = {\n DivAug: Plug-In Automated Data Augmentation With Explicit Diversity Maximization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4762-4770\n} \n}" }, { "title": "Diverse Image Style Transfer via Invertible Cross-Space Mapping", @@ -10636,6 +11358,7 @@ "status": "Poster", "track": "main", "pid": 7764, + "author_site": "Haibo Chen; Lei Zhao; Huiming Zhang; Zhizhong Wang; Zhiwen Zuo; Ailin Li; Wei Xing; Dongming Lu", "author": "Haibo Chen; Lei Zhao; Huiming Zhang; Zhizhong Wang; Zhiwen Zuo; Ailin Li; Wei Xing; Dongming Lu", "abstract": "Image style transfer aims to transfer the styles of artworks onto arbitrary photographs to create novel artistic images. Although style transfer is inherently an underdetermined problem, existing approaches usually assume a deterministic solution, thus failing to capture the full distribution of possible outputs. To address this limitation, we propose a Diverse Image Style Transfer (DIST) framework which achieves significant diversity by enforcing an invertible cross-space mapping. Specifically, the framework consists of three branches: disentanglement branch, inverse branch, and stylization branch. 
Among them, the disentanglement branch factorizes artworks into content space and style space; the inverse branch encourages the invertible mapping between the latent space of input noise vectors and the style space of generated artistic images; the stylization branch renders the input content image with the style of an artist. Armed with these three branches, our approach is able to synthesize significantly diverse stylized images without loss of quality. We conduct extensive experiments and comparisons to evaluate our approach qualitatively and quantitatively. The experimental results demonstrate the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Diverse_Image_Style_Transfer_via_Invertible_Cross-Space_Mapping_ICCV_2021_paper.pdf", @@ -10659,7 +11382,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Haibo and Zhao,\n Lei and Zhang,\n Huiming and Wang,\n Zhizhong and Zuo,\n Zhiwen and Li,\n Ailin and Xing,\n Wei and Lu,\n Dongming\n},\n title = {\n Diverse Image Style Transfer via Invertible Cross-Space Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14880-14889\n} \n}" }, { "title": "Divide and Conquer for Single-Frame Temporal Action Localization", @@ -10667,6 +11391,7 @@ "status": "Poster", "track": "main", "pid": 2342, + "author_site": "Chen Ju; Peisen Zhao; Siheng Chen; Ya Zhang; Yanfeng Wang; Qi Tian", "author": "Chen Ju; Peisen Zhao; Siheng Chen; Ya Zhang; Yanfeng Wang; Qi Tian", "abstract": "Single-frame temporal action localization (STAL) aims to localize actions in untrimmed videos with only one timestamp annotation for each action instance. 
Existing methods adopt the one-stage framework but couple the counting goal and the localization goal. This paper proposes a novel two-stage framework for the STAL task with the spirit of divide and conquer. The instance counting stage leverages the location supervision to determine the number of action instances and divide a whole video into multiple video clips, so that each video clip contains only one complete action instance; and the location estimation stage leverages the category supervision to localize the action instance in each video clip. To efficiently represent the action instance in each video clip, we introduce the proposal-based representation, and design a novel differentiable mask generator to enable the end-to-end training supervised by category labels. On THUMOS14, GTEA, and BEOID datasets, our method outperforms state-of-the-art methods by 3.5%, 2.7%, 4.8% mAP on average. And extensive experiments verify the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ju_Divide_and_Conquer_for_Single-Frame_Temporal_Action_Localization_ICCV_2021_paper.pdf", @@ -10690,7 +11415,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ju_2021_ICCV,\n \n author = {\n Ju,\n Chen and Zhao,\n Peisen and Chen,\n Siheng and Zhang,\n Ya and Wang,\n Yanfeng and Tian,\n Qi\n},\n title = {\n Divide and Conquer for Single-Frame Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13455-13464\n} \n}" }, { "title": "Divide and Contrast: Self-Supervised Learning From Uncurated Data", @@ -10698,7 +11424,8 @@ "status": "Poster", "track": "main", "pid": 8044, - "author": "Yonglong Tian; Olivier J. 
H\u00e9naff; A\u00e4ron van den Oord", + "author_site": "Yonglong Tian; Olivier J. Hénaff; Aäron van den Oord", + "author": "Yonglong Tian; Olivier J. Hénaff; Aäron van den Oord", "abstract": "Self-supervised learning holds promise in leveraging large amounts of unlabeled data, however much of its progress has thus far been limited to highly curated pre-training data such as ImageNet. We explore the effects of contrastive learning from larger, less-curated image datasets such as YFCC, and find there is indeed a large difference in the resulting representation quality. We hypothesize that this curation gap is due to a shift in the distribution of image classes---which is more diverse and heavy-tailed---resulting in less relevant negative samples to learn from. We test this hypothesis with a new approach, Divide and Contrast (DnC), which alternates between contrastive learning and clustering-based hard negative mining. When pretrained on less curated datasets, DnC greatly improves the performance of self-supervised learning on downstream tasks, while remaining competitive with the current state-of-the-art on curated datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tian_Divide_and_Contrast_Self-Supervised_Learning_From_Uncurated_Data_ICCV_2021_paper.pdf", "aff": "MIT; DeepMind; DeepMind", @@ -10721,7 +11448,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Tian_2021_ICCV,\n \n author = {\n Tian,\n Yonglong and H\\'enaff,\n Olivier J. 
and van den Oord,\n A\\"aron\n},\n title = {\n Divide and Contrast: Self-Supervised Learning From Uncurated Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10063-10074\n} \n}" }, { "title": "Divide-and-Assemble: Learning Block-Wise Memory for Unsupervised Anomaly Detection", @@ -10729,6 +11457,7 @@ "status": "Poster", "track": "main", "pid": 9729, + "author_site": "Jinlei Hou; Yingying Zhang; Qiaoyong Zhong; Di Xie; Shiliang Pu; Hong Zhou", "author": "Jinlei Hou; Yingying Zhang; Qiaoyong Zhong; Di Xie; Shiliang Pu; Hong Zhou", "abstract": "Reconstruction-based methods play an important role in unsupervised anomaly detection in images. Ideally, we expect a perfect reconstruction for normal samples and poor reconstruction for abnormal samples. Since the generalizability of deep neural networks is difficult to control, existing models such as autoencoder do not work well. In this work, we interpret the reconstruction of an image as a divide-and-assemble procedure. Surprisingly, by varying the granularity of division on feature maps, we are able to modulate the reconstruction capability of the model for both normal and abnormal samples. That is, finer granularity leads to better reconstruction, while coarser granularity leads to poorer reconstruction. With proper granularity, the gap between the reconstruction error of normal and abnormal samples can be maximized. The divide-and-assemble framework is implemented by embedding a novel multi-scale block-wise memory module into an autoencoder network. Besides, we introduce adversarial learning and explore the semantic latent representation of the discriminator, which improves the detection of subtle anomaly. We achieve state-of-the-art performance on the challenging MVTec AD dataset. 
Remarkably, we improve the vanilla autoencoder model by 10.1% in terms of the AUROC score.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hou_Divide-and-Assemble_Learning_Block-Wise_Memory_for_Unsupervised_Anomaly_Detection_ICCV_2021_paper.pdf", @@ -10752,7 +11481,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hou_2021_ICCV,\n \n author = {\n Hou,\n Jinlei and Zhang,\n Yingying and Zhong,\n Qiaoyong and Xie,\n Di and Pu,\n Shiliang and Zhou,\n Hong\n},\n title = {\n Divide-and-Assemble: Learning Block-Wise Memory for Unsupervised Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8791-8800\n} \n}" }, { "title": "DnD: Dense Depth Estimation in Crowded Dynamic Indoor Scenes", @@ -10760,6 +11490,7 @@ "status": "Poster", "track": "main", "pid": 7350, + "author_site": "Dongki Jung; Jaehoon Choi; Yonghan Lee; Deokhwa Kim; Changick Kim; Dinesh Manocha; Donghwan Lee", "author": "Dongki Jung; Jaehoon Choi; Yonghan Lee; Deokhwa Kim; Changick Kim; Dinesh Manocha; Donghwan Lee", "abstract": "We present a novel approach for estimating depth from a monocular camera as it moves through complex and crowded indoor environments, e.g., a department store or a metro station. Our approach predicts absolute scale depth maps over the entire scene consisting of a static background and multiple moving people, by training on dynamic scenes. Since it is difficult to collect dense depth maps from crowded indoor environments, we design our training framework without requiring groundtruth depths produced from depth sensing devices. Our network leverages RGB images and sparse depth maps generated from traditional 3D reconstruction methods to estimate dense depth maps. 
We use two constraints to handle depth for non-rigidly moving people without tracking their motion explicitly. We demonstrate that our approach offers consistent improvements over recent depth estimation methods on the NAVERLABS dataset, which includes complex and crowded scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jung_DnD_Dense_Depth_Estimation_in_Crowded_Dynamic_Indoor_Scenes_ICCV_2021_paper.pdf", @@ -10776,14 +11507,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jung_DnD_Dense_Depth_Estimation_in_Crowded_Dynamic_Indoor_Scenes_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;0;0;2;1;0", - "aff_unique_norm": "NAVER LABS;University of Maryland;Korea Advanced Institute of Science and Technology", + "aff_unique_norm": "NAVER Labs;University of Maryland;Korea Advanced Institute of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.naverlabs.com;https://www/umd.edu;https://www.kaist.ac.kr", "aff_unique_abbr": "NAVER Labs;UMD;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Jung_2021_ICCV,\n \n author = {\n Jung,\n Dongki and Choi,\n Jaehoon and Lee,\n Yonghan and Kim,\n Deokhwa and Kim,\n Changick and Manocha,\n Dinesh and Lee,\n Donghwan\n},\n title = {\n DnD: Dense Depth Estimation in Crowded Dynamic Indoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12797-12807\n} \n}" }, { "title": "Do Different Deep Metric Learning Losses Lead to Similar Learned Features?", @@ -10791,10 +11523,11 @@ "status": "Poster", "track": "main", "pid": 10598, + "author_site": "Konstantin Kobs; Michael Steininger; Andrzej Dulny; Andreas Hotho", "author": "Konstantin Kobs; Michael 
Steininger; Andrzej Dulny; Andreas Hotho", "abstract": "Recent studies have shown that many deep metric learning loss functions perform very similarly under the same experimental conditions. One potential reason for this unexpected result is that all losses let the network focus on similar image regions or properties. In this paper, we investigate this by conducting a two-step analysis to extract and compare the learned visual features of the same model architecture trained with different loss functions: First, we compare the learned features on the pixel level by correlating saliency maps of the same input images. Second, we compare the clustering of embeddings for several image properties, e.g. object color or illumination. To provide independent control over these properties, photo-realistic 3D car renders similar to images in the Cars196 dataset are generated. In our analysis, we compare 14 pretrained models from a recent study and find that, even though all models perform similarly, different loss functions can guide the model to learn different features. We especially find differences between classification and ranking based losses. Our analysis also shows that some seemingly irrelevant properties can have significant influence on the resulting embedding. 
We encourage researchers from the deep metric learning community to use our methods to get insights into the features learned by their proposed methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kobs_Do_Different_Deep_Metric_Learning_Losses_Lead_to_Similar_Learned_ICCV_2021_paper.pdf", - "aff": "University of W\u00fcrzburg; University of W\u00fcrzburg; University of W\u00fcrzburg; University of W\u00fcrzburg", + "aff": "University of Würzburg; University of Würzburg; University of Würzburg; University of Würzburg", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Kobs_Do_Different_Deep_ICCV_2021_supplemental.pdf", @@ -10807,14 +11540,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kobs_Do_Different_Deep_Metric_Learning_Losses_Lead_to_Similar_Learned_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "University of W\u00fcrzburg", + "aff_unique_norm": "University of Würzburg", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-wuerzburg.de", "aff_unique_abbr": "UWue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Kobs_2021_ICCV,\n \n author = {\n Kobs,\n Konstantin and Steininger,\n Michael and Dulny,\n Andrzej and Hotho,\n Andreas\n},\n title = {\n Do Different Deep Metric Learning Losses Lead to Similar Learned Features?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10644-10654\n} \n}" }, { "title": "Do Image Classifiers Generalize Across Time?", @@ -10822,6 +11556,7 @@ "status": "Poster", "track": "main", "pid": 7017, + "author_site": "Vaishaal Shankar; Achal Dave; Rebecca Roelofs; Deva Ramanan; Benjamin Recht; Ludwig Schmidt", "author": "Vaishaal Shankar; Achal Dave; 
Rebecca Roelofs; Deva Ramanan; Benjamin Recht; Ludwig Schmidt", "abstract": "Vision models notoriously flicker when applied to videos: they correctly recognize objects in some frames, but fail on perceptually similar, nearby frames. In this work, we systematically analyze the robustness of image classifiers to such temporal perturbations in videos. To do so, we construct two new datasets, ImageNet-Vid-Robust and YTBB-Robust, containing a total of 57,897 images grouped into 3,139 sets of perceptually similar images. Our datasets were derived from ImageNet-Vid and YouTube-BB, respectively, and thoroughly re-annotated by human experts for image similarity. We evaluate a diverse array of classifiers pre-trained on ImageNet and show a median classification accuracy drop of 16 and 10 points, respectively, on our two datasets. Additionally, we evaluate three detection models and show that natural perturbations induce both classification as well as localization errors, leading to a median drop in detection mAP of 14 points. 
Our analysis demonstrates that perturbations occurring naturally in videos pose a substantial and realistic challenge to deploying convolutional neural networks in environments that require both reliable and low-latency predictions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shankar_Do_Image_Classifiers_Generalize_Across_Time_ICCV_2021_paper.pdf", @@ -10836,7 +11571,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shankar_Do_Image_Classifiers_Generalize_Across_Time_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shankar_Do_Image_Classifiers_Generalize_Across_Time_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shankar_2021_ICCV,\n \n author = {\n Shankar,\n Vaishaal and Dave,\n Achal and Roelofs,\n Rebecca and Ramanan,\n Deva and Recht,\n Benjamin and Schmidt,\n Ludwig\n},\n title = {\n Do Image Classifiers Generalize Across Time?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9661-9669\n} \n}" }, { "title": "DocFormer: End-to-End Transformer for Document Understanding", @@ -10844,6 +11580,7 @@ "status": "Poster", "track": "main", "pid": 9385, + "author_site": "Srikar Appalaraju; Bhavan Jasani; Bhargava Urala Kota; Yusheng Xie; R. Manmatha", "author": "Srikar Appalaraju; Bhavan Jasani; Bhargava Urala Kota; Yusheng Xie; R. Manmatha", "abstract": "We present DocFormer - a multi-modal transformer based architecture for the task of Visual Document Understanding (VDU). VDU is a challenging problem which aims to understand documents in their varied formats(forms, receipts etc.) and layouts. In addition, DocFormer is pre-trained in an unsupervised fashion using carefully designed tasks which encourage multi-modal interaction. 
DocFormer uses text, vision and spatial features and combines them using a novel multi-modal self-attention layer. DocFormer also shares learned spatial embeddings across modalities which makes it easy for the model to correlate text to visual tokens and vice versa. DocFormer is evaluated on 4 different datasets each with strong baselines. DocFormer achieves state-of-the-art results on all of them, sometimes beating models 4x its size (in no. of parameters)", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Appalaraju_DocFormer_End-to-End_Transformer_for_Document_Understanding_ICCV_2021_paper.pdf", @@ -10860,14 +11597,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Appalaraju_DocFormer_End-to-End_Transformer_for_Document_Understanding_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Amazon", + "aff_unique_norm": "Amazon Web Services", "aff_unique_dep": "AWS AI", "aff_unique_url": "https://aws.amazon.com", "aff_unique_abbr": "AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Appalaraju_2021_ICCV,\n \n author = {\n Appalaraju,\n Srikar and Jasani,\n Bhavan and Kota,\n Bhargava Urala and Xie,\n Yusheng and Manmatha,\n R.\n},\n title = {\n DocFormer: End-to-End Transformer for Document Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 993-1003\n} \n}" }, { "title": "Domain Adaptive Semantic Segmentation With Self-Supervised Depth Estimation", @@ -10875,6 +11613,7 @@ "status": "Poster", "track": "main", "pid": 1742, + "author_site": "Qin Wang; Dengxin Dai; Lukas Hoyer; Luc Van Gool; Olga Fink", "author": "Qin Wang; Dengxin Dai; Lukas Hoyer; Luc Van Gool; Olga Fink", "abstract": "Domain adaptation for 
semantic segmentation aims to improve the model performance in the presence of a distribution shift between source and target domain. Leveraging the supervision from auxiliary tasks (such as depth estimation) has the potential to heal this shift because many visual tasks are closely related to each other. However, such a supervision is not always available. In this work, we leverage the guidance from self-supervised depth estimation, which is available on both domains, to bridge the domain gap. On the one hand, we propose to explicitly learn the task feature correlation to strengthen the target semantic predictions with the help of target depth estimation. On the other hand, we use the depth prediction discrepancy from source and target depth decoders to approximate the pixel-wise adaptation difficulty. The adaptation difficulty, inferred from depth, is then used to refine the target semantic segmentation pseudo-labels. The proposed method can be easily implemented into existing segmentation frameworks. We demonstrate the effectiveness of our approach on the benchmark tasks SYNTHIA-to-Cityscapes and GTA-to-Cityscapes, on which we achieve the new state-of-the-art performance of 55.0% and 56.6%, respectively. 
Our code is available at https://qin.ee/corda", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Domain_Adaptive_Semantic_Segmentation_With_Self-Supervised_Depth_Estimation_ICCV_2021_paper.pdf", @@ -10898,7 +11637,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0+2;0", - "aff_country_unique": "Switzerland;Germany;Belgium" + "aff_country_unique": "Switzerland;Germany;Belgium", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Qin and Dai,\n Dengxin and Hoyer,\n Lukas and Van Gool,\n Luc and Fink,\n Olga\n},\n title = {\n Domain Adaptive Semantic Segmentation With Self-Supervised Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8515-8525\n} \n}" }, { "title": "Domain Adaptive Video Segmentation via Temporal Consistency Regularization", @@ -10906,6 +11646,7 @@ "status": "Poster", "track": "main", "pid": 2104, + "author_site": "Dayan Guan; Jiaxing Huang; Aoran Xiao; Shijian Lu", "author": "Dayan Guan; Jiaxing Huang; Aoran Xiao; Shijian Lu", "abstract": "Video semantic segmentation is an essential task for the analysis and understanding of videos. Recent efforts largely focus on supervised video segmentation by learning from fully annotated data, but the learnt models often experience clear performance drop while applied to videos of a different domain. This paper presents DA-VSN, a domain adaptive video segmentation network that addresses domain gaps in videos by temporal consistency regularization (TCR) for consecutive frames of target-domain videos. DA-VSN consists of two novel and complementary designs. The first is cross-domain TCR that guides the prediction of target frames to have similar temporal consistency as that of source frames (learnt from annotated source data) via adversarial learning. 
The second is intra-domain TCR that guides unconfident predictions of target frames to have similar temporal consistency as confident predictions of target frames. Extensive experiments demonstrate the superiority of our proposed domain adaptive video segmentation network which outperforms multiple baselines consistently by large margins.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guan_Domain_Adaptive_Video_Segmentation_via_Temporal_Consistency_Regularization_ICCV_2021_paper.pdf", @@ -10920,7 +11661,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guan_Domain_Adaptive_Video_Segmentation_via_Temporal_Consistency_Regularization_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guan_Domain_Adaptive_Video_Segmentation_via_Temporal_Consistency_Regularization_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Guan_2021_ICCV,\n \n author = {\n Guan,\n Dayan and Huang,\n Jiaxing and Xiao,\n Aoran and Lu,\n Shijian\n},\n title = {\n Domain Adaptive Video Segmentation via Temporal Consistency Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8053-8064\n} \n}" }, { "title": "Domain Generalization via Gradient Surgery", @@ -10928,6 +11670,7 @@ "status": "Poster", "track": "main", "pid": 10705, + "author_site": "Lucas Mansilla; Rodrigo Echeveste; Diego H. Milone; Enzo Ferrante", "author": "Lucas Mansilla; Rodrigo Echeveste; Diego H. Milone; Enzo Ferrante", "abstract": "In real-life applications, machine learning models often face scenarios where there is a change in data distribution between training and test domains. When the aim is to make predictions on distributions different from those seen at training, we incur in a domain generalization problem. 
Methods to address this issue learn a model using data from multiple source domains, and then apply this model to the unseen target domain. Our hypothesis is that when training with multiple domains, conflicting gradients within each mini-batch contain information specific to the individual domains which is irrelevant to the others, including the test domain. If left untouched, such disagreement may degrade generalization performance. In this work, we characterize the conflicting gradients emerging in domain shift scenarios and devise novel gradient agreement strategies based on gradient surgery to alleviate their effect. We validate our approach in image classification tasks with three multi-domain datasets, showing the value of the proposed agreement strategy in enhancing the generalization capability of deep learning models in domain shift scenarios.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mansilla_Domain_Generalization_via_Gradient_Surgery_ICCV_2021_paper.pdf", @@ -10951,7 +11694,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Spain;Argentina" + "aff_country_unique": "Spain;Argentina", + "bibtex": "@InProceedings{Mansilla_2021_ICCV,\n \n author = {\n Mansilla,\n Lucas and Echeveste,\n Rodrigo and Milone,\n Diego H. and Ferrante,\n Enzo\n},\n title = {\n Domain Generalization via Gradient Surgery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6630-6638\n} \n}" }, { "title": "Domain-Aware Universal Style Transfer", @@ -10959,6 +11703,7 @@ "status": "Poster", "track": "main", "pid": 3644, + "author_site": "Kibeom Hong; Seogkyu Jeon; Huan Yang; Jianlong Fu; Hyeran Byun", "author": "Kibeom Hong; Seogkyu Jeon; Huan Yang; Jianlong Fu; Hyeran Byun", "abstract": "Style transfer aims to reproduce content images with the styles from reference images. 
Existing universal style transfer methods successfully deliver arbitrary styles to original images either in an artistic or a photo-realistic way. However, the range of \"arbitrary style\" defined by existing works is bounded in the particular domain due to their structural limitation. Specifically, the degrees of content preservation and stylization are established according to a predefined target domain. As a result, both photo-realistic and artistic models have difficulty in performing the desired style transfer for the other domain. To overcome this limitation, we propose a unified architecture, Domain-aware Style Transfer Networks (DSTN) that transfer not only the style but also the property of domain (i.e., domainness) from a given reference image. To this end, we design a novel domainness indicator that captures the domainness value from the texture and structural features of reference images. Moreover, we introduce a unified framework with domainaware skip connection to adaptively transfer the stroke and palette to the input contents guided by the domainness indicator. Our extensive experiments validate that our model produces better qualitative results and outperforms previous methods in terms of proxy metrics on both artistic and photo-realistic stylizations. 
All codes and pre-trained weights are available at https://github.com/Kibeom-Hong/Domain-Aware-Style-Transfer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hong_Domain-Aware_Universal_Style_Transfer_ICCV_2021_paper.pdf", @@ -10975,14 +11720,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hong_Domain-Aware_Universal_Style_Transfer_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;0+0", - "aff_unique_norm": "Yonsei University;Microsoft", + "aff_unique_norm": "Yonsei University;Microsoft Corporation", "aff_unique_dep": "Department of Computer Science;Microsoft Research", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Yonsei;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0+0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Hong_2021_ICCV,\n \n author = {\n Hong,\n Kibeom and Jeon,\n Seogkyu and Yang,\n Huan and Fu,\n Jianlong and Byun,\n Hyeran\n},\n title = {\n Domain-Aware Universal Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14609-14617\n} \n}" }, { "title": "Domain-Invariant Disentangled Network for Generalizable Object Detection", @@ -10990,6 +11736,7 @@ "status": "Poster", "track": "main", "pid": 2840, + "author_site": "Chuang Lin; Zehuan Yuan; Sicheng Zhao; Peize Sun; Changhu Wang; Jianfei Cai", "author": "Chuang Lin; Zehuan Yuan; Sicheng Zhao; Peize Sun; Changhu Wang; Jianfei Cai", "abstract": "We address the problem of domain generalizable object detection, which aims to learn a domain-invariant detector from multiple \"seen\" domains so that it can generalize well to other \"unseen\" domains. 
The generalization ability is crucial in practical scenarios especially when it is difficult to collect data. Compared to image classification, domain generalization in object detection has seldom been explored with more challenges brought by domain gaps on both image and instance levels. In this paper, we propose a novel generalizable object detection model, termed Domain-Invariant Disentangled Network (DIDN). In contrast to directly aligning multiple sources, we integrate a disentangled network into Faster R-CNN. By disentangling representations on both image and instance levels, DIDN is able to learn domain-invariant representations that are suitable for generalized object detection. Furthermore, we design a cross-level representation reconstruction to complement this two-level disentanglement so that informative object representations could be preserved. Extensive experiments are conducted on five benchmark datasets and the results demonstrate that our model achieves state-of-the-art performances on domain generalization for object detection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Domain-Invariant_Disentangled_Network_for_Generalizable_Object_Detection_ICCV_2021_paper.pdf", @@ -11006,14 +11753,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_Domain-Invariant_Disentangled_Network_for_Generalizable_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;1;0", - "aff_unique_norm": "Monash University;ByteDance;Columbia University;University of Hong Kong", + "aff_unique_norm": "Monash University;ByteDance;Columbia University;The University of Hong Kong", "aff_unique_dep": "Dept of Data Science and AI;AI Lab;;", "aff_unique_url": "https://www.monash.edu;https://www.bytedance.com;https://www.columbia.edu;https://www.hku.hk", "aff_unique_abbr": "Monash;ByteDance;Columbia;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;2;1;1;0", - 
"aff_country_unique": "Australia;China;United States" + "aff_country_unique": "Australia;China;United States", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Chuang and Yuan,\n Zehuan and Zhao,\n Sicheng and Sun,\n Peize and Wang,\n Changhu and Cai,\n Jianfei\n},\n title = {\n Domain-Invariant Disentangled Network for Generalizable Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8771-8780\n} \n}" }, { "title": "Dressing in Order: Recurrent Person Image Generation for Pose Transfer, Virtual Try-On and Outfit Editing", @@ -11021,6 +11769,7 @@ "status": "Poster", "track": "main", "pid": 8373, + "author_site": "Aiyu Cui; Daniel McKee; Svetlana Lazebnik", "author": "Aiyu Cui; Daniel McKee; Svetlana Lazebnik", "abstract": "We proposes a flexible person generation framework called Dressing in Order (DiOr), which supports 2D pose transfer, virtual try-on, and several fashion editing tasks. The key to DiOr is a novel recurrent generation pipeline to sequentially put garments on a person, so that trying on the same garments in different orders will result in different looks. Our system can produce dressing effects not achievable by existing work, including different interactions of garments (e.g., wearing a top tucked into the bottom or over it), as well as layering of multiple garments of the same type (e.g., jacket over shirt over t-shirt). DiOr explicitly encodes the shape and texture of each garment, enabling these elements to be edited separately. Joint training on pose transfer and inpainting helps with detail preservation and coherence of generated garments. 
Extensive evaluations show that DiOr outperforms other recent methods like ADGAN in terms of output quality, and handles a wide range of editing functions for which there is no direct supervision.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cui_Dressing_in_Order_Recurrent_Person_Image_Generation_for_Pose_Transfer_ICCV_2021_paper.pdf", @@ -11037,14 +11786,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cui_Dressing_in_Order_Recurrent_Person_Image_Generation_for_Pose_Transfer_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cui_2021_ICCV,\n \n author = {\n Cui,\n Aiyu and McKee,\n Daniel and Lazebnik,\n Svetlana\n},\n title = {\n Dressing in Order: Recurrent Person Image Generation for Pose Transfer,\n Virtual Try-On and Outfit Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14638-14647\n} \n}" }, { "title": "Dual Bipartite Graph Learning: A General Approach for Domain Adaptive Object Detection", @@ -11052,6 +11802,7 @@ "status": "Poster", "track": "main", "pid": 1338, + "author_site": "Chaoqi Chen; Jiongcheng Li; Zebiao Zheng; Yue Huang; Xinghao Ding; Yizhou Yu", "author": "Chaoqi Chen; Jiongcheng Li; Zebiao Zheng; Yue Huang; Xinghao Ding; Yizhou Yu", "abstract": "Domain Adaptive Object Detection (DAOD) relieves the reliance on large-scale annotated data by transferring the knowledge learned from a labeled source domain to a new unlabeled target 
domain. Recent DAOD approaches resort to local feature alignment in virtue of domain adversarial training in conjunction with the ad-hoc detection pipelines to achieve feature adaptation. However, these methods are limited to adapt the specific types of object detectors and do not explore the cross-domain topological relations. In this paper, we first formulate DAOD as an open-set domain adaptation problem in which foregrounds (pixel or region) can be seen as the \"known class\", while backgrounds (pixel or region) are referred to as the \"unknown class\". To this end, we present a new and general perspective for DAOD named Dual Bipartite Graph Learning (DBGL), which captures the cross-domain interactions on both pixel-level and semantic-level via increasing the distinction between foregrounds and backgrounds and modeling the cross-domain dependencies among different semantic categories. Experiments reveal that the proposed DBGL in conjunction with one-stage and two-stage detectors exceeds the state-of-the-art performance on standard DAOD benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Dual_Bipartite_Graph_Learning_A_General_Approach_for_Domain_Adaptive_ICCV_2021_paper.pdf", @@ -11068,14 +11819,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Dual_Bipartite_Graph_Learning_A_General_Approach_for_Domain_Adaptive_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;1;0", - "aff_unique_norm": "University of Hong Kong;Xiamen University", + "aff_unique_norm": "The University of Hong Kong;Xiamen University", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.xmu.edu.cn", "aff_unique_abbr": "HKU;XMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Chaoqi and Li,\n Jiongcheng and 
Zheng,\n Zebiao and Huang,\n Yue and Ding,\n Xinghao and Yu,\n Yizhou\n},\n title = {\n Dual Bipartite Graph Learning: A General Approach for Domain Adaptive Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2703-2712\n} \n}" }, { "title": "Dual Contrastive Loss and Attention for GANs", @@ -11083,6 +11835,7 @@ "status": "Poster", "track": "main", "pid": 8068, + "author_site": "Ning Yu; Guilin Liu; Aysegul Dundar; Andrew Tao; Bryan Catanzaro; Larry S. Davis; Mario Fritz", "author": "Ning Yu; Guilin Liu; Aysegul Dundar; Andrew Tao; Bryan Catanzaro; Larry S. Davis; Mario Fritz", "abstract": "Generative Adversarial Networks (GANs) produce impressive results on unconditional image generation when powered with large-scale image datasets. Yet generated images are still easy to spot especially on datasets with high variance (e.g. bedroom, church). In this paper, we propose various improvements to further push the boundaries in image generation. Specifically, we propose a novel dual contrastive loss and show that, with this loss, discriminator learns more generalized and distinguishable representations to incentivize generation. In addition, we revisit attention and extensively experiment with different attention blocks in the generator. We find attention to be still an important module for successful image generation even though it was not used in the recent state-of-the-art models. Lastly, we study different attention architectures in the discriminator, and propose a reference attention mechanism. By combining the strengths of these remedies, we improve the compelling state-of-the-art Frechet Inception Distance (FID) by at least 17.5% on several benchmark datasets. 
We obtain even more significant improvements on compositional synthetic scenes (up to 47.5% in FID).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Dual_Contrastive_Loss_and_Attention_for_GANs_ICCV_2021_paper.pdf", @@ -11099,14 +11852,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_Dual_Contrastive_Loss_and_Attention_for_GANs_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;2+3;2;2;0;4", - "aff_unique_norm": "University of Maryland;Max Planck Institute for Informatics;NVIDIA;Bilkent University;CISPA Helmholtz Center for Information Security", - "aff_unique_dep": ";;NVIDIA Corporation;;", + "aff_unique_norm": "University of Maryland;Max Planck Institute for Informatics;NVIDIA Corporation;Bilkent University;CISPA Helmholtz Center for Information Security", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www/umd.edu;https://mpi-inf.mpg.de;https://www.nvidia.com;https://www.bilkent.edu.tr;https://www.cispa.de/", "aff_unique_abbr": "UMD;MPII;NVIDIA;Bilkent;CISPA", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0+2;0;0;0;1", - "aff_country_unique": "United States;Germany;T\u00fcrkiye" + "aff_country_unique": "United States;Germany;Turkey", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Ning and Liu,\n Guilin and Dundar,\n Aysegul and Tao,\n Andrew and Catanzaro,\n Bryan and Davis,\n Larry S. 
and Fritz,\n Mario\n},\n title = {\n Dual Contrastive Loss and Attention for GANs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6731-6742\n} \n}" }, { "title": "Dual Path Learning for Domain Adaptation of Semantic Segmentation", @@ -11114,6 +11868,7 @@ "status": "Poster", "track": "main", "pid": 3438, + "author_site": "Yiting Cheng; Fangyun Wei; Jianmin Bao; Dong Chen; Fang Wen; Wenqiang Zhang", "author": "Yiting Cheng; Fangyun Wei; Jianmin Bao; Dong Chen; Fang Wen; Wenqiang Zhang", "abstract": "Domain adaptation for semantic segmentation enables to alleviate the need for large-scale pixel-wise annotations. Recently, self-supervised learning (SSL) with a combination of image-to-image translation shows great effectiveness in adaptive segmentation. The most common practice is to perform SSL along with image translation to well align a single domain (the source or target). However, in this single-domain paradigm, unavoidable visual inconsistency raised by image translation may affect subsequent learning. In this paper, based on the observation that domain adaptation frameworks performed in the source and target domain are almost complementary in terms of image translation and SSL, we propose a novel dual path learning (DPL) framework to alleviate visual inconsistency. Concretely, DPL contains two complementary and interactive single-domain adaptation pipelines aligned in source and target domain respectively. The inference of DPL is extremely simple, only one segmentation model in the target domain is employed. Novel technologies such as dual path image translation and dual path adaptive segmentation are proposed to make two paths promote each other in an interactive manner. 
Experiments on GTA5->Cityscapes and SYNTHIA->Cityscapes scenarios demonstrate the superiority of our DPL model over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_Dual_Path_Learning_for_Domain_Adaptation_of_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -11130,14 +11885,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cheng_Dual_Path_Learning_for_Domain_Adaptation_of_Semantic_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;1;0", - "aff_unique_norm": "Fudan University;Microsoft", + "aff_unique_norm": "Fudan University;Microsoft Research", "aff_unique_dep": "School of Computer Science;Research", "aff_unique_url": "https://www.fudan.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Fudan;MSR Asia", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Yiting and Wei,\n Fangyun and Bao,\n Jianmin and Chen,\n Dong and Wen,\n Fang and Zhang,\n Wenqiang\n},\n title = {\n Dual Path Learning for Domain Adaptation of Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9082-9091\n} \n}" }, { "title": "Dual Projection Generative Adversarial Networks for Conditional Image Generation", @@ -11145,6 +11901,7 @@ "status": "Poster", "track": "main", "pid": 10701, + "author_site": "Ligong Han; Martin Renqiang Min; Anastasis Stathopoulos; Yu Tian; Ruijiang Gao; Asim Kadav; Dimitris N. Metaxas", "author": "Ligong Han; Martin Renqiang Min; Anastasis Stathopoulos; Yu Tian; Ruijiang Gao; Asim Kadav; Dimitris N. 
Metaxas", "abstract": "Conditional Generative Adversarial Networks (cGANs) extend the standard unconditional GAN framework to learning joint data-label distributions from samples, and have been established as powerful generative models capable of generating high-fidelity imagery. A challenge of training such a model lies in properly infusing class information into its generator and discriminator. For the discriminator, class conditioning can be achieved by either (1) directly incorporating labels as input or (2) involving labels in an auxiliary classification loss. In this paper, we show that the former directly aligns the class-conditioned fake-and-real data distributions P(\\text image |\\text class ) ( data matching ), while the latter aligns data-conditioned class distributions P(\\text class |\\text image ) ( label matching ). Although class separability does not directly translate to sample quality, the discriminator cannot provide useful guidance for the generator if features of distinct classes are mapped to the same point and thus become inseparable. Motivated by this intuition, we propose a Dual Projection GAN (P2GAN) model that learns to balance between data matching and label matching . We then propose an improved cGAN model with Auxiliary Classification that directly aligns the fake and real conditionals P(\\text class |\\text image ) by minimizing their f\\mhyphen\\text divergence . 
Experiments on a synthetic Mixture of Gaussian (MoG) dataset and a variety of real-world datasets including CIFAR100, ImageNet, and VGGFace2 demonstrate the efficacy of our proposed models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Han_Dual_Projection_Generative_Adversarial_Networks_for_Conditional_Image_Generation_ICCV_2021_paper.pdf", @@ -11161,14 +11918,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Han_Dual_Projection_Generative_Adversarial_Networks_for_Conditional_Image_Generation_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;2;1;0", - "aff_unique_norm": "Rutgers University;NEC Labs America;University of Texas at Austin", + "aff_unique_norm": "Rutgers University;NEC Labs America;The University of Texas at Austin", "aff_unique_dep": "Department of Computer Science;;McCombs School of Business", "aff_unique_url": "https://www.rutgers.edu;https://www.nec-labs.com;https://www.mccombs.utexas.edu", "aff_unique_abbr": "Rutgers;NEC LA;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Han_2021_ICCV,\n \n author = {\n Han,\n Ligong and Min,\n Martin Renqiang and Stathopoulos,\n Anastasis and Tian,\n Yu and Gao,\n Ruijiang and Kadav,\n Asim and Metaxas,\n Dimitris N.\n},\n title = {\n Dual Projection Generative Adversarial Networks for Conditional Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14438-14447\n} \n}" }, { "title": "Dual Transfer Learning for Event-Based End-Task Prediction via Pluggable Event to Image Translation", @@ -11176,6 +11934,7 @@ "status": "Poster", "track": "main", "pid": 1576, + "author_site": "Lin Wang; Yujeong Chae; Kuk-Jin Yoon", "author": "Lin Wang; Yujeong Chae; Kuk-Jin 
Yoon", "abstract": "Event cameras are novel sensors that perceive the per-pixel intensity changes and output asynchronous event streams with high dynamic range and less motion blur. It has been shown that events alone can be used for end-task learning, e.g., semantic segmentation, based on encoder-decoder-like networks. However, as events are sparse and mostly reflect edge information, it is difficult to recover original details merely relying on the decoder. Moreover, most methods resort to the pixel-wise loss alone for supervision, which might be insufficient to fully exploit the visual details from sparse events, thus leading to less optimal performance. In this paper, we propose a simple yet flexible two-stream framework named Dual Transfer Learning (DTL) to effectively enhance the performance on the end-tasks without adding extra inference cost. The proposed approach consists of three parts: event to end-task learning (EEL) branch, event to image translation (EIT) branch, and transfer learning (TL) module that simultaneously explores the feature-level affinity information and pixel-level knowledge from the EIT branch to improve the EEL branch. 
This simple yet novel method leads to strong representation learning from events and is evidenced by the significant performance boost on the end-tasks such as semantic segmentation and depth estimation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Dual_Transfer_Learning_for_Event-Based_End-Task_Prediction_via_Pluggable_Event_ICCV_2021_paper.pdf", @@ -11199,7 +11958,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Lin and Chae,\n Yujeong and Yoon,\n Kuk-Jin\n},\n title = {\n Dual Transfer Learning for Event-Based End-Task Prediction via Pluggable Event to Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2135-2145\n} \n}" }, { "title": "Dual-Camera Super-Resolution With Aligned Attention Modules", @@ -11207,6 +11967,7 @@ "status": "Poster", "track": "main", "pid": 3776, + "author_site": "Tengfei Wang; Jiaxin Xie; Wenxiu Sun; Qiong Yan; Qifeng Chen", "author": "Tengfei Wang; Jiaxin Xie; Wenxiu Sun; Qiong Yan; Qifeng Chen", "abstract": "We present a novel approach to reference-based super-resolution (RefSR) with the focus on dual-camera super-resolution (DCSR), which utilizes reference images for high-quality and high-fidelity results. Our proposed method generalizes the standard patch-based feature matching with spatial alignment operations. We further explore the dual-camera super-resolution that is one promising application of RefSR, and build a dataset that consists of 146 image pairs from the main and telephoto cameras in a smartphone. To bridge the domain gaps between real-world images and the training images, we propose a self-supervised domain adaptation strategy for real-world images. 
Extensive experiments on our dataset and a public benchmark demonstrate clear improvement achieved by our method over state of the art in both quantitative evaluation and visual comparisons.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Dual-Camera_Super-Resolution_With_Aligned_Attention_Modules_ICCV_2021_paper.pdf", @@ -11226,11 +11987,12 @@ "aff_unique_norm": "Hong Kong University of Science and Technology;SenseTime;Tetras AI", "aff_unique_dep": ";SenseTime Research;", "aff_unique_url": "https://www.ust.hk;https://www.sensetime.com;", - "aff_unique_abbr": "HKUST;SenseTime;Tetras AI", + "aff_unique_abbr": "HKUST;SenseTime;Tetras.AI", "aff_campus_unique_index": "0;0;;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Tengfei and Xie,\n Jiaxin and Sun,\n Wenxiu and Yan,\n Qiong and Chen,\n Qifeng\n},\n title = {\n Dual-Camera Super-Resolution With Aligned Attention Modules\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2001-2010\n} \n}" }, { "title": "DualPoseNet: Category-Level 6D Object Pose and Size Estimation Using Dual Pose Network With Refined Learning of Pose Consistency", @@ -11238,10 +12000,11 @@ "status": "Poster", "track": "main", "pid": 4099, + "author_site": "Jiehong Lin; Zewei Wei; Zhihao Li; Songcen Xu; Kui Jia; Yuanqing Li", "author": "Jiehong Lin; Zewei Wei; Zhihao Li; Songcen Xu; Kui Jia; Yuanqing Li", "abstract": "Category-level 6D object pose and size estimation is to predict full pose configurations of rotation, translation, and size for object instances observed in single, arbitrary views of cluttered scenes. 
In this paper, we propose a new method of Dual Pose Network with refined learning of pose consistency for this task, shortened as DualPoseNet. DualPoseNet stacks two parallel pose decoders on top of a shared pose encoder, where the implicit decoder predicts object poses with a working mechanism different from that of the explicit one; they thus impose complementary supervision on the training of pose encoder. We construct the encoder based on spherical convolutions, and design a module of Spherical Fusion wherein for a better embedding of pose-sensitive features from the appearance and shape observations. Given no testing CAD models, it is the novel introduction of the implicit decoder that enables the refined pose prediction during testing, by enforcing the predicted pose consistency between the two decoders using a self-adaptive loss term. Thorough experiments on benchmarks of both category- and instance-level object pose datasets confirm efficacy of our designs. DualPoseNet outperforms existing methods with a large margin in the regime of high precision. Our code is released publicly at https://github.com/Gorilla-Lab-SCUT/DualPoseNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_DualPoseNet_Category-Level_6D_Object_Pose_and_Size_Estimation_Using_Dual_ICCV_2021_paper.pdf", - "aff": "South China University of Technology; South China University of Technology+DexForce Technology Co. Ltd.; Noah\u2019s Ark Lab, Huawei Technologies Co. Ltd.; Noah\u2019s Ark Lab, Huawei Technologies Co. Ltd.; South China University of Technology; South China University of Technology", + "aff": "South China University of Technology; South China University of Technology+DexForce Technology Co. Ltd.; Noah’s Ark Lab, Huawei Technologies Co. Ltd.; Noah’s Ark Lab, Huawei Technologies Co. 
Ltd.; South China University of Technology; South China University of Technology", "project": "", "github": "https://github.com/Gorilla-Lab-SCUT/DualPoseNet", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Lin_DualPoseNet_Category-Level_6D_ICCV_2021_supplemental.pdf", @@ -11254,14 +12017,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_DualPoseNet_Category-Level_6D_Object_Pose_and_Size_Estimation_Using_Dual_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;2;2;0;0", - "aff_unique_norm": "South China University of Technology;DexForce Technology Co. Ltd.;Huawei", - "aff_unique_dep": ";;Noah\u2019s Ark Lab", + "aff_unique_norm": "South China University of Technology;DexForce Technology Co. Ltd.;Huawei Technologies Co. Ltd.", + "aff_unique_dep": ";;Noah’s Ark Lab", "aff_unique_url": "https://www.scut.edu.cn;;https://www.huawei.com", "aff_unique_abbr": "SCUT;;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Jiehong and Wei,\n Zewei and Li,\n Zhihao and Xu,\n Songcen and Jia,\n Kui and Li,\n Yuanqing\n},\n title = {\n DualPoseNet: Category-Level 6D Object Pose and Size Estimation Using Dual Pose Network With Refined Learning of Pose Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3560-3569\n} \n}" }, { "title": "Dynamic Attentive Graph Learning for Image Restoration", @@ -11269,6 +12033,7 @@ "status": "Poster", "track": "main", "pid": 2229, + "author_site": "Chong Mou; Jian Zhang; Zhuoyuan Wu", "author": "Chong Mou; Jian Zhang; Zhuoyuan Wu", "abstract": "Non-local self-similarity in natural images has been verified to be an effective prior for image restoration. 
However, most existing deep non-local methods assign a fixed number of neighbors for each query item, neglecting the dynamics of non-local correlations. Moreover, the non-local correlations are usually based on pixels, prone to be biased due to image degradation. To rectify these weaknesses, in this paper, we propose a dynamic attentive graph learning model (DAGL) to explore the dynamic non-local property on patch level for image restoration. Specifically, we propose an improved graph model to perform patch-wise graph convolution with a dynamic and adaptive number of neighbors for each node. In this way, image content can adaptively balance over-smooth and over-sharp artifacts through the number of its connected neighbors, and the patch-wise non-local correlations can enhance the message passing process. Experimental results on various image restoration tasks: synthetic image denoising, real image denoising, image demosaicing, and compression artifact reduction show that our DAGL can produce state-of-the-art results with superior accuracy and visual quality. 
The source code is available at https://github.com/jianzhangcs/DAGL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mou_Dynamic_Attentive_Graph_Learning_for_Image_Restoration_ICCV_2021_paper.pdf", @@ -11285,14 +12050,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mou_Dynamic_Attentive_Graph_Learning_for_Image_Restoration_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;0", - "aff_unique_norm": "Peking University;Pengcheng Laboratory", - "aff_unique_dep": ";Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory", + "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": "0;0+0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Mou_2021_ICCV,\n \n author = {\n Mou,\n Chong and Zhang,\n Jian and Wu,\n Zhuoyuan\n},\n title = {\n Dynamic Attentive Graph Learning for Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4328-4337\n} \n}" }, { "title": "Dynamic CT Reconstruction From Limited Views With Implicit Neural Representations and Parametric Motion Fields", @@ -11300,6 +12066,7 @@ "status": "Poster", "track": "main", "pid": 6317, + "author_site": "Albert W. Reed; Hyojin Kim; Rushil Anirudh; K. Aditya Mohan; Kyle Champley; Jingu Kang; Suren Jayasuriya", "author": "Albert W. Reed; Hyojin Kim; Rushil Anirudh; K. Aditya Mohan; Kyle Champley; Jingu Kang; Suren Jayasuriya", "abstract": "Reconstructing dynamic, time-varying scenes with computed tomography (4D-CT) is a challenging and ill-posed problem common to industrial and medical settings. 
Existing 4D-CT reconstructions are designed for sparse sampling schemes that require fast CT scanners to capture multiple, rapid revolutions around the scene in order to generate high quality results. However, if the scene is moving too fast, then the sampling occurs along a limited view and is difficult to reconstruct due to spatiotemporal ambiguities. In this work, we design a reconstruction pipeline using implicit neural representations coupled with a novel parametric motion field warping to perform limited view 4D-CT reconstruction of rapidly deforming scenes. Importantly, we utilize a differentiable analysis-by-synthesis approach to compare with captured x-ray sinogram data in a self-supervised fashion. Thus, our resulting optimization method requires no training data to reconstruct the scene. We demonstrate that our proposed system robustly reconstructs scenes containing deformable and periodic motion and validate against state-of-the-art baselines. Further, we demonstrate an ability to reconstruct continuous spatiotemporal representations of our scenes and upsample them to arbitrary volumes and frame rates post-optimization. This research opens a new avenue for implicit neural representations in computed tomography reconstruction in general. Code is available at https://github.com/awreed/DynamicCTReconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Reed_Dynamic_CT_Reconstruction_From_Limited_Views_With_Implicit_Neural_Representations_ICCV_2021_paper.pdf", @@ -11323,7 +12090,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Reed_2021_ICCV,\n \n author = {\n Reed,\n Albert W. and Kim,\n Hyojin and Anirudh,\n Rushil and Mohan,\n K. 
Aditya and Champley,\n Kyle and Kang,\n Jingu and Jayasuriya,\n Suren\n},\n title = {\n Dynamic CT Reconstruction From Limited Views With Implicit Neural Representations and Parametric Motion Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2258-2268\n} \n}" }, { "title": "Dynamic Context-Sensitive Filtering Network for Video Salient Object Detection", @@ -11331,6 +12099,7 @@ "status": "Poster", "track": "main", "pid": 7241, + "author_site": "Miao Zhang; Jie Liu; Yifei Wang; Yongri Piao; Shunyu Yao; Wei Ji; Jingjing Li; Huchuan Lu; Zhongxuan Luo", "author": "Miao Zhang; Jie Liu; Yifei Wang; Yongri Piao; Shunyu Yao; Wei Ji; Jingjing Li; Huchuan Lu; Zhongxuan Luo", "abstract": "The ability to capture inter-frame dynamics has been critical to the development of video salient object detection (VSOD). While many works have achieved great success in this field, a deeper insight into its dynamic nature should be developed. In this work, we aim to answer the following questions: How can a model adjust itself to dynamic variations as well as perceive fine differences in the real-world environment; How are the temporal dynamics well introduced into spatial information over time? To this end, we propose a dynamic context-sensitive filtering network (DCFNet) equipped with a dynamic context-sensitive filtering module (DCFM) and an effective bidirectional dynamic fusion strategy. The proposed DCFM sheds new light on dynamic filter generation by extracting location-related affinities between consecutive frames. Our bidirectional dynamic fusion strategy encourages the interaction of spatial and temporal information in a dynamic manner. Experimental results demonstrate that our proposed method can achieve state-of-the-art performance on most VSOD datasets while ensuring a real-time speed of 28 fps. 
The source code is publicly available at https://github.com/OIPLab-DUT/DCFNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Dynamic_Context-Sensitive_Filtering_Network_for_Video_Salient_Object_Detection_ICCV_2021_paper.pdf", @@ -11345,7 +12114,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Dynamic_Context-Sensitive_Filtering_Network_for_Video_Salient_Object_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Dynamic_Context-Sensitive_Filtering_Network_for_Video_Salient_Object_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Miao and Liu,\n Jie and Wang,\n Yifei and Piao,\n Yongri and Yao,\n Shunyu and Ji,\n Wei and Li,\n Jingjing and Lu,\n Huchuan and Luo,\n Zhongxuan\n},\n title = {\n Dynamic Context-Sensitive Filtering Network for Video Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1553-1563\n} \n}" }, { "title": "Dynamic Cross Feature Fusion for Remote Sensing Pansharpening", @@ -11353,6 +12123,7 @@ "status": "Poster", "track": "main", "pid": 10009, + "author_site": "Xiao Wu; Ting-Zhu Huang; Liang-Jian Deng; Tian-Jing Zhang", "author": "Xiao Wu; Ting-Zhu Huang; Liang-Jian Deng; Tian-Jing Zhang", "abstract": "Deep Convolution Neural Networks have been adopted for pansharpening and achieved state-of-the-art performance. However, most of the existing works mainly focus on single-scale feature fusion, which leads to failure in fully considering relationships of information between high-level semantics and low-level features, despite the network is deep enough. In this paper, we propose a dynamic cross feature fusion network (DCFNet) for pansharpening. 
Specifically, DCFNet contains multiple parallel branches, including a high-resolution branch served as the backbone, and the low-resolution branches progressively supplemented into the backbone. Thus our DCFNet can represent the overall information well. In order to enhance the relationships of inter-branches, dynamic cross feature transfers are embedded into multiple branches to obtain high-resolution representations. Then contextualized features will be learned to improve the fusion of information. Experimental results indicate that DCFNet significantly outperforms the prior arts in both quantitative indicators and visual qualities.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Dynamic_Cross_Feature_Fusion_for_Remote_Sensing_Pansharpening_ICCV_2021_paper.pdf", @@ -11376,7 +12147,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Xiao and Huang,\n Ting-Zhu and Deng,\n Liang-Jian and Zhang,\n Tian-Jing\n},\n title = {\n Dynamic Cross Feature Fusion for Remote Sensing Pansharpening\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14687-14696\n} \n}" }, { "title": "Dynamic DETR: End-to-End Object Detection With Dynamic Attention", @@ -11384,6 +12156,7 @@ "status": "Poster", "track": "main", "pid": 9376, + "author_site": "Xiyang Dai; Yinpeng Chen; Jianwei Yang; Pengchuan Zhang; Lu Yuan; Lei Zhang", "author": "Xiyang Dai; Yinpeng Chen; Jianwei Yang; Pengchuan Zhang; Lu Yuan; Lei Zhang", "abstract": "In this paper, we present a novel Dynamic DETR (Detection with Transformers) approach by introducing dynamic attentions into both the encoder and decoder stages of DETR to break its two limitations on small feature resolution and slow training convergence. 
To address the first limitation, which is due to the quadratic computational complexity of the self-attention module in Transformer encoders, we propose a dynamic encoder to approximate the Transformer encoder's attention mechanism using a convolution-based dynamic encoder with various attention types. Such an encoder can dynamically adjust attentions based on multiple factors such as scale importance, spatial importance, and representation (i.e., feature dimension) importance. To mitigate the second limitation of learning difficulty, we introduce a dynamic decoder by replacing the cross-attention module with a ROI-based dynamic attention in the Transformer decoder. Such a decoder effectively assists Transformers to focus on region of interests from a coarse-to-fine manner and dramatically lowers the learning difficulty, leading to a much faster convergence with fewer training epochs. We conduct a series of experiments to demonstrate our advantages. Our Dynamic DETR significantly reduces the training epochs (by \\bf 14x ), yet results in a much better performance (by \\bf 3.6 on mAP). Meanwhile, in the standard 1x setup with ResNet-50 backbone, we archive a new state-of-the-art performance that further proves the learning effectiveness of the proposed approach. 
Code will be released soon.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dai_Dynamic_DETR_End-to-End_Object_Detection_With_Dynamic_Attention_ICCV_2021_paper.pdf", @@ -11400,14 +12173,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Dynamic_DETR_End-to-End_Object_Detection_With_Dynamic_Attention_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Microsoft", - "aff_unique_dep": "Microsoft Corporation", + "aff_unique_norm": "Microsoft Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dai_2021_ICCV,\n \n author = {\n Dai,\n Xiyang and Chen,\n Yinpeng and Yang,\n Jianwei and Zhang,\n Pengchuan and Yuan,\n Lu and Zhang,\n Lei\n},\n title = {\n Dynamic DETR: End-to-End Object Detection With Dynamic Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2988-2997\n} \n}" }, { "title": "Dynamic Divide-and-Conquer Adversarial Training for Robust Semantic Segmentation", @@ -11415,6 +12189,7 @@ "status": "Poster", "track": "main", "pid": 4219, + "author_site": "Xiaogang Xu; Hengshuang Zhao; Jiaya Jia", "author": "Xiaogang Xu; Hengshuang Zhao; Jiaya Jia", "abstract": "Adversarial training is promising for improving robustness of deep neural networks towards adversarial perturbations, especially on the classification task. The effect of this type of training on semantic segmentation, contrarily, just commences. 
We make the initial attempt to explore the defense strategy on semantic segmentation by formulating a general adversarial training procedure that can perform decently on both adversarial and clean samples. We propose a dynamic divide-and-conquer adversarial training (DDC-AT) strategy to enhance the defense effect, by setting additional branches in the target model during training, and dealing with pixels with diverse properties towards adversarial perturbation. Our dynamical division mechanism divides pixels into multiple branches automatically. Note all these additional branches can be abandoned during inference and thus leave no extra parameter and computation cost. Extensive experiments with various segmentation models are conducted on PASCAL VOC 2012 and Cityscapes datasets, in which DDC-AT yields satisfying performance under both white- and black-box attack. The code is available at https://github.com/dvlab-research/Robust-Semantic-Segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Dynamic_Divide-and-Conquer_Adversarial_Training_for_Robust_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -11431,14 +12206,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Dynamic_Divide-and-Conquer_Adversarial_Training_for_Robust_Semantic_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;0+3", - "aff_unique_norm": "Chinese University of Hong Kong;University of Oxford;University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;University of Oxford;The University of Hong Kong;SmartMore", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ox.ac.uk;https://www.hku.hk;", "aff_unique_abbr": "CUHK;Oxford;HKU;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "China;United Kingdom;" + "aff_country_unique": "China;United Kingdom;", + "bibtex": 
"@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Xiaogang and Zhao,\n Hengshuang and Jia,\n Jiaya\n},\n title = {\n Dynamic Divide-and-Conquer Adversarial Training for Robust Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7486-7495\n} \n}" }, { "title": "Dynamic Dual Gating Neural Networks", @@ -11446,6 +12222,7 @@ "status": "Poster", "track": "main", "pid": 9800, + "author_site": "Fanrong Li; Gang Li; Xiangyu He; Jian Cheng", "author": "Fanrong Li; Gang Li; Xiangyu He; Jian Cheng", "abstract": "In dynamic neural networks that adapt computations to different inputs, gating-based methods have demonstrated notable generality and applicability in trading-off the model complexity and accuracy. However, existing works only explore the redundancy from a single point of the network, limiting the performance. In this paper, we propose dual gating, a new dynamic computing method, to reduce the model complexity at run-time. For each convolutional block, dual gating identifies the informative features along two separate dimensions, spatial and channel. Specifically, the spatial gating module estimates which areas are essential, and the channel gating module predicts the salient channels that contribute more to the results. Then the computation of both unimportant regions and irrelevant channels can be skipped dynamically during inference. Extensive experiments on a variety of datasets demonstrate that our method can achieve higher accuracy under similar computing budgets compared with other dynamic execution methods. 
In particular, dynamic dual gating can provide 59.7% saving in computing of ResNet50 with 76.41% top-1 accuracy on ImageNet, which has advanced the state-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Dynamic_Dual_Gating_Neural_Networks_ICCV_2021_paper.pdf", @@ -11469,7 +12246,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Fanrong and Li,\n Gang and He,\n Xiangyu and Cheng,\n Jian\n},\n title = {\n Dynamic Dual Gating Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5330-5339\n} \n}" }, { "title": "Dynamic High-Pass Filtering and Multi-Spectral Attention for Image Super-Resolution", @@ -11477,6 +12255,7 @@ "status": "Poster", "track": "main", "pid": 2063, + "author_site": "Salma Abdel Magid; Yulun Zhang; Donglai Wei; Won-Dong Jang; Zudi Lin; Yun Fu; Hanspeter Pfister", "author": "Salma Abdel Magid; Yulun Zhang; Donglai Wei; Won-Dong Jang; Zudi Lin; Yun Fu; Hanspeter Pfister", "abstract": "Deep convolutional neural networks (CNNs) have pushed forward the frontier of super-resolution (SR) research. However, current CNN models exhibit a major flaw: they are biased towards learning low-frequency signals. This bias becomes more problematic for the image SR task which targets reconstructing all fine details and image textures. To tackle this challenge, we propose to improve the learning of high-frequency features both locally and globally and introduce two novel architectural units to existing SR models. Specifically, we propose a dynamic high-pass filtering (HPF) module that locally applies adaptive filter weights for each spatial location and channel group to preserve high-frequency signals. 
We also propose a matrix multi-spectral channel attention (MMCA) module that predicts the attention map of features decomposed in the frequency domain. This module operates in a global context to adaptively recalibrate feature responses at different frequencies. Extensive qualitative and quantitative results demonstrate that our proposed modules achieve better accuracy and visual improvements against state-of-the-art methods on several benchmark datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Magid_Dynamic_High-Pass_Filtering_and_Multi-Spectral_Attention_for_Image_Super-Resolution_ICCV_2021_paper.pdf", @@ -11491,7 +12270,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Magid_Dynamic_High-Pass_Filtering_and_Multi-Spectral_Attention_for_Image_Super-Resolution_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Magid_Dynamic_High-Pass_Filtering_and_Multi-Spectral_Attention_for_Image_Super-Resolution_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Magid_2021_ICCV,\n \n author = {\n Magid,\n Salma Abdel and Zhang,\n Yulun and Wei,\n Donglai and Jang,\n Won-Dong and Lin,\n Zudi and Fu,\n Yun and Pfister,\n Hanspeter\n},\n title = {\n Dynamic High-Pass Filtering and Multi-Spectral Attention for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4288-4297\n} \n}" }, { "title": "Dynamic Network Quantization for Efficient Video Inference", @@ -11499,6 +12279,7 @@ "status": "Poster", "track": "main", "pid": 6281, + "author_site": "Ximeng Sun; Rameswar Panda; Chun-Fu (Richard) Chen; Aude Oliva; Rogerio Feris; Kate Saenko", "author": "Ximeng Sun; Rameswar Panda; Chun-Fu (Richard) Chen; Aude Oliva; Rogerio Feris; Kate Saenko", "abstract": "Deep convolutional networks have recently achieved great success in 
video recognition, yet their practical realization remains a challenge due to the large amount of computational resources required to achieve robust recognition. Motivated by the effectiveness of quantization for boosting efficiency, in this paper, we propose a dynamic network quantization framework, that selects optimal precision for each frame conditioned on the input for efficient video recognition. Specifically, given a video clip, we train a very lightweight network in parallel with the recognition network, to produce a dynamic policy indicating which numerical precision to be used per frame in recognizing videos. We train both networks effectively using standard backpropagation with a loss to achieve both competitive performance and resource efficiency required for video recognition. Extensive experiments on four challenging diverse benchmark datasets demonstrate that our proposed approach provides significant savings in computation and memory usage while outperforming the existing state-of-the-art methods. 
Project page: https://cs-people.bu.edu/sunxm/VideoIQ/project.html.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Dynamic_Network_Quantization_for_Efficient_Video_Inference_ICCV_2021_paper.pdf", @@ -11522,7 +12303,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Ximeng and Panda,\n Rameswar and Chen,\n Chun-Fu (Richard) and Oliva,\n Aude and Feris,\n Rogerio and Saenko,\n Kate\n},\n title = {\n Dynamic Network Quantization for Efficient Video Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7375-7385\n} \n}" }, { "title": "Dynamic Surface Function Networks for Clothed Human Bodies", @@ -11530,7 +12312,8 @@ "status": "Poster", "track": "main", "pid": 3901, - "author": "Andrei Burov; Matthias Nie\u00dfner; Justus Thies", + "author_site": "Andrei Burov; Matthias Nießner; Justus Thies", + "author": "Andrei Burov; Matthias Nießner; Justus Thies", "abstract": "We present a novel method for temporal coherent reconstruction and tracking of clothed humans. Given a monocular RGB-D sequence, we learn a person-specific body model which is based on a dynamic surface function network. To this end, we explicitly model the surface of the person using a multi-layer perceptron (MLP) which is embedded into the canonical space of the SMPL body model. With classical forward rendering, the represented surface can be rasterized using the topology of a template mesh. For each surface point of the template mesh, the MLP is evaluated to predict the actual surface location. To handle pose-dependent deformations, the MLP is conditioned on the SMPL pose parameters. 
We show that this surface representation as well as the pose parameters can be learned in a self-supervised fashion using the principle of analysis-by-synthesis and differentiable rasterization. As a result, we are able to reconstruct a temporally coherent mesh sequence from the input data. The underlying surface representation can be used to synthesize new animations of the reconstructed person including pose-dependent deformations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Burov_Dynamic_Surface_Function_Networks_for_Clothed_Human_Bodies_ICCV_2021_paper.pdf", "aff": ";;", @@ -11544,7 +12327,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Burov_Dynamic_Surface_Function_Networks_for_Clothed_Human_Bodies_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Burov_Dynamic_Surface_Function_Networks_for_Clothed_Human_Bodies_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Burov_2021_ICCV,\n \n author = {\n Burov,\n Andrei and Nie{\\ss\n}ner,\n Matthias and Thies,\n Justus\n},\n title = {\n Dynamic Surface Function Networks for Clothed Human Bodies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10754-10764\n} \n}" }, { "title": "Dynamic View Synthesis From Dynamic Monocular Video", @@ -11552,6 +12336,7 @@ "status": "Poster", "track": "main", "pid": 2890, + "author_site": "Chen Gao; Ayush Saraf; Johannes Kopf; Jia-Bin Huang", "author": "Chen Gao; Ayush Saraf; Johannes Kopf; Jia-Bin Huang", "abstract": "We present an algorithm for generating novel views at arbitrary viewpoints and any input time step given a monocular video of a dynamic scene. Our work builds upon recent advances in neural implicit representation and uses continuous and differentiable functions for modeling the time-varying structure and the appearance of the scene. 
We jointly train a time-invariant static NeRF and a time-varying dynamic NeRF, and learn how to blend the results in an unsupervised manner. However, learning this implicit function from a single video is highly ill-posed (with infinitely many solutions that match the input video). To resolve the ambiguity, we introduce regularization losses to encourage a more physically plausible solution. We show extensive quantitative and qualitative results of dynamic view synthesis from casually captured videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Dynamic_View_Synthesis_From_Dynamic_Monocular_Video_ICCV_2021_paper.pdf", @@ -11568,14 +12353,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gao_Dynamic_View_Synthesis_From_Dynamic_Monocular_Video_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "Virginia Tech;Meta", - "aff_unique_dep": ";Facebook, Inc.", + "aff_unique_norm": "Virginia Tech;Facebook, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.vt.edu;https://www.facebook.com", "aff_unique_abbr": "VT;FB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Chen and Saraf,\n Ayush and Kopf,\n Johannes and Huang,\n Jia-Bin\n},\n title = {\n Dynamic View Synthesis From Dynamic Monocular Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5712-5721\n} \n}" }, { "title": "Dynamical Pose Estimation", @@ -11583,6 +12369,7 @@ "status": "Poster", "track": "main", "pid": 6651, + "author_site": "Heng Yang; Chris Doran; Jean-Jacques Slotine", "author": "Heng Yang; Chris Doran; Jean-Jacques Slotine", "abstract": "We study the problem of aligning two sets of 3D geometric 
primitives given known correspondences. Our first contribution is to show that this primitive alignment framework unifies five perception problems including point cloud registration, primitive (mesh) registration, category-level 3D registration, absolution pose estimation (APE), and category-level APE. Our second contribution is to propose DynAMical Pose estimation (DAMP), the first general and practical algorithm to solve primitive alignment problem by simulating rigid body dynamics arising from virtual springs and damping, where the springs span the shortest distances between corresponding primitives. We evaluate DAMP in simulated and real datasets across all five problems, and demonstrate (i) DAMP always converges to the globally optimal solution in the first three problems with 3D-3D correspondences; (ii) although DAMP sometimes converges to suboptimal solutions in the last two problems with 2D-3D correspondences, using a scheme for escaping local minima, DAMP always succeeds. Our third contribution is to demystify the surprising empirical performance of DAMP and formally prove a global convergence result in the case of point cloud registration by charactering local stability of the equilibrium points of the underlying dynamical system.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Dynamical_Pose_Estimation_ICCV_2021_paper.pdf", @@ -11597,7 +12384,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Dynamical_Pose_Estimation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Dynamical_Pose_Estimation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Heng and Doran,\n Chris and Slotine,\n Jean-Jacques\n},\n title = {\n Dynamical Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2021\n},\n pages = {\n 5926-5935\n} \n}" }, { "title": "E-ViL: A Dataset and Benchmark for Natural Language Explanations in Vision-Language Tasks", @@ -11605,10 +12393,11 @@ "status": "Poster", "track": "main", "pid": 10711, + "author_site": "Maxime Kayser; Oana-Maria Camburu; Leonard Salewski; Cornelius Emde; Virginie Do; Zeynep Akata; Thomas Lukasiewicz", "author": "Maxime Kayser; Oana-Maria Camburu; Leonard Salewski; Cornelius Emde; Virginie Do; Zeynep Akata; Thomas Lukasiewicz", "abstract": "Recently, there has been an increasing number of efforts to introduce models capable of generating natural language explanations (NLEs) for their predictions on vision-language (VL) tasks. Such models are appealing, because they can provide human-friendly and comprehensive explanations. However, there is a lack of comparison between existing methods, which is due to a lack of re-usable evaluation frameworks and a scarcity of datasets. In this work, we introduce e-ViL and e-SNLI-VE. e-ViL is a benchmark for explainable vision-language tasks that establishes a unified evaluation framework and provides the first comprehensive comparison of existing approaches that generate NLEs for VL tasks. It spans four models and three datasets and both automatic metrics and human evaluation are used to assess model-generated explanations. e-SNLI-VE is currently the largest existing VL dataset with NLEs (over 430k instances). We also propose a new model that combines UNITER, which learns joint embeddings of images and text, and GPT-2, a pre-trained language model that is well-suited for text generation. It surpasses the previous state of the art by a large margin across all datasets. 
Code and data are available here: https://github.com/maximek3/e-ViL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kayser_E-ViL_A_Dataset_and_Benchmark_for_Natural_Language_Explanations_in_ICCV_2021_paper.pdf", - "aff": "Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford; University of T\u00fcbingen; Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford + Universit\u00e9 Paris-Dauphine, PSL, and Facebook AI Research; University of T\u00fcbingen + Max Planck Institute for Intelligent Systems + Max Planck Institute for Informatics; Department of Computer Science, University of Oxford", + "aff": "Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford; University of Tübingen; Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford + Université Paris-Dauphine, PSL, and Facebook AI Research; University of Tübingen + Max Planck Institute for Intelligent Systems + Max Planck Institute for Informatics; Department of Computer Science, University of Oxford", "project": "", "github": "https://github.com/maximek3/e-ViL", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Kayser_E-ViL_A_Dataset_ICCV_2021_supplemental.pdf", @@ -11621,14 +12410,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kayser_E-ViL_A_Dataset_and_Benchmark_for_Natural_Language_Explanations_in_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0+2;1+3+4;0", - "aff_unique_norm": "University of Oxford;University of T\u00fcbingen;Universit\u00e9 Paris-Dauphine;Max Planck Institute for Intelligent Systems;Max Planck Institute for Informatics", + "aff_unique_norm": "University of Oxford;University of Tübingen;Université Paris-Dauphine;Max Planck Institute for Intelligent Systems;Max Planck Institute for Informatics", "aff_unique_dep": 
"Department of Computer Science;;;Intelligent Systems;", "aff_unique_url": "https://www.ox.ac.uk;https://www.uni-tuebingen.de/;https://www.univ-paris-dauphine.fr;https://www.mpi-is.mpg.de;https://mpi-inf.mpg.de", - "aff_unique_abbr": "Oxford;Uni T\u00fcbingen;UPD;MPI-IS;MPII", + "aff_unique_abbr": "Oxford;Uni Tübingen;UPD;MPI-IS;MPII", "aff_campus_unique_index": "0;0;0;0;;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;1;0;0+2;1+1+1;0", - "aff_country_unique": "United Kingdom;Germany;France" + "aff_country_unique": "United Kingdom;Germany;France", + "bibtex": "@InProceedings{Kayser_2021_ICCV,\n \n author = {\n Kayser,\n Maxime and Camburu,\n Oana-Maria and Salewski,\n Leonard and Emde,\n Cornelius and Do,\n Virginie and Akata,\n Zeynep and Lukasiewicz,\n Thomas\n},\n title = {\n E-ViL: A Dataset and Benchmark for Natural Language Explanations in Vision-Language Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1244-1254\n} \n}" }, { "title": "EC-DARTS: Inducing Equalized and Consistent Optimization Into DARTS", @@ -11636,10 +12426,11 @@ "status": "Poster", "track": "main", "pid": 2157, + "author_site": "Qinqin Zhou; Xiawu Zheng; Liujuan Cao; Bineng Zhong; Teng Xi; Gang Zhang; Errui Ding; Mingliang Xu; Rongrong Ji", "author": "Qinqin Zhou; Xiawu Zheng; Liujuan Cao; Bineng Zhong; Teng Xi; Gang Zhang; Errui Ding; Mingliang Xu; Rongrong Ji", "abstract": "Based on the relaxed search space, differential architecture search (DARTS) is efficient in searching for a high-performance architecture. However, the unbalanced competition among operations that have different trainable parameters causes the model collapse. Besides, the inconsistent structures in the search and retraining stages causes cross-stage evaluation to be unstable. In this paper, we call these issues as an operation gap and a structure gap in DARTS. 
To shrink these gaps, we propose to induce equalized and consistent optimization in differentiable architecture search (EC-DARTS). EC-DARTS decouples different operations based on their categories to optimize the operation weights so that the operation gap between them is shrinked. Besides, we introduce an induced structural transition to bridge the structure gap between the model structures in the search and retraining stages. Extensive experiments on CIFAR10 and ImageNet demonstrate the effectiveness of our method. Specifically, on CIFAR10, we achieve a test error of 2.39%, while only 0.3 GPU days on NVIDIA TITAN V. On ImageNet, our method achieves a top-1 error of 23.6% under the mobile setting.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_EC-DARTS_Inducing_Equalized_and_Consistent_Optimization_Into_DARTS_ICCV_2021_paper.pdf", - "aff": "MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; Guangxi Key Lab of Multi-Source Information Mining & Security, Guangxi Normal University; Department of Computer Vision Technology (VIS), Baidu Inc.; Department of Computer Vision Technology (VIS), Baidu Inc.; Department of Computer Vision Technology (VIS), Baidu Inc.; Zhengzhou University; MAC Lab, School of Informatics, Xiamen University + Institute of Arti\ufb01cial Intelligence, Xiamen University", + "aff": "MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; Guangxi Key Lab of Multi-Source Information Mining & Security, Guangxi Normal University; Department of Computer Vision Technology (VIS), Baidu Inc.; Department of Computer Vision Technology (VIS), Baidu Inc.; Department of Computer Vision Technology (VIS), Baidu Inc.; Zhengzhou University; MAC Lab, School of Informatics, Xiamen University + Institute of Artificial Intelligence, Xiamen University", 
"project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhou_EC-DARTS_Inducing_Equalized_ICCV_2021_supplemental.pdf", @@ -11652,14 +12443,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_EC-DARTS_Inducing_Equalized_and_Consistent_Optimization_Into_DARTS_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;2;2;2;3;0+0", - "aff_unique_norm": "Xiamen University;Guangxi Normal University;Baidu;Zhengzhou University", + "aff_unique_norm": "Xiamen University;Guangxi Normal University;Baidu Inc.;Zhengzhou University", "aff_unique_dep": "School of Informatics;Guangxi Key Lab of Multi-Source Information Mining & Security;Department of Computer Vision Technology (VIS);", "aff_unique_url": "https://www.xmu.edu.cn;http://www.gxnu.edu.cn;https://www.baidu.com;http://www.zzu.edu.cn", "aff_unique_abbr": "XMU;;Baidu;ZZU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Qinqin and Zheng,\n Xiawu and Cao,\n Liujuan and Zhong,\n Bineng and Xi,\n Teng and Zhang,\n Gang and Ding,\n Errui and Xu,\n Mingliang and Ji,\n Rongrong\n},\n title = {\n EC-DARTS: Inducing Equalized and Consistent Optimization Into DARTS\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11986-11995\n} \n}" }, { "title": "ECACL: A Holistic Framework for Semi-Supervised Domain Adaptation", @@ -11667,6 +12459,7 @@ "status": "Poster", "track": "main", "pid": 5418, + "author_site": "Kai Li; Chang Liu; Handong Zhao; Yulun Zhang; Yun Fu", "author": "Kai Li; Chang Liu; Handong Zhao; Yulun Zhang; Yun Fu", "abstract": "This paper studies Semi-Supervised Domain Adaptation (SSDA), a practical yet under-investigated research topic that aims to 
learn a model of good performance using unlabeled samples and a few labeled samples in the target domain, with the help of labeled samples from a source domain. Several SSDA methods have been proposed recently, which however fail to fully exploit the value of the few labeled target samples. In this paper, we propose Enhanced Categorical Alignment and Consistency Learning (ECACL), a holistic SSDA framework that incorporates multiple mutually complementary domain alignment techniques. ECACL includes two categorical domain alignment techniques that achieve class-level alignment, a strong data augmentation based technique that enhances the model's generalizability and a consistency learning based technique that forces the model to be robust with image perturbations. These techniques are applied on one or multiple of the three inputs (labeled source, unlabeled target, and labeled target) and align the domains from different perspectives. ECACL unifies them together and achieves fairly comprehensive domain alignments that are much better than the existing methods: For example, ECACL raises the state-of-the-art accuracy from 68.4 to 81.1 on VisDA2017 and from 45.5 to 53.4 on DomainNet for the 1-shot setting. 
Our code is available at https://github.com/kailigo/pacl.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_ECACL_A_Holistic_Framework_for_Semi-Supervised_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -11690,7 +12483,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Kai and Liu,\n Chang and Zhao,\n Handong and Zhang,\n Yulun and Fu,\n Yun\n},\n title = {\n ECACL: A Holistic Framework for Semi-Supervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8578-8587\n} \n}" }, { "title": "ECS-Net: Improving Weakly Supervised Semantic Segmentation by Using Connections Between Class Activation Maps", @@ -11698,6 +12492,7 @@ "status": "Poster", "track": "main", "pid": 2955, + "author_site": "Kunyang Sun; Haoqing Shi; Zhengming Zhang; Yongming Huang", "author": "Kunyang Sun; Haoqing Shi; Zhengming Zhang; Yongming Huang", "abstract": "Image-level weakly supervised semantic segmentation is a challenging task. As classification networks tend to capture notable object features and are insensitive to overactivation, class activation map (CAM) is too sparse and rough to guide segmentation network training. Inspired by the fact that erasing distinguishing features force networks to collect new ones from non-discriminative object regions, we using relationships between CAMs to propose a novel weakly supervised method. In this work, we apply these features, learned from erased images, as segmentation supervision, driving network to study robust representation. In specifically, object regions obtained by CAM techniques are erased on images firstly. 
To provide other regions with segmentation supervision, Erased CAM Supervision Net (ECSNet) generates pixel-level labels by predicting segmentation results of those processed images. We also design the rule of suppressing noise to select reliable labels. Our experiments on PASCAL VOC 2012 dataset show that without data annotations except for ground truth image-level labels, our ECS-Net achieves 67.6% mIoU on test set and 66.6% mIoU on val set, outperforming previous state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_ECS-Net_Improving_Weakly_Supervised_Semantic_Segmentation_by_Using_Connections_Between_ICCV_2021_paper.pdf", @@ -11716,12 +12511,13 @@ "aff_unique_index": "0+1;0+1;0+1;0+1", "aff_unique_norm": "Southeast University;Purple Mountain Laboratories", "aff_unique_dep": "National Mobile Communications Research Laboratory;Pervasive Communication Research Center", - "aff_unique_url": "https://www.seu.edu.cn/;http://www.pmlab.com.cn", + "aff_unique_url": "https://www.seu.edu.cn/;http://www.pmlab.com.cn/", "aff_unique_abbr": ";", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Kunyang and Shi,\n Haoqing and Zhang,\n Zhengming and Huang,\n Yongming\n},\n title = {\n ECS-Net: Improving Weakly Supervised Semantic Segmentation by Using Connections Between Class Activation Maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7283-7292\n} \n}" }, { "title": "ELF-VC: Efficient Learned Flexible-Rate Video Coding", @@ -11729,6 +12525,7 @@ "status": "Poster", "track": "main", "pid": 5856, + "author_site": "Oren Rippel; Alexander G. 
Anderson; Kedar Tatwawadi; Sanjay Nair; Craig Lytle; Lubomir Bourdev", "author": "Oren Rippel; Alexander G. Anderson; Kedar Tatwawadi; Sanjay Nair; Craig Lytle; Lubomir Bourdev", "abstract": "While learned video codecs have demonstrated great promise, they have yet to achieve sufficient efficiency for practical deployment. In this work, we propose several ideas for learned video compression which allow for improved performance for the low-latency mode (I- and P-frames only) along with a considerable increase in computational efficiency. In this setting, for natural videos our approach compares favorably across the entire R-D curve under metrics PSNR, MS-SSIM and VMAF against all mainstream video standards (H.264, H.265, AV1) and all ML codecs. At the same time, our approach runs at least 5x faster and has fewer parameters than all ML codecs which report these figures. Our contributions include a flexible-rate framework allowing a single model to cover a large and dense range of bitrates, at a negligible increase in computation and parameter count; an efficient backbone optimized for ML-based codecs; and a novel in-loop flow prediction scheme which leverages prior information towards more efficient compression. We benchmark our method, which we call ELF-VC (Efficient, Learned and Flexible Video Coding) on popular video test sets UVG and MCL-JCV under metrics PSNR, MS-SSIM and VMAF. For example, on UVG under PSNR, it reduces the BD-rate by 44% against H.264, 26% against H.265, 15% against AV1, 35% against the current best ML codec. 
At the same time, on an NVIDIA Titan V GPU our approach encodes/decodes VGA at 49/91 FPS, HD 720 at 19/35 FPS, and HD 1080 at 10/18 FPS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rippel_ELF-VC_Efficient_Learned_Flexible-Rate_Video_Coding_ICCV_2021_paper.pdf", @@ -11752,7 +12549,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Rippel_2021_ICCV,\n \n author = {\n Rippel,\n Oren and Anderson,\n Alexander G. and Tatwawadi,\n Kedar and Nair,\n Sanjay and Lytle,\n Craig and Bourdev,\n Lubomir\n},\n title = {\n ELF-VC: Efficient Learned Flexible-Rate Video Coding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14479-14488\n} \n}" }, { "title": "ELLIPSDF: Joint Object Pose and Shape Optimization With a Bi-Level Ellipsoid and Signed Distance Function Description", @@ -11760,6 +12558,7 @@ "status": "Poster", "track": "main", "pid": 2741, + "author_site": "Mo Shan; Qiaojun Feng; You-Yi Jau; Nikolay Atanasov", "author": "Mo Shan; Qiaojun Feng; You-Yi Jau; Nikolay Atanasov", "abstract": "Autonomous systems need to understand the semantics and geometry of their surroundings in order to comprehend and safely execute object-level task specifications. This paper proposes an expressive yet compact model for joint object pose and shape optimization, and an associated optimization algorithm to infer an object-level map from multi-view RGB-D camera observations. The model is expressive because it captures the identities, positions, orientations, and shapes of objects in the environment. It is compact because it relies on a low-dimensional latent representation of implicit object shape, allowing onboard storage of large multi-category object maps. 
Different from other works that rely on a single object representation format, our approach has a bi-level object model that captures both the coarse level scale as well as the fine level shape details. Our approach is evaluated on the large-scale real-world ScanNet dataset and compared against state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shan_ELLIPSDF_Joint_Object_Pose_and_Shape_Optimization_With_a_Bi-Level_ICCV_2021_paper.pdf", @@ -11783,7 +12582,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shan_2021_ICCV,\n \n author = {\n Shan,\n Mo and Feng,\n Qiaojun and Jau,\n You-Yi and Atanasov,\n Nikolay\n},\n title = {\n ELLIPSDF: Joint Object Pose and Shape Optimization With a Bi-Level Ellipsoid and Signed Distance Function Description\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5946-5955\n} \n}" }, { "title": "ELSD: Efficient Line Segment Detector and Descriptor", @@ -11791,6 +12591,7 @@ "status": "Poster", "track": "main", "pid": 3054, + "author_site": "Haotian Zhang; Yicheng Luo; Fangbo Qin; Yijia He; Xiao Liu", "author": "Haotian Zhang; Yicheng Luo; Fangbo Qin; Yijia He; Xiao Liu", "abstract": "We present the novel Efficient Line Segment Detector and Descriptor (ELSD) to simultaneously detect line segments and extract their descriptors in an image. Unlike the traditional pipelines that conduct detection and description separately, ELSD utilizes a shared feature extractor for both detection and description, to provide the essential line features to the higher-level tasks like SLAM and image matching in real time. 
First, we design a one-stage compact model, and propose to use the mid-point, angle and length as the minimal representation of line segment, which also guarantees the center-symmetry. The non-centerness suppression is proposed to filter out the fragmented line segments caused by lines' intersections. The fine offset prediction is designed to refine the mid-point localization. Second, the line descriptor branch is integrated with the detector branch, and the two branches are jointly trained in an end-to-end manner. In the experiments, the proposed ELSD achieves the state-of-the-art performance on the Wireframe dataset and YorkUrban dataset, in both accuracy and efficiency. The line description ability of ELSD also outperforms the previous works on the line matching task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_ELSD_Efficient_Line_Segment_Detector_and_Descriptor_ICCV_2021_paper.pdf", @@ -11814,7 +12615,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Haotian and Luo,\n Yicheng and Qin,\n Fangbo and He,\n Yijia and Liu,\n Xiao\n},\n title = {\n ELSD: Efficient Line Segment Detector and Descriptor\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2969-2978\n} \n}" }, { "title": "EM-POSE: 3D Human Pose Estimation From Sparse Electromagnetic Trackers", @@ -11822,6 +12624,7 @@ "status": "Poster", "track": "main", "pid": 4305, + "author_site": "Manuel Kaufmann; Yi Zhao; Chengcheng Tang; Lingling Tao; Christopher Twigg; Jie Song; Robert Wang; Otmar Hilliges", "author": "Manuel Kaufmann; Yi Zhao; Chengcheng Tang; Lingling Tao; Christopher Twigg; Jie Song; Robert Wang; Otmar Hilliges", "abstract": "Fully immersive experiences in AR/VR depend on 
reconstructing the full body pose of the user without restricting their motion. In this paper we study the use of body-worn electromagnetic (EM) field-based sensing for the task of 3D human pose reconstruction. To this end, we present a method to estimate SMPL parameters from 6-12 EM sensors. We leverage a customized wearable system consisting of wireless EM sensors measuring time-synchronized 6D poses at 120 Hz. To provide accurate poses even with little user instrumentation, we adopt a recently proposed hybrid framework, learned gradient descent (LGD), to iteratively estimate SMPL pose and shape from our input measurements. This allows us to harness powerful pose priors to cope with the idiosyncrasies of the input data and achieve accurate pose estimates. The proposed method uses AMASS to synthesize virtual EM-sensor data and we show that it generalizes well to a newly captured real dataset consisting of a total of 36 minutes of motion from 5 subjects. We achieve reconstruction errors as low as 31.8 mm and 13.3 degrees, outperforming both pure learning- and pure optimization-based methods. 
Code and data is available under https://ait.ethz.ch/projects/2021/em-pose.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kaufmann_EM-POSE_3D_Human_Pose_Estimation_From_Sparse_Electromagnetic_Trackers_ICCV_2021_paper.pdf", @@ -11836,7 +12639,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kaufmann_EM-POSE_3D_Human_Pose_Estimation_From_Sparse_Electromagnetic_Trackers_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kaufmann_EM-POSE_3D_Human_Pose_Estimation_From_Sparse_Electromagnetic_Trackers_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kaufmann_2021_ICCV,\n \n author = {\n Kaufmann,\n Manuel and Zhao,\n Yi and Tang,\n Chengcheng and Tao,\n Lingling and Twigg,\n Christopher and Song,\n Jie and Wang,\n Robert and Hilliges,\n Otmar\n},\n title = {\n EM-POSE: 3D Human Pose Estimation From Sparse Electromagnetic Trackers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11510-11520\n} \n}" }, { "title": "EPP-MVSNet: Epipolar-Assembling Based Depth Prediction for Multi-View Stereo", @@ -11844,6 +12648,7 @@ "status": "Poster", "track": "main", "pid": 9564, + "author_site": "Xinjun Ma; Yue Gong; Qirui Wang; Jingwei Huang; Lei Chen; Fan Yu", "author": "Xinjun Ma; Yue Gong; Qirui Wang; Jingwei Huang; Lei Chen; Fan Yu", "abstract": "In this paper, we proposed EPP-MVSNet, a novel deep learning network for 3D reconstruction from multi-view stereo (MVS). EPP-MVSNet can accurately aggregate features at high resolution to a limited cost volume with an optimal depth range, thus, leads to effective and efficient 3D construction. 
Distinct from existing works which measure feature cost at discrete positions which affects the 3D reconstruction accuracy, EPP-MVSNet introduces an epipolar assembling-based kernel that operates on adaptive intervals along epipolar lines for making full use of the image resolution. Further, we introduce an entropy-based refining strategy where the cost volume describes the space geometry with the little redundancy. Moreover, we design a light-weighted network with Pseudo-3D convolutions integrated to achieve high accuracy and efficiency. We have conducted extensive experiments on challenging datasets Tanks & Temples(TNT), ETH3D and DTU. As a result, we achieve promising results on all datasets and the highest F-Score on the online TNT intermediate benchmark. Code is available at https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/eppmvsnet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ma_EPP-MVSNet_Epipolar-Assembling_Based_Depth_Prediction_for_Multi-View_Stereo_ICCV_2021_paper.pdf", @@ -11860,14 +12665,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ma_EPP-MVSNet_Epipolar-Assembling_Based_Depth_Prediction_for_Multi-View_Stereo_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;0", - "aff_unique_norm": "Huawei;Hong Kong University of Science and Technology", + "aff_unique_norm": "Huawei Technologies;Hong Kong University of Science and Technology", "aff_unique_dep": "Distributed and Parallel Software Lab;Department of Computer Science and Engineering", "aff_unique_url": "https://www.huawei.com;https://www.ust.hk", "aff_unique_abbr": "Huawei;HKUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2021_ICCV,\n \n author = {\n Ma,\n Xinjun and Gong,\n Yue and Wang,\n Qirui and Huang,\n Jingwei and Chen,\n Lei and Yu,\n Fan\n},\n title 
= {\n EPP-MVSNet: Epipolar-Assembling Based Depth Prediction for Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5732-5740\n} \n}" }, { "title": "Editing Conditional Radiance Fields", @@ -11875,6 +12681,7 @@ "status": "Poster", "track": "main", "pid": 4327, + "author_site": "Steven Liu; Xiuming Zhang; Zhoutong Zhang; Richard Zhang; Jun-Yan Zhu; Bryan Russell", "author": "Steven Liu; Xiuming Zhang; Zhoutong Zhang; Richard Zhang; Jun-Yan Zhu; Bryan Russell", "abstract": "A neural radiance field (NeRF) is a scene model supporting high-quality view synthesis, optimized per scene. In this paper, we explore enabling user editing of a category-level NeRF trained on a shape category. Specifically, we propose a method for propagating coarse 2D user scribbles to the 3D space, to modify the color or shape of a local region. First, we propose a conditional radiance field that incorporates new modular network components, including a branch that is shared across object instances in the category. Observing multiple instances of the same category, our model learns underlying part semantics without any supervision, thereby allowing the propagation of coarse 2D user scribbles to the entire 3D region (e.g., chair seat) in a consistent fashion. Next, we investigate for the editing tasks which components of our network require updating. We propose a hybrid network update strategy that targets the later network components, which balances efficiency and accuracy. During user interaction, we formulate an optimization problem that both satisfies the user's constraints and preserves the original object structure. We demonstrate our approach on a variety of editing tasks over three shape datasets and show that it outperforms prior neural editing approaches. 
Finally, we edit the appearance and shape of a real photograph and show that the edit propagates to extrapolated novel views.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Editing_Conditional_Radiance_Fields_ICCV_2021_paper.pdf", @@ -11889,7 +12696,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Editing_Conditional_Radiance_Fields_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Editing_Conditional_Radiance_Fields_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Steven and Zhang,\n Xiuming and Zhang,\n Zhoutong and Zhang,\n Richard and Zhu,\n Jun-Yan and Russell,\n Bryan\n},\n title = {\n Editing Conditional Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5773-5783\n} \n}" }, { "title": "Effectively Leveraging Attributes for Visual Similarity", @@ -11897,6 +12705,7 @@ "status": "Poster", "track": "main", "pid": 5849, + "author_site": "Samarth Mishra; Zhongping Zhang; Yuan Shen; Ranjitha Kumar; Venkatesh Saligrama; Bryan A. Plummer", "author": "Samarth Mishra; Zhongping Zhang; Yuan Shen; Ranjitha Kumar; Venkatesh Saligrama; Bryan A. Plummer", "abstract": "Measuring similarity between two images often requires performing complex reasoning along different axes (e.g., color, texture, or shape). Insights into what might be important for measuring similarity can can be provided by annotated attributes, but prior work tends to view these annotations as complete, resulting in them using a simplistic approach of predicting attributes on single images, which are, in turn, used to measure similarity. However, it is impractical for a dataset to fully annotate every attribute that may be important. 
Thus, only representing images based on these incomplete annotations may miss out on key information. To address this issue, we propose the Pairwise Attribute-informed similarity Network (PAN), which breaks similarity learning into capturing similarity conditions and relevance scores from a joint representation of two images. This enables our model to identify that two images contain the same attribute, but can have it deemed irrelevant (e.g., due to fine-grained differences between them) and ignored for measuring similarity between the two images. Notably, while prior methods of using attribute annotations are often unable to outperform prior art, PAN obtains a 4-9% improvement on compatibility prediction between clothing items on Polyvore Outfits, a 5% gain on few shot classification of images using Caltech-UCSD Birds (CUB), and over 1% boost to Recall@1 on In-Shop Clothes Retrieval.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mishra_Effectively_Leveraging_Attributes_for_Visual_Similarity_ICCV_2021_paper.pdf", @@ -11920,7 +12729,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mishra_2021_ICCV,\n \n author = {\n Mishra,\n Samarth and Zhang,\n Zhongping and Shen,\n Yuan and Kumar,\n Ranjitha and Saligrama,\n Venkatesh and Plummer,\n Bryan A.\n},\n title = {\n Effectively Leveraging Attributes for Visual Similarity\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1015-1024\n} \n}" }, { "title": "Efficient Action Recognition via Dynamic Knowledge Propagation", @@ -11928,6 +12738,7 @@ "status": "Poster", "track": "main", "pid": 7213, + "author_site": "Hanul Kim; Mihir Jain; Jun-Tae Lee; Sungrack Yun; Fatih Porikli", "author": "Hanul Kim; Mihir Jain; 
Jun-Tae Lee; Sungrack Yun; Fatih Porikli", "abstract": "Efficient action recognition has become crucial to extend the success of action recognition to many real-world applications. Contrary to most existing methods, which mainly focus on selecting salient frames to reduce the computation cost, we focus more on making the most of the selected frames. To this end, we employ two networks of different capabilities that operate in tandem to efficiently recognize actions. Given a video, the lighter network processes more frames while the heavier one only processes a few. In order to enable the effective interaction between the two, we propose dynamic knowledge propagation based on a cross-attention mechanism. This is the main component of our framework that is essentially a student-teacher architecture, but as the teacher model continues to interact with the student model during inference, we call it a dynamic student-teacher framework. Through extensive experiments, we demonstrate the effectiveness of each component of our framework. 
Our method outperforms competing state-of-the-art methods on two video datasets: ActivityNet-v1.3 and Mini-Kinetics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Efficient_Action_Recognition_via_Dynamic_Knowledge_Propagation_ICCV_2021_paper.pdf", @@ -11951,7 +12762,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Hanul and Jain,\n Mihir and Lee,\n Jun-Tae and Yun,\n Sungrack and Porikli,\n Fatih\n},\n title = {\n Efficient Action Recognition via Dynamic Knowledge Propagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13719-13728\n} \n}" }, { "title": "Efficient Large Scale Inlier Voting for Geometric Vision Problems", @@ -11959,6 +12771,7 @@ "status": "Poster", "track": "main", "pid": 10801, + "author_site": "Dror Aiger; Simon Lynen; Jan Hosang; Bernhard Zeisl", "author": "Dror Aiger; Simon Lynen; Jan Hosang; Bernhard Zeisl", "abstract": "Outlier rejection and equivalently inlier set optimization is a key ingredient in numerous applications in computer vision such as filtering point-matches in camera pose estimation or plane and normal estimation in point clouds. Several approaches exist, yet at large scale we face a combinatorial explosion of possible solutions and state-of-the-art methods like RANSAC, Hough transform or Branch&Bound require a minimum inlier ratio or prior knowledge to remain practical. In fact, for problems such as camera posing in very large scenes these approaches become useless as they have exponential runtime growth if these conditions aren't met. 
To approach the problem we present a efficient and general algorithm for outlier rejection based on \"intersecting\" k-dimensional surfaces in Rd . We provide a recipe for casting a variety of geometric problems as finding a point in Rd which maximizes the number of nearby surfaces (and thus inliers). The resulting algorithm has linear worst-case complexity with a better runtime dependency in the approximation factor than competing algorithms while not requiring domain specific bounds. This is achieved by introducing a space decomposition scheme that bounds the number of computations by successively rounding and grouping samples. Our recipe (and open-source code) enables anybody to derive such fast approaches to new problems across a wide range of domains. We demonstrate the versatility of the approach on several camera posing problems with a high number of matches at low inlier ratio achieving state-of-the-art results at significantly lower processing times.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Aiger_Efficient_Large_Scale_Inlier_Voting_for_Geometric_Vision_Problems_ICCV_2021_paper.pdf", @@ -11982,7 +12795,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aiger_2021_ICCV,\n \n author = {\n Aiger,\n Dror and Lynen,\n Simon and Hosang,\n Jan and Zeisl,\n Bernhard\n},\n title = {\n Efficient Large Scale Inlier Voting for Geometric Vision Problems\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3243-3251\n} \n}" }, { "title": "Efficient Video Compression via Content-Adaptive Super-Resolution", @@ -11990,6 +12804,7 @@ "status": "Poster", "track": "main", "pid": 9011, + "author_site": "Mehrdad Khani; Vibhaalakshmi Sivaraman; Mohammad Alizadeh", 
"author": "Mehrdad Khani; Vibhaalakshmi Sivaraman; Mohammad Alizadeh", "abstract": "Video compression is a critical component of Internet video delivery. Recent work has shown that deep learning techniques can rival or outperform human-designed algorithms, but these methods are significantly less compute and power-efficient than existing codecs. This paper presents a new approach that augments existing codecs with a small, content-adaptive super-resolution model that significantly boosts video quality. Our method, SRVC, encodes video into two bitstreams: (i) a content stream, produced by compressing downsampled low-resolution video with the existing codec, (ii) a model stream, which encodes periodic updates to a lightweight super-resolution neural network customized for short segments of the video. SRVC decodes the video by passing the decompressed low-resolution video frames through the (time-varying) super-resolution model to reconstruct high-resolution video frames. Our results show that to achieve the same PSNR, SRVC requires 20% of the bits-per-pixel of H.265 in slow mode, and 3% of the bits-per-pixel of DVC, a recent deep learning-based video compression scheme. 
SRVC runs at 90 frames per second on an NVIDIA V100 GPU.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Khani_Efficient_Video_Compression_via_Content-Adaptive_Super-Resolution_ICCV_2021_paper.pdf", @@ -12013,7 +12828,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khani_2021_ICCV,\n \n author = {\n Khani,\n Mehrdad and Sivaraman,\n Vibhaalakshmi and Alizadeh,\n Mohammad\n},\n title = {\n Efficient Video Compression via Content-Adaptive Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4521-4530\n} \n}" }, { "title": "Efficient Visual Pretraining With Contrastive Detection", @@ -12021,7 +12837,8 @@ "status": "Poster", "track": "main", "pid": 10303, - "author": "Olivier J. H\u00e9naff; Skanda Koppula; Jean-Baptiste Alayrac; Aaron van den Oord; Oriol Vinyals; Jo\u00e3o Carreira", + "author_site": "Olivier J. Hénaff; Skanda Koppula; Jean-Baptiste Alayrac; Aaron van den Oord; Oriol Vinyals; João Carreira", + "author": "Olivier J. Hénaff; Skanda Koppula; Jean-Baptiste Alayrac; Aaron van den Oord; Oriol Vinyals; João Carreira", "abstract": "Self-supervised pretraining has been shown to yield powerful representations for transfer learning. These performance gains come at a large computational cost however, with state-of-the-art methods requiring an order of magnitude more computation than supervised pretraining. We tackle this computational bottleneck by introducing a new self-supervised objective, contrastive detection, which tasks representations with identifying object-level features across augmentations. 
This objective extracts a rich learning signal per image, leading to state-of-the-art transfer accuracy on a variety of downstream tasks, while requiring up to 10x less pretraining. In particular, our strongest ImageNet-pretrained model performs on par with SEER, one of the largest self-supervised systems to date, which uses 1000x more pretraining data. Finally, our objective seamlessly handles pretraining on more complex images such as those in COCO, closing the gap with supervised transfer learning from COCO to PASCAL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Henaff_Efficient_Visual_Pretraining_With_Contrastive_Detection_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -12035,7 +12852,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Henaff_Efficient_Visual_Pretraining_With_Contrastive_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Henaff_Efficient_Visual_Pretraining_With_Contrastive_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Henaff_2021_ICCV,\n \n author = {\n H\\'enaff,\n Olivier J. 
and Koppula,\n Skanda and Alayrac,\n Jean-Baptiste and van den Oord,\n Aaron and Vinyals,\n Oriol and Carreira,\n Jo\\~ao\n},\n title = {\n Efficient Visual Pretraining With Contrastive Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10086-10096\n} \n}" }, { "title": "Efficient and Differentiable Shadow Computation for Inverse Problems", @@ -12043,6 +12861,7 @@ "status": "Poster", "track": "main", "pid": 9022, + "author_site": "Linjie Lyu; Marc Habermann; Lingjie Liu; Mallikarjun B R; Ayush Tewari; Christian Theobalt", "author": "Linjie Lyu; Marc Habermann; Lingjie Liu; Mallikarjun B R; Ayush Tewari; Christian Theobalt", "abstract": "Differentiable rendering has received increasing interest in the solution of image-based inverse problems. It can benefit traditional optimization-based solutions to inverse problems, but also allows for self-supervision of learning-based approaches for which training data with ground truth annotation is hard to obtain. However, existing differentiable renderers either do not correctly model complex visibility responsible for shadows in the images, or are too slow for being used to train deep architectures over thousands of iterations. To this end, we propose an accurate yet efficient approach for differentiable visibility and soft shadow computation. Our approach is based on the spherical harmonics approximation of the scene illumination and visibility, where the occluding surface is approximated with spheres. This allows for significantly more efficient visibility computation compared to methods based on path tracing without sacrificing quality of generated images. 
As our formulation is differentiable, it can be used to solve various image-based inverse problems such as texture, lighting, geometry recovery from images using analysis-by-synthesis optimization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lyu_Efficient_and_Differentiable_Shadow_Computation_for_Inverse_Problems_ICCV_2021_paper.pdf", @@ -12066,7 +12885,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Lyu_2021_ICCV,\n \n author = {\n Lyu,\n Linjie and Habermann,\n Marc and Liu,\n Lingjie and R,\n Mallikarjun B and Tewari,\n Ayush and Theobalt,\n Christian\n},\n title = {\n Efficient and Differentiable Shadow Computation for Inverse Problems\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13107-13116\n} \n}" }, { "title": "EgoRenderer: Rendering Human Avatars From Egocentric Camera Images", @@ -12074,6 +12894,7 @@ "status": "Poster", "track": "main", "pid": 5959, + "author_site": "Tao Hu; Kripasindhu Sarkar; Lingjie Liu; Matthias Zwicker; Christian Theobalt", "author": "Tao Hu; Kripasindhu Sarkar; Lingjie Liu; Matthias Zwicker; Christian Theobalt", "abstract": "We present EgoRenderer, a system for rendering full-body neural avatars of a person captured by a wearable, egocentric fisheye camera that is mounted on a cap or a VR headset. Our system renders photorealistic novel views of the actor and her motion from arbitrary virtual camera locations. Rendering full-body avatars from such egocentric images come with unique challenges due to the top-down view and large distortions. We tackle these challenges by decomposing the rendering process into several steps, including texture synthesis, pose construction, and neural image translation. 
For texture synthesis, we propose Ego-DPNet, a neural network that infers dense correspondences between the input fisheye images and an underlying parametric body model, and to extract textures from egocentric inputs. In addition, to encode dynamic appearances, our approach also learns an implicit texture stack that captures detailed appearance variation across poses and viewpoints. For correct pose generation, we first estimate body pose from the egocentric view using a parametric model. We then synthesize an external free-viewpoint pose image by projecting the parametric model to the user-specified target viewpoint. We next combine the target pose image and the textures into a combined feature image, which is transformed into the output color image using a neural image translation network. Experimental evaluations show that EgoRenderer is capable of generating realistic free-viewpoint avatars of a person wearing an egocentric camera. Comparisons to several baselines demonstrate the advantages of our approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_EgoRenderer_Rendering_Human_Avatars_From_Egocentric_Camera_Images_ICCV_2021_paper.pdf", @@ -12092,12 +12913,13 @@ "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "University of Maryland, College Park;Max Planck Institute for Informatics", "aff_unique_dep": "Department of Computer Science;Informatics", - "aff_unique_url": "https://www/umd.edu;https://mpi-inf.mpg.de", + "aff_unique_url": "https://www.umd.edu;https://mpi-inf.mpg.de", "aff_unique_abbr": "UMD;MPII", "aff_campus_unique_index": "0;1;1;0;1", "aff_campus_unique": "College Park;Saarland", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Tao and Sarkar,\n Kripasindhu and Liu,\n Lingjie and Zwicker,\n Matthias and Theobalt,\n Christian\n},\n title = {\n EgoRenderer: Rendering Human 
Avatars From Egocentric Camera Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14528-14538\n} \n}" }, { "title": "Egocentric Pose Estimation From Human Vision Span", @@ -12105,6 +12927,7 @@ "status": "Poster", "track": "main", "pid": 7427, + "author_site": "Hao Jiang; Vamsi Krishna Ithapu", "author": "Hao Jiang; Vamsi Krishna Ithapu", "abstract": "Estimating camera wearer's body pose from an egocentric view (egopose) is a vital task in augmented and virtual reality. Existing approaches either use a narrow field of view front facing camera that barely captures the wearer, or an extended head-mounted top-down camera for maximal wearer visibility. In this paper, we tackle the egopose estimation from a more natural human vision span, where camera wearer can be seen in the peripheral view and depending on the head pose the wearer may become invisible or has a limited partial view. This is a realistic visual field for user-centric wearable devices like glasses which have front facing wide angle cameras. Existing solutions are not appropriate for this setting, and so, we propose a novel deep learning system taking advantage of both the dynamic features from camera SLAM and the body shape imagery. We compute 3D head pose, 3D body pose, the figure/ground separation, all at the same time while explicitly enforcing a certain geometric consistency across pose attributes. We further show that this system can be trained robustly with lots of existing mocap data so we do not have to collect and annotate large new datasets. 
Lastly, our system estimates egopose in real time and on the fly while maintaining high accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Egocentric_Pose_Estimation_From_Human_Vision_Span_ICCV_2021_paper.pdf", @@ -12121,14 +12944,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jiang_Egocentric_Pose_Estimation_From_Human_Vision_Span_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook Reality Labs Research", "aff_unique_dep": "Research", "aff_unique_url": "https://www.facebook.com/realitylabs", "aff_unique_abbr": "FRL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Redmond", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Hao and Ithapu,\n Vamsi Krishna\n},\n title = {\n Egocentric Pose Estimation From Human Vision Span\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11006-11014\n} \n}" }, { "title": "EigenGAN: Layer-Wise Eigen-Learning for GANs", @@ -12136,6 +12960,7 @@ "status": "Poster", "track": "main", "pid": 2558, + "author_site": "Zhenliang He; Meina Kan; Shiguang Shan", "author": "Zhenliang He; Meina Kan; Shiguang Shan", "abstract": "Recent studies on Generative Adversarial Network (GAN) reveal that different layers of a generative CNN hold different semantics of the synthesized images. However, few GAN models have explicit dimensions to control the semantic attributes represented in a specific layer. This paper proposes EigenGAN which is able to unsupervisedly mine interpretable and controllable dimensions from different generator layers. Specifically, EigenGAN embeds one linear subspace with orthogonal basis into each generator layer. 
Via generative adversarial training to learn a target distribution, these layer-wise subspaces automatically discover a set of \"eigen-dimensions\" at each layer corresponding to a set of semantic attributes or interpretable variations. By traversing the coefficient of a specific eigen-dimension, the generator can produce samples with continuous changes corresponding to a specific semantic attribute. Taking the human face for example, EigenGAN can discover controllable dimensions for high-level concepts such as pose and gender in the subspace of deep layers, as well as low-level concepts such as hue and color in the subspace of shallow layers. Moreover, in the linear case, we theoretically prove that our algorithm derives the principal components as PCA does. Codes can be found in https://github.com/LynnHo/EigenGAN-Tensorflow.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_EigenGAN_Layer-Wise_Eigen-Learning_for_GANs_ICCV_2021_paper.pdf", @@ -12152,14 +12977,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_EigenGAN_Layer-Wise_Eigen-Learning_for_GANs_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1+2", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Computing Technology;;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.ict.cas.cn;http://www.ucas.ac.cn;", "aff_unique_abbr": "CAS;UCAS;", "aff_campus_unique_index": "1;1;1+2", "aff_campus_unique": ";Beijing;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Zhenliang and Kan,\n Meina and Shan,\n Shiguang\n},\n title = {\n EigenGAN: Layer-Wise 
Eigen-Learning for GANs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14408-14417\n} \n}" }, { "title": "Elaborative Rehearsal for Zero-Shot Action Recognition", @@ -12167,6 +12993,7 @@ "status": "Poster", "track": "main", "pid": 7710, + "author_site": "Shizhe Chen; Dong Huang", "author": "Shizhe Chen; Dong Huang", "abstract": "The growing number of action classes has posed a new challenge for video understanding, making Zero-Shot Action Recognition (ZSAR) a thriving direction. The ZSAR task aims to recognize target (unseen) actions without training examples by leveraging semantic representations to bridge seen and unseen actions. However, due to the complexity and diversity of actions, it remains challenging to semantically represent action classes and transfer knowledge from seen data. In this work, we propose an ER-enhanced ZSAR model inspired by an effective human memory technique Elaborative Rehearsal (ER), which involves elaborating a new concept and relating it to known concepts. Specifically, we expand each action class as an Elaborative Description (ED) sentence, which is more discriminative than a class name and less costly than manual-defined attributes. Besides directly aligning class semantics with videos, we incorporate objects from the video as Elaborative Concepts (EC) to improve video semantics and generalization from seen actions to unseen actions. Our ER-enhanced ZSAR model achieves state-of-the-art results on three existing benchmarks. Moreover, we propose a new ZSAR evaluation protocol on the Kinetics dataset to overcome limitations of current benchmarks and first compare with few-shot learning baselines on this more realistic setting. 
Our codes and collected EDs are released at https://github.com/DeLightCMU/ElaborativeRehearsal.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Elaborative_Rehearsal_for_Zero-Shot_Action_Recognition_ICCV_2021_paper.pdf", @@ -12183,14 +13010,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Elaborative_Rehearsal_for_Zero-Shot_Action_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0+1;1", - "aff_unique_norm": "INRIA;Carnegie Mellon University", + "aff_unique_norm": "Inria;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.cmu.edu", "aff_unique_abbr": "Inria;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Shizhe and Huang,\n Dong\n},\n title = {\n Elaborative Rehearsal for Zero-Shot Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13638-13647\n} \n}" }, { "title": "Else-Net: Elastic Semantic Network for Continual Action Recognition From Skeleton Data", @@ -12198,6 +13026,7 @@ "status": "Poster", "track": "main", "pid": 6511, + "author_site": "Tianjiao Li; Qiuhong Ke; Hossein Rahmani; Rui En Ho; Henghui Ding; Jun Liu", "author": "Tianjiao Li; Qiuhong Ke; Hossein Rahmani; Rui En Ho; Henghui Ding; Jun Liu", "abstract": "We address continual action recognition from skeleton sequence, which aims to learn a recognition model over time from a continuous stream of skeleton data. This task is very important in changing environment. 
Due to catastrophic forgetting problems of deep neural networks and large discrepancies between the previously learned and current new human actions from different categories, the neural networks may \"forget\" old actions, when learning new actions. This makes online continual action recognition a challenging task. We observe that although different human actions may vary to a large extent as a whole, their local body parts could share similar features. Therefore, we propose an Elastic Semantic Network (Else-Net) to learn new actions by decomposing human bodies into several semantic body parts. For each body part, the proposed Else-Net constructs a semantic pathway using several elastic cells learned with old actions, or explores new cells to store new knowledge.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Else-Net_Elastic_Semantic_Network_for_Continual_Action_Recognition_From_Skeleton_ICCV_2021_paper.pdf", @@ -12221,7 +13050,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0;0", - "aff_country_unique": "Singapore;Australia;United Kingdom" + "aff_country_unique": "Singapore;Australia;United Kingdom", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Tianjiao and Ke,\n Qiuhong and Rahmani,\n Hossein and Ho,\n Rui En and Ding,\n Henghui and Liu,\n Jun\n},\n title = {\n Else-Net: Elastic Semantic Network for Continual Action Recognition From Skeleton Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13434-13443\n} \n}" }, { "title": "Embed Me if You Can: A Geometric Perceptron", @@ -12229,10 +13059,11 @@ "status": "Poster", "track": "main", "pid": 10264, - "author": "Pavlo Melnyk; Michael Felsberg; M\u00e5rten Wadenb\u00e4ck", + "author_site": "Pavlo Melnyk; Michael Felsberg; Mårten Wadenbäck", + "author": "Pavlo Melnyk; Michael Felsberg; Mårten Wadenbäck", 
"abstract": "Solving geometric tasks involving point clouds by using machine learning is a challenging problem. Standard feed-forward neural networks combine linear or, if the bias parameter is included, affine layers and activation functions. Their geometric modeling is limited, which motivated the prior work introducing the multilayer hypersphere perceptron (MLHP). Its constituent part, i.e., the hypersphere neuron, is obtained by applying a conformal embedding of Euclidean space. By virtue of Clifford algebra, it can be implemented as the Cartesian dot product of inputs and weights. If the embedding is applied in a manner consistent with the dimensionality of the input space geometry, the decision surfaces of the model units become combinations of hyperspheres and make the decision-making process geometrically interpretable for humans. Our extension of the MLHP model, the multilayer geometric perceptron (MLGP), and its respective layer units, i.e., geometric neurons, are consistent with the 3D geometry and provide a geometric handle of the learned coefficients. In particular, the geometric neuron activations are isometric in 3D, which is necessary for rotation and translation equivariance. When classifying the 3D Tetris shapes, we quantitatively show that our model requires no activation function in the hidden layers other than the embedding to outperform the vanilla multilayer perceptron. 
In the presence of noise in the data, our model is also superior to the MLHP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Melnyk_Embed_Me_if_You_Can_A_Geometric_Perceptron_ICCV_2021_paper.pdf", - "aff": "Computer Vision Laboratory, Department of Electrical Engineering, Link \u00a8oping University; Computer Vision Laboratory, Department of Electrical Engineering, Link \u00a8oping University; Computer Vision Laboratory, Department of Electrical Engineering, Link \u00a8oping University", + "aff": "Computer Vision Laboratory, Department of Electrical Engineering, Linköping University; Computer Vision Laboratory, Department of Electrical Engineering, Linköping University; Computer Vision Laboratory, Department of Electrical Engineering, Linköping University", "project": "", "github": "github.com/pavlo-melnyk/mlgp-embedme", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Melnyk_Embed_Me_if_ICCV_2021_supplemental.zip", @@ -12245,14 +13076,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Melnyk_Embed_Me_if_You_Can_A_Geometric_Perceptron_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Link\u00f6ping University", + "aff_unique_norm": "Linköping University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Melnyk_2021_ICCV,\n \n author = {\n Melnyk,\n Pavlo and Felsberg,\n Michael and Wadenb\\\"ack,\n M\\r{a}rten\n},\n title = {\n Embed Me if You Can: A Geometric Perceptron\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1276-1284\n} \n}" }, { "title": "Embedding Novel Views in a Single JPEG
Image", @@ -12260,6 +13092,7 @@ "status": "Poster", "track": "main", "pid": 3817, + "author_site": "Yue Wu; Guotao Meng; Qifeng Chen", "author": "Yue Wu; Guotao Meng; Qifeng Chen", "abstract": "We propose a novel approach for embedding novel views in a single JPEG image while preserving the perceptual fidelity of the modified JPEG image and the restored novel views. We adopt the popular novel view synthesis representation of multiplane images (MPIs). Our model first encodes 32 MPI layers (totally 128 channels) into a 3-channel JPEG image that can be decoded for MPIs to render novel views, with an embedding capacity of 1024 bits per pixel. We conducted experiments on public datasets with different novel view synthesis methods, and the results show that the proposed method can restore high-fidelity novel views from a slightly modified JPEG image. Furthermore, our method is robust to JPEG compression, color adjusting, and cropping. Our source code will be publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Embedding_Novel_Views_in_a_Single_JPEG_Image_ICCV_2021_paper.pdf", @@ -12274,7 +13107,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Embedding_Novel_Views_in_a_Single_JPEG_Image_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Embedding_Novel_Views_in_a_Single_JPEG_Image_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Yue and Meng,\n Guotao and Chen,\n Qifeng\n},\n title = {\n Embedding Novel Views in a Single JPEG Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14519-14527\n} \n}" }, { "title": "Emerging Properties in Self-Supervised Vision Transformers", @@ -12282,7 +13116,8 @@ "status": "Poster", "track": "main", "pid": 7530, - "author": "Mathilde Caron; 
Hugo Touvron; Ishan Misra; Herv\u00e9 J\u00e9gou; Julien Mairal; Piotr Bojanowski; Armand Joulin", + "author_site": "Mathilde Caron; Hugo Touvron; Ishan Misra; Hervé Jégou; Julien Mairal; Piotr Bojanowski; Armand Joulin", + "author": "Mathilde Caron; Hugo Touvron; Ishan Misra; Hervé Jégou; Julien Mairal; Piotr Bojanowski; Armand Joulin", "abstract": "In this paper, we question if self-supervised learning provides new properties to Vision Transformer (ViT) that stand out compared to convolutional networks (convnets). Beyond the fact that adapting self-supervised methods to this architecture works particularly well, we make the following observations: first, self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets. Second, these features are also excellent k-NN classifiers, reaching 78.3% top-1 on ImageNet with a small ViT. Our study also underlines the importance of momentum encoder, multi-crop training, and the use of small patches with ViTs. We implement our findings into a simple self-supervised method, called DINO, which we interpret as a form of self-distillation with no labels. We show the synergy between DINO and ViTs by achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Caron_Emerging_Properties_in_Self-Supervised_Vision_Transformers_ICCV_2021_paper.pdf", "aff": "Facebook AI Research; Facebook AI Research + Inria + Sorbonne University; Facebook AI Research; Facebook AI Research; Inria + Univ. 
Grenoble Alpes + CNRS + Grenoble INP + LJK; Facebook AI Research; Facebook AI Research", @@ -12298,14 +13133,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Caron_Emerging_Properties_in_Self-Supervised_Vision_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0;0+1+2;0;0;1+3+4+5+6;0;0", - "aff_unique_norm": "Meta;INRIA;Sorbonne University;Universit\u00e9 Grenoble Alpes;Centre National de la Recherche Scientifique;Grenoble INP;Laboratoire Jean Kuntzmann", + "aff_unique_norm": "Facebook;Inria;Sorbonne University;Université Grenoble Alpes;Centre National de la Recherche Scientifique;Grenoble INP;Laboratoire Jean Kuntzmann", "aff_unique_dep": "Facebook AI Research;;;;;;", "aff_unique_url": "https://research.facebook.com;https://www.inria.fr;https://www.sorbonne.universite.fr;https://www.univ-grenoble-alpes.fr;https://www.cnrs.fr;https://www.grenoble-inp.fr;https://ljk.ensimag.fr", "aff_unique_abbr": "FAIR;Inria;Sorbonne;UGA;CNRS;Grenoble INP;LJK", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1+1;0;0;1+1+1+1+1;0;0", - "aff_country_unique": "United States;France" + "aff_country_unique": "United States;France", + "bibtex": "@InProceedings{Caron_2021_ICCV,\n \n author = {\n Caron,\n Mathilde and Touvron,\n Hugo and Misra,\n Ishan and J\\'egou,\n Herv\\'e and Mairal,\n Julien and Bojanowski,\n Piotr and Joulin,\n Armand\n},\n title = {\n Emerging Properties in Self-Supervised Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9650-9660\n} \n}" }, { "title": "Encoder-Decoder With Multi-Level Attention for 3D Human Shape and Pose Estimation", @@ -12313,6 +13149,7 @@ "status": "Poster", "track": "main", "pid": 3582, + "author_site": "Ziniu Wan; Zhengjia Li; Maoqing Tian; Jianbo Liu; Shuai Yi; Hongsheng Li", "author": "Ziniu Wan; Zhengjia Li; Maoqing Tian; 
Jianbo Liu; Shuai Yi; Hongsheng Li", "abstract": "3D human shape and pose estimation is the essential task for human motion analysis, which is widely used in many 3D applications. However, existing methods cannot simultaneously capture the relations at multiple levels, including spatial-temporal level and human joint level. Therefore they fail to make accurate predictions in some hard scenarios when there is cluttered background, occlusion, or extreme pose. To this end, we propose Multi-level Attention Encoder-Decoder Network (MAED), including a Spatial-Temporal Encoder (STE) and a Kinematic Topology Decoder (KTD) to model multi-level attentions in a unified framework. STE consists of a series of cascaded blocks based on Multi-Head Self-Attention, and each block uses two parallel branches to learn spatial and temporal attention respectively. Meanwhile, KTD aims at modeling the joint level attention. It regards pose estimation as a top-down hierarchical process similar to SMPL kinematic tree. With the training set of 3DPW, MAED outperforms previous state-of-the-art methods by 6.2, 7.2, and 2.4 mm of PA-MPJPE on the three widely used benchmarks 3DPW, MPI-INF-3DHP, and Human3.6M respectively. 
Our code is available at https://github.com/ziniuwan/maed.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wan_Encoder-Decoder_With_Multi-Level_Attention_for_3D_Human_Shape_and_Pose_ICCV_2021_paper.pdf", @@ -12336,7 +13173,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1;1;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wan_2021_ICCV,\n \n author = {\n Wan,\n Ziniu and Li,\n Zhengjia and Tian,\n Maoqing and Liu,\n Jianbo and Yi,\n Shuai and Li,\n Hongsheng\n},\n title = {\n Encoder-Decoder With Multi-Level Attention for 3D Human Shape and Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13033-13042\n} \n}" }, { "title": "End-to-End Dense Video Captioning With Parallel Decoding", @@ -12344,6 +13182,7 @@ "status": "Poster", "track": "main", "pid": 5660, + "author_site": "Teng Wang; Ruimao Zhang; Zhichao Lu; Feng Zheng; Ran Cheng; Ping Luo", "author": "Teng Wang; Ruimao Zhang; Zhichao Lu; Feng Zheng; Ran Cheng; Ping Luo", "abstract": "Dense video captioning aims to generate multiple associated captions with their temporal locations from the video. Previous methods follow a sophisticated \"localize-then-describe\" scheme, which heavily relies on numerous hand-crafted components. In this paper, we proposed a simple yet effective framework for end-to-end dense video captioning with parallel decoding (PDVC), by formulating the dense caption generation as a set prediction task. In practice, through stacking a newly proposed event counter on the top of a transformer decoder, the PDVC precisely segments the video into a number of event pieces under the holistic understanding of the video content, which effectively increases the coherence and readability of predicted captions. 
Compared with prior arts, the PDVC has several appealing advantages: (1) Without relying on heuristic non-maximum suppression or a recurrent event sequence selection network to remove redundancy, PDVC directly produces an event set with an appropriate size; (2) In contrast to adopting the two-stage scheme, we feed the enhanced representations of event queries into the localization head and caption head in parallel, making these two sub-tasks deeply interrelated and mutually promoted through the optimization; (3) Without bells and whistles, extensive experiments on ActivityNet Captions and YouCook2 show that PDVC is capable of producing high-quality captioning results, surpassing the state-of-the-art two-stage methods when its localization accuracy is on par with them. Code is available at https://github.com/ttengwang/PDVC.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_End-to-End_Dense_Video_Captioning_With_Parallel_Decoding_ICCV_2021_paper.pdf", @@ -12360,14 +13199,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_End-to-End_Dense_Video_Captioning_With_Parallel_Decoding_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;1;0", - "aff_unique_norm": "University of Hong Kong;Southern University of Science and Technology;Chinese University of Hong Kong;Shenzhen Research Institute of Big Data", + "aff_unique_norm": "The University of Hong Kong;Southern University of Science and Technology;The Chinese University of Hong Kong;Shenzhen Research Institute of Big Data", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hku.hk;https://www.sustech.edu.cn;https://www.cuhk.edu.cn;http://www.sribd.cn", "aff_unique_abbr": "HKU;SUSTech;CUHK;", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Hong Kong SAR;;Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Teng and 
Zhang,\n Ruimao and Lu,\n Zhichao and Zheng,\n Feng and Cheng,\n Ran and Luo,\n Ping\n},\n title = {\n End-to-End Dense Video Captioning With Parallel Decoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6847-6857\n} \n}" }, { "title": "End-to-End Detection and Pose Estimation of Two Interacting Hands", @@ -12375,6 +13215,7 @@ "status": "Poster", "track": "main", "pid": 1877, + "author_site": "Dong Uk Kim; Kwang In Kim; Seungryul Baek", "author": "Dong Uk Kim; Kwang In Kim; Seungryul Baek", "abstract": "Three dimensional hand pose estimation has reached a level of maturity, enabling real-world applications for single-hand cases. However, accurate estimation of the pose of two closely interacting hands still remains a challenge as in this case, one hand often occludes the other. We present a new algorithm that accurately estimates hand poses in such a challenging scenario. The crux of our algorithm lies in a framework that jointly trains the estimators of interacting hands, leveraging their inter-dependence. Further, we employ a GAN-type discriminator of interacting hand pose that helps avoid physically implausible configurations, e.g intersecting fingers, and exploit the visibility of joints to improve intermediate 2D pose estimation. We incorporate them into a single model that learns to detect hands and estimate their pose based on a unified criterion of pose estimation accuracy. To our knowledge, this is the first attempt to build an end-to-end network that detects and estimates the pose of two closely interacting hands (as well as single hands). 
In the experiments with three datasets representing challenging real-world scenarios, our algorithm demonstrated significant and consistent performance improvements over state-of-the-arts.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_End-to-End_Detection_and_Pose_Estimation_of_Two_Interacting_Hands_ICCV_2021_paper.pdf", @@ -12389,7 +13230,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_End-to-End_Detection_and_Pose_Estimation_of_Two_Interacting_Hands_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_End-to-End_Detection_and_Pose_Estimation_of_Two_Interacting_Hands_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Dong Uk and Kim,\n Kwang In and Baek,\n Seungryul\n},\n title = {\n End-to-End Detection and Pose Estimation of Two Interacting Hands\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11189-11198\n} \n}" }, { "title": "End-to-End Piece-Wise Unwarping of Document Images", @@ -12397,6 +13239,7 @@ "status": "Poster", "track": "main", "pid": 3515, + "author_site": "Sagnik Das; Kunwar Yashraj Singh; Jon Wu; Erhan Bas; Vijay Mahadevan; Rahul Bhotika; Dimitris Samaras", "author": "Sagnik Das; Kunwar Yashraj Singh; Jon Wu; Erhan Bas; Vijay Mahadevan; Rahul Bhotika; Dimitris Samaras", "abstract": "Document unwarping attempts to undo the physical deformation of the paper and recover a 'flatbed' scanned document-image for downstream tasks such as OCR. Current state-of-the-art relies on global unwarping of the document which is not robust to local deformation changes. Moreover, a global unwarping often produces spurious warping artifacts in less warped regions to compensate for severe warps present in other parts of the document. 
In this paper, we propose the first end-to-end trainable piece-wise unwarping method that predicts local deformation fields and stitches them together with global information to obtain an improved unwarping. The proposed piece-wise formulation results in 4% improvement in terms of multi-scale structural similarity (MS-SSIM) and shows better performance in terms of OCR metrics, character error rate (CER) and word error rate (WER) compared to the state-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Das_End-to-End_Piece-Wise_Unwarping_of_Document_Images_ICCV_2021_paper.pdf", @@ -12420,7 +13263,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Das_2021_ICCV,\n \n author = {\n Das,\n Sagnik and Singh,\n Kunwar Yashraj and Wu,\n Jon and Bas,\n Erhan and Mahadevan,\n Vijay and Bhotika,\n Rahul and Samaras,\n Dimitris\n},\n title = {\n End-to-End Piece-Wise Unwarping of Document Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4268-4277\n} \n}" }, { "title": "End-to-End Robust Joint Unsupervised Image Alignment and Clustering", @@ -12428,6 +13272,7 @@ "status": "Poster", "track": "main", "pid": 6279, + "author_site": "Xiangrui Zeng; Gregory Howe; Min Xu", "author": "Xiangrui Zeng; Gregory Howe; Min Xu", "abstract": "Computing dense pixel-to-pixel image correspondences is a fundamental task of computer vision. Often, the objective is to align image pairs from the same semantic category for manipulation or segmentation purposes. Despite achieving superior performance, existing deep learning alignment methods cannot cluster images; consequently, clustering and pairing images needed to be a separate laborious and expensive step. 
Given a dataset with diverse semantic categories, we propose a multi-task model, Jim-Net, that can directly learn to cluster and align images without any pixel-level or image-level annotations. We design a pair-matching alignment unsupervised training algorithm that selectively matches and aligns image pairs from the clustering branch. Our unsupervised Jim-Net achieves comparable accuracy with state-of-the-art supervised methods on benchmark 2D image alignment dataset PF-PASCAL. Specifically, we apply Jim-Net to cryo-electron tomography, a revolutionary 3D microscopy imaging technique of native subcellular structures. After extensive evaluation on seven datasets, we demonstrate that Jim-Net enables systematic discovery and recovery of representative macromolecular structures in situ, which is essential for revealing molecular mechanisms underlying cellular functions. To our knowledge, Jim-Net is the first end-to-end model that can simultaneously align and cluster images, which significantly improves the performance as compared to performing each task alone.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zeng_End-to-End_Robust_Joint_Unsupervised_Image_Alignment_and_Clustering_ICCV_2021_paper.pdf", @@ -12451,7 +13296,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zeng_2021_ICCV,\n \n author = {\n Zeng,\n Xiangrui and Howe,\n Gregory and Xu,\n Min\n},\n title = {\n End-to-End Robust Joint Unsupervised Image Alignment and Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3854-3866\n} \n}" }, { "title": "End-to-End Semi-Supervised Object Detection With Soft Teacher", @@ -12459,6 +13305,7 @@ "status": "Poster", "track": "main", "pid": 8593, + "author_site": "Mengde Xu; 
Zheng Zhang; Han Hu; Jianfeng Wang; Lijuan Wang; Fangyun Wei; Xiang Bai; Zicheng Liu", "author": "Mengde Xu; Zheng Zhang; Han Hu; Jianfeng Wang; Lijuan Wang; Fangyun Wei; Xiang Bai; Zicheng Liu", "abstract": "Previous pseudo-label approaches for semi-supervised object detection typically follow a multi-stage schema, with the first stage to train an initial detector on a few labeled data, followed by the pseudo labeling and re-training stage on unlabeled data. These multi-stage methods complicate the training, and also hinder the use of improved detectors for more accurate pseudo-labeling. In this paper, we propose an end-to-end approach to simultaneously improve the detector and pseudo labels gradually for semi-supervised object detection. The pseudo labels are generated on the fly by a teacher model which is an aggregated version of the student detector at different steps. As the detector becomes stronger during the training, the teacher detector's performance improves and the pseudo labels tend to be more accurate, which further benefits the detector training. Within the end-to-end training, we present two simple yet effective techniques: weigh the classification loss of unlabeled images through soft teacher and select reliable pseudo boxes for regression through box jittering. Experimentally, the proposed approach outperforms the state-of-the-art methods by a large margin on MS-COCO benchmark by using Faster R-CNN with ResNet-50 and FPN, reaching 20.5 mAP, 30.7 mAP and 34.0 mAP with 1%, 5%, 10% labeled data, respectively. 
Moreover, the proposed approach also proves to improve this detector trained on the COCO full set by +1.8 mAP by leveraging additional unlabelled data of COCO, achieving 42.7 mAP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_End-to-End_Semi-Supervised_Object_Detection_With_Soft_Teacher_ICCV_2021_paper.pdf", @@ -12475,14 +13322,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_End-to-End_Semi-Supervised_Object_Detection_With_Soft_Teacher_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;1;1;1;1;0;1", - "aff_unique_norm": "Huazhong University of Science and Technology;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "Huazhong University of Science and Technology;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "http://www.hust.edu.cn;https://www.microsoft.com", "aff_unique_abbr": "HUST;Microsoft", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;1;1;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Mengde and Zhang,\n Zheng and Hu,\n Han and Wang,\n Jianfeng and Wang,\n Lijuan and Wei,\n Fangyun and Bai,\n Xiang and Liu,\n Zicheng\n},\n title = {\n End-to-End Semi-Supervised Object Detection With Soft Teacher\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3060-3069\n} \n}" }, { "title": "End-to-End Trainable Trident Person Search Network Using Adaptive Gradient Propagation", @@ -12490,6 +13338,7 @@ "status": "Poster", "track": "main", "pid": 3343, + "author_site": "Byeong-Ju Han; Kuhyeun Ko; Jae-Young Sim", "author": "Byeong-Ju Han; Kuhyeun Ko; Jae-Young Sim", "abstract": "Person search suffers from the conflicting objectives of commonness and uniqueness between the person detection 
and re-identification tasks that make the end-to-end training of person search networks difficult. In this paper, we propose a trident network for person search that performs detection, re-identification, and part classification together. We also devise a novel end-to-end training method using adaptive gradient weighting that controls the flow of back-propagated gradients through the re-identification and part classification networks according to the quality of the person detection. The proposed method not only prevents the over-fitting but encourages to exploit fine-grained features by incorporating the part classification branch into the person search framework. Experimental results on the CUHK-SYSU and PRW datasets demonstrate that the proposed method achieves the best performance among the state-of-the-art end-to-end person search methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Han_End-to-End_Trainable_Trident_Person_Search_Network_Using_Adaptive_Gradient_Propagation_ICCV_2021_paper.pdf", @@ -12513,7 +13362,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Ulsan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Han_2021_ICCV,\n \n author = {\n Han,\n Byeong-Ju and Ko,\n Kuhyeun and Sim,\n Jae-Young\n},\n title = {\n End-to-End Trainable Trident Person Search Network Using Adaptive Gradient Propagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 925-933\n} \n}" }, { "title": "End-to-End Unsupervised Document Image Blind Denoising", @@ -12521,6 +13371,7 @@ "status": "Poster", "track": "main", "pid": 9405, + "author_site": "Mehrdad J. Gangeh; Marcin Plata; Hamid R. Motahari Nezhad; Nigel P Duffy", "author": "Mehrdad J. Gangeh; Marcin Plata; Hamid R. 
Motahari Nezhad; Nigel P Duffy", "abstract": "Removing noise from scanned pages is a vital step before their submission to optical character recognition (OCR) system. Most available image denoising methods are supervised where the pairs of noisy/clean pages are required. However, this assumption is rarely met in real settings. Besides, there is no single model that can remove various noise types from documents. Here, we propose a unified end-to-end unsupervised deep learning model, for the first time, that can effectively remove multiple types of noise, including salt & pepper noise, blurred and/or faded text, as well as watermarks from documents at various levels of intensity. We demonstrate that the proposed model significantly improves the quality of scanned images and the OCR of the pages on several test datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gangeh_End-to-End_Unsupervised_Document_Image_Blind_Denoising_ICCV_2021_paper.pdf", @@ -12544,7 +13395,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United States;Poland" + "aff_country_unique": "United States;Poland", + "bibtex": "@InProceedings{Gangeh_2021_ICCV,\n \n author = {\n Gangeh,\n Mehrdad J. and Plata,\n Marcin and Nezhad,\n Hamid R. 
Motahari and Duffy,\n Nigel P\n},\n title = {\n End-to-End Unsupervised Document Image Blind Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7888-7897\n} \n}" }, { "title": "End-to-End Urban Driving by Imitating a Reinforcement Learning Coach", @@ -12552,10 +13404,11 @@ "status": "Poster", "track": "main", "pid": 1766, + "author_site": "Zhejun Zhang; Alexander Liniger; Dengxin Dai; Fisher Yu; Luc Van Gool", "author": "Zhejun Zhang; Alexander Liniger; Dengxin Dai; Fisher Yu; Luc Van Gool", "abstract": "End-to-end approaches to autonomous driving commonly rely on expert demonstrations. Although humans are good drivers, they are not good coaches for end-to-end algorithms that demand dense on-policy supervision. On the contrary, automated experts that leverage privileged information can efficiently generate large scale on-policy and off-policy demonstrations. However, existing automated experts for urban driving make heavy use of hand-crafted rules and perform suboptimally even on driving simulators, where ground-truth information is available. To address these issues, we train a reinforcement learning expert that maps bird's-eye view images to continuous low-level actions. While setting a new performance upper-bound on CARLA, our expert is also a better coach that provides informative supervision signals for imitation learning agents to learn from. Supervised by our reinforcement learning coach, a baseline end-to-end agent with monocular camera-input achieves expert-level performance. 
Our end-to-end agent achieves a 78% success rate while generalizing to a new town and new weather on the NoCrash-dense benchmark and state-of-the-art performance on the more challenging CARLA LeaderBoard.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_End-to-End_Urban_Driving_by_Imitating_a_Reinforcement_Learning_Coach_ICCV_2021_paper.pdf", - "aff": "Computer Vision Lab, ETH Z\u00fcrich; Computer Vision Lab, ETH Z\u00fcrich; Computer Vision Lab, ETH Z\u00fcrich+MPI for Informatics; Computer Vision Lab, ETH Z\u00fcrich; Computer Vision Lab, ETH Z\u00fcrich+PSI, KU Leuven", + "aff": "Computer Vision Lab, ETH Zürich; Computer Vision Lab, ETH Zürich; Computer Vision Lab, ETH Zürich+MPI for Informatics; Computer Vision Lab, ETH Zürich; Computer Vision Lab, ETH Zürich+PSI, KU Leuven", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhang_End-to-End_Urban_Driving_ICCV_2021_supplemental.zip", @@ -12568,14 +13421,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_End-to-End_Urban_Driving_by_Imitating_a_Reinforcement_Learning_Coach_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;0;0+2", - "aff_unique_norm": "ETH Zurich;Max Planck Institute for Informatics;KU Leuven", + "aff_unique_norm": "ETH Zürich;Max Planck Institute for Informatics;KU Leuven", "aff_unique_dep": "Computer Vision Lab;Informatics;PSI", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-inf.mpg.de;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;MPII;KU Leuven", "aff_campus_unique_index": "0;0;0;0;0", - "aff_campus_unique": "Z\u00fcrich;", + "aff_campus_unique": "Zürich;", "aff_country_unique_index": "0;0;0+1;0;0+2", - "aff_country_unique": "Switzerland;Germany;Belgium" + "aff_country_unique": "Switzerland;Germany;Belgium", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Zhejun and Liniger,\n Alexander and Dai,\n Dengxin and Yu,\n Fisher and Van Gool,\n Luc\n},\n 
title = {\n End-to-End Urban Driving by Imitating a Reinforcement Learning Coach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15222-15232\n} \n}" }, { "title": "End-to-End Video Instance Segmentation via Spatial-Temporal Graph Neural Networks", @@ -12583,6 +13437,7 @@ "status": "Poster", "track": "main", "pid": 2487, + "author_site": "Tao Wang; Ning Xu; Kean Chen; Weiyao Lin", "author": "Tao Wang; Ning Xu; Kean Chen; Weiyao Lin", "abstract": "Video instance segmentation is a challenging task that extends image instance segmentation to the video domain. Existing methods either rely only on single-frame information for the detection and segmentation subproblems or handle tracking as a separate post-processing step, which limit their capability to fully leverage and share useful spatial-temporal information for all the subproblems. In this paper, we propose a novel graph-neural-network (GNN) based method to handle the aforementioned limitation. Specifically, graph nodes representing instance features are used for detection and segmentation while graph edges representing instance relations are used for tracking. Both inter and intra-frame information is effectively propagated and shared via graph updates and all the subproblems (i.e. detection, segmentation and tracking) are jointly optimized in an unified framework. 
The performance of our method shows great improvement on the YoutubeVIS validation dataset compared to existing methods and achieves 36.5% AP with a ResNet-50 backbone, operating at 22 FPS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_End-to-End_Video_Instance_Segmentation_via_Spatial-Temporal_Graph_Neural_Networks_ICCV_2021_paper.pdf", @@ -12606,7 +13461,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Tao and Xu,\n Ning and Chen,\n Kean and Lin,\n Weiyao\n},\n title = {\n End-to-End Video Instance Segmentation via Spatial-Temporal Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10797-10806\n} \n}" }, { "title": "Energy-Based Open-World Uncertainty Modeling for Confidence Calibration", @@ -12614,6 +13470,7 @@ "status": "Poster", "track": "main", "pid": 6845, + "author_site": "Yezhen Wang; Bo Li; Tong Che; Kaiyang Zhou; Ziwei Liu; Dongsheng Li", "author": "Yezhen Wang; Bo Li; Tong Che; Kaiyang Zhou; Ziwei Liu; Dongsheng Li", "abstract": "Confidence calibration is of great importance to ensure the reliability of decisions made by machine learning systems. However, discriminative classifiers based on deep neural networks are often criticized for producing overconfident predictions that fail to reflect the true correctness likelihood of classification accuracy. We argue that such an inability to model uncertainty is mainly caused by the closed-world nature in softmax: a model trained by the cross-entropy loss will be forced to classify the input into one of K pre-defined categories with high probability. 
To address this problem, we for the first time propose a novel K+1-way softmax formulation, which incorporates the modeling of open-world uncertainty as to the extra dimension. To unify the learning of the original K-way classification task and the extra dimension that models uncertainty, we (1)propose a novel energy-based objective function, and moreover, (2) theoretically prove that optimizing such an objective essentially forces the extra dimension to capture the marginal data distribution. Extensive experiments show that our approach, Energy-based Open-World Softmax (EOW-Softmax), is superior to existing state-of-the-art methods in improving confidence calibration.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Energy-Based_Open-World_Uncertainty_Modeling_for_Confidence_Calibration_ICCV_2021_paper.pdf", @@ -12630,14 +13487,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Energy-Based_Open-World_Uncertainty_Modeling_for_Confidence_Calibration_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;2;0", - "aff_unique_norm": "Microsoft;Mila;Nanyang Technological University", + "aff_unique_norm": "Microsoft Research;MILA;Nanyang Technological University", "aff_unique_dep": "Research;;S-Lab", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia;https://mila.quebec;https://www.ntu.edu.sg", "aff_unique_abbr": "MSR Asia;MILA;NTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Asia;", "aff_country_unique_index": "0;0;1;2;2;0", - "aff_country_unique": "China;Canada;Singapore" + "aff_country_unique": "China;Canada;Singapore", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yezhen and Li,\n Bo and Che,\n Tong and Zhou,\n Kaiyang and Liu,\n Ziwei and Li,\n Dongsheng\n},\n title = {\n Energy-Based Open-World Uncertainty Modeling for Confidence Calibration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2021\n},\n pages = {\n 9302-9311\n} \n}" }, { "title": "Enhanced Boundary Learning for Glass-Like Object Segmentation", @@ -12645,7 +13503,8 @@ "status": "Poster", "track": "main", "pid": 1794, - "author": "Hao He; Xiangtai Li; Guangliang Cheng; Jianping Shi; Yunhai Tong; Gaofeng Meng; V\u00e9ronique Prinet; LuBin Weng", + "author_site": "Hao He; Xiangtai Li; Guangliang Cheng; Jianping Shi; Yunhai Tong; Gaofeng Meng; Véronique Prinet; LuBin Weng", + "author": "Hao He; Xiangtai Li; Guangliang Cheng; Jianping Shi; Yunhai Tong; Gaofeng Meng; Véronique Prinet; LuBin Weng", "abstract": "Glass-like objects such as windows, bottles, and mirrors exist widely in the real world. Sensing these objects has many applications, including robot navigation and grasping. However, this task is very challenging due to the arbitrary scenes behind glass-like objects. This paper aims to solve the glass-like object segmentation problem via enhanced boundary learning. In particular, we first propose a novel refined differential module that outputs finer boundary cues. We then introduce an edge-aware point-based graph convolution network module to model the global shape along the boundary. We use these two modules to design a decoder that generates accurate and clean segmentation results, especially on the object contours. Both modules are lightweight and effective: they can be embedded into various segmentation models. In extensive experiments on three recent glass-like object segmentation datasets, including Trans10k, MSD, and GDD, our approach establishes new state-of-the-art results. We also illustrate the strong generalization properties of our method on three generic segmentation datasets, including Cityscapes, BDD, and COCO Stuff. 
Code and models will be available for further research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_Enhanced_Boundary_Learning_for_Glass-Like_Object_Segmentation_ICCV_2021_paper.pdf", "aff": ";;;;;;;", @@ -12659,7 +13518,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_Enhanced_Boundary_Learning_for_Glass-Like_Object_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_Enhanced_Boundary_Learning_for_Glass-Like_Object_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Hao and Li,\n Xiangtai and Cheng,\n Guangliang and Shi,\n Jianping and Tong,\n Yunhai and Meng,\n Gaofeng and Prinet,\n V\\'eronique and Weng,\n LuBin\n},\n title = {\n Enhanced Boundary Learning for Glass-Like Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15859-15868\n} \n}" }, { "title": "Enhancing Self-Supervised Video Representation Learning via Multi-Level Feature Optimization", @@ -12667,6 +13527,7 @@ "status": "Poster", "track": "main", "pid": 1288, + "author_site": "Rui Qian; Yuxi Li; Huabin Liu; John See; Shuangrui Ding; Xian Liu; Dian Li; Weiyao Lin", "author": "Rui Qian; Yuxi Li; Huabin Liu; John See; Shuangrui Ding; Xian Liu; Dian Li; Weiyao Lin", "abstract": "The crux of self-supervised video representation learning is to build general features from unlabeled videos. However, most recent works have mainly focused on high-level semantics and neglected lower-level representations and their temporal relationship which are crucial for general video understanding. To address these challenges, this paper proposes a multi-level feature optimization framework to improve the generalization and temporal modeling ability of learned video representations. 
Concretely, high-level features obtained from naive and prototypical contrastive learning are utilized to build distribution graphs, guiding the process of low-level and mid-level feature learning. We also devise a simple temporal modeling module from multi-level features to enhance motion pattern learning. Experiments demonstrate that multi-level feature optimization with the graph constraint and temporal modeling can greatly improve the representation ability in video understanding. Code is available at https://github.com/shvdiwnkozbw/Video-Representation-via-Multi-level-Optimization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qian_Enhancing_Self-Supervised_Video_Representation_Learning_via_Multi-Level_Feature_Optimization_ICCV_2021_paper.pdf", @@ -12690,7 +13551,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;1;0;0;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Qian_2021_ICCV,\n \n author = {\n Qian,\n Rui and Li,\n Yuxi and Liu,\n Huabin and See,\n John and Ding,\n Shuangrui and Liu,\n Xian and Li,\n Dian and Lin,\n Weiyao\n},\n title = {\n Enhancing Self-Supervised Video Representation Learning via Multi-Level Feature Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7990-8001\n} \n}" }, { "title": "Enriching Local and Global Contexts for Temporal Action Localization", @@ -12698,10 +13560,11 @@ "status": "Poster", "track": "main", "pid": 2829, + "author_site": "Zixin Zhu; Wei Tang; Le Wang; Nanning Zheng; Gang Hua", "author": "Zixin Zhu; Wei Tang; Le Wang; Nanning Zheng; Gang Hua", "abstract": "Effectively tackling the problem of temporal action localization (TAL) necessitates a visual representation that jointly pursues two confounding goals, i.e., fine-grained discrimination for 
temporal localization and sufficient visual invariance for action classification. We address this challenge by enriching both the local and global contexts in the popular two-stage temporal localization framework, where action proposals are first generated followed by action classification and temporal boundary regression. Our proposed model, dubbed ContextLoc, can be divided into three sub-networks: L-Net, G-Net and P-Net. L-Net enriches the local context via fine-grained modeling of snippet-level features, which is formulated as a query-and-retrieval process. G-Net enriches the global context via higher-level modeling of the video-level representation. In addition, we introduce a novel context adaptation module to adapt the global context to different proposals. P-Net further models the context-aware inter-proposal relations. We explore two existing models to be the P-Net in our experiments. The efficacy of our proposed method is validated by experimental results on the THUMOS14 (54.3% at tIoU@0.5) and ActivityNet v1.3 (56.01% at tIoU@0.5) datasets, which outperforms recent states of the art. 
Code is available at https://github.com/buxiangzhiren/ContextLoc.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Enriching_Local_and_Global_Contexts_for_Temporal_Action_Localization_ICCV_2021_paper.pdf", - "aff": "Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; University of Illinois at Chicago; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Wormpex AI Research", + "aff": "Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; University of Illinois at Chicago; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Wormpex AI Research", "project": "", "github": "https://github.com/buxiangzhiren/ContextLoc", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhu_Enriching_Local_and_ICCV_2021_supplemental.pdf", @@ -12714,14 +13577,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_Enriching_Local_and_Global_Contexts_for_Temporal_Action_Localization_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;2", - "aff_unique_norm": "Xi'an Jiao Tong University;University of Illinois at Chicago;Wormpex AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;University of Illinois at Chicago;Wormpex AI Research", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;;AI Research", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.uic.edu;", "aff_unique_abbr": "XJTU;UIC;Wormpex AI", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Xi'an;Chicago;", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Zixin and Tang,\n Wei and Wang,\n Le and Zheng,\n 
Nanning and Hua,\n Gang\n},\n title = {\n Enriching Local and Global Contexts for Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13516-13525\n} \n}" }, { "title": "Ensemble Attention Distillation for Privacy-Preserving Federated Learning", @@ -12729,6 +13593,7 @@ "status": "Poster", "track": "main", "pid": 5838, + "author_site": "Xuan Gong; Abhishek Sharma; Srikrishna Karanam; Ziyan Wu; Terrence Chen; David Doermann; Arun Innanje", "author": "Xuan Gong; Abhishek Sharma; Srikrishna Karanam; Ziyan Wu; Terrence Chen; David Doermann; Arun Innanje", "abstract": "We consider the problem of Federated Learning (FL) where numerous decentralized computational nodes collaborate with each other to train a centralized machine learning model without explicitly sharing their local data samples. Such decentralized training naturally leads to issues of imbalanced or differing data distributions among the local models and challenges in fusing them into a central model. Existing FL methods deal with these issues by either sharing local parameters or fusing models via online distillation. However, such a design leads to multiple rounds of inter-node communication resulting in substantial bandwidth consumption, while also increasing the risk of data leakage and consequent privacy issues. To address these problems, we propose a new distillation-based FL framework that can preserve privacy by design, while also consuming substantially less network communication resources when compared to the current state-of-the-art. Our framework engages in inter-node communication using only publicly available and approved datasets, thereby giving explicit privacy control to the user. 
To distill knowledge among the various local models, our framework involves a novel ensemble distillation algorithm that uses both final prediction as well as model attention. This algorithm explicitly considers the diversity among various local nodes while also seeking consensus among them. This results in a comprehensive technique to distill knowledge from various decentralized nodes. We demonstrate the various aspects and the associated benefits of our FL framework through extensive experiments that produce state-of-the-art results on both classification and segmentation tasks on natural and medical images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gong_Ensemble_Attention_Distillation_for_Privacy-Preserving_Federated_Learning_ICCV_2021_paper.pdf", @@ -12752,7 +13617,8 @@ "aff_campus_unique_index": "0+1;1;1;1;1;0;1", "aff_campus_unique": "Buffalo;Cambridge", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gong_2021_ICCV,\n \n author = {\n Gong,\n Xuan and Sharma,\n Abhishek and Karanam,\n Srikrishna and Wu,\n Ziyan and Chen,\n Terrence and Doermann,\n David and Innanje,\n Arun\n},\n title = {\n Ensemble Attention Distillation for Privacy-Preserving Federated Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15076-15086\n} \n}" }, { "title": "Entropy Maximization and Meta Classification for Out-of-Distribution Detection in Semantic Segmentation", @@ -12760,6 +13626,7 @@ "status": "Poster", "track": "main", "pid": 2340, + "author_site": "Robin Chan; Matthias Rottmann; Hanno Gottschalk", "author": "Robin Chan; Matthias Rottmann; Hanno Gottschalk", "abstract": "Deep neural networks (DNNs) for the semantic segmentation of images are usually trained to operate on a predefined closed set of object classes. 
This is in contrast to the \"\"open world\"\" setting where DNNs are envisioned to be deployed to. From a functional safety point of view, the ability to detect so-called \"\"out-of-distribution\"\" (OoD) samples, i.e., objects outside of a DNN's semantic space, is crucial for many applications such as automated driving. A natural baseline approach to OoD detection is to threshold on the pixel-wise softmax entropy. We present a two-step procedure that significantly improves that approach. Firstly, we utilize samples from the COCO dataset as OoD proxy and introduce a second training objective to maximize the softmax entropy on these samples. Starting from pretrained semantic segmentation networks we re-train a number of DNNs on different in-distribution datasets and consistently observe improved OoD detection performance when evaluating on completely disjoint OoD datasets. Secondly, we perform a transparent post-processing step to discard false positive OoD samples by so-called \"\"meta classification\"\". To this end, we apply linear models to a set of hand-crafted metrics derived from the DNN's softmax probabilities. In our experiments we consistently observe a clear additional gain in OoD detection performance, cutting down the number of detection errors by 52% when comparing the best baseline with our results. We achieve this improvement sacrificing only marginally in original segmentation performance. 
Therefore, our method contributes to safer DNNs with more reliable overall system performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chan_Entropy_Maximization_and_Meta_Classification_for_Out-of-Distribution_Detection_in_Semantic_ICCV_2021_paper.pdf", @@ -12783,7 +13650,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Chan_2021_ICCV,\n \n author = {\n Chan,\n Robin and Rottmann,\n Matthias and Gottschalk,\n Hanno\n},\n title = {\n Entropy Maximization and Meta Classification for Out-of-Distribution Detection in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5128-5137\n} \n}" }, { "title": "Env-QA: A Video Question Answering Benchmark for Comprehensive Understanding of Dynamic Environments", @@ -12791,6 +13659,7 @@ "status": "Poster", "track": "main", "pid": 2431, + "author_site": "Difei Gao; Ruiping Wang; Ziyi Bai; Xilin Chen", "author": "Difei Gao; Ruiping Wang; Ziyi Bai; Xilin Chen", "abstract": "Visual understanding goes well beyond the study of images or videos on the web. To achieve complex tasks in volatile situations, the human can deeply understand the environment, quickly perceive events happening around, and continuously track objects' state changes, which are still challenging for current AI systems. To equip AI system with the ability to understand dynamic ENVironments, we build a video Question Answering dataset named Env-QA. Env-QA contains 23K egocentric videos, where each video is composed of a series of events about exploring and interacting in the environment. It also provides 85K questions to evaluate the ability of understanding the composition, layout, and state changes of the environment presented by the events in videos. 
Moreover, we propose a video QA model, Temporal Segmentation and Event Attention network (TSEA), which introduces event-level video representation and corresponding attention mechanisms to better extract environment information and answer questions. Comprehensive experiments demonstrate the effectiveness of our framework and show the formidable challenges of Env-QA in terms of long-term state tracking, multi-event temporal reasoning and event counting, etc.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Env-QA_A_Video_Question_Answering_Benchmark_for_Comprehensive_Understanding_of_ICCV_2021_paper.pdf", @@ -12814,7 +13683,8 @@ "aff_campus_unique_index": "0+0;0+0+0;0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Difei and Wang,\n Ruiping and Bai,\n Ziyi and Chen,\n Xilin\n},\n title = {\n Env-QA: A Video Question Answering Benchmark for Comprehensive Understanding of Dynamic Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1675-1685\n} \n}" }, { "title": "Episodic Transformer for Vision-and-Language Navigation", @@ -12822,6 +13692,7 @@ "status": "Poster", "track": "main", "pid": 3791, + "author_site": "Alexander Pashevich; Cordelia Schmid; Chen Sun", "author": "Alexander Pashevich; Cordelia Schmid; Chen Sun", "abstract": "Interaction and navigation defined by natural language instructions in dynamic environments pose significant challenges for neural agents. This paper focuses on addressing two challenges: handling long sequence of subtasks, and understanding complex human instructions. We propose Episodic Transformer (E.T.), a multimodal transformer that encodes language inputs and the full episode history of visual observations and actions. 
To improve training, we leverage synthetic instructions as an intermediate representation that decouples understanding the visual appearance of an environment from the variations of natural language instructions. We demonstrate that encoding the history with a transformer is critical to solve compositional tasks, and that pretraining and joint training with synthetic instructions further improve the performance. Our approach sets a new state of the art on the challenging ALFRED benchmark, achieving 38.4% and 8.5% task success rates on seen and unseen test splits.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pashevich_Episodic_Transformer_for_Vision-and-Language_Navigation_ICCV_2021_paper.pdf", @@ -12838,14 +13709,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Pashevich_Episodic_Transformer_for_Vision-and-Language_Navigation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1+2", - "aff_unique_norm": "INRIA;Google;Brown University", + "aff_unique_norm": "Inria;Google;Brown University", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.inria.fr;https://research.google;https://www.brown.edu", "aff_unique_abbr": "Inria;Google Research;Brown", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1+1", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Pashevich_2021_ICCV,\n \n author = {\n Pashevich,\n Alexander and Schmid,\n Cordelia and Sun,\n Chen\n},\n title = {\n Episodic Transformer for Vision-and-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15942-15952\n} \n}" }, { "title": "Equivariant Imaging: Learning Beyond the Range Space", @@ -12853,7 +13725,8 @@ "status": "Poster", "track": "main", "pid": 2251, - "author": "Dongdong Chen; 
Juli\u00e1n Tachella; Mike E. Davies", + "author_site": "Dongdong Chen; Julián Tachella; Mike E. Davies", + "author": "Dongdong Chen; Julián Tachella; Mike E. Davies", "abstract": "In various imaging problems, we only have access to compressed measurements of the underlying signals, hindering most learning-based strategies which usually require pairs of signals and associated measurements for training. Learning only from compressed measurements is impossible in general, as the compressed observations do not contain information outside the range of the forward sensing operator. We propose a new end-to-end self-supervised framework that overcomes this limitation by exploiting the equivariances present in natural signals. Our proposed learning strategy performs as well as fully supervised methods. Experiments demonstrate the potential of this framework on inverse problems including sparse-view X-ray computed tomography on real clinical data and image inpainting on natural images. Code has been made available at: https://github.com/edongdongchen/EI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Equivariant_Imaging_Learning_Beyond_the_Range_Space_ICCV_2021_paper.pdf", "aff": ";;", @@ -12867,7 +13740,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Equivariant_Imaging_Learning_Beyond_the_Range_Space_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Equivariant_Imaging_Learning_Beyond_the_Range_Space_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Dongdong and Tachella,\n Juli\\'an and Davies,\n Mike E.\n},\n title = {\n Equivariant Imaging: Learning Beyond the Range Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4379-4388\n} \n}" }, { "title": "Estimating Egocentric 3D 
Human Pose in Global Space", @@ -12875,6 +13749,7 @@ "status": "Poster", "track": "main", "pid": 3944, + "author_site": "Jian Wang; Lingjie Liu; Weipeng Xu; Kripasindhu Sarkar; Christian Theobalt", "author": "Jian Wang; Lingjie Liu; Weipeng Xu; Kripasindhu Sarkar; Christian Theobalt", "abstract": "Egocentric 3D human pose estimation using a single fisheye camera has become popular recently as it allows capturing a wide range of daily activities in unconstrained environments, which is difficult for traditional outside-in motion capture with external cameras. However, existing methods have several limitations. A prominent problem is that the estimated poses lie in the local coordinate system of the fisheye camera, rather than in the world coordinate system, which is restrictive for many applications. Furthermore, these methods suffer from limited accuracy and temporal instability due to ambiguities caused by the monocular setup and the severe occlusion in a strongly distorted egocentric perspective. To tackle these limitations, we present a new method for egocentric global 3D body pose estimation using a single head-mounted fisheye camera. To achieve accurate and temporally stable global poses, a spatio-temporal optimization is performed over a sequence of frames by minimizing heatmap reprojection errors and enforcing local and global body motion priors learned from a mocap dataset. 
Experimental results show that our approach outperforms state-of-the-art methods both quantitatively and qualitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Estimating_Egocentric_3D_Human_Pose_in_Global_Space_ICCV_2021_paper.pdf", @@ -12891,14 +13766,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Estimating_Egocentric_3D_Human_Pose_in_Global_Space_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;0;1", - "aff_unique_norm": "Max Planck Institute for Informatics;Saarland University;Meta", + "aff_unique_norm": "Max Planck Institute for Informatics;Saarland University;Facebook Reality Labs", "aff_unique_dep": "Informatics;Department of Computer Science;Facebook Reality Labs", "aff_unique_url": "https://www.mpi-inf.mpg.de;https://www.uni-saarland.de;https://www.facebook.com/realitylabs", "aff_unique_abbr": "MPII;Uni Saar;FRL", "aff_campus_unique_index": "1;1", - "aff_campus_unique": ";Saarbr\u00fccken", + "aff_campus_unique": ";Saarbrücken", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Jian and Liu,\n Lingjie and Xu,\n Weipeng and Sarkar,\n Kripasindhu and Theobalt,\n Christian\n},\n title = {\n Estimating Egocentric 3D Human Pose in Global Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11500-11509\n} \n}" }, { "title": "Estimating and Exploiting the Aleatoric Uncertainty in Surface Normal Estimation", @@ -12906,6 +13782,7 @@ "status": "Poster", "track": "main", "pid": 6582, + "author_site": "Gwangbin Bae; Ignas Budvytis; Roberto Cipolla", "author": "Gwangbin Bae; Ignas Budvytis; Roberto Cipolla", "abstract": "Surface normal estimation from a single image is an important task in 3D scene understanding. 
In this paper, we address two limitations shared by the existing methods: the inability to estimate the aleatoric uncertainty and lack of detail in the prediction. The proposed network estimates the per-pixel surface normal probability distribution. We introduce a new parameterization for the distribution, such that its negative log-likelihood is the angular loss with learned attenuation. The expected value of the angular error is then used as a measure of the aleatoric uncertainty. We also present a novel decoder framework where pixel-wise multi-layer perceptrons are trained on a subset of pixels sampled based on the estimated uncertainty. The proposed uncertainty-guided sampling prevents the bias in training towards large planar surfaces and improves the quality of prediction, especially near object boundaries and on small structures. Experimental results show that the proposed method outperforms the state-of-the-art in ScanNet and NYUv2, and that the estimated uncertainty correlates well with the prediction error. 
Code is available at https://github.com/baegwangbin/surface_normal_uncertainty.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bae_Estimating_and_Exploiting_the_Aleatoric_Uncertainty_in_Surface_Normal_Estimation_ICCV_2021_paper.pdf", @@ -12929,7 +13806,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bae_2021_ICCV,\n \n author = {\n Bae,\n Gwangbin and Budvytis,\n Ignas and Cipolla,\n Roberto\n},\n title = {\n Estimating and Exploiting the Aleatoric Uncertainty in Surface Normal Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13137-13146\n} \n}" }, { "title": "EvIntSR-Net: Event Guided Multiple Latent Frames Reconstruction and Super-Resolution", @@ -12937,6 +13815,7 @@ "status": "Poster", "track": "main", "pid": 3711, + "author_site": "Jin Han; Yixin Yang; Chu Zhou; Chao Xu; Boxin Shi", "author": "Jin Han; Yixin Yang; Chu Zhou; Chao Xu; Boxin Shi", "abstract": "An event camera detects the scene radiance changes and sends a sequence of asynchronous event streams with high dynamic range, high temporal resolution, and low latency. However, the spatial resolution of event cameras is limited as a trade-off for these outstanding properties. To reconstruct high-resolution intensity images from event data, we propose EvIntSR-Net that converts event data to multiple latent intensity frames to achieve super-resolution on intensity images in this paper. EvIntSR-Net bridges the domain gap between event streams and intensity frames and learns to merge a sequence of latent intensity frames in a recurrent updating manner. 
Experimental results show that EvIntSR-Net can reconstruct SR intensity images with higher dynamic range and fewer blurry artifacts by fusing events with intensity frames for both simulated and real-world data. Furthermore, the proposed EvIntSR-Net is able to generate high-frame-rate videos with super-resolved frames.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Han_EvIntSR-Net_Event_Guided_Multiple_Latent_Frames_Reconstruction_and_Super-Resolution_ICCV_2021_paper.pdf", @@ -12960,7 +13839,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0+0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2021_ICCV,\n \n author = {\n Han,\n Jin and Yang,\n Yixin and Zhou,\n Chu and Xu,\n Chao and Shi,\n Boxin\n},\n title = {\n EvIntSR-Net: Event Guided Multiple Latent Frames Reconstruction and Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4882-4891\n} \n}" }, { "title": "Event Stream Super-Resolution via Spatiotemporal Constraint Learning", @@ -12968,6 +13848,7 @@ "status": "Poster", "track": "main", "pid": 2823, + "author_site": "Siqi Li; Yutong Feng; Yipeng Li; Yu Jiang; Changqing Zou; Yue Gao", "author": "Siqi Li; Yutong Feng; Yipeng Li; Yu Jiang; Changqing Zou; Yue Gao", "abstract": "Event cameras are bio-inspired sensors that respond to brightness changes asynchronously and output in the form of event streams instead of frame-based images. They own outstanding advantages compared with traditional cameras: higher temporal resolution, higher dynamic range, and lower power consumption. However, the spatial resolution of existing event cameras is insufficient and challenging to be enhanced at the hardware level while maintaining the asynchronous philosophy of circuit design. 
Therefore, it is imperative to explore the algorithm of event stream super-resolution, which is a non-trivial task due to the sparsity and strong spatio-temporal correlation of the events from an event camera. In this paper, we propose an end-to-end framework based on spiking neural network for event stream super-resolution, which can generate high-resolution (HR) event stream from the input low-resolution (LR) event stream. A spatiotemporal constraint learning mechanism is proposed to learn the spatial and temporal distributions of the event stream simultaneously. We validate our method on four large-scale datasets and the results show that our method achieves state-of-the-art performance. The satisfying results on two downstream applications, i.e. object classification and image reconstruction, further demonstrate the usability of our method. To prove the application potential of our method, we deploy it on a mobile platform. The high-quality HR event stream generated by our real-time system demonstrates the effectiveness and efficiency of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Event_Stream_Super-Resolution_via_Spatiotemporal_Constraint_Learning_ICCV_2021_paper.pdf", @@ -12984,14 +13865,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Event_Stream_Super-Resolution_via_Spatiotemporal_Constraint_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;2;0", - "aff_unique_norm": "Tsinghua University;Jilin University;Huawei", - "aff_unique_dep": "School of Software;College of Computer Science and Technology;Huawei Technologies", + "aff_unique_norm": "Tsinghua University;Jilin University;Huawei Technologies", + "aff_unique_dep": "School of Software;College of Computer Science and Technology;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.jlu.edu.cn;https://www.huawei.com/ca-en/", "aff_unique_abbr": "THU;JLU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Siqi and Feng,\n Yutong and Li,\n Yipeng and Jiang,\n Yu and Zou,\n Changqing and Gao,\n Yue\n},\n title = {\n Event Stream Super-Resolution via Spatiotemporal Constraint Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4480-4489\n} \n}" }, { "title": "Event-Based Video Reconstruction Using Transformer", @@ -12999,6 +13881,7 @@ "status": "Poster", "track": "main", "pid": 3240, + "author_site": "Wenming Weng; Yueyi Zhang; Zhiwei Xiong", "author": "Wenming Weng; Yueyi Zhang; Zhiwei Xiong", "abstract": "Event cameras, which output events by detecting spatio-temporal brightness changes, bring a novel paradigm to image sensors with high dynamic range and low latency. Previous works have achieved impressive performances on event-based video reconstruction by introducing convolutional neural networks (CNNs). However, intrinsic locality of convolutional operations is not capable of modeling long-range dependency, which is crucial to many vision tasks. In this paper, we present a hybrid CNN-Transformer network for event-based video reconstruction (ET-Net), which merits the fine local information from CNN and global contexts from Transformer. In addition, we further propose a Token Pyramid Aggregation strategy to implement multi-scale token integration for relating internal and intersected semantic concepts in the token-space. Experimental results demonstrate that our proposed method achieves superior performance over state-of-the-art methods on multiple real-world event datasets. 
The code is available at https://github.com/WarranWeng/ET-Net", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Weng_Event-Based_Video_Reconstruction_Using_Transformer_ICCV_2021_paper.pdf", @@ -13022,7 +13905,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Weng_2021_ICCV,\n \n author = {\n Weng,\n Wenming and Zhang,\n Yueyi and Xiong,\n Zhiwei\n},\n title = {\n Event-Based Video Reconstruction Using Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2563-2572\n} \n}" }, { "title": "Event-Intensity Stereo: Estimating Depth by the Best of Both Worlds", @@ -13030,6 +13914,7 @@ "status": "Poster", "track": "main", "pid": 6135, + "author_site": "Mohammad Mostafavi; Kuk-Jin Yoon; Jonghyun Choi", "author": "Mohammad Mostafavi; Kuk-Jin Yoon; Jonghyun Choi", "abstract": "Event cameras can report scene movements as an asynchronous stream of data called the events. Unlike traditional cameras, event cameras have very low latency (microseconds vs milliseconds) very high dynamic range (140dB vs 60 dB), and low power consumption, as they report changes of a scene and not a complete frame. As they re-port per pixel feature-like events and not the whole intensity frame they are immune to motion blur. However, event cameras require movement between the scene and camera to fire events ,i.e., they have no output when the scene is relatively static. Traditional cameras, however, report the whole frame of pixels at once in fixed intervals but have lower dynamic range and are prone to motion blur in case of rapid movements. We get the best from both worlds and use events and intensity images together in our complementary design and estimate dense disparity from this combination. 
The proposed end-to-end design combines events and images in a sequential manner and correlates them to esti-mate dense depth values. Our various experimental settings in real-world and simulated scenarios exploit the superiority of our method in predicting accurate depth values with fine details. We further extend our method to extreme cases of missing the left or right event or stereo pair and also investigate stereo depth estimation with inconsistent dynamic ranges or event thresholds on the left and right pairs", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mostafavi_Event-Intensity_Stereo_Estimating_Depth_by_the_Best_of_Both_Worlds_ICCV_2021_paper.pdf", @@ -13053,7 +13938,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Gwangju;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Mostafavi_2021_ICCV,\n \n author = {\n Mostafavi,\n Mohammad and Yoon,\n Kuk-Jin and Choi,\n Jonghyun\n},\n title = {\n Event-Intensity Stereo: Estimating Depth by the Best of Both Worlds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4258-4267\n} \n}" }, { "title": "EventHPE: Event-Based 3D Human Pose and Shape Estimation", @@ -13061,6 +13947,7 @@ "status": "Poster", "track": "main", "pid": 5665, + "author_site": "Shihao Zou; Chuan Guo; Xinxin Zuo; Sen Wang; Pengyu Wang; Xiaoqin Hu; Shoushun Chen; Minglun Gong; Li Cheng", "author": "Shihao Zou; Chuan Guo; Xinxin Zuo; Sen Wang; Pengyu Wang; Xiaoqin Hu; Shoushun Chen; Minglun Gong; Li Cheng", "abstract": "Event camera is an emerging imaging sensor for capturing dynamics of moving objects as events, which motivates our work in estimating 3D human pose and shape from the event signals. 
Events, on the other hand, have their unique challenges: rather than capturing static body postures, the event signals are best at capturing local motions. This leads us to propose a two-stage deep learning approach, called EventHPE. The first-stage, FlowNet, is trained by unsupervised learning to infer optical flow from events. Both events and optical flow are closely related to human body dynamics, which are fed as input to the ShapeNet in the second stage, to estimate 3D human shapes. To mitigate the discrepancy between image-based flow (optical flow) and shape-based flow (vertices movement of human body shape), a novel flow coherence loss is introduced by exploiting the fact that both flows are originated from the identical human motion. An in-house event-based 3D human dataset is curated that comes with 3D pose and shape annotations, which is by far the largest one to our knowledge. Empirical evaluations on DHP19 dataset and our in-house dataset demonstrate the effectiveness of our approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zou_EventHPE_Event-Based_3D_Human_Pose_and_Shape_Estimation_ICCV_2021_paper.pdf", @@ -13084,7 +13971,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0+0;1;3;0;0", - "aff_country_unique": "Canada;China;;Singapore" + "aff_country_unique": "Canada;China;;Singapore", + "bibtex": "@InProceedings{Zou_2021_ICCV,\n \n author = {\n Zou,\n Shihao and Guo,\n Chuan and Zuo,\n Xinxin and Wang,\n Sen and Wang,\n Pengyu and Hu,\n Xiaoqin and Chen,\n Shoushun and Gong,\n Minglun and Cheng,\n Li\n},\n title = {\n EventHPE: Event-Based 3D Human Pose and Shape Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10996-11005\n} \n}" }, { "title": "EventHands: Real-Time Neural 3D Hand Pose Estimation From an Event Stream", @@ -13092,6 +13980,7 @@ "status": 
"Poster", "track": "main", "pid": 3126, + "author_site": "Viktor Rudnev; Vladislav Golyanik; Jiayi Wang; Hans-Peter Seidel; Franziska Mueller; Mohamed Elgharib; Christian Theobalt", "author": "Viktor Rudnev; Vladislav Golyanik; Jiayi Wang; Hans-Peter Seidel; Franziska Mueller; Mohamed Elgharib; Christian Theobalt", "abstract": "3D hand pose estimation from monocular videos is a long-standing and challenging problem, which is now seeing a strong upturn. In this work, we address it for the first time using a single event camera, i.e., an asynchronous vision sensor reacting on brightness changes. Our EventHands approach has characteristics previously not demonstrated with a single RGB or depth camera such as high temporal resolution at low data throughputs and real-time performance at 1000 Hz. Due to the different data modality of event cameras compared to classical cameras, existing methods cannot be directly applied to and re-trained for event streams. We thus design a new neural approach which accepts a new event stream representation suitable for learning, which is trained on newly-generated synthetic event streams and can generalise to real data. Experiments show that EventHands outperforms recent monocular methods using a colour (or depth) camera in terms of accuracy and its ability to capture hand motions of unprecedented speed. 
Our method, the event stream simulator and the dataset are publicly available (see https://gvv.mpi-inf.mpg.de/projects/EventHands/).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rudnev_EventHands_Real-Time_Neural_3D_Hand_Pose_Estimation_From_an_Event_ICCV_2021_paper.pdf", @@ -13106,7 +13995,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rudnev_EventHands_Real-Time_Neural_3D_Hand_Pose_Estimation_From_an_Event_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rudnev_EventHands_Real-Time_Neural_3D_Hand_Pose_Estimation_From_an_Event_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rudnev_2021_ICCV,\n \n author = {\n Rudnev,\n Viktor and Golyanik,\n Vladislav and Wang,\n Jiayi and Seidel,\n Hans-Peter and Mueller,\n Franziska and Elgharib,\n Mohamed and Theobalt,\n Christian\n},\n title = {\n EventHands: Real-Time Neural 3D Hand Pose Estimation From an Event Stream\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12385-12395\n} \n}" }, { "title": "Evidential Deep Learning for Open Set Action Recognition", @@ -13114,6 +14004,7 @@ "status": "Poster", "track": "main", "pid": 1961, + "author_site": "Wentao Bao; Qi Yu; Yu Kong", "author": "Wentao Bao; Qi Yu; Yu Kong", "abstract": "In a real-world scenario, human actions are typically out of the distribution from training data, which requires a model to both recognize the known actions and reject the unknown. Different from image data, video actions are more challenging to be recognized in an open-set setting due to the uncertain temporal dynamics and static bias of human actions. In this paper, we propose a Deep Evidential Action Recognition (DEAR) method to recognize actions in an open testing set. 
Specifically, we formulate the action recognition problem from the evidential deep learning (EDL) perspective and propose a novel model calibration method to regularize the EDL training. Besides, to mitigate the static bias of video representation, we propose a plug-and-play module to debias the learned representation through contrastive learning. Experimental results show that our DEAR method achieves consistent performance gain on multiple mainstream action recognition models and benchmarks. Code and pre-trained models are available at https://www.rit.edu/actionlab/dear.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bao_Evidential_Deep_Learning_for_Open_Set_Action_Recognition_ICCV_2021_paper.pdf", @@ -13137,7 +14028,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Rochester", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bao_2021_ICCV,\n \n author = {\n Bao,\n Wentao and Yu,\n Qi and Kong,\n Yu\n},\n title = {\n Evidential Deep Learning for Open Set Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13349-13358\n} \n}" }, { "title": "Evolving Search Space for Neural Architecture Search", @@ -13145,6 +14037,7 @@ "status": "Poster", "track": "main", "pid": 6579, + "author_site": "Yuanzheng Ci; Chen Lin; Ming Sun; Boyu Chen; Hongwen Zhang; Wanli Ouyang", "author": "Yuanzheng Ci; Chen Lin; Ming Sun; Boyu Chen; Hongwen Zhang; Wanli Ouyang", "abstract": "Automation of neural architecture design has been a coveted alternative to human experts. Various search methods have been proposed aiming to find the optimal architecture in the search space. One would expect the search results to improve when the search space grows larger since it would potentially contain more performant candidates. 
Surprisingly, we observe that enlarging search space is unbeneficial or even detrimental to existing NAS methods such as DARTS, ProxylessNAS, and SPOS. This counterintuitive phenomenon suggests that enabling existing methods to large search space regimes is non-trivial. However, this problem is less discussed in the literature. We present a Neural Search-space Evolution (NSE) scheme, the first neural architecture search scheme designed especially for large space neural architecture search problems. The necessity of a well-designed search space with constrained size is a tacit consent in existing methods, and our NSE aims at minimizing such necessity. Specifically, the NSE starts with a search space subset, then evolves the search space by repeating two steps: 1) search an optimized space from the search space subset, 2) refill this subset from a large pool of operations that are not traversed. We further extend the flexibility of obtainable architectures by introducing a learnable multi-branch setting. With the proposed method, we achieve 77.3% top-1 retrain accuracy on ImageNet with 333M FLOPs, which yielded a state-of-the-art performance among previous auto-generated architectures that do not involve knowledge distillation or weight pruning. When the latency constraint is adopted, our result also performs better than the previous best-performing mobile models with a 77.9% Top-1 retrain accuracy. 
Code is available at https://github.com/orashi/NSE_NAS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ci_Evolving_Search_Space_for_Neural_Architecture_Search_ICCV_2021_paper.pdf", @@ -13168,7 +14061,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;0;0", - "aff_country_unique": "Australia;United Kingdom;China" + "aff_country_unique": "Australia;United Kingdom;China", + "bibtex": "@InProceedings{Ci_2021_ICCV,\n \n author = {\n Ci,\n Yuanzheng and Lin,\n Chen and Sun,\n Ming and Chen,\n Boyu and Zhang,\n Hongwen and Ouyang,\n Wanli\n},\n title = {\n Evolving Search Space for Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6659-6669\n} \n}" }, { "title": "Excavating the Potential Capacity of Self-Supervised Monocular Depth Estimation", @@ -13176,6 +14070,7 @@ "status": "Poster", "track": "main", "pid": 3898, + "author_site": "Rui Peng; Ronggang Wang; Yawen Lai; Luyang Tang; Yangang Cai", "author": "Rui Peng; Ronggang Wang; Yawen Lai; Luyang Tang; Yangang Cai", "abstract": "Self-supervised methods play an increasingly important role in monocular depth estimation due to their great potential and low annotation cost. To close the gap with supervised methods, recent works take advantage of extra constraints, e.g., semantic segmentation. However, these methods will inevitably increase the burden on the model. In this paper, we show theoretical and empirical evidence that the potential capacity of self-supervised monocular depth estimation can be excavated without increasing this cost. 
In particular, we propose (1) a novel data augmentation approach called data grafting, which forces the model to explore more cues to infer depth besides the vertical image position, (2) an exploratory self-distillation loss, which is supervised by the self-distillation label generated by our new post-processing method - selective post-processing, and (3) the full-scale network, designed to endow the encoder with the specialization of depth estimation task and enhance the representational power of the model. Extensive experiments show that our contributions can bring significant performance improvement to the baseline with even less computational overhead, and our model, named EPCDepth, surpasses the previous state-of-the-art methods even those supervised by additional constraints.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Peng_Excavating_the_Potential_Capacity_of_Self-Supervised_Monocular_Depth_Estimation_ICCV_2021_paper.pdf", @@ -13199,7 +14094,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2021_ICCV,\n \n author = {\n Peng,\n Rui and Wang,\n Ronggang and Lai,\n Yawen and Tang,\n Luyang and Cai,\n Yangang\n},\n title = {\n Excavating the Potential Capacity of Self-Supervised Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15560-15569\n} \n}" }, { "title": "Explain Me the Painting: Multi-Topic Knowledgeable Art Description Generation", @@ -13207,6 +14103,7 @@ "status": "Poster", "track": "main", "pid": 5743, + "author_site": "Zechen Bai; Yuta Nakashima; Noa Garcia", "author": "Zechen Bai; Yuta Nakashima; Noa Garcia", "abstract": "Have you ever looked at a painting and wondered what is the story behind it? 
This work presents a framework to bring art closer to people by generating comprehensive descriptions of fine-art paintings. Generating informative descriptions for artworks, however, is extremely challenging, as it requires to 1) describe multiple aspects of the image such as its style, content, or composition, and 2) provide background and contextual knowledge about the artist, their influences, or the historical period. To address these challenges, we introduce a multi-topic and knowledgeable art description framework, which modules the generated sentences according to three artistic topics and, additionally, enhances each description with external knowledge. The framework is validated through an exhaustive analysis, both quantitative and qualitative, as well as a comparative human evaluation, demonstrating outstanding results in terms of both topic diversity and information veracity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bai_Explain_Me_the_Painting_Multi-Topic_Knowledgeable_Art_Description_Generation_ICCV_2021_paper.pdf", @@ -13230,7 +14127,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Bai_2021_ICCV,\n \n author = {\n Bai,\n Zechen and Nakashima,\n Yuta and Garcia,\n Noa\n},\n title = {\n Explain Me the Painting: Multi-Topic Knowledgeable Art Description Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5422-5432\n} \n}" }, { "title": "Explainable Person Re-Identification With Attribute-Guided Metric Distillation", @@ -13238,6 +14136,7 @@ "status": "Poster", "track": "main", "pid": 2909, + "author_site": "Xiaodong Chen; Xinchen Liu; Wu Liu; Xiao-Ping Zhang; Yongdong Zhang; Tao Mei", "author": "Xiaodong Chen; Xinchen Liu; Wu Liu; Xiao-Ping Zhang; Yongdong Zhang; Tao 
Mei", "abstract": "Despite the great progress of person re-identification (ReID) with the adoption of Convolutional Neural Networks, current ReID models are opaque and only outputs a scalar distance between two persons. There are few methods providing users semantically understandable explanations for why two persons are the same one or not. In this paper, we propose a post-hoc method, named Attribute-guided Metric Distillation (AMD), to explain existing ReID models. This is the first method to explore attributes to answer: 1) what and where the attributes make two persons different, and 2) how much each attribute contributes to the difference. In AMD, we design a pluggable interpreter network for target models to generate quantitative contributions of attributes and visualize accurate attention maps of the most discriminative attributes. To achieve this goal, we propose a metric distillation loss by which the interpreter learns to decompose the distance of two persons into components of attributes with knowledge distilled from the target model. Moreover, we propose an attribute prior loss to make the interpreter generate attribute-guided attention maps and to eliminate biases caused by the imbalanced distribution of attributes. This loss can guide the interpreter to focus on the exclusive and discriminative attributes rather than the large-area but common attributes of two persons. Comprehensive experiments show that the interpreter can generate effective and intuitive explanations for varied models and generalize well under cross-domain settings. 
As a by-product, the accuracy of target models can be further improved with our interpreter.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Explainable_Person_Re-Identification_With_Attribute-Guided_Metric_Distillation_ICCV_2021_paper.pdf", @@ -13254,14 +14153,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Explainable_Person_Re-Identification_With_Attribute-Guided_Metric_Distillation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;2;0;1", - "aff_unique_norm": "University of Science and Technology of China;JD;Ryerson University", - "aff_unique_dep": ";JD AI Research;", + "aff_unique_norm": "University of Science and Technology of China;JD AI Research;Ryerson University", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.jd.com;https://www.ryerson.ca", "aff_unique_abbr": "USTC;JD AI;Ryerson", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Xiaodong and Liu,\n Xinchen and Liu,\n Wu and Zhang,\n Xiao-Ping and Zhang,\n Yongdong and Mei,\n Tao\n},\n title = {\n Explainable Person Re-Identification With Attribute-Guided Metric Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11813-11822\n} \n}" }, { "title": "Explainable Video Entailment With Grounded Visual Evidence", @@ -13269,6 +14169,7 @@ "status": "Poster", "track": "main", "pid": 5968, + "author_site": "Junwen Chen; Yu Kong", "author": "Junwen Chen; Yu Kong", "abstract": "Video entailment aims at determining if a hypothesis textual statement is entailed or contradicted by a premise video. 
The main challenge of video entailment is that it requires fine-grained reasoning to understand the complex and long story-based videos. To this end, we propose to incorporate visual grounding to the entailment by explicitly linking the entities described in the statement to the evidence in the video. If the entities are grounded in the video, we enhance the entailment judgment by focusing on the frames where the entities occur. Besides, in entailment dataset, the real/fake statements are formed in pairs with subtle discrepancy, which allows an add-on explanation module to predict which words or phrases make the statement contradictory to the video and regularize the training of the entailment judgment. Experimental results demonstrate that our approach significantly outperforms the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Explainable_Video_Entailment_With_Grounded_Visual_Evidence_ICCV_2021_paper.pdf", @@ -13292,7 +14193,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Rochester", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Junwen and Kong,\n Yu\n},\n title = {\n Explainable Video Entailment With Grounded Visual Evidence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2021-2030\n} \n}" }, { "title": "Explaining Local, Global, and Higher-Order Interactions in Deep Learning", @@ -13300,6 +14202,7 @@ "status": "Poster", "track": "main", "pid": 7033, + "author_site": "Samuel Lerman; Charles Venuto; Henry Kautz; Chenliang Xu", "author": "Samuel Lerman; Charles Venuto; Henry Kautz; Chenliang Xu", "abstract": "We present a simple yet highly generalizable method for explaining interacting parts within a neural network's reasoning process. 
First, we design an algorithm based on cross derivatives for computing statistical interaction effects between individual features, which is generalized to both 2-way and higher-order (3-way or more) interactions. We present results side by side with a weight-based attribution technique, corroborating that cross derivatives are a superior metric for both 2-way and higher-order interaction detection. Moreover, we extend the use of cross derivatives as an explanatory device in neural networks to the computer vision setting by expanding Grad-CAM, a popular gradient-based explanatory tool for CNNs, to the higher order. While Grad-CAM can only explain the importance of individual objects in images, our method, which we call Taylor-CAM, can explain a neural network's relational reasoning across multiple objects. We show the success of our explanations both qualitatively and quantitatively, including with a user study. We will release all code as a tool package to facilitate explainable deep learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lerman_Explaining_Local_Global_and_Higher-Order_Interactions_in_Deep_Learning_ICCV_2021_paper.pdf", @@ -13323,7 +14226,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lerman_2021_ICCV,\n \n author = {\n Lerman,\n Samuel and Venuto,\n Charles and Kautz,\n Henry and Xu,\n Chenliang\n},\n title = {\n Explaining Local,\n Global,\n and Higher-Order Interactions in Deep Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1224-1233\n} \n}" }, { "title": "Explaining in Style: Training a GAN To Explain a Classifier in StyleSpace", @@ -13331,6 +14235,7 @@ "status": "Poster", "track": "main", "pid": 7751, + "author_site": "Oran Lang; Yossi 
Gandelsman; Michal Yarom; Yoav Wald; Gal Elidan; Avinatan Hassidim; William T. Freeman; Phillip Isola; Amir Globerson; Michal Irani; Inbar Mosseri", "author": "Oran Lang; Yossi Gandelsman; Michal Yarom; Yoav Wald; Gal Elidan; Avinatan Hassidim; William T. Freeman; Phillip Isola; Amir Globerson; Michal Irani; Inbar Mosseri", "abstract": "Image classification models can depend on multiple different semantic attributes of the image. An explanation of the decision of the classifier needs to both discover and visualize these properties. Here we present StylEx, a method for doing this, by training a generative model to specifically explain multiple attributes that underlie classifier decisions. A natural source for such attributes is the StyleSpace of StyleGAN, which is known to generate semantically meaningful dimensions in the image. However, because standard GAN training is not dependent on the classifier, it may not represent those attributes which are important for the classifier decision, and the dimensions of StyleSpace may represent irrelevant attributes. To overcome this, we propose a training procedure for a StyleGAN, which incorporates the classifier model, in order to learn a classifier-specific StyleSpace. Explanatory attributes are then selected from this space. These can be used to visualize the effect of changing multiple attributes per image, thus providing image-specific explanations. We apply StylEx to multiple domains, including animals, leaves, faces and retinal images. For these, we show how an image can be modified in different ways to change its classifier output. 
Our results show that the method finds attributes that align well with semantic ones, generate meaningful image-specific explanations, and are human-interpretable as measured in user-studies.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lang_Explaining_in_Style_Training_a_GAN_To_Explain_a_Classifier_ICCV_2021_paper.pdf", @@ -13345,7 +14250,8 @@ "aff_domain": ";;;;;;;;;;", "email": ";;;;;;;;;;", "author_num": 11, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lang_Explaining_in_Style_Training_a_GAN_To_Explain_a_Classifier_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lang_Explaining_in_Style_Training_a_GAN_To_Explain_a_Classifier_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lang_2021_ICCV,\n \n author = {\n Lang,\n Oran and Gandelsman,\n Yossi and Yarom,\n Michal and Wald,\n Yoav and Elidan,\n Gal and Hassidim,\n Avinatan and Freeman,\n William T. and Isola,\n Phillip and Globerson,\n Amir and Irani,\n Michal and Mosseri,\n Inbar\n},\n title = {\n Explaining in Style: Training a GAN To Explain a Classifier in StyleSpace\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 693-702\n} \n}" }, { "title": "Explanations for Occluded Images", @@ -13353,10 +14259,11 @@ "status": "Poster", "track": "main", "pid": 10420, + "author_site": "Hana Chockler; Daniel Kroening; Youcheng Sun", "author": "Hana Chockler; Daniel Kroening; Youcheng Sun", "abstract": "Existing algorithms for explaining the output of image classifiers perform poorly on inputs where the object of interest is partially occluded. We present a novel, black-box algorithm for computing explanations that uses a principled approach based on causal theory. We have implemented the method in the DeepCover tool. 
We obtain explanations that are much more accurate than those generated by the existing explanation tools on images with occlusions and observe a level of performance comparable to the state of the art when explaining images without occlusions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chockler_Explanations_for_Occluded_Images_ICCV_2021_paper.pdf", - "aff": "causaLens + King\u2019s College London; Amazon.com, Inc.; Queen\u2019s University Belfast", + "aff": "causaLens + King’s College London; Amazon.com, Inc.; Queen’s University Belfast", "project": "", "github": "", "supp": "", @@ -13369,14 +14276,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chockler_Explanations_for_Occluded_Images_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;3", - "aff_unique_norm": "causaLens;King's College London;Amazon;Queen's University Belfast", - "aff_unique_dep": ";;Amazon.com, Inc.;", + "aff_unique_norm": "causaLens;King's College London;Amazon.com, Inc.;Queen's University Belfast", + "aff_unique_dep": ";;;", "aff_unique_url": ";https://www.kcl.ac.uk;https://www.amazon.com;https://www.qub.ac.uk", "aff_unique_abbr": ";KCL;Amazon;QUB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;1", - "aff_country_unique": ";United Kingdom;United States" + "aff_country_unique": ";United Kingdom;United States", + "bibtex": "@InProceedings{Chockler_2021_ICCV,\n \n author = {\n Chockler,\n Hana and Kroening,\n Daniel and Sun,\n Youcheng\n},\n title = {\n Explanations for Occluded Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1234-1243\n} \n}" }, { "title": "Exploiting Explanations for Model Inversion Attacks", @@ -13384,6 +14292,7 @@ "status": "Poster", "track": "main", "pid": 7119, + "author_site": "Xuejun Zhao; Wencan Zhang; Xiaokui Xiao; Brian Lim", "author": "Xuejun Zhao; 
Wencan Zhang; Xiaokui Xiao; Brian Lim", "abstract": "The successful deployment of artificial intelligence (AI) in many domains from healthcare to hiring requires their responsible use, particularly in model explanations and privacy. Explainable artificial intelligence (XAI) provides more information to help users to understand model decisions, yet this additional knowledge exposes additional risks for privacy attacks. Hence, providing explanation harms privacy. We study this risk for image-based model inversion attacks and identified several attack architectures with increasing performance to reconstruct private image data from model explanations. We have developed several multi-modal transposed CNN architectures that achieve significantly higher inversion performance than using the target model prediction only. These XAI-aware inversion models were designed to exploit the spatial knowledge in image explanations. To understand which explanations have higher privacy risk, we analyzed how various explanation types and factors influence inversion performance. In spite of some models not providing explanations, we further demonstrate increased inversion performance even for non-explainable target models by exploiting explanations of surrogate models through attention transfer. This method first inverts an explanation from the target prediction, then reconstructs the target image. 
These threats highlight the urgent and significant privacy risks of explanations and calls attention for new privacy preservation techniques that balance the dual-requirement for AI explainability and privacy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Exploiting_Explanations_for_Model_Inversion_Attacks_ICCV_2021_paper.pdf", @@ -13407,7 +14316,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Xuejun and Zhang,\n Wencan and Xiao,\n Xiaokui and Lim,\n Brian\n},\n title = {\n Exploiting Explanations for Model Inversion Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 682-692\n} \n}" }, { "title": "Exploiting Multi-Object Relationships for Detecting Adversarial Attacks in Complex Scenes", @@ -13415,6 +14325,7 @@ "status": "Poster", "track": "main", "pid": 8497, + "author_site": "Mingjun Yin; Shasha Li; Zikui Cai; Chengyu Song; M. Salman Asif; Amit K. Roy-Chowdhury; Srikanth V. Krishnamurthy", "author": "Mingjun Yin; Shasha Li; Zikui Cai; Chengyu Song; M. Salman Asif; Amit K. Roy-Chowdhury; Srikanth V. Krishnamurthy", "abstract": "Vision systems that deploy Deep Neural Networks (DNNs) are known to be vulnerable to adversarial examples. Recent research has shown that checking the intrinsic consistencies in the input data is a promising way to detect adversarial attacks (e.g., by checking the object co-occurrence relationships in complex scenes). However, existing approaches are tied to specific models and do not offer generalizability. 
Motivated by the observation that language descriptions of natural scene images have already captured the object co-occurrence relationships that can be learned by a language model, we develop a novel approach to perform context consistency checks using such language models. The distinguishing aspect of our approach is that it is independent of the deployed object detector and yet offers very high accuracy in terms of detecting adversarial examples in practical scenes with multiple objects. Experiments on the PASCAL VOC and MS COCO datasets show that our method can outperform state-of-the-art methods in detecting adversarial attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yin_Exploiting_Multi-Object_Relationships_for_Detecting_Adversarial_Attacks_in_Complex_Scenes_ICCV_2021_paper.pdf", @@ -13438,7 +14349,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Riverside", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yin_2021_ICCV,\n \n author = {\n Yin,\n Mingjun and Li,\n Shasha and Cai,\n Zikui and Song,\n Chengyu and Asif,\n M. Salman and Roy-Chowdhury,\n Amit K. 
and Krishnamurthy,\n Srikanth V.\n},\n title = {\n Exploiting Multi-Object Relationships for Detecting Adversarial Attacks in Complex Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7858-7867\n} \n}" }, { "title": "Exploiting Sample Correlation for Crowd Counting With Multi-Expert Network", @@ -13446,6 +14358,7 @@ "status": "Poster", "track": "main", "pid": 3025, + "author_site": "Xinyan Liu; Guorong Li; Zhenjun Han; Weigang Zhang; Yifan Yang; Qingming Huang; Nicu Sebe", "author": "Xinyan Liu; Guorong Li; Zhenjun Han; Weigang Zhang; Yifan Yang; Qingming Huang; Nicu Sebe", "abstract": "Crowd counting is a difficult task because of the diversity of scenes. Most of the existing crowd counting methods adopt complex structures with massive backbones to enhance the generalization ability. Unfortunately, the performance of existing methods on large-scale data sets is not satisfactory. In order to handle various scenarios with less complex network, we explored how to efficiently use the multi-expert model for crowd counting tasks. We mainly focus on how to train more efficient expert networks and how to choose the most suitable expert. Specifically, we propose a task-driven similarity metric based on sample's mutual enhancement, referred as co-fine-tune similarity, which can find a more efficient subset of data for training the expert network. Similar samples are considered as a cluster which is used to obtain parameters of an expert. Besides, to make better use of the proposed method, we design a simple network called FPN with Deconvolution Counting Network, which is a more suitable base model for the multi-expert counting network. Experimental results show that multiple experts FDC (MFDC) achieves the best performance on four public data sets, including the large scale NWPU-Crowd data set. 
Furthermore, the MFDC trained on an extensive dense crowd data set can generalize well on the other data sets without extra training or fine-tuning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Exploiting_Sample_Correlation_for_Crowd_Counting_With_Multi-Expert_Network_ICCV_2021_paper.pdf", @@ -13469,7 +14382,8 @@ "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Weihai", "aff_country_unique_index": "0;0+0;0;0;0;0+0;1", - "aff_country_unique": "China;Italy" + "aff_country_unique": "China;Italy", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Xinyan and Li,\n Guorong and Han,\n Zhenjun and Zhang,\n Weigang and Yang,\n Yifan and Huang,\n Qingming and Sebe,\n Nicu\n},\n title = {\n Exploiting Sample Correlation for Crowd Counting With Multi-Expert Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3215-3224\n} \n}" }, { "title": "Exploiting Scene Graphs for Human-Object Interaction Detection", @@ -13477,6 +14391,7 @@ "status": "Poster", "track": "main", "pid": 10998, + "author_site": "Tao He; Lianli Gao; Jingkuan Song; Yuan-Fang Li", "author": "Tao He; Lianli Gao; Jingkuan Song; Yuan-Fang Li", "abstract": "Human-Object Interaction (HOI) detection is a fundamental visual task aiming at localizing and recognizing interactions between humans and objects. Existing works focus on the visual and linguistic features of humans and objects. However, they do not captalise on the high-level and semantic relationships present in the image, which provides crucial contextual and detailed relational knowledge for HOI inference. We propose a novel method to exploit this information, through the scene graph, for the HumanObject Interaction (SG2HOI) detection task. 
Our method, SG2HOI, incorporates the SG information in two ways: (1) we embed a scene graph into a global context clue, serving as the scene-specific environmental context; and (2) we build a relation-aware message-passing module to gather relationships from objects' neighborhood and transfer them into interactions. Empirical evaluation shows that our SG2HOI method outperforms the state-of-the-art methods on two benchmark HOI datasets: V-COCO and HICO-DET. Code will be available at https://github.com/ht014/SG2HOI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_Exploiting_Scene_Graphs_for_Human-Object_Interaction_Detection_ICCV_2021_paper.pdf", @@ -13500,7 +14415,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Tao and Gao,\n Lianli and Song,\n Jingkuan and Li,\n Yuan-Fang\n},\n title = {\n Exploiting Scene Graphs for Human-Object Interaction Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15984-15993\n} \n}" }, { "title": "Exploiting a Joint Embedding Space for Generalized Zero-Shot Semantic Segmentation", @@ -13508,6 +14424,7 @@ "status": "Poster", "track": "main", "pid": 8167, + "author_site": "Donghyeon Baek; Youngmin Oh; Bumsub Ham", "author": "Donghyeon Baek; Youngmin Oh; Bumsub Ham", "abstract": "We address the problem of generalized zero-shot semantic segmentation (GZS3) predicting pixel-wise semantic labels for seen and unseen classes. Most GZS3 methods adopt a generative approach that synthesizes visual features of unseen classes from corresponding semantic ones (e.g., word2vec) to train novel classifiers for both seen and unseen classes. 
Although generative methods show decent performance, they have two limitations: (1) the visual features are biased towards seen classes; (2) the classifier should be retrained whenever novel unseen classes appear. We propose a discriminative approach to address these limitations in a unified framework. To this end, we leverage visual and semantic encoders to learn a joint embedding space, where the semantic encoder transforms semantic features to semantic prototypes that act as centers for visual features of corresponding classes. Specifically, we introduce boundary-aware regression (BAR) and semantic consistency (SC) losses to learn discriminative features. Our approach to exploiting the joint embedding space, together with BAR and SC terms, alleviates the seen bias problem. At test time, we avoid the retraining process by exploiting semantic prototypes as a nearest-neighbor (NN) classifier. To further alleviate the bias problem, we also propose an inference technique, dubbed Apollonius calibration (AC), that modulates the decision boundary of the NN classifier to the Apollonius circle adaptively. 
Experimental results demonstrate the effectiveness of our framework, achieving a new state of the art on standard benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Baek_Exploiting_a_Joint_Embedding_Space_for_Generalized_Zero-Shot_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -13531,7 +14448,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Baek_2021_ICCV,\n \n author = {\n Baek,\n Donghyeon and Oh,\n Youngmin and Ham,\n Bumsub\n},\n title = {\n Exploiting a Joint Embedding Space for Generalized Zero-Shot Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9536-9545\n} \n}" }, { "title": "Exploration and Estimation for Model Compression", @@ -13539,6 +14457,7 @@ "status": "Poster", "track": "main", "pid": 7678, + "author_site": "Yanfu Zhang; Shangqian Gao; Heng Huang", "author": "Yanfu Zhang; Shangqian Gao; Heng Huang", "abstract": "Deep neural networks achieve great success in many visual recognition tasks. However, the model deployment is usually subject to some computational resources. Model pruning under computational budget has attracted growing attention. In this paper, we focus on the discrimination-aware compression of Convolutional Neural Networks (CNNs). In prior arts, directly searching the optimal sub-network is an integer programming problem, which is non-smooth, non-convex, and NP-hard. Meanwhile, the heuristic pruning criterion lacks clear interpretability and doesn't generalize well in applications. To address this problem, we formulate sub-networks as samples from a multivariate Bernoulli distribution and resort to the approximation of continuous problem. We propose a new flexible search scheme via alternating exploration and estimation. 
In the exploration step, we employ stochastic gradient Hamiltonian Monte Carlo with budget-awareness to generate sub-networks, which allows large search space with efficient computation. In the estimation step, we deduce the sub-network sampler to a near-optimal point, to promote the generation of high-quality sub-networks. Unifying the exploration and estimation, our approach avoids early falling into local minimum via a fast gradient-based search in a larger space. Extensive experiments on CIFAR-10 and ImageNet show that our method achieves state-of-the-art performances on pruning several popular CNNs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Exploration_and_Estimation_for_Model_Compression_ICCV_2021_paper.pdf", @@ -13555,14 +14474,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Exploration_and_Estimation_for_Model_Compression_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1", - "aff_unique_norm": "University of Pittsburgh;JD", - "aff_unique_dep": "Electrical and Computer Engineering;JD Explore Academy", + "aff_unique_norm": "University of Pittsburgh;JD Explore Academy", + "aff_unique_dep": "Electrical and Computer Engineering;", "aff_unique_url": "https://www.pitt.edu;", "aff_unique_abbr": "Pitt;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yanfu and Gao,\n Shangqian and Huang,\n Heng\n},\n title = {\n Exploration and Estimation for Model Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 487-496\n} \n}" }, { "title": "Exploring Classification Equilibrium in Long-Tailed Object Detection", @@ -13570,6 +14490,7 @@ "status": "Poster", "track": "main", "pid": 
3838, + "author_site": "Chengjian Feng; Yujie Zhong; Weilin Huang", "author": "Chengjian Feng; Yujie Zhong; Weilin Huang", "abstract": "The conventional detectors tend to make imbalanced classification and suffer performance drop, when the distribution of the training data is severely skewed. In this paper, we propose to use the mean classification score to indicate the classification accuracy for each category during training. Based on this indicator, we balance the classification via an Equilibrium Loss (EBL) and a Memory-augmented Feature Sampling (MFS) method. Specifically, EBL increases the intensity of the adjustment of the decision boundary for the weak classes by a designed score-guided loss margin between any two classes. On the other hand, MFS improves the frequency and accuracy of the adjustments of the decision boundary for the weak classes through over-sampling the instance features of those classes. Therefore, EBL and MFS work collaboratively for finding the classification equilibrium in long-tailed detection, and dramatically improve the performance of tail classes while maintaining or even improving the performance of head classes. We conduct experiments on LVIS using Mask R-CNN with various backbones including ResNet-50-FPN and ResNet-101-FPN to show the superiority of the proposed method. It improves the detection performance of tail classes by 15.6 AP, and outperforms the most recent long-tailed object detectors by more than 1 AP. 
Code is available at https://github.com/fcjian/LOCE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_Exploring_Classification_Equilibrium_in_Long-Tailed_Object_Detection_ICCV_2021_paper.pdf", @@ -13593,7 +14514,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Chengjian and Zhong,\n Yujie and Huang,\n Weilin\n},\n title = {\n Exploring Classification Equilibrium in Long-Tailed Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3417-3426\n} \n}" }, { "title": "Exploring Cross-Image Pixel Contrast for Semantic Segmentation", @@ -13601,6 +14523,7 @@ "status": "Poster", "track": "main", "pid": 6230, + "author_site": "Wenguan Wang; Tianfei Zhou; Fisher Yu; Jifeng Dai; Ender Konukoglu; Luc Van Gool", "author": "Wenguan Wang; Tianfei Zhou; Fisher Yu; Jifeng Dai; Ender Konukoglu; Luc Van Gool", "abstract": "Current semantic segmentation methods focus only on mining \"local\" context, i.e., dependencies between pixels within individual images, by context-aggregation modules (e.g., dilated convolution, neural attention) or structure-aware optimization criteria (e.g., IoU-like loss). However, they ignore \"global\" context of the training data, i.e., rich semantic relations between pixels across different images. Inspired by recent advance in unsupervised contrastive representation learning, we propose a pixel-wise contrastive algorithm for semantic segmentation in the fully supervised setting. The core idea is to enforce pixel embeddings belonging to a same semantic class to be more similar than embeddings from different classes. 
It raises a pixel-wise metric learning paradigm for semantic segmentation, by explicitly exploring the structures of labeled pixels, which were rarely explored before. Our method can be effortlessly incorporated into existing segmentation frameworks without extra overhead during testing. We experimentally show that, with famous segmentation models (i.e., DeepLabV3, HRNet, OCR) and backbones (i.e., ResNet, HRNet), our method brings performance improvements across diverse datasets (i.e., Cityscapes, PASCAL-Context, COCO-Stuff, CamVid). We expect this work will encourage our community to rethink the current de facto training paradigm in semantic segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Exploring_Cross-Image_Pixel_Contrast_for_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -13615,7 +14538,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Exploring_Cross-Image_Pixel_Contrast_for_Semantic_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Exploring_Cross-Image_Pixel_Contrast_for_Semantic_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Wenguan and Zhou,\n Tianfei and Yu,\n Fisher and Dai,\n Jifeng and Konukoglu,\n Ender and Van Gool,\n Luc\n},\n title = {\n Exploring Cross-Image Pixel Contrast for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7303-7313\n} \n}" }, { "title": "Exploring Geometry-Aware Contrast and Clustering Harmonization for Self-Supervised 3D Object Detection", @@ -13623,10 +14547,11 @@ "status": "Poster", "track": "main", "pid": 9070, + "author_site": "Hanxue Liang; Chenhan Jiang; Dapeng Feng; Xin Chen; Hang Xu; Xiaodan Liang; Wei Zhang; Zhenguo Li; Luc Van Gool", "author": "Hanxue Liang; 
Chenhan Jiang; Dapeng Feng; Xin Chen; Hang Xu; Xiaodan Liang; Wei Zhang; Zhenguo Li; Luc Van Gool", "abstract": "Current 3D object detection paradigms highly rely on extensive annotation efforts, which makes them not practical in many real-world industrial applications. Inspired by that a human driver can keep accumulating experiences from self-exploring the roads without any tutor's guidance, we first step forwards to explore a simple yet effective self-supervised learning framework tailored for LiDAR-based 3D object detection. Although the self-supervised pipeline has achieved great success in 2D domain, the characteristic challenges (e.g., complex geometry structure and various 3D object views) encountered in the 3D domain hinder the direct adoption of existing techniques that often contrast the 2D augmented data or cluster single-view features. Here we present a novel self-supervised 3D Object detection framework that seamlessly integrates the geometry-aware contrast and clustering harmonization to lift the unsupervised 3D representation learning, named GCC-3D. First, GCC-3D introduces a Geometric-Aware Contrastive objective to learn spatial-sensitive local structure representation. This objective enforces the spatially-closed voxels to have high feature similarity. Second, a Pseudo-Instance Clustering harmonization mechanism is proposed to encourage that different views of pseudo-instances should have consistent similarities to clustering prototype centers. This module endows our model semantic discriminative capacity. Extensive experiments demonstrate our GCC-3D achieves significant performance improvement on data-efficient 3D object detection benchmarks (nuScenes and Waymo). 
Moreover, our GCC-3D framework can achieve state-of-the art performances on all popular 3D object detection benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Exploring_Geometry-Aware_Contrast_and_Clustering_Harmonization_for_Self-Supervised_3D_Object_ICCV_2021_paper.pdf", - "aff": "ETH Zurich; Huawei Noah\u2019s Ark Lab; Sun Yat-Sen University; The University of Hong Kong; Huawei Noah\u2019s Ark Lab; Sun Yat-Sen University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; ETH Zurich", + "aff": "ETH Zurich; Huawei Noah’s Ark Lab; Sun Yat-Sen University; The University of Hong Kong; Huawei Noah’s Ark Lab; Sun Yat-Sen University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; ETH Zurich", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Liang_Exploring_Geometry-Aware_Contrast_ICCV_2021_supplemental.pdf", @@ -13639,14 +14564,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liang_Exploring_Geometry-Aware_Contrast_and_Clustering_Harmonization_for_Self-Supervised_3D_Object_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;1;2;1;1;0", - "aff_unique_norm": "ETH Zurich;Huawei;Sun Yat-sen University;University of Hong Kong", - "aff_unique_dep": ";Noah\u2019s Ark Lab;;", + "aff_unique_norm": "ETH Zurich;Huawei;Sun Yat-Sen University;The University of Hong Kong", + "aff_unique_dep": ";Noah’s Ark Lab;;", "aff_unique_url": "https://www.ethz.ch;https://www.huawei.com;http://www.sysu.edu.cn/;https://www.hku.hk", "aff_unique_abbr": "ETHZ;Huawei;SYSU;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1;1;1;1;1;1;0", - "aff_country_unique": "Switzerland;China" + "aff_country_unique": "Switzerland;China", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Hanxue and Jiang,\n Chenhan and Feng,\n Dapeng and Chen,\n Xin and Xu,\n Hang and Liang,\n Xiaodan and Zhang,\n Wei and Li,\n Zhenguo 
and Van Gool,\n Luc\n},\n title = {\n Exploring Geometry-Aware Contrast and Clustering Harmonization for Self-Supervised 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3293-3302\n} \n}" }, { "title": "Exploring Inter-Channel Correlation for Diversity-Preserved Knowledge Distillation", @@ -13654,6 +14580,7 @@ "status": "Poster", "track": "main", "pid": 3170, + "author_site": "Li Liu; Qingle Huang; Sihao Lin; Hongwei Xie; Bing Wang; Xiaojun Chang; Xiaodan Liang", "author": "Li Liu; Qingle Huang; Sihao Lin; Hongwei Xie; Bing Wang; Xiaojun Chang; Xiaodan Liang", "abstract": "Knowledge Distillation has shown very promising ability in transferring learned representation from the larger model (teacher) to the smaller one (student). Despite many efforts, prior methods ignore the important role of retaining inter-channel correlation of features, leading to the lack of capturing intrinsic distribution of the feature space and sufficient diversity properties of features in the teacher network. To solve the issue, we propose the novel Inter-Channel Correlation for Knowledge Distillation (ICKD), with which the diversity and homology of the feature space of the student network can align with that of the teacher network. The correlation between these two channels is interpreted as diversity if they are irrelevant to each other, otherwise homology. Then the student is required to mimic the correlation within its own embedding space. In addition, we introduce the grid-level inter-channel correlation, making it capable of dense prediction tasks. Extensive experiments on two vision tasks, including ImageNet classification and Pascal VOC segmentation, demonstrate the superiority of our ICKD, which consistently outperforms many existing methods, advancing the state-of-the-art in the fields of Knowledge Distillation. 
To our knowledge, we are the first method based on knowledge distillation boosts ResNet18 beyond 72% Top-1 accuracy on ImageNet classification. Code is available at: https://github.com/ADLab-AutoDrive/ICKD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Exploring_Inter-Channel_Correlation_for_Diversity-Preserved_Knowledge_Distillation_ICCV_2021_paper.pdf", @@ -13677,7 +14604,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+0;0;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Li and Huang,\n Qingle and Lin,\n Sihao and Xie,\n Hongwei and Wang,\n Bing and Chang,\n Xiaojun and Liang,\n Xiaodan\n},\n title = {\n Exploring Inter-Channel Correlation for Diversity-Preserved Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8271-8280\n} \n}" }, { "title": "Exploring Long Tail Visual Relationship Recognition With Large Vocabulary", @@ -13685,6 +14613,7 @@ "status": "Poster", "track": "main", "pid": 10473, + "author_site": "Sherif Abdelkarim; Aniket Agarwal; Panos Achlioptas; Jun Chen; Jiaji Huang; Boyang Li; Kenneth Church; Mohamed Elhoseiny", "author": "Sherif Abdelkarim; Aniket Agarwal; Panos Achlioptas; Jun Chen; Jiaji Huang; Boyang Li; Kenneth Church; Mohamed Elhoseiny", "abstract": "Several approaches have been proposed in recent literature to alleviate the long-tail problem, mainly in object classification tasks. In this paper, we make the first large-scale study concerning the task of Long-Tail Visual Relationship Recognition (LTVRR). LTVRR aims at improving the learning of structured visual relationships that come from the long-tail (e.g.,\"rabbit grazing on grass\"). 
In this setup, the subject, relation, and object classes each follow a long-tail distribution. To begin our study and make a future benchmark for the community, we introduce two LTVRR-related benchmarks, dubbed VG8K-LT and GQA-LT, built upon the widely used Visual Genome and GQA datasets. We use these benchmarks to study the performance of several state-of-the-art long-tail models on the LTVRR setup. Lastly, we propose a visiolinguistic hubless (VilHub) loss and a Mixup augmentation technique adapted to LTVRR setup, dubbed as RelMix. Both VilHub and RelMix can be easily integrated on top of existing models and despite being simple, our results show that they can remarkably improve the performance, especially on tail classes. Benchmarks, code, and models have been made available at: https://github.com/Vision-CAIR/LTVRR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Abdelkarim_Exploring_Long_Tail_Visual_Relationship_Recognition_With_Large_Vocabulary_ICCV_2021_paper.pdf", @@ -13701,14 +14630,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Abdelkarim_Exploring_Long_Tail_Visual_Relationship_Recognition_With_Large_Vocabulary_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+2;1;0;3;4;3;0", - "aff_unique_norm": "King Abdullah University of Science and Technology;Stanford University;Indian Institute of Technology Roorkee;Baidu;Nanyang Technological University", - "aff_unique_dep": ";;;Baidu, Inc.;", + "aff_unique_norm": "King Abdullah University of Science and Technology;Stanford University;Indian Institute of Technology Roorkee;Baidu, Inc.;Nanyang Technological University", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.kaust.edu.sa;https://www.stanford.edu;https://www.iitr.ac.in;https://www.baidu.com;https://www.ntu.edu.sg", "aff_unique_abbr": "KAUST;Stanford;IITR;Baidu;NTU", "aff_campus_unique_index": "1;;1;2", "aff_campus_unique": ";Stanford;Singapore", "aff_country_unique_index": "0+1;0+2;1;0;3;4;3;0", - 
"aff_country_unique": "Saudi Arabia;United States;India;China;Singapore" + "aff_country_unique": "Saudi Arabia;United States;India;China;Singapore", + "bibtex": "@InProceedings{Abdelkarim_2021_ICCV,\n \n author = {\n Abdelkarim,\n Sherif and Agarwal,\n Aniket and Achlioptas,\n Panos and Chen,\n Jun and Huang,\n Jiaji and Li,\n Boyang and Church,\n Kenneth and Elhoseiny,\n Mohamed\n},\n title = {\n Exploring Long Tail Visual Relationship Recognition With Large Vocabulary\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15921-15930\n} \n}" }, { "title": "Exploring Relational Context for Multi-Task Dense Prediction", @@ -13716,7 +14646,8 @@ "status": "Poster", "track": "main", "pid": 3855, - "author": "David Br\u00fcggemann; Menelaos Kanakis; Anton Obukhov; Stamatios Georgoulis; Luc Van Gool", + "author_site": "David Brüggemann; Menelaos Kanakis; Anton Obukhov; Stamatios Georgoulis; Luc Van Gool", + "author": "David Brüggemann; Menelaos Kanakis; Anton Obukhov; Stamatios Georgoulis; Luc Van Gool", "abstract": "The timeline of computer vision research is marked with advances in learning and utilizing efficient contextual representations. Most of them, however, are targeted at improving model performance on a single downstream task. We consider a multi-task environment for dense prediction tasks, represented by a common backbone and independent task-specific heads. Our goal is to find the most efficient way to refine each task prediction by capturing cross-task contexts dependent on tasks' relations. We explore various attention-based contexts, such as global and local, in the multi-task setting and analyze their behavior when applied to refine each task independently. Empirical findings confirm that different source-target task pairs benefit from different context types. 
To automate the selection process, we propose an Adaptive Task-Relational Context (ATRC) module, which samples the pool of all available contexts for each task pair using neural architecture search and outputs the optimal configuration for deployment. Our method achieves state-of-the-art performance on two important multi-task benchmarks, namely NYUD-v2 and PASCAL-Context. The proposed ATRC has a low computational toll and can be used as a drop-in refinement module for any supervised multi-task architecture.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bruggemann_Exploring_Relational_Context_for_Multi-Task_Dense_Prediction_ICCV_2021_paper.pdf", "aff": "ETH Zurich; ETH Zurich; ETH Zurich; ETH Zurich; ETH Zurich", @@ -13739,7 +14670,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Bruggemann_2021_ICCV,\n \n author = {\n Br\\"uggemann,\n David and Kanakis,\n Menelaos and Obukhov,\n Anton and Georgoulis,\n Stamatios and Van Gool,\n Luc\n},\n title = {\n Exploring Relational Context for Multi-Task Dense Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15869-15878\n} \n}" }, { "title": "Exploring Robustness of Unsupervised Domain Adaptation in Semantic Segmentation", @@ -13747,6 +14679,7 @@ "status": "Poster", "track": "main", "pid": 7185, + "author_site": "Jinyu Yang; Chunyuan Li; Weizhi An; Hehuan Ma; Yuzhi Guo; Yu Rong; Peilin Zhao; Junzhou Huang", "author": "Jinyu Yang; Chunyuan Li; Weizhi An; Hehuan Ma; Yuzhi Guo; Yu Rong; Peilin Zhao; Junzhou Huang", "abstract": "Recent studies imply that deep neural networks are vulnerable to adversarial examples, i.e., inputs with a slight but intentional perturbation are incorrectly classified by the network. 
Such vulnerability makes it risky for some security-related applications (e.g., semantic segmentation in autonomous cars) and triggers tremendous concerns on the model reliability. For the first time, we comprehensively evaluate the robustness of existing UDA methods and propose a robust UDA approach. It is rooted in two observations: i) the robustness of UDA methods in semantic segmentation remains unexplored, which poses a security concern in this field; and ii) although commonly used self-supervision (e.g., rotation and jigsaw) benefits model robustness in classification and recognition tasks, they fail to provide the critical supervision signals that are essential in semantic segmentation. These observations motivate us to propose adversarial self-supervision UDA (or ASSUDA) that maximizes the agreement between clean images and their adversarial examples by a contrastive loss in the output space. Extensive empirical studies on commonly used benchmarks demonstrate that ASSUDA is resistant to adversarial attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Exploring_Robustness_of_Unsupervised_Domain_Adaptation_in_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -13770,7 +14703,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Arlington;", "aff_country_unique_index": "0;0;0;0;0;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Jinyu and Li,\n Chunyuan and An,\n Weizhi and Ma,\n Hehuan and Guo,\n Yuzhi and Rong,\n Yu and Zhao,\n Peilin and Huang,\n Junzhou\n},\n title = {\n Exploring Robustness of Unsupervised Domain Adaptation in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9194-9203\n} \n}" }, { "title": "Exploring Simple 3D Multi-Object Tracking for Autonomous 
Driving", @@ -13778,6 +14712,7 @@ "status": "Poster", "track": "main", "pid": 2499, + "author_site": "Chenxu Luo; Xiaodong Yang; Alan Yuille", "author": "Chenxu Luo; Xiaodong Yang; Alan Yuille", "abstract": "3D multi-object tracking in LiDAR point clouds is a key ingredient for self-driving vehicles. Existing methods are predominantly based on the tracking-by-detection pipeline and inevitably require a heuristic matching step for the detection association. In this paper, we present SimTrack to simplify the hand-crafted tracking paradigm by proposing an end-to-end trainable model for joint detection and tracking from raw point clouds. Our key design is to predict the first-appear location of each object in a given snippet to get the tracking identity and then update the location based on motion estimation. In the inference, the heuristic matching step can be completely waived by a simple read-off operation. SimTrack integrates the tracked object association, newborn object detection, and dead track killing in a single unified model. We conduct extensive evaluations on two large-scale datasets: nuScenes and Waymo Open Dataset. 
Experimental results reveal that our simple approach compares favorably with the state-of-the-art methods while ruling out the heuristic matching rules.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Luo_Exploring_Simple_3D_Multi-Object_Tracking_for_Autonomous_Driving_ICCV_2021_paper.pdf", @@ -13801,7 +14736,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", - "aff_country_unique": ";United States" + "aff_country_unique": ";United States", + "bibtex": "@InProceedings{Luo_2021_ICCV,\n \n author = {\n Luo,\n Chenxu and Yang,\n Xiaodong and Yuille,\n Alan\n},\n title = {\n Exploring Simple 3D Multi-Object Tracking for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10488-10497\n} \n}" }, { "title": "Exploring Temporal Coherence for More General Video Face Forgery Detection", @@ -13809,6 +14745,7 @@ "status": "Poster", "track": "main", "pid": 10188, + "author_site": "Yinglin Zheng; Jianmin Bao; Dong Chen; Ming Zeng; Fang Wen", "author": "Yinglin Zheng; Jianmin Bao; Dong Chen; Ming Zeng; Fang Wen", "abstract": "Although current face manipulation techniques achieve impressive performance regarding quality and controllability, they are struggling to generate temporal coherent face videos. In this work, we explore to take full advantage of the temporal coherence for video face forgery detection. To achieve this, we propose a novel end-to-end framework, which consists of two major stages. The first stage is a fully temporal convolution network (FTCN). The key insight of FTCN is to reduce the spatial convolution kernel size to 1, while maintaining the temporal convolution kernel size unchanged. We surprisingly find this special design can benefit the model for extracting the temporal features as well as improve the generalization capability. 
The second stage is a Temporal Transformer network, which aims to explore the long-term temporal coherence. The proposed framework is general and flexible, which can be directly trained from scratch without any pre-training models or external datasets. Extensive experiments show that our framework outperforms existing methods and remains effective when applied to detect new sorts of face forgery videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Exploring_Temporal_Coherence_for_More_General_Video_Face_Forgery_Detection_ICCV_2021_paper.pdf", @@ -13825,14 +14762,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Exploring_Temporal_Coherence_for_More_General_Video_Face_Forgery_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;1", - "aff_unique_norm": "Xiamen University;Microsoft", + "aff_unique_norm": "Xiamen University;Microsoft Research", "aff_unique_dep": "School of Informatics;Research", "aff_unique_url": "https://www.xmu.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "XMU;MSR Asia", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Yinglin and Bao,\n Jianmin and Chen,\n Dong and Zeng,\n Ming and Wen,\n Fang\n},\n title = {\n Exploring Temporal Coherence for More General Video Face Forgery Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15044-15054\n} \n}" }, { "title": "Exploring Visual Engagement Signals for Representation Learning", @@ -13840,6 +14778,7 @@ "status": "Poster", "track": "main", "pid": 5398, + "author_site": "Menglin Jia; Zuxuan Wu; Austin Reiter; Claire Cardie; Serge Belongie; Ser-Nam Lim", "author": "Menglin Jia; Zuxuan 
Wu; Austin Reiter; Claire Cardie; Serge Belongie; Ser-Nam Lim", "abstract": "Visual engagement in social media platforms comprises interactions with photo posts including comments, shares, and likes. In this paper, we leverage such visual engagement clues as supervisory signals for representation learning. However, learning from engagement signals is non-trivial as it is not clear how to bridge the gap between low-level visual information and high-level social interaction. We present VisE,, a weakly supervised learning approach, which maps social images to pseudo labels derived by clustered engagement signals. We then study how models trained in this way benefit subjective downstream computer vision tasks such as emotion recognition or political bias detection. Through extensive studies, we empirically demonstrate the effectiveness of VisE across a diverse set of classification tasks beyond the scope of conventional recognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jia_Exploring_Visual_Engagement_Signals_for_Representation_Learning_ICCV_2021_paper.pdf", @@ -13854,7 +14793,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jia_Exploring_Visual_Engagement_Signals_for_Representation_Learning_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jia_Exploring_Visual_Engagement_Signals_for_Representation_Learning_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Jia_2021_ICCV,\n \n author = {\n Jia,\n Menglin and Wu,\n Zuxuan and Reiter,\n Austin and Cardie,\n Claire and Belongie,\n Serge and Lim,\n Ser-Nam\n},\n title = {\n Exploring Visual Engagement Signals for Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4206-4217\n} \n}" }, { "title": "Extending Neural P-Frame Codecs for B-Frame Coding", @@ 
-13862,6 +14802,7 @@ "status": "Poster", "track": "main", "pid": 2201, + "author_site": "Reza Pourreza; Taco Cohen", "author": "Reza Pourreza; Taco Cohen", "abstract": "While most neural video codecs address P-frame coding (predicting each frame from past ones), in this paper we address B-frame compression (predicting frames using both past and future reference frames). Our B-frame solution is based on the existing P-frame methods. As a result, B-frame coding capability can easily be added to an existing neural codec. The basic idea of our B-frame coding method is to interpolate the two reference frames to generate a single reference frame and then use it together with an existing P-frame codec to encode the input B-frame. Our studies show that the interpolated frame is a much better reference for the P-frame codec compared to using the previous frame as is usually done. Our results show that using the proposed method with an existing P-frame codec can lead to 28.5% saving in bit-rate on the UVG dataset compared to the P-frame codec while generating the same video quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pourreza_Extending_Neural_P-Frame_Codecs_for_B-Frame_Coding_ICCV_2021_paper.pdf", @@ -13876,7 +14817,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Pourreza_Extending_Neural_P-Frame_Codecs_for_B-Frame_Coding_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Pourreza_Extending_Neural_P-Frame_Codecs_for_B-Frame_Coding_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Pourreza_2021_ICCV,\n \n author = {\n Pourreza,\n Reza and Cohen,\n Taco\n},\n title = {\n Extending Neural P-Frame Codecs for B-Frame Coding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6680-6689\n} \n}" }, { "title": "Extensions of Karger's 
Algorithm: Why They Fail in Theory and How They Are Useful in Practice", @@ -13884,7 +14826,8 @@ "status": "Poster", "track": "main", "pid": 6984, - "author": "Erik Jenner; Enrique Fita Sanmart\u00edn; Fred A. Hamprecht", + "author_site": "Erik Jenner; Enrique Fita Sanmartín; Fred A. Hamprecht", + "author": "Erik Jenner; Enrique Fita Sanmartín; Fred A. Hamprecht", "abstract": "The minimum graph cut and minimum s-t-cut problems are important primitives in the modeling of combinatorial problems in computer science, including in computer vision and machine learning. Some of the most efficient algorithms for finding global minimum cuts are randomized algorithms based on Karger's groundbreaking contraction algorithm. Here, we study whether Karger's algorithm can be successfully generalized to other cut problems. We first prove that a wide class of natural generalizations of Karger's algorithm cannot efficiently solve the s-t-mincut or the normalized cut problem to optimality. However, we then present a simple new algorithm for seeded segmentation / graph-based semi-supervised learning that is closely based on Karger's original algorithm, showing that for these problems, extensions of Karger's algorithm can be useful. The new algorithm has linear asymptotic runtime and yields a potential that can be interpreted as the posterior probability of a sample belonging to a given seed / class. We clarify its relation to the random walker algorithm / harmonic energy minimization in terms of distributions over spanning forests. 
On classical problems from seeded image segmentation and graph-based semi-supervised learning on image data, the method performs at least as well as the random walker / harmonic energy minimization / Gaussian processes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jenner_Extensions_of_Kargers_Algorithm_Why_They_Fail_in_Theory_and_ICCV_2021_paper.pdf", "aff": "Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany; Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany; Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany", @@ -13907,7 +14850,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Jenner_2021_ICCV,\n \n author = {\n Jenner,\n Erik and Sanmart{\\'\\i}n,\n Enrique Fita and Hamprecht,\n Fred A.\n},\n title = {\n Extensions of Karger's Algorithm: Why They Fail in Theory and How They Are Useful in Practice\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4602-4611\n} \n}" }, { "title": "Extreme Structure From Motion for Indoor Panoramas Without Visual Overlaps", @@ -13915,6 +14859,7 @@ "status": "Poster", "track": "main", "pid": 2088, + "author_site": "Mohammad Amin Shabani; Weilian Song; Makoto Odamaki; Hirochika Fujiki; Yasutaka Furukawa", "author": "Mohammad Amin Shabani; Weilian Song; Makoto Odamaki; Hirochika Fujiki; Yasutaka Furukawa", "abstract": "This paper proposes an extreme structure from motion (SfM) algorithm for residential indoor panoramas that have little to no visual overlaps. Only a single panorama is present in a room for many cases, making the task infeasible for existing SfM algorithms.
Our idea is to learn to evaluate the realism of room/door/window arrangements in the top-down semantic space. After using heuristics to enumerate possible arrangements based on door detections, we evaluate their realism scores, pick the most realistic arrangement, and return the corresponding camera poses. We evaluate the proposed approach on a dataset of 1029 panorama images with 286 houses. Our qualitative and quantitative evaluations show that an existing SfM approach completely fails for most of the houses. The proposed approach achieves the mean positional error of less than 1.0 meter for 47% of the houses and even 78% when considering the top five reconstructions. We will share the code and data in https://github.com/aminshabani/extreme-indoor-sfm.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shabani_Extreme_Structure_From_Motion_for_Indoor_Panoramas_Without_Visual_Overlaps_ICCV_2021_paper.pdf", @@ -13938,7 +14883,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "Canada;Japan" + "aff_country_unique": "Canada;Japan", + "bibtex": "@InProceedings{Shabani_2021_ICCV,\n \n author = {\n Shabani,\n Mohammad Amin and Song,\n Weilian and Odamaki,\n Makoto and Fujiki,\n Hirochika and Furukawa,\n Yasutaka\n},\n title = {\n Extreme Structure From Motion for Indoor Panoramas Without Visual Overlaps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5703-5711\n} \n}" }, { "title": "Extreme-Quality Computational Imaging via Degradation Framework", @@ -13946,6 +14892,7 @@ "status": "Poster", "track": "main", "pid": 8936, + "author_site": "Shiqi Chen; Huajun Feng; Keming Gao; Zhihai Xu; Yueting Chen", "author": "Shiqi Chen; Huajun Feng; Keming Gao; Zhihai Xu; Yueting Chen", "abstract": "To meet the space limitation of optical elements, free-form surfaces or high-order 
aspherical lenses are adopted in mobile cameras to compress volume. However, the application of free-form surfaces also introduces the problem of image quality mutation. Existing model-based deconvolution methods are inefficient in dealing with the degradation that shows a wide range of spatial variants over regions. And the deep learning techniques in low-level and physics-based vision suffer from a lack of accurate data. To address this issue, we develop a degradation framework to estimate the spatially variant point spread functions (PSFs) of mobile cameras. When input extreme-quality digital images, the proposed framework generates degraded images sharing a common domain with real-world photographs. Supplied with the synthetic image pairs, we design a Field-Of-View shared kernel prediction network (FOV-KPN) to perform spatial-adaptive reconstruction on real degraded photos. Extensive experiments demonstrate that the proposed approach achieves extreme-quality computational imaging and outperforms the state-of-the-art methods. 
Furthermore, we illustrate that our technique can be integrated into existing postprocessing systems, resulting in significantly improved visual quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Extreme-Quality_Computational_Imaging_via_Degradation_Framework_ICCV_2021_paper.pdf", @@ -13960,7 +14907,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Extreme-Quality_Computational_Imaging_via_Degradation_Framework_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Extreme-Quality_Computational_Imaging_via_Degradation_Framework_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Shiqi and Feng,\n Huajun and Gao,\n Keming and Xu,\n Zhihai and Chen,\n Yueting\n},\n title = {\n Extreme-Quality Computational Imaging via Degradation Framework\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2632-2641\n} \n}" }, { "title": "F-Drop&Match: GANs With a Dead Zone in the High-Frequency Domain", @@ -13968,6 +14916,7 @@ "status": "Poster", "track": "main", "pid": 9664, + "author_site": "Shin'ya Yamaguchi; Sekitoshi Kanai", "author": "Shin'ya Yamaguchi; Sekitoshi Kanai", "abstract": "Generative adversarial networks built from deep convolutional neural networks (GANs) lack the ability to exactly replicate the high-frequency components of natural images. To alleviate this issue, we introduce two novel training techniques called frequency dropping (F-Drop) and frequency matching (F-Match). The key idea of F-Drop is to filter out unnecessary high-frequency components from the input images of the discriminators. This simple modification prevents the discriminators from being confused by perturbations of the high-frequency components. 
In addition, F-Drop makes the GANs focus on fitting in the low-frequency domain, in which there are the dominant components of natural images. F-Match minimizes the difference between real and fake images in the frequency domain for generating more realistic images. F-Match is implemented as a regularization term in the objective functions of the generators; it penalizes the batch mean error in the frequency domain. F-Match helps the generators to fit in the high-frequency domain filtered out by F-Drop to the real image. We experimentally demonstrate that the combination of F-Drop and F-Match improves the generative performance of GANs in both the frequency and spatial domain on multiple image benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yamaguchi_F-DropMatch_GANs_With_a_Dead_Zone_in_the_High-Frequency_Domain_ICCV_2021_paper.pdf", @@ -13991,7 +14940,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Yamaguchi_2021_ICCV,\n \n author = {\n Yamaguchi,\n Shin'ya and Kanai,\n Sekitoshi\n},\n title = {\n F-Drop\\&Match: GANs With a Dead Zone in the High-Frequency Domain\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6743-6751\n} \n}" }, { "title": "FACIAL: Synthesizing Dynamic Talking Face With Implicit Attribute Learning", @@ -13999,6 +14949,7 @@ "status": "Poster", "track": "main", "pid": 2947, + "author_site": "Chenxu Zhang; Yifan Zhao; Yifei Huang; Ming Zeng; Saifeng Ni; Madhukar Budagavi; Xiaohu Guo", "author": "Chenxu Zhang; Yifan Zhao; Yifei Huang; Ming Zeng; Saifeng Ni; Madhukar Budagavi; Xiaohu Guo", "abstract": "In this paper, we propose a talking face generation method that takes an audio signal as input and a short target video clip as reference, and synthesizes a 
photo-realistic video of the target face with natural lip motions, head poses, and eye blinks that are in-sync with the input audio signal. We note that the synthetic face attributes include not only explicit ones such as lip motions that have high correlations with speech, but also implicit ones such as head poses and eye blinks that have only weak correlation with the input audio. To model such complicated relationships among different face attributes with input audio, we propose a FACe Implicit Attribute Learning Generative Adversarial Network (FACIAL-GAN), which integrates the phonetics-aware, context-aware, and identity-aware information to synthesize the 3D face animation with realistic motions of lips, head poses, and eye blinks. Then, our Rendering-to-Video network takes the rendered face images and the attention map of eye blinks as input to generate the photo-realistic output video frames. Experimental results and user studies show our method can generate realistic talking face videos with not only synchronized lip motions, but also natural head movements and eye blinks, with better qualities than the results of state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_FACIAL_Synthesizing_Dynamic_Talking_Face_With_Implicit_Attribute_Learning_ICCV_2021_paper.pdf", @@ -14015,14 +14966,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_FACIAL_Synthesizing_Dynamic_Talking_Face_With_Implicit_Attribute_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4;4;0", - "aff_unique_norm": "University of Texas at Dallas;Beihang University;East China Normal University;Xiamen University;Samsung", - "aff_unique_dep": ";;;;Samsung Research America", + "aff_unique_norm": "University of Texas at Dallas;Beihang University;East China Normal University;Xiamen University;Samsung Research America", + "aff_unique_dep": ";;;;", "aff_unique_url": 
"https://www.utdallas.edu;http://www.buaa.edu.cn/;http://www.ecnu.edu.cn;https://www.xmu.edu.cn;https://www.samsung.com/us/careers/research/", "aff_unique_abbr": "UT Dallas;BUAA;ECNU;XMU;SRA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Dallas;", "aff_country_unique_index": "0;1;1;1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Chenxu and Zhao,\n Yifan and Huang,\n Yifei and Zeng,\n Ming and Ni,\n Saifeng and Budagavi,\n Madhukar and Guo,\n Xiaohu\n},\n title = {\n FACIAL: Synthesizing Dynamic Talking Face With Implicit Attribute Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3867-3876\n} \n}" }, { "title": "FASA: Feature Augmentation and Sampling Adaptation for Long-Tailed Instance Segmentation", @@ -14030,6 +14982,7 @@ "status": "Poster", "track": "main", "pid": 3794, + "author_site": "Yuhang Zang; Chen Huang; Chen Change Loy", "author": "Yuhang Zang; Chen Huang; Chen Change Loy", "abstract": "Recent methods for long-tailed instance segmentation still struggle on rare object classes with few training data. We propose a simple yet effective method, Feature Augmentation and Sampling Adaptation (FASA), that addresses the data scarcity issue by augmenting the feature space especially for rare classes. Both the Feature Augmentation (FA) and feature sampling components are adaptive to the actual training status -- FA is informed by the feature mean and variance of observed real samples from past iterations, and we sample the generated virtual features in a loss-adapted manner to avoid over-fitting. FASA does not require any elaborate loss design, and removes the need for inter-class transfer learning that often involves large cost and manually-defined head/tail class groups. 
We show FASA is a fast, generic method that can be easily plugged into standard or long-tailed segmentation frameworks, with consistent performance gains and little added cost. FASA is also applicable to other tasks like long-tailed classification with state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zang_FASA_Feature_Augmentation_and_Sampling_Adaptation_for_Long-Tailed_Instance_Segmentation_ICCV_2021_paper.pdf", @@ -14053,7 +15006,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Zang_2021_ICCV,\n \n author = {\n Zang,\n Yuhang and Huang,\n Chen and Loy,\n Chen Change\n},\n title = {\n FASA: Feature Augmentation and Sampling Adaptation for Long-Tailed Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3457-3466\n} \n}" }, { "title": "FATNN: Fast and Accurate Ternary Neural Networks", @@ -14061,6 +15015,7 @@ "status": "Poster", "track": "main", "pid": 4085, + "author_site": "Peng Chen; Bohan Zhuang; Chunhua Shen", "author": "Peng Chen; Bohan Zhuang; Chunhua Shen", "abstract": "Ternary Neural Networks (TNNs) have received much attention due to being potentially orders of magnitude faster in inference, as well as more power efficient, than full-precision counterparts. However, 2 bits are required to encode the ternary representation with only 3 quantization levels leveraged. As a result, conventional TNNs have similar memory consumption and speed compared with the standard 2-bit models, but have worse representational capability. Moreover, there is still a significant gap in accuracy between TNNs and full-precision networks, hampering their deployment to real applications. 
To tackle these two challenges, in this work, we first show that, under some mild constraints, computational complexity of the ternary inner product can be reduced by 2x. Second, to mitigate the performance gap, we elaborately design an implementation-dependent ternary quantization algorithm. The proposed framework is termed Fast and Accurate Ternary Neural Networks (FATNN). Experiments on image classification demonstrate that our FATNN surpasses the state-of-the-arts by a significant margin in accuracy. More importantly, speedup evaluation compared with various precisions is analyzed on several platforms, which serves as a strong benchmark for further research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_FATNN_Fast_and_Accurate_Ternary_Neural_Networks_ICCV_2021_paper.pdf", @@ -14084,7 +15039,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Peng and Zhuang,\n Bohan and Shen,\n Chunhua\n},\n title = {\n FATNN: Fast and Accurate Ternary Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5219-5228\n} \n}" }, { "title": "FFT-OT: A Fast Algorithm for Optimal Transportation", @@ -14092,6 +15048,7 @@ "status": "Poster", "track": "main", "pid": 7188, + "author_site": "Na Lei; Xianfeng Gu", "author": "Na Lei; Xianfeng Gu", "abstract": "An optimal transportation map finds the most economical way to transport one probability measure to the other. It has been applied in a broad range of applications in vision, deep learning and medical images. By Brenier theory, computing the optimal transport map is equivalent to solving a Monge-Ampere equation. 
Due to the highly non-linear nature, the computation of optimal transportation maps in large scale is very challenging. This work proposes a simple but powerful method, the FFT-OT algorithm, to tackle this difficulty based on three key ideas. First, solving Monge-Ampere equation is converted to a fixed point problem; Second, the obliqueness property of optimal transportation maps are reformulated as Neumann boundary conditions on rectangular domains; Third, FFT is applied in each iteration to solve a Poisson equation in order to improve the efficiency. Experiments on surfaces captured from 3D scanning and reconstructed from medical imaging are conducted, and compared with other existing methods. Our experimental results show that the proposed FFT-OT algorithm is simple, general and scalable with high efficiency and accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lei_FFT-OT_A_Fast_Algorithm_for_Optimal_Transportation_ICCV_2021_paper.pdf", @@ -14115,7 +15072,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Lei_2021_ICCV,\n \n author = {\n Lei,\n Na and Gu,\n Xianfeng\n},\n title = {\n FFT-OT: A Fast Algorithm for Optimal Transportation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6280-6289\n} \n}" }, { "title": "FIERY: Future Instance Prediction in Bird's-Eye View From Surround Monocular Cameras", @@ -14123,7 +15081,8 @@ "status": "Poster", "track": "main", "pid": 1835, - "author": "Anthony Hu; Zak Murez; Nikhil Mohan; Sof\u00eda Dudas; Jeffrey Hawke; Vijay Badrinarayanan; Roberto Cipolla; Alex Kendall", + "author_site": "Anthony Hu; Zak Murez; Nikhil Mohan; Sofía Dudas; Jeffrey Hawke; Vijay Badrinarayanan; Roberto Cipolla; Alex Kendall", + "author": 
"Anthony Hu; Zak Murez; Nikhil Mohan; Sofía Dudas; Jeffrey Hawke; Vijay Badrinarayanan; Roberto Cipolla; Alex Kendall", "abstract": "Driving requires interacting with road agents and predicting their future behaviour in order to navigate safely. We present FIERY: a probabilistic future prediction model in bird's-eye view from monocular cameras. Our model predicts future instance segmentation and motion of dynamic agents that can be transformed into non-parametric future trajectories. Our approach combines the perception, sensor fusion and prediction components of a traditional autonomous driving stack by estimating bird's-eye-view prediction directly from surround RGB monocular camera inputs. FIERY learns to model the inherent stochastic nature of the future solely from camera driving data in an end-to-end manner, without relying on HD maps, and predicts multimodal future trajectories. We show that our model outperforms previous prediction baselines on the NuScenes and Lyft datasets. The code and trained models are available at https://github.com/wayveai/fiery.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_FIERY_Future_Instance_Prediction_in_Birds-Eye_View_From_Surround_Monocular_ICCV_2021_paper.pdf", "aff": "Wayve, UK + University of Cambridge, UK; Wayve, UK; Wayve, UK; Wayve, UK; Wayve, UK; Wayve, UK; University of Cambridge, UK; Wayve, UK", @@ -14146,7 +15105,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Anthony and Murez,\n Zak and Mohan,\n Nikhil and Dudas,\n Sof{\\'\\i\n}a and Hawke,\n Jeffrey and Badrinarayanan,\n Vijay and Cipolla,\n Roberto and Kendall,\n Alex\n},\n title = {\n FIERY: Future Instance Prediction in Bird's-Eye View From Surround Monocular Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15273-15282\n} \n}" }, { "title": "FLAR: A Unified Prototype Framework for Few-Sample Lifelong Active Recognition", @@ -14154,6 +15114,7 @@ "status": "Poster", "track": "main", "pid": 6632, + "author_site": "Lei Fan; Peixi Xiong; Wei Wei; Ying Wu", "author": "Lei Fan; Peixi Xiong; Wei Wei; Ying Wu", "abstract": "Intelligent agents with visual sensors are allowed to actively explore their observations for better recognition performance. This task is referred to as Active Recognition (AR). Currently, most methods toward AR are implemented under a fixed-category setting, which constrains their applicability in realistic scenarios that need to incrementally learn new classes without retraining from scratch. Further, collecting massive data for novel categories is expensive. To address this demand, in this paper, we propose a unified framework towards Few-sample Lifelong Active Recognition (FLAR), which aims at performing active recognition on progressively arising novel categories that only have few training samples. Three difficulties emerge with FLAR: the lifelong recognition policy learning, the knowledge preservation of old categories, and the lack of training samples. To this end, our approach integrates prototypes, a robust representation for limited training samples, into a reinforcement learning solution, which motivates the agent to move towards views resulting in more discriminative features. Catastrophic forgetting during lifelong learning is then alleviated with knowledge distillation. 
Extensive experiments across two datasets, respectively for object and scene recognition, demonstrate that even without large training samples, the proposed approach could learn to actively recognize novel categories in a class-incremental behavior.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_FLAR_A_Unified_Prototype_Framework_for_Few-Sample_Lifelong_Active_Recognition_ICCV_2021_paper.pdf", @@ -14177,7 +15138,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Evanston", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Lei and Xiong,\n Peixi and Wei,\n Wei and Wu,\n Ying\n},\n title = {\n FLAR: A Unified Prototype Framework for Few-Sample Lifelong Active Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15394-15403\n} \n}" }, { "title": "FMODetect: Robust Detection of Fast Moving Objects", @@ -14185,7 +15147,8 @@ "status": "Poster", "track": "main", "pid": 8895, - "author": "Denys Rozumnyi; Ji\u0159\u00ed Matas; Filip \u0160roubek; Marc Pollefeys; Martin R. Oswald", + "author_site": "Denys Rozumnyi; Jiří Matas; Filip Šroubek; Marc Pollefeys; Martin R. Oswald", + "author": "Denys Rozumnyi; Jiří Matas; Filip Šroubek; Marc Pollefeys; Martin R. Oswald", "abstract": "We propose the first learning-based approach for fast moving objects detection. Such objects are highly blurred and move over large distances within one video frame. Fast moving objects are associated with a deblurring and matting problem, also called deblatting. We show that the separation of deblatting into consecutive matting and deblurring allows achieving real-time performance, i.e. an order of magnitude speed-up, and thus enabling new classes of application. 
The proposed method detects fast moving objects as a truncated distance function to the trajectory by learning from synthetic data. For the sharp appearance estimation and accurate trajectory estimation, we propose a matting and fitting network that estimates the blurred appearance without background, followed by an energy minimization based deblurring. The state-of-the-art methods are outperformed in terms of recall, precision, trajectory estimation, and sharp appearance reconstruction. Compared to other methods, such as deblatting, the inference is of several orders of magnitude faster and allows applications such as real-time fast moving object detection and retrieval in large video collections.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rozumnyi_FMODetect_Robust_Detection_of_Fast_Moving_Objects_ICCV_2021_paper.pdf", "aff": "Department of Computer Science, ETH Zurich + Microsoft Mixed Reality and AI Zurich Lab; Visual Recognition Group, Czech Technical University in Prague; UTIA, Czech Academy of Sciences; Department of Computer Science, ETH Zurich + Microsoft Mixed Reality and AI Zurich Lab; Visual Recognition Group, Czech Technical University in Prague", @@ -14208,7 +15171,8 @@ "aff_campus_unique_index": "1;2;1;2", "aff_campus_unique": ";Zurich;Prague", "aff_country_unique_index": "0+0;1;1;0+0;1", - "aff_country_unique": "Switzerland;Czech Republic" + "aff_country_unique": "Switzerland;Czech Republic", + "bibtex": "@InProceedings{Rozumnyi_2021_ICCV,\n \n author = {\n Rozumnyi,\n Denys and Matas,\n Ji\\v{r\n}{\\'\\i\n} and \\v{S\n}roubek,\n Filip and Pollefeys,\n Marc and Oswald,\n Martin R.\n},\n title = {\n FMODetect: Robust Detection of Fast Moving Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3541-3549\n} \n}" }, { "title": "FOVEA: Foveated Image Magnification for Autonomous Navigation", @@ -14216,6 +15180,7 
@@ "status": "Poster", "track": "main", "pid": 3521, + "author_site": "Chittesh Thavamani; Mengtian Li; Nicolas Cebron; Deva Ramanan", "author": "Chittesh Thavamani; Mengtian Li; Nicolas Cebron; Deva Ramanan", "abstract": "Efficient processing of high-resolution video streams is safety-critical for many robotics applications such as autonomous driving. Image downsampling is a commonly adopted technique to ensure the latency constraint is met. However, this naive approach greatly restricts an object detector's capability to identify small objects. In this paper, we propose an attentional approach that elastically magnifies certain regions while maintaining a small input canvas. The magnified regions are those that are believed to have a high probability of containing an object, whose signal can come from a dataset-wide prior or frame-level prior computed from recent object predictions. The magnification is implemented by a KDE-based mapping to transform the bounding boxes into warping parameters, which are then fed into an image sampler with anti-cropping regularization. The detector is then fed with the warped image and we apply a differentiable backward mapping to get bounding box outputs in the original space. Our regional magnification allows algorithms to make better use of high-resolution input without incurring the cost of high-resolution processing. On the autonomous driving datasets Argoverse-HD and BDD100K, we show our proposed method boosts the detection AP over standard Faster R-CNN, with and without finetuning. 
Additionally, building on top of the previous state-of-the-art in streaming detection, our method sets a new record for streaming AP on Argoverse-HD (from 17.8 to 23.0 on a GTX 1080 Ti GPU), suggesting that it has achieved a superior accuracy-latency tradeoff.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Thavamani_FOVEA_Foveated_Image_Magnification_for_Autonomous_Navigation_ICCV_2021_paper.pdf", @@ -14239,7 +15204,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Thavamani_2021_ICCV,\n \n author = {\n Thavamani,\n Chittesh and Li,\n Mengtian and Cebron,\n Nicolas and Ramanan,\n Deva\n},\n title = {\n FOVEA: Foveated Image Magnification for Autonomous Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15539-15548\n} \n}" }, { "title": "FREE: Feature Refinement for Generalized Zero-Shot Learning", @@ -14247,10 +15213,11 @@ "status": "Poster", "track": "main", "pid": 3223, + "author_site": "Shiming Chen; Wenjie Wang; Beihao Xia; Qinmu Peng; Xinge You; Feng Zheng; Ling Shao", "author": "Shiming Chen; Wenjie Wang; Beihao Xia; Qinmu Peng; Xinge You; Feng Zheng; Ling Shao", "abstract": "Generalized zero-shot learning (GZSL) has achieved significant progress, with many efforts dedicated to overcoming the problems of visual-semantic domain gaps and seen-unseen bias. However, most existing methods directly use feature extraction models trained on ImageNet alone, ignoring the cross-dataset bias between ImageNet and GZSL benchmarks. Such a bias inevitably results in poor-quality visual features for GZSL tasks, which potentially limits the recognition performance on both seen and unseen classes. 
In this paper, we propose a simple yet effective GZSL method, termed feature refinement for generalized zero-shot learning (FREE), to tackle the above problem. FREE employs a feature refinement (FR) module that incorporates semantic-visual mapping into a unified generative model to refine the visual features of seen and unseen class samples. Furthermore, we propose a self-adaptive margin center loss (SAMC-loss) that cooperates with a semantic cycle-consistency loss to guide FR to learn class- and semantically-relevant representations, and concatenate the features in FR to extract the fully refined features. Extensive experiments on five benchmark datasets demonstrate the significant performance gain of FREE over current state-of-the-art methods and its baseline. The code is available at https://github.com/shiming-chen/FREE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_FREE_Feature_Refinement_for_Generalized_Zero-Shot_Learning_ICCV_2021_paper.pdf", - "aff": "Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Southern University of Science and Technology (SUSTech), China; Inception Institute of Arti\ufb01cial Intelligence (IIAI), UAE", + "aff": "Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Huazhong University of Science and Technology (HUST), China; Southern University of Science and Technology (SUSTech), China; Inception Institute of Artificial Intelligence (IIAI), UAE", "project": "", "github": "https://github.com/shiming-chen/FREE", "supp": "", @@ -14270,7 +15237,8 @@ "aff_campus_unique_index": 
"", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Shiming and Wang,\n Wenjie and Xia,\n Beihao and Peng,\n Qinmu and You,\n Xinge and Zheng,\n Feng and Shao,\n Ling\n},\n title = {\n FREE: Feature Refinement for Generalized Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 122-131\n} \n}" }, { "title": "FaPN: Feature-Aligned Pyramid Network for Dense Image Prediction", @@ -14278,10 +15246,11 @@ "status": "Poster", "track": "main", "pid": 3104, + "author_site": "Shihua Huang; Zhichao Lu; Ran Cheng; Cheng He", "author": "Shihua Huang; Zhichao Lu; Ran Cheng; Cheng He", "abstract": "Recent advancements in deep neural networks have made remarkable leap-forwards in dense image prediction. However, the issue of feature alignment remains as neglected by most existing approaches for simplicity. Direct pixel addition between upsampled and local features leads to feature maps with misaligned contexts that, in turn, translate to mis-classifications in prediction, especially on object boundaries. In this paper, we propose a feature alignment module that learns transformation offsets of pixels to contextually align upsampled higher-level features; and another feature selection module to emphasize the lower-level features with rich spatial details. We then integrate these two modules in a top-down pyramidal architecture and present the Feature-aligned Pyramid Network (FaPN). Extensive experimental evaluations on four dense prediction tasks and four datasets have demonstrated the efficacy of FaPN, yielding an overall improvement of 1.2 - 2.6 points in AP / mIoU over FPN when paired with Faster / Mask R-CNN. 
In particular, our FaPN achieves the state-of-the-art of 56.7% mIoU on ADE20K when integrated within Mask-Former. The code is available from https://github.com/EMI-Group/FaPN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_FaPN_Feature-Aligned_Pyramid_Network_for_Dense_Image_Prediction_ICCV_2021_paper.pdf", - "aff": "Southern University of Science and Technology\u2020; Southern University of Science and Technology\u2020; Southern University of Science and Technology\u2020; Southern University of Science and Technology\u2020", + "aff": "Southern University of Science and Technology†; Southern University of Science and Technology†; Southern University of Science and Technology†; Southern University of Science and Technology†", "project": "", "github": "https://github.com/EMI-Group/FaPN", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Huang_FaPN_Feature-Aligned_Pyramid_ICCV_2021_supplemental.pdf", @@ -14301,7 +15270,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Shihua and Lu,\n Zhichao and Cheng,\n Ran and He,\n Cheng\n},\n title = {\n FaPN: Feature-Aligned Pyramid Network for Dense Image Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 864-873\n} \n}" }, { "title": "Face Image Retrieval With Attribute Manipulation", @@ -14309,6 +15279,7 @@ "status": "Poster", "track": "main", "pid": 6328, + "author_site": "Alireza Zaeemzadeh; Shabnam Ghadar; Baldo Faieta; Zhe Lin; Nazanin Rahnavard; Mubarak Shah; Ratheesh Kalarot", "author": "Alireza Zaeemzadeh; Shabnam Ghadar; Baldo Faieta; Zhe Lin; Nazanin Rahnavard; Mubarak Shah; Ratheesh Kalarot", "abstract": "Current face image retrieval solutions are limited, since they 
treat different facial attributes the same and cannot incorporate user's preference for a subset of attributes in their search criteria. This paper introduces a new face image retrieval framework, where the input face query is augmented by both an adjustment vector that specifies the desired modifications to the facial attributes, and a preference vector that assigns different levels of importance to different attributes. For example, a user can ask for retrieving images similar to a query image, but with a different hair color, and no preference for absence/presence of eyeglasses in the results. To achieve this, we propose to disentangle the semantics, corresponding to various attributes, by learning a set of sparse and orthogonal basis vectors in the latent space of StyleGAN. Such basis vectors are then employed to decompose the dissimilarity between face images in terms of dissimilarity between their attributes, assign preference to the attributes, and adjust the attributes in the query. Enforcing sparsity on the basis vectors helps us to disentangle the latent space and adjust each attribute independently from other attributes, while enforcing orthogonality facilitates preference assignment and the dissimilarity decomposition. 
The effectiveness of our approach is illustrated by achieving state-of-the-art results for the face image retrieval task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zaeemzadeh_Face_Image_Retrieval_With_Attribute_Manipulation_ICCV_2021_paper.pdf", @@ -14325,14 +15296,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zaeemzadeh_Face_Image_Retrieval_With_Attribute_Manipulation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0;0;1", - "aff_unique_norm": "University of Central Florida;Adobe", - "aff_unique_dep": ";Adobe Inc.", + "aff_unique_norm": "University of Central Florida;Adobe Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ucf.edu;https://www.adobe.com", "aff_unique_abbr": "UCF;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zaeemzadeh_2021_ICCV,\n \n author = {\n Zaeemzadeh,\n Alireza and Ghadar,\n Shabnam and Faieta,\n Baldo and Lin,\n Zhe and Rahnavard,\n Nazanin and Shah,\n Mubarak and Kalarot,\n Ratheesh\n},\n title = {\n Face Image Retrieval With Attribute Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12116-12125\n} \n}" }, { "title": "Factorizing Perception and Policy for Interactive Instruction Following", @@ -14340,6 +15312,7 @@ "status": "Poster", "track": "main", "pid": 6140, + "author_site": "Kunal Pratap Singh; Suvaansh Bhambri; Byeonghwi Kim; Roozbeh Mottaghi; Jonghyun Choi", "author": "Kunal Pratap Singh; Suvaansh Bhambri; Byeonghwi Kim; Roozbeh Mottaghi; Jonghyun Choi", "abstract": "Performing simple household tasks based on language directives is very natural to humans, yet it remains an open challenge for an AI agent. 
The 'interactive instruction following' task attempts to make progress towards building an agent that can jointly navigate, interact, and reason in the environment at every step. To address the multifaceted problem, we propose a model that factorizes the task into interactive perception and action policy streams with enhanced components. We empirically validate that our model outperforms prior arts by significant margins on the ALFRED benchmark in all metrics with improved generalization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Singh_Factorizing_Perception_and_Policy_for_Interactive_Instruction_Following_ICCV_2021_paper.pdf", @@ -14363,7 +15336,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Gwangju", "aff_country_unique_index": "0+1;1;1+0;0;1", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Singh_2021_ICCV,\n \n author = {\n Singh,\n Kunal Pratap and Bhambri,\n Suvaansh and Kim,\n Byeonghwi and Mottaghi,\n Roozbeh and Choi,\n Jonghyun\n},\n title = {\n Factorizing Perception and Policy for Interactive Instruction Following\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1888-1897\n} \n}" }, { "title": "FairNAS: Rethinking Evaluation Fairness of Weight Sharing Neural Architecture Search", @@ -14371,6 +15345,7 @@ "status": "Poster", "track": "main", "pid": 5476, + "author_site": "Xiangxiang Chu; Bo Zhang; Ruijun Xu", "author": "Xiangxiang Chu; Bo Zhang; Ruijun Xu", "abstract": "One of the most critical problems in weight-sharing neural architecture search is the evaluation of candidate models within a predefined search space. In practice, a one-shot supernet is trained to serve as an evaluator. A faithful ranking certainly leads to more accurate searching results. However, current methods are prone to making misjudgments. 
In this paper, we prove that their biased evaluation is due to inherent unfairness in the supernet training. In view of this, we propose two levels of constraints: expectation fairness and strict fairness. Particularly, strict fairness ensures equal optimization opportunities for all choice blocks throughout the training, which neither overestimates nor underestimates their capacity. We demonstrate that this is crucial for improving the confidence of models' ranking. Incorporating the one-shot supernet trained under the proposed fairness constraints with a multi-objective evolutionary search algorithm, we obtain various state-of-the-art models, e.g., FairNAS-A attains 77.5% top-1 validation accuracy on ImageNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chu_FairNAS_Rethinking_Evaluation_Fairness_of_Weight_Sharing_Neural_Architecture_Search_ICCV_2021_paper.pdf", @@ -14394,7 +15369,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chu_2021_ICCV,\n \n author = {\n Chu,\n Xiangxiang and Zhang,\n Bo and Xu,\n Ruijun\n},\n title = {\n FairNAS: Rethinking Evaluation Fairness of Weight Sharing Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12239-12248\n} \n}" }, { "title": "Fake It Till You Make It: Face Analysis in the Wild Using Synthetic Data Alone", @@ -14402,7 +15378,8 @@ "status": "Poster", "track": "main", "pid": 6962, - "author": "Erroll Wood; Tadas Baltru\u0161aitis; Charlie Hewitt; Sebastian Dziadzio; Thomas J. Cashman; Jamie Shotton", + "author_site": "Erroll Wood; Tadas Baltrušaitis; Charlie Hewitt; Sebastian Dziadzio; Thomas J. Cashman; Jamie Shotton", + "author": "Erroll Wood; Tadas Baltrušaitis; Charlie Hewitt; Sebastian Dziadzio; Thomas J. 
Cashman; Jamie Shotton", "abstract": "We demonstrate that it is possible to perform face-related computer vision in the wild using synthetic data alone. The community has long enjoyed the benefits of synthesizing training data with graphics, but the domain gap between real and synthetic data has remained a problem, especially for human faces. Researchers have tried to bridge this gap with data mixing, domain adaptation, and domain-adversarial training, but we show that it is possible to synthesize data with minimal domain gap, so that models trained on synthetic data generalize to real in-the-wild datasets. We describe how to combine a procedurally-generated parametric 3D face model with a comprehensive library of hand-crafted assets to render training images with unprecedented realism and diversity. We train machine learning systems for face-related tasks such as landmark localization and face parsing, showing that synthetic data can both match real data in accuracy, as well as open up new approaches where manual labeling would be impossible.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wood_Fake_It_Till_You_Make_It_Face_Analysis_in_the_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -14416,7 +15393,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wood_Fake_It_Till_You_Make_It_Face_Analysis_in_the_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wood_Fake_It_Till_You_Make_It_Face_Analysis_in_the_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wood_2021_ICCV,\n \n author = {\n Wood,\n Erroll and Baltru\\v{s\n}aitis,\n Tadas and Hewitt,\n Charlie and Dziadzio,\n Sebastian and Cashman,\n Thomas J. 
and Shotton,\n Jamie\n},\n title = {\n Fake It Till You Make It: Face Analysis in the Wild Using Synthetic Data Alone\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3681-3691\n} \n}" }, { "title": "FashionMirror: Co-Attention Feature-Remapping Virtual Try-On With Sequential Template Poses", @@ -14424,6 +15402,7 @@ "status": "Poster", "track": "main", "pid": 4152, + "author_site": "Chieh-Yun Chen; Ling Lo; Pin-Jui Huang; Hong-Han Shuai; Wen-Huang Cheng", "author": "Chieh-Yun Chen; Ling Lo; Pin-Jui Huang; Hong-Han Shuai; Wen-Huang Cheng", "abstract": "Virtual try-on tasks have drawn increased attention. Prior arts focus on tackling this task via warping clothes and fusing the information at the pixel level with the help of semantic segmentation. However, conducting semantic segmentation is time-consuming and easily causes error accumulation over time. Besides, warping the information at the pixel level instead of the feature level limits the performance (e.g., unable to generate different views) and is unstable since it directly demonstrates the results even with a misalignment. In contrast, fusing information at the feature level can be further refined by the convolution to obtain the final results. Based on these assumptions, we propose a co-attention feature-remapping framework, namely FashionMirror, that generates the try-on results according to the driven-pose sequence in two stages. In the first stage, we consider the source human image and the target try-on clothes to predict the removed mask and the try-on clothing mask, which replaces the pre-processed semantic segmentation and reduces the inference time. In the second stage, we first remove the clothes on the source human via the removed mask and warp the clothing features conditioning on the try-on clothing mask to fit the next frame human. 
Meanwhile, we predict the optical flows from the consecutive 2D poses and warp the source human to the next frame at the feature level. Then, we enhance the clothing features and source human features in every frame to generate realistic try-on results with spatio-temporal smoothness. Both qualitative and quantitative results show that FashionMirror outperforms the state-of-the-art virtual try-on approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_FashionMirror_Co-Attention_Feature-Remapping_Virtual_Try-On_With_Sequential_Template_Poses_ICCV_2021_paper.pdf", @@ -14447,7 +15426,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Chieh-Yun and Lo,\n Ling and Huang,\n Pin-Jui and Shuai,\n Hong-Han and Cheng,\n Wen-Huang\n},\n title = {\n FashionMirror: Co-Attention Feature-Remapping Virtual Try-On With Sequential Template Poses\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13809-13818\n} \n}" }, { "title": "Fast Convergence of DETR With Spatially Modulated Co-Attention", @@ -14455,6 +15435,7 @@ "status": "Poster", "track": "main", "pid": 6004, + "author_site": "Peng Gao; Minghang Zheng; Xiaogang Wang; Jifeng Dai; Hongsheng Li", "author": "Peng Gao; Minghang Zheng; Xiaogang Wang; Jifeng Dai; Hongsheng Li", "abstract": "The recently proposed Detection Transformer (DETR) model successfully applies Transformer to objects detection and achieves comparable performance with two-stage object detection frameworks, such as Faster-RCNN. However, DETR suffers from its slow convergence. Training DETR from scratch needs 500 epochs to achieve a high accuracy. 
To accelerate its convergence, we propose a simple yet effective scheme for improving the DETR framework, namely Spatially Modulated Co-Attention (SMCA) mechanism. The core idea of SMCA is to conduct location-aware co-attention in DETR by constraining co-attention responses to be high near initially estimated bounding box locations. Our proposed SMCA increases DETR's convergence speed by replacing the original co-attention mechanism in the decoder while keeping other operations in DETR unchanged. Furthermore, by integrating multi-head and scale-selection attention designs into SMCA, our fully-fledged SMCA can achieve better performance compared to DETR with a dilated convolution-based backbone (45.6 mAP at 108 epochs vs. 43.3 mAP at 500 epochs). We perform extensive ablation studies on COCO dataset to validate SMCA. Code is released at https://github.com/gaopengcuhk/SMCA-DETR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Fast_Convergence_of_DETR_With_Spatially_Modulated_Co-Attention_ICCV_2021_paper.pdf", @@ -14471,14 +15452,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gao_Fast_Convergence_of_DETR_With_Spatially_Modulated_Co-Attention_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;2", - "aff_unique_norm": "Shanghai AI Laboratory;Peking University;Chinese University of Hong Kong;SenseTime", + "aff_unique_norm": "Shanghai AI Laboratory;Peking University;The Chinese University of Hong Kong;SenseTime", "aff_unique_dep": ";;CUHK-SenseTime Joint Laboratory;SenseTime Research", "aff_unique_url": "https://www.shanghai-ai-lab.com;http://www.pku.edu.cn;https://www.cuhk.edu.hk;https://www.sensetime.com", "aff_unique_abbr": "SAIL;Peking U;CUHK;SenseTime", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Peng and 
Zheng,\n Minghang and Wang,\n Xiaogang and Dai,\n Jifeng and Li,\n Hongsheng\n},\n title = {\n Fast Convergence of DETR With Spatially Modulated Co-Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3621-3630\n} \n}" }, { "title": "Fast Light-Field Disparity Estimation With Multi-Disparity-Scale Cost Aggregation", @@ -14486,6 +15468,7 @@ "status": "Poster", "track": "main", "pid": 8548, + "author_site": "Zhicong Huang; Xuemei Hu; Zhou Xue; Weizhu Xu; Tao Yue", "author": "Zhicong Huang; Xuemei Hu; Zhou Xue; Weizhu Xu; Tao Yue", "abstract": "Light field images contain both angular and spatial information of captured light rays. The rich information of light fields enables straightforward disparity recovery capability but demands high computational cost as well. In this paper, we design a lightweight disparity estimation model with physical-based multi-disparity-scale cost volume aggregation for fast disparity estimation. By introducing a sub-network of edge guidance, we significantly improve the recovery of geometric details near edges and improve the overall performance. We test the proposed model extensively on both synthetic and real-captured datasets, which provide both densely and sparsely sampled light fields. Finally, we significantly reduce computation cost and GPU memory consumption, while achieving comparable performance with state-of-the-art disparity estimation methods for light fields. 
Our source code is available at https://github.com/zcong17huang/FastLFnet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Fast_Light-Field_Disparity_Estimation_With_Multi-Disparity-Scale_Cost_Aggregation_ICCV_2021_paper.pdf", @@ -14509,7 +15492,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Zhicong and Hu,\n Xuemei and Xue,\n Zhou and Xu,\n Weizhu and Yue,\n Tao\n},\n title = {\n Fast Light-Field Disparity Estimation With Multi-Disparity-Scale Cost Aggregation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6320-6329\n} \n}" }, { "title": "Fast Video Moment Retrieval", @@ -14517,6 +15501,7 @@ "status": "Poster", "track": "main", "pid": 1772, + "author_site": "Junyu Gao; Changsheng Xu", "author": "Junyu Gao; Changsheng Xu", "abstract": "This paper targets at fast video moment retrieval (fast VMR), aiming to localize the target moment efficiently and accurately as queried by a given natural language sentence. We argue that most existing VMR approaches can be divided into three modules namely video encoder, text encoder, and cross-modal interaction module, where the last module is the test-time computational bottleneck. To tackle this issue, we replace the cross-modal interaction module with a cross-modal common space, in which moment-query alignment is learned and efficient moment search can be performed. For the sake of robustness in the learned space, we propose a fine-grained semantic distillation framework to transfer knowledge from additional semantic structures. Specifically, we build a semantic role tree that decomposes a query sentence into different phrases (subtrees). 
A hierarchical semantic-guided attention module is designed to perform message propagation across the whole tree and yield discriminative features. Finally, the important and discriminative semantics are transferred to the common space by a matching-score distillation process. Extensive experimental results on three popular VMR benchmarks demonstrate that our proposed method enjoys the merits of high speed and significant performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Fast_Video_Moment_Retrieval_ICCV_2021_paper.pdf", @@ -14533,14 +15518,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gao_Fast_Video_Moment_Retrieval_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1+2", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Automation;School of Artificial Intelligence;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Automation;School of Artificial Intelligence;", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;", "aff_unique_abbr": "CASIA;UCAS;", "aff_campus_unique_index": ";1", "aff_campus_unique": ";ShenZhen", "aff_country_unique_index": "0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Junyu and Xu,\n Changsheng\n},\n title = {\n Fast Video Moment Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1523-1532\n} \n}" }, { "title": "Fast and Efficient DNN Deployment via Deep Gaussian Transfer Learning", @@ -14548,6 +15534,7 @@ "status": "Poster", "track": "main", "pid": 9785, + "author_site": "Qi Sun; Chen Bai; Tinghuan Chen; Hao Geng; Xinyun Zhang; 
Yang Bai; Bei Yu", "author": "Qi Sun; Chen Bai; Tinghuan Chen; Hao Geng; Xinyun Zhang; Yang Bai; Bei Yu", "abstract": "Deep neural networks (DNNs) have been widely used recently while their hardware deployment optimizations are very time-consuming and the historical deployment knowledge is not utilized efficiently. In this paper, to accelerate the optimization process and find better deployment configurations, we propose a novel transfer learning method based on deep Gaussian processes (DGPs). Firstly, a deep Gaussian process (DGP) model is built on the historical data to learn empirical knowledge. Secondly, to transfer knowledge to a new task, a tuning set is sampled for the new task under the guidance of the DGP model. Then DGP is tuned according to the tuning set via maximum-a-posteriori (MAP) estimation to accommodate for the new task and finally used to guide the deployments of the task. The experiments show that our method achieves the best inference latencies of convolutions while accelerating the optimization process significantly, compared with previous arts.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Fast_and_Efficient_DNN_Deployment_via_Deep_Gaussian_Transfer_Learning_ICCV_2021_paper.pdf", @@ -14564,14 +15551,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sun_Fast_and_Efficient_DNN_Deployment_via_Deep_Gaussian_Transfer_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Qi and Bai,\n Chen and 
Chen,\n Tinghuan and Geng,\n Hao and Zhang,\n Xinyun and Bai,\n Yang and Yu,\n Bei\n},\n title = {\n Fast and Efficient DNN Deployment via Deep Gaussian Transfer Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5380-5390\n} \n}" }, { "title": "FastNeRF: High-Fidelity Neural Rendering at 200FPS", @@ -14579,6 +15567,7 @@ "status": "Poster", "track": "main", "pid": 7615, + "author_site": "Stephan J. Garbin; Marek Kowalski; Matthew Johnson; Jamie Shotton; Julien Valentin", "author": "Stephan J. Garbin; Marek Kowalski; Matthew Johnson; Jamie Shotton; Julien Valentin", "abstract": "Recent work on Neural Radiance Fields (NeRF) showed how neural networks can be used to encode complex 3D environments that can be rendered photorealistically from novel viewpoints. Rendering these images is very computationally demanding and recent improvements are still a long way from enabling interactive rates, even on high-end hardware. Motivated by scenarios on mobile and mixed reality devices, we propose FastNeRF, the first NeRF-based system capable of rendering high fidelity photorealistic images at 200Hz on a high-end consumer GPU. The core of our method is a graphics-inspired factorization that allows for (i) compactly caching a deep radiance map at each position in space, (ii) efficiently querying that map using ray directions to estimate the pixel values in the rendered image. 
Extensive experiments show that the proposed method is 3000 times faster than the original NeRF algorithm and at least an order of magnitude faster than existing work on accelerating NeRF, while maintaining visual quality and extensibility.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Garbin_FastNeRF_High-Fidelity_Neural_Rendering_at_200FPS_ICCV_2021_paper.pdf", @@ -14595,14 +15584,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Garbin_FastNeRF_High-Fidelity_Neural_Rendering_at_200FPS_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Microsoft", - "aff_unique_dep": "Microsoft Corporation", + "aff_unique_norm": "Microsoft Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Garbin_2021_ICCV,\n \n author = {\n Garbin,\n Stephan J. and Kowalski,\n Marek and Johnson,\n Matthew and Shotton,\n Jamie and Valentin,\n Julien\n},\n title = {\n FastNeRF: High-Fidelity Neural Rendering at 200FPS\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14346-14355\n} \n}" }, { "title": "Faster Multi-Object Segmentation Using Parallel Quadratic Pseudo-Boolean Optimization", @@ -14610,6 +15600,7 @@ "status": "Poster", "track": "main", "pid": 6505, + "author_site": "Niels Jeppesen; Patrick M. Jensen; Anders N. Christensen; Anders B. Dahl; Vedrana A. Dahl", "author": "Niels Jeppesen; Patrick M. Jensen; Anders N. Christensen; Anders B. Dahl; Vedrana A. Dahl", "abstract": "We introduce a parallel version of the Quadratic Pseudo-Boolean Optimization (QPBO) algorithm for solving binary optimization tasks, such as image segmentation. 
The original QPBO implementation by Kolmogorov and Rother relies on the Boykov-Kolmogorov (BK) maxflow/mincut algorithm and performs well for many image analysis tasks. However, the serial nature of their QPBO algorithm results in poor utilization of modern hardware. By redesigning the QPBO algorithm to work with parallel maxflow/mincut algorithms, we significantly reduce solve time of large optimization tasks. We compare our parallel QPBO implementation to other state-of-the-art solvers and benchmark them on two large segmentation tasks and a substantial set of small segmentation tasks. The results show that our parallel QPBO algorithm is over 20 times faster than the serial QPBO algorithm on the large tasks and over three times faster for the majority of the small tasks. Although we focus on image segmentation, our algorithm is generic and can be used for any QPBO problem. Our implementation and experimental results are available at DOI: 10.5281/zenodo.5201620", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jeppesen_Faster_Multi-Object_Segmentation_Using_Parallel_Quadratic_Pseudo-Boolean_Optimization_ICCV_2021_paper.pdf", @@ -14633,7 +15624,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Kgs. Lyngby", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Denmark" + "aff_country_unique": "Denmark", + "bibtex": "@InProceedings{Jeppesen_2021_ICCV,\n \n author = {\n Jeppesen,\n Niels and Jensen,\n Patrick M. and Christensen,\n Anders N. and Dahl,\n Anders B. 
and Dahl,\n Vedrana A.\n},\n title = {\n Faster Multi-Object Segmentation Using Parallel Quadratic Pseudo-Boolean Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6260-6269\n} \n}" }, { "title": "FcaNet: Frequency Channel Attention Networks", @@ -14641,6 +15633,7 @@ "status": "Poster", "track": "main", "pid": 7455, + "author_site": "Zequn Qin; Pengyi Zhang; Fei Wu; Xi Li", "author": "Zequn Qin; Pengyi Zhang; Fei Wu; Xi Li", "abstract": "Attention mechanism, especially channel attention, has gained great success in the computer vision field. Many works focus on how to design efficient channel attention mechanisms while ignoring a fundamental problem, i.e., channel attention mechanism uses scalar to represent channel, which is difficult due to massive information loss. In this work, we start from a different view and regard the channel representation problem as a compression process using frequency analysis. Based on the frequency analysis, we mathematically prove that the conventional global average pooling is a special case of the feature decomposition in the frequency domain. With the proof, we naturally generalize the compression of the channel attention mechanism in the frequency domain and propose our method with multi-spectral channel attention, termed as FcaNet. FcaNet is simple but effective. We can change a few lines of code in the calculation to implement our method within existing channel attention methods. Moreover, the proposed method achieves state-of-the-art results compared with other channel attention methods on image classification, object detection, and instance segmentation tasks. Our method could consistently outperform the baseline SENet, with the same number of parameters and the same computational cost. 
Our code and models are publicly available at https://github.com/cfzd/FcaNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qin_FcaNet_Frequency_Channel_Attention_Networks_ICCV_2021_paper.pdf", @@ -14664,7 +15657,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qin_2021_ICCV,\n \n author = {\n Qin,\n Zequn and Zhang,\n Pengyi and Wu,\n Fei and Li,\n Xi\n},\n title = {\n FcaNet: Frequency Channel Attention Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 783-792\n} \n}" }, { "title": "Feature Importance-Aware Transferable Adversarial Attacks", @@ -14672,6 +15666,7 @@ "status": "Poster", "track": "main", "pid": 6189, + "author_site": "Zhibo Wang; Hengchang Guo; Zhifei Zhang; Wenxin Liu; Zhan Qin; Kui Ren", "author": "Zhibo Wang; Hengchang Guo; Zhifei Zhang; Wenxin Liu; Zhan Qin; Kui Ren", "abstract": "Transferability of adversarial examples is of central importance for attacking an unknown model, which facilitates adversarial attacks in more practical scenarios, e.g., blackbox attacks. Existing transferable attacks tend to craft adversarial examples by indiscriminately distorting features to degrade prediction accuracy in a source model without aware of intrinsic features of objects in the images. We argue that such brute-force degradation would introduce model-specific local optimum into adversarial examples, thus limiting the transferability. By contrast, we propose the Feature Importance-aware Attack (FIA), which disrupts important object-aware features that dominate model decisions consistently. 
More specifically, we obtain feature importance by introducing the aggregate gradient, which averages the gradients with respect to feature maps of the source model, computed on a batch of random transforms of the original clean image. The gradients will be highly correlated to objects of interest, and such correlation presents invariance across different models. Besides, the random transforms will preserve intrinsic features of objects and suppress model-specific information. Finally, the feature importance guides to search for adversarial examples towards disrupting critical features, achieving stronger transferability. Extensive experimental evaluation demonstrates the effectiveness and superior performance of the proposed FIA, i.e., improving the success rate by 9.5% against normally trained models and 12.8% against defense models as compared to the state-of-the-art transferable attacks. Code is available at: https://github.com/hcguoO0/FIA", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Feature_Importance-Aware_Transferable_Adversarial_Attacks_ICCV_2021_paper.pdf", @@ -14695,7 +15690,8 @@ "aff_campus_unique_index": ";1;1;;", "aff_campus_unique": ";Wuhan", "aff_country_unique_index": "0+0;0;1;0;0+0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zhibo and Guo,\n Hengchang and Zhang,\n Zhifei and Liu,\n Wenxin and Qin,\n Zhan and Ren,\n Kui\n},\n title = {\n Feature Importance-Aware Transferable Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7639-7648\n} \n}" }, { "title": "Feature Interactive Representation for Point Cloud Registration", @@ -14703,10 +15699,11 @@ "status": "Poster", "track": "main", "pid": 8263, + "author_site": "Bingli Wu; Jie Ma; Gaojie Chen; Pei An", "author": 
"Bingli Wu; Jie Ma; Gaojie Chen; Pei An", "abstract": "Point cloud registration is the process of using the common structures in two point clouds to splice them together. To find out these common structures and make these structures match more accurately, we investigate the direction of interacting information of the source and target point clouds. To this end, we propose a Feature Interactive Representation learning Network (FIRE-Net), which can explore feature interaction among the source and target point clouds from different levels. Specifically, we first introduce a Combined Feature Encoder (CFE) based on feature interaction intra point cloud. CFE extracts interactive features intra each point cloud and combines them to enhance the ability of the network to describe the local geometric structure. Then, we propose a feature interaction mechanism inter point clouds which includes a Local Interaction Unit (LIU) and a Global Interaction Unit (GIU). The former is used to interact information between point pairs across two point clouds, thus the point features in one point cloud and its similar point features in another point cloud can be aware of each other. The latter is applied to change the per-point features depending on the global cross information of two point clouds, thus one point cloud has the global perception of another. 
Extensive experiments on partially overlapping point cloud registration show that our method achieves state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Feature_Interactive_Representation_for_Point_Cloud_Registration_ICCV_2021_paper.pdf", - "aff": "National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China; National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China; National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China; National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China", + "aff": "National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China; National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China; National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China; National Key Laboratory of Science and Technology on Multi-spectral Information Processing, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, P.R.China", "project": "", "github": "", "supp": "", @@ -14720,13 +15717,14 @@ "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Feature_Interactive_Representation_for_Point_Cloud_Registration_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Huazhong University of Science and Technology", - "aff_unique_dep": "School of Arti\ufb01cial Intelligence and Automation", + "aff_unique_dep": "School of Artificial Intelligence and Automation", "aff_unique_url": "http://www.hust.edu.cn", "aff_unique_abbr": "HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Bingli and Ma,\n Jie and Chen,\n Gaojie and An,\n Pei\n},\n title = {\n Feature Interactive Representation for Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5530-5539\n} \n}" }, { "title": "Federated Learning for Non-IID Data via Unified Feature Learning and Optimization Objective Alignment", @@ -14734,10 +15732,11 @@ "status": "Poster", "track": "main", "pid": 7218, + "author_site": "Lin Zhang; Yong Luo; Yan Bai; Bo Du; Ling-Yu Duan", "author": "Lin Zhang; Yong Luo; Yan Bai; Bo Du; Ling-Yu Duan", "abstract": "Federated Learning (FL) aims to establish a shared model across decentralized clients under the privacy-preserving constraint. Despite certain success, it is still challenging for FL to deal with non-IID (non-independent and identical distribution) client data, which is a general scenario in real-world FL tasks. It has been demonstrated that the performance of FL will be reduced greatly under the non-IID scenario, since the discrepant data distributions will induce optimization inconsistency and feature divergence issues. 
Besides, naively minimizing an aggregate loss function in this scenario may have negative impacts on some clients and thus deteriorate their personal model performance. To address these issues, we propose a Unified Feature learning and Optimization objectives alignment method (FedUFO) for non-IID FL. In particular, an adversary module is proposed to reduce the divergence on feature representation among different clients, and two consensus losses are proposed to reduce the inconsistency on optimization objectives from two perspectives. Extensive experiments demonstrate that our FedUFO can outperform the state-of-the-art approaches, including the competitive one data-sharing method. Besides, FedUFO can enable more reasonable and balanced model performance among different clients.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Federated_Learning_for_Non-IID_Data_via_Unified_Feature_Learning_and_ICCV_2021_paper.pdf", - "aff": "Institute of Digital Media (IDM), Peking University, Beijing, China + Peng Cheng Laboratory, Shenzhen, China; Institute of Arti\ufb01cial Intelligence, School of Computer Science, Wuhan University, Wuhan, China; Institute of Digital Media (IDM), Peking University, Beijing, China + Peng Cheng Laboratory, Shenzhen, China; Institute of Arti\ufb01cial Intelligence, School of Computer Science, Wuhan University, Wuhan, China; Institute of Digital Media (IDM), Peking University, Beijing, China + Peng Cheng Laboratory, Shenzhen, China", + "aff": "Institute of Digital Media (IDM), Peking University, Beijing, China + Peng Cheng Laboratory, Shenzhen, China; Institute of Artificial Intelligence, School of Computer Science, Wuhan University, Wuhan, China; Institute of Digital Media (IDM), Peking University, Beijing, China + Peng Cheng Laboratory, Shenzhen, China; Institute of Artificial Intelligence, School of Computer Science, Wuhan University, Wuhan, China; Institute of Digital Media (IDM), Peking University, Beijing, China + Peng 
Cheng Laboratory, Shenzhen, China", "project": "", "github": "", "supp": "", @@ -14750,14 +15749,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Federated_Learning_for_Non-IID_Data_via_Unified_Feature_Learning_and_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;0+1;2;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Wuhan University", - "aff_unique_dep": "Institute of Digital Media (IDM);Peng Cheng Laboratory;School of Computer Science", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Wuhan University", + "aff_unique_dep": "Institute of Digital Media (IDM);;School of Computer Science", "aff_unique_url": "http://www.pku.edu.cn;;http://www.whu.edu.cn", "aff_unique_abbr": "PKU;;WHU", "aff_campus_unique_index": "0+1;2;0+1;2;0+1", "aff_campus_unique": "Beijing;Shenzhen;Wuhan", "aff_country_unique_index": "0+0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Lin and Luo,\n Yong and Bai,\n Yan and Du,\n Bo and Duan,\n Ling-Yu\n},\n title = {\n Federated Learning for Non-IID Data via Unified Feature Learning and Optimization Objective Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4420-4428\n} \n}" }, { "title": "Few-Shot Image Classification: Just Use a Library of Pre-Trained Feature Extractors and a Simple Classifier", @@ -14765,6 +15765,7 @@ "status": "Poster", "track": "main", "pid": 9655, + "author_site": "Arkabandhu Chowdhury; Mingchao Jiang; Swarat Chaudhuri; Chris Jermaine", "author": "Arkabandhu Chowdhury; Mingchao Jiang; Swarat Chaudhuri; Chris Jermaine", "abstract": "Recent papers have suggested that transfer learning can outperform sophisticated meta-learning methods for few-shot image classification. 
We take this hypothesis to its logical conclusion, and suggest the use of an ensemble of high-quality, pre-trained feature extractors for few-shot image classification. We show experimentally that a library of pre-trained feature extractors combined with a simple feed-forward network learned with an L2-regularizer can be an excellent option for solving cross-domain few-shot image classification. Our experimental results suggest that this simpler sample-efficient approach far outperforms several well-established meta-learning algorithms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chowdhury_Few-Shot_Image_Classification_Just_Use_a_Library_of_Pre-Trained_Feature_ICCV_2021_paper.pdf", @@ -14779,7 +15780,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chowdhury_Few-Shot_Image_Classification_Just_Use_a_Library_of_Pre-Trained_Feature_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chowdhury_Few-Shot_Image_Classification_Just_Use_a_Library_of_Pre-Trained_Feature_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chowdhury_2021_ICCV,\n \n author = {\n Chowdhury,\n Arkabandhu and Jiang,\n Mingchao and Chaudhuri,\n Swarat and Jermaine,\n Chris\n},\n title = {\n Few-Shot Image Classification: Just Use a Library of Pre-Trained Feature Extractors and a Simple Classifier\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9445-9454\n} \n}" }, { "title": "Few-Shot Semantic Segmentation With Cyclic Memory Network", @@ -14787,6 +15789,7 @@ "status": "Poster", "track": "main", "pid": 5901, + "author_site": "Guo-Sen Xie; Huan Xiong; Jie Liu; Yazhou Yao; Ling Shao", "author": "Guo-Sen Xie; Huan Xiong; Jie Liu; Yazhou Yao; Ling Shao", "abstract": "Few-shot semantic segmentation (FSS) is an important task for novel (unseen) object segmentation under 
the data-scarcity scenario. However, most FSS methods rely on unidirectional feature aggregation, e.g., from support prototypes to get the query prediction, and from high-resolution features to guide the low-resolution ones. This usually fails to fully capture the cross-resolution feature relationships and thus leads to inaccurate estimates of the query objects. To resolve the above dilemma, we propose a cyclic memory network (CMN) to directly learn to read abundant support information from all resolution features in a cyclic manner. Specifically, we first generate N pairs (key and value) of multi-resolution query features guided by the support feature and its mask. Next, we circularly take one pair of these features as the query to be segmented, and the rest N-1 pairs are written into an external memory accordingly, i.e., this leave-one-out process is conducted for N times. In each cycle, the query feature is updated by collaboratively matching its key and value with the memory, which can elegantly cover all the spatial locations from different resolutions. Furthermore, we incorporate the query feature re-adding and the query feature recursive updating mechanisms into the memory reading operation. CMN, equipped with these merits, can thus capture cross-resolution relationships and better handle the object appearance and scale variations in FSS. 
Experiments on PASCAL-5i and COCO-20i well validate the effectiveness of our model for FSS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_Few-Shot_Semantic_Segmentation_With_Cyclic_Memory_Network_ICCV_2021_paper.pdf", @@ -14810,7 +15813,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;1;0", - "aff_country_unique": "United Arab Emirates;China" + "aff_country_unique": "United Arab Emirates;China", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Guo-Sen and Xiong,\n Huan and Liu,\n Jie and Yao,\n Yazhou and Shao,\n Ling\n},\n title = {\n Few-Shot Semantic Segmentation With Cyclic Memory Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7293-7302\n} \n}" }, { "title": "Few-Shot Visual Relationship Co-Localization", @@ -14818,6 +15822,7 @@ "status": "Poster", "track": "main", "pid": 10220, + "author_site": "Revant Teotia; Vaibhav Mishra; Mayank Maheshwari; Anand Mishra", "author": "Revant Teotia; Vaibhav Mishra; Mayank Maheshwari; Anand Mishra", "abstract": "In this paper, given a small bag of images, each containing a common but latent predicate, we are interested in localizing visual subject-object pairs connected via the common predicate in each of the images. We refer to this novel problem as visual relationship co-localization or VRC as an abbreviation. VRC is a challenging task, even more so than the well-studied object co-localization task. This becomes further challenging when using just a few images, the model has to learn to co-localize visual subject-object pairs connected via unseen predicates. To solve VRC, we propose an optimization framework to select a common visual relationship in each image of the bag. 
The goal of the optimization framework is to find the optimal solution by learning visual relationship similarity across images in a few-shot setting. To obtain robust visual relationship representation, we utilize a simple yet effective technique that learns relationship embedding as a translation vector from visual subject to visual object in a shared space. Further, to learn visual relationship similarity, we utilize a proven meta-learning technique commonly used for few-shot classification tasks. Finally, to tackle the combinatorial complexity challenge arising from an exponential number of feasible solutions, we use a greedy approximation inference algorithm that selects approximately the best solution. We extensively evaluate our proposed framework on variations of bag sizes obtained from two challenging public datasets, namely VrR-VG and VG-150, and achieve impressive visual co-localization performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Teotia_Few-Shot_Visual_Relationship_Co-Localization_ICCV_2021_paper.pdf", @@ -14841,7 +15846,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Jodhpur", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Teotia_2021_ICCV,\n \n author = {\n Teotia,\n Revant and Mishra,\n Vaibhav and Maheshwari,\n Mayank and Mishra,\n Anand\n},\n title = {\n Few-Shot Visual Relationship Co-Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16342-16351\n} \n}" }, { "title": "Few-Shot and Continual Learning With Attentive Independent Mechanisms", @@ -14849,6 +15855,7 @@ "status": "Poster", "track": "main", "pid": 9898, + "author_site": "Eugene Lee; Cheng-Han Huang; Chen-Yi Lee", "author": "Eugene Lee; Cheng-Han Huang; Chen-Yi Lee", "abstract": "Deep neural networks (DNNs) are known to perform 
well when deployed to test distributions that shares high similarity with the training distribution. Feeding DNNs with new data sequentially that were unseen in the training distribution has two major challenges --- fast adaptation to new tasks and catastrophic forgetting of old tasks. Such difficulties paved way for the on-going research on few-shot learning and continual learning. To tackle these problems, we introduce Attentive Independent Mechanisms (AIM). We incorporate the idea of learning using fast and slow weights in conjunction with the decoupling of the feature extraction and higher-order conceptual learning of a DNN. AIM is designed for higher-order conceptual learning, modeled by a mixture of experts that compete to learn independent concepts to solve a new task. AIM is a modular component that can be inserted into existing deep learning frameworks. We demonstrate its capability for few-shot learning by adding it to SIB and trained on MiniImageNet and CIFAR-FS, showing significant improvement. 
AIM is also applied to ANML and OML trained on Omniglot, CIFAR-100 and MiniImageNet to demonstrate its capability in continual learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Few-Shot_and_Continual_Learning_With_Attentive_Independent_Mechanisms_ICCV_2021_paper.pdf", @@ -14863,7 +15870,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_Few-Shot_and_Continual_Learning_With_Attentive_Independent_Mechanisms_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_Few-Shot_and_Continual_Learning_With_Attentive_Independent_Mechanisms_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Eugene and Huang,\n Cheng-Han and Lee,\n Chen-Yi\n},\n title = {\n Few-Shot and Continual Learning With Attentive Independent Mechanisms\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9455-9464\n} \n}" }, { "title": "Field Convolutions for Surface CNNs", @@ -14871,6 +15879,7 @@ "status": "Poster", "track": "main", "pid": 6280, + "author_site": "Thomas W. Mitchel; Vladimir G. Kim; Michael Kazhdan", "author": "Thomas W. Mitchel; Vladimir G. Kim; Michael Kazhdan", "abstract": "We present a novel surface convolution operator acting on vector fields that is based on a simple observation: instead of combining neighboring features with respect to a single coordinate parameterization defined at a given point, we have every neighbor describe the position of the point within its own coordinate frame. 
This formulation combines intrinsic spatial convolution with parallel transport in a scattering operation while placing no constraints on the filters themselves, providing a definition of convolution that commutes with the action of isometries, has increased descriptive potential, and is robust to noise and other nuisance factors. The result is a rich notion of convolution which we call field convolution, well-suited for CNNs on surfaces. Field convolutions are flexible, straight-forward to incorporate into surface learning frameworks, and their highly discriminating nature has cascading effects throughout the learning pipeline. Using simple networks constructed from residual field convolution blocks, we achieve state-of-the-art results on standard benchmarks in fundamental geometry processing tasks, such as shape classification, segmentation, correspondence, and sparse matching.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mitchel_Field_Convolutions_for_Surface_CNNs_ICCV_2021_paper.pdf", @@ -14894,7 +15903,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mitchel_2021_ICCV,\n \n author = {\n Mitchel,\n Thomas W. and Kim,\n Vladimir G. and Kazhdan,\n Michael\n},\n title = {\n Field Convolutions for Surface CNNs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10001-10011\n} \n}" }, { "title": "Field of Junctions: Extracting Boundary Structure at Low SNR", @@ -14902,6 +15912,7 @@ "status": "Poster", "track": "main", "pid": 1120, + "author_site": "Dor Verbin; Todd Zickler", "author": "Dor Verbin; Todd Zickler", "abstract": "We introduce a bottom-up model for simultaneously finding many boundary elements in an image, including contours, corners and junctions. 
The model explains boundary shape in each small patch using a 'generalized M-junction' comprising M angles and a freely-moving vertex. Images are analyzed using non-convex optimization to cooperatively find M+2 junction values at every location, with spatial consistency being enforced by a novel regularizer that reduces curvature while preserving corners and junctions. The resulting 'field of junctions' is simultaneously a contour detector, corner/junction detector, and boundary-aware smoothing of regional appearance. Notably, its unified analysis of contours, corners, junctions and uniform regions allows it to succeed at high noise levels, where other methods for segmentation and boundary detection fail.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Verbin_Field_of_Junctions_Extracting_Boundary_Structure_at_Low_SNR_ICCV_2021_paper.pdf", @@ -14925,7 +15936,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Verbin_2021_ICCV,\n \n author = {\n Verbin,\n Dor and Zickler,\n Todd\n},\n title = {\n Field of Junctions: Extracting Boundary Structure at Low SNR\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6869-6878\n} \n}" }, { "title": "Field-Guide-Inspired Zero-Shot Learning", @@ -14933,6 +15945,7 @@ "status": "Poster", "track": "main", "pid": 9293, + "author_site": "Utkarsh Mall; Bharath Hariharan; Kavita Bala", "author": "Utkarsh Mall; Bharath Hariharan; Kavita Bala", "abstract": "Modern recognition systems require large amounts of supervision to achieve accuracy. Adapting to new domains requires significant data from experts, which is onerous and can become too expensive. Zero-shot learning requires an annotated set of attributes for a novel category. 
Annotating the full set of attributes for a novel category proves to be a tedious and expensive task in deployment. This is especially the case when the recognition domain is an expert domain. We introduce a new field-guide-inspired approach to zero-shot annotation where the learner model interactively asks for the most useful attributes that define a class. We evaluate our method on classification benchmarks with attribute annotations like CUB, SUN, and AWA2 and show that our model achieves the performance of a model with full annotations at the cost of a significantly fewer number of annotations. Since the time of experts is precious, decreasing annotation cost can be very valuable for real-world deployment.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mall_Field-Guide-Inspired_Zero-Shot_Learning_ICCV_2021_paper.pdf", @@ -14956,7 +15969,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mall_2021_ICCV,\n \n author = {\n Mall,\n Utkarsh and Hariharan,\n Bharath and Bala,\n Kavita\n},\n title = {\n Field-Guide-Inspired Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9546-9555\n} \n}" }, { "title": "Finding Representative Interpretations on Convolutional Neural Networks", @@ -14964,6 +15978,7 @@ "status": "Poster", "track": "main", "pid": 9356, + "author_site": "Peter Cho-Ho Lam; Lingyang Chu; Maxim Torgonskiy; Jian Pei; Yong Zhang; Lanjun Wang", "author": "Peter Cho-Ho Lam; Lingyang Chu; Maxim Torgonskiy; Jian Pei; Yong Zhang; Lanjun Wang", "abstract": "Interpreting the decision logic behind effective deep convolutional neural networks (CNN) on images complements the success of deep learning models. 
However, the existing methods can only interpret some specific decision logic on individual or a small number of images. To facilitate human understandability and generalization ability, it is important to develop representative interpretations that interpret common decision logics of a CNN on a large group of similar images, which reveal the common semantics data contributes to many closely related predictions. In this paper, we develop a novel unsupervised approach to produce a highly representative interpretation for a large number of similar images. We formulate the problem of finding representative interpretations as a co-clustering problem, and convert it into a submodular cost submodular cover problem based on a sample of the linear decision boundaries of a CNN. We also present a visualization and similarity ranking method. Our extensive experiments demonstrate the excellent performance of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lam_Finding_Representative_Interpretations_on_Convolutional_Neural_Networks_ICCV_2021_paper.pdf", @@ -14980,14 +15995,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lam_Finding_Representative_Interpretations_on_Convolutional_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2;0;0", - "aff_unique_norm": "Huawei;McMaster University;Simon Fraser University", - "aff_unique_dep": "Huawei Canada Technologies Co., Ltd.;;", + "aff_unique_norm": "Huawei Canada Technologies Co., Ltd.;McMaster University;Simon Fraser University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.huawei.com/ca-en/;https://www.mcmaster.ca;https://www.sfu.ca", "aff_unique_abbr": ";McMaster;SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Lam_2021_ICCV,\n \n author = {\n Lam,\n Peter Cho-Ho and Chu,\n Lingyang and 
Torgonskiy,\n Maxim and Pei,\n Jian and Zhang,\n Yong and Wang,\n Lanjun\n},\n title = {\n Finding Representative Interpretations on Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1345-1354\n} \n}" }, { "title": "Fine-Grained Semantics-Aware Representation Enhancement for Self-Supervised Monocular Depth Estimation", @@ -14995,6 +16011,7 @@ "status": "Poster", "track": "main", "pid": 8571, + "author_site": "Hyunyoung Jung; Eunhyeok Park; Sungjoo Yoo", "author": "Hyunyoung Jung; Eunhyeok Park; Sungjoo Yoo", "abstract": "Self-supervised monocular depth estimation has been widely studied, owing to its practical importance and recent promising improvements. However, most works suffer from limited supervision of photometric consistency, especially in weak texture regions and at object boundaries. To overcome this weakness, we propose novel ideas to improve self-supervised monocular depth estimation by leveraging cross-domain information, especially scene semantics. We focus on incorporating implicit semantic knowledge into geometric representation enhancement and suggest two ideas: a metric learning approach that exploits the semantics-guided local geometry to optimize intermediate depth representations and a novel feature fusion module that judiciously utilizes cross-modality between two heterogeneous feature representations. We comprehensively evaluate our methods on the KITTI dataset and demonstrate that our method outperforms state-of-the-art methods. 
The source code is available at https://github.com/hyBlue/FSRE-Depth.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jung_Fine-Grained_Semantics-Aware_Representation_Enhancement_for_Self-Supervised_Monocular_Depth_Estimation_ICCV_2021_paper.pdf", @@ -15018,7 +16035,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jung_2021_ICCV,\n \n author = {\n Jung,\n Hyunyoung and Park,\n Eunhyeok and Yoo,\n Sungjoo\n},\n title = {\n Fine-Grained Semantics-Aware Representation Enhancement for Self-Supervised Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12642-12652\n} \n}" }, { "title": "FloW: A Dataset and Benchmark for Floating Waste Detection in Inland Waters", @@ -15026,10 +16044,11 @@ "status": "Poster", "track": "main", "pid": 7922, + "author_site": "Yuwei Cheng; Jiannan Zhu; Mengxin Jiang; Jie Fu; Changsong Pang; Peidong Wang; Kris Sankaran; Olawale Onabola; Yimin Liu; Dianbo Liu; Yoshua Bengio", "author": "Yuwei Cheng; Jiannan Zhu; Mengxin Jiang; Jie Fu; Changsong Pang; Peidong Wang; Kris Sankaran; Olawale Onabola; Yimin Liu; Dianbo Liu; Yoshua Bengio", "abstract": "Marine debris is severely threatening the marine lives and causing sustained pollution to the whole ecosystem. To prevent the wastes from getting into the ocean, it is helpful to clean up the floating wastes in inland waters using the autonomous cleaning devices like unmanned surface vehicles. The cleaning efficiency relies on a high-accurate and robust object detection system. However, the small size of the target, the strong light reflection over water surface, and the reflection of other objects on bank-side all bring challenges to the vision-based object detection system. 
To promote the practical application for autonomous floating wastes cleaning, we present FloW, the first dataset for floating waste detection in inland water areas. The dataset consists of an image sub-dataset FloW-Img and a multimodal sub-dataset FloW-RI which contains synchronized millimeter-wave radar data and images. Accurate annotations for images and radar data are provided, supporting floating waste detection strategies based on images, radar data, and the fusion of two sensors. We perform several baseline experiments on our dataset, including vision-based and radar-based detection methods. The results show that, the detection accuracy is relatively low and floating waste detection still remains a challenging task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_FloW_A_Dataset_and_Benchmark_for_Floating_Waste_Detection_in_ICCV_2021_paper.pdf", - "aff": "ORCA-Uboat+Tsinghua University; ORCA-Uboat+Northwestern Polytechnical University; Tsinghua University; Mila-Qu\u00e9bec AI Institute; ORCA-Uboat+Northwestern Polytechnical University; Tsinghua University; Mila-Qu\u00e9bec AI Institute; Mila-Qu\u00e9bec AI Institute; Tsinghua University; Mila-Qu\u00e9bec AI Institute; Mila-Qu\u00e9bec AI Institute", + "aff": "ORCA-Uboat+Tsinghua University; ORCA-Uboat+Northwestern Polytechnical University; Tsinghua University; Mila-Québec AI Institute; ORCA-Uboat+Northwestern Polytechnical University; Tsinghua University; Mila-Québec AI Institute; Mila-Québec AI Institute; Tsinghua University; Mila-Québec AI Institute; Mila-Québec AI Institute", "project": "", "github": "https://github.com/ORCA-Uboat/FloW-Dataset", "supp": "", @@ -15042,14 +16061,15 @@ "author_num": 11, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cheng_FloW_A_Dataset_and_Benchmark_for_Floating_Waste_Detection_in_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+2;1;3;0+2;1;3;3;1;3;3", - "aff_unique_norm": "ORCA-Uboat;Tsinghua University;Northwestern Polytechnical 
University;Mila-Qu\u00e9bec AI Institute", + "aff_unique_norm": "ORCA-Uboat;Tsinghua University;Northwestern Polytechnical University;Mila-Québec AI Institute", "aff_unique_dep": ";;;AI Institute", "aff_unique_url": ";https://www.tsinghua.edu.cn;https://www.nwpu.edu.cn;https://mila.quebec", "aff_unique_abbr": ";THU;NWPU;Mila", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1;2;1;1;2;2;1;2;2", - "aff_country_unique": ";China;Canada" + "aff_country_unique": ";China;Canada", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Yuwei and Zhu,\n Jiannan and Jiang,\n Mengxin and Fu,\n Jie and Pang,\n Changsong and Wang,\n Peidong and Sankaran,\n Kris and Onabola,\n Olawale and Liu,\n Yimin and Liu,\n Dianbo and Bengio,\n Yoshua\n},\n title = {\n FloW: A Dataset and Benchmark for Floating Waste Detection in Inland Waters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10953-10962\n} \n}" }, { "title": "FloorPlanCAD: A Large-Scale CAD Drawing Dataset for Panoptic Symbol Spotting", @@ -15057,6 +16077,7 @@ "status": "Poster", "track": "main", "pid": 5545, + "author_site": "Zhiwen Fan; Lingjie Zhu; Honghua Li; Xiaohao Chen; Siyu Zhu; Ping Tan", "author": "Zhiwen Fan; Lingjie Zhu; Honghua Li; Xiaohao Chen; Siyu Zhu; Ping Tan", "abstract": "Access to large and diverse computer-aided design (CAD) drawings is critical for developing symbol spotting algorithms. In this paper, we present FloorPlanCAD, a large-scale real-world CAD drawing dataset containing over 10,000 floor plans, ranging from residential to commercial buildings. CAD drawings in the dataset are all represented as vector graphics, which enable us to provide line-grained annotations of 30 object categories. 
Equipped by such annotations, we introduce the task of panoptic symbol spotting, which requires to spot not only instances of countable things, but also the semantic of uncountable stuff. Aiming to solve this task, we propose a novel method by combining Graph Convolutional Networks (GCNs) with Convolutional Neural Networks (CNNs), which captures both non-Euclidean and Euclidean features and can be trained end-to-end. The proposed CNN-GCN method achieved state-of-the-art (SOTA) performance on the task of semantic symbol spotting, and help us build a baseline network for the panoptic symbol spotting task. Our contributions are three-fold: 1) to the best of our knowledge, the presented CAD drawing dataset is the first of its kind; 2) the panoptic symbol spotting task considers the spotting of both thing instances and stuff semantic as one recognition problem; and 3) we presented a baseline solution to the panoptic symbol spotting task based on a novel CNN-GCN method, which achieved SOTA performance on semantic symbol spotting. We believe that these contributions will boost research in related areas. 
The dataset and code is publicly available at https://floorplancad.github.io/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_FloorPlanCAD_A_Large-Scale_CAD_Drawing_Dataset_for_Panoptic_Symbol_Spotting_ICCV_2021_paper.pdf", @@ -15080,7 +16101,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+1", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Zhiwen and Zhu,\n Lingjie and Li,\n Honghua and Chen,\n Xiaohao and Zhu,\n Siyu and Tan,\n Ping\n},\n title = {\n FloorPlanCAD: A Large-Scale CAD Drawing Dataset for Panoptic Symbol Spotting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10128-10137\n} \n}" }, { "title": "Flow-Guided Video Inpainting With Scene Templates", @@ -15088,6 +16110,7 @@ "status": "Poster", "track": "main", "pid": 8482, + "author_site": "Dong Lao; Peihao Zhu; Peter Wonka; Ganesh Sundaramoorthi", "author": "Dong Lao; Peihao Zhu; Peter Wonka; Ganesh Sundaramoorthi", "abstract": "We consider the problem of filling in missing spatio-temporal regions of a video. We provide a novel flow-based solution by introducing a generative model of images in relation to the scene (without missing regions) and mappings from the scene to images. We use the model to jointly infer the scene template, a 2D representation of the scene, and the mappings. This ensures consistency of the frame-to-frame flows generated to the underlying scene, reducing geometric distortions in flow-based inpainting. The template is mapped to the missing regions in the video by a new (L2-L1) interpolation scheme, creating crisp inpaintings, reducing common blur and distortion artifacts. 
We show on two benchmark datasets that our approach outperforms state-of-the-art quantitatively and in user studies.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lao_Flow-Guided_Video_Inpainting_With_Scene_Templates_ICCV_2021_paper.pdf", @@ -15111,7 +16134,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Lao_2021_ICCV,\n \n author = {\n Lao,\n Dong and Zhu,\n Peihao and Wonka,\n Peter and Sundaramoorthi,\n Ganesh\n},\n title = {\n Flow-Guided Video Inpainting With Scene Templates\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14599-14608\n} \n}" }, { "title": "Focal Frequency Loss for Image Reconstruction and Synthesis", @@ -15119,6 +16143,7 @@ "status": "Poster", "track": "main", "pid": 3038, + "author_site": "Liming Jiang; Bo Dai; Wayne Wu; Chen Change Loy", "author": "Liming Jiang; Bo Dai; Wayne Wu; Chen Change Loy", "abstract": "Image reconstruction and synthesis have witnessed remarkable progress thanks to the development of generative models. Nonetheless, gaps could still exist between the real and generated images, especially in the frequency domain. In this study, we show that narrowing gaps in the frequency domain can ameliorate image reconstruction and synthesis quality further. We propose a novel focal frequency loss, which allows a model to adaptively focus on frequency components that are hard to synthesize by down-weighting the easy ones. This objective function is complementary to existing spatial losses, offering great impedance against the loss of important frequency information due to the inherent bias of neural networks. 
We demonstrate the versatility and effectiveness of focal frequency loss to improve popular models, such as VAE, pix2pix, and SPADE, in both perceptual quality and quantitative performance. We further show its potential on StyleGAN2.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Focal_Frequency_Loss_for_Image_Reconstruction_and_Synthesis_ICCV_2021_paper.pdf", @@ -15142,7 +16167,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Liming and Dai,\n Bo and Wu,\n Wayne and Loy,\n Chen Change\n},\n title = {\n Focal Frequency Loss for Image Reconstruction and Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13919-13929\n} \n}" }, { "title": "Focus on the Positives: Self-Supervised Learning for Biodiversity Monitoring", @@ -15150,6 +16176,7 @@ "status": "Poster", "track": "main", "pid": 8743, + "author_site": "Omiros Pantazis; Gabriel J. Brostow; Kate E. Jones; Oisin Mac Aodha", "author": "Omiros Pantazis; Gabriel J. Brostow; Kate E. Jones; Oisin Mac Aodha", "abstract": "We address the problem of learning self-supervised representations from unlabeled image collections. Unlike existing approaches that attempt to learn useful features by maximizing similarity between augmented versions of each input image or by speculatively picking negative samples, we instead also make use of the natural variation that occurs in image collections that are captured using static monitoring cameras. To achieve this, we exploit readily available context data that encodes information such as the spatial and temporal relationships between the input images. 
We are able to learn representations that are surprisingly effective for downstream supervised classification, by first identifying high probability positive pairs at training time, i.e. those images that are likely to depict the same visual concept. For the critical task of global biodiversity monitoring, this results in image features that can be adapted to challenging visual species classification tasks with limited human supervision. We present results on four different camera trap image collections, across three different families of self-supervised learning methods, and show that careful image selection at training time results in superior performance compared to existing baselines such as conventional self-supervised training and transfer learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pantazis_Focus_on_the_Positives_Self-Supervised_Learning_for_Biodiversity_Monitoring_ICCV_2021_paper.pdf", @@ -15164,7 +16191,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Pantazis_Focus_on_the_Positives_Self-Supervised_Learning_for_Biodiversity_Monitoring_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Pantazis_Focus_on_the_Positives_Self-Supervised_Learning_for_Biodiversity_Monitoring_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Pantazis_2021_ICCV,\n \n author = {\n Pantazis,\n Omiros and Brostow,\n Gabriel J. and Jones,\n Kate E. 
and Mac Aodha,\n Oisin\n},\n title = {\n Focus on the Positives: Self-Supervised Learning for Biodiversity Monitoring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10583-10592\n} \n}" }, { "title": "Fog Simulation on Real LiDAR Point Clouds for 3D Object Detection in Adverse Weather", @@ -15172,10 +16200,11 @@ "status": "Poster", "track": "main", "pid": 1963, + "author_site": "Martin Hahner; Christos Sakaridis; Dengxin Dai; Luc Van Gool", "author": "Martin Hahner; Christos Sakaridis; Dengxin Dai; Luc Van Gool", "abstract": "This work addresses the challenging task of LiDAR-based 3D object detection in foggy weather. Collecting and annotating data in such a scenario is very time, labor and cost intensive. In this paper, we tackle this problem by simulating physically accurate fog into clear-weather scenes, so that the abundant existing real datasets captured in clear weather can be repurposed for our task. Our contributions are twofold: 1) We develop a physically valid fog simulation method that is applicable to any LiDAR dataset. This unleashes the acquisition of large-scale foggy training data at no extra cost. These partially synthetic data can be used to improve the robustness of several perception methods, such as 3D object detection and tracking or simultaneous localization and mapping, on real foggy data. 2) Through extensive experiments with several state-of-the-art detection approaches, we show that our fog simulation can be leveraged to significantly improve the performance for 3D object detection in the presence of fog. Thus, we are the first to provide strong 3D object detection baselines on the Seeing Through Fog dataset. 
Our code is available at www.trace.ethz.ch/lidar_fog_simulation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hahner_Fog_Simulation_on_Real_LiDAR_Point_Clouds_for_3D_Object_ICCV_2021_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich+MPI for Informatics; ETH Z\u00fcrich+KU Leuven", + "aff": "ETH Zürich; ETH Zürich; ETH Zürich+MPI for Informatics; ETH Zürich+KU Leuven", "project": "www.trace.ethz.ch/lidar_fogsimulation", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Hahner_Fog_Simulation_on_ICCV_2021_supplemental.pdf", @@ -15188,14 +16217,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hahner_Fog_Simulation_on_Real_LiDAR_Point_Clouds_for_3D_Object_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;0+2", - "aff_unique_norm": "ETH Zurich;Max Planck Institute for Informatics;Katholieke Universiteit Leuven", + "aff_unique_norm": "ETH Zürich;Max Planck Institute for Informatics;Katholieke Universiteit Leuven", "aff_unique_dep": ";Informatics;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-inf.mpg.de;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;MPII;KU Leuven", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0+2", - "aff_country_unique": "Switzerland;Germany;Belgium" + "aff_country_unique": "Switzerland;Germany;Belgium", + "bibtex": "@InProceedings{Hahner_2021_ICCV,\n \n author = {\n Hahner,\n Martin and Sakaridis,\n Christos and Dai,\n Dengxin and Van Gool,\n Luc\n},\n title = {\n Fog Simulation on Real LiDAR Point Clouds for 3D Object Detection in Adverse Weather\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15283-15292\n} \n}" }, { "title": "Fooling LiDAR Perception via Adversarial Trajectory Perturbation", @@ -15203,6 +16233,7 @@ "status": "Poster", "track": "main", "pid": 9568, 
+ "author_site": "Yiming Li; Congcong Wen; Felix Juefei-Xu; Chen Feng", "author": "Yiming Li; Congcong Wen; Felix Juefei-Xu; Chen Feng", "abstract": "LiDAR point clouds collected from a moving vehicle are functions of its trajectories, because the sensor motion needs to be compensated to avoid distortions. When autonomous vehicles are sending LiDAR point clouds to deep networks for perception and planning, could the motion compensation consequently become a wide-open backdoor in those networks, due to both the adversarial vulnerability of deep learning and GPS-based vehicle trajectory estimation that is susceptible to wireless spoofing? We demonstrate such possibilities for the first time: instead of directly attacking point cloud coordinates which requires tampering with the raw LiDAR readings, only adversarial spoofing of a self-driving car's trajectory with small perturbations is enough to make safety-critical objects undetectable or detected with incorrect positions. Moreover, polynomial trajectory perturbation is developed to achieve a temporally-smooth and highly-imperceptible attack. Extensive experiments on 3D object detection have shown that such attacks not only lower the performance of the state-of-the-art detectors effectively, but also transfer to other detectors, raising a red flag for the community. 
The code is available on https://ai4ce.github.io/FLAT/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Fooling_LiDAR_Perception_via_Adversarial_Trajectory_Perturbation_ICCV_2021_paper.pdf", @@ -15226,7 +16257,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yiming and Wen,\n Congcong and Juefei-Xu,\n Felix and Feng,\n Chen\n},\n title = {\n Fooling LiDAR Perception via Adversarial Trajectory Perturbation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7898-7907\n} \n}" }, { "title": "Foreground Activation Maps for Weakly Supervised Object Localization", @@ -15234,6 +16266,7 @@ "status": "Poster", "track": "main", "pid": 7871, + "author_site": "Meng Meng; Tianzhu Zhang; Qi Tian; Yongdong Zhang; Feng Wu", "author": "Meng Meng; Tianzhu Zhang; Qi Tian; Yongdong Zhang; Feng Wu", "abstract": "Weakly supervised object localization (WSOL) aims to localize objects with only image-level labels, which has better scalability and practicability than fully supervised methods in the actual deployment. However, with only image-level labels, learning object classification models tends to activate object parts and ignore the whole object, while expanding object parts into the whole object may deteriorate classification performance. To alleviate this problem, we propose foreground activation maps (FAM), whose aim is to optimize object localization and classification jointly via an object-aware attention module and a part-aware attention module in a unified model, where the two tasks can complement and enhance each other. 
To the best of our knowledge, this is the first work that can achieve remarkable performance for both tasks by optimizing them jointly via FAM for WSOL. Besides, the designed two modules can effectively highlight foreground objects for localization and discover discriminative parts for classification. Extensive experiments with four backbones on two standard benchmarks demonstrate that our FAM performs favorably against state-of-the-art WSOL methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meng_Foreground_Activation_Maps_for_Weakly_Supervised_Object_Localization_ICCV_2021_paper.pdf", @@ -15248,7 +16281,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meng_Foreground_Activation_Maps_for_Weakly_Supervised_Object_Localization_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meng_Foreground_Activation_Maps_for_Weakly_Supervised_Object_Localization_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Meng_2021_ICCV,\n \n author = {\n Meng,\n Meng and Zhang,\n Tianzhu and Tian,\n Qi and Zhang,\n Yongdong and Wu,\n Feng\n},\n title = {\n Foreground Activation Maps for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3385-3395\n} \n}" }, { "title": "Foreground-Action Consistency Network for Weakly Supervised Temporal Action Localization", @@ -15256,6 +16290,7 @@ "status": "Poster", "track": "main", "pid": 2908, + "author_site": "Linjiang Huang; Liang Wang; Hongsheng Li", "author": "Linjiang Huang; Liang Wang; Hongsheng Li", "abstract": "As a challenging task of high-level video understanding, weakly supervised temporal action localization has been attracting increasing attention. 
With only video annotations, most existing methods seek to handle this task with a localization-by-classification framework, which generally adopts a selector to select snippets of high probabilities of actions or namely the foreground. Nevertheless, the existing foreground selection strategies have a major limitation of only considering the unilateral relation from foreground to actions, which cannot guarantee the foreground-action consistency. In this paper, we present a framework named FAC-Net based on the I3D backbone, on which three branches are appended, named class-wise foreground classification branch, class-agnostic attention branch and multiple instance learning branch. First, our class-wise foreground classification branch regularizes the relation between actions and foreground to maximize the foreground-background separation. Besides, the class-agnostic attention branch and multiple instance learning branch are adopted to regularize the foreground-action consistency and help to learn a meaningful foreground classifier. Within each branch, we introduce a hybrid attention mechanism, which calculates multiple attention scores for each snippet, to focus on both discriminative and less-discriminative snippets to capture the full action boundaries. 
Experimental results on THUMOS14 and ActivityNet1.3 demonstrate the superior performance over state-of-the-art approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Foreground-Action_Consistency_Network_for_Weakly_Supervised_Temporal_Action_Localization_ICCV_2021_paper.pdf", @@ -15272,14 +16307,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Foreground-Action_Consistency_Network_for_Weakly_Supervised_Temporal_Action_Localization_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Centre for Perceptual and Interactive Intelligence;Chinese Academy of Sciences", + "aff_unique_norm": "The Chinese University of Hong Kong;Centre for Perceptual and Interactive Intelligence;Chinese Academy of Sciences", "aff_unique_dep": "Multimedia Laboratory;Centre for Perceptual and Interactive Intelligence;Institute of Automation", "aff_unique_url": "https://www.cuhk.edu.hk;;http://www.ia.cas.cn", "aff_unique_abbr": "CUHK;;CAS", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Linjiang and Wang,\n Liang and Li,\n Hongsheng\n},\n title = {\n Foreground-Action Consistency Network for Weakly Supervised Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8002-8011\n} \n}" }, { "title": "Fourier Space Losses for Efficient Perceptual Image Super-Resolution", @@ -15287,10 +16323,11 @@ "status": "Poster", "track": "main", "pid": 1615, + "author_site": "Dario Fuoli; Luc Van Gool; Radu Timofte", "author": "Dario Fuoli; Luc Van Gool; Radu Timofte", "abstract": "Many super-resolution (SR) models are optimized for high performance 
only and therefore lack efficiency due to large model complexity. As large models are often not practical in real-world applications, we investigate and propose novel loss functions, to enable SR with high perceptual quality from much more efficient models. The representative power for a given low-complexity generator network can only be fully leveraged by strong guidance towards the optimal set of parameters. We show that it is possible to improve the performance of a recently introduced efficient generator architecture solely with the application of our proposed loss functions. In particular, we use a Fourier space supervision loss for improved restoration of missing high-frequency (HF) content from the ground truth image and design a discriminator architecture working directly in the Fourier domain to better match the target HF distribution. We show that our losses' direct emphasis on the frequencies in Fourier-space significantly boosts the perceptual image quality, while at the same time retaining high restoration quality in comparison to previously proposed loss functions for this task. The performance is further improved by utilizing a combination of spatial and frequency domain losses, as both representations provide complementary information during training. 
On top of that, the trained generator achieves comparable results with and is 2.4x and 48x faster than state-of-the-art perceptual SR methods RankSRGAN and SRFlow respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fuoli_Fourier_Space_Losses_for_Efficient_Perceptual_Image_Super-Resolution_ICCV_2021_paper.pdf", - "aff": "Computer Vision Lab, ETH Z\u00fcrich, Switzerland+KU Leuven, Belgium; Computer Vision Lab, ETH Z\u00fcrich, Switzerland+KU Leuven, Belgium; Computer Vision Lab, ETH Z\u00fcrich, Switzerland", + "aff": "Computer Vision Lab, ETH Zürich, Switzerland+KU Leuven, Belgium; Computer Vision Lab, ETH Zürich, Switzerland+KU Leuven, Belgium; Computer Vision Lab, ETH Zürich, Switzerland", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Fuoli_Fourier_Space_Losses_ICCV_2021_supplemental.pdf", @@ -15303,14 +16340,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fuoli_Fourier_Space_Losses_for_Efficient_Perceptual_Image_Super-Resolution_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0", - "aff_unique_norm": "ETH Zurich;KU Leuven", + "aff_unique_norm": "ETH Zürich;KU Leuven", "aff_unique_dep": "Computer Vision Lab;", "aff_unique_url": "https://www.ethz.ch;https://www.kuleuven.be", "aff_unique_abbr": "ETHZ;KU Leuven", - "aff_campus_unique_index": ";", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0;0", + "aff_campus_unique": "Zürich;", "aff_country_unique_index": "0+1;0+1;0", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Fuoli_2021_ICCV,\n \n author = {\n Fuoli,\n Dario and Van Gool,\n Luc and Timofte,\n Radu\n},\n title = {\n Fourier Space Losses for Efficient Perceptual Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2360-2369\n} 
\n}" }, { "title": "Free-Form Description Guided 3D Visual Graph Network for Object Grounding in Point Cloud", @@ -15318,6 +16356,7 @@ "status": "Poster", "track": "main", "pid": 6225, + "author_site": "Mingtao Feng; Zhen Li; Qi Li; Liang Zhang; XiangDong Zhang; Guangming Zhu; Hui Zhang; Yaonan Wang; Ajmal Mian", "author": "Mingtao Feng; Zhen Li; Qi Li; Liang Zhang; XiangDong Zhang; Guangming Zhu; Hui Zhang; Yaonan Wang; Ajmal Mian", "abstract": "3D object grounding aims to locate the most relevant target object in a raw point cloud scene based on a free-form language description. Understanding complex and diverse descriptions, and lifting them directly to a point cloud is a new and challenging topic due to the irregular and sparse nature of point clouds. There are three main challenges in 3D object grounding: to find the main focus in the complex and diverse description; to understand the point cloud scene; and to locate the target object. In this paper, we address all three challenges. Firstly, we propose a language scene graph module to capture the rich structure and long-distance phrase correlations. Secondly, we introduce a multi-level 3D proposal relation graph module to extract the object-object and object-scene co-occurrence relationships, and strengthen the visual features of the initial proposals. Lastly, we develop a description guided 3D visual graph module to encode global contexts of phrases and proposals by a nodes matching strategy. Extensive experiments on challenging benchmark datasets (ScanRefer and Nr3D) show that our algorithm outperforms existing state-of-the-art. 
Our code is available at https://github.com/PNXD/FFL-3DOG.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_Free-Form_Description_Guided_3D_Visual_Graph_Network_for_Object_Grounding_ICCV_2021_paper.pdf", @@ -15341,7 +16380,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Mingtao and Li,\n Zhen and Li,\n Qi and Zhang,\n Liang and Zhang,\n XiangDong and Zhu,\n Guangming and Zhang,\n Hui and Wang,\n Yaonan and Mian,\n Ajmal\n},\n title = {\n Free-Form Description Guided 3D Visual Graph Network for Object Grounding in Point Cloud\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3722-3731\n} \n}" }, { "title": "Frequency Domain Image Translation: More Photo-Realistic, Better Identity-Preserving", @@ -15349,6 +16389,7 @@ "status": "Poster", "track": "main", "pid": 5504, + "author_site": "Mu Cai; Hong Zhang; Huijuan Huang; Qichuan Geng; Yixuan Li; Gao Huang", "author": "Mu Cai; Hong Zhang; Huijuan Huang; Qichuan Geng; Yixuan Li; Gao Huang", "abstract": "Image-to-image translation has been revolutionized with GAN-based methods. However, existing methods lack the ability to preserve the identity of the source domain. As a result, synthesized images can often over-adapt to the reference domain, losing important structural characteristics and suffering from suboptimal visual quality. To solve these challenges, we propose a novel frequency domain image translation (FDIT) framework, exploiting frequency information for enhancing the image generation process. Our key idea is to decompose the image into low-frequency and high-frequency components, where the high-frequency feature captures object structure akin to the identity. 
Our training objective facilitates the preservation of frequency information in both pixel space and Fourier spectral space. We broadly evaluate FDIT across five large-scale datasets and multiple tasks including image translation and GAN inversion. Extensive experiments and ablations show that FDIT effectively preserves the identity of the source image, and produces photo-realistic images. FDIT establishes state-of-the-art performance, reducing the average FID score by 5.6% compared to the previous best method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cai_Frequency_Domain_Image_Translation_More_Photo-Realistic_Better_Identity-Preserving_ICCV_2021_paper.pdf", @@ -15372,7 +16413,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;1;1;1;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Cai_2021_ICCV,\n \n author = {\n Cai,\n Mu and Zhang,\n Hong and Huang,\n Huijuan and Geng,\n Qichuan and Li,\n Yixuan and Huang,\n Gao\n},\n title = {\n Frequency Domain Image Translation: More Photo-Realistic,\n Better Identity-Preserving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13930-13940\n} \n}" }, { "title": "Frequency-Aware Spatiotemporal Transformers for Video Inpainting Detection", @@ -15380,6 +16422,7 @@ "status": "Poster", "track": "main", "pid": 6474, + "author_site": "Bingyao Yu; Wanhua Li; Xiu Li; Jiwen Lu; Jie Zhou", "author": "Bingyao Yu; Wanhua Li; Xiu Li; Jiwen Lu; Jie Zhou", "abstract": "In this paper, we propose a frequency-aware spatiotemporal transformers for deep In this paper, we propose a Frequency-Aware Spatiotemporal Transformer (FAST) for video inpainting detection, which aims to simultaneously mine the traces of video inpainting from spatial, temporal, and frequency domains. 
Unlike existing deep video inpainting detection methods that usually rely on hand-designed attention modules and memory mechanism, the proposed FAST have innate global self-attention mechanisms to capture the long-range relations. While existing video inpainting methods usually explore the spatial and temporal connections in a video, our method employs a spatiotemporal transformer framework to detect the spatial connections between patches and temporal dependency between frames. As the inpainted videos usually lack high frequency details, the proposed FAST simultaneously exploits the frequency domain information with a specifically designed decoder. Extensive experimental results demonstrate that our approach achieves very competitive performance and generalizes well.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Frequency-Aware_Spatiotemporal_Transformers_for_Video_Inpainting_Detection_ICCV_2021_paper.pdf", @@ -15403,7 +16446,8 @@ "aff_campus_unique_index": "1;;1;;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Bingyao and Li,\n Wanhua and Li,\n Xiu and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Frequency-Aware Spatiotemporal Transformers for Video Inpainting Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8188-8197\n} \n}" }, { "title": "From Contexts to Locality: Ultra-High Resolution Image Segmentation via Locality-Aware Contextual Correlation", @@ -15411,6 +16455,7 @@ "status": "Poster", "track": "main", "pid": 3419, + "author_site": "Qi Li; Weixiang Yang; Wenxi Liu; Yuanlong Yu; Shengfeng He", "author": "Qi Li; Weixiang Yang; Wenxi Liu; Yuanlong Yu; Shengfeng He", "abstract": "Ultra-high resolution image segmentation has raised increasing 
interests in recent years due to its realistic applications. In this paper, we innovate the widely used high-resolution image segmentation pipeline, in which an ultra-high resolution image is partitioned into regular patches for local segmentation and then the local results are merged into a high-resolution semantic mask. In particular, we introduce a novel locality-aware contextual correlation based segmentation model to process local patches, where the relevance between local patch and its various contexts are jointly and complementarily utilized to handle the semantic regions with large variations. Additionally, we present a contextual semantics refinement network that associates the local segmentation result with its contextual semantics, and thus is endowed with the ability of reducing boundary artifacts and refining mask contours during the generation of final high-resolution mask. Furthermore, in comprehensive experiments, we demonstrate that our model outperforms other state-of-the-art methods in public benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_From_Contexts_to_Locality_Ultra-High_Resolution_Image_Segmentation_via_Locality-Aware_ICCV_2021_paper.pdf", @@ -15434,7 +16479,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Qi and Yang,\n Weixiang and Liu,\n Wenxi and Yu,\n Yuanlong and He,\n Shengfeng\n},\n title = {\n From Contexts to Locality: Ultra-High Resolution Image Segmentation via Locality-Aware Contextual Correlation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7252-7261\n} \n}" }, { "title": "From Continuity to Editability: Inverting GANs With Consecutive Images", @@ -15442,6 +16488,7 @@ "status": "Poster", 
"track": "main", "pid": 1728, + "author_site": "Yangyang Xu; Yong Du; Wenpeng Xiao; Xuemiao Xu; Shengfeng He", "author": "Yangyang Xu; Yong Du; Wenpeng Xiao; Xuemiao Xu; Shengfeng He", "abstract": "Existing GAN inversion methods are stuck in a paradox that the inverted codes can either achieve high-fidelity reconstruction, or retain the editing capability. Having only one of them clearly cannot realize real image editing. In this paper, we resolve this paradox by introducing consecutive images (e.g., video frames or the same person with different poses) into the inversion process. The rationale behind our solution is that the continuity of consecutive images leads to inherent editable directions. This inborn property is used for two unique purposes: 1) regularizing the joint inversion process, such that each of the inverted codes is semantically accessible from one of the other and fastened in an editable domain; 2) enforcing inter-image coherence, such that the fidelity of each inverted code can be maximized with the complement of other images. Extensive experiments demonstrate that our alternative significantly outperforms state-of-the-art methods in terms of reconstruction fidelity and editability on both the real image dataset and synthesis dataset. Furthermore, our method provides the first support of video-based GAN inversion and an interesting application of unsupervised semantic transfer from consecutive images. 
Source code can be found at: https://github.com/cnnlstm/InvertingGANs_with_ConsecutiveImgs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_From_Continuity_to_Editability_Inverting_GANs_With_Consecutive_Images_ICCV_2021_paper.pdf", @@ -15465,7 +16512,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0+0+0;0+0+0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Yangyang and Du,\n Yong and Xiao,\n Wenpeng and Xu,\n Xuemiao and He,\n Shengfeng\n},\n title = {\n From Continuity to Editability: Inverting GANs With Consecutive Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13910-13918\n} \n}" }, { "title": "From Culture to Clothing: Discovering the World Events Behind a Century of Fashion Images", @@ -15473,6 +16521,7 @@ "status": "Poster", "track": "main", "pid": 7004, + "author_site": "Wei-Lin Hsiao; Kristen Grauman", "author": "Wei-Lin Hsiao; Kristen Grauman", "abstract": "Fashion is intertwined with external cultural factors, but identifying these links remains a manual process limited to only the most salient phenomena. We propose a data-driven approach to identify specific cultural factors affecting the clothes people wear. Using large-scale datasets of news articles and vintage photos spanning a century, we present a multi-modal statistical model to detect influence relationships between happenings in the world and people's choice of clothing. Furthermore, on two image datasets we apply our model to improve the concrete vision tasks of visual style forecasting and photo timestamping. 
Our work is a first step towards a computational, scalable, and easily refreshable approach to link culture to clothing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hsiao_From_Culture_to_Clothing_Discovering_the_World_Events_Behind_a_ICCV_2021_paper.pdf", @@ -15496,7 +16545,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hsiao_2021_ICCV,\n \n author = {\n Hsiao,\n Wei-Lin and Grauman,\n Kristen\n},\n title = {\n From Culture to Clothing: Discovering the World Events Behind a Century of Fashion Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1066-1075\n} \n}" }, { "title": "From General to Specific: Informative Scene Graph Generation via Balance Adjustment", @@ -15504,6 +16554,7 @@ "status": "Poster", "track": "main", "pid": 4142, + "author_site": "Yuyu Guo; Lianli Gao; Xuanhan Wang; Yuxuan Hu; Xing Xu; Xu Lu; Heng Tao Shen; Jingkuan Song", "author": "Yuyu Guo; Lianli Gao; Xuanhan Wang; Yuxuan Hu; Xing Xu; Xu Lu; Heng Tao Shen; Jingkuan Song", "abstract": "The scene graph generation (SGG) task aims to detect visual relationship triplets, i.e., subject, predicate, object, in an image, providing a structural vision layout for scene understanding. However, current models are stuck in common predicates, e.g., \"on\" and \"at\", rather than informative ones, e.g., \"standing on\" and \"looking at\", resulting in the loss of precise information and overall performance. If a model only uses \"stone on road\" rather than \"blocking\" to describe an image, it is easy to misunderstand the scene. 
We argue that this phenomenon is caused by two key imbalances between informative predicates and common ones, i.e., semantic space level imbalance and training sample level imbalance. To tackle this problem, we propose BA-SGG, a simple yet effective SGG framework based on balance adjustment but not the conventional distribution fitting. It integrates two components: Semantic Adjustment (SA) and Balanced Predicate Learning (BPL), respectively for adjusting these imbalances. Benefited from the model-agnostic process, our method is easily applied to the state-of-the-art SGG models and significantly improves the SGG performance. Our method achieves 14.3%, 8.0%, and 6.1% higher Mean Recall (mR) than that of the Transformer model at three scene graph generation sub-tasks on Visual Genome, respectively. Codes are publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_From_General_to_Specific_Informative_Scene_Graph_Generation_via_Balance_ICCV_2021_paper.pdf", @@ -15518,7 +16569,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guo_From_General_to_Specific_Informative_Scene_Graph_Generation_via_Balance_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guo_From_General_to_Specific_Informative_Scene_Graph_Generation_via_Balance_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Yuyu and Gao,\n Lianli and Wang,\n Xuanhan and Hu,\n Yuxuan and Xu,\n Xing and Lu,\n Xu and Shen,\n Heng Tao and Song,\n Jingkuan\n},\n title = {\n From General to Specific: Informative Scene Graph Generation via Balance Adjustment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16383-16392\n} \n}" }, { "title": "From Goals, Waypoints & Paths to Long Term Human Trajectory Forecasting", @@ -15526,6 
+16578,7 @@ "status": "Poster", "track": "main", "pid": 2871, + "author_site": "Karttikeya Mangalam; Yang An; Harshayu Girase; Jitendra Malik", "author": "Karttikeya Mangalam; Yang An; Harshayu Girase; Jitendra Malik", "abstract": "Human trajectory forecasting is an inherently multimodal problem. Uncertainty in future trajectories stems from two sources: (a) sources that are known to the agent but unknown to the model, such as long term goals and (b) sources that are unknown to both the agent & the model, such as intent of other agents & irreducible randomness in decisions. We propose to factorize this uncertainty into its epistemic & aleatoric sources. We model the epistemic uncertainty through multimodality in long term goals and the aleatoric uncertainty through multimodality in waypoints & paths. To exemplify this dichotomy, we also propose a novel long term trajectory forecasting setting, with prediction horizons upto a minute, upto an order of magnitude longer than prior works. Finally, we present Y-net, a scene compliant trajectory forecasting network that exploits the proposed epistemic & aleatoric structure for diverse trajectory predictions across long prediction horizons. Y-net significantly improves previous state-of-the-art performance on both (a) The short prediction horizon setting on the Stanford Drone (31.7% in FDE) & ETH/UCY datasets (7.4% in FDE) and (b) The proposed long horizon setting on the re-purposed Stanford Drone & Intersection Drone datasets. 
Code is available at: https://karttikeya.github.io/publication/ynet/", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mangalam_From_Goals_Waypoints__Paths_to_Long_Term_Human_Trajectory_ICCV_2021_paper.pdf", @@ -15540,7 +16593,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mangalam_From_Goals_Waypoints__Paths_to_Long_Term_Human_Trajectory_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mangalam_From_Goals_Waypoints__Paths_to_Long_Term_Human_Trajectory_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Mangalam_2021_ICCV,\n \n author = {\n Mangalam,\n Karttikeya and An,\n Yang and Girase,\n Harshayu and Malik,\n Jitendra\n},\n title = {\n From Goals,\n Waypoints \\& Paths to Long Term Human Trajectory Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15233-15242\n} \n}" }, { "title": "From Two to One: A New Scene Text Recognizer With Visual Language Modeling Network", @@ -15548,6 +16602,7 @@ "status": "Poster", "track": "main", "pid": 3101, + "author_site": "Yuxin Wang; Hongtao Xie; Shancheng Fang; Jing Wang; Shenggao Zhu; Yongdong Zhang", "author": "Yuxin Wang; Hongtao Xie; Shancheng Fang; Jing Wang; Shenggao Zhu; Yongdong Zhang", "abstract": "In this paper, we abandon the dominant complex language model and rethink the linguistic learning process in the scene text recognition. Different from previous methods considering the visual and linguistic information in two separate structures, we propose a Visual Language Modeling Network (VisionLAN), which views the visual and linguistic information as a union by directly enduing the vision model with language capability. Specially, we introduce the text recognition of character-wise occluded feature maps in the training stage. 
Such operation guides the vision model to use not only the visual texture of characters, but also the linguistic information in visual context for recognition when the visual cues are confused (e.g. occlusion, noise, etc.). As the linguistic information is acquired along with visual features without the need of extra language model, VisionLAN significantly improves the speed by 39% and adaptively considers the linguistic information to enhance the visual features for accurate recognition. Furthermore, an Occlusion Scene Text (OST) dataset is proposed to evaluate the performance on the case of missing character-wise visual cues. The state of-the-art results on several benchmarks prove our effectiveness. Code and dataset are available at https://github.com/wangyuxin87/VisionLAN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_From_Two_to_One_A_New_Scene_Text_Recognizer_With_ICCV_2021_paper.pdf", @@ -15571,7 +16626,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yuxin and Xie,\n Hongtao and Fang,\n Shancheng and Wang,\n Jing and Zhu,\n Shenggao and Zhang,\n Yongdong\n},\n title = {\n From Two to One: A New Scene Text Recognizer With Visual Language Modeling Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14194-14203\n} \n}" }, { "title": "Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval", @@ -15579,10 +16635,11 @@ "status": "Poster", "track": "main", "pid": 2444, - "author": "Max Bain; Arsha Nagrani; G\u00fcl Varol; Andrew Zisserman", + "author_site": "Max Bain; Arsha Nagrani; Gül Varol; Andrew Zisserman", + "author": "Max Bain; Arsha Nagrani; Gül Varol; Andrew Zisserman", "abstract": "Our objective in this 
work is video-text retrieval - in particular a joint embedding that enables efficient text-to-video retrieval. The challenges in this area include the design of the visual architecture and the nature of the training data, in that the available large scale video-text training datasets, such as HowTo100M, are noisy and hence competitive performance is achieved only at scale through large amounts of compute. We address both these challenges in this paper. We propose an end-to-end trainable model that is designed to take advantage of both large-scale image and video captioning datasets. Our model is an adaptation and extension of the recent ViT and Timesformer architectures, and consists of attention in both space and time. The model is flexible and can be trained on both image and video text datasets, either independently or in conjunction. It is trained with a curriculum learning schedule that begins by treating images as 'frozen' snapshots of video, and then gradually learns to attend to increasing temporal context when trained on video datasets. We also provide a new video-text pretraining dataset WebVid-2M, comprised of over two million videos with weak captions scraped from the internet. 
Despite training on datasets that are an order of magnitude smaller, we show that this approach yields state-of-the-art results on standard downstream video-retrieval benchmarks including MSR-VTT, DiDeMo and MSVD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bain_Frozen_in_Time_A_Joint_Video_and_Image_Encoder_for_ICCV_2021_paper.pdf", - "aff": "Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford + Google Research; Visual Geometry Group, University of Oxford + LIGM, \u00b4Ecole des Ponts, Univ Gustave Eiffel, CNRS; Visual Geometry Group, University of Oxford", + "aff": "Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford + Google Research; Visual Geometry Group, University of Oxford + LIGM, ´Ecole des Ponts, Univ Gustave Eiffel, CNRS; Visual Geometry Group, University of Oxford", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Bain_Frozen_in_Time_ICCV_2021_supplemental.pdf", @@ -15602,7 +16659,8 @@ "aff_campus_unique_index": "0;0+1;0;0", "aff_campus_unique": "Oxford;Mountain View;", "aff_country_unique_index": "0;0+1;0+2;0", - "aff_country_unique": "United Kingdom;United States;France" + "aff_country_unique": "United Kingdom;United States;France", + "bibtex": "@InProceedings{Bain_2021_ICCV,\n \n author = {\n Bain,\n Max and Nagrani,\n Arsha and Varol,\n G\\"ul and Zisserman,\n Andrew\n},\n title = {\n Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1728-1738\n} \n}" }, { "title": "Full-Body Motion From a Single Head-Mounted Device: Generating SMPL Poses From Partial Observations", @@ -15610,6 +16668,7 @@ "status": "Poster", "track": "main", "pid": 10836, + "author_site": "Andrea Dittadi; Sebastian Dziadzio; Darren Cosker; Ben Lundell; 
Thomas J. Cashman; Jamie Shotton", "author": "Andrea Dittadi; Sebastian Dziadzio; Darren Cosker; Ben Lundell; Thomas J. Cashman; Jamie Shotton", "abstract": "The increased availability and maturity of head-mounted and wearable devices opens up opportunities for remote communication and collaboration. However, the signal streams provided by these devices (e.g., head pose, hand pose, and gaze direction) do not represent a whole person. One of the main open problems is therefore how to leverage these signals to build faithful representations of the user. In this paper, we propose a method based on variational autoencoders to generate articulated poses of a human skeleton based on noisy streams of head and hand pose. Our approach relies on a model of pose likelihood that is novel and theoretically well-grounded. We demonstrate on publicly available datasets that our method is effective even from very impoverished signals and investigate how pose prediction can be made more accurate and realistic.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dittadi_Full-Body_Motion_From_a_Single_Head-Mounted_Device_Generating_SMPL_Poses_ICCV_2021_paper.pdf", @@ -15626,14 +16685,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dittadi_Full-Body_Motion_From_a_Single_Head-Mounted_Device_Generating_SMPL_Poses_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;1;1;1", - "aff_unique_norm": "Technical University of Denmark;Microsoft;University of Bath", - "aff_unique_dep": ";Microsoft Corporation;", + "aff_unique_norm": "Technical University of Denmark;Microsoft Corporation;University of Bath", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.tek.dk;https://www.microsoft.com;https://www.bath.ac.uk", "aff_unique_abbr": "DTU;Microsoft;Bath", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;1;1", - "aff_country_unique": "Denmark;United States;United Kingdom" + "aff_country_unique": 
"Denmark;United States;United Kingdom", + "bibtex": "@InProceedings{Dittadi_2021_ICCV,\n \n author = {\n Dittadi,\n Andrea and Dziadzio,\n Sebastian and Cosker,\n Darren and Lundell,\n Ben and Cashman,\n Thomas J. and Shotton,\n Jamie\n},\n title = {\n Full-Body Motion From a Single Head-Mounted Device: Generating SMPL Poses From Partial Observations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11687-11697\n} \n}" }, { "title": "Full-Duplex Strategy for Video Object Segmentation", @@ -15641,6 +16701,7 @@ "status": "Poster", "track": "main", "pid": 3043, + "author_site": "Ge-Peng Ji; Keren Fu; Zhe Wu; Deng-Ping Fan; Jianbing Shen; Ling Shao", "author": "Ge-Peng Ji; Keren Fu; Zhe Wu; Deng-Ping Fan; Jianbing Shen; Ling Shao", "abstract": "Appearance and motion are two important sources of information in video object segmentation (VOS). Previous methods mainly focus on using simplex solutions, lowering the upper bound of feature collaboration among and across these two cues. In this paper, we study a novel framework, termed the FSNet (Full-duplex Strategy Network), which designs a relational cross-attention module (RCAM) to achieve the bidirectional message propagation across embedding subspaces. Furthermore, the bidirectional purification module (BPM) is introduced to update the inconsistent features between the spatial-temporal embeddings, effectively improving the model robustness. By considering the mutual restraint within the full-duplex strategy, our FSNet performs the cross-modal feature-passing (i.e., transmission and receiving) simultaneously before the fusion and decoding stage, making it robust to various challenging scenarios (e.g., motion blur, occlusion) in VOS. 
Extensive experiments on five popular benchmarks (i.e., DAVIS16, FBMS, MCL, SegTrack-V2, and DAVSOD19) show that our FSNet outperforms other state-of-the-arts for both the VOS and video salient object detection tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ji_Full-Duplex_Strategy_for_Video_Object_Segmentation_ICCV_2021_paper.pdf", @@ -15655,7 +16716,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ji_Full-Duplex_Strategy_for_Video_Object_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ji_Full-Duplex_Strategy_for_Video_Object_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ji_2021_ICCV,\n \n author = {\n Ji,\n Ge-Peng and Fu,\n Keren and Wu,\n Zhe and Fan,\n Deng-Ping and Shen,\n Jianbing and Shao,\n Ling\n},\n title = {\n Full-Duplex Strategy for Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4922-4933\n} \n}" }, { "title": "Full-Velocity Radar Returns by Radar-Camera Fusion", @@ -15663,6 +16725,7 @@ "status": "Poster", "track": "main", "pid": 10568, + "author_site": "Yunfei Long; Daniel Morris; Xiaoming Liu; Marcos Castro; Punarjay Chakravarty; Praveen Narayanan", "author": "Yunfei Long; Daniel Morris; Xiaoming Liu; Marcos Castro; Punarjay Chakravarty; Praveen Narayanan", "abstract": "A distinctive feature of Doppler radar is the measurement of velocity in the radial direction for radar points. However, the missing tangential velocity component hampers object velocity estimation as well as temporal integration of radar sweeps in dynamic scenes. 
Recognizing that fusing camera with radar provides complementary information to radar, in this paper we present a closed-form solution for the point-wise, full-velocity estimate of Doppler returns using the corresponding optical flow from camera images. Additionally, we address the association problem between radar returns and camera images with a neural network that is trained to estimate radar-camera correspondences. Experimental results on the nuScenes dataset verify the validity of the method and show significant improvements over the state-of-the-art in velocity estimation and accumulation of radar points.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Long_Full-Velocity_Radar_Returns_by_Radar-Camera_Fusion_ICCV_2021_paper.pdf", @@ -15686,7 +16749,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Long_2021_ICCV,\n \n author = {\n Long,\n Yunfei and Morris,\n Daniel and Liu,\n Xiaoming and Castro,\n Marcos and Chakravarty,\n Punarjay and Narayanan,\n Praveen\n},\n title = {\n Full-Velocity Radar Returns by Radar-Camera Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16198-16207\n} \n}" }, { "title": "FuseFormer: Fusing Fine-Grained Information in Transformers for Video Inpainting", @@ -15694,6 +16758,7 @@ "status": "Poster", "track": "main", "pid": 10356, + "author_site": "Rui Liu; Hanming Deng; Yangyi Huang; Xiaoyu Shi; Lewei Lu; Wenxiu Sun; Xiaogang Wang; Jifeng Dai; Hongsheng Li", "author": "Rui Liu; Hanming Deng; Yangyi Huang; Xiaoyu Shi; Lewei Lu; Wenxiu Sun; Xiaogang Wang; Jifeng Dai; Hongsheng Li", "abstract": "Transformer, as a strong and flexible architecture for modelling long-range relations, has been widely explored in vision tasks. 
However, when used in video inpainting that requires fine-grained representation, existed method still suffers from yielding blurry edges in detail due to the hard patch splitting. Here we aim to tackle this problem by proposing FuseFormer, a Transformer model designed for video inpainting via fine-grained feature fusion based on novel Soft Split and Soft Composition operations. The soft split divides feature map into many patches with given overlapping interval. On the contrary, the soft composition operates by stitching different patches into a whole feature map where pixels in overlapping regions are summed up. These two modules are first used in tokenization before Transformer layers and de-tokenization after Transformer layers, for effective mapping between tokens and features. Therefore, sub-patch level information interaction is enabled for more effective feature propagation between neighboring patches, resulting in synthesizing vivid content for hole regions in videos. Moreover, in FuseFormer, we elaborately insert the soft composition and soft split into the feed-forward network, enabling the 1D linear layers to have the capability of modelling 2D structure. And, the sub-patch level feature fusion ability is further enhanced. In both quantitative and qualitative evaluations, our proposed FuseFormer surpasses state-of-the-art methods. 
We also conduct detailed analysis to examine its superiority.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_FuseFormer_Fusing_Fine-Grained_Information_in_Transformers_for_Video_Inpainting_ICCV_2021_paper.pdf", @@ -15710,14 +16775,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_FuseFormer_Fusing_Fine-Grained_Information_in_Transformers_for_Video_Inpainting_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;1;1;3;0;1;4", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime;Zhejiang University;Tetras AI;Xidian University", + "aff_unique_norm": "The Chinese University of Hong Kong;SenseTime;Zhejiang University;Tetras AI;Xidian University", "aff_unique_dep": "CUHK-SenseTime Joint Laboratory;SenseTime Research;;;School of CST", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com;https://www.zju.edu.cn;;http://www.xidian.edu.cn/", - "aff_unique_abbr": "CUHK;SenseTime;ZJU;Tetras AI;Xidian", + "aff_unique_abbr": "CUHK;SenseTime;ZJU;Tetras.AI;Xidian", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Rui and Deng,\n Hanming and Huang,\n Yangyi and Shi,\n Xiaoyu and Lu,\n Lewei and Sun,\n Wenxiu and Wang,\n Xiaogang and Dai,\n Jifeng and Li,\n Hongsheng\n},\n title = {\n FuseFormer: Fusing Fine-Grained Information in Transformers for Video Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14040-14049\n} \n}" }, { "title": "Fusion Moves for Graph Matching", @@ -15725,7 +16791,8 @@ "status": "Poster", "track": "main", "pid": 7057, - "author": "Lisa Hutschenreiter; Stefan Haller; Lorenz Feineis; Carsten Rother; Dagmar Kainm\u00fcller; Bogdan Savchynskyy", + 
"author_site": "Lisa Hutschenreiter; Stefan Haller; Lorenz Feineis; Carsten Rother; Dagmar Kainmüller; Bogdan Savchynskyy", + "author": "Lisa Hutschenreiter; Stefan Haller; Lorenz Feineis; Carsten Rother; Dagmar Kainmüller; Bogdan Savchynskyy", "abstract": "We contribute to approximate algorithms for the quadratic assignment problem also known as graph matching. Inspired by the success of the fusion moves technique developed for multilabel discrete Markov random fields, we investigate its applicability to graph matching. In particular, we show how fusion moves can be efficiently combined with the dedicated state-of-the-art dual methods that have recently shown superior results in computer vision and bio-imaging applications. As our empirical evaluation on a wide variety of graph matching datasets suggests, fusion moves significantly improve performance of these methods in terms of speed and quality of the obtained solutions. Our method sets a new state-of-the-art with a notable margin with respect to its competitors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hutschenreiter_Fusion_Moves_for_Graph_Matching_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -15739,7 +16806,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hutschenreiter_Fusion_Moves_for_Graph_Matching_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hutschenreiter_Fusion_Moves_for_Graph_Matching_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Hutschenreiter_2021_ICCV,\n \n author = {\n Hutschenreiter,\n Lisa and Haller,\n Stefan and Feineis,\n Lorenz and Rother,\n Carsten and Kainm\\"uller,\n Dagmar and Savchynskyy,\n Bogdan\n},\n title = {\n Fusion Moves for Graph Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6270-6279\n} \n}" }, { "title": "G-DetKD: 
Towards General Distillation Framework for Object Detectors via Contrastive and Semantic-Guided Feature Imitation", @@ -15747,10 +16815,11 @@ "status": "Poster", "track": "main", "pid": 9107, + "author_site": "Lewei Yao; Renjie Pi; Hang Xu; Wei Zhang; Zhenguo Li; Tong Zhang", "author": "Lewei Yao; Renjie Pi; Hang Xu; Wei Zhang; Zhenguo Li; Tong Zhang", "abstract": "In this paper, we investigate the knowledge distillation (KD) strategy for object detection and propose an effective framework applicable to both homogeneous and heterogeneous student-teacher pairs. The conventional feature imitation paradigm introduces imitation masks to focus on informative foreground areas while excluding the background noises. However, we find that those methods fail to fully utilize the semantic information in all feature pyramid levels, which leads to inefficiency for knowledge distillation between FPN-based detectors. To this end, we propose a novel semantic-guided feature imitation technique, which automatically performs soft matching between feature pairs across all pyramid levels to provide the optimal guidance to the student. To push the envelop even further, we introduce contrastive distillation to effectively capture the information encoded in the relationship between different feature regions. Finally, we propose a generalized detection KD pipeline, which is capable of distilling both homogeneous and heterogeneous detector pairs. Our method consistently outperforms the existing detection KD techniques, and works when (1) components in the framework are used separately and in conjunction; (2) for both homogeneous and heterogenous student-teacher pairs and (3) on multiple detection benchmarks. 
With a powerful X101-FasterRCNN-Instaboost detector as the teacher, R50-FasterRCNN reaches 44.0% AP, R50-RetinaNet reaches 43.3% AP and R50-FCOS reaches 43.1% AP on COCO dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yao_G-DetKD_Towards_General_Distillation_Framework_for_Object_Detectors_via_Contrastive_ICCV_2021_paper.pdf", - "aff": "Hong Kong University of Science and Technology; Hong Kong University of Science and Technology; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Hong Kong University of Science and Technology", + "aff": "Hong Kong University of Science and Technology; Hong Kong University of Science and Technology; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Hong Kong University of Science and Technology", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Yao_G-DetKD_Towards_General_ICCV_2021_supplemental.pdf", @@ -15764,13 +16833,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yao_G-DetKD_Towards_General_Distillation_Framework_for_Object_Detectors_via_Contrastive_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com", "aff_unique_abbr": "HKUST;Huawei", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2021_ICCV,\n \n author = {\n Yao,\n Lewei and Pi,\n Renjie and Xu,\n Hang and Zhang,\n Wei and Li,\n Zhenguo and Zhang,\n Tong\n},\n title = {\n G-DetKD: Towards General Distillation Framework for Object Detectors via Contrastive and Semantic-Guided Feature Imitation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3591-3600\n} \n}" }, { "title": "GAN Inversion for Out-of-Range Images With Geometric Transformations", @@ -15778,6 +16848,7 @@ "status": "Poster", "track": "main", "pid": 7681, + "author_site": "Kyoungkook Kang; Seongtae Kim; Sunghyun Cho", "author": "Kyoungkook Kang; Seongtae Kim; Sunghyun Cho", "abstract": "For successful semantic editing of real images, it is critical for a GAN inversion method to find an in-domain latent code that aligns with the domain of a pre-trained GAN model. Unfortunately, such in-domain latent codes can be found only for in-range images that align with the training images of a GAN model. In this paper, we propose BDInvert, a novel GAN inversion approach to semantic editing of out-of-range images that are geometrically unaligned with the training images of a GAN model. To find a latent code that is semantically editable, BDInvert inverts an input out-of-range image into an alternative latent space than the original latent space. We also propose a regularized inversion method to find a solution that supports semantic editing in the alternative space. 
Our experiments show that BDInvert effectively supports semantic editing of out-of-range images with geometric transformations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kang_GAN_Inversion_for_Out-of-Range_Images_With_Geometric_Transformations_ICCV_2021_paper.pdf", @@ -15793,15 +16864,16 @@ "email": "postech.ac.kr;postech.ac.kr;postech.ac.kr", "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kang_GAN_Inversion_for_Out-of-Range_Images_With_Geometric_Transformations_ICCV_2021_paper.html", - "aff_unique_index": "0;0;0+0", - "aff_unique_norm": "POSTECH", - "aff_unique_dep": "Department of Computer Science and Engineering", - "aff_unique_url": "https://www.postech.ac.kr", - "aff_unique_abbr": "POSTECH", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_unique_index": "0;1;0+1", + "aff_unique_norm": "Pohang University of Science and Technology;POSTECH", + "aff_unique_dep": "Department of Computer Science and Engineering;Graduate School of Artificial Intelligence", + "aff_unique_url": "https://www.postech.ac.kr;https://www.postech.ac.kr", + "aff_unique_abbr": "POSTECH;POSTECH", + "aff_campus_unique_index": "0;0", + "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kang_2021_ICCV,\n \n author = {\n Kang,\n Kyoungkook and Kim,\n Seongtae and Cho,\n Sunghyun\n},\n title = {\n GAN Inversion for Out-of-Range Images With Geometric Transformations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13941-13949\n} \n}" }, { "title": "GAN-Control: Explicitly Controllable GANs", @@ -15809,7 +16881,8 @@ "status": "Poster", "track": "main", "pid": 8809, - "author": "Alon Shoshan; Nadav Bhonker; Igor Kviatkovsky; G\u00e9rard Medioni", + "author_site": "Alon Shoshan; Nadav Bhonker; 
Igor Kviatkovsky; Gérard Medioni", + "author": "Alon Shoshan; Nadav Bhonker; Igor Kviatkovsky; Gérard Medioni", "abstract": "We present a framework for training GANs with explicit control over generated facial images. We are able to control the generated image by settings exact attributes such as age, pose, expression, etc. Most approaches for manipulating GAN-generated images achieve partial control by leveraging the latent space disentanglement properties, obtained implicitly after standard GAN training. Such methods are able to change the relative intensity of certain attributes, but not explicitly set their values. Recently proposed methods, designed for explicit control over human faces, harness morphable 3D face models (3DMM) to allow fine-grained control capabilities in GANs. Unlike these methods, our control is not constrained to 3DMM parameters and is extendable beyond the domain of human faces. Using contrastive learning, we obtain GANs with an explicitly disentangled latent space. This disentanglement is utilized to train control-encoders mapping human-interpretable inputs to suitable latent vectors, thus allowing explicit control. In the domain of human faces we demonstrate control over identity, age, pose, expression, hair color and illumination. We also demonstrate control capabilities of our framework in the domains of painted portraits and dog image generation. 
We demonstrate that our approach achieves state-of-the-art performance both qualitatively and quantitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shoshan_GAN-Control_Explicitly_Controllable_GANs_ICCV_2021_paper.pdf", "aff": "Amazon; Amazon; Amazon; Amazon", @@ -15825,14 +16898,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shoshan_GAN-Control_Explicitly_Controllable_GANs_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Amazon.com, Inc.", + "aff_unique_norm": "Amazon.com, Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shoshan_2021_ICCV,\n \n author = {\n Shoshan,\n Alon and Bhonker,\n Nadav and Kviatkovsky,\n Igor and Medioni,\n G\\'erard\n},\n title = {\n GAN-Control: Explicitly Controllable GANs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14083-14093\n} \n}" }, { "title": "GANcraft: Unsupervised 3D Neural Rendering of Minecraft Worlds", @@ -15840,6 +16914,7 @@ "status": "Poster", "track": "main", "pid": 8075, + "author_site": "Zekun Hao; Arun Mallya; Serge Belongie; Ming-Yu Liu", "author": "Zekun Hao; Arun Mallya; Serge Belongie; Ming-Yu Liu", "abstract": "We present GANcraft, an unsupervised neural rendering framework for generating photorealistic images of large 3D block worlds such as those created in Minecraft. Our method takes a semantic block world as input, where each block is assigned a semantic label such as dirt, grass, or water. 
We represent the world as a continuous volumetric function and train our model to render view-consistent photorealistic images for a user-controlled camera. In the absence of paired ground truth real images for the block world, we devise a training technique based on pseudo-ground truth and adversarial training. This stands in contrast to prior work on neural rendering for view synthesis, which requires ground truth images to estimate scene geometry and view-dependent appearance. In addition to camera trajectory, GANcraft allows user control over both scene semantics and output style. Experimental results with comparison to strong baselines show the effectiveness of GANcraft on this novel task of photorealistic 3D block world synthesis.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hao_GANcraft_Unsupervised_3D_Neural_Rendering_of_Minecraft_Worlds_ICCV_2021_paper.pdf", @@ -15856,14 +16931,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hao_GANcraft_Unsupervised_3D_Neural_Rendering_of_Minecraft_Worlds_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;1;0", - "aff_unique_norm": "NVIDIA;Cornell University", - "aff_unique_dep": "NVIDIA Corporation;", + "aff_unique_norm": "NVIDIA Corporation;Cornell University", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nvidia.com;https://www.cornell.edu", "aff_unique_abbr": "NVIDIA;Cornell", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hao_2021_ICCV,\n \n author = {\n Hao,\n Zekun and Mallya,\n Arun and Belongie,\n Serge and Liu,\n Ming-Yu\n},\n title = {\n GANcraft: Unsupervised 3D Neural Rendering of Minecraft Worlds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14072-14082\n} \n}" }, { 
"title": "GDP: Stabilized Neural Network Pruning via Gates With Differentiable Polarization", @@ -15871,6 +16947,7 @@ "status": "Poster", "track": "main", "pid": 2143, + "author_site": "Yi Guo; Huan Yuan; Jianchao Tan; Zhangyang Wang; Sen Yang; Ji Liu", "author": "Yi Guo; Huan Yuan; Jianchao Tan; Zhangyang Wang; Sen Yang; Ji Liu", "abstract": "Model compression techniques are recently gaining explosive attention for obtaining efficient AI models for various real time applications. Channel pruning is one important compression strategy, and widely used in slimming various DNNs. Previous gate-based or importance-based pruning methods aim to remove channels whose \"importance\" are smallest. However, it remains unclear what criteria the channel importance should be measured on, leading to various channel selection heuristics. Some other sampling-based pruning methods deploy sampling strategy to train sub-nets, which often causes the training instability and the compressed model's degraded performance. In view of the research gaps, we present a new module named Gates with Differentiable Polarization (GDP), inspired by principled optimization ideas. GDP can be plugged before convolutional layers without bells and whistles, to control the on-and-off of each channel or whole layer block. During the training process, the polarization effect will drive a subset of gates to smoothly decrease to exactly zero, while other gates gradually stay away from zero by a large margin. When training terminates, those zero-gated channels can be painlessly removed, while other non-zero gates can be absorbed into the succeeding convolution kernel, causing completely no interruption to training nor damage to the trained model. Experiments conducted over CIFAR-10 and ImageNet datasets show that the proposed GDP algorithm achieves the state-of-the-art performance on various benchmark DNNs at a broad range of pruning ratios. 
We also apply GDP to DeepLabV3Plus-ResNet50 on the challenging Pascal VOC segmentation task, whose test performance sees no drop (even slightly improved) with over 60% FLOPs saving.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_GDP_Stabilized_Neural_Network_Pruning_via_Gates_With_Differentiable_Polarization_ICCV_2021_paper.pdf", @@ -15894,7 +16971,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Yi and Yuan,\n Huan and Tan,\n Jianchao and Wang,\n Zhangyang and Yang,\n Sen and Liu,\n Ji\n},\n title = {\n GDP: Stabilized Neural Network Pruning via Gates With Differentiable Polarization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5239-5250\n} \n}" }, { "title": "GLiT: Neural Architecture Search for Global and Local Image Transformer", @@ -15902,6 +16980,7 @@ "status": "Poster", "track": "main", "pid": 2041, + "author_site": "Boyu Chen; Peixia Li; Chuming Li; Baopu Li; Lei Bai; Chen Lin; Ming Sun; Junjie Yan; Wanli Ouyang", "author": "Boyu Chen; Peixia Li; Chuming Li; Baopu Li; Lei Bai; Chen Lin; Ming Sun; Junjie Yan; Wanli Ouyang", "abstract": "We introduce the first Neural Architecture Search (NAS) method to find a better transformer architecture for image recognition. Recently, transformers without CNN-based backbones are found to achieve impressive performance for image recognition. However, the transformer is designed for NLP tasks and thus could be sub-optimal when directly used for image recognition. In order to improve the visual representation ability for transformers, we propose a new search space and searching algorithm. 
Specifically, we introduce a locality module that models the local correlations in images explicitly with fewer computational cost. With the locality module, our search space is defined to let the search algorithm freely trade off between global and local information as well as optimizing the low-level design choice in each module. To tackle the problem caused by huge search space, a hierarchical neural architecture search method is proposed to search the optimal vision transformer from two levels separately with the evolutionary algorithm. Extensive experiments on the ImageNet dataset demonstrate that our method can find more discriminative and efficient transformer variants than the ResNet family (e.g., ResNet101) and the baseline ViT for image classification.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_GLiT_Neural_Architecture_Search_for_Global_and_Local_Image_Transformer_ICCV_2021_paper.pdf", @@ -15919,13 +16998,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_GLiT_Neural_Architecture_Search_for_Global_and_Local_Image_Transformer_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;2;0;3;1;1;0", "aff_unique_norm": "University of Sydney;SenseTime Group Limited;Baidu;University of Oxford", - "aff_unique_dep": ";;Baidu;", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.sensetime.com;https://www.baidu.com;https://www.ox.ac.uk", "aff_unique_abbr": "USYD;SenseTime;Baidu;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;2;0;3;1;1;0", - "aff_country_unique": "Australia;China;United States;United Kingdom" + "aff_country_unique": "Australia;China;United States;United Kingdom", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Boyu and Li,\n Peixia and Li,\n Chuming and Li,\n Baopu and Bai,\n Lei and Lin,\n Chen and Sun,\n Ming and Yan,\n Junjie and Ouyang,\n Wanli\n},\n title = {\n GLiT: Neural Architecture Search for 
Global and Local Image Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12-21\n} \n}" }, { "title": "GLoRIA: A Multimodal Global-Local Representation Learning Framework for Label-Efficient Medical Image Recognition", @@ -15933,6 +17013,7 @@ "status": "Poster", "track": "main", "pid": 9891, + "author_site": "Shih-Cheng Huang; Liyue Shen; Matthew P. Lungren; Serena Yeung", "author": "Shih-Cheng Huang; Liyue Shen; Matthew P. Lungren; Serena Yeung", "abstract": "In recent years, the growing number of medical imaging studies is placing an ever-increasing burden on radiologists. Deep learning provides a promising solution for automatic medical image analysis and clinical decision support. However, large-scale manually labeled datasets required for training deep neural networks are difficult and expensive to obtain for medical images. The purpose of this work is to develop label-efficient multimodal medical imaging representations by leveraging radiology reports. Specifically, we propose an attention-based framework (GLoRIA) for learning global and local representations by contrasting image sub-regions and words in the paired report. In addition, we propose methods to leverage the learned representations for various downstream medical image recognition tasks with limited labels. 
Our results demonstrate high-performance and label-efficiency for image-text retrieval, classification (finetuning and zeros-shot settings), and segmentation on different datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_GLoRIA_A_Multimodal_Global-Local_Representation_Learning_Framework_for_Label-Efficient_Medical_ICCV_2021_paper.pdf", @@ -15956,7 +17037,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Shih-Cheng and Shen,\n Liyue and Lungren,\n Matthew P. and Yeung,\n Serena\n},\n title = {\n GLoRIA: A Multimodal Global-Local Representation Learning Framework for Label-Efficient Medical Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3942-3951\n} \n}" }, { "title": "GNeRF: GAN-Based Neural Radiance Field Without Posed Camera", @@ -15964,6 +17046,7 @@ "status": "Poster", "track": "main", "pid": 3803, + "author_site": "Quan Meng; Anpei Chen; Haimin Luo; Minye Wu; Hao Su; Lan Xu; Xuming He; Jingyi Yu", "author": "Quan Meng; Anpei Chen; Haimin Luo; Minye Wu; Hao Su; Lan Xu; Xuming He; Jingyi Yu", "abstract": "We introduce GNeRF, a framework to marry Generative Adversarial Networks (GAN) with Neural Radiance Field (NeRF) reconstruction for the complex scenarios with unknown and even randomly initialized camera poses. Recent NeRF-based advances have gained popularity for remarkable realistic novel view synthesis. However, most of them heavily rely on accurate camera poses estimation, while few recent methods can only optimize the unknown camera poses in roughly forward-facing scenes with relatively short camera trajectories and require rough camera poses initialization. 
Differently, our GNeRF only utilizes randomly initialized poses for complex outside-in scenarios. We propose a novel two-phases end-to-end framework. The first phase takes the use of GANs into the new realm for optimizing coarse camera poses and radiance fields jointly, while the second phase refines them with additional photometric loss. We overcome local minima using a hybrid and iterative optimization scheme. Extensive experiments on a variety of synthetic and natural scenes demonstrate the effectiveness of GNeRF. More impressively, our approach outperforms the baselines favorably in those scenes with repeated patterns or even low textures that are regarded as extremely challenging before.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meng_GNeRF_GAN-Based_Neural_Radiance_Field_Without_Posed_Camera_ICCV_2021_paper.pdf", @@ -15987,7 +17070,8 @@ "aff_campus_unique_index": "0;0;0;0;1;0;0;0", "aff_campus_unique": "Shanghai;San Diego", "aff_country_unique_index": "0;0;0;0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Meng_2021_ICCV,\n \n author = {\n Meng,\n Quan and Chen,\n Anpei and Luo,\n Haimin and Wu,\n Minye and Su,\n Hao and Xu,\n Lan and He,\n Xuming and Yu,\n Jingyi\n},\n title = {\n GNeRF: GAN-Based Neural Radiance Field Without Posed Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6351-6361\n} \n}" }, { "title": "GP-S3Net: Graph-Based Panoptic Sparse Semantic Segmentation Network", @@ -15995,10 +17079,11 @@ "status": "Poster", "track": "main", "pid": 9364, + "author_site": "Ryan Razani; Ran Cheng; Enxu Li; Ehsan Taghavi; Yuan Ren; Liu Bingbing", "author": "Ryan Razani; Ran Cheng; Enxu Li; Ehsan Taghavi; Yuan Ren; Liu Bingbing", "abstract": "Panoptic segmentation as an integrated task of both static environmental understanding and 
dynamic object identification, has recently begun to receive broad research interest. In this paper, we propose a new computationally efficient LiDAR based panoptic segmentation framework, called GP-S3Net. GP-S3Net is a proposal-free approach in which no object proposals are needed to identify the objects in contrast to conventional two-stage panoptic systems, where a detection network is incorporated for capturing instance information. Our new design consists of a novel instance-level network to process the semantic results by constructing a graph convolutional network to identify objects (foreground), which later on are fused with the background classes. Through the fine-grained clusters of the foreground objects from the semantic segmentation backbone, over-segmentation priors are generated and subsequently processed by 3D sparse convolution to embed each cluster. Each cluster is treated as a node in the graph and its corresponding embedding is used as its node feature. Then a GCNN predicts whether edges exist between each cluster pair. We utilize the instance label to generate ground truth edge labels for each constructed graph in order to supervise the learning. 
Extensive experiments demonstrate that GP-S3Net outperforms the current state-of-the-art approaches, by a significant margin across available datasets such as, nuScenes and SemanticPOSS, ranking first on the competitive public SemanticKITTI leaderboard upon publication.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Razani_GP-S3Net_Graph-Based_Panoptic_Sparse_Semantic_Segmentation_Network_ICCV_2021_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab, Toronto, Canada; Huawei Noah\u2019s Ark Lab, Toronto, Canada; Huawei Noah\u2019s Ark Lab, Toronto, Canada; Huawei Noah\u2019s Ark Lab, Toronto, Canada; Huawei Noah\u2019s Ark Lab, Toronto, Canada; Huawei Noah\u2019s Ark Lab, Toronto, Canada", + "aff": "Huawei Noah’s Ark Lab, Toronto, Canada; Huawei Noah’s Ark Lab, Toronto, Canada; Huawei Noah’s Ark Lab, Toronto, Canada; Huawei Noah’s Ark Lab, Toronto, Canada; Huawei Noah’s Ark Lab, Toronto, Canada; Huawei Noah’s Ark Lab, Toronto, Canada", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Razani_GP-S3Net_Graph-Based_Panoptic_ICCV_2021_supplemental.zip", @@ -16011,14 +17096,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Razani_GP-S3Net_Graph-Based_Panoptic_Sparse_Semantic_Segmentation_Network_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Huawei", - "aff_unique_dep": "Huawei Noah\u2019s Ark Lab", + "aff_unique_norm": "Huawei Noah’s Ark Lab", + "aff_unique_dep": "", "aff_unique_url": "https://www.huawei.com/en/ai/noahs-ark-lab", "aff_unique_abbr": "HNA Lab", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Razani_2021_ICCV,\n \n author = {\n Razani,\n Ryan and Cheng,\n Ran and Li,\n Enxu and Taghavi,\n Ehsan and Ren,\n Yuan and Bingbing,\n Liu\n},\n title = {\n GP-S3Net: Graph-Based 
Panoptic Sparse Semantic Segmentation Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16076-16085\n} \n}" }, { "title": "GRF: Learning a General Radiance Field for 3D Representation and Rendering", @@ -16026,6 +17112,7 @@ "status": "Poster", "track": "main", "pid": 6616, + "author_site": "Alex Trevithick; Bo Yang", "author": "Alex Trevithick; Bo Yang", "abstract": "We present a simple yet powerful neural network that implicitly represents and renders 3D objects and scenes only from 2D observations. The network models 3D geometries as a general radiance field, which takes a set of 2D images with camera poses and intrinsics as input, constructs an internal representation for each point of the 3D space, and then renders the corresponding appearance and geometry of that point viewed from an arbitrary position. The key to our approach is to learn local features for each pixel in 2D images and to then project these features to 3D points, thus yielding general and rich point representations. We additionally integrate an attention mechanism to aggregate pixel features from multiple 2D views, such that visual occlusions are implicitly taken into account. 
Extensive experiments demonstrate that our method can generate high-quality and realistic novel views for novel objects, unseen categories and challenging real-world scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Trevithick_GRF_Learning_a_General_Radiance_Field_for_3D_Representation_and_ICCV_2021_paper.pdf", @@ -16042,14 +17129,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Trevithick_GRF_Learning_a_General_Radiance_Field_for_3D_Representation_and_ICCV_2021_paper.html", "aff_unique_index": "0+1;2", - "aff_unique_norm": "University of California, San Diego;University of Oxford;Hong Kong Polytechnic University", + "aff_unique_norm": "University of California, San Diego;University of Oxford;The Hong Kong Polytechnic University", "aff_unique_dep": ";;vLAR Group", "aff_unique_url": "https://www.ucsd.edu;https://www.ox.ac.uk;https://www.polyu.edu.hk", "aff_unique_abbr": "UCSD;Oxford;PolyU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "San Diego;;Hong Kong SAR", "aff_country_unique_index": "0+1;2", - "aff_country_unique": "United States;United Kingdom;China" + "aff_country_unique": "United States;United Kingdom;China", + "bibtex": "@InProceedings{Trevithick_2021_ICCV,\n \n author = {\n Trevithick,\n Alex and Yang,\n Bo\n},\n title = {\n GRF: Learning a General Radiance Field for 3D Representation and Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15182-15192\n} \n}" }, { "title": "GTT-Net: Learned Generalized Trajectory Triangulation", @@ -16057,6 +17145,7 @@ "status": "Poster", "track": "main", "pid": 2462, + "author_site": "Xiangyu Xu; Enrique Dunn", "author": "Xiangyu Xu; Enrique Dunn", "abstract": "We present GTT-Net, a supervised learning framework for the reconstruction of sparse dynamic 3D geometry. 
We build on a graph-theoretic formulation of the generalized trajectory triangulation problem, where non-concurrent multi-view imaging geometry is known but global image sequencing is not provided. GTT-Net learns pairwise affinities modeling the spatio-temporal relationships among our input observations and leverages them to determine 3D geometry estimates. Experiments reconstructing 3D motion-capture sequences show GTT-Net outperforms the state of the art in terms of accuracy and robustness. Within the context of articulated motion reconstruction, our proposed architecture is 1) able to learn and enforce semantic 3D motion priors for shared training and test domains, while being 2) able to generalize its performance across different training and test domains. Moreover, GTT-Net provides a computationally streamlined framework for trajectory triangulation with applications to multi-instance reconstruction and event segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_GTT-Net_Learned_Generalized_Trajectory_Triangulation_ICCV_2021_paper.pdf", @@ -16080,7 +17169,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Xiangyu and Dunn,\n Enrique\n},\n title = {\n GTT-Net: Learned Generalized Trajectory Triangulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5795-5804\n} \n}" }, { "title": "Gait Recognition in the Wild: A Benchmark", @@ -16088,6 +17178,7 @@ "status": "Poster", "track": "main", "pid": 1700, + "author_site": "Zheng Zhu; Xianda Guo; Tian Yang; Junjie Huang; Jiankang Deng; Guan Huang; Dalong Du; Jiwen Lu; Jie Zhou", "author": "Zheng Zhu; Xianda Guo; Tian Yang; Junjie Huang; Jiankang Deng; Guan Huang; Dalong Du; Jiwen Lu; Jie 
Zhou", "abstract": "Gait benchmarks empower the research community to train and evaluate high-performance gait recognition systems. Even though growing efforts have been devoted to cross-view recognition, academia is restricted by current existing databases captured in the controlled environment. In this paper, we contribute a new benchmark for Gait REcognition in the Wild (GREW). The GREW dataset is constructed from natural videos, which contains hundreds of cameras and thousands of hours streams in open systems. With tremendous manual annotations, the GREW consists of 26K identities and 128K sequences with rich attributes for unconstrained gait recognition. Moreover, we add a distractor set of over 233K sequences, making it more suitable for real-world applications. Compared with prevailing predefined cross-view datasets, the GREW has diverse and practical view variations, as well as more natural challenging factors. To the best of our knowledge, this is the first large-scale dataset for gait recognition in the wild. Equipped with this benchmark, we dissect the unconstrained gait recognition problem. Representative appearance-based and model-based methods are explored, and comprehensive baselines are established. Experimental results show (1) The proposed GREW benchmark is necessary for training and evaluating gait recognizer in the wild. (2) For state-of-the-art gait recognition approaches, there is a lot of room for improvement. (3) The GREW benchmark can be used as effective pre-training for controlled gait recognition. 
Benchmark website is https://www.grew-benchmark.org/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Gait_Recognition_in_the_Wild_A_Benchmark_ICCV_2021_paper.pdf", @@ -16111,7 +17202,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;2;1;1;0;0", - "aff_country_unique": "China;United States;United Kingdom" + "aff_country_unique": "China;United States;United Kingdom", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Zheng and Guo,\n Xianda and Yang,\n Tian and Huang,\n Junjie and Deng,\n Jiankang and Huang,\n Guan and Du,\n Dalong and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Gait Recognition in the Wild: A Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14789-14799\n} \n}" }, { "title": "Gait Recognition via Effective Global-Local Feature Representation and Local Temporal Aggregation", @@ -16119,6 +17211,7 @@ "status": "Poster", "track": "main", "pid": 7774, + "author_site": "Beibei Lin; Shunli Zhang; Xin Yu", "author": "Beibei Lin; Shunli Zhang; Xin Yu", "abstract": "Gait recognition is one of the most important biometric technologies and has been applied in many fields. Recent gait recognition frameworks represent each gait frame by descriptors extracted from either global appearances or local regions of humans. However, the representations based on global information often neglect the details of the gait frame, while local region based descriptors cannot capture the relations among neighboring regions, thus reducing their discriminativeness. In this paper, we propose a novel feature extraction and fusion framework to achieve discriminative feature representations for gait recognition. Towards this goal, we take advantage of both global visual information and local region details and develop a Global and Local Feature Extractor (GLFE). 
Specifically, our GLFE module is composed of our newly designed multiple global and local convolutional layers (GLConv) to ensemble global and local features in a principle manner. Furthermore, we present a novel operation, namely Local Temporal Aggregation (LTA), to further preserve the spatial information by reducing the temporal resolution to obtain higher spatial resolution. With the help of our GLFE and LTA, our method significantly improves the discriminativeness of our visual features, thus improving the gait recognition performance. Extensive experiments demonstrate that our proposed method outperforms state-of-the-art gait recognition methods on two popular datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Gait_Recognition_via_Effective_Global-Local_Feature_Representation_and_Local_Temporal_ICCV_2021_paper.pdf", @@ -16135,14 +17228,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_Gait_Recognition_via_Effective_Global-Local_Feature_Representation_and_Local_Temporal_ICCV_2021_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Beijing Jiao Tong University;University of Technology Sydney", + "aff_unique_norm": "Beijing Jiaotong University;University of Technology Sydney", "aff_unique_dep": ";", "aff_unique_url": "http://www.njtu.edu.cn/en;https://www.uts.edu.au", "aff_unique_abbr": "BJTU;UTS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Beibei and Zhang,\n Shunli and Yu,\n Xin\n},\n title = {\n Gait Recognition via Effective Global-Local Feature Representation and Local Temporal Aggregation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14648-14656\n} \n}" }, { 
"title": "GarmentNets: Category-Level Pose Estimation for Garments via Canonical Space Shape Completion", @@ -16150,6 +17244,7 @@ "status": "Poster", "track": "main", "pid": 7051, + "author_site": "Cheng Chi; Shuran Song", "author": "Cheng Chi; Shuran Song", "abstract": "This paper tackles the task of category-level pose estimation for garments. With a near infinite degree of freedom, a garment's full configuration (i.e., poses) is often described by the per-vertex 3D locations of its entire 3D surface. However, garments are also commonly subject to extreme cases of self-occlusion, especially when folded or crumpled, making it challenging to perceive their full 3D surface. To address these challenges, we propose GarmentNets, where the key idea is to formulate the deformable object pose estimation problem as a shape completion task in the canonical space. This canonical space is defined across garments instances within a category, therefore, specifies the shared category-level pose. By mapping the observed partial surface to the canonical space and completing it in this space, the output representation describes the garment's full configuration using a complete 3D mesh with the per-vertex canonical coordinate label. To properly handle the thin 3D structure presented on garments, we proposed a novel 3D shape representation using the generalized winding number field. Experiments demonstrate that GarmentNets is able to generalize to unseen garment instances and achieve significantly better performance compared to alternative approaches. 
Code and data will be available online.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chi_GarmentNets_Category-Level_Pose_Estimation_for_Garments_via_Canonical_Space_Shape_ICCV_2021_paper.pdf", @@ -16173,7 +17268,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chi_2021_ICCV,\n \n author = {\n Chi,\n Cheng and Song,\n Shuran\n},\n title = {\n GarmentNets: Category-Level Pose Estimation for Garments via Canonical Space Shape Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3324-3333\n} \n}" }, { "title": "Gated3D: Monocular 3D Object Detection From Temporal Illumination Cues", @@ -16181,6 +17277,7 @@ "status": "Poster", "track": "main", "pid": 7806, + "author_site": "Frank Julca-Aguilar; Jason Taylor; Mario Bijelic; Fahim Mannan; Ethan Tseng; Felix Heide", "author": "Frank Julca-Aguilar; Jason Taylor; Mario Bijelic; Fahim Mannan; Ethan Tseng; Felix Heide", "abstract": "Today's state-of-the-art methods for 3D object detection are based on lidar, stereo, or monocular cameras. Lidar-based methods achieve the best accuracy, but have a large footprint, high cost, and mechanically-limited angular sampling rates, resulting in low spatial resolution at long ranges. Recent approaches using low-cost monocular or stereo cameras promise to overcome these limitations but struggle in low-light or low-contrast regions as they rely on passive CMOS sensors. We propose a novel 3D object detection modality that exploits temporal illumination cues from a low-cost monocular gated imager. We introduce a novel deep detection architecture, Gated3D, that is tailored to temporal illumination cues in gated images. 
This modality allows us to exploit mature 2D object feature extractors that guide the 3D predictions through a frustum segment estimation. We assess the proposed method experimentally on a 3D detection dataset that includes gated images captured over 10,000 km of driving data. We validate that our method outperforms state-of-the-art monocular and stereo methods, opening up a new sensor modality as an avenue to replace lidar in autonomous driving. https://light.princeton.edu/gated3d", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Julca-Aguilar_Gated3D_Monocular_3D_Object_Detection_From_Temporal_Illumination_Cues_ICCV_2021_paper.pdf", @@ -16195,7 +17292,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Julca-Aguilar_Gated3D_Monocular_3D_Object_Detection_From_Temporal_Illumination_Cues_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Julca-Aguilar_Gated3D_Monocular_3D_Object_Detection_From_Temporal_Illumination_Cues_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Julca-Aguilar_2021_ICCV,\n \n author = {\n Julca-Aguilar,\n Frank and Taylor,\n Jason and Bijelic,\n Mario and Mannan,\n Fahim and Tseng,\n Ethan and Heide,\n Felix\n},\n title = {\n Gated3D: Monocular 3D Object Detection From Temporal Illumination Cues\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2938-2948\n} \n}" }, { "title": "Gaussian Fusion: Accurate 3D Reconstruction via Geometry-Guided Displacement Interpolation", @@ -16203,6 +17301,7 @@ "status": "Poster", "track": "main", "pid": 6066, + "author_site": "Duo Chen; Zixin Tang; Zhenyu Xu; Yunan Zheng; Yiguang Liu", "author": "Duo Chen; Zixin Tang; Zhenyu Xu; Yunan Zheng; Yiguang Liu", "abstract": "Reconstructing delicate geometric details with consumer RGB-D sensors is challenging due to sensor depth and poses 
uncertainties. To tackle this problem, we propose a unique geometry-guided fusion framework: 1) First, we characterize fusion correspondences with the geodesic curves derived from the mass transport problem, also known as the Monge-Kantorovich problem. Compared with the depth map back-projection methods, the geodesic curves reveal the geometric structures of the local surface. 2) Moving the points along the geodesic curves is the core of our fusion approach, guided by local geometric properties, i.e., Gaussian curvature and mean curvature. Compared with the state-of-the-art methods, our novel geometry-guided displacement interpolation fully utilizes the meaningful geometric features of the local surface. It makes the reconstruction accuracy and completeness improved. Finally, a significant number of experimental results on real object data verify the superior performance of the proposed method. Our technique achieves the most delicate geometric details on thin objects for which the original depth map back-projection fusion scheme suffers from severe artifacts (See Fig.1).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Gaussian_Fusion_Accurate_3D_Reconstruction_via_Geometry-Guided_Displacement_Interpolation_ICCV_2021_paper.pdf", @@ -16222,11 +17321,12 @@ "aff_unique_norm": "Sichuan University;Chongqing University of Education", "aff_unique_dep": ";", "aff_unique_url": "https://www.scu.edu.cn;http://www.cque.edu.cn", - "aff_unique_abbr": "SCU;CQUE", + "aff_unique_abbr": "SCU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Duo and Tang,\n Zixin and Xu,\n Zhenyu and Zheng,\n Yunan and Liu,\n Yiguang\n},\n title = {\n Gaussian Fusion: Accurate 3D Reconstruction via Geometry-Guided Displacement Interpolation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5916-5925\n} \n}" }, { "title": "Generalizable Mixed-Precision Quantization via Attribution Rank Preservation", @@ -16234,6 +17334,7 @@ "status": "Poster", "track": "main", "pid": 2669, + "author_site": "Ziwei Wang; Han Xiao; Jiwen Lu; Jie Zhou", "author": "Ziwei Wang; Han Xiao; Jiwen Lu; Jie Zhou", "abstract": "In this paper, we propose a generalizable mixed-precision quantization (GMPQ) method for efficient inference. Conventional methods require the consistency of datasets for bitwidth search and model deployment to guarantee the policy optimality, leading to heavy search cost on challenging largescale datasets in realistic applications. On the contrary, our GMPQ searches the mixed-quantization policy that can be generalized to largescale datasets with only a small amount of data, so that the search cost is significantly reduced without performance degradation. Specifically, we observe that locating network attribution correctly is general ability for accurate visual analysis across different data distribution. Therefore, despite of pursuing higher model accuracy and complexity, we preserve attribution rank consistency between the quantized models and their full-precision counterparts via efficient capacity-aware attribution imitation for generalizable mixed-precision quantization strategy search. Extensive experiments show that our method obtains competitive accuracy-complexity trade-off compared with the state-of-the-art mixed-precision networks in significantly reduced search cost. 
The code is available at https://github.com/ZiweiWangTHU/GMPQ.git.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Generalizable_Mixed-Precision_Quantization_via_Attribution_Rank_Preservation_ICCV_2021_paper.pdf", @@ -16257,7 +17358,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Ziwei and Xiao,\n Han and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Generalizable Mixed-Precision Quantization via Attribution Rank Preservation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5291-5300\n} \n}" }, { "title": "Generalize Then Adapt: Source-Free Domain Adaptive Semantic Segmentation", @@ -16265,6 +17367,7 @@ "status": "Poster", "track": "main", "pid": 3399, + "author_site": "Jogendra Nath Kundu; Akshay Kulkarni; Amit Singh; Varun Jampani; R. Venkatesh Babu", "author": "Jogendra Nath Kundu; Akshay Kulkarni; Amit Singh; Varun Jampani; R. Venkatesh Babu", "abstract": "Unsupervised domain adaptation (DA) has gained substantial interest in semantic segmentation. However, almost all prior arts assume concurrent access to both labeled source and unlabeled target, making them unsuitable for scenarios demanding source-free adaptation. In this work, we enable source-free DA by partitioning the task into two: a) source-only domain generalization and b) source-free target adaptation. Towards the former, we provide theoretical insights to develop a multi-head framework trained with a virtually extended multi-source dataset, aiming to balance generalization and specificity. Towards the latter, we utilize the multi-head framework to extract reliable target pseudo-labels for self-training. 
Additionally, we introduce a novel conditional prior-enforcing auto-encoder that discourages spatial irregularities, thereby enhancing the pseudo-label quality. Experiments on the standard GTA5-to-Cityscapes and SYNTHIA-to-Cityscapes benchmarks show our superiority even against the non-source-free prior-arts. Further, we show our compatibility with online adaptation enabling deployment in a sequentially changing environment.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kundu_Generalize_Then_Adapt_Source-Free_Domain_Adaptive_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -16288,7 +17391,8 @@ "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Bangalore;Mountain View", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Kundu_2021_ICCV,\n \n author = {\n Kundu,\n Jogendra Nath and Kulkarni,\n Akshay and Singh,\n Amit and Jampani,\n Varun and Babu,\n R. Venkatesh\n},\n title = {\n Generalize Then Adapt: Source-Free Domain Adaptive Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7046-7056\n} \n}" }, { "title": "Generalized Shuffled Linear Regression", @@ -16296,6 +17400,7 @@ "status": "Poster", "track": "main", "pid": 3533, + "author_site": "Feiran Li; Kent Fujiwara; Fumio Okura; Yasuyuki Matsushita", "author": "Feiran Li; Kent Fujiwara; Fumio Okura; Yasuyuki Matsushita", "abstract": "We consider the shuffled linear regression problem where the correspondences between covariates and responses are unknown. While the existing formulation assumes an ideal underlying bijection in which all pieces of data should match, such an assumption barely holds in real-world applications due to either missing data or outliers. 
Therefore, in this work, we generalize the formulation of shuffled linear regression to a broader range of conditions where only part of the data should correspond. Moreover, we present a remarkably simple yet effective optimization algorithm with guaranteed global convergence. Distinct tasks validate the effectiveness of the proposed method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Generalized_Shuffled_Linear_Regression_ICCV_2021_paper.pdf", @@ -16319,7 +17424,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Feiran and Fujiwara,\n Kent and Okura,\n Fumio and Matsushita,\n Yasuyuki\n},\n title = {\n Generalized Shuffled Linear Regression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6474-6483\n} \n}" }, { "title": "Generalized Source-Free Domain Adaptation", @@ -16327,6 +17433,7 @@ "status": "Poster", "track": "main", "pid": 2445, + "author_site": "Shiqi Yang; Yaxing Wang; Joost van de Weijer; Luis Herranz; Shangling Jui", "author": "Shiqi Yang; Yaxing Wang; Joost van de Weijer; Luis Herranz; Shangling Jui", "abstract": "Domain adaptation (DA) aims to transfer the knowledge learned from source domain to an unlabeled target domain. Some recent works tackle source-free domain adaptation (SFDA) where only source pre-trained model is available for adaptation to target domain. However those methods does not consider keeping source performance which is of high practical value in real world application. 
In this paper, we propose a new domain adaptation paradigm denoted as Generalized Source-free Domain Adaptation (G-SFDA), where the learned model needs to perform well on both target and source domains, with only access to current unlabeled target data during adaptation. First, we propose local structure clustering (LSC), aiming to cluster the target features with its semantically similar neighbors, which successfully adapts the model to target domain in absence of source data. Second, we propose randomly generated domain attention (RGDA), it produces binary domain specific attention to activate different feature channels for different domains, meanwhile the domain attention will be utilized to regularize the gradient during adaptation to keep source information. In the experiments, for target performance our method is on par with or better than existing DA and SFDA methods, specifically achieves state-of-the-art performance (85.4%) on VisDA, and our method works well for all domains after adapting to single or multiple target domains.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Generalized_Source-Free_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -16350,7 +17457,8 @@ "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "Barcelona;;Shanghai", "aff_country_unique_index": "0;0+1;0;0;1", - "aff_country_unique": "Spain;China" + "aff_country_unique": "Spain;China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Shiqi and Wang,\n Yaxing and van de Weijer,\n Joost and Herranz,\n Luis and Jui,\n Shangling\n},\n title = {\n Generalized Source-Free Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8978-8987\n} \n}" }, { "title": "Generalized and Incremental Few-Shot Learning by Explicit Learning and Calibration Without Forgetting", @@ -16358,6 +17466,7 @@ "status": "Poster", "track": 
"main", "pid": 2529, + "author_site": "Anna Kukleva; Hilde Kuehne; Bernt Schiele", "author": "Anna Kukleva; Hilde Kuehne; Bernt Schiele", "abstract": "Both generalized and incremental few-shot learning have to deal with three major challenges: learning novel classes from only few samples per class, preventing catastrophic forgetting of base classes, and classifier calibration across novel and base classes. In this work we propose a three-stage framework that allows to explicitly and effectively address these challenges. While the first phase learns base classes with many samples, the second phase learns a calibrated classifier for novel classes from few samples while also preventing catastrophic forgetting. In the final phase, calibration is achieved across all classes. We evaluate the proposed framework on four challenging benchmark datasets for image and video few-shot classification and obtain state-of-the-art results for both generalized and incremental few shot learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kukleva_Generalized_and_Incremental_Few-Shot_Learning_by_Explicit_Learning_and_Calibration_ICCV_2021_paper.pdf", @@ -16372,7 +17481,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kukleva_Generalized_and_Incremental_Few-Shot_Learning_by_Explicit_Learning_and_Calibration_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kukleva_Generalized_and_Incremental_Few-Shot_Learning_by_Explicit_Learning_and_Calibration_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kukleva_2021_ICCV,\n \n author = {\n Kukleva,\n Anna and Kuehne,\n Hilde and Schiele,\n Bernt\n},\n title = {\n Generalized and Incremental Few-Shot Learning by Explicit Learning and Calibration Without Forgetting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = 
{\n 9020-9029\n} \n}" }, { "title": "Generalizing Gaze Estimation With Outlier-Guided Collaborative Adaptation", @@ -16380,6 +17490,7 @@ "status": "Poster", "track": "main", "pid": 3540, + "author_site": "Yunfei Liu; Ruicong Liu; Haofei Wang; Feng Lu", "author": "Yunfei Liu; Ruicong Liu; Haofei Wang; Feng Lu", "abstract": "Deep neural networks have significantly improved appearance-based gaze estimation accuracy. However, it still suffers from unsatisfactory performance when generalizing the trained model to new domains, e.g., unseen environments or persons. In this paper, we propose a plug-and-play gaze adaptation framework (PnP-GA), which is an ensemble of networks that learn collaboratively with the guidance of outliers. Since our proposed framework does not require ground-truth labels in the target domain, the existing gaze estimation networks can be directly plugged into PnP-GA and generalize the algorithms to new domains. We test PnP-GA on four gaze domain adaptation tasks, ETH-to-MPII, ETH-to-EyeDiap, Gaze360-to-MPII, and Gaze360-to-EyeDiap. The experimental results demonstrate that the PnP-GA framework achieves considerable performance improvements of 36.9%, 31.6%, 19.4%, and 11.8% over the baseline system. 
The proposed framework also outperforms the state-of-the-art domain adaptation approaches on gaze domain adaptation tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Generalizing_Gaze_Estimation_With_Outlier-Guided_Collaborative_Adaptation_ICCV_2021_paper.pdf", @@ -16396,14 +17507,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Generalizing_Gaze_Estimation_With_Outlier-Guided_Collaborative_Adaptation_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0+1", - "aff_unique_norm": "Beihang University;Pengcheng Laboratory", - "aff_unique_dep": "School of CSE;Peng Cheng Laboratory", + "aff_unique_norm": "Beihang University;Peng Cheng Laboratory", + "aff_unique_dep": "School of CSE;", "aff_unique_url": "http://www.buaa.edu.cn;", "aff_unique_abbr": "Beihang;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yunfei and Liu,\n Ruicong and Wang,\n Haofei and Lu,\n Feng\n},\n title = {\n Generalizing Gaze Estimation With Outlier-Guided Collaborative Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3835-3844\n} \n}" }, { "title": "Generating Attribution Maps With Disentangled Masked Backpropagation", @@ -16411,6 +17523,7 @@ "status": "Poster", "track": "main", "pid": 8378, + "author_site": "Adria Ruiz; Antonio Agudo; Francesc Moreno-Noguer", "author": "Adria Ruiz; Antonio Agudo; Francesc Moreno-Noguer", "abstract": "Attribution map visualization has arisen as one of the most effective techniques to understand the underlying inference process of Convolutional Neural Networks. In this task, the goal is to compute an score for each image pixel related to its contribution to the network output. 
In this paper, we introduce Disentangled Masked Backpropagation (DMBP), a novel gradient-based method that leverages on the piecewise linear nature of ReLU networks to decompose the model function into different linear mappings. This decomposition aims to disentangle the attribution maps into positive, negative and nuisance factors by learning a set of variables masking the contribution of each filter during back-propagation. A thorough evaluation over standard architectures (ResNet50 and VGG16) and benchmark datasets (PASCAL VOC and ImageNet) demonstrates that DMBP generates more visually interpretable attribution maps than previous approaches. Additionally, we quantitatively show that the maps produced by our method are more consistent with the true contribution of each pixel to the final network output.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ruiz_Generating_Attribution_Maps_With_Disentangled_Masked_Backpropagation_ICCV_2021_paper.pdf", @@ -16434,7 +17547,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Barcelona", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Ruiz_2021_ICCV,\n \n author = {\n Ruiz,\n Adria and Agudo,\n Antonio and Moreno-Noguer,\n Francesc\n},\n title = {\n Generating Attribution Maps With Disentangled Masked Backpropagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 905-914\n} \n}" }, { "title": "Generating Masks From Boxes by Mining Spatio-Temporal Consistencies in Videos", @@ -16442,6 +17556,7 @@ "status": "Poster", "track": "main", "pid": 7891, + "author_site": "Bin Zhao; Goutam Bhat; Martin Danelljan; Luc Van Gool; Radu Timofte", "author": "Bin Zhao; Goutam Bhat; Martin Danelljan; Luc Van Gool; Radu Timofte", "abstract": "Segmenting objects in videos is a fundamental computer vision task. 
The current deep learning based paradigm offers a powerful, but data-hungry solution. However, current datasets are limited by the cost and human effort of annotating object masks in videos. This effectively limits the performance and generalization capabilities of existing video segmentation methods. To address this issue, we explore weaker form of bounding box annotations. We introduce a method for generating segmentation masks from per-frame bounding box annotations in videos. To this end, we propose a spatio-temporal aggregation module that effectively mines consistencies in the object and background appearance across multiple frames. We use our predicted accurate masks to train video object segmentation (VOS) networks for the tracking domain, where only manual bounding box annotations are available. The additional data provides substantially better generalization performance, leading to state-of-the-art results on standard tracking benchmarks. The code and models are available at https://github.com/visionml/pytracking.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Generating_Masks_From_Boxes_by_Mining_Spatio-Temporal_Consistencies_in_Videos_ICCV_2021_paper.pdf", @@ -16465,7 +17580,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Bin and Bhat,\n Goutam and Danelljan,\n Martin and Van Gool,\n Luc and Timofte,\n Radu\n},\n title = {\n Generating Masks From Boxes by Mining Spatio-Temporal Consistencies in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13556-13566\n} \n}" }, { "title": "Generating Smooth Pose Sequences for Diverse Human Motion Prediction", @@ -16473,6 +17589,7 @@ "status": "Poster", 
"track": "main", "pid": 8645, + "author_site": "Wei Mao; Miaomiao Liu; Mathieu Salzmann", "author": "Wei Mao; Miaomiao Liu; Mathieu Salzmann", "abstract": "Recent progress in stochastic motion prediction, i.e., predicting multiple possible future human motions given a single past pose sequence, has led to producing truly diverse future motions and even providing control over the motion of some body parts. However, to achieve this, the state-of-the-art method requires learning several mappings for diversity and a dedicated model for controllable motion prediction. In this paper, we introduce a unified deep generative network for both diverse and controllable motion prediction. To this end, we leverage the intuition that realistic human motions consist of smooth sequences of valid poses, and that, given limited data, learning a pose prior is much more tractable than a motion one. We therefore design a generator that predicts the motion of different body parts sequentially, and introduce a normalizing flow based pose prior, together with a joint angle loss, to achieve motion realism.Our experiments on two standard benchmark datasets, Human3.6M and HumanEva-I, demonstrate that our approach outperforms the state-of-the-art baselines in terms of both sample diversity and accuracy. 
The code is available at https://github.com/wei-mao-2019/gsps", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mao_Generating_Smooth_Pose_Sequences_for_Diverse_Human_Motion_Prediction_ICCV_2021_paper.pdf", @@ -16489,14 +17606,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mao_Generating_Smooth_Pose_Sequences_for_Diverse_Human_Motion_Prediction_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;1+2", - "aff_unique_norm": "Australian National University;EPFL;ClearSpace", + "aff_unique_norm": "Australian National University;École Polytechnique Fédérale de Lausanne;ClearSpace", "aff_unique_dep": ";CVLab;", "aff_unique_url": "https://www.anu.edu.au;https://cvlab.epfl.ch;", "aff_unique_abbr": "ANU;EPFL;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;1+1;1+1", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + "bibtex": "@InProceedings{Mao_2021_ICCV,\n \n author = {\n Mao,\n Wei and Liu,\n Miaomiao and Salzmann,\n Mathieu\n},\n title = {\n Generating Smooth Pose Sequences for Diverse Human Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13309-13318\n} \n}" }, { "title": "Generative Adversarial Registration for Improved Conditional Deformable Templates", @@ -16504,6 +17622,7 @@ "status": "Poster", "track": "main", "pid": 2489, + "author_site": "Neel Dey; Mengwei Ren; Adrian V. Dalca; Guido Gerig", "author": "Neel Dey; Mengwei Ren; Adrian V. Dalca; Guido Gerig", "abstract": "Deformable templates are essential to large-scale medical image registration, segmentation, and population analysis. 
Current conventional and deep network-based methods for template construction use only regularized registration objectives and often yield templates with blurry and/or anatomically implausible appearance, confounding downstream biomedical interpretation. We reformulate deformable registration and conditional template estimation as an adversarial game wherein we encourage realism in the moved templates with a generative adversarial registration framework conditioned on flexible image covariates. The resulting templates exhibit significant gain in specificity to attributes such as age and disease, better fit underlying group-wise spatiotemporal trends, and achieve improved sharpness and centrality. These improvements enable more accurate population modeling with diverse covariates for standardized downstream analyses and easier anatomical delineation for structures of interest.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dey_Generative_Adversarial_Registration_for_Improved_Conditional_Deformable_Templates_ICCV_2021_paper.pdf", @@ -16527,7 +17646,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dey_2021_ICCV,\n \n author = {\n Dey,\n Neel and Ren,\n Mengwei and Dalca,\n Adrian V. and Gerig,\n Guido\n},\n title = {\n Generative Adversarial Registration for Improved Conditional Deformable Templates\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3929-3941\n} \n}" }, { "title": "Generative Compositional Augmentations for Scene Graph Prediction", @@ -16535,7 +17655,8 @@ "status": "Poster", "track": "main", "pid": 7959, - "author": "Boris Knyazev; Harm de Vries; C\u0103t\u0103lina Cangea; Graham W. 
Taylor; Aaron Courville; Eugene Belilovsky", + "author_site": "Boris Knyazev; Harm de Vries; Cătălina Cangea; Graham W. Taylor; Aaron Courville; Eugene Belilovsky", + "author": "Boris Knyazev; Harm de Vries; Cătălina Cangea; Graham W. Taylor; Aaron Courville; Eugene Belilovsky", "abstract": "Inferring objects and their relationships from an image in the form of a scene graph is useful in many applications at the intersection of vision and language. We consider a challenging problem of compositional generalization that emerges in this task due to a long tail data distribution. Current scene graph generation models are trained on a tiny fraction of the distribution corresponding to the most frequent compositions, e.g. . However, test images might contain zero- and few-shot compositions of objects and relationships, e.g. . Despite each of the object categories and the predicate (e.g. 'on') being frequent in the training data, the models often fail to properly understand such unseen or rare compositions. To improve generalization, it is natural to attempt increasing the diversity of the training distribution. However, in the graph domain this is non-trivial. To that end, we propose a method to synthesize rare yet plausible scene graphs by perturbing real ones. We then propose and empirically study a model based on conditional generative adversarial networks (GANs) that allows us to generate visual features of perturbed scene graphs and learn from them in a joint fashion. When evaluated on the Visual Genome dataset, our approach yields marginal, but consistent improvements in zero- and few-shot metrics. 
We analyze the limitations of our approach indicating promising directions for future research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Knyazev_Generative_Compositional_Augmentations_for_Scene_Graph_Prediction_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -16549,7 +17670,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Knyazev_Generative_Compositional_Augmentations_for_Scene_Graph_Prediction_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Knyazev_Generative_Compositional_Augmentations_for_Scene_Graph_Prediction_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Knyazev_2021_ICCV,\n \n author = {\n Knyazev,\n Boris and de Vries,\n Harm and Cangea,\n C\\u{a}t\\u{a}lina and Taylor,\n Graham W. and Courville,\n Aaron and Belilovsky,\n Eugene\n},\n title = {\n Generative Compositional Augmentations for Scene Graph Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15827-15837\n} \n}" }, { "title": "Generative Layout Modeling Using Constraint Graphs", @@ -16557,6 +17679,7 @@ "status": "Poster", "track": "main", "pid": 3376, + "author_site": "Wamiq Para; Paul Guerrero; Tom Kelly; Leonidas J. Guibas; Peter Wonka", "author": "Wamiq Para; Paul Guerrero; Tom Kelly; Leonidas J. Guibas; Peter Wonka", "abstract": "We propose a new generative model for layout generation. We generate layouts in three steps. First, we generate the layout elements as nodes in a layout graph. Second, we compute constraints between layout elements as edges in the layout graph. Third, we solve for the final layout using constrained optimization. For the first two steps, we build on recent transformer architectures. The layout optimization implements the constraints efficiently. 
We show three practical contributions compared to the state of the art: our work requires no user input, produces higher quality layouts, and enables many novel capabilities for conditional layout generation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Para_Generative_Layout_Modeling_Using_Constraint_Graphs_ICCV_2021_paper.pdf", @@ -16580,7 +17703,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;2;1;0", - "aff_country_unique": "Saudi Arabia;United States;United Kingdom" + "aff_country_unique": "Saudi Arabia;United States;United Kingdom", + "bibtex": "@InProceedings{Para_2021_ICCV,\n \n author = {\n Para,\n Wamiq and Guerrero,\n Paul and Kelly,\n Tom and Guibas,\n Leonidas J. and Wonka,\n Peter\n},\n title = {\n Generative Layout Modeling Using Constraint Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6690-6700\n} \n}" }, { "title": "Generic Attention-Model Explainability for Interpreting Bi-Modal and Encoder-Decoder Transformers", @@ -16588,6 +17712,7 @@ "status": "Poster", "track": "main", "pid": 3316, + "author_site": "Hila Chefer; Shir Gur; Lior Wolf", "author": "Hila Chefer; Shir Gur; Lior Wolf", "abstract": "Transformers are increasingly dominating multi-modal reasoning tasks, such as visual question answering, achieving state-of-the-art results thanks to their ability to contextualize information using the self-attention and co-attention mechanisms. These attention modules also play a role in other computer vision tasks including object detection and image segmentation. Unlike Transformers that only use self-attention, Transformers with co-attention require to consider multiple attention maps in parallel in order to highlight the information that is relevant to the prediction in the model's input. 
In this work, we propose the first method to explain prediction by any Transformer-based architecture, including bi-modal Transformers and Transformers with co-attentions. We provide generic solutions and apply these to the three most commonly used of these architectures: (i) pure self-attention, (ii) self-attention combined with co-attention, and (iii) encoder-decoder attention. We show that our method is superior to all existing methods which are adapted from single modality explainability.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chefer_Generic_Attention-Model_Explainability_for_Interpreting_Bi-Modal_and_Encoder-Decoder_Transformers_ICCV_2021_paper.pdf", @@ -16604,14 +17729,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chefer_Generic_Attention-Model_Explainability_for_Interpreting_Bi-Modal_and_Encoder-Decoder_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "Tel Aviv University;Meta", + "aff_unique_norm": "Tel Aviv University;Facebook", "aff_unique_dep": "School of Computer Science;Facebook AI Research", "aff_unique_url": "https://www.tau.ac.il;https://research.facebook.com", "aff_unique_abbr": "TAU;FAIR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Tel Aviv;", "aff_country_unique_index": "0;0;0+1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Chefer_2021_ICCV,\n \n author = {\n Chefer,\n Hila and Gur,\n Shir and Wolf,\n Lior\n},\n title = {\n Generic Attention-Model Explainability for Interpreting Bi-Modal and Encoder-Decoder Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 397-406\n} \n}" }, { "title": "Generic Event Boundary Detection: A Benchmark for Event Segmentation", @@ -16619,6 +17745,7 @@ "status": "Poster", "track": "main", "pid": 3112, 
+ "author_site": "Mike Zheng Shou; Stan Weixian Lei; Weiyao Wang; Deepti Ghadiyaram; Matt Feiszli", "author": "Mike Zheng Shou; Stan Weixian Lei; Weiyao Wang; Deepti Ghadiyaram; Matt Feiszli", "abstract": "This paper presents a novel task together with a new benchmark for detecting generic, taxonomy-free event boundaries that segment a whole video into chunks. Conventional work in temporal video segmentation and action detection focuses on localizing pre-defined action categories and thus does not scale to generic videos. Cognitive Science has known since last century that humans consistently segment videos into meaningful temporal chunks. This segmentation happens naturally, without pre-defined event categories and without being explicitly asked to do so. Here, we repeat these cognitive experiments on mainstream CV datasets; with our novel annotation guideline which addresses the complexities of taxonomy-free event boundary annotation, we introduce the task of Generic Event Boundary Detection (GEBD) and the new benchmark Kinetics-GEBD. We view GEBD as an important stepping stone towards understanding the video as a whole, and believe it has been previously neglected due to a lack of proper task definition and annotations. Through experiment and human study we demonstrate the value of the annotations. Further, we benchmark supervised and un-supervised GEBD approaches on the TAPOS dataset and our Kinetics-GEBD. 
We release our annotations and baseline codes at CVPR'21 LOVEU Challenge: https://sites.google.com/view/loveucvpr21.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shou_Generic_Event_Boundary_Detection_A_Benchmark_for_Event_Segmentation_ICCV_2021_paper.pdf", @@ -16635,14 +17762,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shou_Generic_Event_Boundary_Detection_A_Benchmark_for_Event_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;0;0;0", - "aff_unique_norm": "Meta;National University of Singapore", + "aff_unique_norm": "Facebook;National University of Singapore", "aff_unique_dep": "Facebook AI;", "aff_unique_url": "https://www.facebook.com;https://www.nus.edu.sg", "aff_unique_abbr": "Facebook AI;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;0;0", - "aff_country_unique": "United States;Singapore" + "aff_country_unique": "United States;Singapore", + "bibtex": "@InProceedings{Shou_2021_ICCV,\n \n author = {\n Shou,\n Mike Zheng and Lei,\n Stan Weixian and Wang,\n Weiyao and Ghadiyaram,\n Deepti and Feiszli,\n Matt\n},\n title = {\n Generic Event Boundary Detection: A Benchmark for Event Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8075-8084\n} \n}" }, { "title": "Geography-Aware Self-Supervised Learning", @@ -16650,6 +17778,7 @@ "status": "Poster", "track": "main", "pid": 7112, + "author_site": "Kumar Ayush; Burak Uzkent; Chenlin Meng; Kumar Tanmay; Marshall Burke; David Lobell; Stefano Ermon", "author": "Kumar Ayush; Burak Uzkent; Chenlin Meng; Kumar Tanmay; Marshall Burke; David Lobell; Stefano Ermon", "abstract": "Contrastive learning methods have significantly narrowed the gap between supervised and unsupervised learning on computer vision tasks. 
In this paper, we explore their application to geo-located datasets, e.g. remote sensing, where unlabeled data is often abundant but labeled data is scarce. We first show that due to their different characteristics, a non-trivial gap persists between contrastive and supervised learning on standard benchmarks. To close the gap, we propose novel training methods that exploit the spatio-temporal structure of remote sensing data. We leverage spatially aligned images over time to construct temporal positive pairs in contrastive learning and geo-location to design pre-text tasks. Our experiments show that our proposed method closes the gap between contrastive and supervised learning on image classification, object detection and semantic segmentation for remote sensing. Moreover, we demonstrate that the proposed method can also be applied to geo-tagged ImageNet images, improving downstream performance on various tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ayush_Geography-Aware_Self-Supervised_Learning_ICCV_2021_paper.pdf", @@ -16673,7 +17802,8 @@ "aff_campus_unique_index": "0;0;0;1;0;0;0", "aff_campus_unique": "Stanford;Kharagpur", "aff_country_unique_index": "0;0;0;1;0;0;0", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Ayush_2021_ICCV,\n \n author = {\n Ayush,\n Kumar and Uzkent,\n Burak and Meng,\n Chenlin and Tanmay,\n Kumar and Burke,\n Marshall and Lobell,\n David and Ermon,\n Stefano\n},\n title = {\n Geography-Aware Self-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10181-10190\n} \n}" }, { "title": "GeomNet: A Neural Network Based on Riemannian Geometries of SPD Matrix Space and Cholesky Space for 3D Skeleton-Based Interaction Recognition", @@ -16681,10 +17811,11 @@ "status": "Poster", "track": "main", "pid": 8523, + 
"author_site": "Xuan Son Nguyen", "author": "Xuan Son Nguyen", "abstract": "In this paper, we propose a novel method for representation and classification of two-person interactions from 3D skeleton sequences. The key idea of our approach is to use Gaussian distributions to capture statistics on Rn and those on the space of symmetric positive definite (SPD) matrices. The main challenge is how to parametrize those distributions. Towards this end, we develop methods for embedding Gaussian distributions in matrix groups based on the theory of Lie groups and Riemannian symmetric spaces. Our method relies on the Riemannian geometry of the underlying manifolds and has the advantage of encoding high-order statistics from 3D joint positions. We show that the proposed method achieves competitive results in two-person interaction recognition on two large-scale benchmarks for 3D human activity understanding.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nguyen_GeomNet_A_Neural_Network_Based_on_Riemannian_Geometries_of_SPD_ICCV_2021_paper.pdf", - "aff": "ETIS UMR 8051, CY Cergy Paris Universit \u00b4e, ENSEA, CNRS, F-95000, Cergy, France", + "aff": "ETIS UMR 8051, CY Cergy Paris Universit ´e, ENSEA, CNRS, F-95000, Cergy, France", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Nguyen_GeomNet_A_Neural_ICCV_2021_supplemental.pdf", @@ -16697,14 +17828,15 @@ "author_num": 1, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Nguyen_GeomNet_A_Neural_Network_Based_on_Riemannian_Geometries_of_SPD_ICCV_2021_paper.html", "aff_unique_index": "0", - "aff_unique_norm": "CY Cergy Paris Universit\u00e9", + "aff_unique_norm": "CY Cergy Paris Université", "aff_unique_dep": "ETIS UMR 8051", "aff_unique_url": "https://www.cyu.fr", "aff_unique_abbr": "CYU", "aff_campus_unique_index": "0", "aff_campus_unique": "Cergy", "aff_country_unique_index": "0", - "aff_country_unique": "France" + "aff_country_unique": "France", + 
"bibtex": "@InProceedings{Nguyen_2021_ICCV,\n \n author = {\n Nguyen,\n Xuan Son\n},\n title = {\n GeomNet: A Neural Network Based on Riemannian Geometries of SPD Matrix Space and Cholesky Space for 3D Skeleton-Based Interaction Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13379-13389\n} \n}" }, { "title": "Geometric Deep Neural Network Using Rigid and Non-Rigid Transformations for Human Action Recognition", @@ -16712,10 +17844,11 @@ "status": "Poster", "track": "main", "pid": 3289, + "author_site": "Rasha Friji; Hassen Drira; Faten Chaieb; Hamza Kchok; Sebastian Kurtek", "author": "Rasha Friji; Hassen Drira; Faten Chaieb; Hamza Kchok; Sebastian Kurtek", "abstract": "Deep Learning architectures, albeit successful in mostcomputer vision tasks, were designed for data with an un-derlying Euclidean structure, which is not usually fulfilledsince pre-processed data may lie on a non-linear space.In this paper, we propose a geometry aware deep learn-ing approach using rigid and non rigid transformation opti-mization for skeleton-based action recognition. Skeleton se-quences are first modeled as trajectories on Kendall's shapespace and then mapped to the linear tangent space. The re-sulting structured data are then fed to a deep learning archi-tecture, which includes a layer that optimizes over rigid andnon rigid transformations of the 3D skeletons, followed bya CNN-LSTM network. 
The assessment on two large scaleskeleton datasets, namely NTU-RGB+D and NTU-RGB+D120, has proven that the proposed approach outperformsexisting geometric deep learning methods and exceeds re-cently published approaches with respect to the majority of configurations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Friji_Geometric_Deep_Neural_Network_Using_Rigid_and_Non-Rigid_Transformations_for_ICCV_2021_paper.pdf", - "aff": "CRISTAL Lab - National University of Computer Science ENSI, Manouba University Campus, Manouba, Tunisia+Talan Innovation Factory, Talan Tunisia, 10 Rue de l\u2019 \u00b4energie solaire Impasse N\u00b01 Charguia 1, Tunis 2035, Tunisia; IMT Lille Douai, CRIStAL UMR 9189, University Lille, F-59000 Lille, France; AllianSTIC Lab - EFREI Paris, Villejuif, France+CRISTAL Lab - National University of Computer Science ENSI, Manouba University Campus, Manouba, Tunisia; INSAT, Tunisia+IMT Lille Douai, CRIStAL UMR 9189, University Lille, F-59000 Lille, France; Department of Statistics, the Ohio State University Columbus, OH, USA", + "aff": "CRISTAL Lab - National University of Computer Science ENSI, Manouba University Campus, Manouba, Tunisia+Talan Innovation Factory, Talan Tunisia, 10 Rue de l’ ´energie solaire Impasse N°1 Charguia 1, Tunis 2035, Tunisia; IMT Lille Douai, CRIStAL UMR 9189, University Lille, F-59000 Lille, France; AllianSTIC Lab - EFREI Paris, Villejuif, France+CRISTAL Lab - National University of Computer Science ENSI, Manouba University Campus, Manouba, Tunisia; INSAT, Tunisia+IMT Lille Douai, CRIStAL UMR 9189, University Lille, F-59000 Lille, France; Department of Statistics, the Ohio State University Columbus, OH, USA", "project": "", "github": "", "supp": "", @@ -16728,14 +17861,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Friji_Geometric_Deep_Neural_Network_Using_Rigid_and_Non-Rigid_Transformations_for_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;3+0;4+2;5", - 
"aff_unique_norm": "National University of Computer Science ENSI;Talan Tunisia;IMT Lille Douai;EFREI Paris;Institut National des Sciences Appliquees et de Technologie;Ohio State University", + "aff_unique_norm": "National University of Computer Science ENSI;Talan Tunisia;IMT Lille Douai;EFREI Paris;Institut National des Sciences Appliquées et de Technologie;The Ohio State University", "aff_unique_dep": "CRISTAL Lab;;CRIStAL UMR 9189;AllianSTIC Lab;;Department of Statistics", - "aff_unique_url": ";;https://www.imt-lille-douai.fr;https://www.efrei.fr;http://www.insat.rnu.tn;https://www.osu.edu", + "aff_unique_url": ";;https://www.imt-lille-douai.fr;https://www.efrei.fr;https://www.insat.rnu.tn;https://www.osu.edu", "aff_unique_abbr": ";;IMT Lille Douai;EFREI;INSAT;OSU", "aff_campus_unique_index": "0;2;3+0;2;4", "aff_campus_unique": "Manouba;;Lille;Villejuif;Columbus", "aff_country_unique_index": "0+0;1;1+0;0+1;2", - "aff_country_unique": "Tunisia;France;United States" + "aff_country_unique": "Tunisia;France;United States", + "bibtex": "@InProceedings{Friji_2021_ICCV,\n \n author = {\n Friji,\n Rasha and Drira,\n Hassen and Chaieb,\n Faten and Kchok,\n Hamza and Kurtek,\n Sebastian\n},\n title = {\n Geometric Deep Neural Network Using Rigid and Non-Rigid Transformations for Human Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12611-12620\n} \n}" }, { "title": "Geometric Granularity Aware Pixel-To-Mesh", @@ -16743,6 +17877,7 @@ "status": "Poster", "track": "main", "pid": 8227, + "author_site": "Yue Shi; Bingbing Ni; Jinxian Liu; Dingyi Rong; Ye Qian; Wenjun Zhang", "author": "Yue Shi; Bingbing Ni; Jinxian Liu; Dingyi Rong; Ye Qian; Wenjun Zhang", "abstract": "Pixel-to-mesh has wide applications, especially in virtual or augmented reality, animation and game industry. 
However, existing mesh reconstruction models perform unsatisfactorily in local geometry details due to ignoring mesh topology information during learning. Besides, most methods are constrained by the initial template, which cannot reconstruct meshes of various genus. In this work, we propose a geometric granularity-aware pixel-to-mesh framework with a fidelity-selection-and-guarantee strategy, which explicitly addresses both challenges. First, a geometry structure extractor is proposed for detecting local high structured parts and capturing local spatial feature. Second, we apply it to facilitate pixel-to-mesh mapping and resolve coarse details problem caused by the neglect of structural information in previous practices. Finally, a mesh edit module is proposed to encourage non-zero genus topology to emergence by fine-grained topology modification and a patching algorithm is introduced to repair the non-closed boundaries. Extensive experimental results, both quantitatively and visually have demonstrated the high reconstruction fidelity achieved by the proposed framework.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shi_Geometric_Granularity_Aware_Pixel-To-Mesh_ICCV_2021_paper.pdf", @@ -16757,7 +17892,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shi_Geometric_Granularity_Aware_Pixel-To-Mesh_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shi_Geometric_Granularity_Aware_Pixel-To-Mesh_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shi_2021_ICCV,\n \n author = {\n Shi,\n Yue and Ni,\n Bingbing and Liu,\n Jinxian and Rong,\n Dingyi and Qian,\n Ye and Zhang,\n Wenjun\n},\n title = {\n Geometric Granularity Aware Pixel-To-Mesh\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13097-13106\n} \n}" }, { "title": "Geometric 
Unsupervised Domain Adaptation for Semantic Segmentation", @@ -16765,7 +17901,8 @@ "status": "Poster", "track": "main", "pid": 3459, - "author": "Vitor Guizilini; Jie Li; Rare\u0219 Ambru\u0219; Adrien Gaidon", + "author_site": "Vitor Guizilini; Jie Li; Rareș Ambruș; Adrien Gaidon", + "author": "Vitor Guizilini; Jie Li; Rareș Ambruș; Adrien Gaidon", "abstract": "Simulators can efficiently generate large amounts of labeled synthetic data with perfect supervision for hard-to-label tasks like semantic segmentation. However, they introduce a domain gap that severely hurts real-world performance. We propose to use self-supervised monocular depth estimation as a proxy task to bridge this gap and improve sim-to-real unsupervised domain adaptation (UDA). Our Geometric Unsupervised Domain Adaptation method (GUDA) learns a domain-invariant representation via a multi-task objective combining synthetic semantic supervision with real-world geometric constraints on videos. GUDA establishes a new state of the art in UDA for semantic segmentation on three benchmarks, outperforming methods that use domain adversarial learning, self-training, or other self-supervised proxy tasks. 
Furthermore, we show that our method scales well with the quality and quantity of synthetic data while also improving depth prediction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guizilini_Geometric_Unsupervised_Domain_Adaptation_for_Semantic_Segmentation_ICCV_2021_paper.pdf", "aff": "Toyota Research Institute (TRI), Los Altos, CA; Toyota Research Institute (TRI), Los Altos, CA; Toyota Research Institute (TRI), Los Altos, CA; Toyota Research Institute (TRI), Los Altos, CA", @@ -16783,12 +17920,13 @@ "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Toyota Research Institute", "aff_unique_dep": "", - "aff_unique_url": "https://www.tri.global", + "aff_unique_url": "https://www.tri.toyota.com", "aff_unique_abbr": "TRI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Altos", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Guizilini_2021_ICCV,\n \n author = {\n Guizilini,\n Vitor and Li,\n Jie and Ambruș,\n Rareș and Gaidon,\n Adrien\n},\n title = {\n Geometric Unsupervised Domain Adaptation for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8537-8547\n} \n}" }, { "title": "Geometry Uncertainty Projection Network for Monocular 3D Object Detection", @@ -16796,6 +17934,7 @@ "status": "Poster", "track": "main", "pid": 5592, + "author_site": "Yan Lu; Xinzhu Ma; Lei Yang; Tianzhu Zhang; Yating Liu; Qi Chu; Junjie Yan; Wanli Ouyang", "author": "Yan Lu; Xinzhu Ma; Lei Yang; Tianzhu Zhang; Yating Liu; Qi Chu; Junjie Yan; Wanli Ouyang", "abstract": "Monocular 3D object detection has received increasing attention due to the wide application in autonomous driving. Existing works mainly focus on introducing geometry projection to predict depth priors for each object. 
Despite their impressive progress, these methods neglect the geometry leverage effect of the projection process, which leads to uncontrollable inferences and damage the training efficiency. In this paper, we propose a Geometry Uncertainty Projection Network (GUP Net) to handle these problems, which can guide the model to learn more reliable depth outputs. The overall framework combines the uncertainty inference and the hierarchical task learning to reduce the negative effects of the geometry leverage. Specifically, an Uncertainty Geometry Projection module is proposed to obtain the geometry guided uncertainty of the inferred depth, which can not only benefit the geometry learning but also provide more reliable depth inferences to reduce the uncontrollableness caused by the geometry leverage. Besides, to reduce the instability in the training process caused by the geometry leverage effect, we propose a Hierarchical Task Learning strategy to control the overall optimization process. This learning algorithm can monitor the situation of each task through a well designed learning situation indicator and adaptively assign the proper loss weights for different tasks according to their learning situation and the hierarchical structure, which can significantly improve the stability and the efficiency of the training process. 
Extensive experiments demonstrate the effectiveness of the proposed method.The overall model can infer more reliable depth and location information than existing methods, which achieves the state-of-the-art performance on the KITTI benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lu_Geometry_Uncertainty_Projection_Network_for_Monocular_3D_Object_Detection_ICCV_2021_paper.pdf", @@ -16812,14 +17951,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lu_Geometry_Uncertainty_Projection_Network_for_Monocular_3D_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;2;3;3;3;2;0+1", - "aff_unique_norm": "University of Sydney;SenseTime;SenseTime Group;University of Science and Technology of China", + "aff_unique_norm": "University of Sydney;SenseTime;Sensetime Group;University of Science and Technology of China", "aff_unique_dep": ";Computer Vision Group;;School of Information Science and Technology", - "aff_unique_url": "https://www.sydney.edu.au;https://www.sensetime.com;https://www.sensetime.com;http://www.ustc.edu.cn", + "aff_unique_url": "https://www.sydney.edu.au;https://www.sensetime.com;https://www.sensetime.com/;http://www.ustc.edu.cn", "aff_unique_abbr": "USYD;SenseTime;SenseTime;USTC", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;1;1;1;1;1;0+1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Lu_2021_ICCV,\n \n author = {\n Lu,\n Yan and Ma,\n Xinzhu and Yang,\n Lei and Zhang,\n Tianzhu and Liu,\n Yating and Chu,\n Qi and Yan,\n Junjie and Ouyang,\n Wanli\n},\n title = {\n Geometry Uncertainty Projection Network for Monocular 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3111-3121\n} \n}" }, { "title": "Geometry-Aware Self-Training for 
Unsupervised Domain Adaptation on Object Point Clouds", @@ -16827,6 +17967,7 @@ "status": "Poster", "track": "main", "pid": 3816, + "author_site": "Longkun Zou; Hui Tang; Ke Chen; Kui Jia", "author": "Longkun Zou; Hui Tang; Ke Chen; Kui Jia", "abstract": "The point cloud representation of an object can have a large geometric variation in view of inconsistent data acquisition procedure, which thus leads to domain discrepancy due to diverse and uncontrollable shape representation cross datasets. To improve discrimination on unseen distribution of point-based geometries in a practical and feasible perspective, this paper proposes a new method of geometry-aware self-training (GAST) for unsupervised domain adaptation of object point cloud classification. Specifically, this paper aims to learn a domain-shared representation of semantic categories, via two novel self-supervised geometric learning tasks as feature regularization. On one hand, the representation learning is empowered by a linear mixup of point cloud samples with their self-generated rotation labels, to capture a global topological configuration of local geometries. On the other hand, a diverse point distribution across datasets can be normalized with a novel curvature-aware distortion localization. 
Experiments on the PointDA-10 dataset show that our GAST method can significantly outperform the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zou_Geometry-Aware_Self-Training_for_Unsupervised_Domain_Adaptation_on_Object_Point_Clouds_ICCV_2021_paper.pdf", @@ -16843,14 +17984,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zou_Geometry-Aware_Self-Training_for_Unsupervised_Domain_Adaptation_on_Object_Point_Clouds_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0+2;0+2+3", - "aff_unique_norm": "South China University of Technology;DexForce Technology Co., Ltd;Pengcheng Laboratory;Pazhou Laboratory", - "aff_unique_dep": ";;Peng Cheng Laboratory;", + "aff_unique_norm": "South China University of Technology;DexForce Technology Co., Ltd;Peng Cheng Laboratory;Pazhou Laboratory", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.scut.edu.cn;;http://www.pcl.ac.cn;", "aff_unique_abbr": "SCUT;;PCL;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zou_2021_ICCV,\n \n author = {\n Zou,\n Longkun and Tang,\n Hui and Chen,\n Ke and Jia,\n Kui\n},\n title = {\n Geometry-Aware Self-Training for Unsupervised Domain Adaptation on Object Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6403-6412\n} \n}" }, { "title": "Geometry-Based Distance Decomposition for Monocular 3D Object Detection", @@ -16858,6 +18000,7 @@ "status": "Poster", "track": "main", "pid": 1349, + "author_site": "Xuepeng Shi; Qi Ye; Xiaozhi Chen; Chuangrong Chen; Zhixiang Chen; Tae-Kyun Kim", "author": "Xuepeng Shi; Qi Ye; Xiaozhi Chen; Chuangrong Chen; Zhixiang Chen; Tae-Kyun Kim", "abstract": "Monocular 3D object detection is of great significance for 
autonomous driving but remains challenging. The core challenge is to predict the distance of objects in the absence of explicit depth information. Unlike regressing the distance as a single variable in most existing methods, we propose a novel geometry-based distance decomposition to recover the distance by its factors. The decomposition factors the distance of objects into the most representative and stable variables, i.e. the physical height and the projected visual height in the image plane. Moreover, the decomposition maintains the self-consistency between the two heights, leading to robust distance prediction when both predicted heights are inaccurate. The decomposition also enables us to trace the causes of the distance uncertainty for different scenarios. Such decomposition makes the distance prediction interpretable, accurate, and robust. Our method directly predicts 3D bounding boxes from RGB images with a compact architecture, making the training and inference simple and efficient. 
The experimental results show that our method achieves the state-of-the-art performance on the monocular 3D Object Detection and Bird's Eye View tasks of the KITTI dataset, and can generalize to images with different camera intrinsics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shi_Geometry-Based_Distance_Decomposition_for_Monocular_3D_Object_Detection_ICCV_2021_paper.pdf", @@ -16872,7 +18015,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shi_Geometry-Based_Distance_Decomposition_for_Monocular_3D_Object_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shi_Geometry-Based_Distance_Decomposition_for_Monocular_3D_Object_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shi_2021_ICCV,\n \n author = {\n Shi,\n Xuepeng and Ye,\n Qi and Chen,\n Xiaozhi and Chen,\n Chuangrong and Chen,\n Zhixiang and Kim,\n Tae-Kyun\n},\n title = {\n Geometry-Based Distance Decomposition for Monocular 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15172-15181\n} \n}" }, { "title": "Geometry-Free View Synthesis: Transformers and No 3D Priors", @@ -16880,7 +18024,8 @@ "status": "Poster", "track": "main", "pid": 2485, - "author": "Robin Rombach; Patrick Esser; Bj\u00f6rn Ommer", + "author_site": "Robin Rombach; Patrick Esser; Björn Ommer", + "author": "Robin Rombach; Patrick Esser; Björn Ommer", "abstract": "Is a geometric model required to synthesize novel views from a single image? Being bound to local convolutions, CNNs need explicit 3D biases to model geometric transformations. In contrast, we demonstrate that a transformer-based model can synthesize entirely novel views without any hand-engineered 3D biases. 
This is achieved by (i) a global attention mechanism for implicitly learning long-range 3D correspondences between source and target views, and (ii) a probabilistic formulation necessary to capture the ambiguity inherent in predicting novel views from a single image, thereby overcoming the limitations of previous approaches that are restricted to relatively small viewpoint changes. We evaluate various ways to integrate 3D priors into a transformer architecture. However, our experiments show that no such geometric priors are required and that the transformer is capable of implicitly learning 3D relationships between images. Furthermore, this approach outperforms the state of the art in terms of visual quality while covering the full distribution of possible realizations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rombach_Geometry-Free_View_Synthesis_Transformers_and_No_3D_Priors_ICCV_2021_paper.pdf", "aff": "Ludwig Maximilian University of Munich; Heidelberg University; Ludwig Maximilian University of Munich & Heidelberg University", @@ -16903,7 +18048,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Munich", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Rombach_2021_ICCV,\n \n author = {\n Rombach,\n Robin and Esser,\n Patrick and Ommer,\n Bj\\"orn\n},\n title = {\n Geometry-Free View Synthesis: Transformers and No 3D Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14356-14366\n} \n}" }, { "title": "GistNet: A Geometric Structure Transfer Network for Long-Tailed Recognition", @@ -16911,6 +18057,7 @@ "status": "Poster", "track": "main", "pid": 3972, + "author_site": "Bo Liu; Haoxiang Li; Hao Kang; Gang Hua; Nuno Vasconcelos", "author": "Bo Liu; Haoxiang Li; Hao Kang; Gang Hua; Nuno Vasconcelos", "abstract": "The problem of 
long-tailed recognition, where the number of examples per class is highly unbalanced, is considered. It is hypothesized that the well known tendency of standard classifier training to overfit to popular classes can be exploited for effective transfer learning. Rather than eliminating this overfitting, e.g. by adopting popular class-balanced sampling methods, the learning algorithm should instead leverage this overfitting to transfer geometric information from popular to low-shot classes. A new classifier architecture, GistNet, is proposed to support this goal, using constellations of classifier parameters to encode the class geometry. A new learning algorithm is then proposed for GeometrIc Structure Transfer (GIST), with resort to a combination of loss functions that combine class-balanced and random sampling to guarantee that, while overfitting to the popular classes is restricted to geometric parameters, it is leveraged to transfer class geometry from popular to few-shot classes. This enables better generalization for few-shot classes without the need for the manual specification of class weights, or even the explicit grouping of classes into different types. 
Experiments on two popular long-tailed recognition datasets show that GistNet outperforms existing solutions to this problem.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_GistNet_A_Geometric_Structure_Transfer_Network_for_Long-Tailed_Recognition_ICCV_2021_paper.pdf", @@ -16934,7 +18081,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Bo and Li,\n Haoxiang and Kang,\n Hao and Hua,\n Gang and Vasconcelos,\n Nuno\n},\n title = {\n GistNet: A Geometric Structure Transfer Network for Long-Tailed Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8209-8218\n} \n}" }, { "title": "Glimpse-Attend-and-Explore: Self-Attention for Active Visual Exploration", @@ -16942,6 +18090,7 @@ "status": "Poster", "track": "main", "pid": 9217, + "author_site": "Soroush Seifi; Abhishek Jha; Tinne Tuytelaars", "author": "Soroush Seifi; Abhishek Jha; Tinne Tuytelaars", "abstract": "Active visual exploration aims to assist an agent with a limited field of view to understand its environment based on partial observations made by choosing the best viewing directions in the scene. Recent methods have tried to address this problem either by using reinforcement learning, which is difficult to train, or by uncertainty maps, which are task-specific and can only be implemented for dense prediction tasks. In this paper, we propose the Glimpse-Attend-and-Explore model which: (a) employs self-attention to guide the visual exploration instead of task-specific uncertainty maps; (b) can be used for both dense and sparse prediction tasks; and (c) uses a contrastive stream to further improve the representations learned. 
Unlike previous works, we show the application of our model on multiple tasks like reconstruction, segmentation and classification. Our model provides encouraging results against baseline while being less dependent on dataset bias in driving the exploration. We further perform an ablation study to investigate the features and attention learned by our model. Finally, we show that our self-attention module learns to attend different regions of the scene by minimizing the loss on the downstream task. Code: https://github.com/soroushseifi/glimpse-attend-explore.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Seifi_Glimpse-Attend-and-Explore_Self-Attention_for_Active_Visual_Exploration_ICCV_2021_paper.pdf", @@ -16965,7 +18114,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Seifi_2021_ICCV,\n \n author = {\n Seifi,\n Soroush and Jha,\n Abhishek and Tuytelaars,\n Tinne\n},\n title = {\n Glimpse-Attend-and-Explore: Self-Attention for Active Visual Exploration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16137-16146\n} \n}" }, { "title": "Global Pooling, More Than Meets the Eye: Position Information Is Encoded Channel-Wise in CNNs", @@ -16973,6 +18123,7 @@ "status": "Poster", "track": "main", "pid": 8328, + "author_site": "Md Amirul Islam; Matthew Kowal; Sen Jia; Konstantinos G. Derpanis; Neil D. B. Bruce", "author": "Md Amirul Islam; Matthew Kowal; Sen Jia; Konstantinos G. Derpanis; Neil D. B. Bruce", "abstract": "In this paper, we challenge the common assumption that collapsing the spatial dimensions of a 3D (spatial-channel) tensor in a convolutional neural network (CNN) into a vector via global pooling removes all spatial information. 
Specifically, we demonstrate that positional information is encoded based on the ordering of the channel dimensions, while semantic information is largely not. Following this demonstration, we show the real world impact of these findings by applying them to two applications. First, we propose a simple yet effective data augmentation strategy and loss function which improves the translation invariance of a CNN's output. Second, we propose a method to efficiently determine which channels in the latent representation are responsible for (i) encoding overall position information or (ii) region-specific positions. We first show that semantic segmentation has a significant reliance on the overall position channels to make predictions. We then show for the first time that it is possible to perform a `region-specific' attack, and degrade a network's performance in a particular part of the input. We believe our findings and demonstrated applications will benefit research areas concerned with understanding the characteristics of CNNs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Islam_Global_Pooling_More_Than_Meets_the_Eye_Position_Information_Is_ICCV_2021_paper.pdf", @@ -16989,14 +18140,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Islam_Global_Pooling_More_Than_Meets_the_Eye_Position_Information_Is_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3+4;5+4", - "aff_unique_norm": "Ryerson University;York University;LG;Samsung;Vector Institute for AI;University of Guelph", - "aff_unique_dep": ";;Toronto AI Lab;AI Centre;;", - "aff_unique_url": "https://www.ryerson.ca;https://www.yorku.ca;https://www.lg.com/ca;https://www.samsung.com/global/innovation/ai-research-centers/;https://vectorinstitute.ai/;https://www.uoguelph.ca", + "aff_unique_norm": "Ryerson University;York University;LG Electronics;Samsung AI Centre;Vector Institute for AI;University of Guelph", + "aff_unique_dep": ";;LG Electronics AI Lab;AI Centre;;", + 
"aff_unique_url": "https://www.ryerson.ca;https://www.yorku.ca;https://www.lg.com;https://www.samsung.com/global/innovation/ai-research-centers/;https://vectorinstitute.ai/;https://www.uoguelph.ca", "aff_unique_abbr": "Ryerson;York U;LG;Samsung AI;Vector AI;U of G", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Toronto", - "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "Canada" + "aff_country_unique_index": "0;0;1;0+0;0+0", + "aff_country_unique": "Canada;South Korea", + "bibtex": "@InProceedings{Islam_2021_ICCV,\n \n author = {\n Islam,\n Md Amirul and Kowal,\n Matthew and Jia,\n Sen and Derpanis,\n Konstantinos G. and Bruce,\n Neil D. B.\n},\n title = {\n Global Pooling,\n More Than Meets the Eye: Position Information Is Encoded Channel-Wise in CNNs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 793-801\n} \n}" }, { "title": "Globally Optimal and Efficient Manhattan Frame Estimation by Delimiting Rotation Search Space", @@ -17004,6 +18156,7 @@ "status": "Poster", "track": "main", "pid": 10058, + "author_site": "Wuwei Ge; Yu Song; Baichao Zhang; Zehua Dong", "author": "Wuwei Ge; Yu Song; Baichao Zhang; Zehua Dong", "abstract": "A typical man-made structure can be abstracted as the Manhattan world assumption, in which notion is further represented as a Manhattan Frame (MF) defined by three orthogonal axes. The problem of MF estimation can be formulated as the solution of the rotation between the MF and the camera frame (called the \"MF rotation\"). However, the whole rotation space is quite redundant for solving the MF rotation, which is one of the main factors that disturb the computational efficiency of those methods associated with a rotation space search. 
This paper proves that the volume of the space that just contains all MF rotations (called the \"MFR space\") is only 1 / 24 of that of the whole rotation space, and then an exact MFR space is delimited from the rotation space. Searching in the delimited MFR space, the MF estimation solved by a branch-and-bound (BnB) framework guarantees stability and efficiency simultaneously. Furthermore, the general rotation problems associated with a rotation space search are solved more efficiently. Experiments on synthetic and real datasets have successfully confirmed the validity of our approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ge_Globally_Optimal_and_Efficient_Manhattan_Frame_Estimation_by_Delimiting_Rotation_ICCV_2021_paper.pdf", @@ -17020,14 +18173,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ge_Globally_Optimal_and_Efficient_Manhattan_Frame_Estimation_by_Delimiting_Rotation_ICCV_2021_paper.html", "aff_unique_index": "0+1;1+0;0;0", - "aff_unique_norm": "Zongmu Tech;Beijing Jiao Tong University", + "aff_unique_norm": "Zongmu Tech;Beijing Jiaotong University", "aff_unique_dep": ";", "aff_unique_url": ";http://www.njtu.edu.cn/en", "aff_unique_abbr": ";BJTU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ge_2021_ICCV,\n \n author = {\n Ge,\n Wuwei and Song,\n Yu and Zhang,\n Baichao and Dong,\n Zehua\n},\n title = {\n Globally Optimal and Efficient Manhattan Frame Estimation by Delimiting Rotation Search Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15213-15221\n} \n}" }, { "title": "Going Deeper With Image Transformers", @@ -17035,7 +18189,8 @@ "status": "Poster", "track": "main", "pid": 6610, - "author": "Hugo Touvron; Matthieu Cord; 
Alexandre Sablayrolles; Gabriel Synnaeve; Herv\u00e9 J\u00e9gou", + "author_site": "Hugo Touvron; Matthieu Cord; Alexandre Sablayrolles; Gabriel Synnaeve; Hervé Jégou", + "author": "Hugo Touvron; Matthieu Cord; Alexandre Sablayrolles; Gabriel Synnaeve; Hervé Jégou", "abstract": "Transformers have been recently adapted for large scale image classification, achieving high scores shaking up the long supremacy of convolutional neural networks. However the optimization of vision transformers has been little studied so far. In this work, we build and optimize deeper transformer networks for image classification. In particular, we investigate the interplay of architecture and optimization of such dedicated transformers. We make two architecture changes that significantly improve the accuracy of deep transformers. This leads us to produce models whose performance does not saturate early with more depth, for instance we obtain 86.5% top-1 accuracy on Imagenet when training with no external data, we thus attain the current sate of the art with less floating-point operations and parameters. Our best model establishes the new state of the art on Imagenet with Reassessed labels and Imagenet-V2 / match frequency, in the setting with no additional training data. 
We share our code and models", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Touvron_Going_Deeper_With_Image_Transformers_ICCV_2021_paper.pdf", "aff": ";;;;", @@ -17049,7 +18204,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Touvron_Going_Deeper_With_Image_Transformers_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Touvron_Going_Deeper_With_Image_Transformers_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Touvron_2021_ICCV,\n \n author = {\n Touvron,\n Hugo and Cord,\n Matthieu and Sablayrolles,\n Alexandre and Synnaeve,\n Gabriel and J\\'egou,\n Herv\\'e\n},\n title = {\n Going Deeper With Image Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 32-42\n} \n}" }, { "title": "Gradient Distribution Alignment Certificates Better Adversarial Domain Adaptation", @@ -17057,10 +18213,11 @@ "status": "Poster", "track": "main", "pid": 8392, + "author_site": "Zhiqiang Gao; Shufei Zhang; Kaizhu Huang; Qiufeng Wang; Chaoliang Zhong", "author": "Zhiqiang Gao; Shufei Zhang; Kaizhu Huang; Qiufeng Wang; Chaoliang Zhong", "abstract": "The latest heuristic for handling the domain shift in unsupervised domain adaptation tasks is to reduce the data distribution discrepancy using adversarial learning. Recent studies improve the conventional adversarial domain adaptation methods with discriminative information by integrating the classifier's outputs into distribution divergence measurement. However, they still suffer from the equilibrium problem of adversarial learning in which even if the discriminator is fully confused, sufficient similarity between two distributions cannot be guaranteed. To overcome this problem, we propose a novel approach named feature gradient distribution alignment (FGDA). 
We demonstrate the rationale of our method both theoretically and empirically. In particular, we show that the distribution discrepancy can be reduced by constraining feature gradients of two domains to have similar distributions. Meanwhile, our method enjoys a theoretical guarantee that a tighter error upper bound for target samples can be obtained than that of conventional adversarial domain adaptation methods. By integrating the proposed method with existing adversarial domain adaptation models, we achieve state-of-the-art performance on two real-world benchmark datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Gradient_Distribution_Alignment_Certificates_Better_Adversarial_Domain_Adaptation_ICCV_2021_paper.pdf", - "aff": "Xi\u2019an Jiatong-Liverpool University, Suzhou, China; Xi\u2019an Jiatong-Liverpool University, Suzhou, China; Xi\u2019an Jiatong-Liverpool University, Suzhou, China; Xi\u2019an Jiatong-Liverpool University, Suzhou, China; Fujitsu R&D Centre, Beijing, China", + "aff": "Xi’an Jiatong-Liverpool University, Suzhou, China; Xi’an Jiatong-Liverpool University, Suzhou, China; Xi’an Jiatong-Liverpool University, Suzhou, China; Xi’an Jiatong-Liverpool University, Suzhou, China; Fujitsu R&D Centre, Beijing, China", "project": "", "github": "https://github.com/gzqhappy/FGDA", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Gao_Gradient_Distribution_Alignment_ICCV_2021_supplemental.pdf", @@ -17080,7 +18237,8 @@ "aff_campus_unique_index": "0;0;0;0;1", "aff_campus_unique": "Suzhou;Beijing", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Zhiqiang and Zhang,\n Shufei and Huang,\n Kaizhu and Wang,\n Qiufeng and Zhong,\n Chaoliang\n},\n title = {\n Gradient Distribution Alignment Certificates Better Adversarial Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8937-8946\n} \n}" }, { "title": "Gradient Normalization for Generative Adversarial Networks", @@ -17088,6 +18246,7 @@ "status": "Poster", "track": "main", "pid": 3174, + "author_site": "Yi-Lun Wu; Hong-Han Shuai; Zhi-Rui Tam; Hong-Yu Chiu", "author": "Yi-Lun Wu; Hong-Han Shuai; Zhi-Rui Tam; Hong-Yu Chiu", "abstract": "In this paper, we propose a novel normalization method called gradient normalization (GN) to tackle the training instability of Generative Adversarial Networks (GANs) caused by the sharp gradient space. Unlike existing work such as gradient penalty and spectral normalization, the proposed GN only imposes a hard 1-Lipschitz constraint on the discriminator function, which increases the capacity of the discriminator. Moreover, the proposed gradient normalization can be applied to different GAN architectures with little modification. Extensive experiments on four datasets show that GANs trained with gradient normalization outperform existing methods in terms of both Frechet Inception Distance and Inception Score.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Gradient_Normalization_for_Generative_Adversarial_Networks_ICCV_2021_paper.pdf", @@ -17111,7 +18270,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Yi-Lun and Shuai,\n Hong-Han and Tam,\n Zhi-Rui and Chiu,\n Hong-Yu\n},\n title = {\n Gradient Normalization for Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6373-6382\n} \n}" }, { "title": "Grafit: Learning Fine-Grained Image Representations With Coarse Labels", @@ -17119,7 
+18279,8 @@ "status": "Poster", "track": "main", "pid": 6612, - "author": "Hugo Touvron; Alexandre Sablayrolles; Matthijs Douze; Matthieu Cord; Herv\u00e9 J\u00e9gou", + "author_site": "Hugo Touvron; Alexandre Sablayrolles; Matthijs Douze; Matthieu Cord; Hervé Jégou", + "author": "Hugo Touvron; Alexandre Sablayrolles; Matthijs Douze; Matthieu Cord; Hervé Jégou", "abstract": "This paper tackles the problem of learning a finer representation than the one provided by training labels. This enables fine-grained category retrieval of images in a collection annotated with coarse labels only. Our network is learned with a nearest-neighbor classifier objective, and an instance loss inspired by self-supervised learning. By jointly leveraging the coarse labels and the underlying fine-grained latent space, it significantly improves the accuracy of category-level retrieval methods. Our strategy outperforms all competing methods for retrieving or classifying images at a finer granularity than that available at train time. 
It also improves the accuracy for transfer learning tasks to fine-grained datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Touvron_Grafit_Learning_Fine-Grained_Image_Representations_With_Coarse_Labels_ICCV_2021_paper.pdf", "aff": "Facebook AI Research; Facebook AI Research; Facebook AI Research; Sorbonne University; Facebook AI Research", @@ -17135,14 +18296,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Touvron_Grafit_Learning_Fine-Grained_Image_Representations_With_Coarse_Labels_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Meta;Sorbonne University", + "aff_unique_norm": "Facebook;Sorbonne University", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.sorbonne.universite.fr", "aff_unique_abbr": "FAIR;Sorbonne", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;France" + "aff_country_unique": "United States;France", + "bibtex": "@InProceedings{Touvron_2021_ICCV,\n \n author = {\n Touvron,\n Hugo and Sablayrolles,\n Alexandre and Douze,\n Matthijs and Cord,\n Matthieu and J\\'egou,\n Herv\\'e\n},\n title = {\n Grafit: Learning Fine-Grained Image Representations With Coarse Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 874-884\n} \n}" }, { "title": "Graph Constrained Data Representation Learning for Human Motion Segmentation", @@ -17150,10 +18312,11 @@ "status": "Poster", "track": "main", "pid": 7078, - "author": "Mariella Dimiccoli; Llu\u00eds Garrido; Guillem Rodriguez-Corominas; Herwig Wendt", + "author_site": "Mariella Dimiccoli; Lluís Garrido; Guillem Rodriguez-Corominas; Herwig Wendt", + "author": "Mariella Dimiccoli; Lluís Garrido; Guillem Rodriguez-Corominas; Herwig Wendt", "abstract": "Recently, transfer 
subspace learning based approaches have shown to be a valid alternative to unsupervised subspace clustering and temporal data clustering for human motion segmentation (HMS). These approaches leverage prior knowledge from a source domain to improve clustering performance on a target domain, and currently they represent the state of the art in HMS. Bucking this trend, in this paper, we propose a novel unsupervised model that learns a representation of the data and digs clustering information from the data itself. Our model is reminiscent of temporal subspace clustering, but presents two critical differences. First, we learn an auxiliary data matrix that can deviate from the initial data, hence confers more degrees of freedom to the coding matrix. Second, we introduce a regularization term for this auxiliary data matrix that preserves the local geometrical structure present in the high-dimensional space. The proposed model is efficiently optimized by using an original Alternating Direction Method of Multipliers (ADMM) formulation allowing to learn jointly the auxiliary data representation, a nonnegative dictionary and a coding matrix. Experimental results on four benchmark datasets for HMS demonstrate that our approach achieves significantly better clustering performance then state-of-the-art methods, including both unsupervised and more recent semi-supervised transfer learning approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dimiccoli_Graph_Constrained_Data_Representation_Learning_for_Human_Motion_Segmentation_ICCV_2021_paper.pdf", - "aff": "Institut de Rob\u00f2tica i Inform\u00e0tica Industrial (CSIC-UPC); Univ. of Barcelona; Institut de Rob\u00f2tica i Inform\u00e0tica Industrial (CSIC-UPC); CNRS, IRIT, Univ. of Toulouse", + "aff": "Institut de Robòtica i Informàtica Industrial (CSIC-UPC); Univ. of Barcelona; Institut de Robòtica i Informàtica Industrial (CSIC-UPC); CNRS, IRIT, Univ. 
of Toulouse", "project": "", "github": "https://github.com/mdimiccoli/GCRL-for-HMS/", "supp": "", @@ -17166,14 +18329,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dimiccoli_Graph_Constrained_Data_Representation_Learning_for_Human_Motion_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2", - "aff_unique_norm": "Institut de Rob\u00f2tica i Inform\u00e0tica Industrial;University of Barcelona;University of Toulouse", - "aff_unique_dep": "Rob\u00f2tica i Inform\u00e0tica Industrial;;Institut de Recherche en Informatique de Toulouse (IRIT)", + "aff_unique_norm": "Institut de Robòtica i Informàtica Industrial;University of Barcelona;University of Toulouse", + "aff_unique_dep": "Robòtica i Informàtica Industrial;;Institut de Recherche en Informatique de Toulouse (IRIT)", "aff_unique_url": "http://www.iri.upc.edu/;https://www.ub.edu;https://www.univ-toulouse.fr", "aff_unique_abbr": "IRI;UB;Univ. of Toulouse", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Spain;France" + "aff_country_unique": "Spain;France", + "bibtex": "@InProceedings{Dimiccoli_2021_ICCV,\n \n author = {\n Dimiccoli,\n Mariella and Garrido,\n Llu{\\'\\i\n}s and Rodriguez-Corominas,\n Guillem and Wendt,\n Herwig\n},\n title = {\n Graph Constrained Data Representation Learning for Human Motion Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1460-1469\n} \n}" }, { "title": "Graph Contrastive Clustering", @@ -17181,6 +18345,7 @@ "status": "Poster", "track": "main", "pid": 3417, + "author_site": "Huasong Zhong; Jianlong Wu; Chong Chen; Jianqiang Huang; Minghua Deng; Liqiang Nie; Zhouchen Lin; Xian-Sheng Hua", "author": "Huasong Zhong; Jianlong Wu; Chong Chen; Jianqiang Huang; Minghua Deng; Liqiang Nie; Zhouchen Lin; Xian-Sheng Hua", "abstract": "Recently, some 
contrastive learning methods have been proposed to simultaneously learn representations and clustering assignments, achieving significant improvements. However, these methods do not take the category information and clustering objective into consideration, thus the learned representations are not optimal for clustering and the performance might be limited. Towards this issue, we first propose a novel graph contrastive learning framework, which is then applied to the clustering task and we come up with the Graph Constrastive Clustering (GCC) method. Different from basic contrastive clustering that only assumes an image and its augmentation should share similar representation and clustering assignments, we lift the instance-level consistency to the cluster-level consistency with the assumption that samples in one cluster and their augmentations should all be similar. Specifically, on the one hand, the graph Laplacian based contrastive loss is proposed to learn more discriminative and clustering-friendly features. On the other hand, a novel graph-based contrastive learning strategy is proposed to learn more compact clustering assignments. Both of them incorporate the latent category information to reduce the intra-cluster variance while increasing the inter-cluster variance. 
Experiments on six commonly used datasets demonstrate the superiority of our proposed approach over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhong_Graph_Contrastive_Clustering_ICCV_2021_paper.pdf", @@ -17204,7 +18369,8 @@ "aff_campus_unique_index": ";;1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhong_2021_ICCV,\n \n author = {\n Zhong,\n Huasong and Wu,\n Jianlong and Chen,\n Chong and Huang,\n Jianqiang and Deng,\n Minghua and Nie,\n Liqiang and Lin,\n Zhouchen and Hua,\n Xian-Sheng\n},\n title = {\n Graph Contrastive Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9224-9233\n} \n}" }, { "title": "Graph-BAS3Net: Boundary-Aware Semi-Supervised Segmentation Network With Bilateral Graph Convolution", @@ -17212,10 +18378,11 @@ "status": "Poster", "track": "main", "pid": 3588, + "author_site": "Huimin Huang; Lanfen Lin; Yue Zhang; Yingying Xu; Jing Zheng; XiongWei Mao; Xiaohan Qian; Zhiyi Peng; Jianying Zhou; Yen-Wei Chen; Ruofeng Tong", "author": "Huimin Huang; Lanfen Lin; Yue Zhang; Yingying Xu; Jing Zheng; XiongWei Mao; Xiaohan Qian; Zhiyi Peng; Jianying Zhou; Yen-Wei Chen; Ruofeng Tong", "abstract": "Semi-supervised learning (SSL) algorithms have attracted much attentions in medical image segmentation by leveraging unlabeled data, which challenge in acquiring massive pixel-wise annotated samples. However, most of the existing SSLs neglected the geometric shape constraint in object, leading to unsatisfactory boundary and non-smooth of object. 
In this paper, we propose a novel boundary-aware semi-supervised medical image segmentation network, named Graph-BAS3Net, which incorporates the boundary information and learns duality constraints between semantics and geometrics in the graph domain. Specifically, the proposed method consists of two components: a multi-task learning framework BAS3Net and a graph-based cross-task module BGCM. The BAS3Net improves the existing GAN-based SSL by adding a boundary detection task, which encodes richer features of object shape and surface. Moreover, the BGCM further explores the co-occurrence relations between the semantics segmentation and boundary detection task, so that the network learns stronger semantic and geometric correspondences from both labeled and unlabeled data. Experimental results on the LiTS dataset and COVID-19 dataset confirm that our proposed Graph-BAS3 Net outperforms the state-of-the-art methods in semi-supervised segmentation task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Graph-BAS3Net_Boundary-Aware_Semi-Supervised_Segmentation_Network_With_Bilateral_Graph_Convolution_ICCV_2021_paper.pdf", - "aff": "Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University+Zhejiang Lab+Ritsumeikan University; The First Af\ufb01liated Hospital; Zhejiang University Hospital; The First Af\ufb01liated Hospital; The First Af\ufb01liated Hospital; The First Af\ufb01liated Hospital; Ritsumeikan University+Zhejiang University+Zhejiang Lab; Zhejiang University+Zhejiang Lab", + "aff": "Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University+Zhejiang Lab+Ritsumeikan University; The First Affiliated Hospital; Zhejiang University Hospital; The First Affiliated Hospital; The First Affiliated Hospital; The First Affiliated Hospital; Ritsumeikan University+Zhejiang University+Zhejiang Lab; Zhejiang University+Zhejiang Lab", "project": "", "github": "", "supp": "", @@ -17228,14 +18395,15 @@ 
"author_num": 11, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Graph-BAS3Net_Boundary-Aware_Semi-Supervised_Segmentation_Network_With_Bilateral_Graph_Convolution_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0+1+2;3;0;3;3;3;2+0+1;0+1", - "aff_unique_norm": "Zhejiang University;Zhejiang Lab;Ritsumeikan University;First Af\ufb01liated Hospital", + "aff_unique_norm": "Zhejiang University;Zhejiang Lab;Ritsumeikan University;The First Affiliated Hospital", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;http://www.zhejianglab.com;https://www.ritsumei.ac.jp;", "aff_unique_abbr": "ZJU;;Ritsumeikan;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0+1;0;1+0+0;0+0", - "aff_country_unique": "China;Japan;" + "aff_country_unique": "China;Japan;", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Huimin and Lin,\n Lanfen and Zhang,\n Yue and Xu,\n Yingying and Zheng,\n Jing and Mao,\n XiongWei and Qian,\n Xiaohan and Peng,\n Zhiyi and Zhou,\n Jianying and Chen,\n Yen-Wei and Tong,\n Ruofeng\n},\n title = {\n Graph-BAS3Net: Boundary-Aware Semi-Supervised Segmentation Network With Bilateral Graph Convolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7386-7395\n} \n}" }, { "title": "Graph-Based 3D Multi-Person Pose Estimation Using Multi-View Images", @@ -17243,6 +18411,7 @@ "status": "Poster", "track": "main", "pid": 4103, + "author_site": "Size Wu; Sheng Jin; Wentao Liu; Lei Bai; Chen Qian; Dong Liu; Wanli Ouyang", "author": "Size Wu; Sheng Jin; Wentao Liu; Lei Bai; Chen Qian; Dong Liu; Wanli Ouyang", "abstract": "This paper studies the task of estimating the 3D human poses of multiple persons from multiple calibrated camera views. Following the top-down paradigm, we decompose the task into two stages, i.e. 
person localization and pose estimation. Both stages are processed in coarse-to-fine manners. And we propose three task-specific graph neural networks for effective message passing. For 3D person localization, we first use Multi-view Matching Graph Module (MMG) to learn the cross-view association and recover coarse human proposals. The Center Refinement Graph Module (CRG) further refines the results via flexible point-based prediction. For 3D pose estimation, the Pose Regression Graph Module (PRG) learns both the multi-view geometry and structural relations between human joints. Our approach achieves state-of-the-art performance on CMU Panoptic and Shelf datasets with significantly lower computation complexity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Graph-Based_3D_Multi-Person_Pose_Estimation_Using_Multi-View_Images_ICCV_2021_paper.pdf", @@ -17259,14 +18428,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Graph-Based_3D_Multi-Person_Pose_Estimation_Using_Multi-View_Images_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;2;0;3", - "aff_unique_norm": "University of Science and Technology of China;University of Hong Kong;SenseTime Research;University of Sydney", + "aff_unique_norm": "University of Science and Technology of China;The University of Hong Kong;SenseTime Research;University of Sydney", "aff_unique_dep": ";;Research;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.hku.hk;https://www.sensetime.com;https://www.sydney.edu.au", "aff_unique_abbr": "USTC;HKU;SenseTime;USYD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;1;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Size and Jin,\n Sheng and Liu,\n Wentao and Bai,\n Lei and Qian,\n Chen and Liu,\n Dong and Ouyang,\n Wanli\n},\n title = {\n Graph-Based 3D Multi-Person 
Pose Estimation Using Multi-View Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11148-11157\n} \n}" }, { "title": "Graph-Based Asynchronous Event Processing for Rapid Object Recognition", @@ -17274,6 +18444,7 @@ "status": "Poster", "track": "main", "pid": 7148, + "author_site": "Yijin Li; Han Zhou; Bangbang Yang; Ye Zhang; Zhaopeng Cui; Hujun Bao; Guofeng Zhang", "author": "Yijin Li; Han Zhou; Bangbang Yang; Ye Zhang; Zhaopeng Cui; Hujun Bao; Guofeng Zhang", "abstract": "Different from traditional video cameras, event cameras capture asynchronous events stream in which each event encodes pixel location, trigger time, and the polarity of the brightness changes. In this paper, we introduce a novel graph-based framework for event cameras, namely SlideGCN. Unlike some recent graph-based methods that use groups of events as input, our approach can efficiently process data event-by-event, unlock the low latency nature of events data while still maintaining the graph's structure internally. For fast graph construction, we develop a radius search algorithm, which better exploits the partial regular structure of event cloud against k-d tree based generic methods. Experiments show that our method reduces the computational complexity up to 100 times with respect to current graph-based methods while keeping state-of-the-art performance on object recognition. Moreover, we verify the superiority of event-wise processing with our method. 
When the state becomes stable, we can give a prediction with high confidence, thus making an early recognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Graph-Based_Asynchronous_Event_Processing_for_Rapid_Object_Recognition_ICCV_2021_paper.pdf", @@ -17297,7 +18468,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yijin and Zhou,\n Han and Yang,\n Bangbang and Zhang,\n Ye and Cui,\n Zhaopeng and Bao,\n Hujun and Zhang,\n Guofeng\n},\n title = {\n Graph-Based Asynchronous Event Processing for Rapid Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 934-943\n} \n}" }, { "title": "Graph-to-3D: End-to-End Generation and Manipulation of 3D Scenes Using Scene Graphs", @@ -17305,10 +18477,11 @@ "status": "Poster", "track": "main", "pid": 1163, + "author_site": "Helisa Dhamo; Fabian Manhardt; Nassir Navab; Federico Tombari", "author": "Helisa Dhamo; Fabian Manhardt; Nassir Navab; Federico Tombari", "abstract": "Controllable scene synthesis consists of generating 3D information that satisfy underlying specifications. Thereby, these specifications should be abstract, i.e. allowing easy user interaction, whilst providing enough interface for detailed control. Scene graphs are representations of a scene, composed of objects (nodes) and inter-object relationships (edges), proven to be particularly suited for this task, as they allow for semantic control on the generated content. Previous works tackling this task often rely on synthetic data, and retrieve object meshes, which naturally limits the generation capabilities. 
To circumvent this issue, we instead propose the first work that directly generates shapes from a scene graph in an end-to-end manner. In addition, we show that the same model supports scene modification, using the respective scene graph as interface. Leveraging Graph Convolutional Networks (GCN) we train a variational Auto-Encoder on top of the object and edge categories, as well as 3D shapes and scene layouts, allowing latter sampling of new scenes and shapes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dhamo_Graph-to-3D_End-to-End_Generation_and_Manipulation_of_3D_Scenes_Using_Scene_ICCV_2021_paper.pdf", - "aff": "Technische Universit \u00a8at M \u00a8unchen; Google; Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen+Google", + "aff": "Technische Universit ¨at M ¨unchen; Google; Technische Universit ¨at M ¨unchen; Technische Universit ¨at M ¨unchen+Google", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Dhamo_Graph-to-3D_End-to-End_Generation_ICCV_2021_supplemental.pdf", @@ -17321,14 +18494,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dhamo_Graph-to-3D_End-to-End_Generation_and_Manipulation_of_3D_Scenes_Using_Scene_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0+1", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Google", - "aff_unique_dep": ";Google", + "aff_unique_norm": "Technische Universität München;Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Dhamo_2021_ICCV,\n \n author = {\n Dhamo,\n Helisa and Manhardt,\n Fabian and Navab,\n Nassir and Tombari,\n Federico\n},\n title = 
{\n Graph-to-3D: End-to-End Generation and Manipulation of 3D Scenes Using Scene Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16352-16361\n} \n}" }, { "title": "GraphFPN: Graph Feature Pyramid Network for Object Detection", @@ -17336,6 +18510,7 @@ "status": "Poster", "track": "main", "pid": 2032, + "author_site": "Gangming Zhao; Weifeng Ge; Yizhou Yu", "author": "Gangming Zhao; Weifeng Ge; Yizhou Yu", "abstract": "Feature pyramids have been proven powerful in image understanding tasks that require multi-scale features. Stateof-the-art methods for multi-scale feature learning focus on performing feature interactions across space and scales using neural networks with a fixed topology. In this paper, we propose graph feature pyramid networks that are capable of adapting their topological structures to varying intrinsic image structures, and supporting simultaneous feature interactions across all scales. We first define an image specific superpixel hierarchy for each input image to represent its intrinsic image structures. The graph feature pyramid network inherits its structure from this superpixel hierarchy. Contextual and hierarchical layers are designed to achieve feature interactions within the same scale and across different scales, respectively. To make these layers more powerful, we introduce two types of local channel attention for graph neural networks by generalizing global channel attention for convolutional neural networks. The proposed graph feature pyramid network can enhance the multiscale features from a convolutional feature pyramid network. We evaluate our graph feature pyramid network in the object detection task by integrating it into the Faster RCNN algorithm. 
The modified algorithm not only outperforms previous state-of-the-art feature pyramid based methods with a clear margin but also outperforms other popular detection methods on both MS-COCO 2017 validation and test datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_GraphFPN_Graph_Feature_Pyramid_Network_for_Object_Detection_ICCV_2021_paper.pdf", @@ -17352,14 +18527,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_GraphFPN_Graph_Feature_Pyramid_Network_for_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;0+1;2", - "aff_unique_norm": "Fudan University;Shanghai Key Lab of Intelligent Information Processing;University of Hong Kong", + "aff_unique_norm": "Fudan University;Shanghai Key Lab of Intelligent Information Processing;The University of Hong Kong", "aff_unique_dep": "School of Computer Science;Intelligent Information Processing;Department of Computer Science", "aff_unique_url": "https://www.fudan.edu.cn;;https://www.hku.hk", "aff_unique_abbr": "Fudan;;HKU", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Gangming and Ge,\n Weifeng and Yu,\n Yizhou\n},\n title = {\n GraphFPN: Graph Feature Pyramid Network for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2763-2772\n} \n}" }, { "title": "Graspness Discovery in Clutters for Fast and Accurate Grasp Detection", @@ -17367,6 +18543,7 @@ "status": "Poster", "track": "main", "pid": 8339, + "author_site": "Chenxi Wang; Hao-Shu Fang; Minghao Gou; Hongjie Fang; Jin Gao; Cewu Lu", "author": "Chenxi Wang; Hao-Shu Fang; Minghao Gou; Hongjie Fang; Jin Gao; Cewu Lu", "abstract": "Efficient and robust grasp pose 
detection is vital for robotic manipulation. For general 6 DoF grasping, conventional methods treat all points in a scene equally and usually adopt uniform sampling to select grasp candidates. However, we discover that ignoring where to grasp greatly harms the speed and accuracy of current grasp pose detection methods. In this paper, we propose \"graspness\", a quality based on geometry cues that distinguishes graspable area in cluttered scenes. A look-ahead searching method is proposed for measuring the graspness and statistical results justify the rationality of our method. To quickly detect graspness in practice, we develop a neural network named graspness model to approximate the searching process. Extensive experiments verify the stability, generality and effectiveness of our graspness model, allowing it to be used as a plug-and-play module for different methods. A large improvement in accuracy is witnessed for various previous methods after equipping our graspness model. Moreover, we develop GSNet, an end-to-end network that incorporates our graspness model for early filtering of low-quality predictions. Experiments on a large-scale benchmark, GraspNet-1Billion, show that our method outperforms previous arts by a large margin (30+ AP) and achieves a high inference speed. 
Our code and model will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Graspness_Discovery_in_Clutters_for_Fast_and_Accurate_Grasp_Detection_ICCV_2021_paper.pdf", @@ -17390,7 +18567,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0+0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Chenxi and Fang,\n Hao-Shu and Gou,\n Minghao and Fang,\n Hongjie and Gao,\n Jin and Lu,\n Cewu\n},\n title = {\n Graspness Discovery in Clutters for Fast and Accurate Grasp Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15964-15973\n} \n}" }, { "title": "Gravity-Aware Monocular 3D Human-Object Reconstruction", @@ -17398,6 +18576,7 @@ "status": "Poster", "track": "main", "pid": 1025, + "author_site": "Rishabh Dabral; Soshi Shimada; Arjun Jain; Christian Theobalt; Vladislav Golyanik", "author": "Rishabh Dabral; Soshi Shimada; Arjun Jain; Christian Theobalt; Vladislav Golyanik", "abstract": "This paper proposes GraviCap, i.e., a new approach for joint markerless 3D human motion capture and object trajectory estimation from monocular RGB videos. We focus on scenes with objects partially observed during a free flight. In contrast to existing monocular methods, we can recover scale, object trajectories as well as human bone lengths in meters and the ground plane's orientation, thanks to the awareness of the gravity constraining object motions. Our objective function is parametrised by the object's initial velocity and position, gravity direction and focal length, and jointly optimised for one or several free flight episodes. 
The proposed human-object interaction constraints ensure geometric consistency of the 3D reconstructions and improved physical plausibility of human poses compared to the unconstrained case. We evaluate GraviCap on a new dataset with ground-truth annotations for persons and different objects undergoing free flights. In the experiments, our approach achieves state-of-the-art accuracy in 3D human motion capture on various metrics. We urge the reader to watch our supplementary video. Both the source code and the dataset are released; see http://4dqv.mpi-inf.mpg.de/GraviCap/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dabral_Gravity-Aware_Monocular_3D_Human-Object_Reconstruction_ICCV_2021_paper.pdf", @@ -17415,13 +18594,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dabral_Gravity-Aware_Monocular_3D_Human-Object_Reconstruction_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "Indian Institute of Technology Bombay;Max Planck Institute for Informatics;Indian Institute of Science;Fast Code AI", - "aff_unique_dep": ";SIC;;", + "aff_unique_dep": ";MPI for Informatics;;", "aff_unique_url": "https://www.iitb.ac.in;https://www.mpi-inf.mpg.de;https://www.iisc.ac.in;https://www.fastcode.ai", "aff_unique_abbr": "IITB;MPII;IISc;Fast Code AI", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Mumbai;;Bangalore", "aff_country_unique_index": "0;1;0;2;1", - "aff_country_unique": "India;Germany;United States" + "aff_country_unique": "India;Germany;United States", + "bibtex": "@InProceedings{Dabral_2021_ICCV,\n \n author = {\n Dabral,\n Rishabh and Shimada,\n Soshi and Jain,\n Arjun and Theobalt,\n Christian and Golyanik,\n Vladislav\n},\n title = {\n Gravity-Aware Monocular 3D Human-Object Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12365-12374\n} \n}" }, { "title": "Greedy 
Gradient Ensemble for Robust Visual Question Answering", @@ -17429,6 +18609,7 @@ "status": "Poster", "track": "main", "pid": 3100, + "author_site": "Xinzhe Han; Shuhui Wang; Chi Su; Qingming Huang; Qi Tian", "author": "Xinzhe Han; Shuhui Wang; Chi Su; Qingming Huang; Qi Tian", "abstract": "Language bias is a critical issue in Visual Question Answering (VQA), where models often exploit dataset biases for the final decision without considering the image information. As a result, they suffer from performance drop on out-of-distribution data and inadequate visual explanation. Based on experimental analysis for existing robust VQA methods, we stress the language bias in VQA that comes from two aspects, i.e., distribution bias and shortcut bias. We further propose a new de-bias framework, Greedy Gradient Ensemble (GGE), which combines multiple biased models for unbiased base model learning. With the greedy strategy, GGE forces the biased models to over-fit the biased data distribution in priority, thus makes the base model pay more attention to examples that are hard to solve by biased models. 
The experiments demonstrate that our method makes better use of visual information and achieves state-of-the-art performance on diagnosing dataset VQA-CP without using extra annotations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Han_Greedy_Gradient_Ensemble_for_Robust_Visual_Question_Answering_ICCV_2021_paper.pdf", @@ -17445,14 +18626,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Han_Greedy_Gradient_Ensemble_for_Robust_Visual_Question_Answering_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;2;0+1+3;4", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Kingsoft Cloud;Pengcheng Laboratory;Huawei", - "aff_unique_dep": "Institute of Computing Technology;;;Peng Cheng Laboratory;Cloud BU", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Kingsoft Cloud;Peng Cheng Laboratory;Huawei Technologies", + "aff_unique_dep": "Institute of Computing Technology;;;;Cloud BU", "aff_unique_url": "http://www.ict.cas.cn;http://www.ucas.ac.cn;https://www.ksyun.com;;https://www.huawei.com", "aff_unique_abbr": "CAS;UCAS;KSC;;Huawei", "aff_campus_unique_index": "0+0;0;0;0+0+1;1", "aff_campus_unique": "Beijing;Shenzhen", "aff_country_unique_index": "0+0;0;0;0+0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2021_ICCV,\n \n author = {\n Han,\n Xinzhe and Wang,\n Shuhui and Su,\n Chi and Huang,\n Qingming and Tian,\n Qi\n},\n title = {\n Greedy Gradient Ensemble for Robust Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1584-1593\n} \n}" }, { "title": "GridToPix: Training Embodied Agents With Minimal Supervision", @@ -17460,6 +18642,7 @@ "status": "Poster", "track": "main", "pid": 6640, + "author_site": "Unnat Jain; Iou-Jen Liu; Svetlana Lazebnik; 
Aniruddha Kembhavi; Luca Weihs; Alexander G. Schwing", "author": "Unnat Jain; Iou-Jen Liu; Svetlana Lazebnik; Aniruddha Kembhavi; Luca Weihs; Alexander G. Schwing", "abstract": "While deep reinforcement learning (RL) promises freedom from hand-labeled data, great successes, especially for Embodied AI, require significant work to create supervision via carefully shaped rewards. Indeed, without shaped rewards, i.e., with only terminal rewards, present-day Embodied AI results degrade significantly across Embodied AI problems from single-agent Habitat-based PointGoal Navigation (SPL drops from 55 to 0) and two-agent AI2-THOR-based Furniture Moving (success drops from 58% to 1%) to three-agent Google Football-based 3 vs. 1 with Keeper (game score drops from 0.6 to 0.1). As training from shaped rewards doesn't scale to more realistic tasks, the community needs to improve the success of training with terminal rewards. For this we propose GridToPix: 1) train agents with terminal rewards in gridworlds that generically mirror Embodied AI environments, i.e., they are independent of the task; 2) distill the learned policy into agents that reside in complex visual worlds. Despite learning from only terminal rewards with identical models and RL algorithms, GridToPix significantly improves results across tasks: from PointGoal Navigation (SPL improves from 0 to 64) and Furniture Moving (success improves from 1% to 25%) to football gameplay (game score improves from 0.1 to 0.6). 
GridToPix even helps to improve the results of shaped reward training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jain_GridToPix_Training_Embodied_Agents_With_Minimal_Supervision_ICCV_2021_paper.pdf", @@ -17476,14 +18659,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jain_GridToPix_Training_Embodied_Agents_With_Minimal_Supervision_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Allen Institute for AI", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Allen Institute for AI", "aff_unique_dep": ";PRIOR", "aff_unique_url": "https://illinois.edu;https://allenai.org", "aff_unique_abbr": "UIUC;AI2", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jain_2021_ICCV,\n \n author = {\n Jain,\n Unnat and Liu,\n Iou-Jen and Lazebnik,\n Svetlana and Kembhavi,\n Aniruddha and Weihs,\n Luca and Schwing,\n Alexander G.\n},\n title = {\n GridToPix: Training Embodied Agents With Minimal Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15141-15151\n} \n}" }, { "title": "Ground-Truth or DAER: Selective Re-Query of Secondary Information", @@ -17491,6 +18675,7 @@ "status": "Poster", "track": "main", "pid": 3963, + "author_site": "Stephan J. Lemmer; Jason J. Corso", "author": "Stephan J. Lemmer; Jason J. Corso", "abstract": "Many vision tasks use secondary information at inference time---a seed---to assist a computer vision model in solving a problem. For example, an initial bounding box is needed to initialize visual object tracking. To date, all such work makes the assumption that the seed is a good one. 
However, in practice, from crowdsourcing to noisy automated seeds, this is often not the case. We hence propose the problem of seed rejection---determining whether to reject a seed based on the expected performance degradation when it is provided in place of a gold-standard seed. We provide a formal definition to this problem, and focus on two meaningful subgoals: understanding causes of error and understanding the model's response to noisy seeds conditioned on the primary input. With these goals in mind, we propose a novel training method and evaluation metrics for the seed rejection problem. We then use seeded versions of the viewpoint estimation and fine-grained classification tasks to evaluate these contributions. In these experiments, we show our method can reduce the number of seeds that need to be reviewed for a target performance by over 23% compared to strong baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lemmer_Ground-Truth_or_DAER_Selective_Re-Query_of_Secondary_Information_ICCV_2021_paper.pdf", @@ -17514,7 +18699,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lemmer_2021_ICCV,\n \n author = {\n Lemmer,\n Stephan J. 
and Corso,\n Jason J.\n},\n title = {\n Ground-Truth or DAER: Selective Re-Query of Secondary Information\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 703-714\n} \n}" }, { "title": "Grounding Consistency: Distilling Spatial Common Sense for Precise Visual Relationship Detection", @@ -17522,6 +18708,7 @@ "status": "Poster", "track": "main", "pid": 4306, + "author_site": "Markos Diomataris; Nikolaos Gkanatsios; Vassilis Pitsikalis; Petros Maragos", "author": "Markos Diomataris; Nikolaos Gkanatsios; Vassilis Pitsikalis; Petros Maragos", "abstract": "Scene Graph Generators (SGGs) are models that, given an image, build a directed graph where each edge represents a predicted subject predicate object triplet. Most SGGs silently exploit datasets' bias on relationships' context, i.e. its subject and object, to improve recall and neglect spatial and visual evidence, e.g. having seen a glut of data for person wearing shirt, they are overconfident that every person is wearing every shirt. Such imprecise predictions are mainly ascribed to the lack of negative examples for most relationships, fact that obstructs models from meaningfully learning predicates, even those which have ample positive examples. We first present an in-depth investigation of the context bias issue to showcase that all examined state-of-the-art SGGs share the above vulnerabilities. In response, we propose a semi-supervised scheme that forces predicted triplets to be grounded consistently back to the image, in a closed-loop manner. The developed spatial common sense can be then distilled to a student SGG and substantially enhance its spatial reasoning ability. This Grounding Consistency Distillation (GCD) approach is model-agnostic and profits from the superfluous unlabeled samples to retain the valuable context information and avert memorization of annotations. 
Furthermore, we ascertain that current metrics disregard unlabeled samples, rendering themselves incapable of reflecting context bias, then we mine and incorporate during evaluation hard-negatives to reformulate precision as a reliable metric. Extensive experimental comparisons exhibit large quantitative - up to 70% relative precision boost on VG200 dataset - and qualitative improvements to prove the significance of our GCD method and our metrics towards refocusing graph generation as a core aspect of scene understanding. Code available at https://github.com/deeplab-ai/grounding-consistent-vrd.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Diomataris_Grounding_Consistency_Distilling_Spatial_Common_Sense_for_Precise_Visual_Relationship_ICCV_2021_paper.pdf", @@ -17545,7 +18732,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+0;0+1;1", - "aff_country_unique": "United States;Greece" + "aff_country_unique": "United States;Greece", + "bibtex": "@InProceedings{Diomataris_2021_ICCV,\n \n author = {\n Diomataris,\n Markos and Gkanatsios,\n Nikolaos and Pitsikalis,\n Vassilis and Maragos,\n Petros\n},\n title = {\n Grounding Consistency: Distilling Spatial Common Sense for Precise Visual Relationship Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15911-15920\n} \n}" }, { "title": "Group-Aware Contrastive Regression for Action Quality Assessment", @@ -17553,6 +18741,7 @@ "status": "Poster", "track": "main", "pid": 2273, + "author_site": "Xumin Yu; Yongming Rao; Wenliang Zhao; Jiwen Lu; Jie Zhou", "author": "Xumin Yu; Yongming Rao; Wenliang Zhao; Jiwen Lu; Jie Zhou", "abstract": "Assessing action quality is challenging due to the subtle differences between videos and large variations in scores. 
Most existing approaches tackle this problem by regressing a quality score from a single video, suffering a lot from the large inter-video score variations. In this paper, we show that the relations among videos can provide important clues for more accurate action quality assessment during both training and inference. Specifically, we reformulate the problem of action quality assessment as regressing the relative scores with reference to another video that has shared attributes (e.g. category and difficulty), instead of learning unreferenced scores. Following this formulation, we propose a new contrastive regression (CoRe) framework to learn the relative scores by pair-wise comparison, which highlights the differences between videos and guides the models to learn the key hints for assessment. In order to further exploit the relative information between two videos, we devise a group-aware regression tree to convert the conventional score regression into two easier sub-problems: coarse-to-fine classification and regression in small intervals. To demonstrate the effectiveness of CoRe, we conduct extensive experiments on three mainstream AQA datasets including AQA-7, MTL-AQA, and JIGSAWS. 
Our approaches outperform previous methods by a large margin and establish new state-of-the-art on all three benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Group-Aware_Contrastive_Regression_for_Action_Quality_Assessment_ICCV_2021_paper.pdf", @@ -17576,7 +18765,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Xumin and Rao,\n Yongming and Zhao,\n Wenliang and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Group-Aware Contrastive Regression for Action Quality Assessment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7919-7928\n} \n}" }, { "title": "Group-Free 3D Object Detection via Transformers", @@ -17584,6 +18774,7 @@ "status": "Poster", "track": "main", "pid": 8549, + "author_site": "Ze Liu; Zheng Zhang; Yue Cao; Han Hu; Xin Tong", "author": "Ze Liu; Zheng Zhang; Yue Cao; Han Hu; Xin Tong", "abstract": "Recently, directly detecting 3D objects from 3D point clouds has received increasing attention. To extract object representation from an irregular point cloud, existing methods usually take a point grouping step to assign the points to an object candidate so that a PointNet-like network could be used to derive object features from the grouped points. However, the inaccurate point assignments caused by the hand-crafted grouping scheme decrease the performance of 3D object detection. In this paper, we present a simple yet effective method for directly detecting 3D objects from the 3D point cloud. 
Instead of grouping local points to each object candidate, our method computes the feature of an object from all the points in the point cloud with the help of an attention mechanism in the Transformers, where the contribution of each point is automatically learned in the network training. With an improved attention stacking scheme, our method fuses object features in different stages and generates more accurate object detection results. With few bells and whistles, the proposed method achieves state-of-the-art 3D object detection performance on two widely used benchmarks, ScanNet V2 and SUN RGB-D.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Group-Free_3D_Object_Detection_via_Transformers_ICCV_2021_paper.pdf", @@ -17600,14 +18791,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Group-Free_3D_Object_Detection_via_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;1;1;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "USTC;MSR Asia", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Ze and Zhang,\n Zheng and Cao,\n Yue and Hu,\n Han and Tong,\n Xin\n},\n title = {\n Group-Free 3D Object Detection via Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2949-2958\n} \n}" }, { "title": "Group-Wise Inhibition Based Feature Regularization for Robust Classification", @@ -17615,6 +18807,7 @@ "status": "Poster", "track": 
"main", "pid": 6497, + "author_site": "Haozhe Liu; Haoqian Wu; Weicheng Xie; Feng Liu; Linlin Shen", "author": "Haozhe Liu; Haoqian Wu; Weicheng Xie; Feng Liu; Linlin Shen", "abstract": "The convolutional neural network (CNN) is vulnerable to degraded images with even very small variations (e.g. corrupted and adversarial samples). One of the possible reasons is that CNN pays more attention to the most discriminative regions, but ignores the auxiliary features when learning, leading to the lack of feature diversity for final judgment. In our method, we propose to dynamically suppress significant activation values of CNN by group-wise inhibition, but not fixedly or randomly handle them when training. The feature maps with different activation distribution are then processed separately to take the feature independence into account. CNN is finally guided to learn richer discriminative features hierarchically for robust classification according to the proposed regularization. Our method is comprehensively evaluated under multiple settings, including classification against corruptions, adversarial attacks and low data regime. Extensive experimental results show that the proposed method can achieve significant improvements in terms of both robustness and generalization performances, when compared with the state-of-the-art methods. Code is available at https://github. 
com/LinusWu/TENET_Training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Group-Wise_Inhibition_Based_Feature_Regularization_for_Robust_Classification_ICCV_2021_paper.pdf", @@ -17638,7 +18831,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "1;1;1+1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Haozhe and Wu,\n Haoqian and Xie,\n Weicheng and Liu,\n Feng and Shen,\n Linlin\n},\n title = {\n Group-Wise Inhibition Based Feature Regularization for Robust Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 478-486\n} \n}" }, { "title": "GroupFormer: Group Activity Recognition With Clustered Spatial-Temporal Transformer", @@ -17646,6 +18840,7 @@ "status": "Poster", "track": "main", "pid": 7253, + "author_site": "Shuaicheng Li; Qianggang Cao; Lingbo Liu; Kunlin Yang; Shinan Liu; Jun Hou; Shuai Yi", "author": "Shuaicheng Li; Qianggang Cao; Lingbo Liu; Kunlin Yang; Shinan Liu; Jun Hou; Shuai Yi", "abstract": "Group activity recognition is a crucial yet challenging problem, whose core lies in fully exploring spatial-temporal interactions among individuals and generating reasonable group representations. However, previous methods either model spatial and temporal information separately, or directly aggregate individual features to form group features. To address these issues, we propose a novel group activity recognition network termed GroupFormer. It captures spatial-temporal contextual information jointly to augment the individual and group representations effectively with a clustered spatial-temporal transformer. 
Specifically, our GroupFormer has three appealing advantages: (1) A tailor-modified Transformer, Clustered Spatial-Temporal Transformer, is proposed to enhance the individual and group representation. (2) It models the spatial and temporal dependencies integrally and utilizes decoders to build the bridge between the spatial and temporal information. (3) A clustered attention mechanism is utilized to dynamically divide individuals into multiple clusters for better learning activity-aware semantic representations. Moreover, experimental results show that the proposed framework outperforms state-of-the-art methods on the Volleyball dataset and Collective Activity dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_GroupFormer_Group_Activity_Recognition_With_Clustered_Spatial-Temporal_Transformer_ICCV_2021_paper.pdf", @@ -17662,14 +18857,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_GroupFormer_Group_Activity_Recognition_With_Clustered_Spatial-Temporal_Transformer_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0;0;0", - "aff_unique_norm": "SenseTime;Hong Kong Polytechnic University", + "aff_unique_norm": "Sensetime;The Hong Kong Polytechnic University", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.sensetime.com/;https://www.polyu.edu.hk", "aff_unique_abbr": "SenseTime;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Shuaicheng and Cao,\n Qianggang and Liu,\n Lingbo and Yang,\n Kunlin and Liu,\n Shinan and Hou,\n Jun and Yi,\n Shuai\n},\n title = {\n GroupFormer: Group Activity Recognition With Clustered Spatial-Temporal Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n 
pages = {\n 13668-13677\n} \n}" }, { "title": "Guided Point Contrastive Learning for Semi-Supervised Point Cloud Semantic Segmentation", @@ -17677,6 +18873,7 @@ "status": "Poster", "track": "main", "pid": 4329, + "author_site": "Li Jiang; Shaoshuai Shi; Zhuotao Tian; Xin Lai; Shu Liu; Chi-Wing Fu; Jiaya Jia", "author": "Li Jiang; Shaoshuai Shi; Zhuotao Tian; Xin Lai; Shu Liu; Chi-Wing Fu; Jiaya Jia", "abstract": "Rapid progress in 3D semantic segmentation is inseparable from the advances of deep network models, which highly rely on large-scale annotated data for training. To address the high cost and challenges of 3D point-level labeling, we present a method for semi-supervised point cloud semantic segmentation to adopt unlabeled point clouds in training to boost the model performance. Inspired by the recent contrastive loss in self-supervised tasks, we propose the guided point contrastive loss to enhance the feature representation and model generalization ability in semi-supervised setting. Semantic predictions on unlabeled point clouds serve as pseudo-label guidance in our loss to avoid negative pairs in the same category. Also, we design the confidence guidance to ensure high-quality feature learning. Besides, a category-balanced sampling strategy is proposed to collect positive and negative samples to mitigate the class imbalance problem. 
Extensive experiments on three datasets (ScanNet V2, S3DIS, and SemanticKITTI) show the effectiveness of our semi-supervised method to improve the prediction quality with unlabeled data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Guided_Point_Contrastive_Learning_for_Semi-Supervised_Point_Cloud_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -17693,14 +18890,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jiang_Guided_Point_Contrastive_Learning_for_Semi-Supervised_Point_Cloud_Semantic_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Li and Shi,\n Shaoshuai and Tian,\n Zhuotao and Lai,\n Xin and Liu,\n Shu and Fu,\n Chi-Wing and Jia,\n Jiaya\n},\n title = {\n Guided Point Contrastive Learning for Semi-Supervised Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6423-6432\n} \n}" }, { "title": "GyroFlow: Gyroscope-Guided Unsupervised Optical Flow Learning", @@ -17708,6 +18906,7 @@ "status": "Poster", "track": "main", "pid": 8106, + "author_site": "Haipeng Li; Kunming Luo; Shuaicheng Liu", "author": "Haipeng Li; Kunming Luo; Shuaicheng Liu", "abstract": "Existing optical flow methods are erroneous in challenging scenes, such as fog, rain, and night because the basic optical flow assumptions such as brightness and gradient constancy are broken. 
To address this problem, we present an unsupervised learning approach that fuses gyroscope into optical flow learning. Specifically, we first convert gyroscope readings into motion fields named gyro field. Second, we design a self-guided fusion module to fuse the background motion extracted from the gyro field with the optical flow and guide the network to focus on motion details. To the best of our knowledge, this is the first deep learning-based framework that fuses gyroscope data and image content for optical flow learning. To validate our method, we propose a new dataset that covers regular and challenging scenes. Experiments show that our method outperforms the state-of-art methods in both regular and challenging scenes. Code and dataset are available at https://github.com/megvii-research/GyroFlow.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_GyroFlow_Gyroscope-Guided_Unsupervised_Optical_Flow_Learning_ICCV_2021_paper.pdf", @@ -17731,7 +18930,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Haipeng and Luo,\n Kunming and Liu,\n Shuaicheng\n},\n title = {\n GyroFlow: Gyroscope-Guided Unsupervised Optical Flow Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12869-12878\n} \n}" }, { "title": "H2O: A Benchmark for Visual Human-Human Object Handover Analysis", @@ -17739,6 +18939,7 @@ "status": "Poster", "track": "main", "pid": 5850, + "author_site": "Ruolin Ye; Wenqiang Xu; Zhendong Xue; Tutian Tang; Yanfeng Wang; Cewu Lu", "author": "Ruolin Ye; Wenqiang Xu; Zhendong Xue; Tutian Tang; Yanfeng Wang; Cewu Lu", "abstract": "Object handover is a common human collaboration behavior that attracts attention from researchers in Robotics and 
Cognitive Science. Though visual perception plays an important role in the object handover task, the whole handover process has been specifically explored. In this work, we propose a novel rich-annotated dataset, H2O, for visual analysis of human-human object handovers. The H2O, which contains 18K video clips involving 15 people who hand over 30 objects to each other, is a multi-purpose benchmark. It can support several vision-based tasks, from which, we specifically provide a baseline method, RGPNet, for a less-explored task named Receiver Grasp Prediction. Extensive experiments show that the RGPNet can produce plausible grasps based on the giver's hand-object states in the pre-handover phase. Besides, we also report the hand and object pose errors with existing baselines and show that the dataset can serve as the video demonstrations for robot imitation learning on the handover task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_H2O_A_Benchmark_for_Visual_Human-Human_Object_Handover_Analysis_ICCV_2021_paper.pdf", @@ -17753,7 +18954,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_H2O_A_Benchmark_for_Visual_Human-Human_Object_Handover_Analysis_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_H2O_A_Benchmark_for_Visual_Human-Human_Object_Handover_Analysis_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Ruolin and Xu,\n Wenqiang and Xue,\n Zhendong and Tang,\n Tutian and Wang,\n Yanfeng and Lu,\n Cewu\n},\n title = {\n H2O: A Benchmark for Visual Human-Human Object Handover Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15762-15771\n} \n}" }, { "title": "H2O: Two Hands Manipulating Objects for First Person Interaction Recognition", @@ -17761,10 +18963,11 @@ 
"status": "Poster", "track": "main", "pid": 6178, - "author": "Taein Kwon; Bugra Tekin; Jan St\u00fchmer; Federica Bogo; Marc Pollefeys", + "author_site": "Taein Kwon; Bugra Tekin; Jan Stühmer; Federica Bogo; Marc Pollefeys", + "author": "Taein Kwon; Bugra Tekin; Jan Stühmer; Federica Bogo; Marc Pollefeys", "abstract": "We present a comprehensive framework for egocentric interaction recognition using markerless 3D annotations of two hands manipulating objects. To this end, we propose a method to create a unified dataset for egocentric 3D interaction recognition. Our method produces annotations of the 3D pose of two hands and the 6D pose of the manipulated objects, along with their interaction labels for each frame. Our dataset, called H2O (2 Hands and Objects), provides synchronized multi-view RGB-D images, interaction labels, object classes, ground-truth 3D poses for left & right hands, 6D object poses, ground-truth camera poses, object meshes and scene point clouds. To the best of our knowledge, this is the first benchmark that enables the study of first-person actions with the use of the pose of both left and right hands manipulating objects and presents an unprecedented level of detail for egocentric 3D interaction recognition. We further propose the method to predict interaction classes by estimating the 3D pose of two hands and the 6D pose of the manipulated objects, jointly from RGB images. Our method models both inter- and intra-dependencies between both hands and objects by learning the topology of a graph convolutional network that predicts interactions. 
We show that our method facilitated by this dataset establishes a strong baseline for joint hand-object pose estimation and achieves state-of-the-art accuracy for first person interaction recognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kwon_H2O_Two_Hands_Manipulating_Objects_for_First_Person_Interaction_Recognition_ICCV_2021_paper.pdf", - "aff": "ETH Z \u00a8urich; Microsoft; Samsung AI Center, Cambridge; Microsoft; ETH Z \u00a8urich+Microsoft", + "aff": "ETH Zürich; Microsoft; Samsung AI Center, Cambridge; Microsoft; ETH Zürich+Microsoft", "project": "https://www.taeinkwon.com/projects/h2o", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Kwon_H2O_Two_Hands_ICCV_2021_supplemental.pdf", @@ -17777,14 +18980,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kwon_H2O_Two_Hands_Manipulating_Objects_for_First_Person_Interaction_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;1;0+1", - "aff_unique_norm": "ETH Zurich;Microsoft;Samsung", - "aff_unique_dep": ";Microsoft Corporation;AI Center", + "aff_unique_norm": "ETH Zürich;Microsoft Corporation;Samsung AI Center", + "aff_unique_dep": ";;AI Center", "aff_unique_url": "https://www.ethz.ch;https://www.microsoft.com;https://www.samsung.com/global/research-innovation/ai-research-centers/samsung-ai-center-cambridge/", "aff_unique_abbr": "ETHZ;Microsoft;SAC", "aff_campus_unique_index": "0;2;0", - "aff_campus_unique": "Z\u00fcrich;;Cambridge", + "aff_campus_unique": "Zürich;;Cambridge", "aff_country_unique_index": "0;1;2;1;0+1", - "aff_country_unique": "Switzerland;United States;United Kingdom" + "aff_country_unique": "Switzerland;United States;United Kingdom", + "bibtex": "@InProceedings{Kwon_2021_ICCV,\n \n author = {\n Kwon,\n Taein and Tekin,\n Bugra and St\\\"uhmer,\n Jan and Bogo,\n Federica and Pollefeys,\n Marc\n},\n title = {\n H2O: Two Hands Manipulating Objects for First Person Interaction Recognition\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10138-10148\n} \n}" }, { "title": "H3D-Net: Few-Shot High-Fidelity 3D Head Reconstruction", @@ -17792,7 +18996,8 @@ "status": "Poster", "track": "main", "pid": 6166, - "author": "Eduard Ramon; Gil Triginer; Janna Escur; Albert Pumarola; Jaime Garcia; Xavier Gir\u00f3-i-Nieto; Francesc Moreno-Noguer", + "author_site": "Eduard Ramon; Gil Triginer; Janna Escur; Albert Pumarola; Jaime Garcia; Xavier Giró-i-Nieto; Francesc Moreno-Noguer", + "author": "Eduard Ramon; Gil Triginer; Janna Escur; Albert Pumarola; Jaime Garcia; Xavier Giró-i-Nieto; Francesc Moreno-Noguer", "abstract": "Recent learning approaches that implicitly represent surface geometry using coordinate-based neural representations have shown impressive results in the problem of multi-view 3D reconstruction. The effectiveness of these techniques is, however, subject to the availability of a large number (several tens) of input views of the scene, and computationally demanding optimizations. In this paper, we tackle these limitations for the specific problem of few-shot full 3D head reconstruction, by endowing coordinate-based representations with a probabilistic shape prior that enables faster convergence and better generalization when using few input images (down to three). First, we learn a shape model of 3D heads from thousands of incomplete raw scans using implicit representations. At test time, we jointly overfit two coordinate-based neural networks to the scene, one modeling the geometry and another estimating the surface radiance, using implicit differentiable rendering. We devise a two-stage optimization strategy in which the learned prior is used to initialize and constrain the geometry during an initial optimization phase. Then, the prior is unfrozen and fine-tuned to the scene. 
By doing this, we achieve high-fidelity head reconstructions, including hair and shoulders, and with a high level of detail that consistently outperforms both state-of-the-art 3D Morphable Models methods in the few-shot scenario, and non-parametric methods when large sets of views are available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ramon_H3D-Net_Few-Shot_High-Fidelity_3D_Head_Reconstruction_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -17806,7 +19011,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ramon_H3D-Net_Few-Shot_High-Fidelity_3D_Head_Reconstruction_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ramon_H3D-Net_Few-Shot_High-Fidelity_3D_Head_Reconstruction_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ramon_2021_ICCV,\n \n author = {\n Ramon,\n Eduard and Triginer,\n Gil and Escur,\n Janna and Pumarola,\n Albert and Garcia,\n Jaime and Gir\\'o-i-Nieto,\n Xavier and Moreno-Noguer,\n Francesc\n},\n title = {\n H3D-Net: Few-Shot High-Fidelity 3D Head Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5620-5629\n} \n}" }, { "title": "HAA500: Human-Centric Atomic Action Dataset With Curated Videos", @@ -17814,6 +19020,7 @@ "status": "Poster", "track": "main", "pid": 3147, + "author_site": "Jihoon Chung; Cheng-hsin Wuu; Hsuan-ru Yang; Yu-Wing Tai; Chi-Keung Tang", "author": "Jihoon Chung; Cheng-hsin Wuu; Hsuan-ru Yang; Yu-Wing Tai; Chi-Keung Tang", "abstract": "We contribute HAA500, a manually annotated human-centric atomic action dataset for action recognition on 500 classes with over 591K labeled frames. 
To minimize ambiguities in action classification, HAA500 consists of highly diversified classes of fine-grained atomic actions, where only consistent actions fall under the same label, e.g., \"\"Baseball Pitching\"\" vs \"\"Free Throw in Basketball\"\". Thus HAA500 is different from existing atomic action datasets, where coarse-grained atomic actions were labeled with coarse action-verbs such as \"\"Throw\"\". HAA500 has been carefully curated to capture the precise movement of human figures with little class-irrelevant motions or spatio-temporal label noises. The advantages of HAA500 are fourfold: 1) human-centric actions with a high average of 69.7% detectable joints for the relevant human poses; 2) high scalability since adding a new class can be done under 20-60 minutes; 3) curated videos capturing essential elements of an atomic action without irrelevant frames; 4) fine-grained atomic action classes. Our extensive experiments including cross-data validation using datasets collected in the wild demonstrate the clear benefits of human-centric and atomic characteristics of HAA500, which enable training even a baseline deep learning model to improve prediction by attending to atomic human poses. 
We detail the HAA500 dataset statistics and collection methodology and compare quantitatively with existing action recognition datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chung_HAA500_Human-Centric_Atomic_Action_Dataset_With_Curated_Videos_ICCV_2021_paper.pdf", @@ -17837,7 +19044,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+1;0+1;0;0+0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chung_2021_ICCV,\n \n author = {\n Chung,\n Jihoon and Wuu,\n Cheng-hsin and Yang,\n Hsuan-ru and Tai,\n Yu-Wing and Tang,\n Chi-Keung\n},\n title = {\n HAA500: Human-Centric Atomic Action Dataset With Curated Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13465-13474\n} \n}" }, { "title": "HAIR: Hierarchical Visual-Semantic Relational Reasoning for Video Question Answering", @@ -17845,6 +19053,7 @@ "status": "Poster", "track": "main", "pid": 6160, + "author_site": "Fei Liu; Jing Liu; Weining Wang; Hanqing Lu", "author": "Fei Liu; Jing Liu; Weining Wang; Hanqing Lu", "abstract": "Relational reasoning is at the heart of video question answering. However, existing approaches suffer from several common limitations: (1) they only focus on either object-level or frame-level relational reasoning, and fail to integrate the both; and (2) they neglect to leverage semantic knowledge for relational reasoning. In this work, we propose a Hierarchical VisuAl-Semantic RelatIonal Reasoning (HAIR) framework to address these limitations. 
Specifically, we present a novel graph memory mechanism to perform relational reasoning, and further develop two types of graph memory: a) visual graph memory that leverages visual information of video for relational reasoning; b) semantic graph memory that is specifically designed to explicitly leverage semantic knowledge contained in the classes and attributes of video objects, and perform relational reasoning in the semantic space. Taking advantage of both graph memory mechanisms, we build a hierarchical framework to enable visual-semantic relational reasoning from object level to frame level. Experiments on four challenging benchmark datasets show that the proposed framework leads to state-of-the-art performance, with fewer parameters and faster inference speed. Besides, our approach also shows superior performance on other video+language task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_HAIR_Hierarchical_Visual-Semantic_Relational_Reasoning_for_Video_Question_Answering_ICCV_2021_paper.pdf", @@ -17868,7 +19077,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Fei and Liu,\n Jing and Wang,\n Weining and Lu,\n Hanqing\n},\n title = {\n HAIR: Hierarchical Visual-Semantic Relational Reasoning for Video Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1698-1707\n} \n}" }, { "title": "HDR Video Reconstruction: A Coarse-To-Fine Network and a Real-World Benchmark Dataset", @@ -17876,6 +19086,7 @@ "status": "Poster", "track": "main", "pid": 2939, + "author_site": "Guanying Chen; Chaofeng Chen; Shi Guo; Zhetong Liang; Kwan-Yee K. Wong; Lei Zhang", "author": "Guanying Chen; Chaofeng Chen; Shi Guo; Zhetong Liang; Kwan-Yee K. 
Wong; Lei Zhang", "abstract": "High dynamic range (HDR) video reconstruction from sequences captured with alternating exposures is a very challenging problem. Existing methods often align low dynamic range (LDR) input sequence in the image space using optical flow, and then merge the aligned images to produce HDR output. However, accurate alignment and fusion in the image space are difficult due to the missing details in the over-exposed regions and noise in the under-exposed regions, resulting in unpleasing ghosting artifacts. To enable more accurate alignment and HDR fusion, we introduce a coarse-to-fine deep learning framework for HDR video reconstruction. Firstly, we perform coarse alignment and pixel blending in the image space to estimate the coarse HDR video. Secondly, we conduct more sophisticated alignment and temporal fusion in the feature space of the coarse HDR video to produce better reconstruction. Considering the fact that there is no publicly available dataset for quantitative and comprehensive evaluation of HDR video reconstruction methods, we collect such a benchmark dataset, which contains 97 sequences of static scenes and 184 testing pairs of dynamic scenes. Extensive experiments show that our method outperforms previous state-of-the-art methods. 
Our dataset, code and model will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_HDR_Video_Reconstruction_A_Coarse-To-Fine_Network_and_a_Real-World_Benchmark_ICCV_2021_paper.pdf", @@ -17890,7 +19101,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_HDR_Video_Reconstruction_A_Coarse-To-Fine_Network_and_a_Real-World_Benchmark_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_HDR_Video_Reconstruction_A_Coarse-To-Fine_Network_and_a_Real-World_Benchmark_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Guanying and Chen,\n Chaofeng and Guo,\n Shi and Liang,\n Zhetong and Wong,\n Kwan-Yee K. and Zhang,\n Lei\n},\n title = {\n HDR Video Reconstruction: A Coarse-To-Fine Network and a Real-World Benchmark Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2502-2511\n} \n}" }, { "title": "HIRE-SNN: Harnessing the Inherent Robustness of Energy-Efficient Deep Spiking Neural Networks by Training With Crafted Input Noise", @@ -17898,6 +19110,7 @@ "status": "Poster", "track": "main", "pid": 3781, + "author_site": "Souvik Kundu; Massoud Pedram; Peter A. Beerel", "author": "Souvik Kundu; Massoud Pedram; Peter A. Beerel", "abstract": "Low-latency deep spiking neural networks (SNNs) have become a promising alternative to conventional artificial neural networks (ANNs) because of their potential for increased energy efficiency on event-driven neuromorphic hardware. Neural networks, including SNNs, however, are subject to various adversarial attacks and must be trained to remain resilient against such attacks for many applications. 
Nevertheless, due to prohibitively high training costs associated with SNNs, analysis, and optimization of deep SNNs under various adversarial attacks have been largely overlooked. In this paper, we first present a detailed analysis of the inherent robustness of low-latency SNNs against popular gradient-based attacks, namely fast gradient sign method (FGSM) and projected gradient descent (PGD). Motivated by this analysis, to harness the model robustness against these attacks we present an SNN training algorithm that uses crafted input noise and incurs no additional training time. To evaluate the merits of our algorithm, we conducted extensive experiments with variants of VGG and ResNet on both CIFAR-10 and CIFAR-100 datasets. Compared to standard trained direct input SNNs, our trained models yield improved classification accuracy of up to 13.7% and 10.1% on FGSM and PGD attack-generated images, respectively, with negligible loss in clean image accuracy. Our models also outperform inherently-robust SNNs trained on rate-coded inputs with improved or similar classification performance on attack-generated im-ages while having up to 25x and 4.6x lower latency and computation energy, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kundu_HIRE-SNN_Harnessing_the_Inherent_Robustness_of_Energy-Efficient_Deep_Spiking_Neural_ICCV_2021_paper.pdf", @@ -17921,7 +19134,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kundu_2021_ICCV,\n \n author = {\n Kundu,\n Souvik and Pedram,\n Massoud and Beerel,\n Peter A.\n},\n title = {\n HIRE-SNN: Harnessing the Inherent Robustness of Energy-Efficient Deep Spiking Neural Networks by Training With Crafted Input Noise\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2021\n},\n pages = {\n 5209-5218\n} \n}" }, { "title": "HPNet: Deep Primitive Segmentation Using Hybrid Representations", @@ -17929,6 +19143,7 @@ "status": "Poster", "track": "main", "pid": 1468, + "author_site": "Siming Yan; Zhenpei Yang; Chongyang Ma; Haibin Huang; Etienne Vouga; Qixing Huang", "author": "Siming Yan; Zhenpei Yang; Chongyang Ma; Haibin Huang; Etienne Vouga; Qixing Huang", "abstract": "This paper introduces HPNet, a novel deep-learning approach for segmenting a 3D shape represented as a point cloud into primitive patches. The key to deep primitive segmentation is learning a feature representation that can separate points of different primitives. Unlike utilizing a single feature representation, HPNet leverages hybrid representations that combine one learned semantic descriptor, two spectral descriptors derived from predicted geometric parameters, as well as an adjacency matrix that encodes sharp edges. Moreover, instead of merely concatenating the descriptors, HPNet optimally combines hybrid representations by learning combination weights. This weighting module builds on the entropy of input features. The output primitive segmentation is obtained from a mean-shift clustering module. 
Experimental results on benchmark datasets ANSI and ABCParts show that HPNet leads to significant performance gains from baseline approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yan_HPNet_Deep_Primitive_Segmentation_Using_Hybrid_Representations_ICCV_2021_paper.pdf", @@ -17943,7 +19158,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yan_HPNet_Deep_Primitive_Segmentation_Using_Hybrid_Representations_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yan_HPNet_Deep_Primitive_Segmentation_Using_Hybrid_Representations_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yan_2021_ICCV,\n \n author = {\n Yan,\n Siming and Yang,\n Zhenpei and Ma,\n Chongyang and Huang,\n Haibin and Vouga,\n Etienne and Huang,\n Qixing\n},\n title = {\n HPNet: Deep Primitive Segmentation Using Hybrid Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2753-2762\n} \n}" }, { "title": "HRegNet: A Hierarchical Network for Large-Scale Outdoor LiDAR Point Cloud Registration", @@ -17951,10 +19167,11 @@ "status": "Poster", "track": "main", "pid": 4256, + "author_site": "Fan Lu; Guang Chen; Yinlong Liu; Lijun Zhang; Sanqing Qu; Shu Liu; Rongqi Gu", "author": "Fan Lu; Guang Chen; Yinlong Liu; Lijun Zhang; Sanqing Qu; Shu Liu; Rongqi Gu", "abstract": "Point cloud registration is a fundamental problem in 3D computer vision. Outdoor LiDAR point clouds are typically large-scale and complexly distributed, which makes the registration challenging. In this paper, we propose an efficient hierarchical network named HRegNet for large-scale outdoor LiDAR point cloud registration. Instead of using all points in the point clouds, HRegNet performs registration on hierarchically extracted keypoints and descriptors. 
The overall framework combines the reliable features in deeper layer and the precise position information in shallower layers to achieve robust and precise registration. We present a correspondence network to generate correct and accurate keypoints correspondences. Moreover, bilateral consensus and neighborhood consensus are introduced for keypoints matching and novel similarity features are designed to incorporate them into the correspondence network, which significantly improves the registration performance. Besides, the whole network is also highly efficient since only a small number of keypoints are used for registration. Extensive experiments are conducted on two large-scale outdoor LiDAR point cloud datasets to demonstrate the high accuracy and efficiency of the proposed HRegNet. The project website is https://ispc-group.github.io/hregnet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lu_HRegNet_A_Hierarchical_Network_for_Large-Scale_Outdoor_LiDAR_Point_Cloud_ICCV_2021_paper.pdf", - "aff": "Tongji University; Tongji University; Technische Universit \u00a8at M \u00a8unchen; Tongji University; Tongji University; ETH Zurich; Westwell lab", + "aff": "Tongji University; Tongji University; Technische Universität München; Tongji University; Tongji University; ETH Zurich; Westwell lab", "project": "https://ispc-group.github.io/hregnet", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Lu_HRegNet_A_Hierarchical_ICCV_2021_supplemental.pdf", @@ -17967,14 +19184,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lu_HRegNet_A_Hierarchical_Network_for_Large-Scale_Outdoor_LiDAR_Point_Cloud_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0;2;3", - "aff_unique_norm": "Tongji University;Technische Universit\u00e4t M\u00fcnchen;ETH Zurich;Westwell lab", + "aff_unique_norm": "Tongji University;Technische Universität München;ETH Zurich;Westwell lab", "aff_unique_dep": ";;;", 
"aff_unique_url": "https://www.tongji.edu.cn;https://www.tum.de;https://www.ethz.ch;", "aff_unique_abbr": "Tongji;TUM;ETHZ;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;2", - "aff_country_unique": "China;Germany;Switzerland;" + "aff_country_unique": "China;Germany;Switzerland;", + "bibtex": "@InProceedings{Lu_2021_ICCV,\n \n author = {\n Lu,\n Fan and Chen,\n Guang and Liu,\n Yinlong and Zhang,\n Lijun and Qu,\n Sanqing and Liu,\n Shu and Gu,\n Rongqi\n},\n title = {\n HRegNet: A Hierarchical Network for Large-Scale Outdoor LiDAR Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16014-16023\n} \n}" }, { "title": "Hand Image Understanding via Deep Multi-Task Learning", @@ -17982,6 +19200,7 @@ "status": "Poster", "track": "main", "pid": 1587, + "author_site": "Xiong Zhang; Hongsheng Huang; Jianchao Tan; Hongmin Xu; Cheng Yang; Guozhu Peng; Lei Wang; Ji Liu", "author": "Xiong Zhang; Hongsheng Huang; Jianchao Tan; Hongmin Xu; Cheng Yang; Guozhu Peng; Lei Wang; Ji Liu", "abstract": "Analyzing and understanding hand information from multimedia materials like images or videos is important for many real world applications and remains to be very active in research community. There are various works focusing on recovering hand information from single image, however, they usually solve a single task, for example, hand mask segmentation, 2D/3D hand pose estimation, or hand mesh reconstruction and perform not well in challenging scenarios. To further improve the performance of these tasks, we propose a novel Hand Image Understanding (HIU) framework (HIU-DMTL) to extract comprehensive information of the hand object from a single RGB image, by jointly considering the relationships between these tasks. 
To achieve this goal, a cascaded multi-task learning (MTL) backbone is designed to estimate the 2D heat maps, to learn the segmentation mask, and to generate the intermediate 3D information encoding, followed by a coarse-to-fine learning paradigm and a self-supervised learning strategy. Qualitative experiments demonstrate that our approach is capable of recovering reasonable mesh representations even in challenging situations. Quantitatively, our method significantly outperforms the state-of-the-art approaches on various widely-used datasets, in terms of diverse evaluation metrics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Hand_Image_Understanding_via_Deep_Multi-Task_Learning_ICCV_2021_paper.pdf", @@ -17998,14 +19217,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Hand_Image_Understanding_via_Deep_Multi-Task_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;0;0;0;2", - "aff_unique_norm": "Baidu;Joyy Inc.;Kwai Inc.;OPPO Inc.", + "aff_unique_norm": "Baidu Inc.;Joyy Inc.;Kwai Inc.;OPPO Inc.", "aff_unique_dep": "YY Live;;AI Platform;", "aff_unique_url": "https://www.baidu.com;https://www.joyyinc.com;https://www.kwai.com;https://www.oppo.com", "aff_unique_abbr": "Baidu;Joyy;Kwai;OPPO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Xiong and Huang,\n Hongsheng and Tan,\n Jianchao and Xu,\n Hongmin and Yang,\n Cheng and Peng,\n Guozhu and Wang,\n Lei and Liu,\n Ji\n},\n title = {\n Hand Image Understanding via Deep Multi-Task Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11281-11292\n} \n}" }, { "title": "Hand-Object Contact Consistency Reasoning 
for Human Grasps Generation", @@ -18013,6 +19233,7 @@ "status": "Poster", "track": "main", "pid": 4376, + "author_site": "Hanwen Jiang; Shaowei Liu; Jiashun Wang; Xiaolong Wang", "author": "Hanwen Jiang; Shaowei Liu; Jiashun Wang; Xiaolong Wang", "abstract": "While predicting robot grasps with parallel jaw grippers have been well studied and widely applied in robot manipulation tasks, the study on natural human grasp generation with a multi-finger hand remains a very challenging problem. In this paper, we propose to generate human grasps given a 3D object in the world. Our key observation is that it is crucial to model the consistency between the hand contact points and object contact regions. That is, we encourage the prior hand contact points to be close to the object surface and the object common contact regions to be touched by the hand at the same time. Based on the hand-object contact consistency, we design novel objectives in training the human grasp generation model and also a new self-supervised task which allows the grasp generation network to be adjusted even during test time. Our experiments show significant improvement in human grasp generation over state-of-the-art approaches by a large margin. More interestingly, by optimizing the model during test time with the self-supervised task, it helps achieve larger gain on unseen and out-of-domain objects. 
Project page: https://hwjiang1510.github.io/GraspTTA/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Hand-Object_Contact_Consistency_Reasoning_for_Human_Grasps_Generation_ICCV_2021_paper.pdf", @@ -18027,7 +19248,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jiang_Hand-Object_Contact_Consistency_Reasoning_for_Human_Grasps_Generation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jiang_Hand-Object_Contact_Consistency_Reasoning_for_Human_Grasps_Generation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Hanwen and Liu,\n Shaowei and Wang,\n Jiashun and Wang,\n Xiaolong\n},\n title = {\n Hand-Object Contact Consistency Reasoning for Human Grasps Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11107-11116\n} \n}" }, { "title": "HandFoldingNet: A 3D Hand Pose Estimation Network Using Multiscale-Feature Guided Folding of a 2D Hand Skeleton", @@ -18035,6 +19257,7 @@ "status": "Poster", "track": "main", "pid": 6791, + "author_site": "Wencan Cheng; Jae Hyun Park; Jong Hwan Ko", "author": "Wencan Cheng; Jae Hyun Park; Jong Hwan Ko", "abstract": "With increasing applications of 3D hand pose estimation in various human-computer interaction applications, convolution neural networks (CNNs) based estimation models have been actively explored. However, the existing models require complex architectures or redundant computational resources to trade with the acceptable accuracy. To tackle this limitation, this paper proposes HandFoldingNet, an accurate and efficient hand pose estimator that regresses the hand joint locations from the normalized 3D hand point cloud input. 
The proposed model utilizes a folding-based decoder that folds a given 2D hand skeleton into the corresponding joint coordinates. For higher estimation accuracy, folding is guided by multi-scale features, which include both global and joint-wise local features. Experimental results show that the proposed model outperforms the existing methods on three hand pose benchmark datasets with the lowest model parameter requirement. Code is available at https://github.com/cwc1260/HandFold.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_HandFoldingNet_A_3D_Hand_Pose_Estimation_Network_Using_Multiscale-Feature_Guided_ICCV_2021_paper.pdf", @@ -18058,7 +19281,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Wencan and Park,\n Jae Hyun and Ko,\n Jong Hwan\n},\n title = {\n HandFoldingNet: A 3D Hand Pose Estimation Network Using Multiscale-Feature Guided Folding of a 2D Hand Skeleton\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11260-11269\n} \n}" }, { "title": "Handwriting Transformers", @@ -18066,10 +19290,11 @@ "status": "Poster", "track": "main", "pid": 10644, + "author_site": "Ankan Kumar Bhunia; Salman Khan; Hisham Cholakkal; Rao Muhammad Anwer; Fahad Shahbaz Khan; Mubarak Shah", "author": "Ankan Kumar Bhunia; Salman Khan; Hisham Cholakkal; Rao Muhammad Anwer; Fahad Shahbaz Khan; Mubarak Shah", "abstract": "We propose a novel transformer-based styled handwritten text image generation approach, HWT, that strives to learn both style-content entanglement as well as global and local style patterns. 
The proposed HWT captures the long and short range relationships within the style examples through a self-attention mechanism, thereby encoding both global and local style patterns. Further, the proposed transformer-based HWT comprises an encoder-decoder attention that enables style-content entanglement by gathering the style features of each query character. To the best of our knowledge, we are the first to introduce a transformer-based network for styled handwritten text generation. Our proposed HWT generates realistic styled handwritten text images and outperforms the state-of-the-art demonstrated through extensive qualitative, quantitative and human-based evaluations. The proposed HWT can handle arbitrary length of text and any desired writing style in a few-shot setting. Further, our HWT generalizes well to the challenging scenario where both words and writing style are unseen during training, generating realistic styled handwritten text images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhunia_Handwriting_Transformers_ICCV_2021_paper.pdf", - "aff": "Mohamed bin Zayed University of AI, UAE; Mohamed bin Zayed University of AI, UAE+Australian National University, Australia; Mohamed bin Zayed University of AI, UAE; Mohamed bin Zayed University of AI, UAE; Mohamed bin Zayed University of AI, UAE+Link \u00a8oping University, Sweden; University of Central Florida, USA", + "aff": "Mohamed bin Zayed University of AI, UAE; Mohamed bin Zayed University of AI, UAE+Australian National University, Australia; Mohamed bin Zayed University of AI, UAE; Mohamed bin Zayed University of AI, UAE; Mohamed bin Zayed University of AI, UAE+Link ¨oping University, Sweden; University of Central Florida, USA", "project": "", "github": "https://github.com/ankanbhunia/Handwriting-Transformers", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Bhunia_Handwriting_Transformers_ICCV_2021_supplemental.pdf", @@ -18082,14 +19307,15 @@ "author_num": 6, 
"oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bhunia_Handwriting_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;0;0;0+2;3", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Link\u00f6ping University;University of Central Florida", + "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Linköping University;University of Central Florida", "aff_unique_dep": ";;;", "aff_unique_url": "https://mbzuai.ac.ae;https://www.anu.edu.au;https://www.liu.se;https://www.ucf.edu", "aff_unique_abbr": "MBZUAI;ANU;LiU;UCF", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;0+2;3", - "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States" + "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States", + "bibtex": "@InProceedings{Bhunia_2021_ICCV,\n \n author = {\n Bhunia,\n Ankan Kumar and Khan,\n Salman and Cholakkal,\n Hisham and Anwer,\n Rao Muhammad and Khan,\n Fahad Shahbaz and Shah,\n Mubarak\n},\n title = {\n Handwriting Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1086-1094\n} \n}" }, { "title": "Harnessing the Conditioning Sensorium for Improved Image Translation", @@ -18097,6 +19323,7 @@ "status": "Poster", "track": "main", "pid": 10515, + "author_site": "Cooper Nederhood; Nicholas Kolkin; Deqing Fu; Jason Salavon", "author": "Cooper Nederhood; Nicholas Kolkin; Deqing Fu; Jason Salavon", "abstract": "Existing methods for multi-modal domain translation learn to embed the input images into a domain-invariant \"content\" space and a domain-specific \"style\" space from which novel images can be synthesized. 
Rather than learning to embed the RGB image from scratch we propose deriving our content representation from conditioning data produced by pretrained off-the-shelf networks. Motivated by the inherent ambiguity of \"content\", which has different meanings depending on the desired level of abstraction, this approach gives intuitive control over which aspects of content are preserved across domains. We evaluate our method on traditional, well-aligned, datasets such as CelebA-HQ, and propose two novel datasets for evaluation on more complex scenes: ClassicTV and FFHQ-WildCrops. Our approach, which we call Sensorium, enables higher quality domain translation for complex scenes than prior work.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nederhood_Harnessing_the_Conditioning_Sensorium_for_Improved_Image_Translation_ICCV_2021_paper.pdf", @@ -18120,7 +19347,8 @@ "aff_campus_unique_index": ";1;;", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0+0;0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nederhood_2021_ICCV,\n \n author = {\n Nederhood,\n Cooper and Kolkin,\n Nicholas and Fu,\n Deqing and Salavon,\n Jason\n},\n title = {\n Harnessing the Conditioning Sensorium for Improved Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6752-6761\n} \n}" }, { "title": "HeadGAN: One-Shot Neural Head Synthesis and Editing", @@ -18128,6 +19356,7 @@ "status": "Poster", "track": "main", "pid": 7663, + "author_site": "Michail Christos Doukas; Stefanos Zafeiriou; Viktoriia Sharmanska", "author": "Michail Christos Doukas; Stefanos Zafeiriou; Viktoriia Sharmanska", "abstract": "Recent attempts to solve the problem of head reenactment using a single reference image have shown promising results. 
However, most of them either perform poorly in terms of photo-realism, or fail to meet the identity preservation problem, or do not fully transfer the driving pose and expression. We propose HeadGAN, a novel system that conditions synthesis on 3D face representations, which can be extracted from any driving video and adapted to the facial geometry of any reference image, disentangling identity from expression. We further improve mouth movements, by utilising audio features as a complementary input. The 3D face representation enables HeadGAN to be further used as an efficient method for compression and reconstruction and a tool for expression and pose editing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Doukas_HeadGAN_One-Shot_Neural_Head_Synthesis_and_Editing_ICCV_2021_paper.pdf", @@ -18144,14 +19373,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Doukas_HeadGAN_One-Shot_Neural_Head_Synthesis_and_Editing_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+2", - "aff_unique_norm": "Imperial College London;Huawei;University of Sussex", - "aff_unique_dep": ";Huawei Technologies;", + "aff_unique_norm": "Imperial College London;Huawei Technologies;University of Sussex", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.imperial.ac.uk;https://www.huawei.com;https://www.sussex.ac.uk", "aff_unique_abbr": "ICL;Huawei;Sussex", "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";London", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Doukas_2021_ICCV,\n \n author = {\n Doukas,\n Michail Christos and Zafeiriou,\n Stefanos and Sharmanska,\n Viktoriia\n},\n title = {\n HeadGAN: One-Shot Neural Head Synthesis and Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14398-14407\n} \n}" }, 
{ "title": "Heterogeneous Relational Complement for Vehicle Re-Identification", @@ -18159,6 +19389,7 @@ "status": "Poster", "track": "main", "pid": 1708, + "author_site": "Jiajian Zhao; Yifan Zhao; Jia Li; Ke Yan; Yonghong Tian", "author": "Jiajian Zhao; Yifan Zhao; Jia Li; Ke Yan; Yonghong Tian", "abstract": "The crucial problem in vehicle re-identification is to find the same vehicle identity when reviewing this object from cross-view cameras, which sets a higher demand for learning viewpoint-invariant representations. In this paper, we propose to solve this problem from two aspects: constructing robust feature representations and proposing camera-sensitive evaluations. We first propose a novel Heterogeneous Relational Complement Network (HRCN) by incorporating region-specific features and cross-level features as complements for the original high-level output. Considering the distributional differences and semantic misalignment, we propose graph-based relation modules to embed these heterogeneous features into one unified high-dimensional space. On the other hand, considering the deficiencies of cross-camera evaluations in existing measures (i.e., CMC and AP), we then propose a Cross-camera Generalization Measure (CGM) to improve the evaluations by introducing position-sensitivity and cross-camera generalization penalties. 
We further construct a new benchmark of existing models with our proposed CGM and experimental results reveal that our proposed HRCN model achieves new state-of-the-art in VeRi-776, VehicleID, and VERI-Wild.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Heterogeneous_Relational_Complement_for_Vehicle_Re-Identification_ICCV_2021_paper.pdf", @@ -18175,14 +19406,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Heterogeneous_Relational_Complement_for_Vehicle_Re-Identification_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;2;3+1", - "aff_unique_norm": "Beihang University;Pengcheng Laboratory;Tencent;Peking University", - "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;Peng Cheng Laboratory;Youtu Lab;Department of Computer Science and Technology", + "aff_unique_norm": "Beihang University;Peng Cheng Laboratory;Tencent;Peking University", + "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;;Youtu Lab;Department of Computer Science and Technology", "aff_unique_url": "http://www.buaa.edu.cn;;https://www.tencent.com;http://www.pku.edu.cn", "aff_unique_abbr": "Beihang;;Tencent;Peking U", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Shenzhen;Shanghai", "aff_country_unique_index": "0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Jiajian and Zhao,\n Yifan and Li,\n Jia and Yan,\n Ke and Tian,\n Yonghong\n},\n title = {\n Heterogeneous Relational Complement for Vehicle Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 205-214\n} \n}" }, { "title": "HiFT: Hierarchical Feature Transformer for Aerial Tracking", @@ -18190,6 +19422,7 @@ "status": "Poster", "track": "main", "pid": 10178, + 
"author_site": "Ziang Cao; Changhong Fu; Junjie Ye; Bowen Li; Yiming Li", "author": "Ziang Cao; Changhong Fu; Junjie Ye; Bowen Li; Yiming Li", "abstract": "Most existing Siamese-based tracking methods execute the classification and regression of the target object based on the similarity maps. However, they either employ a single map from the last convolutional layer which degrades the localization accuracy in complex scenarios or separately use multiple maps for decision making, introducing intractable computations for aerial mobile platforms. Thus, in this work, we propose an efficient and effective hierarchical feature transformer (HiFT) for aerial tracking. Hierarchical similarity maps generated by multi-level convolutional layers are fed into the feature transformer to achieve the interactive fusion of spatial (shallow layers) and semantics cues (deep layers). Consequently, not only the global contextual information can be raised, facilitating the target search, but also our end-to-end architecture with the transformer can efficiently learn the interdependencies among multi-level features, thereby discovering a tracking-tailored feature space with strong discriminability. Comprehensive evaluations on four aerial benchmarks have proven the effectiveness of HiFT. Real-world tests on the aerial platform have strongly validated its practicability with a real-time speed. 
Our code is available at https://github.com/vision4robotics/HiFT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_HiFT_Hierarchical_Feature_Transformer_for_Aerial_Tracking_ICCV_2021_paper.pdf", @@ -18213,7 +19446,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Ziang and Fu,\n Changhong and Ye,\n Junjie and Li,\n Bowen and Li,\n Yiming\n},\n title = {\n HiFT: Hierarchical Feature Transformer for Aerial Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15457-15466\n} \n}" }, { "title": "HiNet: Deep Image Hiding by Invertible Network", @@ -18221,6 +19455,7 @@ "status": "Poster", "track": "main", "pid": 3227, + "author_site": "Junpeng Jing; Xin Deng; Mai Xu; Jianyi Wang; Zhenyu Guan", "author": "Junpeng Jing; Xin Deng; Mai Xu; Jianyi Wang; Zhenyu Guan", "abstract": "Image hiding aims to hide a secret image into a cover image in an imperceptible way, and then recover the secret image perfectly at the receiver end. Capacity, invisibility and security are three primary challenges in image hiding task. This paper proposes a novel invertible neural network (INN) based framework, HiNet, to simultaneously overcome the three challenges in image hiding. For large capacity, we propose an inverse learning mechanism by simultaneously learning the image concealing and revealing processes. Our method is able to achieve the concealing of a full-size secret image into a cover image with the same size. For high invisibility, instead of pixel domain hiding, we propose to hide the secret information in wavelet domain. 
Furthermore, we propose a new low-frequency wavelet loss to constrain that secret information is hidden in high-frequency wavelet sub-bands, which significantly improves the hiding security. Experimental results show that our HiNet significantly outperforms other state-of-the-art image hiding methods, with more than 10 dB PSNR improvement in secret image recovery on ImageNet, COCO and DIV2K datasets. Codes are available at https://github.com/TomTomTommi/HiNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jing_HiNet_Deep_Image_Hiding_by_Invertible_Network_ICCV_2021_paper.pdf", @@ -18244,7 +19479,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jing_2021_ICCV,\n \n author = {\n Jing,\n Junpeng and Deng,\n Xin and Xu,\n Mai and Wang,\n Jianyi and Guan,\n Zhenyu\n},\n title = {\n HiNet: Deep Image Hiding by Invertible Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4733-4742\n} \n}" }, { "title": "HiT: Hierarchical Transformer With Momentum Contrast for Video-Text Retrieval", @@ -18252,6 +19488,7 @@ "status": "Poster", "track": "main", "pid": 2637, + "author_site": "Song Liu; Haoqi Fan; Shengsheng Qian; Yiru Chen; Wenkui Ding; Zhongyuan Wang", "author": "Song Liu; Haoqi Fan; Shengsheng Qian; Yiru Chen; Wenkui Ding; Zhongyuan Wang", "abstract": "Video-Text Retrieval has been a hot research topic with the growth of multimedia data on the internet. Transformer for video-text learning has attracted increasing attention due to its promising performance. 
However, existing cross-modal transformer approaches typically suffer from two major limitations: 1) Exploitation of the transformer architecture where different layers have different feature characteristics is limited; 2) End-to-end training mechanism limits negative sample interactions in a mini-batch. In this paper, we propose a novel approach named Hierarchical Transformer (HiT) for video-text retrieval. HiT performs Hierarchical Cross-modal Contrastive Matching in both feature-level and semantic-level, achieving multi-view and comprehensive retrieval results. Moreover, inspired by MoCo, we propose Momentum Cross-modal Contrast for cross-modal learning to enable large-scale negative sample interactions on-the-fly, which contributes to the generation of more precise and discriminative representations. Experimental results on the three major Video-Text Retrieval benchmark datasets demonstrate the advantages of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_HiT_Hierarchical_Transformer_With_Momentum_Contrast_for_Video-Text_Retrieval_ICCV_2021_paper.pdf", @@ -18268,14 +19505,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_HiT_Hierarchical_Transformer_With_Momentum_Contrast_for_Video-Text_Retrieval_ICCV_2021_paper.html", "aff_unique_index": "0;1;2+3;4;4;4", - "aff_unique_norm": "Peking University;Meta;Chinese Academy of Sciences;University of Chinese Academy of Sciences;Kuaishou Technology", + "aff_unique_norm": "Peking University;Facebook AI Research;Chinese Academy of Sciences;University of Chinese Academy of Sciences;Kuaishou Technology", "aff_unique_dep": ";Facebook AI Research;Institute of Automation;;", "aff_unique_url": "http://www.pku.edu.cn;https://research.facebook.com;http://www.ia.cas.cn;http://www.ucas.ac.cn;https://www.kuaishou.com", "aff_unique_abbr": "Peking U;FAIR;CAS;UCAS;Kuaishou", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;1;0+0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Song and Fan,\n Haoqi and Qian,\n Shengsheng and Chen,\n Yiru and Ding,\n Wenkui and Wang,\n Zhongyuan\n},\n title = {\n HiT: Hierarchical Transformer With Momentum Contrast for Video-Text Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11915-11925\n} \n}" }, { "title": "Hierarchical Aggregation for 3D Instance Segmentation", @@ -18283,6 +19521,7 @@ "status": "Poster", "track": "main", "pid": 4133, + "author_site": "Shaoyu Chen; Jiemin Fang; Qian Zhang; Wenyu Liu; Xinggang Wang", "author": "Shaoyu Chen; Jiemin Fang; Qian Zhang; Wenyu Liu; Xinggang Wang", "abstract": "Instance segmentation on point clouds is a fundamental task in 3D scene perception. In this work, we propose a concise clustering-based framework named HAIS, which makes full use of spatial relation of points and point sets. Considering clustering-based methods may result in over-segmentation or under-segmentation, we introduce the hierarchical aggregation to progressively generate instance proposals, i.e., point aggregation for preliminarily clustering points to sets and set aggregation for generating complete instances from sets. Once the complete 3D instances are obtained, a sub-network of intra-instance prediction is adopted for noisy points filtering and mask quality scoring. HAIS is fast (only 410ms per frame on Titan X) and does not require non-maximum suppression. It ranks 1st on the ScanNet v2 benchmark, achieving the highest 69.9% AP50 and surpassing previous state-of-the-art (SOTA) methods by a large margin. Besides, the SOTA results on the S3DIS dataset validate the good generalization ability. 
Code is available at https://github.com/hustvl/HAIS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Hierarchical_Aggregation_for_3D_Instance_Segmentation_ICCV_2021_paper.pdf", @@ -18306,7 +19545,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Shaoyu and Fang,\n Jiemin and Zhang,\n Qian and Liu,\n Wenyu and Wang,\n Xinggang\n},\n title = {\n Hierarchical Aggregation for 3D Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15467-15476\n} \n}" }, { "title": "Hierarchical Conditional Flow: A Unified Framework for Image Super-Resolution and Image Rescaling", @@ -18314,6 +19554,7 @@ "status": "Poster", "track": "main", "pid": 1052, + "author_site": "Jingyun Liang; Andreas Lugmayr; Kai Zhang; Martin Danelljan; Luc Van Gool; Radu Timofte", "author": "Jingyun Liang; Andreas Lugmayr; Kai Zhang; Martin Danelljan; Luc Van Gool; Radu Timofte", "abstract": "Normalizing flows have recently demonstrated promising results for low-level vision tasks. For image super-resolution (SR), it learns to predict diverse photo-realistic high-resolution (HR) images from the low-resolution (LR) image rather than learning a deterministic mapping. For image rescaling, it achieves high accuracy by jointly modelling the downscaling and upscaling processes. While existing approaches employ specialized techniques for these two tasks, we set out to unify them in a single formulation. In this paper, we propose the hierarchical conditional flow (HCFlow) as a unified framework for image SR and image rescaling. 
More specifically, HCFlow learns a bijective mapping between HR and LR image pairs by modelling the distribution of the LR image and the rest high-frequency component simultaneously. In particular, the high-frequency component is conditional on the LR image in a hierarchical manner. To further enhance the performance, other losses such as perceptual loss and GAN loss are combined with the commonly used negative log-likelihood loss in training. Extensive experiments on general image SR, face image SR and image rescaling have demonstrated that the proposed HCFlow achieves state-of-the-art performance in terms of both quantitative metrics and visual quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Hierarchical_Conditional_Flow_A_Unified_Framework_for_Image_Super-Resolution_and_ICCV_2021_paper.pdf", @@ -18337,7 +19578,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+1;0", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Jingyun and Lugmayr,\n Andreas and Zhang,\n Kai and Danelljan,\n Martin and Van Gool,\n Luc and Timofte,\n Radu\n},\n title = {\n Hierarchical Conditional Flow: A Unified Framework for Image Super-Resolution and Image Rescaling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4076-4085\n} \n}" }, { "title": "Hierarchical Disentangled Representation Learning for Outdoor Illumination Estimation and Editing", @@ -18345,6 +19587,7 @@ "status": "Poster", "track": "main", "pid": 7697, + "author_site": "Piaopiao Yu; Jie Guo; Fan Huang; Cheng Zhou; Hongwei Che; Xiao Ling; Yanwen Guo", "author": "Piaopiao Yu; Jie Guo; Fan Huang; Cheng Zhou; Hongwei Che; Xiao Ling; Yanwen Guo", "abstract": "Data-driven sky models have gained much attention in outdoor 
illumination prediction recently, showing superior performance against analytical models. However, naively compressing an outdoor panorama into a low-dimensional latent vector, as existing models have done, causes two major problems. One is the mutual interference between the HDR intensity of the sun and the complex textures of the surrounding sky, and the other is the lack of fine-grained control over independent lighting factors due to the entangled representation. To address these issues, we propose a hierarchical disentangled sky model (HDSky) for outdoor illumination prediction. With this model, any outdoor panorama can be hierarchically disentangled into several factors based on three well-designed autoencoders. The first autoencoder compresses each sunny panorama into a sky vector and a sun vector with some constraints. The second autoencoder and the third autoencoder further disentangle the sun intensity and the sky intensity from the sun vector and the sky vector with several customized loss functions respectively. Moreover, a unified framework is designed to predict all-weather sky information from a single outdoor image. Through extensive experiments, we demonstrate that the proposed model significantly improves the accuracy of outdoor illumination prediction. 
It also allows users to intuitively edit the predicted panorama (e.g., changing the position of the sun while preserving others), without sacrificing physical plausibility.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Hierarchical_Disentangled_Representation_Learning_for_Outdoor_Illumination_Estimation_and_Editing_ICCV_2021_paper.pdf", @@ -18368,7 +19611,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Piaopiao and Guo,\n Jie and Huang,\n Fan and Zhou,\n Cheng and Che,\n Hongwei and Ling,\n Xiao and Guo,\n Yanwen\n},\n title = {\n Hierarchical Disentangled Representation Learning for Outdoor Illumination Estimation and Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15313-15322\n} \n}" }, { "title": "Hierarchical Graph Attention Network for Few-Shot Visual-Semantic Learning", @@ -18376,6 +19620,7 @@ "status": "Poster", "track": "main", "pid": 7849, + "author_site": "Chengxiang Yin; Kun Wu; Zhengping Che; Bo Jiang; Zhiyuan Xu; Jian Tang", "author": "Chengxiang Yin; Kun Wu; Zhengping Che; Bo Jiang; Zhiyuan Xu; Jian Tang", "abstract": "Deep learning has made tremendous success in computer vision, natural language processing and even visual-semantic learning, which requires a huge amount of labeled training data. Nevertheless, the goal of human-level intelligence is to enable a model to quickly obtain an in-depth understanding given a small number of samples, especially with heterogeneity in the multi-modal scenarios such as visual question answering and image captioning. In this paper, we study the few-shot visual-semantic learning and present the Hierarchical Graph ATtention network (HGAT). 
This two-stage network models the intra- and inter-modal relationships with limited image-text samples. The main contributions of HGAT can be summarized as follows: 1) it sheds light on tackling few-shot multi-modal learning problems, which focuses primarily, but not exclusively on visual and semantic modalities, through better exploitation of the intra-relationship of each modality and an attention-based co-learning framework between modalities using a hierarchical graph-based architecture; 2) it achieves superior performance on both visual question answering and image captioning in the few-shot setting; 3) it can be easily extended to the semi-supervised setting where image-text samples are partially unlabeled. We show via extensive experiments that HGAT delivers state-of-the-art performance on three widely-used benchmarks of two visual-semantic learning tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yin_Hierarchical_Graph_Attention_Network_for_Few-Shot_Visual-Semantic_Learning_ICCV_2021_paper.pdf", @@ -18394,12 +19639,13 @@ "aff_unique_index": "0;0;1;1;2;2", "aff_unique_norm": "Syracuse University;Didi Chuxing;Midea Group", "aff_unique_dep": ";;", - "aff_unique_url": "https://www.syracuse.edu;https://www.didichuxing.com/;https://www.mideaglobal.com", + "aff_unique_url": "https://www.syracuse.edu;https://www.didi.cn;https://www.mideaglobal.com", "aff_unique_abbr": "Syracuse;Didi;Midea", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Yin_2021_ICCV,\n \n author = {\n Yin,\n Chengxiang and Wu,\n Kun and Che,\n Zhengping and Jiang,\n Bo and Xu,\n Zhiyuan and Tang,\n Jian\n},\n title = {\n Hierarchical Graph Attention Network for Few-Shot Visual-Semantic Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month 
= {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2177-2186\n} \n}" }, { "title": "Hierarchical Kinematic Probability Distributions for 3D Human Shape and Pose Estimation From Images in the Wild", @@ -18407,6 +19653,7 @@ "status": "Poster", "track": "main", "pid": 10780, + "author_site": "Akash Sengupta; Ignas Budvytis; Roberto Cipolla", "author": "Akash Sengupta; Ignas Budvytis; Roberto Cipolla", "abstract": "This paper addresses the problem of 3D human body shape and pose estimation from an RGB image. This is often an ill-posed problem, since multiple plausible 3D bodies may match the visual evidence present in the input - particularly when the subject is occluded. Thus, it is desirable to estimate a distribution over 3D body shape and pose conditioned on the input image instead of a single 3D reconstruction. We train a deep neural network to estimate a hierarchical matrix-Fisher distribution over relative 3D joint rotation matrices (i.e. body pose), which exploits the human body's kinematic tree structure, as well as a Gaussian distribution over SMPL body shape parameters. To further ensure that the predicted shape and pose distributions match the visual evidence in the input image, we implement a differentiable rejection sampler to impose a reprojection loss between ground-truth 2D joint coordinates and samples from the predicted distributions, projected onto the image plane. 
We show that our method is competitive with the state-of-the-art in terms of 3D shape and pose metrics on the SSP-3D and 3DPW datasets, while also yielding a structured probability distribution over 3D body shape and pose, with which we can meaningfully quantify prediction uncertainty and sample multiple plausible 3D reconstructions to explain a given input image.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sengupta_Hierarchical_Kinematic_Probability_Distributions_for_3D_Human_Shape_and_Pose_ICCV_2021_paper.pdf", @@ -18430,7 +19677,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Sengupta_2021_ICCV,\n \n author = {\n Sengupta,\n Akash and Budvytis,\n Ignas and Cipolla,\n Roberto\n},\n title = {\n Hierarchical Kinematic Probability Distributions for 3D Human Shape and Pose Estimation From Images in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11219-11229\n} \n}" }, { "title": "Hierarchical Memory Matching Network for Video Object Segmentation", @@ -18438,6 +19686,7 @@ "status": "Poster", "track": "main", "pid": 4185, + "author_site": "Hongje Seong; Seoung Wug Oh; Joon-Young Lee; Seongwon Lee; Suhyeon Lee; Euntai Kim", "author": "Hongje Seong; Seoung Wug Oh; Joon-Young Lee; Seongwon Lee; Suhyeon Lee; Euntai Kim", "abstract": "We present Hierarchical Memory Matching Network (HMMN) for semi-supervised video object segmentation. Based on a recent memory-based method [33], we propose two advanced memory read modules that enable us to perform memory reading in multiple scales while exploiting temporal smoothness. 
We first propose a kernel guided memory matching module that replaces the non-local dense memory read, commonly adopted in previous memory-based methods. The module imposes the temporal smoothness constraint in the memory read, leading to accurate memory retrieval. More importantly, we introduce a hierarchical memory matching scheme and propose a top-k guided memory matching module in which memory read on a fine-scale is guided by that on a coarse-scale. With the module, we perform memory read in multiple scales efficiently and leverage both high-level semantic and low-level fine-grained memory features to predict detailed object masks. Our network achieves state-of-the-art performance on the validation sets of DAVIS 2016/2017 (90.8% and 84.7%) and YouTube-VOS 2018/2019 (82.6% and 82.5%), and test-dev set of DAVIS 2017 (78.6%). The source code and model are available online: https://github.com/Hongje/HMMN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Seong_Hierarchical_Memory_Matching_Network_for_Video_Object_Segmentation_ICCV_2021_paper.pdf", @@ -18461,7 +19710,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Seong_2021_ICCV,\n \n author = {\n Seong,\n Hongje and Oh,\n Seoung Wug and Lee,\n Joon-Young and Lee,\n Seongwon and Lee,\n Suhyeon and Kim,\n Euntai\n},\n title = {\n Hierarchical Memory Matching Network for Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12889-12898\n} \n}" }, { "title": "Hierarchical Object-to-Zone Graph for Object Navigation", @@ -18469,6 +19719,7 @@ "status": "Poster", "track": "main", "pid": 3258, + "author_site": "Sixian Zhang; Xinhang Song; Yubing Bai; Weijie Li; Yakui Chu; Shuqiang 
Jiang", "author": "Sixian Zhang; Xinhang Song; Yubing Bai; Weijie Li; Yakui Chu; Shuqiang Jiang", "abstract": "The goal of object navigation is to reach the expected objects according to visual information in the unseen environments. Previous works usually implement deep models to train an agent to predict actions in real-time. However, in the unseen environment, when the target object is not in egocentric view, the agent may not be able to make wise decisions due to the lack of guidance. In this paper, we propose a hierarchical object-to-zone (HOZ) graph to guide the agent in a coarse-to-fine manner, and an online-learning mechanism is also proposed to update HOZ according to the real-time observation in new environments. In particular, the HOZ graph is composed of scene nodes, zone nodes and object nodes. With the pre-learned HOZ graph, the real-time observation and the target goal, the agent can constantly plan an optimal path from zone to zone. In the estimated path, the next potential zone is regarded as sub-goal, which is also fed into the deep reinforcement learning model for action prediction. Our methods are evaluated on the AI2-Thor simulator. In addition to widely used evaluation metrics SR and SPL, we also propose a new evaluation metric of SAE that focuses on the effective action rate. Experimental results demonstrate the effectiveness and efficiency of our proposed method. 
The code is available at https://github.com/sx-zhang/HOZ.git.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Hierarchical_Object-to-Zone_Graph_for_Object_Navigation_ICCV_2021_paper.pdf", @@ -18492,7 +19743,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0;0+0+1", "aff_campus_unique": "Beijing;Suzhou", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Sixian and Song,\n Xinhang and Bai,\n Yubing and Li,\n Weijie and Chu,\n Yakui and Jiang,\n Shuqiang\n},\n title = {\n Hierarchical Object-to-Zone Graph for Object Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15130-15140\n} \n}" }, { "title": "High Quality Disparity Remapping With Two-Stage Warping", @@ -18500,6 +19752,7 @@ "status": "Poster", "track": "main", "pid": 8104, + "author_site": "Bing Li; Chia-Wen Lin; Cheng Zheng; Shan Liu; Junsong Yuan; Bernard Ghanem; C.-C. Jay Kuo", "author": "Bing Li; Chia-Wen Lin; Cheng Zheng; Shan Liu; Junsong Yuan; Bernard Ghanem; C.-C. Jay Kuo", "abstract": "A high quality disparity remapping method that preserves 2D shapes and 3D structures, and adjusts disparities of important objects in stereo image pairs is proposed. It is formulated as a constrained optimization problem, whose solution is challenging, since we need to meet multiple requirements of disparity remapping simultaneously. The one-stage optimization process either degrades the quality of important objects or introduces serious distortions in background regions. To address this challenge, we propose a two-stage warping process to solve it. In the first stage, we develop a warping model that finds the optimal warping grids for important objects to fulfill multiple requirements of disparity remapping. 
In the second stage, we derive another warping model to refine warping results in less important regions by eliminating serious distortions in shape, disparity and 3D structure. The superior performance of the proposed method is demonstrated by experimental results", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_High_Quality_Disparity_Remapping_With_Two-Stage_Warping_ICCV_2021_paper.pdf", @@ -18514,7 +19767,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_High_Quality_Disparity_Remapping_With_Two-Stage_Warping_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_High_Quality_Disparity_Remapping_With_Two-Stage_Warping_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Bing and Lin,\n Chia-Wen and Zheng,\n Cheng and Liu,\n Shan and Yuan,\n Junsong and Ghanem,\n Bernard and Kuo,\n C.-C. Jay\n},\n title = {\n High Quality Disparity Remapping With Two-Stage Warping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2269-2278\n} \n}" }, { "title": "High-Fidelity Pluralistic Image Completion With Transformers", @@ -18522,6 +19776,7 @@ "status": "Poster", "track": "main", "pid": 7195, + "author_site": "Ziyu Wan; Jingbo Zhang; Dongdong Chen; Jing Liao", "author": "Ziyu Wan; Jingbo Zhang; Dongdong Chen; Jing Liao", "abstract": "Image completion has made tremendous progress with convolutional neural networks (CNNs), because of their powerful texture modeling capacity. However, due to some inherent properties (eg, local inductive prior, spatial-invariant kernels), CNNs do not perform well in understanding global structures or naturally support pluralistic completion. 
Recently, transformers demonstrate their power in modeling the long-term relationship and generating diverse results, but their computation complexity is quadratic to input length, thus hampering the application in processing high-resolution images. This paper brings the best of both worlds to pluralistic image completion: appearance prior reconstruction with transformer and texture replenishment with CNN. The former transformer recovers pluralistic coherent structures together with some coarse textures, while the latter CNN enhances the local texture details of coarse priors guided by the high-resolution masked images. The proposed method vastly outperforms state-of-the-art methods in terms of three aspects: 1) large performance boost on image fidelity even compared to deterministic completion methods; 2) better diversity and higher fidelity for pluralistic completion; 3) exceptional generalization ability on large masks and generic dataset, like ImageNet. Code and pre-trained models have been publicly released at https://github.com/raywzy/ICT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wan_High-Fidelity_Pluralistic_Image_Completion_With_Transformers_ICCV_2021_paper.pdf", @@ -18536,7 +19791,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wan_High-Fidelity_Pluralistic_Image_Completion_With_Transformers_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wan_High-Fidelity_Pluralistic_Image_Completion_With_Transformers_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wan_2021_ICCV,\n \n author = {\n Wan,\n Ziyu and Zhang,\n Jingbo and Chen,\n Dongdong and Liao,\n Jing\n},\n title = {\n High-Fidelity Pluralistic Image Completion With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4692-4701\n} \n}" }, { "title": 
"High-Performance Discriminative Tracking With Transformers", @@ -18544,6 +19800,7 @@ "status": "Poster", "track": "main", "pid": 2632, + "author_site": "Bin Yu; Ming Tang; Linyu Zheng; Guibo Zhu; Jinqiao Wang; Hao Feng; Xuetao Feng; Hanqing Lu", "author": "Bin Yu; Ming Tang; Linyu Zheng; Guibo Zhu; Jinqiao Wang; Hao Feng; Xuetao Feng; Hanqing Lu", "abstract": "End-to-end discriminative trackers improve the state of the art significantly, yet the improvement in robustness and efficiency is restricted by the conventional discriminative model, i.e., least-squares based regression. In this paper, we present DTT, a novel single-object discriminative tracker, based on an encoder-decoder Transformer architecture. By self- and encoder-decoder attention mechanisms, our approach is able to exploit the rich scene information in an end-to-end manner, effectively removing the need for hand-designed discriminative models. In online tracking, given a new test frame, dense prediction is performed at all spatial positions. Not only location, but also bounding box of the target object is obtained in a robust fashion, streamlining the discriminative tracking pipeline. DTT is conceptually simple and easy to implement. It yields state-of-the-art performance on four popular benchmarks including GOT-10k, LaSOT, NfS, and TrackingNet while running at over 50 FPS, confirming its effectiveness and efficiency. 
We hope DTT may provide a new perspective for single-object visual tracking.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_High-Performance_Discriminative_Tracking_With_Transformers_ICCV_2021_paper.pdf", @@ -18558,7 +19815,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_High-Performance_Discriminative_Tracking_With_Transformers_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_High-Performance_Discriminative_Tracking_With_Transformers_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Bin and Tang,\n Ming and Zheng,\n Linyu and Zhu,\n Guibo and Wang,\n Jinqiao and Feng,\n Hao and Feng,\n Xuetao and Lu,\n Hanqing\n},\n title = {\n High-Performance Discriminative Tracking With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9856-9865\n} \n}" }, { "title": "High-Resolution Optical Flow From 1D Attention and Correlation", @@ -18566,6 +19824,7 @@ "status": "Poster", "track": "main", "pid": 2623, + "author_site": "Haofei Xu; Jiaolong Yang; Jianfei Cai; Juyong Zhang; Xin Tong", "author": "Haofei Xu; Jiaolong Yang; Jianfei Cai; Juyong Zhang; Xin Tong", "abstract": "Optical flow is inherently a 2D search problem, and thus the computational complexity grows quadratically with respect to the search window, making large displacements matching infeasible for high-resolution images. In this paper, we take inspiration from Transformers and propose a new method for high-resolution optical flow estimation with significantly less computation. Specifically, a 1D attention operation is first applied in the vertical direction of the target image, and then a simple 1D correlation in the horizontal direction of the attended image is able to achieve 2D correspondence modeling effect. 
The directions of attention and correlation can also be exchanged, resulting in two 3D cost volumes that are concatenated for optical flow estimation. The novel 1D formulation empowers our method to scale to very high-resolution input images while maintaining competitive performance. Extensive experiments on Sintel, KITTI and real-world 4K (2160 x 3840) resolution images demonstrated the effectiveness and superiority of our proposed method. Code and models are available at https://github.com/haofeixu/flow1d.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_High-Resolution_Optical_Flow_From_1D_Attention_and_Correlation_ICCV_2021_paper.pdf", @@ -18582,14 +19841,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_High-Resolution_Optical_Flow_From_1D_Attention_and_Correlation_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;0;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft;Monash University", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research;Monash University", "aff_unique_dep": ";Research;Department of Data Science and AI", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.monash.edu", "aff_unique_abbr": "USTC;MSR Asia;Monash", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Haofei and Yang,\n Jiaolong and Cai,\n Jianfei and Zhang,\n Juyong and Tong,\n Xin\n},\n title = {\n High-Resolution Optical Flow From 1D Attention and Correlation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10498-10507\n} \n}" }, { "title": "HighlightMe: Detecting Highlights From Human-Centric 
Videos", @@ -18597,6 +19857,7 @@ "status": "Poster", "track": "main", "pid": 5451, + "author_site": "Uttaran Bhattacharya; Gang Wu; Stefano Petrangeli; Viswanathan Swaminathan; Dinesh Manocha", "author": "Uttaran Bhattacharya; Gang Wu; Stefano Petrangeli; Viswanathan Swaminathan; Dinesh Manocha", "abstract": "We present a domain- and user-preference-agnostic approach to detect highlightable excerpts from human-centric videos. Our method works on the graph-based representation of multiple observable human-centric modalities in the videos, such as poses and faces. We use an autoencoder network equipped with spatial-temporal graph convolutions to detect human activities and interactions based on these modalities. We train our network to map the activity- and interaction-based latent structural representations of the different modalities to per-frame highlight scores based on the representativeness of the frames. We use these scores to compute which frames to highlight and stitch contiguous frames to produce the excerpts. We train our network on the large-scale AVA-Kinetics action dataset and evaluate it on four benchmark video highlight datasets: DSH, TVSum, PHD^2, and SumMe. 
We observe a 4-12% improvement in the mean average precision of matching the human-annotated highlights over state-of-the-art methods in these datasets, without requiring any user-provided preferences or dataset-specific fine-tuning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhattacharya_HighlightMe_Detecting_Highlights_From_Human-Centric_Videos_ICCV_2021_paper.pdf", @@ -18613,14 +19874,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bhattacharya_HighlightMe_Detecting_Highlights_From_Human-Centric_Videos_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0", - "aff_unique_norm": "University of Maryland;Adobe", - "aff_unique_dep": ";Adobe Research", + "aff_unique_norm": "University of Maryland;Adobe Research", + "aff_unique_dep": ";", "aff_unique_url": "https://www/umd.edu;https://research.adobe.com", "aff_unique_abbr": "UMD;Adobe", "aff_campus_unique_index": "0;1;1;1;0", "aff_campus_unique": "College Park;San Jose", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bhattacharya_2021_ICCV,\n \n author = {\n Bhattacharya,\n Uttaran and Wu,\n Gang and Petrangeli,\n Stefano and Swaminathan,\n Viswanathan and Manocha,\n Dinesh\n},\n title = {\n HighlightMe: Detecting Highlights From Human-Centric Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8157-8167\n} \n}" }, { "title": "Holistic Pose Graph: Modeling Geometric Structure Among Objects in a Scene Using Graph Inference for 3D Object Prediction", @@ -18628,6 +19890,7 @@ "status": "Poster", "track": "main", "pid": 6132, + "author_site": "Jiwei Xiao; Ruiping Wang; Xilin Chen", "author": "Jiwei Xiao; Ruiping Wang; Xilin Chen", "abstract": "Due to the missing depth cues, it is essentially ambiguous to detect 3D objects from a single RGB image. 
Existing methods predict the 3D pose for each object independently or merely by combining local relationships within limited surroundings, but rarely explore the inherent geometric relationships from a global perspective. To address this issue, we argue that modeling geometric structure among objects in a scene is very crucial, and thus elaborately devise the Holistic Pose Graph (HPG) that explicitly integrates all geometric poses including the object pose treated as nodes and the relative pose treated as edges. The inference of the HPG uses GRU to encode the pose features from their corresponding regions in a single RGB image, and passes messages along the graph structure iteratively to improve the predicted poses. To further enhance the correspondence between the object pose and the relative pose, we propose a novel consistency loss to explicitly measure the deviations between them. Finally, we apply Holistic Pose Estimation (HPE) to jointly evaluate both the independent object pose and the relative pose. 
Our experiments on the SUN RGB-D dataset demonstrate that the proposed method provides a significant improvement on 3D object prediction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiao_Holistic_Pose_Graph_Modeling_Geometric_Structure_Among_Objects_in_a_ICCV_2021_paper.pdf", @@ -18651,7 +19914,8 @@ "aff_campus_unique_index": "0+0;0+0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiao_2021_ICCV,\n \n author = {\n Xiao,\n Jiwei and Wang,\n Ruiping and Chen,\n Xilin\n},\n title = {\n Holistic Pose Graph: Modeling Geometric Structure Among Objects in a Scene Using Graph Inference for 3D Object Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12717-12726\n} \n}" }, { "title": "Homogeneous Architecture Augmentation for Neural Predictor", @@ -18659,6 +19923,7 @@ "status": "Poster", "track": "main", "pid": 7320, + "author_site": "Yuqiao Liu; Yehui Tang; Yanan Sun", "author": "Yuqiao Liu; Yehui Tang; Yanan Sun", "abstract": "Neural Architecture Search (NAS) can automatically design well-performed architectures of Deep Neural Networks (DNNs) for the tasks at hand. However, one bottleneck of NAS is the prohibitively computational cost largely due to the expensive performance evaluation. The neural predictors can directly estimate the performance without any training of the DNNs to be evaluated, thus have drawn increasing attention from researchers. Despite their popularity, they also suffer a severe limitation: the shortage of annotated DNN architectures for effectively training the neural predictors. In this paper, we proposed Homogeneous Architecture Augmentation for Neural Predictor (HAAP) of DNN architectures to address the issue aforementioned. 
Specifically, a homogeneous architecture augmentation algorithm is proposed in HAAP to generate sufficient training data taking the use of homogeneous representation. Furthermore, the one-hot encoding strategy is introduced into HAAP to make the representation of DNN architectures more effective. The experiments have been conducted on both NAS-Benchmark-101 and NAS-Bench-201 dataset. The experimental results demonstrate that the proposed HAAP algorithm outperforms the state of the arts compared, yet with much less training data. In addition, the ablation studies on both benchmark datasets have also shown the universality of the homogeneous architecture augmentation. Our code has been made available at https://github.com/lyq998/HAAP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Homogeneous_Architecture_Augmentation_for_Neural_Predictor_ICCV_2021_paper.pdf", @@ -18682,7 +19947,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yuqiao and Tang,\n Yehui and Sun,\n Yanan\n},\n title = {\n Homogeneous Architecture Augmentation for Neural Predictor\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12249-12258\n} \n}" }, { "title": "How Shift Equivariance Impacts Metric Learning for Instance Segmentation", @@ -18690,7 +19956,8 @@ "status": "Poster", "track": "main", "pid": 1997, - "author": "Josef Lorenz Rumberger; Xiaoyan Yu; Peter Hirsch; Melanie Dohmen; Vanessa Emanuela Guarino; Ashkan Mokarian; Lisa Mais; Jan Funke; Dagmar Kainm\u00fcller", + "author_site": "Josef Lorenz Rumberger; Xiaoyan Yu; Peter Hirsch; Melanie Dohmen; Vanessa Emanuela Guarino; Ashkan Mokarian; Lisa Mais; Jan Funke; Dagmar Kainmüller", + "author": "Josef Lorenz Rumberger; Xiaoyan Yu; 
Peter Hirsch; Melanie Dohmen; Vanessa Emanuela Guarino; Ashkan Mokarian; Lisa Mais; Jan Funke; Dagmar Kainmüller", "abstract": "Metric learning has received conflicting assessments concerning its suitability for solving instance segmentation tasks. It has been dismissed as theoretically flawed due to the shift equivariance of the employed CNNs and their respective inability to distinguish same-looking objects. Yet it has been shown to yield state of the art results for a variety of tasks, and practical issues have mainly been reported in the context of tile-and-stitch approaches, where discontinuities at tile boundaries have been observed. To date, neither of the reported issues have undergone thorough formal analysis. In our work, we contribute a comprehensive formal analysis of the shift equivariance properties of encoder-decoder-style CNNs, which yields a clear picture of what can and cannot be achieved with metric learning in the face of same-looking objects. In particular, we prove that a standard encoder-decoder network that takes d-dimensional images as input, with l pooling layers and pooling factor f, has the capacity to distinguish at most f^(dl) same-looking objects, and we show that this upper limit can be reached. Furthermore, we show that to avoid discontinuities in a tile-and-stitch approach, assuming standard batch size 1, it is necessary to employ valid convolutions in combination with a training output window size strictly greater than f^l, while at test-time it is necessary to crop tiles to size n * f^l before stitching, with n >= 1. 
We complement these theoretical findings by discussing a number of insightful special cases for which we show empirical results on synthetic and real data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rumberger_How_Shift_Equivariance_Impacts_Metric_Learning_for_Instance_Segmentation_ICCV_2021_paper.pdf", "aff": ";;;;;;;;", @@ -18704,7 +19971,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rumberger_How_Shift_Equivariance_Impacts_Metric_Learning_for_Instance_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rumberger_How_Shift_Equivariance_Impacts_Metric_Learning_for_Instance_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rumberger_2021_ICCV,\n \n author = {\n Rumberger,\n Josef Lorenz and Yu,\n Xiaoyan and Hirsch,\n Peter and Dohmen,\n Melanie and Guarino,\n Vanessa Emanuela and Mokarian,\n Ashkan and Mais,\n Lisa and Funke,\n Jan and Kainm\\"uller,\n Dagmar\n},\n title = {\n How Shift Equivariance Impacts Metric Learning for Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7128-7136\n} \n}" }, { "title": "How To Design a Three-Stage Architecture for Audio-Visual Active Speaker Detection in the Wild", @@ -18712,7 +19980,8 @@ "status": "Poster", "track": "main", "pid": 6101, - "author": "Okan K\u00f6p\u00fckl\u00fc; Maja Taseska; Gerhard Rigoll", + "author_site": "Okan Köpüklü; Maja Taseska; Gerhard Rigoll", + "author": "Okan Köpüklü; Maja Taseska; Gerhard Rigoll", "abstract": "Successful active speaker detection requires a three-stage pipeline: (i) audio-visual encoding for all speakers in the clip, (ii) inter-speaker relation modeling between a reference speaker and the background speakers within each frame, and (iii) temporal modeling for the reference speaker. 
Each stage of this pipeline plays an important role for the final performance of the created architecture. Based on a series of controlled experiments, this work presents several practical guidelines for audio-visual active speaker detection. Correspondingly, we present a new architecture called ASDNet, which achieves a new state-of-the-art on the AVA-ActiveSpeaker dataset with a mAP of 93.5% outperforming the second best with a large margin of 4.7%. Our code and pretrained models are publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kopuklu_How_To_Design_a_Three-Stage_Architecture_for_Audio-Visual_Active_Speaker_ICCV_2021_paper.pdf", "aff": "Technical University of Munich; Microsoft Corporation; Technical University of Munich", @@ -18728,14 +19997,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kopuklu_How_To_Design_a_Three-Stage_Architecture_for_Audio-Visual_Active_Speaker_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Technical University of Munich;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "Technical University of Munich;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.microsoft.com", "aff_unique_abbr": "TUM;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Kopuklu_2021_ICCV,\n \n author = {\n K\\"op\\"ukl\\"u,\n Okan and Taseska,\n Maja and Rigoll,\n Gerhard\n},\n title = {\n How To Design a Three-Stage Architecture for Audio-Visual Active Speaker Detection in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1193-1203\n} \n}" }, { "title": "How To Train Neural Networks for Flare 
Removal", @@ -18743,6 +20013,7 @@ "status": "Poster", "track": "main", "pid": 3508, + "author_site": "Yicheng Wu; Qiurui He; Tianfan Xue; Rahul Garg; Jiawen Chen; Ashok Veeraraghavan; Jonathan T. Barron", "author": "Yicheng Wu; Qiurui He; Tianfan Xue; Rahul Garg; Jiawen Chen; Ashok Veeraraghavan; Jonathan T. Barron", "abstract": "When a camera is pointed at a strong light source, the resulting photograph may contain lens flare artifacts. Flares appear in a wide variety of patterns (halos, streaks, color bleeding, haze, etc.) and this diversity in appearance makes flare removal challenging. Existing analytical solutions make strong assumptions about the artifact's geometry or brightness, and therefore only work well on a small subset of flares. Machine learning techniques have shown success in removing other types of artifacts, like reflections, but have not been widely applied to flare removal due to the lack of training data. To solve this problem, we explicitly model the optical causes of flare either empirically or using wave optics, and generate semi-synthetic pairs of flare-corrupted and clean images. This enables us to train neural networks to remove lens flare for the first time. 
Experiments show our data synthesis approach is critical for accurate flare removal, and that models trained with our technique generalize well to real lens flares across different scenes, lighting conditions, and cameras.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_How_To_Train_Neural_Networks_for_Flare_Removal_ICCV_2021_paper.pdf", @@ -18759,14 +20030,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_How_To_Train_Neural_Networks_for_Flare_Removal_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;2;0;1", - "aff_unique_norm": "Rice University;Google;Adobe", - "aff_unique_dep": ";Google Research;Adobe Inc.", + "aff_unique_norm": "Rice University;Google;Adobe Inc.", + "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.rice.edu;https://research.google;https://www.adobe.com", "aff_unique_abbr": "Rice;Google Research;Adobe", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Yicheng and He,\n Qiurui and Xue,\n Tianfan and Garg,\n Rahul and Chen,\n Jiawen and Veeraraghavan,\n Ashok and Barron,\n Jonathan T.\n},\n title = {\n How To Train Neural Networks for Flare Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2239-2247\n} \n}" }, { "title": "HuMoR: 3D Human Motion Model for Robust Pose Estimation", @@ -18774,6 +20046,7 @@ "status": "Poster", "track": "main", "pid": 2793, + "author_site": "Davis Rempe; Tolga Birdal; Aaron Hertzmann; Jimei Yang; Srinath Sridhar; Leonidas J. Guibas", "author": "Davis Rempe; Tolga Birdal; Aaron Hertzmann; Jimei Yang; Srinath Sridhar; Leonidas J. 
Guibas", "abstract": "We introduce HuMoR: a 3D Human Motion Model for Robust Estimation of temporal pose and shape. Though substantial progress has been made in estimating 3D human motion and shape from dynamic observations, recovering plausible pose sequences in the presence of noise and occlusions remains a challenge. For this purpose, we propose an expressive generative model in the form of a conditional variational autoencoder, which learns a distribution of the change in pose at each step of a motion sequence. Furthermore, we introduce a flexible optimization-based approach that leverages HuMoR as a motion prior to robustly estimate plausible pose and shape from ambiguous observations. Through extensive evaluations, we demonstrate that our model generalizes to diverse motions and body shapes after training on a large motion capture dataset, and enables motion reconstruction from multiple input modalities including 3D keypoints and RGB(-D) videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rempe_HuMoR_3D_Human_Motion_Model_for_Robust_Pose_Estimation_ICCV_2021_paper.pdf", @@ -18788,7 +20061,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rempe_HuMoR_3D_Human_Motion_Model_for_Robust_Pose_Estimation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rempe_HuMoR_3D_Human_Motion_Model_for_Robust_Pose_Estimation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rempe_2021_ICCV,\n \n author = {\n Rempe,\n Davis and Birdal,\n Tolga and Hertzmann,\n Aaron and Yang,\n Jimei and Sridhar,\n Srinath and Guibas,\n Leonidas J.\n},\n title = {\n HuMoR: 3D Human Motion Model for Robust Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11488-11499\n} \n}" }, { "title": "Human Detection and Segmentation via Multi-View 
Consensus", @@ -18796,7 +20070,8 @@ "status": "Poster", "track": "main", "pid": 1832, - "author": "Isinsu Katircioglu; Helge Rhodin; J\u00f6rg Sp\u00f6rri; Mathieu Salzmann; Pascal Fua", + "author_site": "Isinsu Katircioglu; Helge Rhodin; Jörg Spörri; Mathieu Salzmann; Pascal Fua", + "author": "Isinsu Katircioglu; Helge Rhodin; Jörg Spörri; Mathieu Salzmann; Pascal Fua", "abstract": "Self-supervised detection and segmentation of foreground objects aims for accuracy without annotated training data. However, existing approaches predominantly rely on restrictive assumptions on appearance and motion. For scenes with dynamic activities and camera motion, we propose a multi-camera framework in which geometric constraints are embedded in the form of multi-view consistency during training via coarse 3D localization in a voxel grid and fine-grained offset regression. In this manner, we learn a joint distribution of proposals over multiple views. At inference time, our method operates on single RGB images. 
We outperform state-of-the-art techniques both on images that visually depart from those of standard benchmarks and on those of the classical Human3.6M dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Katircioglu_Human_Detection_and_Segmentation_via_Multi-View_Consensus_ICCV_2021_paper.pdf", "aff": ";;;;", @@ -18810,7 +20085,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Katircioglu_Human_Detection_and_Segmentation_via_Multi-View_Consensus_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Katircioglu_Human_Detection_and_Segmentation_via_Multi-View_Consensus_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Katircioglu_2021_ICCV,\n \n author = {\n Katircioglu,\n Isinsu and Rhodin,\n Helge and Sp\\"orri,\n J\\"org and Salzmann,\n Mathieu and Fua,\n Pascal\n},\n title = {\n Human Detection and Segmentation via Multi-View Consensus\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2855-2864\n} \n}" }, { "title": "Human Pose Regression With Residual Log-Likelihood Estimation", @@ -18818,6 +20094,7 @@ "status": "Poster", "track": "main", "pid": 1042, + "author_site": "Jiefeng Li; Siyuan Bian; Ailing Zeng; Can Wang; Bo Pang; Wentao Liu; Cewu Lu", "author": "Jiefeng Li; Siyuan Bian; Ailing Zeng; Can Wang; Bo Pang; Wentao Liu; Cewu Lu", "abstract": "Heatmap-based methods dominate in the field of human pose estimation by modelling the output distribution through likelihood heatmaps. In contrast, regression-based methods are more efficient but suffer from inferior performance. In this work, we explore maximum likelihood estimation (MLE) to develop an efficient and effective regression-based method. From the perspective of MLE, adopting different regression losses is making different assumptions about the output density function. 
A density function closer to the true distribution leads to a better regression performance. In light of this, we propose a novel regression paradigm with Residual Log-likelihood Estimation (RLE) to capture the underlying output distribution. Concretely, RLE learns the change of the distribution instead of the unreferenced underlying distribution to facilitate the training process. With the proposed reparameterization design, our method is compatible with off-the-shelf flow models. The proposed method is effective, efficient and flexible. We show its potential in various human pose estimation tasks with comprehensive experiments. Compared to the conventional regression paradigm, regression with RLE bring 12.4 mAP improvement on MSCOCO without any test-time overhead. Moreover, for the first time, especially on multi-person pose estimation, our regression method is superior to the heatmap-based methods. Our code is available at https://github.com/Jeff-sjtu/res-loglikelihood-regression.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Human_Pose_Regression_With_Residual_Log-Likelihood_Estimation_ICCV_2021_paper.pdf", @@ -18834,14 +20111,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Human_Pose_Regression_With_Residual_Log-Likelihood_Estimation_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0;2;0+3+4+5+6", - "aff_unique_norm": "Shanghai Jiao Tong University;Chinese University of Hong Kong;SenseTime;Qing Yuan Research Institute;Qi Zhi Institute;MoE Key Lab of Artificial Intelligence;AI Institute", + "aff_unique_norm": "Shanghai Jiao Tong University;The Chinese University of Hong Kong;SenseTime;Qing Yuan Research Institute;Qi Zhi Institute;MoE Key Lab of Artificial Intelligence;AI Institute", "aff_unique_dep": ";;SenseTime Research;;;Artificial Intelligence;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.cuhk.edu.hk;https://www.sensetime.com;;;;", "aff_unique_abbr": "SJTU;CUHK;SenseTime;;;;", 
"aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0+0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Jiefeng and Bian,\n Siyuan and Zeng,\n Ailing and Wang,\n Can and Pang,\n Bo and Liu,\n Wentao and Lu,\n Cewu\n},\n title = {\n Human Pose Regression With Residual Log-Likelihood Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11025-11034\n} \n}" }, { "title": "Human Trajectory Prediction via Counterfactual Analysis", @@ -18849,6 +20127,7 @@ "status": "Poster", "track": "main", "pid": 2587, + "author_site": "Guangyi Chen; Junlong Li; Jiwen Lu; Jie Zhou", "author": "Guangyi Chen; Junlong Li; Jiwen Lu; Jie Zhou", "abstract": "Forecasting human trajectories in complex dynamic environments plays a critical role in autonomous vehicles and intelligent robots. Most existing methods learn to predict future trajectories by behavior clues from history trajectories and interaction clues from environments. However, the inherent bias between training and deployment environments is ignored. Hence, we propose a counterfactual analysis method for human trajectory prediction to investigate the causality between the predicted trajectories and input clues and alleviate the negative effects brought by environment bias. We first build a causal graph for trajectory forecasting with history trajectory, future trajectory, and the environment interactions. Then, we cut off the inference from the environment to trajectory by constructing the counterfactual intervention on the trajectory itself. Finally, we compare the factual and counterfactual trajectory clues to alleviate the effects of environment bias and highlight the trajectory clues. 
Our counterfactual analysis is a plug-and-play module that can be applied to any baseline prediction methods including RNN- and CNN-based ones. We show that our method achieves consistent improvement for different baselines and obtains state-of-the-art results on public pedestrian trajectory forecasting benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Human_Trajectory_Prediction_via_Counterfactual_Analysis_ICCV_2021_paper.pdf", @@ -18872,7 +20151,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Guangyi and Li,\n Junlong and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Human Trajectory Prediction via Counterfactual Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9824-9833\n} \n}" }, { "title": "Hybrid Neural Fusion for Full-Frame Video Stabilization", @@ -18880,6 +20160,7 @@ "status": "Poster", "track": "main", "pid": 6214, + "author_site": "Yu-Lun Liu; Wei-Sheng Lai; Ming-Hsuan Yang; Yung-Yu Chuang; Jia-Bin Huang", "author": "Yu-Lun Liu; Wei-Sheng Lai; Ming-Hsuan Yang; Yung-Yu Chuang; Jia-Bin Huang", "abstract": "Existing video stabilization methods often generate visible distortion or require aggressive cropping of frame boundaries, resulting in smaller field of views. In this work, we present a frame synthesis algorithm to achieve full-frame video stabilization. We first estimate dense warp fields from neighboring frames and then synthesize the stabilized frame by fusing the warped contents. Our core technical novelty lies in the learning-based hybrid-space fusion that alleviates artifacts caused by optical flow inaccuracy and fast-moving objects. 
We validate the effectiveness of our method on the NUS, selfie, and DeepStab video datasets. Extensive experiment results demonstrate the merits of our approach over prior video stabilization methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Hybrid_Neural_Fusion_for_Full-Frame_Video_Stabilization_ICCV_2021_paper.pdf", @@ -18897,13 +20178,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Hybrid_Neural_Fusion_for_Full-Frame_Video_Stabilization_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3+4;2", "aff_unique_norm": "National Taiwan University;Google;Virginia Tech;University of California, Merced;Yonsei University", - "aff_unique_dep": ";Google;;;", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.ntu.edu.tw;https://www.google.com;https://www.vt.edu;https://www.ucmerced.edu;https://www.yonsei.ac.kr", "aff_unique_abbr": "NTU;Google;VT;UCM;Yonsei", "aff_campus_unique_index": "0;1;3", "aff_campus_unique": "Taiwan;Mountain View;;Merced", "aff_country_unique_index": "0;1;1;1+2;1", - "aff_country_unique": "China;United States;South Korea" + "aff_country_unique": "China;United States;South Korea", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yu-Lun and Lai,\n Wei-Sheng and Yang,\n Ming-Hsuan and Chuang,\n Yung-Yu and Huang,\n Jia-Bin\n},\n title = {\n Hybrid Neural Fusion for Full-Frame Video Stabilization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2299-2308\n} \n}" }, { "title": "Hypercorrelation Squeeze for Few-Shot Segmentation", @@ -18911,6 +20193,7 @@ "status": "Poster", "track": "main", "pid": 5861, + "author_site": "Juhong Min; Dahyun Kang; Minsu Cho", "author": "Juhong Min; Dahyun Kang; Minsu Cho", "abstract": "Few-shot semantic segmentation aims at learning to segment a target object from a query image using only a few annotated support images of the target 
class. This challenging task requires to understand diverse levels of visual cues and analyze fine-grained correspondence relations between the query and the support images. To address the problem, we propose Hypercorrelation Squeeze Networks (HSNet) that leverages multi-level feature correlation and efficient 4D convolutions. It extracts diverse features from different levels of intermediate convolutional layers and constructs a collection of 4D correlation tensors, i.e., hypercorrelations. Using efficient center-pivot 4D convolutions in a pyramidal architecture, the method gradually squeezes high-level semantic and low-level geometric cues of the hypercorrelation into precise segmentation masks in coarse-to-fine manner. The significant performance improvements on standard few-shot segmentation benchmarks of PASCAL-5i, COCO-20i, and FSS-1000 verify the efficacy of the proposed method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Min_Hypercorrelation_Squeeze_for_Few-Shot_Segmentation_ICCV_2021_paper.pdf", @@ -18925,7 +20208,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Min_Hypercorrelation_Squeeze_for_Few-Shot_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Min_Hypercorrelation_Squeeze_for_Few-Shot_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Min_2021_ICCV,\n \n author = {\n Min,\n Juhong and Kang,\n Dahyun and Cho,\n Minsu\n},\n title = {\n Hypercorrelation Squeeze for Few-Shot Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6941-6952\n} \n}" }, { "title": "Hypergraph Neural Networks for Hypergraph Matching", @@ -18933,6 +20217,7 @@ "status": "Poster", "track": "main", "pid": 9493, + "author_site": "Xiaowei Liao; Yong Xu; Haibin Ling", "author": "Xiaowei Liao; Yong Xu; Haibin 
Ling", "abstract": "Hypergraph matching is a useful tool to find feature correspondence by considering higher-order structural information. Recently, the employment of deep learning has made great progress in the matching of graphs, suggesting its potential for hypergraphs. Hence, in this paper, we present the first, to our best knowledge, unified hypergraph neural network (HNN) solution for hypergraph matching. Specifically, given two hypergraphs to be matched, we first construct an association hypergraph over them and convert the hypergraph matching problem into a node classification problem on the association hypergraph. Then, we design a novel hypergraph neural network to effectively solve the node classification problem. Being end-to-end trainable, our proposed method, named HNN-HM, jointly learns all its components with improved optimization. For evaluation, HNN-HM is tested on various benchmarks and shows a clear advantage over state-of-the-arts.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liao_Hypergraph_Neural_Networks_for_Hypergraph_Matching_ICCV_2021_paper.pdf", @@ -18949,14 +20234,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liao_Hypergraph_Neural_Networks_for_Hypergraph_Matching_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;0+1+2;3", - "aff_unique_norm": "South China University of Technology;Pengcheng Laboratory;Guangdong Communication and Computer Network Laboratory;Stony Brook University", - "aff_unique_dep": "School of Computer Science & Engineering;Peng Cheng Laboratory;Communication and Computer Network Laboratory;Department of Computer Science", + "aff_unique_norm": "South China University of Technology;Peng Cheng Laboratory;Guangdong Communication and Computer Network Laboratory;Stony Brook University", + "aff_unique_dep": "School of Computer Science & Engineering;;Communication and Computer Network Laboratory;Department of Computer Science", "aff_unique_url": 
"https://www.scut.edu.cn;;;https://www.stonybrook.edu", "aff_unique_abbr": "SCUT;;;SBU", "aff_campus_unique_index": "0+1;0+1;3", "aff_campus_unique": "Guangzhou;Shenzhen;;Stony Brook", "aff_country_unique_index": "0+0+0;0+0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liao_2021_ICCV,\n \n author = {\n Liao,\n Xiaowei and Xu,\n Yong and Ling,\n Haibin\n},\n title = {\n Hypergraph Neural Networks for Hypergraph Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1266-1275\n} \n}" }, { "title": "Hypersim: A Photorealistic Synthetic Dataset for Holistic Indoor Scene Understanding", @@ -18964,6 +20250,7 @@ "status": "Poster", "track": "main", "pid": 8225, + "author_site": "Mike Roberts; Jason Ramapuram; Anurag Ranjan; Atulit Kumar; Miguel Angel Bautista; Nathan Paczan; Russ Webb; Joshua M. Susskind", "author": "Mike Roberts; Jason Ramapuram; Anurag Ranjan; Atulit Kumar; Miguel Angel Bautista; Nathan Paczan; Russ Webb; Joshua M. Susskind", "abstract": "For many fundamental scene understanding tasks, it is difficult or impossible to obtain per-pixel ground truth labels from real images. We address this challenge by introducing Hypersim, a photorealistic synthetic dataset for holistic indoor scene understanding. To create our dataset, we leverage a large repository of synthetic scenes created by professional artists, and we generate 77,400 images of 461 indoor scenes with detailed per-pixel labels and corresponding ground truth geometry. 
Our dataset: (1) relies exclusively on publicly available 3D assets; (2) includes complete scene geometry, material information, and lighting information for every scene; (3) includes dense per-pixel semantic instance segmentations and complete camera information for every image; and (4) factors every image into diffuse reflectance, diffuse illumination, and a non-diffuse residual term that captures view-dependent lighting effects. We analyze our dataset at the level of scenes, objects, and pixels, and we analyze costs in terms of money, computation time, and annotation effort. Remarkably, we find that it is possible to generate our entire dataset from scratch, for roughly half the cost of training a popular open-source natural language processing model. We also evaluate sim-to-real transfer performance on two real-world scene understanding tasks - semantic segmentation and 3D shape prediction - where we find that pre-training on our dataset significantly improves performance on both tasks, and achieves state-of-the-art performance on the most challenging Pix3D test set. 
All of our rendered image data, as well as all the code we used to generate our dataset and perform our experiments, is available online.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Roberts_Hypersim_A_Photorealistic_Synthetic_Dataset_for_Holistic_Indoor_Scene_Understanding_ICCV_2021_paper.pdf", @@ -18978,7 +20265,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Roberts_Hypersim_A_Photorealistic_Synthetic_Dataset_for_Holistic_Indoor_Scene_Understanding_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Roberts_Hypersim_A_Photorealistic_Synthetic_Dataset_for_Holistic_Indoor_Scene_Understanding_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Roberts_2021_ICCV,\n \n author = {\n Roberts,\n Mike and Ramapuram,\n Jason and Ranjan,\n Anurag and Kumar,\n Atulit and Bautista,\n Miguel Angel and Paczan,\n Nathan and Webb,\n Russ and Susskind,\n Joshua M.\n},\n title = {\n Hypersim: A Photorealistic Synthetic Dataset for Holistic Indoor Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10912-10922\n} \n}" }, { "title": "Hyperspectral Image Denoising With Realistic Data", @@ -18986,10 +20274,11 @@ "status": "Poster", "track": "main", "pid": 6027, + "author_site": "Tao Zhang; Ying Fu; Cheng Li", "author": "Tao Zhang; Ying Fu; Cheng Li", "abstract": "The hyperspectral image (HSI) denoising has been widely utilized to improve HSI qualities. Recently, learning-based HSI denoising methods have shown their effectiveness, but most of them are based on synthetic dataset and lack the generalization capability on real testing HSI. Moreover, there is still no public paired real HSI denoising dataset to learn HSI denoising network and quantitatively evaluate HSI methods. 
In this paper, we mainly focus on how to produce realistic dataset for learning and evaluating HSI denoising network. On the one hand, we collect a paired real HSI denoising dataset, which consists of shortexposure noisy HSIs and the corresponding long-exposure clean HSIs. On the other hand, we propose an accurate HSI noise model which matches the distribution of real data well and can be employed to synthesize realistic dataset. On the basis of the noise model, we present an approach to calibrate the noise parameters of the given hyperspectral camera. The extensive experimental results show that a network learned with only synthetic data generated by our noise model performs as well as it is learned with paired real data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Hyperspectral_Image_Denoising_With_Realistic_Data_ICCV_2021_paper.pdf", - "aff": "Beijing Institute of Technology; Beijing Institute of Technology; Huawei Noah\u2019s Ark Lab", + "aff": "Beijing Institute of Technology; Beijing Institute of Technology; Huawei Noah’s Ark Lab", "project": "", "github": "https://github.com/ColinTaoZhang/HSIDwRD", "supp": "", @@ -19003,13 +20292,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Hyperspectral_Image_Denoising_With_Realistic_Data_ICCV_2021_paper.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Beijing Institute of Technology;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "http://www.bit.edu.cn/;https://www.huawei.com", "aff_unique_abbr": "BIT;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Tao and Fu,\n Ying and Li,\n Cheng\n},\n title = {\n Hyperspectral Image Denoising With Realistic Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2248-2257\n} \n}" }, { "title": "I2UV-HandNet: Image-to-UV Prediction Network for Accurate and High-Fidelity 3D Hand Mesh Modeling", @@ -19017,6 +20307,7 @@ "status": "Poster", "track": "main", "pid": 3321, + "author_site": "Ping Chen; Yujin Chen; Dong Yang; Fangyin Wu; Qin Li; Qingpei Xia; Yong Tan", "author": "Ping Chen; Yujin Chen; Dong Yang; Fangyin Wu; Qin Li; Qingpei Xia; Yong Tan", "abstract": "Reconstructing a high-precision and high-fidelity 3D human hand from a color image plays a central role in replicating a realistic virtual hand in human-computer interaction and virtual reality applications. Current methods are lacking in accuracy and fidelity due to various hand poses and severe occlusions. In this study, we propose an I2UV-HandNet model for accurate hand pose and shape estimation as well as 3D hand super-resolution reconstruction. Specifically, we present the first UV-based 3D hand shape representation. To recover a 3D hand mesh from an RGB image, we design an AffineNet to predict a UV position map from the input in an image-to-image translation fashion. To obtain a higher fidelity shape, we exploit an additional SRNet to transform the low-resolution UV map outputted by AffineNet into a high-resolution one. For the first time, we demonstrate the characterization capability of the UV-based hand shape representation. 
Our experiments show that the proposed method achieves state-of-the-art performance on several challenging benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_I2UV-HandNet_Image-to-UV_Prediction_Network_for_Accurate_and_High-Fidelity_3D_Hand_ICCV_2021_paper.pdf", @@ -19040,7 +20331,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;0;0;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Ping and Chen,\n Yujin and Yang,\n Dong and Wu,\n Fangyin and Li,\n Qin and Xia,\n Qingpei and Tan,\n Yong\n},\n title = {\n I2UV-HandNet: Image-to-UV Prediction Network for Accurate and High-Fidelity 3D Hand Mesh Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12929-12938\n} \n}" }, { "title": "ICE: Inter-Instance Contrastive Encoding for Unsupervised Person Re-Identification", @@ -19048,10 +20340,11 @@ "status": "Poster", "track": "main", "pid": 3303, - "author": "Hao Chen; Benoit Lagadec; Fran\u00e7ois Bremond", + "author_site": "Hao Chen; Benoit Lagadec; François Bremond", + "author": "Hao Chen; Benoit Lagadec; François Bremond", "abstract": "Unsupervised person re-identification (ReID) aims at learning discriminative identity features without annotations. Recently, self-supervised contrastive learning has gained increasing attention for its effectiveness in unsupervised representation learning. The main idea of instance contrastive learning is to match a same instance in different augmented views. However, the relationship between different instances has not been fully explored in previous contrastive methods, especially for instance-level contrastive loss. 
To address this issue, we propose Inter-instance Contrastive Encoding (ICE) that leverages inter-instance pairwise similarity scores to boost previous class-level contrastive ReID methods. We first use pairwise similarity ranking as one-hot hard pseudo labels for hard instance contrast, which aims at reducing intra-class variance. Then, we use similarity scores as soft pseudo labels to enhance the consistency between augmented and original views, which makes our model more robust to augmentation perturbations. Experiments on several large-scale person ReID datasets validate the effectiveness of our proposed unsupervised method ICE, which is competitive with even supervised methods. Code is made available at https://github.com/chenhao2345/ICE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_ICE_Inter-Instance_Contrastive_Encoding_for_Unsupervised_Person_Re-Identification_ICCV_2021_paper.pdf", - "aff": "Inria+Universit \u00b4e C\u02c6ote d\u2019Azur+European Systems Integration; European Systems Integration; Inria+Universit \u00b4e C\u02c6ote d\u2019Azur", + "aff": "Inria+Universit ´e Cˆote d’Azur+European Systems Integration; European Systems Integration; Inria+Universit ´e Cˆote d’Azur", "project": "", "github": "https://github.com/chenhao2345/ICE", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Chen_ICE_Inter-Instance_Contrastive_ICCV_2021_supplemental.pdf", @@ -19064,14 +20357,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_ICE_Inter-Instance_Contrastive_Encoding_for_Unsupervised_Person_Re-Identification_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;2;0+1", - "aff_unique_norm": "INRIA;Universit\u00e9 C\u00f4te d\u2019Azur;European Systems Integration", + "aff_unique_norm": "Inria;Université Côte d’Azur;European Systems Integration", "aff_unique_dep": ";;", - "aff_unique_url": "https://www.inria.fr;https://www.unice.fr;", - "aff_unique_abbr": "Inria;UniCoast;", + 
"aff_unique_url": "https://www.inria.fr;https://www.univ-cotedazur.fr;", + "aff_unique_abbr": "Inria;UCA;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;1;0+0", - "aff_country_unique": "France;Unknown" + "aff_country_unique": "France;Unknown", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Hao and Lagadec,\n Benoit and Bremond,\n Fran\\c{c\n}ois\n},\n title = {\n ICE: Inter-Instance Contrastive Encoding for Unsupervised Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14960-14969\n} \n}" }, { "title": "ICON: Learning Regular Maps Through Inverse Consistency", @@ -19079,7 +20373,8 @@ "status": "Poster", "track": "main", "pid": 10828, - "author": "Hastings Greer; Roland Kwitt; Fran\u00e7ois-Xavier Vialard; Marc Niethammer", + "author_site": "Hastings Greer; Roland Kwitt; François-Xavier Vialard; Marc Niethammer", + "author": "Hastings Greer; Roland Kwitt; François-Xavier Vialard; Marc Niethammer", "abstract": "Learning maps between data samples is fundamental. Applications range from representation learning, image translation and generative modeling, to the estimation of spatial deformations. Such maps relate feature vectors, or map between feature spaces. Well-behaved maps should be regular, which can be imposed explicitly or may emanate from the data itself. We explore what induces regularity for spatial transformations, e.g., when computing image registrations. Classical optimization-based models compute maps between pairs of samples and rely on an appropriate regularizer for well-posedness. Recent deep learning approaches have attempted to avoid using such regularizers altogether by relying on the sample population instead. 
We explore if it is possible to obtain spatial regularity using an inverse consistency loss only and elucidate what explains map regularity in such a context. We find that deep networks combined with an inverse consistency loss and randomized off-grid interpolation yield well behaved, approximately diffeomorphic, spatial transformations. Despite the simplicity of this approach, our experiments present compelling evidence, on both synthetic and real data, that regular maps can be obtained without carefully tuned explicit regularizers and competitive registration performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Greer_ICON_Learning_Regular_Maps_Through_Inverse_Consistency_ICCV_2021_paper.pdf", "aff": ";;;", @@ -19093,7 +20388,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Greer_ICON_Learning_Regular_Maps_Through_Inverse_Consistency_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Greer_ICON_Learning_Regular_Maps_Through_Inverse_Consistency_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Greer_2021_ICCV,\n \n author = {\n Greer,\n Hastings and Kwitt,\n Roland and Vialard,\n Fran\\c{c\n}ois-Xavier and Niethammer,\n Marc\n},\n title = {\n ICON: Learning Regular Maps Through Inverse Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3396-3405\n} \n}" }, { "title": "ID-Reveal: Identity-Aware DeepFake Video Detection", @@ -19101,10 +20397,11 @@ "status": "Poster", "track": "main", "pid": 10314, - "author": "Davide Cozzolino; Andreas R\u00f6ssler; Justus Thies; Matthias Nie\u00dfner; Luisa Verdoliva", + "author_site": "Davide Cozzolino; Andreas Rössler; Justus Thies; Matthias Nießner; Luisa Verdoliva", + "author": "Davide Cozzolino; Andreas Rössler; Justus Thies; Matthias Nießner; Luisa Verdoliva", "abstract": "A 
major challenge in DeepFake forgery detection is that state-of-the-art algorithms are mostly trained to detect a specific fake method. As a result, these approaches show poor generalization across different types of facial manipulations, e.g., from face swapping to facial reenactment. To this end, we introduce ID-Reveal, a new approach that learns temporal facial features, specific of how a person moves while talking, by means of metric learning coupled with an adversarial training strategy. The advantage is that we do not need any training data of fakes, but only train on real videos. Moreover, we utilize high-level semantic features, which enables robustess to widespread and disruptive forms of post-processing. We perform a thorough experimental analysis on several publicly available benchmarks. Compared to state of the art, our method improves generalization and is more robust to low-quality videos, that are usually spread over social networks. In particular, we obtain an average improvement of more than 15% in terms of accuracy for facial reenactment on high compressed videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cozzolino_ID-Reveal_Identity-Aware_DeepFake_Video_Detection_ICCV_2021_paper.pdf", - "aff": "University Federico II of Naples; Technical University of Munich; Technical University of Munich + Max Planck Institute for Intelligent Systems, T \u00a8ubingen; Technical University of Munich; University Federico II of Naples", + "aff": "University Federico II of Naples; Technical University of Munich; Technical University of Munich + Max Planck Institute for Intelligent Systems, T ¨ubingen; Technical University of Munich; University Federico II of Naples", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Cozzolino_ID-Reveal_Identity-Aware_DeepFake_ICCV_2021_supplemental.pdf", @@ -19122,9 +20419,10 @@ "aff_unique_url": "https://www.unina.it;https://www.tum.de;https://www.mpi-is.mpg.de", 
"aff_unique_abbr": "UNINA;TUM;MPI-IS", "aff_campus_unique_index": "1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;1;1+1;1;0", - "aff_country_unique": "Italy;Germany" + "aff_country_unique": "Italy;Germany", + "bibtex": "@InProceedings{Cozzolino_2021_ICCV,\n \n author = {\n Cozzolino,\n Davide and R\\"ossler,\n Andreas and Thies,\n Justus and Nie{\\ss\n}ner,\n Matthias and Verdoliva,\n Luisa\n},\n title = {\n ID-Reveal: Identity-Aware DeepFake Video Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15108-15117\n} \n}" }, { "title": "IDARTS: Interactive Differentiable Architecture Search", @@ -19132,6 +20430,7 @@ "status": "Poster", "track": "main", "pid": 8682, + "author_site": "Song Xue; Runqi Wang; Baochang Zhang; Tian Wang; Guodong Guo; David Doermann", "author": "Song Xue; Runqi Wang; Baochang Zhang; Tian Wang; Guodong Guo; David Doermann", "abstract": "Differentiable Architecture Search (DARTS) improves the efficiency of architecture search by learning the architecture and network parameters end-to-end. However, the intrinsic relationship between the architecture's parameters is neglected, leading to a sub-optimal optimization process. The reason lies in the fact that the gradient descent method used in DARTS ignores the coupling relationship of the parameters and therefore degrades the optimization. In this paper, we address this issue by formulating DARTS as a bilinear optimization problem and introducing an Interactive Differentiable Architecture Search (IDARTS). We first develop a backtracking backpropagation process, which can decouple the relationships of different kinds of parameters and train them in the same framework. The backtracking method coordinates the training of different parameters that fully explore their interaction and optimize training. 
We present experiments on the CIFAR10 and ImageNet datasets that demonstrate the efficacy of the IDARTS approach by achieving a top-1 accuracy of 76.52% on ImageNet without additional search cost vs. 75.8% with the state-of-the-art PC-DARTS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xue_IDARTS_Interactive_Differentiable_Architecture_Search_ICCV_2021_paper.pdf", @@ -19148,14 +20447,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xue_IDARTS_Interactive_Differentiable_Architecture_Search_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0+2;0+1;3+4;5", - "aff_unique_norm": "Beihang University;Nanjing University of Science and Technology;Lobachevsky State University;National Engineering Laboratory for Deep Learning Technology and Application;Baidu;University at Buffalo", + "aff_unique_norm": "Beihang University;Nanjing University of Science and Technology;Lobachevsky State University;National Engineering Laboratory for Deep Learning Technology and Application;Baidu Research;University at Buffalo", "aff_unique_dep": ";Jiangsu Key Laboratory of Image and Video Understanding for Social Safety;;;Institute of Deep Learning;", "aff_unique_url": "http://www.buaa.edu.cn/;http://www.nust.edu.cn;https://www.unn.ru;;https://research.baidu.com;https://www.buffalo.edu", "aff_unique_abbr": "BUAA;;UNN;;Baidu;UB", "aff_campus_unique_index": "0+1;0;0+2;0+1;0", "aff_campus_unique": "Beijing;Nanjing;Nizhni Novgorod;", "aff_country_unique_index": "0+0;0;0+1;0+0;0+0;2", - "aff_country_unique": "China;Russian Federation;United States" + "aff_country_unique": "China;Russian Federation;United States", + "bibtex": "@InProceedings{Xue_2021_ICCV,\n \n author = {\n Xue,\n Song and Wang,\n Runqi and Zhang,\n Baochang and Wang,\n Tian and Guo,\n Guodong and Doermann,\n David\n},\n title = {\n IDARTS: Interactive Differentiable Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1163-1172\n} \n}" }, { "title": "IDM: An Intermediate Domain Module for Domain Adaptive Person Re-ID", @@ -19163,6 +20463,7 @@ "status": "Poster", "track": "main", "pid": 2383, + "author_site": "Yongxing Dai; Jun Liu; Yifan Sun; Zekun Tong; Chi Zhang; Ling-Yu Duan", "author": "Yongxing Dai; Jun Liu; Yifan Sun; Zekun Tong; Chi Zhang; Ling-Yu Duan", "abstract": "Unsupervised domain adaptive person re-identification (UDA re-ID) aims at transferring the labeled source domain's knowledge to improve the model's discriminability on the unlabeled target domain. From a novel perspective, we argue that the bridging between the source and target domains can be utilized to tackle the UDA re-ID task, and we focus on explicitly modeling appropriate intermediate domains to characterize this bridging. Specifically, we propose an Intermediate Domain Module (IDM) to generate intermediate domains' representations on-the-fly by mixing the source and target domains' hidden representations using two domain factors. Based on the \"shortest geodesic path\" definition, i.e., the intermediate domains along the shortest geodesic path between the two extreme domains can play a better bridging role, we propose two properties that these intermediate domains should satisfy. To ensure these two properties to better characterize appropriate intermediate domains, we enforce the bridge losses on intermediate domains' prediction space and feature space, and enforce a diversity loss on the two domain factors. The bridge losses aim at guiding the distribution of appropriate intermediate domains to keep the right distance to the source and target domains. The diversity loss serves as a regularization to prevent the generated intermediate domains from being over-fitting to either of the source and target domains. 
Our proposed method outperforms the state-of-the-arts by a large margin in all the common UDA re-ID tasks, and the mAP gain is up to 7.7% on the challenging MSMT17 benchmark. Code is available at https://github.com/SikaStar/IDM.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dai_IDM_An_Intermediate_Domain_Module_for_Domain_Adaptive_Person_Re-ID_ICCV_2021_paper.pdf", @@ -19179,14 +20480,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dai_IDM_An_Intermediate_Domain_Module_for_Domain_Adaptive_Person_Re-ID_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;2;0+4", - "aff_unique_norm": "Peking University;Singapore University of Technology and Design;Megvii Technology;National University of Singapore;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Digital Media (IDM);;;;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Singapore University of Technology and Design;Megvii Technology;National University of Singapore;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Digital Media (IDM);;;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.sutd.edu.sg;https://www.megvii.com;https://www.nus.edu.sg;", "aff_unique_abbr": "PKU;SUTD;Megvii;NUS;", "aff_campus_unique_index": "0;0+2", "aff_campus_unique": "Beijing;;Shenzhen", "aff_country_unique_index": "0;1;0;1;0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Dai_2021_ICCV,\n \n author = {\n Dai,\n Yongxing and Liu,\n Jun and Sun,\n Yifan and Tong,\n Zekun and Zhang,\n Chi and Duan,\n Ling-Yu\n},\n title = {\n IDM: An Intermediate Domain Module for Domain Adaptive Person Re-ID\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11864-11874\n} \n}" }, { "title": "IICNet: A Generic Framework for Reversible Image Conversion", @@ -19194,6 +20496,7 @@ "status": 
"Poster", "track": "main", "pid": 1650, + "author_site": "Ka Leong Cheng; Yueqi Xie; Qifeng Chen", "author": "Ka Leong Cheng; Yueqi Xie; Qifeng Chen", "abstract": "Reversible image conversion (RIC) aims to build a reversible transformation between specific visual content (e.g., short videos) and an embedding image, where the original content can be restored from the embedding when necessary. This work develops Invertible Image Conversion Net (IICNet) as a generic solution to various RIC tasks due to its strong capacity and task-independent design. Unlike previous encoder-decoder based methods, IICNet maintains a highly invertible structure based on invertible neural networks (INNs) to better preserve the information during conversion. We use a relation module and a channel squeeze layer to improve the INN nonlinearity to extract cross-image relations and the network flexibility, respectively. Experimental results demonstrate that IICNet outperforms the specifically-designed methods on existing RIC tasks and can generalize well to various newly-explored tasks. With our generic IICNet, we no longer need to hand-engineer task-specific embedding networks for rapidly occurring visual content. 
Our source codes are available at: https://github.com/felixcheng97/IICNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_IICNet_A_Generic_Framework_for_Reversible_Image_Conversion_ICCV_2021_paper.pdf", @@ -19217,7 +20520,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Ka Leong and Xie,\n Yueqi and Chen,\n Qifeng\n},\n title = {\n IICNet: A Generic Framework for Reversible Image Conversion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1991-2000\n} \n}" }, { "title": "ILVR: Conditioning Method for Denoising Diffusion Probabilistic Models", @@ -19225,6 +20529,7 @@ "status": "Poster", "track": "main", "pid": 2958, + "author_site": "Jooyoung Choi; Sungwon Kim; Yonghyun Jeong; Youngjune Gwon; Sungroh Yoon", "author": "Jooyoung Choi; Sungwon Kim; Yonghyun Jeong; Youngjune Gwon; Sungroh Yoon", "abstract": "Denoising diffusion probabilistic models (DDPM) have shown remarkable performance in unconditional image generation. However, due to the stochasticity of the generative process in DDPM, it is challenging to generate images with the desired semantics. In this work, we propose Iterative Latent Variable Refinement (ILVR), a method to guide the generative process in DDPM to generate high-quality images based on a given reference image. Here, the refinement of the generative process in DDPM enables a single DDPM to sample images from various sets directed by the reference image. The proposed ILVR method generates high-quality images while controlling the generation. 
The controllability of our method allows adaptation of a single DDPM without any additional learning in various image generation tasks, such as generation from various downsampling factors, multi-domain image translation, paint-to-image, and editing with scribbles.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Choi_ILVR_Conditioning_Method_for_Denoising_Diffusion_Probabilistic_Models_ICCV_2021_paper.pdf", @@ -19239,7 +20544,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choi_ILVR_Conditioning_Method_for_Denoising_Diffusion_Probabilistic_Models_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choi_ILVR_Conditioning_Method_for_Denoising_Diffusion_Probabilistic_Models_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Choi_2021_ICCV,\n \n author = {\n Choi,\n Jooyoung and Kim,\n Sungwon and Jeong,\n Yonghyun and Gwon,\n Youngjune and Yoon,\n Sungroh\n},\n title = {\n ILVR: Conditioning Method for Denoising Diffusion Probabilistic Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14367-14376\n} \n}" }, { "title": "ISD: Self-Supervised Learning by Iterative Similarity Distillation", @@ -19247,6 +20553,7 @@ "status": "Poster", "track": "main", "pid": 7048, + "author_site": "Ajinkya Tejankar; Soroush Abbasi Koohpayegani; Vipin Pillai; Paolo Favaro; Hamed Pirsiavash", "author": "Ajinkya Tejankar; Soroush Abbasi Koohpayegani; Vipin Pillai; Paolo Favaro; Hamed Pirsiavash", "abstract": "Recently, contrastive learning has achieved great results in self-supervised learning, where the main idea is to pull two augmentations of an image (positive pairs) closer compared to other random images (negative pairs). We argue that not all negative images are equally negative. 
Hence, we introduce a self-supervised learning algorithm where we use a soft similarity for the negative images rather than a binary distinction between positive and negative pairs. We iteratively distill a slowly evolving teacher model to the student model by capturing the similarity of a query image to some random images and transferring that knowledge to the student. Specifically, our method should handle unbalanced and unlabeled data better than existing contrastive learning methods, because the randomly chosen negative set might include many samples that are semantically similar to the query image. In this case, our method labels them as highly similar while standard contrastive methods label them as negatives. Our method achieves comparable results to the state-of-the-art models. Our code is available here: https://github.com/UMBCvision/ISD", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tejankar_ISD_Self-Supervised_Learning_by_Iterative_Similarity_Distillation_ICCV_2021_paper.pdf", @@ -19270,7 +20577,8 @@ "aff_campus_unique_index": "0;0;0;0+2", "aff_campus_unique": "Baltimore County;;Davis", "aff_country_unique_index": "0;0;0;1;0+0", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Tejankar_2021_ICCV,\n \n author = {\n Tejankar,\n Ajinkya and Koohpayegani,\n Soroush Abbasi and Pillai,\n Vipin and Favaro,\n Paolo and Pirsiavash,\n Hamed\n},\n title = {\n ISD: Self-Supervised Learning by Iterative Similarity Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9609-9618\n} \n}" }, { "title": "ISNet: Integrate Image-Level and Semantic-Level Context for Semantic Segmentation", @@ -19278,6 +20586,7 @@ "status": "Poster", "track": "main", "pid": 3441, + "author_site": "Zhenchao Jin; Bin Liu; Qi Chu; Nenghai Yu", "author": "Zhenchao Jin; Bin 
Liu; Qi Chu; Nenghai Yu", "abstract": "Co-occurrent visual pattern makes aggregating contextual information a common paradigm to enhance the pixel representation for semantic image segmentation. The existing approaches focus on modeling the context from the perspective of the whole image, i.e., aggregating the image-level contextual information. Despite impressive, these methods weaken the significance of the pixel representations of the same category, i.e., the semantic-level contextual information. To address this, this paper proposes to augment the pixel representations by aggregating the image-level and semantic-level contextual information, respectively. First, an image-level context module is designed to capture the contextual information for each pixel in the whole image. Second, we aggregate the representations of the same category for each pixel where the category regions are learned under the supervision of the ground-truth segmentation. Third, we compute the similarities between each pixel representation and the image-level contextual information, the semantic-level contextual information, respectively. At last, a pixel representation is augmented by weighted aggregating both the image-level contextual information and the semantic-level contextual information with the similarities as the weights. 
Integrating the image-level and semantic-level context allows this paper to report state-of-the-art accuracy on four benchmarks, i.e., ADE20K, LIP, COCOStuff and Cityscapes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jin_ISNet_Integrate_Image-Level_and_Semantic-Level_Context_for_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -19301,7 +20610,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jin_2021_ICCV,\n \n author = {\n Jin,\n Zhenchao and Liu,\n Bin and Chu,\n Qi and Yu,\n Nenghai\n},\n title = {\n ISNet: Integrate Image-Level and Semantic-Level Context for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7189-7198\n} \n}" }, { "title": "Image Harmonization With Transformer", @@ -19309,6 +20619,7 @@ "status": "Poster", "track": "main", "pid": 6319, + "author_site": "Zonghui Guo; Dongsheng Guo; Haiyong Zheng; Zhaorui Gu; Bing Zheng; Junyu Dong", "author": "Zonghui Guo; Dongsheng Guo; Haiyong Zheng; Zhaorui Gu; Bing Zheng; Junyu Dong", "abstract": "Image harmonization, aiming to make composite images look more realistic, is an important and challenging task. The composite, synthesized by combining foreground from one image with background from another image, inevitably suffers from the issue of inharmonious appearance caused by distinct imaging conditions, i.e., lights. Current solutions mainly adopt an encoder-decoder architecture with convolutional neural network (CNN) to capture the context of composite images, trying to understand what it looks like in the surrounding background near the foreground. 
In this work, we seek to solve image harmonization with Transformer, by leveraging its powerful ability of modeling long-range context dependencies, for adjusting foreground light to make it compatible with background light while keeping structure and semantics unchanged. We present the design of our harmonization Transformer frameworks without and with disentanglement, as well as comprehensive experiments and ablation study, demonstrating the power of Transformer and investigating the Transformer for vision. Our method achieves state-of-the-art performance on both image harmonization and image inpainting/enhancement, indicating its superiority. Our code and models are available at https://github.com/zhenglab/HarmonyTransformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Image_Harmonization_With_Transformer_ICCV_2021_paper.pdf", @@ -19332,7 +20643,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Sanya", "aff_country_unique_index": "0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Zonghui and Guo,\n Dongsheng and Zheng,\n Haiyong and Gu,\n Zhaorui and Zheng,\n Bing and Dong,\n Junyu\n},\n title = {\n Image Harmonization With Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14870-14879\n} \n}" }, { "title": "Image Inpainting via Conditional Texture and Structure Dual Generation", @@ -19340,6 +20652,7 @@ "status": "Poster", "track": "main", "pid": 7481, + "author_site": "Xiefan Guo; Hongyu Yang; Di Huang", "author": "Xiefan Guo; Hongyu Yang; Di Huang", "abstract": "Deep generative approaches have recently made considerable progress in image inpainting by introducing structure priors. 
Due to the lack of proper interaction with image texture during structure reconstruction, however, current solutions are incompetent in handling the cases with large corruptions, and they generally suffer from distorted results. In this paper, we propose a novel two-stream network for image inpainting, which models the structure-constrained texture synthesis and texture-guided structure reconstruction in a coupled manner so that they better leverage each other for more plausible generation. Furthermore, to enhance the global consistency, a Bi-directional Gated Feature Fusion (Bi-GFF) module is designed to exchange and combine the structure and texture information and a Contextual Feature Aggregation (CFA) module is developed to refine the generated contents by region affinity learning and multi-scale feature aggregation. Qualitative and quantitative experiments on the CelebA, Paris StreetView and Places2 datasets demonstrate the superiority of the proposed method. Our code is available at https://github.com/Xiefan-Guo/CTSDG.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Image_Inpainting_via_Conditional_Texture_and_Structure_Dual_Generation_ICCV_2021_paper.pdf", @@ -19363,7 +20676,8 @@ "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Xiefan and Yang,\n Hongyu and Huang,\n Di\n},\n title = {\n Image Inpainting via Conditional Texture and Structure Dual Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14134-14143\n} \n}" }, { "title": "Image Manipulation Detection by Multi-View Multi-Scale Supervision", @@ -19371,6 +20685,7 @@ "status": "Poster", "track": "main", "pid": 3997, + "author_site": "Xinru Chen; Chengbo Dong; Jiaqi 
Ji; Juan Cao; Xirong Li", "author": "Xinru Chen; Chengbo Dong; Jiaqi Ji; Juan Cao; Xirong Li", "abstract": "The key challenge of image manipulation detection is how to learn generalizable features that are sensitive to manipulations in novel data, whilst specific to prevent false alarms on authentic images. Current research emphasizes the sensitivity, with the specificity overlooked. In this paper we address both aspects by multi-view feature learning and multi-scale supervision. By exploiting noise distribution and boundary artifact surrounding tampered regions, the former aims to learn semantic-agnostic and thus more generalizable features. The latter allows us to learn from authentic images which are nontrivial to taken into account by current semantic segmentation network based methods. Our thoughts are realized by a new network which we term MVSS-Net. Extensive experiments on five benchmark sets justify the viability of MVSS-Net for both pixel-level and image-level manipulation detection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Image_Manipulation_Detection_by_Multi-View_Multi-Scale_Supervision_ICCV_2021_paper.pdf", @@ -19394,7 +20709,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Xinru and Dong,\n Chengbo and Ji,\n Jiaqi and Cao,\n Juan and Li,\n Xirong\n},\n title = {\n Image Manipulation Detection by Multi-View Multi-Scale Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14185-14193\n} \n}" }, { "title": "Image Retrieval on Real-Life Images With Pre-Trained Vision-and-Language Models", @@ -19402,6 +20718,7 @@ "status": "Poster", "track": "main", "pid": 10069, + "author_site": "Zheyuan Liu; Cristian 
Rodriguez-Opazo; Damien Teney; Stephen Gould", "author": "Zheyuan Liu; Cristian Rodriguez-Opazo; Damien Teney; Stephen Gould", "abstract": "We extend the task of composed image retrieval, where an input query consists of an image and short textual description of how to modify the image. Existing methods have only been applied to non-complex images within narrow domains, such as fashion products, thereby limiting the scope of study on in-depth visual reasoning in rich image and language contexts. To address this issue, we collect the Compose Image Retrieval on Real-life images (CIRR) dataset, which consists of over 36,000 pairs of crowd-sourced, open-domain images with human-generated modifying text. To extend current methods to the open-domain, we propose CIRPLANT, a transformer based model that leverages rich pre-trained vision-and-language (V&L) knowledge for modifying visual features conditioned on natural language. Retrieval is then done by nearest neighbor lookup on the modified features. We demonstrate that with a relatively simple architecture, CIRPLANT outperforms existing methods on open-domain images, while matching state-of-the-art accuracy on the existing narrow datasets, such as fashion. Together with the release of CIRR, we believe this work will inspire further research on composed image retrieval. 
Our dataset, code and pre-trained models are available at https://cuberick-orion.github.io/CIRR/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Image_Retrieval_on_Real-Life_Images_With_Pre-Trained_Vision-and-Language_Models_ICCV_2021_paper.pdf", @@ -19425,7 +20742,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Australia;Switzerland" + "aff_country_unique": "Australia;Switzerland", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zheyuan and Rodriguez-Opazo,\n Cristian and Teney,\n Damien and Gould,\n Stephen\n},\n title = {\n Image Retrieval on Real-Life Images With Pre-Trained Vision-and-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2125-2134\n} \n}" }, { "title": "Image Shape Manipulation From a Single Augmented Training Sample", @@ -19433,6 +20751,7 @@ "status": "Poster", "track": "main", "pid": 1998, + "author_site": "Yael Vinker; Eliahu Horwitz; Nir Zabari; Yedid Hoshen", "author": "Yael Vinker; Eliahu Horwitz; Nir Zabari; Yedid Hoshen", "abstract": "In this paper, we present DeepSIM, a generative model for conditional image manipulation based on a single image. We find that extensive augmentation is key for enabling single image training, and incorporate the use of thin-plate-spline (TPS) as an effective augmentation. Our network learns to map between a primitive representation of the image to the image itself. The choice of a primitive representation has an impact on the ease and expressiveness of the manipulations and can be automatic (e.g. edges), manual (e.g. segmentation) or hybrid such as edges on top of segmentations. At manipulation time, our generator allows for making complex image changes by modifying the primitive input representation and mapping it through the network. 
Our method is shown to achieve remarkable performance on image manipulation tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vinker_Image_Shape_Manipulation_From_a_Single_Augmented_Training_Sample_ICCV_2021_paper.pdf", @@ -19449,14 +20768,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Vinker_Image_Shape_Manipulation_From_a_Single_Augmented_Training_Sample_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Hebrew University of Jerusalem", + "aff_unique_norm": "The Hebrew University of Jerusalem", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "http://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Vinker_2021_ICCV,\n \n author = {\n Vinker,\n Yael and Horwitz,\n Eliahu and Zabari,\n Nir and Hoshen,\n Yedid\n},\n title = {\n Image Shape Manipulation From a Single Augmented Training Sample\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13769-13778\n} \n}" }, { "title": "Image Synthesis From Layout With Locality-Aware Mask Adaption", @@ -19464,6 +20784,7 @@ "status": "Poster", "track": "main", "pid": 8542, + "author_site": "Zejian Li; Jingyu Wu; Immanuel Koh; Yongchuan Tang; Lingyun Sun", "author": "Zejian Li; Jingyu Wu; Immanuel Koh; Yongchuan Tang; Lingyun Sun", "abstract": "This paper is concerned with synthesizing images conditioned on a layout (a set of bounding boxes with object categories). Existing works construct a layout-mask-image pipeline. Object masks are generated separately and mapped to bounding boxes to form a whole semantic segmentation mask (layout-to-mask), with which a new image is generated (mask-to-image). 
However, overlapped boxes in layouts result in overlapped object masks, which reduces the mask clarity and causes confusion in image generation. We hypothesize the importance of generating clean and semantically clear semantic masks. The hypothesis is supported by the finding that the performance of state-of-the-art LostGAN decreases when input masks are tainted. Motivated by this hypothesis, we propose Locality-Aware Mask Adaption (LAMA) module to adapt overlapped or nearby object masks in the generation. Experimental results show our proposed model with LAMA outperforms existing approaches regarding visual fidelity and alignment with input layouts. On COCO-stuff in 256x256, our method improves the state-of-the-art FID score from 41.65 to 31.12 and the SceneFID from 22.00 to 18.64.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Image_Synthesis_From_Layout_With_Locality-Aware_Mask_Adaption_ICCV_2021_paper.pdf", @@ -19487,7 +20808,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Zejian and Wu,\n Jingyu and Koh,\n Immanuel and Tang,\n Yongchuan and Sun,\n Lingyun\n},\n title = {\n Image Synthesis From Layout With Locality-Aware Mask Adaption\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13819-13828\n} \n}" }, { "title": "Image Synthesis via Semantic Composition", @@ -19495,6 +20817,7 @@ "status": "Poster", "track": "main", "pid": 1145, + "author_site": "Yi Wang; Lu Qi; Ying-Cong Chen; Xiangyu Zhang; Jiaya Jia", "author": "Yi Wang; Lu Qi; Ying-Cong Chen; Xiangyu Zhang; Jiaya Jia", "abstract": "In this paper, we present a novel approach to synthesize realistic images based on their semantic layouts. 
It hypothesizes that for objects with similar appearance, they share similar representation. Our method establishes dependencies between regions according to their appearance correlation, yielding both spatially variant and associated representations. Conditioning on these features, we propose a dynamic weighted network constructed by spatially conditional computation (with both convolution and normalization). More than preserving semantic distinctions, the given dynamic network strengthens semantic relevance, benefiting global structure and detail synthesis. We demonstrate that our method gives the compelling generation performance qualitatively and quantitatively with extensive experiments on benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Image_Synthesis_via_Semantic_Composition_ICCV_2021_paper.pdf", @@ -19511,14 +20834,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Image_Synthesis_via_Semantic_Composition_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0+3", - "aff_unique_norm": "Chinese University of Hong Kong;Hong Kong University of Science and Technology;Megvii Technology;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;Hong Kong University of Science and Technology;MEGVII Technology;SmartMore", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ust.hk;https://www.megvii.com;", "aff_unique_abbr": "CUHK;HKUST;;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yi and Qi,\n Lu and Chen,\n Ying-Cong and Zhang,\n Xiangyu and Jia,\n Jiaya\n},\n title = {\n Image Synthesis via Semantic Composition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2021\n},\n pages = {\n 13749-13758\n} \n}" }, { "title": "Image2Reverb: Cross-Modal Reverb Impulse Response Synthesis", @@ -19526,6 +20850,7 @@ "status": "Poster", "track": "main", "pid": 9135, + "author_site": "Nikhil Singh; Jeff Mentch; Jerry Ng; Matthew Beveridge; Iddo Drori", "author": "Nikhil Singh; Jeff Mentch; Jerry Ng; Matthew Beveridge; Iddo Drori", "abstract": "Measuring the acoustic characteristics of a space is often done by capturing its impulse response (IR), a representation of how a full-range stimulus sound excites it. This work generates an IR from a single image, which can then be applied to other signals using convolution, simulating the reverberant characteristics of the space shown in the image. Recording these IRs is both time-intensive and expensive, and often infeasible for inaccessible locations. We use an end-to-end neural network architecture to generate plausible audio impulse responses from single images of acoustic environments. We evaluate our method both by comparisons to ground truth data and by human expert evaluation. 
We demonstrate our approach by generating plausible impulse responses from diverse settings and formats including well known places, musical halls, rooms in paintings, images from animations and computer games, synthetic environments generated from text, panoramic images, and video conference backgrounds.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Singh_Image2Reverb_Cross-Modal_Reverb_Impulse_Response_Synthesis_ICCV_2021_paper.pdf", @@ -19549,7 +20874,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Singh_2021_ICCV,\n \n author = {\n Singh,\n Nikhil and Mentch,\n Jeff and Ng,\n Jerry and Beveridge,\n Matthew and Drori,\n Iddo\n},\n title = {\n Image2Reverb: Cross-Modal Reverb Impulse Response Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 286-295\n} \n}" }, { "title": "Impact of Aliasing on Generalization in Deep Convolutional Networks", @@ -19557,6 +20883,7 @@ "status": "Poster", "track": "main", "pid": 8022, + "author_site": "Cristina Vasconcelos; Hugo Larochelle; Vincent Dumoulin; Rob Romijnders; Nicolas Le Roux; Ross Goroshin", "author": "Cristina Vasconcelos; Hugo Larochelle; Vincent Dumoulin; Rob Romijnders; Nicolas Le Roux; Ross Goroshin", "abstract": "We investigate the impact of aliasing on generalization in Deep Convolutional Networks and show that data augmentation schemes alone are unable to prevent it due to structural limitations in widely used architectures. Drawing insights from frequency analysis theory, we take a closer look at Resnet and EfficientNet architectures and review the trade-off between aliasing and information loss in each of their major components. 
We show how to mitigate aliasing by inserting non-trainable low-pass filters at key locations, particularly where networks lack the capacity to learn them. These simple architectural changes lead to substantial improvements in generalization on i.i.d. and even more on out-of-distribution conditions, such as image classification under natural corruptions on ImageNet-C and few-shot learning on Meta-Dataset. State-of-the art results are achieved on both datasets without introducing additional trainable parameters and using the default hyper-parameters of open source codebases.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vasconcelos_Impact_of_Aliasing_on_Generalization_in_Deep_Convolutional_Networks_ICCV_2021_paper.pdf", @@ -19571,7 +20898,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Vasconcelos_Impact_of_Aliasing_on_Generalization_in_Deep_Convolutional_Networks_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Vasconcelos_Impact_of_Aliasing_on_Generalization_in_Deep_Convolutional_Networks_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Vasconcelos_2021_ICCV,\n \n author = {\n Vasconcelos,\n Cristina and Larochelle,\n Hugo and Dumoulin,\n Vincent and Romijnders,\n Rob and Le Roux,\n Nicolas and Goroshin,\n Ross\n},\n title = {\n Impact of Aliasing on Generalization in Deep Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10529-10538\n} \n}" }, { "title": "Improve Unsupervised Pretraining for Few-Label Transfer", @@ -19579,6 +20907,7 @@ "status": "Poster", "track": "main", "pid": 2954, + "author_site": "Suichan Li; Dongdong Chen; Yinpeng Chen; Lu Yuan; Lei Zhang; Qi Chu; Bin Liu; Nenghai Yu", "author": "Suichan Li; Dongdong Chen; Yinpeng Chen; Lu Yuan; Lei Zhang; Qi Chu; Bin Liu; Nenghai Yu", 
"abstract": "Unsupervised pretraining has achieved great success and many recently works have shown unsupervised pretraining can achieve comparable or even slightly better transfer performance than supervised pretraining on downstream target datasets. But in this paper, we find this conclusion may not hold when the target dataset has very few labeled samples for finetuning, ie, few-label transfer. We analyze the possible reason from the clustering perspective: 1) The clustering quality of target samples is of great importance to few-label transfer; 2) Though contrastive learning is essentially to learn how to cluster, its clustering quality is still inferior to supervised pretraining due to lack of label supervision. Based on the analysis, we interestingly discover that only involving some unlabeled target domain into the unsupervised pretraining can improve the clustering quality, subsequently reducing the transfer performance gap with supervised pretraining. This finding also motivates us to propose a new progressive few-label transfer algorithm for real applications, which aims to maximize the transfer performance under a limited annotation budget. To support our analysis and proposed method, we conduct extensive experiments on nine different target datasets. 
Experimental results show our proposed method can significantly boost the few-label transfer performance of unsupervised pretraining.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Improve_Unsupervised_Pretraining_for_Few-Label_Transfer_ICCV_2021_paper.pdf", @@ -19595,14 +20924,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Improve_Unsupervised_Pretraining_for_Few-Label_Transfer_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;1;0;0;0", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "USTC;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Suichan and Chen,\n Dongdong and Chen,\n Yinpeng and Yuan,\n Lu and Zhang,\n Lei and Chu,\n Qi and Liu,\n Bin and Yu,\n Nenghai\n},\n title = {\n Improve Unsupervised Pretraining for Few-Label Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10201-10210\n} \n}" }, { "title": "Improving 3D Object Detection With Channel-Wise Transformer", @@ -19610,6 +20940,7 @@ "status": "Poster", "track": "main", "pid": 9623, + "author_site": "Hualian Sheng; Sijia Cai; Yuan Liu; Bing Deng; Jianqiang Huang; Xian-Sheng Hua; Min-Jian Zhao", "author": "Hualian Sheng; Sijia Cai; Yuan Liu; Bing Deng; Jianqiang Huang; Xian-Sheng Hua; Min-Jian Zhao", "abstract": "Though 3D object detection from point clouds has achieved rapid progress in recent years, the lack of flexible and high-performance 
proposal refinement remains a great hurdle for existing state-of-the-art two-stage detectors. Previous works on refining 3D proposals have relied on human-designed components such as keypoints sampling, set abstraction and multi-scale feature fusion to produce powerful 3D object representations. Such methods, however, have limited ability to capture rich contextual dependencies among points. In this paper, we leverage the high-quality region proposal network and a Channel-wise Transformer architecture to constitute our two-stage 3D object detection framework (CT3D) with minimal hand-crafted design. The proposed CT3D simultaneously performs proposal-aware embedding and channel-wise context aggregation for the point features within each proposal. Specifically, CT3D uses proposal's keypoints for spatial contextual modelling and learns attention propagation in the encoding module, mapping the proposal to point embeddings. Next, a new channel-wise decoding module enriches the query-key interaction via channel-wise re-weighting to effectively merge multi-level contexts, which contributes to more accurate object predictions. Extensive experiments demonstrate that our CT3D method has superior performance and excellent scalability. 
Remarkably, CT3D achieves the AP of 81.77% in the moderate car category on the KITTI test 3D detection benchmark, outperforms state-of-the-art 3D detectors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sheng_Improving_3D_Object_Detection_With_Channel-Wise_Transformer_ICCV_2021_paper.pdf", @@ -19633,7 +20964,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sheng_2021_ICCV,\n \n author = {\n Sheng,\n Hualian and Cai,\n Sijia and Liu,\n Yuan and Deng,\n Bing and Huang,\n Jianqiang and Hua,\n Xian-Sheng and Zhao,\n Min-Jian\n},\n title = {\n Improving 3D Object Detection With Channel-Wise Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2743-2752\n} \n}" }, { "title": "Improving Contrastive Learning by Visualizing Feature Transformation", @@ -19641,6 +20973,7 @@ "status": "Poster", "track": "main", "pid": 3351, + "author_site": "Rui Zhu; Bingchen Zhao; Jingen Liu; Zhenglong Sun; Chang Wen Chen", "author": "Rui Zhu; Bingchen Zhao; Jingen Liu; Zhenglong Sun; Chang Wen Chen", "abstract": "Contrastive learning, which aims at minimizing the distance between positive pairs while maximizing that of negative ones, has been widely and successfully applied in unsupervised feature learning, where the design of positive and negative (pos/neg) pairs is one of its keys. In this paper, we attempt to devise a feature-level data manipulation, differing from data augmentation, to enhance the generic contrastive self-supervised learning. To this end, we first design a visualization scheme for pos/neg score (pos/neg score indicates cosine similarity of pos/neg pair.) distribution, which enables us to analyze, interpret and understand the learning process. 
To our knowledge, this is the first attempt of its kind. More importantly, leveraging this tool, we gain some significant observations, which inspire our novel Feature Transformation proposals including the extrapolation of positives. This operation creates harder positives to boost the learning because hard positives enable the model to be more view-invariant. Besides, we propose the interpolation among negatives, which provides diversified negatives and makes the model more discriminative. It is the first attempt to deal with both challenges simultaneously. Experiment results show that our proposed Feature Transformation can improve at least 6.0% accuracy on ImageNet-100 over MoCo baseline, and about 2.0% accuracy on ImageNet-1K over the MoCoV2 baseline. Transferring to the downstream tasks successfully demonstrate our model is less task-bias. Visualization tools and codes: https://github.com/DTennant/CL-Visualizing-Feature-Transformation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Improving_Contrastive_Learning_by_Visualizing_Feature_Transformation_ICCV_2021_paper.pdf", @@ -19657,14 +20990,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_Improving_Contrastive_Learning_by_Visualizing_Feature_Transformation_ICCV_2021_paper.html", "aff_unique_index": "0+1;2+1;1+0;0;3", - "aff_unique_norm": "Chinese University of Hong Kong;JD;Tongji University;Hong Kong Polytechnic University", - "aff_unique_dep": ";JD AI Research;;", + "aff_unique_norm": "The Chinese University of Hong Kong;JD AI Research;Tongji University;The Hong Kong Polytechnic University", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.jd.com;https://www.tongji.edu.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "CUHK;JD AI;Tongji;PolyU", "aff_campus_unique_index": "0;;0;0;2", "aff_campus_unique": "Shenzhen;;Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0+0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Rui and Zhao,\n Bingchen and Liu,\n Jingen and Sun,\n Zhenglong and Chen,\n Chang Wen\n},\n title = {\n Improving Contrastive Learning by Visualizing Feature Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10306-10315\n} \n}" }, { "title": "Improving De-Raining Generalization via Neural Reorganization", @@ -19672,6 +21006,7 @@ "status": "Poster", "track": "main", "pid": 4144, + "author_site": "Jie Xiao; Man Zhou; Xueyang Fu; Aiping Liu; Zheng-Jun Zha", "author": "Jie Xiao; Man Zhou; Xueyang Fu; Aiping Liu; Zheng-Jun Zha", "abstract": "Most existing image de-raining networks could only learn fixed mapping rules between paired rainy/clean images on single synthetic dataset and then stay static for lifetime. However, since single synthetic dataset merely provides a partial view for the distribution of rain streaks, the deep models well trained on an individual synthetic dataset tend to overfit on this biased distribution. This leads to the inability of these methods to well generalize to complex and changeable real-world rainy scenes, thus limiting their practical applications. In this paper, we try for the first time to accumulate the de-raining knowledge from multiple synthetic datasets on a single network parameter set to improve the de-raining generalization of deep networks. To achieve this goal, we explore Neural Reorganization (NR) to allow the de-raining network to keep a subtle stability-plasticity trade-off rather than naive stabilization after training phase. Specifically, we design our NR algorithm by borrowing the synaptic consolidation mechanism in the biological brain and knowledge distillation. 
Equipped with our NR algorithm, the deep model can be trained on a list of synthetic rainy datasets by overcoming catastrophic forgetting, making it a general-version de-raining network. Extensive experimental validation shows that due to the successful accumulation of de-raining knowledge, our proposed method can not only process multiple synthetic datasets consistently, but also achieve state-of-the-art results when dealing with real-world rainy images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiao_Improving_De-Raining_Generalization_via_Neural_Reorganization_ICCV_2021_paper.pdf", @@ -19695,7 +21030,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiao_2021_ICCV,\n \n author = {\n Xiao,\n Jie and Zhou,\n Man and Fu,\n Xueyang and Liu,\n Aiping and Zha,\n Zheng-Jun\n},\n title = {\n Improving De-Raining Generalization via Neural Reorganization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4987-4996\n} \n}" }, { "title": "Improving Generalization of Batch Whitening by Convolutional Unit Optimization", @@ -19703,6 +21039,7 @@ "status": "Poster", "track": "main", "pid": 7685, + "author_site": "Yooshin Cho; Hanbyel Cho; Youngsoo Kim; Junmo Kim", "author": "Yooshin Cho; Hanbyel Cho; Youngsoo Kim; Junmo Kim", "abstract": "Batch Whitening is a technique that accelerates and stabilizes training by transforming input features to have a zero mean (Centering) and a unit variance (Scaling), and by removing linear correlation between channels (Decorrelation). In commonly used structures, which are empirically optimized with Batch Normalization, the normalization layer appears between convolution and activation function. 
Following Batch Whitening studies have employed the same structure without further analysis; even Batch Whitening was analyzed on the premise that the input of a linear layer is whitened. To bridge the gap, we propose a new Convolutional Unit that in line with the theory, and our method generally improves the performance of Batch Whitening. Moreover, we show the inefficacy of the original Convolutional Unit by investigating rank and correlation of features. As our method is employable off-the-shelf whitening modules, we use Iterative Normalization (IterNorm), the state-of-the-art whitening module, and obtain significantly improved performance on five image classification datasets: CIFAR-10, CIFAR-100, CUB-200-2011, Stanford Dogs, and ImageNet. Notably, we verify that our method improves stability and performance of whitening when using large learning rate, group size, and iteration number.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cho_Improving_Generalization_of_Batch_Whitening_by_Convolutional_Unit_Optimization_ICCV_2021_paper.pdf", @@ -19726,7 +21063,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cho_2021_ICCV,\n \n author = {\n Cho,\n Yooshin and Cho,\n Hanbyel and Kim,\n Youngsoo and Kim,\n Junmo\n},\n title = {\n Improving Generalization of Batch Whitening by Convolutional Unit Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5321-5329\n} \n}" }, { "title": "Improving Low-Precision Network Quantization via Bin Regularization", @@ -19734,6 +21072,7 @@ "status": "Poster", "track": "main", "pid": 7367, + "author_site": "Tiantian Han; Dong Li; Ji Liu; Lu Tian; Yi Shan", "author": "Tiantian Han; Dong Li; Ji Liu; Lu Tian; Yi Shan", "abstract": "Model 
quantization is an important mechanism for energy-efficient deployment of deep neural networks on resource-constrained devices by reducing the bit precision of weights and activations. However, it remains challenging to maintain high accuracy as bit precision decreases, especially for low-precision networks (e.g., 2-bit MobileNetV2). Existing methods have explored to address this problem by minimizing the quantization error or mimicking the data distribution of full-precision networks. In this work, we propose a novel weight regularization algorithm for improving low-precision network quantization. Instead of constraining the overall data distribution, we separably optimize all elements in each quantization bin to be as close to the target quantized value as possible. Such bin regularization (BR) mechanism encourages the weight distribution of each quantization bin to be sharp and approximate to a Dirac delta distribution ideally. Experiments demonstrate that our method achieves consistent improvements over the state-of-the-art quantization-aware training methods for different low-precision networks. 
Particularly, our bin regularization improves LSQ for 2-bit MobileNetV2 and MobileNetV3-Small by 3.9% and 4.9% top-1 accuracy on ImageNet, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Han_Improving_Low-Precision_Network_Quantization_via_Bin_Regularization_ICCV_2021_paper.pdf", @@ -19757,7 +21096,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2021_ICCV,\n \n author = {\n Han,\n Tiantian and Li,\n Dong and Liu,\n Ji and Tian,\n Lu and Shan,\n Yi\n},\n title = {\n Improving Low-Precision Network Quantization via Bin Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5261-5270\n} \n}" }, { "title": "Improving Neural Network Efficiency via Post-Training Quantization With Adaptive Floating-Point", @@ -19765,6 +21105,7 @@ "status": "Poster", "track": "main", "pid": 11036, + "author_site": "Fangxin Liu; Wenbo Zhao; Zhezhi He; Yanzhi Wang; Zongwu Wang; Changzhi Dai; Xiaoyao Liang; Li Jiang", "author": "Fangxin Liu; Wenbo Zhao; Zhezhi He; Yanzhi Wang; Zongwu Wang; Changzhi Dai; Xiaoyao Liang; Li Jiang", "abstract": "Model quantization has emerged as a mandatory technique for efficient inference with advanced Deep Neural Networks (DNN). It converts the model parameters in full precision (32-bit floating point) to the hardware friendly data representation with shorter bit-width, to not only reduce the model size but also simplify the computation complexity. Nevertheless, prior model quantization either suffers from the inefficient data encoding method thus leading to noncompetitive model compression rate, or requires time-consuming quantization aware training process. 
In this work, we propose a novel Adaptive Floating-Point (AFP) as a variant of standard IEEE-754 floating-point format, with flexible configuration of exponent and mantissa segments. Leveraging the AFP for model quantization (i.e., encoding the parameter) could significantly enhance the model compression rate without accuracy degradation and model re-training. We also want to highlight that our proposed AFP could effectively eliminate the computationally intensive de-quantization step existing in the dynamic quantization technique adopted by the famous machine learning frameworks (e.g., pytorch, tensorRT and etc). Moreover, we develop a framework to automatically optimize and choose the adequate AFP configuration for each layer, thus maximizing the compression efficacy. Our experiments indicate that AFP-encoded ResNet-50/MobileNet-v2 only has ~0.04/0.6% accuracy degradation w.r.t its full-precision counterpart. It outperforms the state-of-the-art works by 1.1% in accuracy using the same bit-width while reducing the energy consumption by 11.2x, which is quite impressive for inference.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Improving_Neural_Network_Efficiency_via_Post-Training_Quantization_With_Adaptive_Floating-Point_ICCV_2021_paper.pdf", @@ -19788,7 +21129,8 @@ "aff_campus_unique_index": ";;1;", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0+0;0+0;0;1;0;0;0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Fangxin and Zhao,\n Wenbo and He,\n Zhezhi and Wang,\n Yanzhi and Wang,\n Zongwu and Dai,\n Changzhi and Liang,\n Xiaoyao and Jiang,\n Li\n},\n title = {\n Improving Neural Network Efficiency via Post-Training Quantization With Adaptive Floating-Point\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2021\n},\n pages = {\n 5281-5290\n} \n}" }, { "title": "Improving Robustness Against Common Corruptions With Frequency Biased Models", @@ -19796,6 +21138,7 @@ "status": "Poster", "track": "main", "pid": 3338, + "author_site": "Tonmoy Saikia; Cordelia Schmid; Thomas Brox", "author": "Tonmoy Saikia; Cordelia Schmid; Thomas Brox", "abstract": "CNNs perform remarkably well when the training and test distributions are i.i.d, but unseen image corruptions can cause a surprisingly large drop in performance. In various real scenarios, unexpected distortions, such as random noise, compression artefacts, or weather distortions are common phenomena. Improving performance on corrupted images must not result in degraded i.i.d performance - a challenge faced by many state-of-the-art robust approaches. Image corruption types have different characteristics in the frequency spectrum and would benefit from a targeted type of data augmentation, which, however, is often unknown during training. In this paper, we introduce a mixture of two expert models specializing in high and low-frequency robustness, respectively. Moreover, we propose a new regularization scheme that minimizes the total variation (TV) of convolution feature-maps to increase high-frequency robustness. The approach improves on corrupted images without degrading in-distribution performance. 
We demonstrate this on ImageNet-C and also for real-world corruptions on an automotive dataset, both for object classification and object detection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Saikia_Improving_Robustness_Against_Common_Corruptions_With_Frequency_Biased_Models_ICCV_2021_paper.pdf", @@ -19812,14 +21155,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Saikia_Improving_Robustness_Against_Common_Corruptions_With_Frequency_Biased_Models_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "University of Freiburg;INRIA", + "aff_unique_norm": "University of Freiburg;Inria", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-freiburg.de;https://www.inria.fr", "aff_unique_abbr": "UoF;Inria", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Germany;France" + "aff_country_unique": "Germany;France", + "bibtex": "@InProceedings{Saikia_2021_ICCV,\n \n author = {\n Saikia,\n Tonmoy and Schmid,\n Cordelia and Brox,\n Thomas\n},\n title = {\n Improving Robustness Against Common Corruptions With Frequency Biased Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10211-10220\n} \n}" }, { "title": "Improving Robustness of Facial Landmark Detection by Defending Against Adversarial Attacks", @@ -19827,6 +21171,7 @@ "status": "Poster", "track": "main", "pid": 4132, + "author_site": "Congcong Zhu; Xiaoqiang Li; Jide Li; Songmin Dai", "author": "Congcong Zhu; Xiaoqiang Li; Jide Li; Songmin Dai", "abstract": "Many recent developments in facial landmark detection have been driven by stacking model parameters or augmenting annotations. 
However, three subsequent challenges remain, including 1) an increase in computational overhead, 2) the risk of overfitting caused by increasing model parameters, and 3) the burden of labor-intensive annotation by humans. We argue that exploring the weaknesses of the detector so as to remedy them is a promising method of robust facial landmark detection. To achieve this, we propose a sample-adaptive adversarial training (SAAT) approach to interactively optimize an attacker and a detector, which improves facial landmark detection as a defense against sample-adaptive black-box attacks. By leveraging adversarial attacks, the proposed SAAT exploits adversarial perturbations beyond the handcrafted transformations to improve the detector. Specifically, an attacker generates adversarial perturbations to reflect the weakness of the detector. Then, the detector must improve its robustness to adversarial perturbations to defend against adversarial attacks. Moreover, a sample-adaptive weight is designed to balance the risks and benefits of augmenting adversarial examples to train the detector. We also introduce a masked face alignment dataset, Masked-300W, to evaluate our method. Experiments show that our SAAT performed comparably to existing state-of-the-art methods. 
The dataset and model are publicly available at https://github.com/zhuccly/SAAT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Improving_Robustness_of_Facial_Landmark_Detection_by_Defending_Against_Adversarial_ICCV_2021_paper.pdf", @@ -19850,7 +21195,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Congcong and Li,\n Xiaoqiang and Li,\n Jide and Dai,\n Songmin\n},\n title = {\n Improving Robustness of Facial Landmark Detection by Defending Against Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11751-11760\n} \n}" }, { "title": "In Defense of Scene Graphs for Image Captioning", @@ -19858,6 +21204,7 @@ "status": "Poster", "track": "main", "pid": 7999, + "author_site": "Kien Nguyen; Subarna Tripathi; Bang Du; Tanaya Guha; Truong Q. Nguyen", "author": "Kien Nguyen; Subarna Tripathi; Bang Du; Tanaya Guha; Truong Q. Nguyen", "abstract": "The mainstream image captioning models rely on Convolutional Neural Network (CNN) image features to generate captions via recurrent models. Recently, image scene graphs have been used to augment captioning models so as to leverage their structural semantics such as object entities, relationships and attributes. Several studies have noted that naive use of scene graphs from a black-box scene graph generator harms image captioning performance, and scene graph-based captioning models have to incur the overhead of explicit use of image features to generate decent captions. Addressing these challenges, we propose a framework, SG2Caps, that utilizes only the scene graph labels for competitive image captioning performance. 
The basic idea is to close the semantic gap between two scene graphs - one derived from the input image and the other one from its caption. In order to achieve this, we leverage the spatial location of objects and the Human-Object-Interaction (HOI) labels as an additional HOI graph. Our framework outperforms existing scene graph-only captioning models by a large margin indicating scene graphs as a promising representation for image captioning. Direct utilization of the scene graph labels avoids expensive graph convolutions over high-dimensional CNN features resulting in 49% fewer trainable parameters. The code is available at: https://github.com/Kien085/SG2Caps.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nguyen_In_Defense_of_Scene_Graphs_for_Image_Captioning_ICCV_2021_paper.pdf", @@ -19874,14 +21221,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Nguyen_In_Defense_of_Scene_Graphs_for_Image_Captioning_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2;0", - "aff_unique_norm": "University of California, San Diego;Intel;University of Warwick", - "aff_unique_dep": ";Intel Labs;", + "aff_unique_norm": "University of California, San Diego;Intel Labs;University of Warwick", + "aff_unique_dep": ";;", "aff_unique_url": "https://ucsd.edu;https://www.intel.com/research;https://www.warwick.ac.uk", "aff_unique_abbr": "UCSD;Intel;Warwick", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Nguyen_2021_ICCV,\n \n author = {\n Nguyen,\n Kien and Tripathi,\n Subarna and Du,\n Bang and Guha,\n Tanaya and Nguyen,\n Truong Q.\n},\n title = {\n In Defense of Scene Graphs for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2021\n},\n pages = {\n 1407-1416\n} \n}" }, { "title": "In-Place Scene Labelling and Understanding With Implicit Scene Representation", @@ -19889,6 +21237,7 @@ "status": "Poster", "track": "main", "pid": 2992, + "author_site": "Shuaifeng Zhi; Tristan Laidlow; Stefan Leutenegger; Andrew J. Davison", "author": "Shuaifeng Zhi; Tristan Laidlow; Stefan Leutenegger; Andrew J. Davison", "abstract": "Semantic labelling is highly correlated with geometry and radiance reconstruction, as scene entities with similar shape and appearance are more likely to come from similar classes. Recent implicit neural reconstruction techniques are appealing as they do not require prior training data, but the same fully self-supervised approach is not possible for semantics because labels are human-defined properties. We extend neural radiance fields (NeRF) to jointly encode semantics with appearance and geometry, so that complete and accurate 2D semantic labels can be achieved using a small amount of in-place annotations specific to the scene. The intrinsic multi-view consistency and smoothness of NeRF benefit semantics by enabling sparse labels to efficiently propagate. We show the benefit of this approach when labels are either sparse or very noisy in room-scale scenes. 
We demonstrate its advantageous properties in various interesting applications such as an efficient scene labelling tool, novel semantic view synthesis, label denoising, super-resolution, label interpolation and multi-view semantic label fusion in visual semantic mapping systems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhi_In-Place_Scene_Labelling_and_Understanding_With_Implicit_Scene_Representation_ICCV_2021_paper.pdf", @@ -19912,7 +21261,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zhi_2021_ICCV,\n \n author = {\n Zhi,\n Shuaifeng and Laidlow,\n Tristan and Leutenegger,\n Stefan and Davison,\n Andrew J.\n},\n title = {\n In-Place Scene Labelling and Understanding With Implicit Scene Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15838-15847\n} \n}" }, { "title": "In-the-Wild Single Camera 3D Reconstruction Through Moving Water Surfaces", @@ -19920,6 +21270,7 @@ "status": "Poster", "track": "main", "pid": 8027, + "author_site": "Jinhui Xiong; Wolfgang Heidrich", "author": "Jinhui Xiong; Wolfgang Heidrich", "abstract": "We present a method for reconstructing the 3D shape of underwater environments from a single, stationary camera placed above the water. We propose a novel differentiable framework, which, to our knowledge, is the first single-camera solution that is capable of simultaneously retrieving the structure of dynamic water surfaces and static underwater scene geometry in the wild. This framework integrates ray casting of Snell's law at the refractive interface, multi-view triangulation and specially designed loss functions. 
Our method is calibration-free, and thus it is easy to collect data outdoors in uncontrolled environments. Experimental results show that our method is able to realize robust and quality reconstructions on a variety of scenes, both in a laboratory environment and in the wild, and even in a salt water environment. We believe the method is promising for applications in surveying and environmental monitoring.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiong_In-the-Wild_Single_Camera_3D_Reconstruction_Through_Moving_Water_Surfaces_ICCV_2021_paper.pdf", @@ -19943,7 +21294,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Xiong_2021_ICCV,\n \n author = {\n Xiong,\n Jinhui and Heidrich,\n Wolfgang\n},\n title = {\n In-the-Wild Single Camera 3D Reconstruction Through Moving Water Surfaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12558-12567\n} \n}" }, { "title": "InSeGAN: A Generative Approach to Segmenting Identical Instances in Depth Images", @@ -19951,10 +21303,11 @@ "status": "Poster", "track": "main", "pid": 9605, - "author": "Anoop Cherian; Gon\u00e7alo Dias Pais; Siddarth Jain; Tim K. Marks; Alan Sullivan", + "author_site": "Anoop Cherian; Gonçalo Dias Pais; Siddarth Jain; Tim K. Marks; Alan Sullivan", + "author": "Anoop Cherian; Gonçalo Dias Pais; Siddarth Jain; Tim K. Marks; Alan Sullivan", "abstract": "In this paper, we present InSeGAN an unsupervised 3D generative adversarial network (GAN) for segmenting (nearly) identical instances of rigid objects in depth images. Using an analysis-by-synthesis approach, we design a novel GAN architecture to synthesize a multiple-instance depth image with independent control over each instance. 
InSeGAN takes in a set of code vectors (e.g., random noise vectors), each encoding the 3D pose of an object that is represented by a learned implicit object template. The generator has two distinct modules. The first module, the instance feature generator, uses each encoded pose to transform the implicit template into a feature map representation of each object instance. The second module, the depth image renderer, aggregates all of the single-instance feature maps output by the first module and generates a multiple-instance depth image. A discriminator distinguishes the generated multiple-instance depth images from the distribution of true depth images. To use our model for instance segmentation, we propose an instance pose encoder that learns to take in a generated depth image and reproduce the pose code vectors for all of the object instances. To evaluate our approach, we introduce a new synthetic dataset, \"Insta-10,\" consisting of 100,000 depth images each with 5 instances of an object from one of 10 classes. 
Our experiments on Insta-10, as well as on real-world noisy depth images, show that InSeGAN achieves state-of-the-art performance, often outperforming prior methods by large margins.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cherian_InSeGAN_A_Generative_Approach_to_Segmenting_Identical_Instances_in_Depth_ICCV_2021_paper.pdf", - "aff": "Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Instituto Superior T\u00b4ecnico, University of Lisbon, Portugal + Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA", + "aff": "Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Instituto Superior Técnico, University of Lisbon, Portugal + Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Cherian_InSeGAN_A_Generative_ICCV_2021_supplemental.pdf", @@ -19968,13 +21321,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cherian_InSeGAN_A_Generative_Approach_to_Segmenting_Identical_Instances_in_Depth_ICCV_2021_paper.html", "aff_unique_index": "0;1+0;0;0;0", "aff_unique_norm": "Mitsubishi Electric Research Labs;University of Lisbon", - "aff_unique_dep": ";Instituto Superior T\u00e9cnicos", + "aff_unique_dep": ";Instituto Superior Técnicos", "aff_unique_url": "https://www.merl.com;https://www IST.edu.pt", "aff_unique_abbr": "MERL;IST", "aff_campus_unique_index": "0;1+0;0;0;0", "aff_campus_unique": "Cambridge;Lisbon", "aff_country_unique_index": "0;1+0;0;0;0", - "aff_country_unique": "United States;Portugal" + "aff_country_unique": "United States;Portugal", + "bibtex": 
"@InProceedings{Cherian_2021_ICCV,\n \n author = {\n Cherian,\n Anoop and Pais,\n Gon\\c{c\n}alo Dias and Jain,\n Siddarth and Marks,\n Tim K. and Sullivan,\n Alan\n},\n title = {\n InSeGAN: A Generative Approach to Segmenting Identical Instances in Depth Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10023-10032\n} \n}" }, { "title": "Incorporating Convolution Designs Into Visual Transformers", @@ -19982,6 +21336,7 @@ "status": "Poster", "track": "main", "pid": 2593, + "author_site": "Kun Yuan; Shaopeng Guo; Ziwei Liu; Aojun Zhou; Fengwei Yu; Wei Wu", "author": "Kun Yuan; Shaopeng Guo; Ziwei Liu; Aojun Zhou; Fengwei Yu; Wei Wu", "abstract": "Motivated by the success of Transformers in natural language processing (NLP) tasks, there exist some attempts (e.g., ViT and DeiT) to apply Transformers to the vision domain. However, pure Transformer architectures often require a large amount of training data or extra supervision to obtain comparable performance with convolutional neural networks (CNNs). To overcome these limitations, we analyze the potential drawbacks when directly borrowing Transformer architectures from NLP. Then we propose a new Convolution-enhanced image Transformer (CeiT) which combines the advantages of CNNs in extracting low-level features, strengthening locality, and the advantages of Transformers in establishing long-range dependencies. 
Three modifications are made to the original Transformer: 1) instead of the straightforward tokenization from raw input images, we design an Image-to-Tokens (I2T) module that extracts patches from generated low-level features; 2) the feed-forward network in each encoder block is replaced with a Locally-enhanced Feed-Forward (LeFF) layer that promotes the correlation among neighboring tokens in the spatial dimension; 3) a Layer-wise Class token Attention (LCA) is attached at the top of the Transformer that utilizes the multi-level representations. Experimental results on ImageNet and seven downstream tasks show the effectiveness and generalization ability compared with previous Transformers and state-of-the-art CNNs, without requiring a large amount of training data and extra CNN teachers. Besides, CeiT models also demonstrate better convergence with 3x fewer training iterations, which can reduce the training cost significantly.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_Incorporating_Convolution_Designs_Into_Visual_Transformers_ICCV_2021_paper.pdf", @@ -20005,7 +21360,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Kun and Guo,\n Shaopeng and Liu,\n Ziwei and Zhou,\n Aojun and Yu,\n Fengwei and Wu,\n Wei\n},\n title = {\n Incorporating Convolution Designs Into Visual Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 579-588\n} \n}" }, { "title": "Incorporating Learnable Membrane Time Constant To Enhance Learning of Spiking Neural Networks", @@ -20013,7 +21369,8 @@ "status": "Poster", "track": "main", "pid": 8273, - "author": "Wei Fang; Zhaofei Yu; Yanqi Chen; Timoth\u00e9e Masquelier; Tiejun 
Huang; Yonghong Tian", + "author_site": "Wei Fang; Zhaofei Yu; Yanqi Chen; Timothée Masquelier; Tiejun Huang; Yonghong Tian", + "author": "Wei Fang; Zhaofei Yu; Yanqi Chen; Timothée Masquelier; Tiejun Huang; Yonghong Tian", "abstract": "Spiking Neural Networks (SNNs) have attracted enormous research interest due to temporal information processing capability, low power consumption, and high biological plausibility. However, the formulation of efficient and high-performance learning algorithms for SNNs is still challenging. Most existing learning methods learn weights only, and require manual tuning of the membrane-related parameters that determine the dynamics of a single spiking neuron. These parameters are typically chosen to be the same for all neurons, which limits the diversity of neurons and thus the expressiveness of the resulting SNNs. In this paper, we take inspiration from the observation that membrane-related parameters are different across brain regions, and propose a training algorithm that is capable of learning not only the synaptic weights but also the membrane time constants of SNNs. We show that incorporating learnable membrane time constants can make the network less sensitive to initial values and can speed up learning. In addition, we reevaluate the pooling methods in SNNs and find that max-pooling will not lead to significant information loss and have the advantage of low computation cost and binary compatibility. We evaluate the proposed method for image classification tasks on both traditional static MNIST, Fashion-MNIST, CIFAR-10 datasets, and neuromorphic N-MNIST, CIFAR10-DVS, DVS128 Gesture datasets. The experiment results show that the proposed method outperforms the state-of-the-art accuracy on nearly all datasets, using fewer time-steps. 
Our codes are available at https://github.com/fangwei123456/Parametric-Leaky-Integrate-and-Fire-Spiking-Neuron.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Incorporating_Learnable_Membrane_Time_Constant_To_Enhance_Learning_of_Spiking_ICCV_2021_paper.pdf", "aff": "Department of Computer Science and Technology, Peking University, China+Peng Cheng Laboratory, China+Institute for Artificial Intelligence, Peking University, China; Department of Computer Science and Technology, Peking University, China+Peng Cheng Laboratory, China+Institute for Artificial Intelligence, Peking University, China; Department of Computer Science and Technology, Peking University, China+Peng Cheng Laboratory, China; Centre de Recherche Cerveau et Cognition (CERCO), UMR5549 CNRS - Univ. Toulouse 3, France; Department of Computer Science and Technology, Peking University, China+Peng Cheng Laboratory, China+Institute for Artificial Intelligence, Peking University, China; Department of Computer Science and Technology, Peking University, China+Peng Cheng Laboratory, China+Institute for Artificial Intelligence, Peking University, China", @@ -20029,14 +21386,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fang_Incorporating_Learnable_Membrane_Time_Constant_To_Enhance_Learning_of_Spiking_ICCV_2021_paper.html", "aff_unique_index": "0+1+0;0+1+0;0+1;2;0+1+0;0+1+0", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Centre de Recherche Cerveau et Cognition", - "aff_unique_dep": "Department of Computer Science and Technology;Peng Cheng Laboratory;UMR5549 CNRS - Univ. Toulouse 3", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Centre de Recherche Cerveau et Cognition", + "aff_unique_dep": "Department of Computer Science and Technology;;UMR5549 CNRS - Univ. 
Toulouse 3", "aff_unique_url": "http://www.pku.edu.cn;;", "aff_unique_abbr": "Peking U;;CERCO", "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0;1;0+0+0;0+0+0", - "aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Fang_2021_ICCV,\n \n author = {\n Fang,\n Wei and Yu,\n Zhaofei and Chen,\n Yanqi and Masquelier,\n Timoth\\'ee and Huang,\n Tiejun and Tian,\n Yonghong\n},\n title = {\n Incorporating Learnable Membrane Time Constant To Enhance Learning of Spiking Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2661-2671\n} \n}" }, { "title": "Indoor Scene Generation From a Collection of Semantic-Segmented Depth Images", @@ -20044,6 +21402,7 @@ "status": "Poster", "track": "main", "pid": 6728, + "author_site": "Ming-Jia Yang; Yu-Xiao Guo; Bin Zhou; Xin Tong", "author": "Ming-Jia Yang; Yu-Xiao Guo; Bin Zhou; Xin Tong", "abstract": "We present a method for creating 3D indoor scenes with a generative model learned from a collection of semantic-segmented depth images captured from different unknown scenes. Given a room with a specified size, our method automatically generates 3D objects in a room from a randomly sampled latent code. Different from existing methods that represent an indoor scene with the type, location, and other properties of objects in the room and learn the scene layout from a collection of complete 3D indoor scenes, our method models each indoor scene as a 3D semantic scene volume and learns a volumetric generative adversarial network (GAN) from a collection of 2.5D partial observations of 3D scenes. 
To this end, we apply a differentiable projection layer to project the generated 3D semantic scene volumes into semantic-segmented depth images and design a new multiple-view discriminator for learning the complete 3D scene volume from 2.5D semantic-segmented depth images. Compared to existing methods, our method not only efficiently reduces the workload of modeling and acquiring 3D scenes for training, but also produces better object shapes and their detailed layouts in the scene. We evaluate our method with different indoor scene datasets and demonstrate the advantages of our method. We also extend our method for generating 3D indoor scenes from semantic-segmented depth images inferred from RGB images of real scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Indoor_Scene_Generation_From_a_Collection_of_Semantic-Segmented_Depth_Images_ICCV_2021_paper.pdf", @@ -20060,14 +21419,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Indoor_Scene_Generation_From_a_Collection_of_Semantic-Segmented_Depth_Images_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;0;1", - "aff_unique_norm": "Beihang University;Microsoft", + "aff_unique_norm": "Beihang University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "BUAA;MSR Asia", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Ming-Jia and Guo,\n Yu-Xiao and Zhou,\n Bin and Tong,\n Xin\n},\n title = {\n Indoor Scene Generation From a Collection of Semantic-Segmented Depth Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15203-15212\n} \n}" 
}, { "title": "Inference of Black Hole Fluid-Dynamics From Sparse Interferometric Measurements", @@ -20075,6 +21435,7 @@ "status": "Poster", "track": "main", "pid": 4003, + "author_site": "Aviad Levis; Daeyoung Lee; Joel A. Tropp; Charles F. Gammie; Katherine L. Bouman", "author": "Aviad Levis; Daeyoung Lee; Joel A. Tropp; Charles F. Gammie; Katherine L. Bouman", "abstract": "We develop an approach to recover the underlying properties of fluid-dynamical processes from sparse measurements. We are motivated by the task of imaging the stochastically evolving environment surrounding black holes, and demonstrate how flow parameters can be estimated from sparse interferometric measurements used in radio astronomical imaging. To model the stochastic flow we use spatio-temporal Gaussian Random Fields (GRFs). The high dimensionality of the underlying source video makes direct representation via a GRF's full covariance matrix intractable. In contrast, stochastic partial differential equations are able to capture correlations at multiple scales by specifying only local interaction coefficients. Our approach estimates the coefficients of a space-time diffusion equation that dictates the stationary statistics of the dynamical process. We analyze our approach on realistic simulations of black hole evolution and demonstrate its advantage over state-of-the-art dynamic black hole imaging techniques.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Levis_Inference_of_Black_Hole_Fluid-Dynamics_From_Sparse_Interferometric_Measurements_ICCV_2021_paper.pdf", @@ -20098,7 +21459,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Levis_2021_ICCV,\n \n author = {\n Levis,\n Aviad and Lee,\n Daeyoung and Tropp,\n Joel A. and Gammie,\n Charles F. 
and Bouman,\n Katherine L.\n},\n title = {\n Inference of Black Hole Fluid-Dynamics From Sparse Interferometric Measurements\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2340-2349\n} \n}" }, { "title": "Inferring High-Resolution Traffic Accident Risk Maps Based on Satellite Imagery and GPS Trajectories", @@ -20106,6 +21468,7 @@ "status": "Poster", "track": "main", "pid": 11426, + "author_site": "Songtao He; Mohammad Amin Sadeghi; Sanjay Chawla; Mohammad Alizadeh; Hari Balakrishnan; Samuel Madden", "author": "Songtao He; Mohammad Amin Sadeghi; Sanjay Chawla; Mohammad Alizadeh; Hari Balakrishnan; Samuel Madden", "abstract": "Traffic accidents cost about 3% of the world's GDP and are the leading cause of death in children and young adults. Accident risk maps are useful tools to monitor and mitigate accident risk. We present a technique to generate high-resolution (5 meters) accident risk maps. At this high resolution, accidents are sparse and risk estimation is limited by bias-variance trade-off. Prior accident risk maps either estimate low-resolution maps that are of low utility (high bias), or they use frequency-based estimation techniques that inaccurately predict where accidents actually happen (high variance). To improve this trade-off, we use an end-to-end deep architecture that can input satellite imagery, GPS trajectories, road maps and the history of accidents. 
Our evaluation on four metropolitan areas in the US with a total area of 7,488 km2 shows that our technique outperforms prior work in terms of resolution and accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_Inferring_High-Resolution_Traffic_Accident_Risk_Maps_Based_on_Satellite_Imagery_ICCV_2021_paper.pdf", @@ -20125,11 +21488,12 @@ "aff_unique_norm": "Massachusetts Institute of Technology;HBKU QCRI", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;", "aff_unique_url": "https://www.csail.mit.edu;https://qcri.org", - "aff_unique_abbr": "MIT CSAIL;Qatar Computing Research Institute", + "aff_unique_abbr": "MIT CSAIL;QCRI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;1;0;0;0", - "aff_country_unique": "United States;Qatar" + "aff_country_unique": "United States;Qatar", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Songtao and Sadeghi,\n Mohammad Amin and Chawla,\n Sanjay and Alizadeh,\n Mohammad and Balakrishnan,\n Hari and Madden,\n Samuel\n},\n title = {\n Inferring High-Resolution Traffic Accident Risk Maps Based on Satellite Imagery and GPS Trajectories\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11977-11985\n} \n}" }, { "title": "Infinite Nature: Perpetual View Generation of Natural Scenes From a Single Image", @@ -20137,6 +21501,7 @@ "status": "Poster", "track": "main", "pid": 3034, + "author_site": "Andrew Liu; Richard Tucker; Varun Jampani; Ameesh Makadia; Noah Snavely; Angjoo Kanazawa", "author": "Andrew Liu; Richard Tucker; Varun Jampani; Ameesh Makadia; Noah Snavely; Angjoo Kanazawa", "abstract": "We introduce the problem of perpetual view generation - long-range generation of novel views corresponding to an arbitrarily long camera trajectory given a single image. 
This is a challenging problem that goes far beyond the capabilities of current view synthesis methods, which quickly degenerate when presented with large camera motions. Methods for video generation also have limited ability to produce long sequences and are often agnostic to scene geometry. We take a hybrid approach that integrates both geometry and image synthesis in an iterative render, refine, and repeat framework, allowing for long-range generation that cover large distances after hundreds of frames. Our approach can be trained from a set of monocular video sequences. We propose a dataset of aerial footage of coastal scenes, and compare our method with recent view synthesis and conditional video generation baselines, showing that it can generate plausible scenes for much longer time horizons over large camera trajectories compared to existing methods. Project page at https://infinite-nature.github.io/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Infinite_Nature_Perpetual_View_Generation_of_Natural_Scenes_From_a_ICCV_2021_paper.pdf", @@ -20151,7 +21516,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Infinite_Nature_Perpetual_View_Generation_of_Natural_Scenes_From_a_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Infinite_Nature_Perpetual_View_Generation_of_Natural_Scenes_From_a_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Andrew and Tucker,\n Richard and Jampani,\n Varun and Makadia,\n Ameesh and Snavely,\n Noah and Kanazawa,\n Angjoo\n},\n title = {\n Infinite Nature: Perpetual View Generation of Natural Scenes From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14458-14467\n} \n}" }, { "title": "Influence Selection for Active Learning", @@ 
-20159,6 +21525,7 @@ "status": "Poster", "track": "main", "pid": 3616, + "author_site": "Zhuoming Liu; Hao Ding; Huaping Zhong; Weijia Li; Jifeng Dai; Conghui He", "author": "Zhuoming Liu; Hao Ding; Huaping Zhong; Weijia Li; Jifeng Dai; Conghui He", "abstract": "The existing active learning methods select the samples by evaluating the sample's uncertainty or its effect on the diversity of labeled datasets based on different task-specific or model-specific criteria. In this paper, we propose the Influence Selection for Active Learning(ISAL) which selects the unlabeled samples that can provide the most positive Influence on model performance. To obtain the Influence of the unlabeled sample in the active learning scenario, we design the Untrained Unlabeled sample Influence Calculation(UUIC) to estimate the unlabeled sample's expected gradient with which we calculate its Influence. To prove the effectiveness of UUIC, we provide both theoretical and experimental analyses. Since the UUIC just depends on the model gradients, which can be obtained easily from any neural network, our active learning algorithm is task-agnostic and model-agnostic. ISAL achieves state-of-the-art performance in different active learning settings for different tasks with different datasets. 
Compared with previous methods, our method decreases the annotation cost at least by 12%, 13% and 16% on CIFAR10, VOC2012 and COCO, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Influence_Selection_for_Active_Learning_ICCV_2021_paper.pdf", @@ -20175,14 +21542,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Influence_Selection_for_Active_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2+3;2;2", - "aff_unique_norm": "University of Southern California;Johns Hopkins University;SenseTime;Chinese University of Hong Kong", + "aff_unique_norm": "University of Southern California;Johns Hopkins University;SenseTime;The Chinese University of Hong Kong", "aff_unique_dep": ";;SenseTime Research;CUHK-SenseTime Joint Lab", "aff_unique_url": "https://www.usc.edu;https://www.jhu.edu;https://www.sensetime.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "USC;JHU;SenseTime;CUHK", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Los Angeles;;Hong Kong SAR", "aff_country_unique_index": "0;0;1;1+1;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zhuoming and Ding,\n Hao and Zhong,\n Huaping and Li,\n Weijia and Dai,\n Jifeng and He,\n Conghui\n},\n title = {\n Influence Selection for Active Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9274-9283\n} \n}" }, { "title": "Influence-Balanced Loss for Imbalanced Visual Classification", @@ -20190,6 +21558,7 @@ "status": "Poster", "track": "main", "pid": 10181, + "author_site": "Seulki Park; Jongin Lim; Younghan Jeon; Jin Young Choi", "author": "Seulki Park; Jongin Lim; Younghan Jeon; Jin Young Choi", "abstract": "In this paper, we propose a balancing training method to address problems in imbalanced data 
learning. To this end, we derive a new loss used in the balancing training phase that alleviates the influence of samples that cause an overfitted decision boundary. The proposed loss efficiently improves the performance of any type of imbalance learning methods. In experiments on multiple benchmark data sets, we demonstrate the validity of our method and reveal that the proposed loss outperforms the state-of-the-art cost-sensitive loss methods. Furthermore, since our loss is not restricted to a specific task, model, or training method, it can be easily used in combination with other recent re-sampling, meta-learning, and cost-sensitive learning methods for class-imbalance problems. Our code is made available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Influence-Balanced_Loss_for_Imbalanced_Visual_Classification_ICCV_2021_paper.pdf", @@ -20213,7 +21582,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Seulki and Lim,\n Jongin and Jeon,\n Younghan and Choi,\n Jin Young\n},\n title = {\n Influence-Balanced Loss for Imbalanced Visual Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 735-744\n} \n}" }, { "title": "Information-Theoretic Regularization for Multi-Source Domain Adaptation", @@ -20221,6 +21591,7 @@ "status": "Poster", "track": "main", "pid": 8989, + "author_site": "Geon Yeong Park; Sang Wan Lee", "author": "Geon Yeong Park; Sang Wan Lee", "abstract": "Adversarial learning strategy has demonstrated remarkable performance in dealing with single-source Domain Adaptation (DA) problems, and it has recently been applied to Multi-source DA (MDA) problems. 
Although most existing MDA strategies rely on a multiple domain discriminator setting, its effect on the latent space representations has been poorly understood. Here we adopt an information-theoretic approach to identify and resolve the potential adverse effect of the multiple domain discriminators on MDA: disintegration of domain-discriminative information, limited computational scalability, and a large variance in the gradient of the loss during training. We examine the above issues by situating adversarial DA in the context of information regularization. This also provides a theoretical justification for using a single and unified domain discriminator. Based on this idea, we implement a novel neural architecture called a Multi-source Information-regularized Adaptation Networks (MIAN). Large-scale experiments demonstrate that MIAN, despite its structural simplicity, reliably and significantly outperforms other state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Information-Theoretic_Regularization_for_Multi-Source_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -20244,7 +21615,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Geon Yeong and Lee,\n Sang Wan\n},\n title = {\n Information-Theoretic Regularization for Multi-Source Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9214-9223\n} \n}" }, { "title": "Instance Segmentation in 3D Scenes Using Semantic Superpoint Tree Networks", @@ -20252,10 +21624,11 @@ "status": "Poster", "track": "main", "pid": 4180, + "author_site": "Zhihao Liang; Zhihao Li; Songcen Xu; Mingkui Tan; Kui Jia", "author": "Zhihao Liang; Zhihao Li; Songcen Xu; Mingkui Tan; Kui 
Jia", "abstract": "Instance segmentation in 3D scenes is fundamental in many applications of scene understanding. It is yet challenging due to the compound factors of data irregularity and uncertainty in the numbers of instances. State-of-the-art methods largely rely on a general pipeline that first learns point-wise features discriminative at semantic and instance levels, followed by a separate step of point grouping for proposing object instances. While promising, they have the shortcomings that (1) the second step is not supervised by the main objective of instance segmentation, and (2) their point-wise feature learning and grouping are less effective to deal with data irregularities, possibly resulting in fragmented segmentations. To address these issues, we propose in this work an end-to-end solution of Semantic Superpoint Tree Network (SSTNet) for proposing object instances from scene points. Key in SSTNet is an intermediate, semantic superpoint tree (SST), which is constructed based on the learned semantic features of superpoints, and which will be traversed and split at intermediate tree nodes for proposals of object instances. We also design in SSTNet a refinement module, termed CliqueNet, to prune superpoints that may be wrongly grouped into instance proposals. Experiments on the benchmarks of ScanNet and S3DIS show the efficacy of our proposed method. 
At the time of submission, SSTNet ranks top on the ScanNet (V2) leaderboard, with 2% higher of mAP than the second best method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Instance_Segmentation_in_3D_Scenes_Using_Semantic_Superpoint_Tree_Networks_ICCV_2021_paper.pdf", - "aff": "South China University of Technology+DexForce Technology Co., Ltd.; Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies; South China University of Technology+Pazhou Laboratory+Peng Cheng Laboratory; South China University of Technology+Pazhou Laboratory+Peng Cheng Laboratory", + "aff": "South China University of Technology+DexForce Technology Co., Ltd.; Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies; South China University of Technology+Pazhou Laboratory+Peng Cheng Laboratory; South China University of Technology+Pazhou Laboratory+Peng Cheng Laboratory", "project": "", "github": "https://github.com/Gorilla-Lab-SCUT/SSTNet", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Liang_Instance_Segmentation_in_ICCV_2021_supplemental.pdf", @@ -20268,14 +21641,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liang_Instance_Segmentation_in_3D_Scenes_Using_Semantic_Superpoint_Tree_Networks_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;2;0+3+4;0+3+4", - "aff_unique_norm": "South China University of Technology;DexForce Technology Co., Ltd.;Huawei;Pazhou Laboratory;Pengcheng Laboratory", - "aff_unique_dep": ";;Noah\u2019s Ark Lab;;Peng Cheng Laboratory", + "aff_unique_norm": "South China University of Technology;DexForce Technology;Huawei Technologies;Pazhou Laboratory;Peng Cheng Laboratory", + "aff_unique_dep": ";Technology;Noah’s Ark Lab;;", "aff_unique_url": "https://www.scut.edu.cn;;https://www.huawei.com;;http://www.pcl.ac.cn", "aff_unique_abbr": "SCUT;;Huawei;;PCL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": 
"0+0;0;0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Zhihao and Li,\n Zhihao and Xu,\n Songcen and Tan,\n Mingkui and Jia,\n Kui\n},\n title = {\n Instance Segmentation in 3D Scenes Using Semantic Superpoint Tree Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2783-2792\n} \n}" }, { "title": "Instance Similarity Learning for Unsupervised Feature Representation", @@ -20283,6 +21657,7 @@ "status": "Poster", "track": "main", "pid": 8812, + "author_site": "Ziwei Wang; Yunsong Wang; Ziyi Wu; Jiwen Lu; Jie Zhou", "author": "Ziwei Wang; Yunsong Wang; Ziyi Wu; Jiwen Lu; Jie Zhou", "abstract": "In this paper, we propose an instance similarity learning (ISL) method for unsupervised feature representation. Conventional methods assign close instance pairs in the feature space with high similarity, which usually leads to wrong pairwise relationship for large neighborhoods because the Euclidean distance fails to depict the true semantic similarity on the feature manifold. On the contrary, our method mines the feature manifold in an unsupervised manner, through which the semantic similarity among instances is learned in order to obtain discriminative representations. Specifically, we employ the Generative Adversarial Networks (GAN) to mine the underlying feature manifold, where the generated features are applied as the proxies to progressively explore the feature manifold so that the semantic similarity among instances is acquired as reliable pseudo supervision. Extensive experiments on image classification demonstrate the superiority of our method compared with the state-of-the-art methods. 
The code is available at https://github.com/ZiweiWangTHU/ISL.git.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Instance_Similarity_Learning_for_Unsupervised_Feature_Representation_ICCV_2021_paper.pdf", @@ -20306,7 +21681,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Ziwei and Wang,\n Yunsong and Wu,\n Ziyi and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Instance Similarity Learning for Unsupervised Feature Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10336-10345\n} \n}" }, { "title": "Instance-Level Image Retrieval Using Reranking Transformers", @@ -20314,6 +21690,7 @@ "status": "Poster", "track": "main", "pid": 3114, + "author_site": "Fuwen Tan; Jiangbo Yuan; Vicente Ordonez", "author": "Fuwen Tan; Jiangbo Yuan; Vicente Ordonez", "abstract": "Instance-level image retrieval is the task of searching in a large database for images that match an object in a query image. To address this task, systems usually rely on a retrieval step that uses global image descriptors, and a subsequent step that performs domain-specific refinements or reranking by leveraging operations such as geometric verification based on local features. In this work, we propose Reranking Transformers (RRTs) as a general model to incorporate both local and global features to rerank the matching images in a supervised fashion and thus replace the relatively expensive process of geometric verification. RRTs are lightweight and can be easily parallelized so that reranking a set of top matching results can be performed in a single forward-pass. 
We perform extensive experiments on the Revisited Oxford and Paris datasets, and the Google Landmarks v2 dataset, showing that RRTs outperform previous reranking approaches while using much fewer local descriptors. Moreover, we demonstrate that, unlike existing approaches, RRTs can be optimized jointly with the feature extractor, which can lead to feature representations tailored to downstream tasks and further accuracy improvements. The code and trained models are publicly available at https://github.com/uvavision/RerankingTransformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tan_Instance-Level_Image_Retrieval_Using_Reranking_Transformers_ICCV_2021_paper.pdf", @@ -20337,7 +21714,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tan_2021_ICCV,\n \n author = {\n Tan,\n Fuwen and Yuan,\n Jiangbo and Ordonez,\n Vicente\n},\n title = {\n Instance-Level Image Retrieval Using Reranking Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12105-12115\n} \n}" }, { "title": "Instance-Wise Hard Negative Example Generation for Contrastive Learning in Unpaired Image-to-Image Translation", @@ -20345,10 +21723,11 @@ "status": "Poster", "track": "main", "pid": 3037, + "author_site": "Weilun Wang; Wengang Zhou; Jianmin Bao; Dong Chen; Houqiang Li", "author": "Weilun Wang; Wengang Zhou; Jianmin Bao; Dong Chen; Houqiang Li", "abstract": "Contrastive learning shows great potential in unpaired image-to-image translation, but sometimes the translated results are in poor quality and the contents are not preserved consistently. In this paper, we uncover that the negative examples play a critical role in the performance of contrastive learning for image translation. 
The negative examples in previous methods are randomly sampled from the patches of different positions in the source image, which are not effective to push the positive examples close to the query examples. To address this issue, we present instance-wise hard Negative Example Generation for Contrastive learning in Unpaired image-to-image Translation (NEGCUT). Specifically, we train a generator to produce negative examples online. The generator is novel from two perspectives: 1) it is instance-wise which means that the generated examples are based on the input image, and 2) it can generate hard negative examples since it is trained with an adversarial loss. With the generator, the performance of unpaired image-to-image translation is significantly improved. Experiments on three benchmark datasets demonstrate that the proposed NEGCUT framework achieves state-of-the-art performance compared to previous methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Instance-Wise_Hard_Negative_Example_Generation_for_Contrastive_Learning_in_Unpaired_ICCV_2021_paper.pdf", - "aff": "1CAS Key Laboratory of GIPAS, EEIS Department, University of Science and Technology of China (USTC)+2Institute of Arti\ufb01cial Intelligence, Hefei Comprehensive National Science Center; 1CAS Key Laboratory of GIPAS, EEIS Department, University of Science and Technology of China (USTC)+2Institute of Arti\ufb01cial Intelligence, Hefei Comprehensive National Science Center; Microsoft Research Asia; Microsoft Research Asia; 1CAS Key Laboratory of GIPAS, EEIS Department, University of Science and Technology of China (USTC)+2Institute of Arti\ufb01cial Intelligence, Hefei Comprehensive National Science Center", + "aff": "1CAS Key Laboratory of GIPAS, EEIS Department, University of Science and Technology of China (USTC)+2Institute of Artificial Intelligence, Hefei Comprehensive National Science Center; 1CAS Key Laboratory of GIPAS, EEIS Department, University of Science and Technology 
of China (USTC)+2Institute of Artificial Intelligence, Hefei Comprehensive National Science Center; Microsoft Research Asia; Microsoft Research Asia; 1CAS Key Laboratory of GIPAS, EEIS Department, University of Science and Technology of China (USTC)+2Institute of Artificial Intelligence, Hefei Comprehensive National Science Center", "project": "", "github": "", "supp": "", @@ -20361,14 +21740,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Instance-Wise_Hard_Negative_Example_Generation_for_Contrastive_Learning_in_Unpaired_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;2;2;0+1", - "aff_unique_norm": "University of Science and Technology of China;Hefei Comprehensive National Science Center;Microsoft", - "aff_unique_dep": "EEIS Department;Institute of Arti\ufb01cial Intelligence;Research", + "aff_unique_norm": "University of Science and Technology of China;Hefei Comprehensive National Science Center;Microsoft Research", + "aff_unique_dep": "EEIS Department;Institute of Artificial Intelligence;Research", "aff_unique_url": "http://www.ustc.edu.cn;http://www.hfcn.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "USTC;;MSR Asia", "aff_campus_unique_index": "1;1;2;2;1", "aff_campus_unique": ";Hefei;Asia", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Weilun and Zhou,\n Wengang and Bao,\n Jianmin and Chen,\n Dong and Li,\n Houqiang\n},\n title = {\n Instance-Wise Hard Negative Example Generation for Contrastive Learning in Unpaired Image-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14020-14029\n} \n}" }, { "title": "InstanceRefer: Cooperative Holistic Understanding for Visual Grounding on Point Clouds Through Instance 
Multi-Level Contextual Referring", @@ -20376,6 +21756,7 @@ "status": "Poster", "track": "main", "pid": 6814, + "author_site": "Zhihao Yuan; Xu Yan; Yinghong Liao; Ruimao Zhang; Sheng Wang; Zhen Li; Shuguang Cui", "author": "Zhihao Yuan; Xu Yan; Yinghong Liao; Ruimao Zhang; Sheng Wang; Zhen Li; Shuguang Cui", "abstract": "Compared with the visual grounding on 2D images, the natural-language-guided 3D object localization on point clouds is more challenging. In this paper, we propose a new model, named InstanceRefer, to achieve a superior 3D visual grounding through the grounding-by-matching strategy. In practice, our model first predicts the target category from the language descriptions using a simple language classification model. Then based on the category, our model sifts out a small number of instance candidates (usually less than 20) from the panoptic segmentation on point clouds. Thus, the non-trivial 3D visual grounding task has been effectively re-formulated as a simplified instance-matching problem, considering that instance-level candidates are more rational than the redundant 3D object proposals. Subsequently, for each candidate, we perform the multi-level contextual inference, i.e., referring from instance attribute perception, instance-to-instance relation perception, and instance-to-background global localization perception, respectively. Eventually, the most relevant candidate is selected and localized by ranking confidence scores, which are obtained by the cooperative holistic visual-language feature matching. 
Experiments confirm that our method outperforms previous state-of-the-arts on ScanRefer online benchmark (ranked 1st place) and Nr3D/Sr3D datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_InstanceRefer_Cooperative_Holistic_Understanding_for_Visual_Grounding_on_Point_Clouds_ICCV_2021_paper.pdf", @@ -20392,14 +21773,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_InstanceRefer_Cooperative_Holistic_Understanding_for_Visual_Grounding_on_Point_Clouds_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;0;0", - "aff_unique_norm": "Chinese University of Hong Kong (Shenzhen);Southern University of Science and Technology", + "aff_unique_norm": "The Chinese University of Hong Kong (Shenzhen);Southern University of Science and Technology", "aff_unique_dep": ";CryoEM Center", "aff_unique_url": "https://www.cuhk.edu.cn;https://www.sustech.edu.cn", "aff_unique_abbr": "CUHK(SZ);SUSTech", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Zhihao and Yan,\n Xu and Liao,\n Yinghong and Zhang,\n Ruimao and Wang,\n Sheng and Li,\n Zhen and Cui,\n Shuguang\n},\n title = {\n InstanceRefer: Cooperative Holistic Understanding for Visual Grounding on Point Clouds Through Instance Multi-Level Contextual Referring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1791-1800\n} \n}" }, { "title": "Instances As Queries", @@ -20407,6 +21789,7 @@ "status": "Poster", "track": "main", "pid": 8833, + "author_site": "Yuxin Fang; Shusheng Yang; Xinggang Wang; Yu Li; Chen Fang; Ying Shan; Bin Feng; Wenyu Liu", "author": "Yuxin Fang; Shusheng Yang; Xinggang Wang; Yu Li; Chen Fang; Ying Shan; Bin Feng; Wenyu Liu", 
"abstract": "We present QueryInst, a new perspective for instance segmentation. QueryInst is a multi-stage end-to-end system that treats instances of interest as learnable queries, enabling query based object detectors, e.g., Sparse R-CNN, to have strong instance segmentation performance. The attributes of instances such as categories, bounding boxes, instance masks, and instance association embeddings are represented by queries in a unified manner. In QueryInst, a query is shared by both detection and segmentation via dynamic convolutions and driven by parallelly-supervised multi-stage learning. We conduct extensive experiments on three challenging benchmarks, i.e., COCO, CityScapes, and YouTube-VIS to evaluate the effectiveness of QueryInst in object detection, instance segmentation, and video instance segmentation tasks. For the first time, we demonstrate that a simple end-to-end query based framework can achieve the state-of-the-art performance in various instance-level recognition tasks. 
Code is available at https://github.com/hustvl/QueryInst.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Instances_As_Queries_ICCV_2021_paper.pdf", @@ -20422,15 +21805,16 @@ "email": "hust.edu.cn;hust.edu.cn;hust.edu.cn;tencent.com;tencent.com;tencent.com;hust.edu.cn;hust.edu.cn", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fang_Instances_As_Queries_ICCV_2021_paper.html", - "aff_unique_index": "0;0+1;0;1;1;1;0;0", - "aff_unique_norm": "Huazhong University of Science & Technology;Tencent", - "aff_unique_dep": "School of EIC;Applied Research Center (ARC)", - "aff_unique_url": "http://www.hust.edu.cn;https://www.tencent.com", - "aff_unique_abbr": "HUST;Tencent ARC", + "aff_unique_index": "0;0+1;0;1;2;1;0;0", + "aff_unique_norm": "Huazhong University of Science & Technology;Tencent;Tencent Holdings Limited", + "aff_unique_dep": "School of EIC;Applied Research Center (ARC);", + "aff_unique_url": "http://www.hust.edu.cn;https://www.tencent.com;https://www.tencent.com", + "aff_unique_abbr": "HUST;Tencent ARC;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fang_2021_ICCV,\n \n author = {\n Fang,\n Yuxin and Yang,\n Shusheng and Wang,\n Xinggang and Li,\n Yu and Fang,\n Chen and Shan,\n Ying and Feng,\n Bin and Liu,\n Wenyu\n},\n title = {\n Instances As Queries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6910-6919\n} \n}" }, { "title": "Integer-Arithmetic-Only Certified Robustness for Quantized Neural Networks", @@ -20438,6 +21822,7 @@ "status": "Poster", "track": "main", "pid": 11222, + "author_site": "Haowen Lin; Jian Lou; Li Xiong; Cyrus Shahabi", "author": "Haowen Lin; Jian Lou; Li Xiong; Cyrus Shahabi", "abstract": "Adversarial data 
examples have drawn significant attention from the machine learning and security communities. A line of work on tackling adversarial examples is certified robustness via randomized smoothing that can provide a theoretical robustness guarantee. However, such a mechanism usually uses floating-point arithmetic for calculations in inference and requires large memory footprints and daunting computational costs. These defensive models cannot run efficiently on edge devices nor be deployed on integer-only logical units such as Turing Tensor Cores or integer-only ARM processors. To overcome these challenges, we propose an integer randomized smoothing approach with quantization to convert any classifier into a new smoothed classifier, which uses integer-only arithmetic for certified robustness against adversarial perturbations. We prove a tight robustness guarantee under L2-norm for the proposed approach. We show our approach can obtain a comparable accuracy and 4x 5x speedup over floating-point arithmetic certified robust methods on general-purpose CPUs and mobile devices on two distinct datasets (CIFAR-10 and Caltech-101).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Integer-Arithmetic-Only_Certified_Robustness_for_Quantized_Neural_Networks_ICCV_2021_paper.pdf", @@ -20461,7 +21846,8 @@ "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0+1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Haowen and Lou,\n Jian and Xiong,\n Li and Shahabi,\n Cyrus\n},\n title = {\n Integer-Arithmetic-Only Certified Robustness for Quantized Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7828-7837\n} \n}" }, { "title": "Interacting Two-Hand 3D Pose and Shape 
Reconstruction From Single Color Image", @@ -20469,6 +21855,7 @@ "status": "Poster", "track": "main", "pid": 5708, + "author_site": "Baowen Zhang; Yangang Wang; Xiaoming Deng; Yinda Zhang; Ping Tan; Cuixia Ma; Hongan Wang", "author": "Baowen Zhang; Yangang Wang; Xiaoming Deng; Yinda Zhang; Ping Tan; Cuixia Ma; Hongan Wang", "abstract": "In this paper, we propose a novel deep learning framework to reconstruct 3D hand poses and shapes of two interacting hands from a single color image. Previous methods designed for single hand cannot be easily applied for the two hand scenario because of the heavy inter-hand occlusion and larger solution space. In order to address the occlusion and similar appearance between hands that may confuse the network, we design a hand pose-aware attention module to extract features associated to each individual hand respectively. We then leverage the two hand context presented in interaction and propose a context-aware cascaded refinement that improves the hand pose and shape accuracy of each hand conditioned on the context between interacting hands. Extensive experiments on the main benchmark datasets demonstrate that our method predicts accurate 3D hand pose and shape from single color image, and achieves the state-of-the-art performance. 
Code is available in project webpage https://baowenz.github.io/Intershape/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Interacting_Two-Hand_3D_Pose_and_Shape_Reconstruction_From_Single_Color_ICCV_2021_paper.pdf", @@ -20486,13 +21873,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Interacting_Two-Hand_3D_Pose_and_Shape_Reconstruction_From_Single_Color_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;0+1;3;4+5;0+1;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Southeast University;Google;Simon Fraser University;Alibaba Group Holding Limited", - "aff_unique_dep": "Institute of Software;;;Google;;", + "aff_unique_dep": "Institute of Software;;;;;", "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;https://www.seu.edu.cn/;https://www.google.com;https://www.sfu.ca;https://www.alibaba.com", "aff_unique_abbr": "CAS;UCAS;SEU;Google;SFU;Alibaba", "aff_campus_unique_index": ";;1;;;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;0+0;1;2+0;0+0;0+0", - "aff_country_unique": "China;United States;Canada" + "aff_country_unique": "China;United States;Canada", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Baowen and Wang,\n Yangang and Deng,\n Xiaoming and Zhang,\n Yinda and Tan,\n Ping and Ma,\n Cuixia and Wang,\n Hongan\n},\n title = {\n Interacting Two-Hand 3D Pose and Shape Reconstruction From Single Color Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11354-11363\n} \n}" }, { "title": "Interaction Compass: Multi-Label Zero-Shot Learning of Human-Object Interactions via Spatial Relations", @@ -20500,6 +21888,7 @@ "status": "Poster", "track": "main", "pid": 6322, + "author_site": "Dat Huynh; Ehsan Elhamifar", "author": "Dat Huynh; Ehsan Elhamifar", "abstract": "We study the problem of 
multi-label zero-shot recognition in which labels are in the form of human-object interactions (combinations of actions on objects), each image may contain multiple interactions and some interactions do not have training images. We propose a novel compositional learning framework that decouples interaction labels into separate action and object scores that incorporate the spatial compatibility between the two components. We combine these scores to efficiently recognize seen and unseen interactions. However, learning action-object spatial relations, in principle, requires bounding-box annotations, which are costly to gather. Moreover, it is not clear how to generalize spatial relations to unseen interactions. We address these challenges by developing a cross-attention mechanism that localizes objects from action locations and vice versa by predicting displacements between them, referred to as relational directions. During training, we estimate the relational directions as ones maximizing the scores of ground-truth interactions that guide predictions toward compatible action-object regions. By extensive experiments, we show the effectiveness of our framework, where we improve the state of the art by 2.6% mAP score and 5.8% recall score on HICO and Visual Genome datasets, respectively. 
Code is available at https://github.com/hbdat/iccv21_relational_direction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huynh_Interaction_Compass_Multi-Label_Zero-Shot_Learning_of_Human-Object_Interactions_via_Spatial_ICCV_2021_paper.pdf", @@ -20523,7 +21912,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Huynh_2021_ICCV,\n \n author = {\n Huynh,\n Dat and Elhamifar,\n Ehsan\n},\n title = {\n Interaction Compass: Multi-Label Zero-Shot Learning of Human-Object Interactions via Spatial Relations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8472-8483\n} \n}" }, { "title": "Interaction via Bi-Directional Graph of Semantic Region Affinity for Scene Parsing", @@ -20531,6 +21921,7 @@ "status": "Poster", "track": "main", "pid": 1380, + "author_site": "Henghui Ding; Hui Zhang; Jun Liu; Jiaxin Li; Zijian Feng; Xudong Jiang", "author": "Henghui Ding; Hui Zhang; Jun Liu; Jiaxin Li; Zijian Feng; Xudong Jiang", "abstract": "In this work, we devote to address the challenging problem of scene parsing. Previous methods, though capture context to exploit global clues, handle scene parsing as a pixel-independent task. However, it is well known that pixels in an image are highly correlated with each other, especially those from the same semantic region, while treating pixels independently fails to take advantage of such correlations. In this work, we treat each respective region in an image as a whole, and capture the structure topology as well as the affinity among different regions. To this end, we first divide the entire feature maps to different regions and extract respective global features from them. 
Next, we construct a directed graph whose nodes are regional features, and the edge connecting every two nodes is the affinity between the regional features they represent. After that, we transfer the affinity-aware nodes in the directed graph back to corresponding regions of the image, which helps to model the region dependencies and mitigate unrealistic results. In addition, to further boost the correlation among pixels, we propose a region-level loss that evaluates all pixels in a region as a whole and motivates the network to learn the exclusive regional feature per class. With the proposed approach, we achieves new state-of-the-art segmentation results on PASCAL-Context, ADE20K, and COCO-Stuff consistently.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_Interaction_via_Bi-Directional_Graph_of_Semantic_Region_Affinity_for_Scene_ICCV_2021_paper.pdf", @@ -20554,7 +21945,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Henghui and Zhang,\n Hui and Liu,\n Jun and Li,\n Jiaxin and Feng,\n Zijian and Jiang,\n Xudong\n},\n title = {\n Interaction via Bi-Directional Graph of Semantic Region Affinity for Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15848-15858\n} \n}" }, { "title": "Interactive Prototype Learning for Egocentric Action Recognition", @@ -20562,6 +21954,7 @@ "status": "Poster", "track": "main", "pid": 5646, + "author_site": "Xiaohan Wang; Linchao Zhu; Heng Wang; Yi Yang", "author": "Xiaohan Wang; Linchao Zhu; Heng Wang; Yi Yang", "abstract": "Egocentric video recognition is a challenging task that requires to identify both the actor's motion and the active object that the actor interacts with. 
Recognizing the active object is particularly hard due to the cluttered background with distracting objects, the frequent field of view changes, severe occlusion, etc. To improve the active object classification, most existing methods use object detectors or human gaze information, which are computationally expensive or require labor-intensive annotations. To avoid these additional costs, we propose an end-to-end Interactive Prototype Learning (IPL) framework to learn better active object representations by leveraging the motion cues from the actor. First, we introduce a set of verb prototypes to disentangle active object features from distracting object features. Each prototype corresponds to a primary motion pattern of an egocentric action, offering a distinctive supervision signal for active object feature learning. Second, we design two interactive operations to enable the extraction of active object features, i.e., noun-to-verb assignment and verb-to-noun selection. These operations are parameter-efficient and can learn judicious location-aware features on top of 3D CNN backbones. 
We demonstrate that the IPL framework can generalize to different backbones and outperform the state-of-the-art on three large-scale egocentric video datasets, i.e., EPIC-KITCHENS-55, EPIC-KITCHENS-100 and EGTEA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Interactive_Prototype_Learning_for_Egocentric_Action_Recognition_ICCV_2021_paper.pdf", @@ -20578,14 +21971,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Interactive_Prototype_Learning_for_Egocentric_Action_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;3;0", - "aff_unique_norm": "Zhejiang University;Baidu;University of Technology Sydney;Meta", + "aff_unique_norm": "Zhejiang University;Baidu;University of Technology Sydney;Facebook", "aff_unique_dep": "CCAI;Baidu Research;ReLER;Facebook AI Research", "aff_unique_url": "https://www.zju.edu.cn;https://research.baidu.com;https://www.uts.edu.au;https://research.facebook.com", "aff_unique_abbr": ";Baidu;UTS;FAIR", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+0;1;2;0", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Xiaohan and Zhu,\n Linchao and Wang,\n Heng and Yang,\n Yi\n},\n title = {\n Interactive Prototype Learning for Egocentric Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8168-8177\n} \n}" }, { "title": "Internal Video Inpainting by Implicit Long-Range Propagation", @@ -20593,6 +21987,7 @@ "status": "Poster", "track": "main", "pid": 5605, + "author_site": "Hao Ouyang; Tengfei Wang; Qifeng Chen", "author": "Hao Ouyang; Tengfei Wang; Qifeng Chen", "abstract": "We propose a novel framework for video inpainting by adopting an internal learning strategy. 
Unlike previous methods that use optical flow for cross-frame context propagation to inpaint unknown regions, we show that this can be achieved implicitly by fitting a convolutional neural network to known regions. Moreover, to handle challenging sequences with ambiguous backgrounds or long-term occlusion, we design two regularization terms to preserve high-frequency details and long-term temporal consistency. Extensive experiments on the DAVIS dataset demonstrate that the proposed method achieves state-of-the-art inpainting quality quantitatively and qualitatively. We further extend the proposed method to another challenging task: learning to remove an object from a video giving a single object mask in only one frame in a 4K video.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ouyang_Internal_Video_Inpainting_by_Implicit_Long-Range_Propagation_ICCV_2021_paper.pdf", @@ -20616,7 +22011,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ouyang_2021_ICCV,\n \n author = {\n Ouyang,\n Hao and Wang,\n Tengfei and Chen,\n Qifeng\n},\n title = {\n Internal Video Inpainting by Implicit Long-Range Propagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14579-14588\n} \n}" }, { "title": "Interpolation-Aware Padding for 3D Sparse Convolutional Neural Networks", @@ -20624,6 +22020,7 @@ "status": "Poster", "track": "main", "pid": 7107, + "author_site": "Yu-Qi Yang; Peng-Shuai Wang; Yang Liu", "author": "Yu-Qi Yang; Peng-Shuai Wang; Yang Liu", "abstract": "Sparse voxel-based 3D convolutional neural networks (CNNs) are widely used for various 3D vision tasks. 
Sparse voxel-based 3D CNNs create sparse non-empty voxels from input point clouds and perform standard convolution operations on them only. We propose a simple and effective padding scheme --- interpolation-aware padding to pad a few empty voxels adjacent to the non-empty voxels and involving them in the CNN computation so that all neighboring voxels exist when computing point-wise features via the trilinear interpolation. For fine-grained 3D vision tasks where point-wise features are essential, like semantic segmentation and 3D detection, our network achieves higher prediction accuracy than the existing networks using the nearest neighbor interpolation or normalized trilinear interpolation with the zero-padding or the octree-padding scheme. Through extensive comparisons on various 3D segmentation and detection tasks, we demonstrate the superiority of 3D sparse CNNs with our sparse padding scheme in conjunction with feature interpolation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Interpolation-Aware_Padding_for_3D_Sparse_Convolutional_Neural_Networks_ICCV_2021_paper.pdf", @@ -20640,14 +22037,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Interpolation-Aware_Padding_for_3D_Sparse_Convolutional_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "Tsinghua University;Microsoft", + "aff_unique_norm": "Tsinghua University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "THU;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Yu-Qi and Wang,\n Peng-Shuai and Liu,\n Yang\n},\n title = {\n Interpolation-Aware Padding for 3D Sparse Convolutional Neural 
Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7467-7475\n} \n}" }, { "title": "Interpretable Image Recognition by Constructing Transparent Embedding Space", @@ -20655,10 +22053,11 @@ "status": "Poster", "track": "main", "pid": 6158, + "author_site": "Jiaqi Wang; Huafeng Liu; Xinyue Wang; Liping Jing", "author": "Jiaqi Wang; Huafeng Liu; Xinyue Wang; Liping Jing", "abstract": "Humans usually explain their reasoning (e.g. classification) by dissecting the image and pointing out the evidence from these parts to the concepts in their minds. Inspired by this cognitive process, several part-level interpretable neural network architectures have been proposed to explain the predictions. However, they suffer from the complex data structure and confusing the effect of the individual part to output category. In this work, an interpretable image recognition deep network is designed by introducing a plug-in transparent embedding space (TesNet) to bridge the high-level input patches (e.g. CNN feature maps) and the output categories. This plug-in embedding space is spanned by transparent basis concepts which are constructed on the Grassmann manifold. These basis concepts are enforced to be category-aware and within-category concepts are orthogonal to each other, which makes sure the embedding space is disentangled. Meanwhile, each basis concept can be traced back to the particular image patches, thus they are transparent and friendly to explain the reasoning process. By comparing with state-of-the-art interpretable methods, TesNet is much more beneficial to classification tasks, esp. 
providing better interpretability on predictions and improve the final accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Interpretable_Image_Recognition_by_Constructing_Transparent_Embedding_Space_ICCV_2021_paper.pdf", - "aff": "School of Computer and Information Technology, Beijing Key Lab of Traf\ufb01c Data Analysis and Mining, Beijing Jiaotong University, Beijing, China; School of Computer and Information Technology, Beijing Key Lab of Traf\ufb01c Data Analysis and Mining, Beijing Jiaotong University, Beijing, China; School of Computer and Information Technology, Beijing Key Lab of Traf\ufb01c Data Analysis and Mining, Beijing Jiaotong University, Beijing, China; School of Computer and Information Technology, Beijing Key Lab of Traf\ufb01c Data Analysis and Mining, Beijing Jiaotong University, Beijing, China", + "aff": "School of Computer and Information Technology, Beijing Key Lab of Traffic Data Analysis and Mining, Beijing Jiaotong University, Beijing, China; School of Computer and Information Technology, Beijing Key Lab of Traffic Data Analysis and Mining, Beijing Jiaotong University, Beijing, China; School of Computer and Information Technology, Beijing Key Lab of Traffic Data Analysis and Mining, Beijing Jiaotong University, Beijing, China; School of Computer and Information Technology, Beijing Key Lab of Traffic Data Analysis and Mining, Beijing Jiaotong University, Beijing, China", "project": "", "github": "https://github.com/JackeyWang96/TesNet", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Wang_Interpretable_Image_Recognition_ICCV_2021_supplemental.pdf", @@ -20671,14 +22070,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Interpretable_Image_Recognition_by_Constructing_Transparent_Embedding_Space_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Beijing Jiao Tong University", + "aff_unique_norm": "Beijing Jiaotong University", 
"aff_unique_dep": "School of Computer and Information Technology", "aff_unique_url": "http://www.bjtu.edu.cn", "aff_unique_abbr": "BJTU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Jiaqi and Liu,\n Huafeng and Wang,\n Xinyue and Jing,\n Liping\n},\n title = {\n Interpretable Image Recognition by Constructing Transparent Embedding Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 895-904\n} \n}" }, { "title": "Interpretable Visual Reasoning via Induced Symbolic Space", @@ -20686,6 +22086,7 @@ "status": "Poster", "track": "main", "pid": 4359, + "author_site": "Zhonghao Wang; Kai Wang; Mo Yu; Jinjun Xiong; Wen-mei Hwu; Mark Hasegawa-Johnson; Humphrey Shi", "author": "Zhonghao Wang; Kai Wang; Mo Yu; Jinjun Xiong; Wen-mei Hwu; Mark Hasegawa-Johnson; Humphrey Shi", "abstract": "We study the problem of concept induction in visual reasoning, i.e., identifying concepts and their hierarchical relationships from question-answer pairs associated with images; and achieve an interpretable model via working on the induced symbolic concept space. To this end, we first design a new framework named object-centric compositional attention model (OCCAM) to perform the visual reasoning task with object-level visual features. Then, we come up with a method to induce concepts of objects and relations using clues from the attention patterns between objects' visual features and question words. Finally, we achieve a higher level of interpretability by imposing OCCAM on the objects represented in the induced symbolic concept space. 
Experiments on the CLEVR and GQA datasets demonstrate: 1) our OCCAM achieves a new state of the art without human-annotated functional programs; 2) our induced concepts are both accurate and sufficient as OCCAM achieves an on-par performance on objects represented either in visual features or in the induced symbolic concept space.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Interpretable_Visual_Reasoning_via_Induced_Symbolic_Space_ICCV_2021_paper.pdf", @@ -20702,14 +22103,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Interpretable_Visual_Reasoning_via_Induced_Symbolic_Space_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2;0;0;0+1+3", - "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Oregon;IBM;Picsart AI Research", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;University of Oregon;MIT-IBM Watson AI Lab;Picsart AI Research", "aff_unique_dep": ";;AI Lab;AI Research", "aff_unique_url": "https://www illinois.edu;https://www.uoregon.edu;https://watson-ai-lab.csail.mit.edu/;https://research.picsart.com", "aff_unique_abbr": "UIUC;UO;MIT-IBM AI Lab;PAIR", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0;0+0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zhonghao and Wang,\n Kai and Yu,\n Mo and Xiong,\n Jinjun and Hwu,\n Wen-mei and Hasegawa-Johnson,\n Mark and Shi,\n Humphrey\n},\n title = {\n Interpretable Visual Reasoning via Induced Symbolic Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1878-1887\n} \n}" }, { "title": "Interpretation of Emergent Communication in Heterogeneous Collaborative Embodied Agents", @@ -20717,6 +22119,7 @@ "status": "Poster", 
"track": "main", "pid": 7990, + "author_site": "Shivansh Patel; Saim Wani; Unnat Jain; Alexander G. Schwing; Svetlana Lazebnik; Manolis Savva; Angel X. Chang", "author": "Shivansh Patel; Saim Wani; Unnat Jain; Alexander G. Schwing; Svetlana Lazebnik; Manolis Savva; Angel X. Chang", "abstract": "Communication between embodied AI agents has received increasing attention in recent years. Despite its use, it is still unclear whether the learned communication is interpretable and grounded in perception. To study the grounding of emergent forms of communication, we first introduce the collaborative multi-object navigation task 'CoMON.' In this task, an 'oracle agent' has detailed environment information in the form of a map. It communicates with a 'navigator agent' that perceives the environment visually and is tasked to find a sequence of goals. To succeed at the task, effective communication is essential. CoMON hence serves as a basis to study different communication mechanisms between heterogeneous agents, that is, agents with different capabilities and roles. We study two common communication mechanisms and analyze their communication patterns through an egocentric and spatial lens. 
We show that the emergent communication can be grounded to the agent observations and the spatial structure of the 3D environment.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Patel_Interpretation_of_Emergent_Communication_in_Heterogeneous_Collaborative_Embodied_Agents_ICCV_2021_paper.pdf", @@ -20733,14 +22136,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Patel_Interpretation_of_Emergent_Communication_in_Heterogeneous_Collaborative_Embodied_Agents_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2;2;0;0", - "aff_unique_norm": "Simon Fraser University;Indian Institute of Technology Kanpur;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Simon Fraser University;Indian Institute of Technology Kanpur;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sfu.ca;https://www.iitk.ac.in;https://www illinois.edu", "aff_unique_abbr": "SFU;IITK;UIUC", "aff_campus_unique_index": "1;2;2;2", "aff_campus_unique": ";Kanpur;Urbana-Champaign", "aff_country_unique_index": "0;1;2;2;2;0;0", - "aff_country_unique": "Canada;India;United States" + "aff_country_unique": "Canada;India;United States", + "bibtex": "@InProceedings{Patel_2021_ICCV,\n \n author = {\n Patel,\n Shivansh and Wani,\n Saim and Jain,\n Unnat and Schwing,\n Alexander G. 
and Lazebnik,\n Svetlana and Savva,\n Manolis and Chang,\n Angel X.\n},\n title = {\n Interpretation of Emergent Communication in Heterogeneous Collaborative Embodied Agents\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15953-15963\n} \n}" }, { "title": "Interpreting Attributions and Interactions of Adversarial Attacks", @@ -20748,10 +22152,11 @@ "status": "Poster", "track": "main", "pid": 2774, + "author_site": "Xin Wang; Shuyun Lin; Hao Zhang; Yufei Zhu; Quanshi Zhang", "author": "Xin Wang; Shuyun Lin; Hao Zhang; Yufei Zhu; Quanshi Zhang", "abstract": "This paper aims to explain adversarial attacks in terms of how adversarial perturbations contribute to the attacking task. We estimate attributions of different image regions to the decrease of the attacking cost based on the Shapley value. We define and quantify interactions among adversarial perturbation pixels, and decompose the entire perturbation map into relatively independent perturbation components. The decomposition of the perturbation map shows that adversarially-trained DNNs have more perturbation components in the foreground than normally-trained DNNs. Moreover, compared to the normally-trained DNN, the adversarially-trained DNN have more components which mainly decrease the score of the true category. 
Above analyses provide new insights into the understanding of adversarial attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Interpreting_Attributions_and_Interactions_of_Adversarial_Attacks_ICCV_2021_paper.pdf", - "aff": "Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; John Hopcroft Center and the MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University", + "aff": "Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; John Hopcroft Center and the MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Wang_Interpreting_Attributions_and_ICCV_2021_supplemental.pdf", @@ -20771,7 +22176,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Xin and Lin,\n Shuyun and Zhang,\n Hao and Zhu,\n Yufei and Zhang,\n Quanshi\n},\n title = {\n Interpreting Attributions and Interactions of Adversarial Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1095-1104\n} \n}" }, { "title": "IntraTomo: Self-Supervised Learning-Based Tomography via Sinogram Synthesis and Prediction", @@ -20779,6 +22185,7 @@ "status": "Poster", "track": "main", "pid": 6076, + "author_site": "Guangming Zang; Ramzi Idoughi; Rui Li; Peter Wonka; Wolfgang Heidrich", "author": "Guangming Zang; Ramzi Idoughi; Rui Li; Peter Wonka; Wolfgang Heidrich", "abstract": "We propose IntraTomo, a powerful framework that combines the benefits of learning-based and model-based approaches 
for solving highly ill-posed inverse problems in the Computed Tomography (CT) context. IntraTomo is composed of two core modules: a novel sinogram prediction module, and a geometry refinement module, which are applied iteratively. In the first module, the unknown density field is represented as a continuous and differentiable function, parameterized by a deep neural network. This network is learned, in a self-supervised fashion, from the incomplete or/and degraded input sinogram. After getting estimated through the sinogram prediction module, the density field is consistently refined in the second module using local and non-local geometrical priors. With these two core modules, we show that IntraTomo significantly outperforms existing approaches on several ill-posed inverse problems, such as limited angle tomography with a range of 45 degrees, sparse view tomographic reconstruction with as few as eight views, or super-resolution tomography with eight times increased resolution. The experiments on simulated and real data show that our approach can achieve results of unprecedented quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zang_IntraTomo_Self-Supervised_Learning-Based_Tomography_via_Sinogram_Synthesis_and_Prediction_ICCV_2021_paper.pdf", @@ -20802,7 +22209,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Zang_2021_ICCV,\n \n author = {\n Zang,\n Guangming and Idoughi,\n Ramzi and Li,\n Rui and Wonka,\n Peter and Heidrich,\n Wolfgang\n},\n title = {\n IntraTomo: Self-Supervised Learning-Based Tomography via Sinogram Synthesis and Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1960-1970\n} \n}" }, { "title": "Intrinsic-Extrinsic Preserved GANs for 
Unsupervised 3D Pose Transfer", @@ -20810,6 +22218,7 @@ "status": "Poster", "track": "main", "pid": 4200, + "author_site": "Haoyu Chen; Hao Tang; Henglin Shi; Wei Peng; Nicu Sebe; Guoying Zhao", "author": "Haoyu Chen; Hao Tang; Henglin Shi; Wei Peng; Nicu Sebe; Guoying Zhao", "abstract": "With the strength of deep generative models, 3D pose transfer regains intensive research interests in recent years. Existing methods mainly rely on a variety of constraints to achieve the pose transfer over 3D meshes, e.g., the need for manually encoding for shape and pose disentanglement. In this paper, we present an unsupervised approach to conduct the pose transfer between any arbitrate given 3D meshes. Specifically, a novel Intrinsic-Extrinsic Preserved Generative Adversarial Network (IEP-GAN) is presented for both intrinsic (i.e., shape) and extrinsic (i.e., pose) information preservation. Extrinsically, we propose a co-occurrence discriminator to capture the structural/pose invariance from distinct Laplacians of the mesh. Meanwhile, intrinsically, a local intrinsic-preserved loss is introduced to preserve the geodesic priors while avoiding heavy computations. At last, we show the possibility of using IEP-GAN to manipulate 3D human meshes in various ways, including pose transfer, identity swapping and pose interpolation with latent code vector arithmetic. The extensive experiments on various 3D datasets of humans, animals and hands qualitatively and quantitatively demonstrate the generality of our approach. Our proposed model produces better results and is substantially more efficient compared to recent state-of-the-art methods. 
Code is available: https://github.com/mikecheninoulu/Unsupervised_IEPGAN", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Intrinsic-Extrinsic_Preserved_GANs_for_Unsupervised_3D_Pose_Transfer_ICCV_2021_paper.pdf", @@ -20829,11 +22238,12 @@ "aff_unique_norm": "University of Oulu;ETH Zurich;University of Trento", "aff_unique_dep": "CMVS;Computer Vision Lab;DISI", "aff_unique_url": "https://www.oulu.fi;https://www.ethz.ch;https://www.unitn.it", - "aff_unique_abbr": ";ETHZ;UniTN", + "aff_unique_abbr": ";ETHZ;", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;1;0;0;2;0+2", - "aff_country_unique": "Finland;Switzerland;Italy" + "aff_country_unique": "Finland;Switzerland;Italy", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Haoyu and Tang,\n Hao and Shi,\n Henglin and Peng,\n Wei and Sebe,\n Nicu and Zhao,\n Guoying\n},\n title = {\n Intrinsic-Extrinsic Preserved GANs for Unsupervised 3D Pose Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8630-8639\n} \n}" }, { "title": "Inverting a Rolling Shutter Camera: Bring Rolling Shutter Images to High Framerate Global Shutter Video", @@ -20841,10 +22251,11 @@ "status": "Poster", "track": "main", "pid": 1876, + "author_site": "Bin Fan; Yuchao Dai", "author": "Bin Fan; Yuchao Dai", "abstract": "Rolling shutter (RS) images can be viewed as the result of the row-wise combination of global shutter (GS) images captured by a virtual moving GS camera over the period of camera readout time. The RS effect brings tremendous difficulties for the downstream applications. In this paper, we propose to invert the above RS imaging mechanism, i.e., recovering a high framerate GS video from consecutive RS images to achieve RS temporal super-resolution (RSSR). 
This extremely challenging problem, e.g., recovering 1440 GS images from two 720-height RS images, is far from being solved end-to-end. To address this challenge, we exploit the geometric constraint in the RS camera model, thus achieving geometry-aware inversion. Specifically, we make three contributions in resolving the above difficulties: (i) formulating the bidirectional RS undistortion flows under the constant velocity motion model, (ii) building the connection between the RS undistortion flow and optical flow via a scaling operation, and (iii) developing a mutual conversion scheme between varying RS undistortion flows that correspond to different scanlines. Building upon these formulations, we propose the first RS temporal super-resolution network in a cascaded structure to extract high framerate global shutter video. Our method explores the underlying spatio-temporal geometric relationships within a deep learning framework, where no extra supervision besides the middle-scanline ground truth GS image is needed. Essentially, our method can be very efficient for explicit propagation to generate GS images under any scanline. 
Experimental results on both synthetic and real data show that our method can produce high-quality GS image sequences with rich details, outperforming state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_Inverting_a_Rolling_Shutter_Camera_Bring_Rolling_Shutter_Images_to_ICCV_2021_paper.pdf", - "aff": "School of Electronics and Information, Northwestern Polytechnical University, Xi\u2019an, China; School of Electronics and Information, Northwestern Polytechnical University, Xi\u2019an, China", + "aff": "School of Electronics and Information, Northwestern Polytechnical University, Xi’an, China; School of Electronics and Information, Northwestern Polytechnical University, Xi’an, China", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Fan_Inverting_a_Rolling_ICCV_2021_supplemental.zip", @@ -20864,7 +22275,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Bin and Dai,\n Yuchao\n},\n title = {\n Inverting a Rolling Shutter Camera: Bring Rolling Shutter Images to High Framerate Global Shutter Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4228-4237\n} \n}" }, { "title": "Invisible Backdoor Attack With Sample-Specific Triggers", @@ -20872,6 +22284,7 @@ "status": "Poster", "track": "main", "pid": 4190, + "author_site": "Yuezun Li; Yiming Li; Baoyuan Wu; Longkang Li; Ran He; Siwei Lyu", "author": "Yuezun Li; Yiming Li; Baoyuan Wu; Longkang Li; Ran He; Siwei Lyu", "abstract": "Recently, backdoor attacks pose a new security threat to the training process of deep neural networks (DNNs). 
Attackers intend to inject hidden backdoors into DNNs, such that the attacked model performs well on benign samples, whereas its prediction will be maliciously changed if hidden backdoors are activated by the attacker-defined trigger. Existing backdoor attacks usually adopt the setting that triggers are sample-agnostic, i.e., different poisoned samples contain the same trigger, resulting in that the attacks could be easily mitigated by current backdoor defenses. In this work, we explore a novel attack paradigm, where backdoor triggers are sample-specific. In our attack, we only need to modify certain training samples with invisible perturbation, while not need to manipulate other training components (e.g., training loss, and model structure) as required in many existing attacks. Specifically, inspired by the recent advance in DNN-based image steganography, we generate sample-specific invisible additive noises as backdoor triggers by encoding an attacker-specified string into benign images through an encoder-decoder network. The mapping from the string to the target label will be generated when DNNs are trained on the poisoned dataset. 
Extensive experiments on benchmark datasets verify the effectiveness of our method in attacking models with or without defenses.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Invisible_Backdoor_Attack_With_Sample-Specific_Triggers_ICCV_2021_paper.pdf", @@ -20888,14 +22301,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Invisible_Backdoor_Attack_With_Sample-Specific_Triggers_ICCV_2021_paper.html", "aff_unique_index": "0;1;2+3;2+3;4;5", - "aff_unique_norm": "Ocean University of China;Tsinghua University;Chinese University of Hong Kong;Shenzhen Research Institute of Big Data;Chinese Academy of Sciences;University at Buffalo", + "aff_unique_norm": "Ocean University of China;Tsinghua University;The Chinese University of Hong Kong;Shenzhen Research Institute of Big Data;Chinese Academy of Sciences;University at Buffalo", "aff_unique_dep": ";International Graduate School;School of Data Science;Secure Computing Lab of Big Data;Institute of Automation;", "aff_unique_url": "http://www.ouc.edu.cn;https://www.tsinghua.edu.cn;https://www.cuhk.edu.cn;;http://www.ia.cas.cn;https://www.buffalo.edu", "aff_unique_abbr": "OUC;THU;CUHK;;CAS;UB", "aff_campus_unique_index": "0;1;1+1;1+1;2;3", "aff_campus_unique": "Qingdao;Shenzhen;Beijing;Buffalo", "aff_country_unique_index": "0;0;0+0;0+0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yuezun and Li,\n Yiming and Wu,\n Baoyuan and Li,\n Longkang and He,\n Ran and Lyu,\n Siwei\n},\n title = {\n Invisible Backdoor Attack With Sample-Specific Triggers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16463-16472\n} \n}" }, { "title": "Is Pseudo-Lidar Needed for Monocular 3D Object Detection?", @@ -20903,6 +22317,7 @@ "status": "Poster", "track": "main", 
"pid": 11354, + "author_site": "Dennis Park; Rares Ambrus; Vitor Guizilini; Jie Li; Adrien Gaidon", "author": "Dennis Park; Rares Ambrus; Vitor Guizilini; Jie Li; Adrien Gaidon", "abstract": "Recent progress in 3D object detection from single images leverages monocular depth estimation as a way to produce 3D pointclouds, turning cameras into pseudo-lidar sensors. These two-stage detectors improve with the accuracy of the intermediate depth estimation network, which can itself be improved without manual labels via large-scale self-supervised learning. However, they tend to suffer from overfitting more than end-to-end methods, are more complex, and the gap with similar lidar-based detectors remains significant. In this work, we propose an end-to-end, single stage, monocular 3D object detector, DD3D, that can benefit from depth pre-training like pseudo-lidar methods, but without their limitations. Our architecture is designed for effective information transfer between depth estimation and 3D detection, allowing us to scale with the amount of unlabeled pre-training data. 
Our method achieves state-of-theart results on two challenging benchmarks, with 16.34% and 9.28% AP for Cars and Pedestrians (respectively) on the KITTI-3D benchmark, and 41.5% mAP on NuScenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Is_Pseudo-Lidar_Needed_for_Monocular_3D_Object_Detection_ICCV_2021_paper.pdf", @@ -20926,7 +22341,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Dennis and Ambrus,\n Rares and Guizilini,\n Vitor and Li,\n Jie and Gaidon,\n Adrien\n},\n title = {\n Is Pseudo-Lidar Needed for Monocular 3D Object Detection?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3142-3152\n} \n}" }, { "title": "Iterative Label Cleaning for Transductive and Semi-Supervised Few-Shot Learning", @@ -20934,6 +22350,7 @@ "status": "Poster", "track": "main", "pid": 7654, + "author_site": "Michalis Lazarou; Tania Stathaki; Yannis Avrithis", "author": "Michalis Lazarou; Tania Stathaki; Yannis Avrithis", "abstract": "Few-shot learning amounts to learning representations and acquiring knowledge such that novel tasks may be solved with both supervision and data being limited. Improved performance is possible by transductive inference, where the entire test set is available concurrently, and semi-supervised learning, where more unlabeled data is available. Focusing on these two settings, we introduce a new algorithm that leverages the manifold structure of the labeled and unlabeled data distribution to predict pseudo-labels, while balancing over classes and using the loss value distribution of a limited-capacity classifier to select the cleanest labels, iteratively improving the quality of pseudo-labels. 
Our solution surpasses or matches the state of the art results on four benchmark datasets, namely miniImageNet, tieredImageNet, CUB and CIFAR-FS, while being robust over feature space pre-processing and the quantity of available data. The publicly available source code can be found in https://github.com/MichalisLazarou/iLPC", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lazarou_Iterative_Label_Cleaning_for_Transductive_and_Semi-Supervised_Few-Shot_Learning_ICCV_2021_paper.pdf", @@ -20948,7 +22365,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lazarou_Iterative_Label_Cleaning_for_Transductive_and_Semi-Supervised_Few-Shot_Learning_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lazarou_Iterative_Label_Cleaning_for_Transductive_and_Semi-Supervised_Few-Shot_Learning_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lazarou_2021_ICCV,\n \n author = {\n Lazarou,\n Michalis and Stathaki,\n Tania and Avrithis,\n Yannis\n},\n title = {\n Iterative Label Cleaning for Transductive and Semi-Supervised Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8751-8760\n} \n}" }, { "title": "JEM++: Improved Techniques for Training JEM", @@ -20956,6 +22374,7 @@ "status": "Poster", "track": "main", "pid": 9222, + "author_site": "Xiulong Yang; Shihao Ji", "author": "Xiulong Yang; Shihao Ji", "abstract": "Joint Energy-based Model (JEM) is a recently proposed hybrid model that retains strong discriminative power of modern CNN classifiers, while generating samples rivaling the quality of GAN-based approaches. In this paper, we propose a variety of new training procedures and architecture features to improve JEM's accuracy, training stability, and speed altogether. 
1) We propose a proximal SGLD to generate samples in the proximity of samples from previous step, which improves the stability. 2) We further treat the approximate maximum likelihood learning of EBM as a multi-step differential game, and extend the YOPO framework to cut out redundant calculations during backpropagation, which accelerates the training substantially. 3) Rather than initializing SGLD chain from random noise, we introduce a new informative initialization that samples from a distribution estimated from training data. 4) This informative initialization allows us to enable batch normalization in JEM, which further releases the power of modern CNN architectures for hybrid modeling.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_JEM_Improved_Techniques_for_Training_JEM_ICCV_2021_paper.pdf", @@ -20979,7 +22398,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Xiulong and Ji,\n Shihao\n},\n title = {\n JEM++: Improved Techniques for Training JEM\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6494-6503\n} \n}" }, { "title": "Joint Audio-Visual Deepfake Detection", @@ -20987,6 +22407,7 @@ "status": "Poster", "track": "main", "pid": 9326, + "author_site": "Yipin Zhou; Ser-Nam Lim", "author": "Yipin Zhou; Ser-Nam Lim", "abstract": "Deepfakes (\"deep learning\" + \"fake\") are synthetically-generated videos from AI algorithms. While they could be entertaining, they could also be misused for falsifying speeches and spreading misinformation. The process to create deepfakes involves both visual and auditory manipulations. 
Exploration on detecting visual deepfakes has produced a number of detection methods as well as datasets, while audio deepfakes (e.g. synthetic speech from text-to-speech or voice conversion systems) and the relationship between the visual and auditory modalities have been relatively neglected. In this work, we propose a novel visual / auditory deepfake joint detection task and show that exploiting the intrinsic synchronization between the visual and auditory modalities could benefit deepfake detection. Experiments demonstrate that the proposed joint detection framework outperforms independently trained models, and at the same time, yields superior generalization capability on unseen types of deepfakes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Joint_Audio-Visual_Deepfake_Detection_ICCV_2021_paper.pdf", @@ -21003,14 +22424,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Joint_Audio-Visual_Deepfake_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI", "aff_unique_url": "https://www.facebook.com", "aff_unique_abbr": "Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Yipin and Lim,\n Ser-Nam\n},\n title = {\n Joint Audio-Visual Deepfake Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14800-14809\n} \n}" }, { "title": "Joint Inductive and Transductive Learning for Video Object Segmentation", @@ -21018,6 +22440,7 @@ "status": "Poster", "track": "main", "pid": 3911, + "author_site": "Yunyao Mao; Ning Wang; Wengang Zhou; Houqiang Li", "author": "Yunyao Mao; Ning Wang; Wengang 
Zhou; Houqiang Li", "abstract": "Semi-supervised video object segmentation is a task of segmenting the target object in a video sequence given only a mask annotation in the first frame. The limited information available makes it an extremely challenging task. Most previous best-performing methods adopt matching-based transductive reasoning or online inductive learning. Nevertheless, they are either less discriminative for similar instances or insufficient in the utilization of spatio-temporal information. In this work, we propose to integrate transductive and inductive learning into a unified framework to exploit the complementarity between them for accurate and robust video object segmentation. The proposed approach consists of two functional branches. The transduction branch adopts a lightweight transformer architecture to aggregate rich spatio-temporal cues while the induction branch performs online inductive learning to obtain discriminative target information. To bridge these two diverse branches, a two-head label encoder is introduced to learn the suitable target prior for each of them. The generated mask encodings are further forced to be disentangled to better retain their complementarity. Extensive experiments on several prevalent benchmarks show that, without the need of synthetic training data, the proposed approach sets a series of new state-of-the-art records. 
Code is available at https://github.com/maoyunyao/JOINT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mao_Joint_Inductive_and_Transductive_Learning_for_Video_Object_Segmentation_ICCV_2021_paper.pdf", @@ -21041,7 +22464,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Mao_2021_ICCV,\n \n author = {\n Mao,\n Yunyao and Wang,\n Ning and Zhou,\n Wengang and Li,\n Houqiang\n},\n title = {\n Joint Inductive and Transductive Learning for Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9670-9679\n} \n}" }, { "title": "Joint Representation Learning and Novel Category Discovery on Single- and Multi-Modal Data", @@ -21049,6 +22473,7 @@ "status": "Poster", "track": "main", "pid": 2742, + "author_site": "Xuhui Jia; Kai Han; Yukun Zhu; Bradley Green", "author": "Xuhui Jia; Kai Han; Yukun Zhu; Bradley Green", "abstract": "This paper studies the problem of novel category discovery on single- and multi-modal data with labels from different but relevant categories. We present a generic, end-to-end framework to jointly learn a reliable representation and assign clusters to unlabelled data. To avoid over-fitting the learnt embedding to labelled data, we take inspiration from self-supervised representation learning by noise-contrastive estimation and extend it to jointly handle labelled and unlabelled data. In particular, we propose using category discrimination on labelled data and cross-modal discrimination on multi-modal data to augment instance discrimination used in conventional contrastive learning approaches. 
We further employ Winner-Take-All (WTA) hashing algorithm on the shared representation space to generate pairwise pseudo labels for unlabelled data to better predict cluster assignments. We thoroughly evaluate our framework on large-scale multi-modal video benchmarks Kinetics-400 and VGG-Sound, and image benchmarks CIFAR10, CIFAR100 and ImageNet, obtaining state-of-the-art results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jia_Joint_Representation_Learning_and_Novel_Category_Discovery_on_Single-_and_ICCV_2021_paper.pdf", @@ -21065,14 +22490,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jia_Joint_Representation_Learning_and_Novel_Category_Discovery_on_Single-_and_ICCV_2021_paper.html", "aff_unique_index": "0;0+1+2;0;0", - "aff_unique_norm": "Google;University of Bristol;University of Hong Kong", - "aff_unique_dep": "Google;;", + "aff_unique_norm": "Google;University of Bristol;The University of Hong Kong", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.google.com;https://www.bristol.ac.uk;https://www.hku.hk", "aff_unique_abbr": "Google;Bristol;HKU", "aff_campus_unique_index": "0;0+2;0;0", "aff_campus_unique": "Mountain View;;Hong Kong SAR", "aff_country_unique_index": "0;0+1+2;0;0", - "aff_country_unique": "United States;United Kingdom;China" + "aff_country_unique": "United States;United Kingdom;China", + "bibtex": "@InProceedings{Jia_2021_ICCV,\n \n author = {\n Jia,\n Xuhui and Han,\n Kai and Zhu,\n Yukun and Green,\n Bradley\n},\n title = {\n Joint Representation Learning and Novel Category Discovery on Single- and Multi-Modal Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 610-619\n} \n}" }, { "title": "Joint Topology-Preserving and Feature-Refinement Network for Curvilinear Structure Segmentation", @@ -21080,6 +22506,7 @@ "status": "Poster", "track": "main", "pid": 7266, + 
"author_site": "Mingfei Cheng; Kaili Zhao; Xuhong Guo; Yajing Xu; Jun Guo", "author": "Mingfei Cheng; Kaili Zhao; Xuhong Guo; Yajing Xu; Jun Guo", "abstract": "Curvilinear structure segmentation (CSS) is under semantic segmentation, whose applications include crack detection, aerial road extraction, and biomedical image segmentation. In general, geometric topology and pixel-wise features are two critical aspects of CSS. However, most semantic segmentation methods only focus on enhancing feature representations while existing CSS techniques emphasize preserving topology alone. In this paper, we present a Joint Topology-preserving and Feature-refinement Network (JTFN) that jointly models global topology and refined features based on an iterative feedback learning strategy. Specifically, we explore the structure of objects to help preserve corresponding topologies of predicted masks, thus design a reciprocative two-stream module for CSS and boundary detection. In addition, we introduce such topology-aware predictions as feedback guidance that refines attentive features by supplementing and enhancing saliencies. To the best of our knowledge, this is the first work that jointly addresses topology preserving and feature refinement for CSS. We evaluate JTFN on four datasets of diverse applications: Crack500, CrackTree200, Roads, and DRIVE. Results show that JTFN performs best in comparison with alternative methods. 
Code is available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_Joint_Topology-Preserving_and_Feature-Refinement_Network_for_Curvilinear_Structure_Segmentation_ICCV_2021_paper.pdf", @@ -21103,7 +22530,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Mingfei and Zhao,\n Kaili and Guo,\n Xuhong and Xu,\n Yajing and Guo,\n Jun\n},\n title = {\n Joint Topology-Preserving and Feature-Refinement Network for Curvilinear Structure Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7147-7156\n} \n}" }, { "title": "Joint Visual Semantic Reasoning: Multi-Stage Decoder for Text Recognition", @@ -21111,6 +22539,7 @@ "status": "Poster", "track": "main", "pid": 3449, + "author_site": "Ayan Kumar Bhunia; Aneeshan Sain; Amandeep Kumar; Shuvozit Ghose; Pinaki Nath Chowdhury; Yi-Zhe Song", "author": "Ayan Kumar Bhunia; Aneeshan Sain; Amandeep Kumar; Shuvozit Ghose; Pinaki Nath Chowdhury; Yi-Zhe Song", "abstract": "Although text recognition has significantly evolved over the years, state-of the-art (SOTA) models still struggle in the wild scenarios due to complex backgrounds, varying fonts, uncontrolled illuminations, distortions and other artifacts. This is because such models solely depend on visual information for text recognition, thus lacking semantic reasoning capabilities. In this paper, we argue that semantic information offers a complementary role in addition to visual only. More specifically, we additionally utilize semantic information by proposing a multi-stage multi-scale attentional decoder that performs joint visual-semantic reasoning. 
Our novelty lies in the intuition that for text recognition, prediction should be refined in a stage-wise manner. Therefore our key contribution is in designing a stage-wise unrolling attentional decoder where non-differentiability, invoked by discretely predicted character labels, needs to be bypassed for end-to-end training. While the first stage predicts using visual features, subsequent stages refine on-top of it using joint visual-semantic information. Additionally, we introduce multi-scale 2D attention along with dense and residual connections between different stages to deal with varying scales of character sizes, for better performance and faster convergence during training. Experimental results show our approach to outperform existing SOTA methods by a considerable margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhunia_Joint_Visual_Semantic_Reasoning_Multi-Stage_Decoder_for_Text_Recognition_ICCV_2021_paper.pdf", @@ -21134,7 +22563,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bhunia_2021_ICCV,\n \n author = {\n Bhunia,\n Ayan Kumar and Sain,\n Aneeshan and Kumar,\n Amandeep and Ghose,\n Shuvozit and Chowdhury,\n Pinaki Nath and Song,\n Yi-Zhe\n},\n title = {\n Joint Visual Semantic Reasoning: Multi-Stage Decoder for Text Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14940-14949\n} \n}" }, { "title": "Joint Visual and Audio Learning for Video Highlight Detection", @@ -21142,6 +22572,7 @@ "status": "Poster", "track": "main", "pid": 6310, + "author_site": "Taivanbat Badamdorj; Mrigank Rochan; Yang Wang; Li Cheng", "author": "Taivanbat Badamdorj; Mrigank Rochan; Yang Wang; Li Cheng", "abstract": "In video highlight detection, the 
goal is to identify the interesting moments within an unedited video. Although the audio component of the video provides important cues for highlight detection, the majority of existing efforts focus almost exclusively on the visual component. In this paper, we argue that both audio and visual components of a video should be modeled jointly to retrieve its best moments. To this end, we propose an audio-visual network for video highlight detection. At the core of our approach lies a bimodal attention mechanism, which captures the interaction between the audio and visual components of a video, and produces fused representations to facilitate highlight detection. Furthermore, we introduce a noise sentinel technique to adaptively discount a noisy visual or audio modality. Empirical evaluations on two benchmark datasets demonstrate the superior performance of our approach over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Badamdorj_Joint_Visual_and_Audio_Learning_for_Video_Highlight_Detection_ICCV_2021_paper.pdf", @@ -21156,7 +22587,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Badamdorj_Joint_Visual_and_Audio_Learning_for_Video_Highlight_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Badamdorj_Joint_Visual_and_Audio_Learning_for_Video_Highlight_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Badamdorj_2021_ICCV,\n \n author = {\n Badamdorj,\n Taivanbat and Rochan,\n Mrigank and Wang,\n Yang and Cheng,\n Li\n},\n title = {\n Joint Visual and Audio Learning for Video Highlight Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8127-8137\n} \n}" }, { "title": "Just Ask: Learning To Answer Questions From Millions of Narrated Videos", @@ -21164,10 +22596,11 @@ "status": 
"Poster", "track": "main", "pid": 4169, + "author_site": "Antoine Yang; Antoine Miech; Josef Sivic; Ivan Laptev; Cordelia Schmid", "author": "Antoine Yang; Antoine Miech; Josef Sivic; Ivan Laptev; Cordelia Schmid", "abstract": "Recent methods for visual question answering rely on large-scale annotated datasets. Manual annotation of questions and answers for videos, however, is tedious, expensive and prevents scalability. In this work, we propose to avoid manual annotation and generate a large-scale training dataset for video question answering making use of automatic cross-modal supervision. We leverage a question generation transformer trained on text data and use it to generate question-answer pairs from transcribed video narrations. Given narrated videos, we then automatically generate the HowToVQA69M dataset with 69M video-question-answer triplets. To handle the open vocabulary of diverse answers in this dataset, we propose a training procedure based on a contrastive loss between a video-question multi-modal transformer and an answer transformer. We introduce the zero-shot VideoQA task and show excellent results, in particular for rare answers. Furthermore, we demonstrate our method to significantly outperform the state of the art on MSRVTT-QA, MSVD-QA, ActivityNet-QA and How2QA. 
Finally, for a detailed evaluation we introduce iVQA, a new VideoQA dataset with reduced language biases and high-quality redundant manual annotations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Just_Ask_Learning_To_Answer_Questions_From_Millions_of_Narrated_ICCV_2021_paper.pdf", - "aff": "Inria Paris + D\u00b4epartement d\u2019informatique de l\u2019ENS, CNRS, PSL Research University; Inria Paris + D\u00b4epartement d\u2019informatique de l\u2019ENS, CNRS, PSL Research University + Now at DeepMind; CIIRC CTU Prague; Inria Paris + D\u00b4epartement d\u2019informatique de l\u2019ENS, CNRS, PSL Research University; Inria Paris + D\u00b4epartement d\u2019informatique de l\u2019ENS, CNRS, PSL Research University", + "aff": "Inria Paris + Département d’informatique de l’ENS, CNRS, PSL Research University; Inria Paris + Département d’informatique de l’ENS, CNRS, PSL Research University + Now at DeepMind; CIIRC CTU Prague; Inria Paris + Département d’informatique de l’ENS, CNRS, PSL Research University; Inria Paris + Département d’informatique de l’ENS, CNRS, PSL Research University", "project": "https://antoyang.github.io/just-ask.html", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Yang_Just_Ask_Learning_ICCV_2021_supplemental.pdf", @@ -21180,14 +22613,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Just_Ask_Learning_To_Answer_Questions_From_Millions_of_Narrated_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1+2;3;0+1;0+1", - "aff_unique_norm": "INRIA;\u00c9cole Normale Sup\u00e9rieure;DeepMind;Czech Technical University in Prague", - "aff_unique_dep": ";D\u00e9partement d\u2019informatique;;CIIRC", + "aff_unique_norm": "Inria;École Normale Supérieure;DeepMind;Czech Technical University in Prague", + "aff_unique_dep": ";Département d’informatique;;CIIRC", "aff_unique_url": "https://www.inria.fr;https://www.ens.fr;https://deepmind.com;https://www.ciirc.cvut.cz/", 
"aff_unique_abbr": "Inria;ENS;DeepMind;CIIRC", "aff_campus_unique_index": "0;0;2;0;0", "aff_campus_unique": "Paris;;Prague", "aff_country_unique_index": "0+0;0+0+1;2;0+0;0+0", - "aff_country_unique": "France;United Kingdom;Czech Republic" + "aff_country_unique": "France;United Kingdom;Czech Republic", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Antoine and Miech,\n Antoine and Sivic,\n Josef and Laptev,\n Ivan and Schmid,\n Cordelia\n},\n title = {\n Just Ask: Learning To Answer Questions From Millions of Narrated Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1686-1697\n} \n}" }, { "title": "Just One Moment: Structural Vulnerability of Deep Action Recognition Against One Frame Attack", @@ -21195,6 +22629,7 @@ "status": "Poster", "track": "main", "pid": 9823, + "author_site": "Jaehui Hwang; Jun-Hyuk Kim; Jun-Ho Choi; Jong-Seok Lee", "author": "Jaehui Hwang; Jun-Hyuk Kim; Jun-Ho Choi; Jong-Seok Lee", "abstract": "The video-based action recognition task has been extensively studied in recent years. In this paper, we study the structural vulnerability of deep learning-based action recognition models against the adversarial attack using the one frame attack that adds an inconspicuous perturbation to only a single frame of a given video clip. Our analysis shows that the models are highly vulnerable against the one frame attack due to their structural properties. Experiments demonstrate high fooling rates and inconspicuous characteristics of the attack. Furthermore, we show that strong universal one frame perturbations can be obtained under various scenarios. 
Our work raises the serious issue of adversarial vulnerability of the state-of-the-art action recognition models in various perspectives.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hwang_Just_One_Moment_Structural_Vulnerability_of_Deep_Action_Recognition_Against_ICCV_2021_paper.pdf", @@ -21218,7 +22653,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Hwang_2021_ICCV,\n \n author = {\n Hwang,\n Jaehui and Kim,\n Jun-Hyuk and Choi,\n Jun-Ho and Lee,\n Jong-Seok\n},\n title = {\n Just One Moment: Structural Vulnerability of Deep Action Recognition Against One Frame Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7668-7676\n} \n}" }, { "title": "Just a Few Points Are All You Need for Multi-View Stereo: A Novel Semi-Supervised Learning Method for Multi-View Stereo", @@ -21226,6 +22662,7 @@ "status": "Poster", "track": "main", "pid": 8146, + "author_site": "Taekyung Kim; Jaehoon Choi; Seokeon Choi; Dongki Jung; Changick Kim", "author": "Taekyung Kim; Jaehoon Choi; Seokeon Choi; Dongki Jung; Changick Kim", "abstract": "While learning-based multi-view stereo (MVS) methods have recently shown successful performances in quality and efficiency, limited MVS data hampers generalization to unseen environments. A simple solution is to generate various large-scale MVS datasets, but generating dense ground truth for 3D structure requires a huge amount of time and resources. On the other hand, if the reliance on dense ground truth is relaxed, MVS systems will generalize more smoothly to new environments. 
To this end, we first introduce a novel semi-supervised multi-view stereo framework called a Sparse Ground truth-based MVS Network (SGT-MVSNet) that can reliably reconstruct the 3D structures even with a few ground truth 3D points. Our strategy is to divide the accurate and erroneous regions and individually conquer them based on our observation that a probability map can separate these regions. We propose a self-supervision loss called the 3D Point Consistency Loss to enhance the 3D reconstruction performance, which forces the 3D points back-projected from the corresponding pixels by the predicted depth values to meet at the same 3D coordinates. Finally, we propagate these improved depth predictions toward edges and occlusions by the Coarse-to-fine Reliable Depth Propagation module. We generate the spare ground truth of the DTU dataset for evaluation and extensive experiments verify that our SGT-MVSNet outperforms the state-of-the-art MVS methods on the sparse ground truth setting. Moreover, our method shows comparable reconstruction results to the supervised MVS methods though we only used tens and hundreds of ground truth 3D points.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Just_a_Few_Points_Are_All_You_Need_for_Multi-View_ICCV_2021_paper.pdf", @@ -21242,14 +22679,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Just_a_Few_Points_Are_All_You_Need_for_Multi-View_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2;0", - "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Maryland;NAVER LABS", + "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Maryland;NAVER Labs", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;https://www/umd.edu;https://www.naverlabs.com", "aff_unique_abbr": "KAIST;UMD;NAVER Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - 
"aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Taekyung and Choi,\n Jaehoon and Choi,\n Seokeon and Jung,\n Dongki and Kim,\n Changick\n},\n title = {\n Just a Few Points Are All You Need for Multi-View Stereo: A Novel Semi-Supervised Learning Method for Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6178-6186\n} \n}" }, { "title": "Keep CALM and Improve Visual Feature Attribution", @@ -21257,10 +22695,11 @@ "status": "Poster", "track": "main", "pid": 1308, + "author_site": "Jae Myung Kim; Junsuk Choe; Zeynep Akata; Seong Joon Oh", "author": "Jae Myung Kim; Junsuk Choe; Zeynep Akata; Seong Joon Oh", "abstract": "The class activation mapping, or CAM, has been the cornerstone of feature attribution methods for multiple vision tasks. Its simplicity and effectiveness have led to wide applications in the explanation of visual predictions and weakly-supervised localization tasks. However, CAM has its own shortcomings. The computation of attribution maps relies on ad-hoc calibration steps that are not part of the training computational graph, making it difficult for us to understand the real meaning of the attribution values. In this paper, we improve CAM by explicitly incorporating a latent variable encoding the location of the cue for recognition in the formulation, thereby subsuming the attribution map into the training computational graph. The resulting model, class activation latent mapping, or CALM, is trained with the expectation-maximization algorithm. Our experiments show that CALM identifies discriminative attributes for image classifiers more accurately than CAM and other visual attribution baselines. 
CALM also shows performance improvements over prior arts on the weakly-supervised object localization benchmarks. Our code is available at https://github.com/naver-ai/calm.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Keep_CALM_and_Improve_Visual_Feature_Attribution_ICCV_2021_paper.pdf", - "aff": "University of T\u00fcbingen; Department of Computer Science and Engineering, Sogang University; Max Planck Institute for Intelligent Systems+Max Planck Institute for Informatics; NAVER AI Lab", + "aff": "University of Tübingen; Department of Computer Science and Engineering, Sogang University; Max Planck Institute for Intelligent Systems+Max Planck Institute for Informatics; NAVER AI Lab", "project": "", "github": "https://github.com/naver-ai/calm", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Kim_Keep_CALM_and_ICCV_2021_supplemental.pdf", @@ -21273,14 +22712,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Keep_CALM_and_Improve_Visual_Feature_Attribution_ICCV_2021_paper.html", "aff_unique_index": "0;1;2+3;4", - "aff_unique_norm": "University of T\u00fcbingen;Sogang University;Max Planck Institute for Intelligent Systems;Max Planck Institute for Informatics;NAVER Corporation", + "aff_unique_norm": "University of Tübingen;Sogang University;Max Planck Institute for Intelligent Systems;Max Planck Institute for Informatics;NAVER Corporation", "aff_unique_dep": ";Department of Computer Science and Engineering;Intelligent Systems;;NAVER AI Lab", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.sogang.ac.kr;https://www.mpi-is.mpg.de;https://mpi-inf.mpg.de;https://www.naver.com", - "aff_unique_abbr": "Uni T\u00fcbingen;Sogang;MPI-IS;MPII;NAVER", + "aff_unique_abbr": "Uni Tübingen;Sogang;MPI-IS;MPII;NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0+0;1", - "aff_country_unique": "Germany;South Korea" + "aff_country_unique": "Germany;South 
Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Jae Myung and Choe,\n Junsuk and Akata,\n Zeynep and Oh,\n Seong Joon\n},\n title = {\n Keep CALM and Improve Visual Feature Attribution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8350-8360\n} \n}" }, { "title": "Kernel Methods in Hyperbolic Spaces", @@ -21288,6 +22728,7 @@ "status": "Poster", "track": "main", "pid": 8088, + "author_site": "Pengfei Fang; Mehrtash Harandi; Lars Petersson", "author": "Pengfei Fang; Mehrtash Harandi; Lars Petersson", "abstract": "Embedding data in hyperbolic spaces has proven beneficial for many advanced machine learning applications such as image classification and word embeddings. However, working in hyperbolic spaces is not without difficulties as a result of its curved geometry (e.g., computing the Frechet mean of a set of points requires an iterative algorithm). Furthermore, in Euclidean spaces, one can resort to kernel machines that not only enjoy rich theoretical properties but that can also lead to superior representational power (e.g., infinite-width neural networks). In this paper, we introduce positive definite kernel functions for hyperbolic spaces. This brings in two major advantages, 1. kernelization will pave the way to seamlessly benefit from kernel machines in conjunction with hyperbolic embeddings, and 2. the rich structure of the Hilbert spaces associated with kernel machines enables us to simplify various operations involving hyperbolic data. That said, identifying valid kernel functions on curved spaces is not straightforward and is indeed considered an open problem in the learning community. Our work addresses this gap and develops several valid positive definite kernels in hyperbolic spaces, including the universal ones (e.g., RBF). 
We comprehensively study the proposed kernels on a variety of challenging tasks including few-shot learning, zero-shot learning, person re-identification and knowledge distillation, showing the superiority of the kernelization for hyperbolic representations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Kernel_Methods_in_Hyperbolic_Spaces_ICCV_2021_paper.pdf", @@ -21311,7 +22752,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Fang_2021_ICCV,\n \n author = {\n Fang,\n Pengfei and Harandi,\n Mehrtash and Petersson,\n Lars\n},\n title = {\n Kernel Methods in Hyperbolic Spaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10665-10674\n} \n}" }, { "title": "Keypoint Communities", @@ -21319,6 +22761,7 @@ "status": "Poster", "track": "main", "pid": 7646, + "author_site": "Duncan Zauss; Sven Kreiss; Alexandre Alahi", "author": "Duncan Zauss; Sven Kreiss; Alexandre Alahi", "abstract": "We present a fast bottom-up method that jointly detects over 100 keypoints on humans or objects, also referred to as human/object pose estimation. We model all keypoints belonging to a human or an object --the pose-- as a graph and leverage insights from community detection to quantify the independence of keypoints. We use a graph centrality measure to assign training weights to different parts of a pose. Our proposed measure quantifies how tightly a keypoint is connected to its neighborhood. Our experiments show that our method outperforms all previous methods for human pose estimation with fine-grained keypoint annotations on the face, the hands and the feet with a total of 133 keypoints. 
We also show that our method generalizes to car poses.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zauss_Keypoint_Communities_ICCV_2021_paper.pdf", @@ -21333,7 +22776,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zauss_Keypoint_Communities_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zauss_Keypoint_Communities_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zauss_2021_ICCV,\n \n author = {\n Zauss,\n Duncan and Kreiss,\n Sven and Alahi,\n Alexandre\n},\n title = {\n Keypoint Communities\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11057-11066\n} \n}" }, { "title": "KiloNeRF: Speeding Up Neural Radiance Fields With Thousands of Tiny MLPs", @@ -21341,10 +22785,11 @@ "status": "Poster", "track": "main", "pid": 7572, + "author_site": "Christian Reiser; Songyou Peng; Yiyi Liao; Andreas Geiger", "author": "Christian Reiser; Songyou Peng; Yiyi Liao; Andreas Geiger", "abstract": "NeRF synthesizes novel views of a scene with unprecedented quality by fitting a neural radiance field to RGB images. However, NeRF requires querying a deep Multi-Layer Perceptron (MLP) millions of times, leading to slow rendering times, even on modern GPUs. In this paper, we demonstrate that real-time rendering is possible by utilizing thousands of tiny MLPs instead of one single large MLP. In our setting, each individual MLP only needs to represent parts of the scene, thus smaller and faster-to-evaluate MLPs can be used. By combining this divide-and-conquer strategy with further optimizations, rendering is accelerated by three orders of magnitude compared to the original NeRF model without incurring high storage costs. 
Further, using teacher-student distillation for training, we show that this speed-up can be achieved without sacrificing visual quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Reiser_KiloNeRF_Speeding_Up_Neural_Radiance_Fields_With_Thousands_of_Tiny_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen+University of T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen+University of T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen+University of T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen+University of T\u00fcbingen", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen+University of Tübingen; Max Planck Institute for Intelligent Systems, Tübingen+University of Tübingen; Max Planck Institute for Intelligent Systems, Tübingen+University of Tübingen; Max Planck Institute for Intelligent Systems, Tübingen+University of Tübingen", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Reiser_KiloNeRF_Speeding_Up_ICCV_2021_supplemental.pdf", @@ -21357,14 +22802,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Reiser_KiloNeRF_Speeding_Up_Neural_Radiance_Fields_With_Thousands_of_Tiny_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of T\u00fcbingen", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Tübingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "MPI-IS;Uni T\u00fcbingen", + "aff_unique_abbr": "MPI-IS;Uni Tübingen", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + 
"bibtex": "@InProceedings{Reiser_2021_ICCV,\n \n author = {\n Reiser,\n Christian and Peng,\n Songyou and Liao,\n Yiyi and Geiger,\n Andreas\n},\n title = {\n KiloNeRF: Speeding Up Neural Radiance Fields With Thousands of Tiny MLPs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14335-14345\n} \n}" }, { "title": "Knowledge Mining and Transferring for Domain Adaptive Object Detection", @@ -21372,6 +22818,7 @@ "status": "Poster", "track": "main", "pid": 3705, + "author_site": "Kun Tian; Chenghao Zhang; Ying Wang; Shiming Xiang; Chunhong Pan", "author": "Kun Tian; Chenghao Zhang; Ying Wang; Shiming Xiang; Chunhong Pan", "abstract": "With the thriving of deep learning, CNN-based object detectors have made great progress in the past decade. However, the domain gap between training and testing data leads to a prominent performance degradation and thus hinders their application in the real world. To alleviate this problem, Knowledge Transfer Network (KTNet) is proposed as a new paradigm for domain adaption. Specifically, KTNet is constructed on a base detector with intrinsic knowledge mining and relational knowledge constraints. First, we design a foreground/background classifier shared by source domain and target domain to extract the common attribute knowledge of objects in different scenarios. Second, we model the relational knowledge graph and explicitly constrain the consistency of category correlation under source domain, target domain, as well as cross-domain conditions. As a result, the detector is guided to learn object-related and domain-independent representation. Extensive experiments and visualizations confirm that transferring object-specific knowledge can yield notable performance gains. 
The proposed KTNet achieves state-of-the-art results on three cross-domain detection benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tian_Knowledge_Mining_and_Transferring_for_Domain_Adaptive_Object_Detection_ICCV_2021_paper.pdf", @@ -21395,7 +22842,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tian_2021_ICCV,\n \n author = {\n Tian,\n Kun and Zhang,\n Chenghao and Wang,\n Ying and Xiang,\n Shiming and Pan,\n Chunhong\n},\n title = {\n Knowledge Mining and Transferring for Domain Adaptive Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9133-9142\n} \n}" }, { "title": "Knowledge-Enriched Distributional Model Inversion Attacks", @@ -21403,6 +22851,7 @@ "status": "Poster", "track": "main", "pid": 4392, + "author_site": "Si Chen; Mostafa Kahla; Ruoxi Jia; Guo-Jun Qi", "author": "Si Chen; Mostafa Kahla; Ruoxi Jia; Guo-Jun Qi", "abstract": "Model inversion (MI) attacks are aimed at reconstructing training data from model parameters. Such attacks have triggered increasing concerns about privacy, especially given the growing number of online model repositories. However, existing MI attacks against deep neural networks (DNNs) have a large room for performance improvement. We present a novel inversion-specific GAN that can better distill knowledge useful for performing attacks on private models from public data. In particular, we train the discriminator to differentiate not only the real and fake samples but the soft-labels provided by the target model. Moreover, unlike previous work that directly searches for a single data point to represent a target class, we propose to model a private data distribution for each target class. 
Our experiments show that the combination of these techniques can significantly boost the success rate of the state-of-the-art MI attacks by 150%, and generalize better to a variety of datasets and models. Our code is available at https://github.com/SCccc21/Knowledge-Enriched-DMI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Knowledge-Enriched_Distributional_Model_Inversion_Attacks_ICCV_2021_paper.pdf", @@ -21419,14 +22868,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Knowledge-Enriched_Distributional_Model_Inversion_Attacks_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Virginia Tech;InnoPeak Technology", + "aff_unique_norm": "Virginia Tech;Innopeak Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.vt.edu;", "aff_unique_abbr": "VT;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Si and Kahla,\n Mostafa and Jia,\n Ruoxi and Qi,\n Guo-Jun\n},\n title = {\n Knowledge-Enriched Distributional Model Inversion Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16178-16187\n} \n}" }, { "title": "KoDF: A Large-Scale Korean DeepFake Detection Dataset", @@ -21434,6 +22884,7 @@ "status": "Poster", "track": "main", "pid": 8640, + "author_site": "Patrick Kwon; Jaeseong You; Gyuhyeon Nam; Sungwoo Park; Gyeongsu Chae", "author": "Patrick Kwon; Jaeseong You; Gyuhyeon Nam; Sungwoo Park; Gyeongsu Chae", "abstract": "A variety of effective face-swap and face-reenactment methods have been publicized in recent years, democratizing the face synthesis technology to a great extent. 
Videos generated as such have come to be called deepfakes with a negative connotation, for various social problems they have caused. Facing the emerging threat of deepfakes, we have built the Korean DeepFake Detection Dataset (KoDF), a large-scale collection of synthesized and real videos focused on Korean subjects. In this paper, we provide a detailed description of methods used to construct the dataset, experimentally show the discrepancy between the distributions of KoDF and existing deepfake detection datasets, and underline the importance of using multiple datasets for real-world generalization. KoDF is publicly available at https://moneybrain-research.github.io/kodf in its entirety (i.e. real clips, synthesized clips, clips with adversarial attack, and metadata).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kwon_KoDF_A_Large-Scale_Korean_DeepFake_Detection_Dataset_ICCV_2021_paper.pdf", @@ -21448,7 +22899,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kwon_KoDF_A_Large-Scale_Korean_DeepFake_Detection_Dataset_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kwon_KoDF_A_Large-Scale_Korean_DeepFake_Detection_Dataset_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kwon_2021_ICCV,\n \n author = {\n Kwon,\n Patrick and You,\n Jaeseong and Nam,\n Gyuhyeon and Park,\n Sungwoo and Chae,\n Gyeongsu\n},\n title = {\n KoDF: A Large-Scale Korean DeepFake Detection Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10744-10753\n} \n}" }, { "title": "LFI-CAM: Learning Feature Importance for Better Visual Explanation", @@ -21456,6 +22908,7 @@ "status": "Poster", "track": "main", "pid": 9500, + "author_site": "Kwang Hee Lee; Chaewon Park; Junghyun Oh; Nojun Kwak", "author": "Kwang Hee Lee; Chaewon Park; Junghyun Oh; Nojun 
Kwak", "abstract": "Class Activation Mapping (CAM) is a powerful technique used to understand the decision making of Convolutional Neural Network (CNN) in computer vision. Recently, there have been attempts not only to generate better visual explanations, but also to improve classification performance using visual explanations. However, previous works still have their own drawbacks. In this paper, we propose a novel architecture, LFI-CAM***(Learning Feature Importance Class Activation Mapping), which is trainable for image classification and visual explanation in an end-to-end manner. LFI-CAM generates attention map for visual explanation during forward propagation, and simultaneously uses attention map to improve classification performance through the attention mechanism. Feature Importance Network (FIN) focuses on learning the feature importance instead of directly learning the attention map to obtain a more reliable and consistent attention map. We confirmed that LFI-CAM is optimized not only by learning the feature importance but also by enhancing the backbone feature representation to focus more on important features of the input image. 
Experiments show that LFI-CAM outperforms baseline models' accuracy on classification tasks as well as significantly improves on previous works in terms of attention map quality and stability over different hyper-parameters.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_LFI-CAM_Learning_Feature_Importance_for_Better_Visual_Explanation_ICCV_2021_paper.pdf", @@ -21470,7 +22923,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_LFI-CAM_Learning_Feature_Importance_for_Better_Visual_Explanation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_LFI-CAM_Learning_Feature_Importance_for_Better_Visual_Explanation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Kwang Hee and Park,\n Chaewon and Oh,\n Junghyun and Kwak,\n Nojun\n},\n title = {\n LFI-CAM: Learning Feature Importance for Better Visual Explanation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1355-1363\n} \n}" }, { "title": "LIGA-Stereo: Learning LiDAR Geometry Aware Representations for Stereo-Based 3D Detector", @@ -21478,6 +22932,7 @@ "status": "Poster", "track": "main", "pid": 2859, + "author_site": "Xiaoyang Guo; Shaoshuai Shi; Xiaogang Wang; Hongsheng Li", "author": "Xiaoyang Guo; Shaoshuai Shi; Xiaogang Wang; Hongsheng Li", "abstract": "Stereo-based 3D detection aims at detecting 3D object bounding boxes from stereo images using intermediate depth maps or implicit 3D geometry representations, which provides a low-cost solution for 3D perception. However, its performance is still inferior compared with LiDAR-based detection algorithms. To detect and localize accurate 3D bounding boxes, LiDAR-based models can encode accurate object boundaries and surface normal directions from LiDAR point clouds. 
However, the detection results of stereo-based detectors are easily affected by the erroneous depth features due to the limitation of stereo matching. To solve the problem, we propose LIGA-Stereo (LiDAR Geometry Aware Stereo Detector) to learn stereo-based 3D detectors under the guidance of high-level geometry-aware representations of LiDAR-based detection models. In addition, we found existing voxel-based stereo detectors failed to learn semantic features effectively from indirect 3D supervisions. We attach an auxiliary 2D detection head to provide direct 2D semantic supervisions. Experiment results show that the above two strategies improved the geometric and semantic representation capabilities. Compared with the state-of-the-art stereo detector, our method has improved the 3D detection performance of cars, pedestrians, cyclists by 10.44%, 5.69%, 5.97% mAP respectively on the official KITTI benchmark. The gap between stereo-based and LiDAR-based 3D detectors is further narrowed.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_LIGA-Stereo_Learning_LiDAR_Geometry_Aware_Representations_for_Stereo-Based_3D_Detector_ICCV_2021_paper.pdf", @@ -21494,14 +22949,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guo_LIGA-Stereo_Learning_LiDAR_Geometry_Aware_Representations_for_Stereo-Based_3D_Detector_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "CUHK-SenseTime Joint Laboratory", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Xiaoyang and Shi,\n Shaoshuai and Wang,\n Xiaogang and Li,\n Hongsheng\n},\n title = {\n 
LIGA-Stereo: Learning LiDAR Geometry Aware Representations for Stereo-Based 3D Detector\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3153-3163\n} \n}" }, { "title": "LIRA: Learnable, Imperceptible and Robust Backdoor Attacks", @@ -21509,6 +22965,7 @@ "status": "Poster", "track": "main", "pid": 11075, + "author_site": "Khoa Doan; Yingjie Lao; Weijie Zhao; Ping Li", "author": "Khoa Doan; Yingjie Lao; Weijie Zhao; Ping Li", "abstract": "Recently, machine learning models have demonstrated to be vulnerable to backdoor attacks, primarily due to the lack of transparency in black-box models such as deep neural networks. A third-party model can be poisoned such that it works adequately in normal conditions but behaves maliciously on samples with specific trigger patterns. However, the trigger injection function is manually defined in most existing backdoor attack methods, e.g., placing a small patch of pixels on an image or slightly deforming the image before poisoning the model. This results in a two-stage approach with a sub-optimal attack success rate and a lack of complete stealthiness under human inspection. In this paper, we propose a novel and stealthy backdoor attack framework, LIRA, which jointly learns the optimal, stealthy trigger injection function and poisons the model. We formulate such an objective as a non-convex, constrained optimization problem. Under this optimization framework, the trigger generator function will learn to manipulate the input with imperceptible noise to preserve the model performance on the clean data and maximize the attack success rate on the poisoned data. Then, we solve this challenging optimization problem with an efficient, two-stage stochastic optimization procedure. 
Finally, the proposed attack framework achieves 100% success rates in several benchmark datasets, including MNIST, CIFAR10, GTSRB, and T-ImageNet, while simultaneously bypassing existing backdoor defense methods and human inspection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Doan_LIRA_Learnable_Imperceptible_and_Robust_Backdoor_Attacks_ICCV_2021_paper.pdf", @@ -21523,7 +22980,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Doan_LIRA_Learnable_Imperceptible_and_Robust_Backdoor_Attacks_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Doan_LIRA_Learnable_Imperceptible_and_Robust_Backdoor_Attacks_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Doan_2021_ICCV,\n \n author = {\n Doan,\n Khoa and Lao,\n Yingjie and Zhao,\n Weijie and Li,\n Ping\n},\n title = {\n LIRA: Learnable,\n Imperceptible and Robust Backdoor Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11966-11976\n} \n}" }, { "title": "LOKI: Long Term and Key Intentions for Trajectory Prediction", @@ -21531,6 +22989,7 @@ "status": "Poster", "track": "main", "pid": 7416, + "author_site": "Harshayu Girase; Haiming Gang; Srikanth Malla; Jiachen Li; Akira Kanehara; Karttikeya Mangalam; Chiho Choi", "author": "Harshayu Girase; Haiming Gang; Srikanth Malla; Jiachen Li; Akira Kanehara; Karttikeya Mangalam; Chiho Choi", "abstract": "Recent advances in trajectory prediction have shown that explicit reasoning about agents' intent is important to accurately forecast their motion. However, the current research activities are not directly applicable to intelligent and safety critical systems. This is mainly because very few public datasets are available, and they only consider pedestrian-specific intents for a short temporal horizon from a restricted egocentric view. 
To this end, we propose LOKI (LOng term and Key Intentions), a novel large-scale dataset that is designed to tackle joint trajectory and intention prediction for heterogeneous traffic agents (pedestrians and vehicles) in an autonomous driving setting. The LOKI dataset is created to discover several factors that may affect intention, including i) agent's own will, ii) social interactions, iii) environmental constraints, and iv) contextual information. We also propose a model that jointly performs trajectory and intention prediction, showing that recurrently reasoning about intention can assist with trajectory prediction. We show our method outperforms state-of-the-art trajectory prediction methods by upto 27% and also provide a baseline for frame-wise intention estimation. The dataset is available at https://usa.honda-ri.com/loki", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Girase_LOKI_Long_Term_and_Key_Intentions_for_Trajectory_Prediction_ICCV_2021_paper.pdf", @@ -21549,12 +23008,13 @@ "aff_unique_index": "0+1;0;0;0+1;2;1;0", "aff_unique_norm": "Honda Research Institute;University of California, Berkeley;Honda R&D Co., Ltd.", "aff_unique_dep": "Honda Research Institute;;", - "aff_unique_url": "https://honda-ri.com;https://www.berkeley.edu;https://www.honda.com/", + "aff_unique_url": "https://honda-ri.com;https://www.berkeley.edu;https://www.honda.com", "aff_unique_abbr": "HRI USA;UC Berkeley;Honda R&D", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+0;0;0;0+0;1;0;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Girase_2021_ICCV,\n \n author = {\n Girase,\n Harshayu and Gang,\n Haiming and Malla,\n Srikanth and Li,\n Jiachen and Kanehara,\n Akira and Mangalam,\n Karttikeya and Choi,\n Chiho\n},\n title = {\n LOKI: Long Term and Key Intentions for Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9803-9812\n} \n}" }, { "title": "LSD-StructureNet: Modeling Levels of Structural Detail in 3D Part Hierarchies", @@ -21562,6 +23022,7 @@ "status": "Poster", "track": "main", "pid": 10594, + "author_site": "Dominic Roberts; Ara Danielyan; Hang Chu; Mani Golparvar-Fard; David Forsyth", "author": "Dominic Roberts; Ara Danielyan; Hang Chu; Mani Golparvar-Fard; David Forsyth", "abstract": "Generative models for 3D shapes represented by hierarchies of parts can generate realistic and diverse sets of outputs. However, existing models suffer from the key practical limitation of modelling shapes holistically and thus cannot perform conditional sampling, i.e. they are not able to generate variants on individual parts of generated shapes without modifying the rest of the shape. This is limiting for applications such as 3D CAD design that involve adjusting created shapes at multiple levels of detail. To address this, we introduce LSD-StructureNet, an augmentation to the StructureNet architecture that enables re-generation of parts situated at arbitrary positions in the hierarchies of its outputs. We achieve this by learning individual, probabilistic conditional decoders for each hierarchy depth. We evaluate LSD-StructureNet on the PartNet dataset, the largest dataset of 3D shapes represented by hierarchies of parts. 
Our results show that contrarily to existing methods, LSD-StructureNet can perform conditional sampling without impacting inference speed or the realism and diversity of its outputs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Roberts_LSD-StructureNet_Modeling_Levels_of_Structural_Detail_in_3D_Part_Hierarchies_ICCV_2021_paper.pdf", @@ -21578,14 +23039,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Roberts_LSD-StructureNet_Modeling_Levels_of_Structural_Detail_in_3D_Part_Hierarchies_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Autodesk", - "aff_unique_dep": ";Autodesk AI Lab", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Autodesk", + "aff_unique_dep": ";AI Lab", "aff_unique_url": "https://illinois.edu;https://www.autodesk.com", "aff_unique_abbr": "UIUC;Autodesk", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Roberts_2021_ICCV,\n \n author = {\n Roberts,\n Dominic and Danielyan,\n Ara and Chu,\n Hang and Golparvar-Fard,\n Mani and Forsyth,\n David\n},\n title = {\n LSD-StructureNet: Modeling Levels of Structural Detail in 3D Part Hierarchies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5836-5845\n} \n}" }, { "title": "LSG-CPD: Coherent Point Drift With Local Surface Geometry for Point Cloud Registration", @@ -21593,6 +23055,7 @@ "status": "Poster", "track": "main", "pid": 7429, + "author_site": "Weixiao Liu; Hongtao Wu; Gregory S. Chirikjian", "author": "Weixiao Liu; Hongtao Wu; Gregory S. Chirikjian", "abstract": "Probabilistic point cloud registration methods are becoming more popular because of their robustness. 
However, unlike point-to-plane variants of iterative closest point (ICP) which incorporate local surface geometric information such as surface normals, most probabilistic methods (e.g., coherent point drift (CPD)) ignore such information and build Gaussian mixture models (GMMs) with isotropic Gaussian covariances. This results in sphere-like GMM components which only penalize the point-to-point distance between the two point clouds. In this paper, we propose a novel method called CPD with Local Surface Geometry (LSG-CPD) for rigid point cloud registration. Our method adaptively adds different levels of point-to-plane penalization on top of the point-to-point penalization based on the flatness of the local surface. This results in GMM components with anisotropic covariances. We formulate point cloud registration as a maximum likelihood estimation (MLE) problem and solve it with the Expectation-Maximization (EM) algorithm. In the E step, we demonstrate that the computation can be recast into simple matrix manipulations and efficiently computed on a GPU. In the M step, we perform an unconstrained optimization on a matrix Lie group to efficiently update the rigid transformation of the registration. The proposed method outperforms state-of-the-art algorithms in terms of accuracy and robustness on various datasets captured with range scanners, RGBD cameras, and LiDARs. Also, it is significantly faster than modern implementations of CPD. 
The source code is available at https://github.com/ChirikjianLab/LSG-CPD.git.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_LSG-CPD_Coherent_Point_Drift_With_Local_Surface_Geometry_for_Point_ICCV_2021_paper.pdf", @@ -21607,7 +23070,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_LSG-CPD_Coherent_Point_Drift_With_Local_Surface_Geometry_for_Point_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_LSG-CPD_Coherent_Point_Drift_With_Local_Surface_Geometry_for_Point_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Weixiao and Wu,\n Hongtao and Chirikjian,\n Gregory S.\n},\n title = {\n LSG-CPD: Coherent Point Drift With Local Surface Geometry for Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15293-15302\n} \n}" }, { "title": "LaLaLoc: Latent Layout Localisation in Dynamic, Unvisited Environments", @@ -21615,10 +23079,11 @@ "status": "Poster", "track": "main", "pid": 3420, + "author_site": "Henry Howard-Jenkins; Jose-Raul Ruiz-Sarmiento; Victor Adrian Prisacariu", "author": "Henry Howard-Jenkins; Jose-Raul Ruiz-Sarmiento; Victor Adrian Prisacariu", "abstract": "We present LaLaLoc to localise in environments without the need for prior visitation, and in a manner that is robust to large changes in scene appearance, such as a full rearrangement of furniture. Specifically, LaLaLoc performs localisation through latent representations of room layout. LaLaLoc learns a rich embedding space shared between RGB panoramas and layouts inferred from a known floor plan that encodes the structural similarity between locations. Further, LaLaLoc introduces direct, cross-modal pose optimisation in its latent space. 
Thus, LaLaLoc enables fine-grained pose estimation in a scene without the need for prior visitation, as well as being robust to dynamics, such as a change in furniture configuration. We show that in a domestic environment LaLaLoc is able to accurately localise a single RGB panorama image to within 8.3cm, given only a floor plan as a prior.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Howard-Jenkins_LaLaLoc_Latent_Layout_Localisation_in_Dynamic_Unvisited_Environments_ICCV_2021_paper.pdf", - "aff": "Active Vision Laboratory, University of Oxford; Machine Perception and Intelligent Robotics Group, University of M\u00e1laga; Active Vision Laboratory, University of Oxford", + "aff": "Active Vision Laboratory, University of Oxford; Machine Perception and Intelligent Robotics Group, University of Málaga; Active Vision Laboratory, University of Oxford", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Howard-Jenkins_LaLaLoc_Latent_Layout_ICCV_2021_supplemental.pdf", @@ -21631,14 +23096,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Howard-Jenkins_LaLaLoc_Latent_Layout_Localisation_in_Dynamic_Unvisited_Environments_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "University of Oxford;University of M\u00e1laga", + "aff_unique_norm": "University of Oxford;University of Málaga", "aff_unique_dep": "Active Vision Laboratory;Machine Perception and Intelligent Robotics Group", - "aff_unique_url": "https://www.ox.ac.uk;https://www.uma.es", + "aff_unique_url": "https://www.ox.ac.uk;https://www.uma.es", "aff_unique_abbr": "Oxford;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United Kingdom;Spain" + "aff_country_unique": "United Kingdom;Spain", + "bibtex": "@InProceedings{Howard-Jenkins_2021_ICCV,\n \n author = {\n Howard-Jenkins,\n Henry and Ruiz-Sarmiento,\n Jose-Raul and 
Prisacariu,\n Victor Adrian\n},\n title = {\n LaLaLoc: Latent Layout Localisation in Dynamic,\n Unvisited Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10107-10116\n} \n}" }, { "title": "LabOR: Labeling Only if Required for Domain Adaptive Semantic Segmentation", @@ -21646,6 +23112,7 @@ "status": "Poster", "track": "main", "pid": 7219, + "author_site": "Inkyu Shin; Dong-Jin Kim; Jae Won Cho; Sanghyun Woo; Kwanyong Park; In So Kweon", "author": "Inkyu Shin; Dong-Jin Kim; Jae Won Cho; Sanghyun Woo; Kwanyong Park; In So Kweon", "abstract": "Unsupervised Domain Adaptation (UDA) for semantic segmentation has been actively studied to mitigate the domain gap between label-rich source data and unlabeled target data. Despite these efforts, UDA still has a long way to go to reach the fully supervised performance. To this end, we propose a Labeling Only if Required strategy, LabOR, where we introduce a human-in-the-loop approach to adaptively give scarce labels to points that a UDA model is uncertain about. In order to find the uncertain points, we generate an inconsistency mask using the proposed adaptive pixel selector and we label these segment-based regions to achieve near supervised performance with only a small fraction (about 2.2%) ground truth points, which we call \"Segment based Pixel-Labeling (SPL).\" To further reduce the efforts of the human annotator, we also propose \"Point based Pixel-Labeling (PPL),\" which finds the most representative points for labeling within the generated inconsistency mask. This reduces efforts from 2.2% segment label to 40 points label while minimizing performance degradation. 
Through extensive experimentation, we show the advantages of this new framework for domain adaptive semantic segmentation while minimizing human labor costs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shin_LabOR_Labeling_Only_if_Required_for_Domain_Adaptive_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -21669,7 +23136,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Shin_2021_ICCV,\n \n author = {\n Shin,\n Inkyu and Kim,\n Dong-Jin and Cho,\n Jae Won and Woo,\n Sanghyun and Park,\n Kwanyong and Kweon,\n In So\n},\n title = {\n LabOR: Labeling Only if Required for Domain Adaptive Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8588-8598\n} \n}" }, { "title": "Labels4Free: Unsupervised Segmentation Using StyleGAN", @@ -21677,6 +23145,7 @@ "status": "Poster", "track": "main", "pid": 6300, + "author_site": "Rameen Abdal; Peihao Zhu; Niloy J. Mitra; Peter Wonka", "author": "Rameen Abdal; Peihao Zhu; Niloy J. Mitra; Peter Wonka", "abstract": "We propose an unsupervised segmentation framework for StyleGAN generated objects. We build on two main observations. First, the features generated by StyleGAN hold valuable information that can be utilized towards training segmentation networks. Second, the foreground and background can often be treated to be largely independent and be swapped across images to produce plausible composited images. For our solution, we propose to augment the Style-GAN2 generator architecture with a segmentation branch and to split the generator into a foreground and background network. This enables us to generate soft segmentation masks for the foreground object in an unsupervised fashion. 
On multiple object classes, we report comparable results against state-of-the-art supervised segmentation networks, while against the best unsupervised segmentation approach we demonstrate a clear improvement, both in qualitative and quantitative metrics. Project Page : https:/rameenabdal.github.io/Labels4Free", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Abdal_Labels4Free_Unsupervised_Segmentation_Using_StyleGAN_ICCV_2021_paper.pdf", @@ -21691,7 +23160,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Abdal_Labels4Free_Unsupervised_Segmentation_Using_StyleGAN_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Abdal_Labels4Free_Unsupervised_Segmentation_Using_StyleGAN_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Abdal_2021_ICCV,\n \n author = {\n Abdal,\n Rameen and Zhu,\n Peihao and Mitra,\n Niloy J. and Wonka,\n Peter\n},\n title = {\n Labels4Free: Unsupervised Segmentation Using StyleGAN\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13970-13979\n} \n}" }, { "title": "Language-Guided Global Image Editing via Cross-Modal Cyclic Mechanism", @@ -21699,6 +23169,7 @@ "status": "Poster", "track": "main", "pid": 6365, + "author_site": "Wentao Jiang; Ning Xu; Jiayun Wang; Chen Gao; Jing Shi; Zhe Lin; Si Liu", "author": "Wentao Jiang; Ning Xu; Jiayun Wang; Chen Gao; Jing Shi; Zhe Lin; Si Liu", "abstract": "Editing an image automatically via a linguistic request can significantly save laborious manual work and is friendly to photography novice. In this paper, we focus on the task of language-guided global image editing. Existing works suffer from imbalanced data distribution of real-world datasets and thus fail to understand language requests well. 
To handle this issue, we propose to create a cycle with our image generator by creating another model called Editing Description Network (EDNet) which predicts an editing embedding given a pair of images. Given the cycle, we propose several free augmentation strategies to help our model understand various editing requests given the imbalanced dataset. In addition, two other novel ideas are proposed: an Image-Request Attention (IRA) module which allows our method to edit an image spatial-adaptively when the image requires different editing degree at different regions, as well as a new evaluation metric for this task which is more semantic and reasonable than conventional pixel losses (eg L1). Extensive experiments on two benchmark datasets demonstrate the effectiveness of our method over existing approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Language-Guided_Global_Image_Editing_via_Cross-Modal_Cyclic_Mechanism_ICCV_2021_paper.pdf", @@ -21722,7 +23193,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Wentao and Xu,\n Ning and Wang,\n Jiayun and Gao,\n Chen and Shi,\n Jing and Lin,\n Zhe and Liu,\n Si\n},\n title = {\n Language-Guided Global Image Editing via Cross-Modal Cyclic Mechanism\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2115-2124\n} \n}" }, { "title": "LapsCore: Language-Guided Person Search via Color Reasoning", @@ -21730,6 +23202,7 @@ "status": "Poster", "track": "main", "pid": 2253, + "author_site": "Yushuang Wu; Zizheng Yan; Xiaoguang Han; Guanbin Li; Changqing Zou; Shuguang Cui", "author": "Yushuang Wu; Zizheng Yan; Xiaoguang Han; Guanbin Li; Changqing Zou; Shuguang Cui", 
"abstract": "The key point of language-guided person search is to construct the cross-modal association between visual and textual input. Existing methods focus on designing multimodal attention mechanisms and novel cross-modal loss functions to learn such association implicitly. We propose a representation learning method for language-guided person search based on color reasoning (LapsCore). It can explicitly build a fine-grained cross-modal association bidirectionally. Specifically, a pair of dual sub-tasks, image colorization and text completion, is designed. In the former task, rich text information is learned to colorize gray images, and the latter one requests the model to understand the image and complete color word vacancies in the captions. The two sub-tasks enable models to learn correct alignments between text phrases and image regions, so that rich multimodal representations can be learned. Extensive experiments on multiple datasets demonstrate the effectiveness and superiority of the proposed method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_LapsCore_Language-Guided_Person_Search_via_Color_Reasoning_ICCV_2021_paper.pdf", @@ -21746,14 +23219,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_LapsCore_Language-Guided_Person_Search_via_Color_Reasoning_ICCV_2021_paper.html", "aff_unique_index": "0+0+1;0+0+1;0+0+1;2;3;0+0+1", - "aff_unique_norm": "Chinese University of Hong Kong, Shenzhen;Shenzhen Research Institute of Big Data;Sun Yat-sen University;Huawei", + "aff_unique_norm": "Chinese University of Hong Kong, Shenzhen;Shenzhen Research Institute of Big Data;Sun Yat-sen University;Huawei Technologies", "aff_unique_dep": "School of Software Engineering;;;HMI Lab", "aff_unique_url": "https://www.sse.cuhk.edu.cn;http://www.sribd.cn;http://www.sysu.edu.cn/;https://www.huawei.com", "aff_unique_abbr": "CUHK-Shenzhen;;SYSU;Huawei", "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": 
"Shenzhen;", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Yushuang and Yan,\n Zizheng and Han,\n Xiaoguang and Li,\n Guanbin and Zou,\n Changqing and Cui,\n Shuguang\n},\n title = {\n LapsCore: Language-Guided Person Search via Color Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1624-1633\n} \n}" }, { "title": "Large Scale Interactive Motion Forecasting for Autonomous Driving: The Waymo Open Motion Dataset", @@ -21761,7 +23235,8 @@ "status": "Poster", "track": "main", "pid": 9324, - "author": "Scott Ettinger; Shuyang Cheng; Benjamin Caine; Chenxi Liu; Hang Zhao; Sabeek Pradhan; Yuning Chai; Ben Sapp; Charles R. Qi; Yin Zhou; Zoey Yang; Aur\u00e9lien Chouard; Pei Sun; Jiquan Ngiam; Vijay Vasudevan; Alexander McCauley; Jonathon Shlens; Dragomir Anguelov", + "author_site": "Scott Ettinger; Shuyang Cheng; Benjamin Caine; Chenxi Liu; Hang Zhao; Sabeek Pradhan; Yuning Chai; Ben Sapp; Charles R. Qi; Yin Zhou; Zoey Yang; Aurélien Chouard; Pei Sun; Jiquan Ngiam; Vijay Vasudevan; Alexander McCauley; Jonathon Shlens; Dragomir Anguelov", + "author": "Scott Ettinger; Shuyang Cheng; Benjamin Caine; Chenxi Liu; Hang Zhao; Sabeek Pradhan; Yuning Chai; Ben Sapp; Charles R. Qi; Yin Zhou; Zoey Yang; Aurélien Chouard; Pei Sun; Jiquan Ngiam; Vijay Vasudevan; Alexander McCauley; Jonathon Shlens; Dragomir Anguelov", "abstract": "As autonomous driving systems mature, motion forecasting has received increasing attention as a critical requirement for planning. Of particular importance are interactive situations such as merges, unprotected turns, etc., where predicting individual object motion is not sufficient. Joint predictions of multiple objects are required for effective route planning. 
There has been a critical need for highquality motion data that is rich in both interactions and annotation to develop motion planning models. In this work, we introduce the most diverse interactive motion dataset to our knowledge, and provide specific labels for interacting objects suitable for developing joint prediction models. With over 100,000 scenes, each 20 seconds long at 10 Hz, our new dataset contains more than 570 hours of unique data over 1750 km of roadways. It was collected by mining for interesting interactions between vehicles, pedestrians, and cyclists across six cities within the United States. We use a high-accuracy 3D auto-labeling system to generate high quality 3D bounding boxes for each road agent, and provide corresponding high definition 3D maps for each scene. Furthermore, we introduce a new set of metrics that provides a comprehensive evaluation of both single agent and joint agent interaction motion forecasting models. Finally, we provide strong baseline models for individual agent prediction and joint-prediction. 
We hope that this new large-scale interactive motion dataset will provide new opportunities for advancing motion forecasting models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ettinger_Large_Scale_Interactive_Motion_Forecasting_for_Autonomous_Driving_The_Waymo_ICCV_2021_paper.pdf", "aff": "Waymo LLC; Waymo LLC; Google Brain; Waymo LLC; Waymo LLC; Waymo LLC; Waymo LLC; Waymo LLC; Waymo LLC; Waymo LLC; Waymo LLC; Waymo LLC; Waymo LLC; Google Brain; Google Brain; Waymo LLC; Google Brain; Waymo LLC", @@ -21784,7 +23259,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ettinger_2021_ICCV,\n \n author = {\n Ettinger,\n Scott and Cheng,\n Shuyang and Caine,\n Benjamin and Liu,\n Chenxi and Zhao,\n Hang and Pradhan,\n Sabeek and Chai,\n Yuning and Sapp,\n Ben and Qi,\n Charles R. 
and Zhou,\n Yin and Yang,\n Zoey and Chouard,\n Aur\\'elien and Sun,\n Pei and Ngiam,\n Jiquan and Vasudevan,\n Vijay and McCauley,\n Alexander and Shlens,\n Jonathon and Anguelov,\n Dragomir\n},\n title = {\n Large Scale Interactive Motion Forecasting for Autonomous Driving: The Waymo Open Motion Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9710-9719\n} \n}" }, { "title": "Large Scale Multi-Illuminant (LSMI) Dataset for Developing White Balance Algorithm Under Mixed Illumination", @@ -21792,6 +23268,7 @@ "status": "Poster", "track": "main", "pid": 7669, + "author_site": "Dongyoung Kim; Jinwoo Kim; Seonghyeon Nam; Dongwoo Lee; Yeonkyung Lee; Nahyup Kang; Hyong-Euk Lee; ByungIn Yoo; Jae-Joon Han; Seon Joo Kim", "author": "Dongyoung Kim; Jinwoo Kim; Seonghyeon Nam; Dongwoo Lee; Yeonkyung Lee; Nahyup Kang; Hyong-Euk Lee; ByungIn Yoo; Jae-Joon Han; Seon Joo Kim", "abstract": "We introduce a Large Scale Multi-Illuminant (LSMI) Dataset that contains 7,486 images, captured with three different cameras on more than 2,700 scenes with two or three illuminants. For each image in the dataset, the new dataset provides not only the pixel-wise ground truth illumination but also the chromaticity of each illuminant in the scene and the mixture ratio of illuminants per pixel. Images in our dataset are mostly captured with illuminants existing in the scene, and the ground truth illumination is computed by taking the difference between the images with different illumination combination. Therefore, our dataset captures natural composition in the real-world setting with wide field-of-view, providing more extensive dataset compared to existing datasets for multi-illumination white balance. 
As conventional single illuminant white balance algorithms cannot be directly applied, we also apply per-pixel DNN-based white balance algorithm and show its effectiveness against using patch-wise white balancing. We validate the benefits of our dataset through extensive analysis including a user-study, and expect the dataset to make meaningful contribution for future work in white balancing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Large_Scale_Multi-Illuminant_LSMI_Dataset_for_Developing_White_Balance_Algorithm_ICCV_2021_paper.pdf", @@ -21806,7 +23283,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Large_Scale_Multi-Illuminant_LSMI_Dataset_for_Developing_White_Balance_Algorithm_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Large_Scale_Multi-Illuminant_LSMI_Dataset_for_Developing_White_Balance_Algorithm_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Dongyoung and Kim,\n Jinwoo and Nam,\n Seonghyeon and Lee,\n Dongwoo and Lee,\n Yeonkyung and Kang,\n Nahyup and Lee,\n Hyong-Euk and Yoo,\n ByungIn and Han,\n Jae-Joon and Kim,\n Seon Joo\n},\n title = {\n Large Scale Multi-Illuminant (LSMI) Dataset for Developing White Balance Algorithm Under Mixed Illumination\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2410-2419\n} \n}" }, { "title": "Large-Scale Robust Deep AUC Maximization: A New Surrogate Loss and Empirical Studies on Medical Image Classification", @@ -21814,6 +23292,7 @@ "status": "Poster", "track": "main", "pid": 2911, + "author_site": "Zhuoning Yuan; Yan Yan; Milan Sonka; Tianbao Yang", "author": "Zhuoning Yuan; Yan Yan; Milan Sonka; Tianbao Yang", "abstract": "Deep AUC Maximization (DAM) is a new paradigm for learning a deep neural network 
by maximizing the AUC score of the model on a dataset. Most previous works of AUC maximization focus on the perspective of optimization by designing efficient stochastic algorithms, and studies on generalization performance of large-scale DAM on difficult tasks are missing. In this work, we aim to make DAM more practical for interesting real-world applications (e.g., medical image classification). First, we propose a new margin-based min-max surrogate loss function for the AUC score (named as the AUC min-max-margin loss or simply AUC margin loss for short). It is more robust than the commonly used AUC square loss, while enjoying the same advantage in terms of large-scale stochastic optimization. Second, we conduct extensive empirical studies of our DAM method on four difficult medical image classification tasks, namely (i) classification of chest x-ray images for identifying many threatening diseases, (ii) classification of images of skin lesions for identifying melanoma, (iii) classification of mammogram for breast cancer screening, and (iv) classification of microscopic images for identifying tumor tissue. Our studies demonstrate that the proposed DAM method improves the performance of optimizing cross-entropy loss by a large margin, and also achieves better performance than optimizing the existing AUC square loss on these medical image classification tasks. Specifically, our DAM method has achieved the 1st place on Stanford CheXpert competition on Aug. 31, 2020. To the best of our knowledge, this is the first work that makes DAM succeed on large-scale medical image datasets. We also conduct extensive ablation studies to demonstrate the advantages of the new AUC margin loss over the AUC square loss on benchmark datasets. 
The proposed method is implemented in our open-sourced library LibAUC (www.libauc.org) whose github address is https://github.com/Optimization-AI/LibAUC.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_Large-Scale_Robust_Deep_AUC_Maximization_A_New_Surrogate_Loss_and_ICCV_2021_paper.pdf", @@ -21837,7 +23316,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Zhuoning and Yan,\n Yan and Sonka,\n Milan and Yang,\n Tianbao\n},\n title = {\n Large-Scale Robust Deep AUC Maximization: A New Surrogate Loss and Empirical Studies on Medical Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3040-3049\n} \n}" }, { "title": "Latent Transformations via NeuralODEs for GAN-Based Image Editing", @@ -21845,6 +23325,7 @@ "status": "Poster", "track": "main", "pid": 8931, + "author_site": "Valentin Khrulkov; Leyla Mirvakhabova; Ivan Oseledets; Artem Babenko", "author": "Valentin Khrulkov; Leyla Mirvakhabova; Ivan Oseledets; Artem Babenko", "abstract": "Recent advances in high-fidelity semantic image editing heavily rely on the presumably disentangled latent spaces of the state-of-the-art generative models, such as StyleGAN. Specifically, recent works show that it is possible to achieve decent controllability of attributes in the face images via linear shifts along with latent directions. Several recent methods address the discovery of such directions, implicitly assuming that the state-of-the-art GANs learn the latent spaces with inherently linearly separable attribute distributions and semantic vector arithmetic properties. 
In our work, we show that nonlinear latent code manipulations realized as flows of a trainable Neural ODE are beneficial for many practical non-face image domains with more complex non-textured factors of variation. In particular, we investigate a large number of datasets with known attributes and demonstrate that certain attribute manipulations are challenging to be obtained with linear shifts only.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Khrulkov_Latent_Transformations_via_NeuralODEs_for_GAN-Based_Image_Editing_ICCV_2021_paper.pdf", @@ -21868,7 +23349,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Khrulkov_2021_ICCV,\n \n author = {\n Khrulkov,\n Valentin and Mirvakhabova,\n Leyla and Oseledets,\n Ivan and Babenko,\n Artem\n},\n title = {\n Latent Transformations via NeuralODEs for GAN-Based Image Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14428-14437\n} \n}" }, { "title": "LatentCLR: A Contrastive Learning Approach for Unsupervised Discovery of Interpretable Directions", @@ -21876,10 +23358,11 @@ "status": "Poster", "track": "main", "pid": 3325, - "author": "O\u011fuz Kaan Y\u00fcksel; Enis Simsar; Ezgi G\u00fclperi Er; Pinar Yanardag", + "author_site": "Oğuz Kaan Yüksel; Enis Simsar; Ezgi Gülperi Er; Pinar Yanardag", + "author": "Oğuz Kaan Yüksel; Enis Simsar; Ezgi Gülperi Er; Pinar Yanardag", "abstract": "Recent research has shown that it is possible to find interpretable directions in the latent spaces of pre-trained Generative Adversarial Networks (GANs). These directions enable controllable image generation and support a wide range of semantic editing operations, such as zoom or rotation. 
The discovery of such directions is often done in a supervised or semi-supervised manner and requires manual annotations which limits their use in practice. In comparison, unsupervised discovery allows finding subtle directions that are difficult to detect a priori. In this work, we propose a contrastive learning-based approach to discover semantic directions in the latent space of pre-trained GANs in a self-supervised manner. Our approach finds semantically meaningful dimensions compatible with state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuksel_LatentCLR_A_Contrastive_Learning_Approach_for_Unsupervised_Discovery_of_Interpretable_ICCV_2021_paper.pdf", - "aff": "EPFL; Technical University of Munich+Bo\u02d8gazic \u00b8i University; Bo\u02d8gazic \u00b8i University; Bo\u02d8gazic \u00b8i University", + "aff": "EPFL; Technical University of Munich+Boğaziçi University; Boğaziçi University; Boğaziçi University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Yuksel_LatentCLR_A_Contrastive_ICCV_2021_supplemental.pdf", @@ -21892,14 +23375,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yuksel_LatentCLR_A_Contrastive_Learning_Approach_for_Unsupervised_Discovery_of_Interpretable_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;2;2", - "aff_unique_norm": "EPFL;Technical University of Munich;Bogazici University", + "aff_unique_norm": "Ecole Polytechnique Fédérale de Lausanne;Technical University of Munich;Bogazici University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.epfl.ch;https://www.tum.de;https://www.bogazici.edu.tr", "aff_unique_abbr": "EPFL;TUM;Bogazici", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2;2;2", - "aff_country_unique": "Switzerland;Germany;T\u00fcrkiye" + "aff_country_unique": "Switzerland;Germany;Turkey", + "bibtex": "@InProceedings{Yuksel_2021_ICCV,\n \n 
author = {\n Y\\"uksel,\n O\\u{g}uz Kaan and Simsar,\n Enis and Er,\n Ezgi G\\"ulperi and Yanardag,\n Pinar\n},\n title = {\n LatentCLR: A Contrastive Learning Approach for Unsupervised Discovery of Interpretable Directions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14263-14272\n} \n}" }, { "title": "LayoutTransformer: Layout Generation and Completion With Self-Attention", @@ -21907,6 +23391,7 @@ "status": "Poster", "track": "main", "pid": 10917, + "author_site": "Kamal Gupta; Justin Lazarow; Alessandro Achille; Larry S. Davis; Vijay Mahadevan; Abhinav Shrivastava", "author": "Kamal Gupta; Justin Lazarow; Alessandro Achille; Larry S. Davis; Vijay Mahadevan; Abhinav Shrivastava", "abstract": "We address the problem of scene layout generation for diverse domains such as images, mobile applications, documents, and 3D objects. Most complex scenes, natural or human-designed, can be expressed as a meaningful arrangement of simpler compositional graphical primitives. Generating a new layout or extending an existing layout requires understanding the relationships between these primitives. To do this, we propose LayoutTransformer, a novel framework that leverages self-attention to learn contextual relationships between layout elements and generate novel layouts in a given domain. Our framework allows us to generate a new layout either from an empty set or from an initial seed set of primitives, and can easily scale to support an arbitrary of primitives per layout. Furthermore, our analyses show that the model is able to automatically capture the semantic properties of the primitives. 
We propose simple improvements in both representation of layout primitives, as well as training methods to demonstrate competitive performance in very diverse data domains such as object bounding boxes in natural images (COCO bounding box), documents (PubLayNet), mobile applications (RICO dataset) as well as 3D shapes (Part-Net). Code and other materials will be made available at https://kampta.github.io/layout.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gupta_LayoutTransformer_Layout_Generation_and_Completion_With_Self-Attention_ICCV_2021_paper.pdf", @@ -21918,19 +23403,20 @@ "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17536750987876712046&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, - "aff_domain": "\u22c6; ; ; ; \u22c6; ", - "email": "\u22c6; ; ; ; \u22c6; ", + "aff_domain": "⋆; ; ; ; ⋆; ", + "email": "⋆; ; ; ; ⋆; ", "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gupta_LayoutTransformer_Layout_Generation_and_Completion_With_Self-Attention_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;0+2;2;0", - "aff_unique_norm": "University of Maryland;University of California, San Diego;Amazon", + "aff_unique_norm": "University of Maryland;University of California, San Diego;Amazon Web Services", "aff_unique_dep": ";;Amazon Web Services", "aff_unique_url": "https://www/umd.edu;https://www.ucsd.edu;https://aws.amazon.com", "aff_unique_abbr": "UMD;UCSD;AWS", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "College Park;San Diego;", "aff_country_unique_index": "0;0;0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gupta_2021_ICCV,\n \n author = {\n Gupta,\n Kamal and Lazarow,\n Justin and Achille,\n Alessandro and Davis,\n Larry S. 
and Mahadevan,\n Vijay and Shrivastava,\n Abhinav\n},\n title = {\n LayoutTransformer: Layout Generation and Completion With Self-Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1004-1014\n} \n}" }, { "title": "LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference", @@ -21938,7 +23424,8 @@ "status": "Poster", "track": "main", "pid": 1103, - "author": "Benjamin Graham; Alaaeldin El-Nouby; Hugo Touvron; Pierre Stock; Armand Joulin; Herv\u00e9 J\u00e9gou; Matthijs Douze", + "author_site": "Benjamin Graham; Alaaeldin El-Nouby; Hugo Touvron; Pierre Stock; Armand Joulin; Hervé Jégou; Matthijs Douze", + "author": "Benjamin Graham; Alaaeldin El-Nouby; Hugo Touvron; Pierre Stock; Armand Joulin; Hervé Jégou; Matthijs Douze", "abstract": "We design a family of image classification architectures that optimize the trade-off between accuracy and efficiency in a high-speed regime. Our work exploits recent findings in attention-based architectures, which are competitive on highly parallel processing hardware. We revisit principles from the extensive literature on convolutional neural networks to apply them to transformers, in particular activation maps with decreasing resolutions. We also introduce the attention bias, a new way to integrate positional information in vision transformers. As a result, we propose LeViT: a hybrid neural network for fast inference image classification. We consider different measures of efficiency on different hardware platforms, so as to best reflect a wide range of application scenarios. Our extensive experiments empirically validate our technical choices and show they are suitable to most architectures. Overall, LeViT significantly outperforms existing convnets and vision transformers with respect to the speed/accuracy tradeoff. 
For example, at 80% ImageNet top-1 accuracy, LeViT is 5 times faster than EfficientNet on CPU. We release the code at https://github.com/facebookresearch/LeViT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Graham_LeViT_A_Vision_Transformer_in_ConvNets_Clothing_for_Faster_Inference_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -21952,7 +23439,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Graham_LeViT_A_Vision_Transformer_in_ConvNets_Clothing_for_Faster_Inference_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Graham_LeViT_A_Vision_Transformer_in_ConvNets_Clothing_for_Faster_Inference_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Graham_2021_ICCV,\n \n author = {\n Graham,\n Benjamin and El-Nouby,\n Alaaeldin and Touvron,\n Hugo and Stock,\n Pierre and Joulin,\n Armand and J\\'egou,\n Herv\\'e and Douze,\n Matthijs\n},\n title = {\n LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12259-12269\n} \n}" }, { "title": "Learn To Cluster Faces via Pairwise Classification", @@ -21960,6 +23448,7 @@ "status": "Poster", "track": "main", "pid": 6019, + "author_site": "Junfu Liu; Di Qiu; Pengfei Yan; Xiaolin Wei", "author": "Junfu Liu; Di Qiu; Pengfei Yan; Xiaolin Wei", "abstract": "Face clustering plays an essential role in exploiting massive unlabeled face data. Recently, graph-based face clustering methods are getting popular for their satisfying performances. However, they usually suffer from excessive memory consumption especially on large-scale graphs, and rely on empirical thresholds to determine the connectivities between samples in inference, which restricts their applications in various real-world scenes. 
To address such problems, in this paper, we explore face clustering from the pairwise angle. Specifically, we formulate the face clustering task as a pairwise relationship classification task, avoiding the memory-consuming learning on large-scale graphs. The classifier can directly determine the relationship between samples and is enhanced by taking advantage of the contextual information. Moreover, to further facilitate the efficiency of our method, we propose a rank-weighted density to guide the selection of pairs sent to the classifier. Experimental results demonstrate that our method achieves state-of-the-art performances on several public clustering benchmarks at the fastest speed and shows a great advantage in comparison with graph-based clustering methods on memory consumption.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Learn_To_Cluster_Faces_via_Pairwise_Classification_ICCV_2021_paper.pdf", @@ -21983,7 +23472,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Junfu and Qiu,\n Di and Yan,\n Pengfei and Wei,\n Xiaolin\n},\n title = {\n Learn To Cluster Faces via Pairwise Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3845-3853\n} \n}" }, { "title": "Learn To Match: Automatic Matching Network Design for Visual Tracking", @@ -21991,6 +23481,7 @@ "status": "Poster", "track": "main", "pid": 6078, + "author_site": "Zhipeng Zhang; Yihao Liu; Xiao Wang; Bing Li; Weiming Hu", "author": "Zhipeng Zhang; Yihao Liu; Xiao Wang; Bing Li; Weiming Hu", "abstract": "Siamese tracking has achieved groundbreaking performance in recent years, where the essence is the efficient matching operator cross-correlation and its variants. 
Besides the remarkable success, it is important to note that the heuristic matching network design relies heavily on expert experience. Moreover, we experimentally find that one sole matching operator is difficult to guarantee stable tracking in all challenging environments. Thus, in this work, we introduce six novel matching operators, namely Concatenation, Pointwise-Addition, Pairwise-Relation, FiLM, Simple-Transformer and Transductive-Guidance, to explore more feasibility on matching operator selection. The analyses reveal these operators' selective adaptability on different environment degradation types, which inspires us to combine them to explore complementary features. To this end, we propose binary channel manipulation (BCM) to search for the optimal combination of these operators. BCM determines to retrain or discard one operator by learning its contribution to other tracking steps. By inserting the learned matching networks to a strong baseline tracker Ocean, our model achieves favorable gains by 67.2 -> 71.4, 52.6 -> 58.3, 70.3 -> 76.0 AUC on OTB100, LaSOT, and TrackingNet, respectively. 
Notably, Our tracker runs at real-time speed of 50 / 100 FPS using PyTorch / TensorRT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Learn_To_Match_Automatic_Matching_Network_Design_for_Visual_Tracking_ICCV_2021_paper.pdf", @@ -22007,14 +23498,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Learn_To_Match_Automatic_Matching_Network_Design_for_Visual_Tracking_ICCV_2021_paper.html", "aff_unique_index": "0+1+2+0;1;2;0+1+2+0;0+1+2+0", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Automation;School of AI;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Automation;School of AI;", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "CAS;UCAS;PCL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0+0;0;0;0+0+0+0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Zhipeng and Liu,\n Yihao and Wang,\n Xiao and Li,\n Bing and Hu,\n Weiming\n},\n title = {\n Learn To Match: Automatic Matching Network Design for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13339-13348\n} \n}" }, { "title": "Learn-To-Race: A Multimodal Control Environment for Autonomous Racing", @@ -22022,6 +23514,7 @@ "status": "Poster", "track": "main", "pid": 5839, + "author_site": "James Herman; Jonathan Francis; Siddha Ganju; Bingqing Chen; Anirudh Koul; Abhinav Gupta; Alexey Skabelkin; Ivan Zhukov; Max Kumskoy; Eric Nyberg", "author": "James Herman; Jonathan Francis; Siddha Ganju; Bingqing Chen; 
Anirudh Koul; Abhinav Gupta; Alexey Skabelkin; Ivan Zhukov; Max Kumskoy; Eric Nyberg", "abstract": "Existing research on autonomous driving primarily focuses on urban driving, which is insufficient for characterising the complex driving behaviour underlying high-speed racing. At the same time, existing racing simulation frameworks struggle in capturing realism, with respect to visual rendering, vehicular dynamics, and task objectives, inhibiting the transfer of learning agents to real-world contexts. We introduce a new environment, where agents Learn-to-Race (L2R) in simulated competition-style racing, using multimodal information|from virtual cameras to a comprehensive array of inertial measurement sensors. Our environment, which includes a simulator and an interfacing training framework, accurately models vehicle dynamics and racing conditions. In this paper, we release the Arrival simulator for autonomous racing. Next, we propose the L2R task with challenging metrics, inspired by learning-to-drive challenges, Formula-style racing, and multimodal trajectory prediction for autonomous driving. Additionally, we provide the L2R framework suite, facilitating simulated racing on high-precision models of real-world tracks. Finally, we provide an official L2R task dataset of expert demonstrations, as well as a series of baseline experiments and reference implementations. 
We make all code available: https://github.com/learn-to-race/l2r.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Herman_Learn-To-Race_A_Multimodal_Control_Environment_for_Autonomous_Racing_ICCV_2021_paper.pdf", @@ -22039,13 +23532,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Herman_Learn-To-Race_A_Multimodal_Control_Environment_for_Autonomous_Racing_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;2;3;0;4;4;4;4;0", "aff_unique_norm": "Carnegie Mellon University;Bosch Research;NVIDIA;Pinterest;Arrival", - "aff_unique_dep": "School of Computer Science;Human-Machine Collaboration;NVIDIA;;Autonomous Driving", + "aff_unique_dep": "School of Computer Science;Human-Machine Collaboration;;;Autonomous Driving", "aff_unique_url": "https://www.cmu.edu;https://research.bosch.com;https://www.nvidia.com;https://www.pinterest.com;", "aff_unique_abbr": "CMU;Bosch;NV;Pinterest;", "aff_campus_unique_index": "0;0+0;1;2;0;3;3;3;3;0", "aff_campus_unique": "Pittsburgh;Santa Clara;San Francisco;London", "aff_country_unique_index": "0;0+0;0;0;0;1;1;1;1;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Herman_2021_ICCV,\n \n author = {\n Herman,\n James and Francis,\n Jonathan and Ganju,\n Siddha and Chen,\n Bingqing and Koul,\n Anirudh and Gupta,\n Abhinav and Skabelkin,\n Alexey and Zhukov,\n Ivan and Kumskoy,\n Max and Nyberg,\n Eric\n},\n title = {\n Learn-To-Race: A Multimodal Control Environment for Autonomous Racing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9793-9802\n} \n}" }, { "title": "Learnable Boundary Guided Adversarial Training", @@ -22053,6 +23547,7 @@ "status": "Poster", "track": "main", "pid": 3317, + "author_site": "Jiequan Cui; Shu Liu; Liwei Wang; Jiaya Jia", "author": "Jiequan Cui; Shu Liu; Liwei Wang; Jiaya Jia", 
"abstract": "Previous adversarial training raises model robustness under the compromise of accuracy on natural data. In this paper, we reduce natural accuracy degradation. We use the model logits from one clean model to guide learning of another one robust model, taking into consideration that logits from the well trained clean model embed the most discriminative features of natural data, e.g., generalizable classifier boundary. Our solution is to constrain logits from the robust model that takes adversarial examples as input and makes it similar to those from the clean model fed with corresponding natural data. It lets the robust model inherit the classifier boundary of the clean model. Moreover, we observe such boundary guidance can not only preserve high natural accuracy but also benefit model robustness, which gives new insights and facilitates progress for the adversarial community. Finally, extensive experiments on CIFAR-10, CIFAR-100, and Tiny ImageNet testify to the effectiveness of our method. We achieve new state-of-the-art robustness on CIFAR-100 without additional real or synthetic data with auto-attack benchmark. 
Our code is available at https://github.com/dvlab-research/LBGAT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cui_Learnable_Boundary_Guided_Adversarial_Training_ICCV_2021_paper.pdf", @@ -22069,14 +23564,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cui_Learnable_Boundary_Guided_Adversarial_Training_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Cui_2021_ICCV,\n \n author = {\n Cui,\n Jiequan and Liu,\n Shu and Wang,\n Liwei and Jia,\n Jiaya\n},\n title = {\n Learnable Boundary Guided Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15721-15730\n} \n}" }, { "title": "Learned Spatial Representations for Few-Shot Talking-Head Synthesis", @@ -22084,6 +23580,7 @@ "status": "Poster", "track": "main", "pid": 9227, + "author_site": "Moustafa Meshry; Saksham Suri; Larry S. Davis; Abhinav Shrivastava", "author": "Moustafa Meshry; Saksham Suri; Larry S. Davis; Abhinav Shrivastava", "abstract": "We propose a novel approach for few-shot talking-head synthesis. While recent works in neural talking heads have produced promising results, they can still produce images that do not preserve the identity of the subject in source images. We posit this is a result of the entangled representation of each subject in a single latent code that models 3D shape information, identity cues, colors, lighting and even background details. 
In contrast, we propose to factorize the representation of a subject into its spatial and style components. Our method generates a target frame in two steps. First, it predicts a dense spatial layout for the target image. Second, an image generator utilizes the predicted layout for spatial denormalization and synthesizes the target frame. We experimentally show that this disentangled representation leads to a significant improvement over previous methods, both quantitatively and qualitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meshry_Learned_Spatial_Representations_for_Few-Shot_Talking-Head_Synthesis_ICCV_2021_paper.pdf", @@ -22098,7 +23595,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meshry_Learned_Spatial_Representations_for_Few-Shot_Talking-Head_Synthesis_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meshry_Learned_Spatial_Representations_for_Few-Shot_Talking-Head_Synthesis_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Meshry_2021_ICCV,\n \n author = {\n Meshry,\n Moustafa and Suri,\n Saksham and Davis,\n Larry S. and Shrivastava,\n Abhinav\n},\n title = {\n Learned Spatial Representations for Few-Shot Talking-Head Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13829-13838\n} \n}" }, { "title": "Learning Action Completeness From Points for Weakly-Supervised Temporal Action Localization", @@ -22106,6 +23604,7 @@ "status": "Poster", "track": "main", "pid": 1186, + "author_site": "Pilhyeon Lee; Hyeran Byun", "author": "Pilhyeon Lee; Hyeran Byun", "abstract": "We tackle the problem of localizing temporal intervals of actions with only a single frame label for each action instance for training. 
Owing to label sparsity, existing work fails to learn action completeness, resulting in fragmentary action predictions. In this paper, we propose a novel framework, where dense pseudo-labels are generated to provide completeness guidance for the model. Concretely, we first select pseudo background points to supplement point-level action labels. Then, by taking the points as seeds, we search for the optimal sequence that is likely to contain complete action instances while agreeing with the seeds. To learn completeness from the obtained sequence, we introduce two novel losses that contrast action instances with background ones in terms of action score and feature similarity, respectively. Experimental results demonstrate that our completeness guidance indeed helps the model to locate complete action instances, leading to large performance gains especially under high IoU thresholds. Moreover, we demonstrate the superiority of our method over existing state-of-the-art methods on four benchmarks: THUMOS'14, GTEA, BEOID, and ActivityNet. Notably, our method even performs comparably to recent fully-supervised methods, at the 6 times cheaper annotation cost. 
Our code is available at https://github.com/Pilhyeon.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Learning_Action_Completeness_From_Points_for_Weakly-Supervised_Temporal_Action_Localization_ICCV_2021_paper.pdf", @@ -22129,7 +23628,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Pilhyeon and Byun,\n Hyeran\n},\n title = {\n Learning Action Completeness From Points for Weakly-Supervised Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13648-13657\n} \n}" }, { "title": "Learning Anchored Unsigned Distance Functions With Gradient Direction Alignment for Single-View Garment Reconstruction", @@ -22137,6 +23637,7 @@ "status": "Poster", "track": "main", "pid": 2102, + "author_site": "Fang Zhao; Wenhao Wang; Shengcai Liao; Ling Shao", "author": "Fang Zhao; Wenhao Wang; Shengcai Liao; Ling Shao", "abstract": "While single-view 3D reconstruction has made significant progress benefiting from deep shape representations in recent years, garment reconstruction is still not solved well due to open surfaces, diverse topologies and complex geometric details. In this paper, we propose a novel learnable Anchored Unsigned Distance Function (AnchorUDF) representation for 3D garment reconstruction from a single image. AnchorUDF represents 3D shapes by predicting unsigned distance fields (UDFs) to enable open garment surface modeling at arbitrary resolution. 
To capture diverse garment topologies, AnchorUDF not only computes pixel-aligned local image features of query points, but also leverages a set of anchor points located around the surface to enrich 3D position features for query points, which provides stronger 3D space context for the distance function. Furthermore, in order to obtain more accurate point projection direction at inference, we explicitly align the spatial gradient direction of AnchorUDF with the ground-truth direction to the surface during training. Extensive experiments on two public 3D garment datasets, i.e., MGN and Deep Fashion3D, demonstrate that AnchorUDF achieves the state-of-the-art performance on single-view garment reconstruction. Code is available at https://github.com/zhaofang0627/AnchorUDF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Learning_Anchored_Unsigned_Distance_Functions_With_Gradient_Direction_Alignment_for_ICCV_2021_paper.pdf", @@ -22155,12 +23656,13 @@ "aff_unique_index": "0;1;0+2;0+2", "aff_unique_norm": "Inception Institute of Artificial Intelligence;University of Technology Sydney;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";ReLER;", - "aff_unique_url": "https://www.inceptioniai.org;https://www.uts.edu.au;https://mbzuai.ac.ae", + "aff_unique_url": "https://www.inceptioniai.org;https://www.uts.edu.au;https://www.mbzuai.ac.ae", "aff_unique_abbr": ";UTS;MBZUAI", "aff_campus_unique_index": "1;;", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;1;0+0;0+0", - "aff_country_unique": "United Arab Emirates;Australia" + "aff_country_unique": "United Arab Emirates;Australia", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Fang and Wang,\n Wenhao and Liao,\n Shengcai and Shao,\n Ling\n},\n title = {\n Learning Anchored Unsigned Distance Functions With Gradient Direction Alignment for Single-View Garment Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference 
on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12674-12683\n} \n}" }, { "title": "Learning Attribute-Driven Disentangled Representations for Interactive Fashion Retrieval", @@ -22168,6 +23670,7 @@ "status": "Poster", "track": "main", "pid": 8591, + "author_site": "Yuxin Hou; Eleonora Vig; Michael Donoser; Loris Bazzani", "author": "Yuxin Hou; Eleonora Vig; Michael Donoser; Loris Bazzani", "abstract": "Interactive retrieval for online fashion shopping provides the ability of changing image retrieval results according to the user feedback. One common problem in interactive retrieval is that a specific user interaction (e.g., changing the color of a T-shirt) causes other aspects to change inadvertently (e.g., the results have a sleeve type different from that of the query). This is a consequence of existing methods learning visual representations that are entangled in the embedding space, which limits the controllability of the retrieved results. We propose to leverage on the semantics of visual attributes to train convolutional networks that learn attribute-specific subspaces for each attribute type to obtain disentangled representations. Operations, such as swapping out a particular attribute value for another, impact the attribute at hand and leave others untouched. We show that our model can be tailored to deal with different retrieval tasks while maintaining its disentanglement property. We obtained state-of-the-art performance on three interactive fashion retrieval tasks: attribute manipulation retrieval, conditional similarity retrieval, and outfit complementary item retrieval. 
We will make code and models publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hou_Learning_Attribute-Driven_Disentangled_Representations_for_Interactive_Fashion_Retrieval_ICCV_2021_paper.pdf", @@ -22184,14 +23687,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hou_Learning_Attribute-Driven_Disentangled_Representations_for_Interactive_Fashion_Retrieval_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "Aalto University;Amazon", - "aff_unique_dep": ";Amazon.com, Inc.", + "aff_unique_norm": "Aalto University;Amazon.com, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.aalto.fi;https://www.amazon.com", "aff_unique_abbr": "Aalto;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Finland;United States" + "aff_country_unique": "Finland;United States", + "bibtex": "@InProceedings{Hou_2021_ICCV,\n \n author = {\n Hou,\n Yuxin and Vig,\n Eleonora and Donoser,\n Michael and Bazzani,\n Loris\n},\n title = {\n Learning Attribute-Driven Disentangled Representations for Interactive Fashion Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12147-12157\n} \n}" }, { "title": "Learning Better Visual Data Similarities via New Grouplet Non-Euclidean Embedding", @@ -22199,6 +23703,7 @@ "status": "Poster", "track": "main", "pid": 9190, + "author_site": "Yanfu Zhang; Lei Luo; Wenhan Xian; Heng Huang", "author": "Yanfu Zhang; Lei Luo; Wenhan Xian; Heng Huang", "abstract": "In many computer vision problems, it is desired to learn the effective visual data similarity such that the prediction accuracy can be enhanced. Deep Metric Learning (DML) methods have been actively studied to measure the data similarity. Pair-based and proxy-based losses are the two major paradigms in DML. 
However, pair-wise methods involve expensive training costs, while proxy-based methods are less accurate in characterizing the relationships between data points. In this paper, we provide a hybrid grouplet paradigm, which inherits the accurate pair-wise relationship in pair-based methods and the efficient training in proxy-based methods. Our method also equips a non-Euclidean space to DML, which employs a hierarchical representation manifold. More specifically, we propose a unified graph perspective --- different DML methods learn different local connecting patterns between data points. Based on the graph interpretation, we construct a flexible subset of data points, dubbed grouplet. Our grouplet doesn't require explicit pair-wise relationships, instead, we encode the data relationships in an optimal transport problem regarding the proxies, and solve this problem via a differentiable implicit layer to automatically determine the relationships. Extensive experimental results show that our method significantly outperforms state-of-the-art baselines on several benchmarks. 
The ablation studies also verify the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Learning_Better_Visual_Data_Similarities_via_New_Grouplet_Non-Euclidean_Embedding_ICCV_2021_paper.pdf", @@ -22215,14 +23720,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Learning_Better_Visual_Data_Similarities_via_New_Grouplet_Non-Euclidean_Embedding_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "University of Pittsburgh;JD", - "aff_unique_dep": "Electrical and Computer Engineering;JD Explore Academy", + "aff_unique_norm": "University of Pittsburgh;JD Explore Academy", + "aff_unique_dep": "Electrical and Computer Engineering;", "aff_unique_url": "https://www.pitt.edu;", "aff_unique_abbr": "Pitt;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yanfu and Luo,\n Lei and Xian,\n Wenhan and Huang,\n Heng\n},\n title = {\n Learning Better Visual Data Similarities via New Grouplet Non-Euclidean Embedding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9918-9927\n} \n}" }, { "title": "Learning Bias-Invariant Representation by Cross-Sample Mutual Information Minimization", @@ -22230,6 +23736,7 @@ "status": "Poster", "track": "main", "pid": 10453, + "author_site": "Wei Zhu; Haitian Zheng; Haofu Liao; Weijian Li; Jiebo Luo", "author": "Wei Zhu; Haitian Zheng; Haofu Liao; Weijian Li; Jiebo Luo", "abstract": "Deep learning algorithms mine knowledge from the training data and thus would likely inherit the dataset's bias information. As a result, the obtained model would generalize poorly and even mislead the decision process in real-life applications. 
We propose to remove the bias information misused by the target task with a cross-sample adversarial debiasing (CSAD) method. CSAD explicitly extracts target and bias features disentangled from the latent representation generated by a feature extractor and then learns to discover and remove the correlation between the target and bias features. The correlation measurement plays a critical role in adversarial debiasing and is conducted by a cross-sample neural mutual information estimator. Moreover, we propose joint content and local structural representation learning to boost mutual information estimation for better performance. We conduct thorough experiments on publicly available datasets to validate the advantages of the proposed method over state-of-the-art approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Learning_Bias-Invariant_Representation_by_Cross-Sample_Mutual_Information_Minimization_ICCV_2021_paper.pdf", @@ -22246,14 +23753,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_Learning_Bias-Invariant_Representation_by_Cross-Sample_Mutual_Information_Minimization_ICCV_2021_paper.html", "aff_unique_index": "0;0;1+0;0;0", - "aff_unique_norm": "University of Rochester;Amazon", - "aff_unique_dep": ";Amazon Web Services", + "aff_unique_norm": "University of Rochester;Amazon Web Services", + "aff_unique_dep": ";", "aff_unique_url": "https://www.rochester.edu;https://aws.amazon.com", "aff_unique_abbr": "U of R;AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Wei and Zheng,\n Haitian and Liao,\n Haofu and Li,\n Weijian and Luo,\n Jiebo\n},\n title = {\n Learning Bias-Invariant Representation by Cross-Sample Mutual Information Minimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15002-15012\n} \n}" }, { "title": "Learning Canonical 3D Object Representation for Fine-Grained Recognition", @@ -22261,6 +23769,7 @@ "status": "Poster", "track": "main", "pid": 8350, + "author_site": "Sunghun Joung; Seungryong Kim; Minsu Kim; Ig-Jae Kim; Kwanghoon Sohn", "author": "Sunghun Joung; Seungryong Kim; Minsu Kim; Ig-Jae Kim; Kwanghoon Sohn", "abstract": "We propose a novel framework for fine-grained object recognition that learns to recover object variation in 3D space from a single image, trained on an image collection without using any ground-truth 3D annotation. We accomplish this by representing an object as a composition of 3D shape and its appearance, while eliminating the effect of camera viewpoint, in a canonical configuration. Unlike conventional methods modeling spatial variation in 2D images only, our method is capable of reconfiguring the appearance feature in a canonical 3D space, thus enabling the subsequent object classifier to be invariant under 3D geometric variation. Our representation also allows us to go beyond existing methods, by incorporating 3D shape variation as an additional cue for object recognition. To learn the model without ground-truth 3D annotation, we deploy a differentiable renderer in an analysis-by-synthesis framework. By incorporating 3D shape and appearance jointly in a deep representation, our method learns the discriminative representation of the object and achieves competitive performance on fine-grained image recognition and vehicle re-identification. 
We also demonstrate that the performance of 3D shape reconstruction is improved by learning fine-grained shape deformation in a boosting manner.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Joung_Learning_Canonical_3D_Object_Representation_for_Fine-Grained_Recognition_ICCV_2021_paper.pdf", @@ -22284,7 +23793,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Joung_2021_ICCV,\n \n author = {\n Joung,\n Sunghun and Kim,\n Seungryong and Kim,\n Minsu and Kim,\n Ig-Jae and Sohn,\n Kwanghoon\n},\n title = {\n Learning Canonical 3D Object Representation for Fine-Grained Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1035-1045\n} \n}" }, { "title": "Learning Canonical View Representation for 3D Shape Recognition With Arbitrary Views", @@ -22292,10 +23802,11 @@ "status": "Poster", "track": "main", "pid": 2217, + "author_site": "Xin Wei; Yifei Gong; Fudong Wang; Xing Sun; Jian Sun", "author": "Xin Wei; Yifei Gong; Fudong Wang; Xing Sun; Jian Sun", "abstract": "In this paper, we focus on recognizing 3D shapes from arbitrary views, i.e., arbitrary numbers and positions of viewpoints. It is a challenging and realistic setting for view-based 3D shape recognition. We propose a canonical view representation to tackle this challenge. We first transform the original features of arbitrary views to a fixed number of view features, dubbed canonical view representation, by aligning the arbitrary view features to a set of learnable reference view features using optimal transport. 
In this way, each 3D shape with arbitrary views is represented by a fixed number of canonical view features, which are further aggregated to generate a rich and robust 3D shape representation for shape recognition. We also propose a canonical view feature separation constraint to enforce that the view features in canonical view representation can be embedded into scattered points in a Euclidean space. Experiments on the ModelNet40, ScanObjectNN, and RGBD datasets show that our method achieves competitive results under the fixed viewpoint settings, and significantly outperforms the applicable methods under the arbitrary view setting.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wei_Learning_Canonical_View_Representation_for_3D_Shape_Recognition_With_Arbitrary_ICCV_2021_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Tencent Youtu Lab + Xi\u2019an Jiaotong University; Tencent Youtu Lab; Tencent Youtu Lab; Xi\u2019an Jiaotong University", + "aff": "Xi’an Jiaotong University; Tencent Youtu Lab + Xi’an Jiaotong University; Tencent Youtu Lab; Tencent Youtu Lab; Xi’an Jiaotong University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Wei_Learning_Canonical_View_ICCV_2021_supplemental.pdf", @@ -22308,14 +23819,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wei_Learning_Canonical_View_Representation_for_3D_Shape_Recognition_With_Arbitrary_ICCV_2021_paper.html", "aff_unique_index": "0;1+0;1;1;0", - "aff_unique_norm": "Xi'an Jiao Tong University;Tencent", + "aff_unique_norm": "Xi'an Jiaotong University;Tencent", "aff_unique_dep": ";Youtu Lab", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.tencent.com", "aff_unique_abbr": "XJTU;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2021_ICCV,\n \n author = 
{\n Wei,\n Xin and Gong,\n Yifei and Wang,\n Fudong and Sun,\n Xing and Sun,\n Jian\n},\n title = {\n Learning Canonical View Representation for 3D Shape Recognition With Arbitrary Views\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 407-416\n} \n}" }, { "title": "Learning Causal Representation for Training Cross-Domain Pose Estimator via Generative Interventions", @@ -22323,10 +23835,11 @@ "status": "Poster", "track": "main", "pid": 9023, + "author_site": "Xiheng Zhang; Yongkang Wong; Xiaofei Wu; Juwei Lu; Mohan Kankanhalli; Xiangdong Li; Weidong Geng", "author": "Xiheng Zhang; Yongkang Wong; Xiaofei Wu; Juwei Lu; Mohan Kankanhalli; Xiangdong Li; Weidong Geng", "abstract": "3D pose estimation has attracted increasing attention with the availability of high-quality benchmark datasets. However, prior works show that deep learning models tend to learn spurious correlations, which fail to generalize beyond the specific dataset they are trained on. In this work, we take a step towards training robust models for cross-domain pose estimation task, which brings together ideas from causal representation learning and generative adversarial networks. Specifically, this paper introduces a novel framework for causal representation learning which explicitly exploits the causal structure of the task. We consider changing domain as interventions on images under the data-generation process and steer the generative model to produce counterfactual features. This help the model learn transferable and causal relations across different domains. Our framework is able to learn with various types of unlabeled datasets. We demonstrate the efficacy of our proposed method on both human and hand pose estimation task. 
The experiment results show the proposed approach achieves state-of-the-art performance on most datasets for both domain adaptation and domain generalization settings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Learning_Causal_Representation_for_Training_Cross-Domain_Pose_Estimator_via_Generative_ICCV_2021_paper.pdf", - "aff": "State Key Laboratory of CAD&CG, College of Computer Science and Technology, Zhejiang University; School of Computing, National University of Singapore; Huawei Noah\u2019s Ark Laboratory; Huawei Noah\u2019s Ark Laboratory; School of Computing, National University of Singapore; State Key Laboratory of CAD&CG, College of Computer Science and Technology, Zhejiang University; State Key Laboratory of CAD&CG, College of Computer Science and Technology, Zhejiang University", + "aff": "State Key Laboratory of CAD&CG, College of Computer Science and Technology, Zhejiang University; School of Computing, National University of Singapore; Huawei Noah’s Ark Laboratory; Huawei Noah’s Ark Laboratory; School of Computing, National University of Singapore; State Key Laboratory of CAD&CG, College of Computer Science and Technology, Zhejiang University; State Key Laboratory of CAD&CG, College of Computer Science and Technology, Zhejiang University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhang_Learning_Causal_Representation_ICCV_2021_supplemental.pdf", @@ -22340,13 +23853,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Learning_Causal_Representation_for_Training_Cross-Domain_Pose_Estimator_via_Generative_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2;1;0;0", "aff_unique_norm": "Zhejiang University;National University of Singapore;Huawei", - "aff_unique_dep": "College of Computer Science and Technology;School of Computing;Noah\u2019s Ark Laboratory", + "aff_unique_dep": "College of Computer Science and Technology;School of Computing;Noah’s Ark 
Laboratory", "aff_unique_url": "http://www.zju.edu.cn;https://www.nus.edu.sg;https://www.huawei.com", "aff_unique_abbr": "ZJU;NUS;Huawei", - "aff_campus_unique_index": "1;1", - "aff_campus_unique": ";Singapore", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Xiheng and Wong,\n Yongkang and Wu,\n Xiaofei and Lu,\n Juwei and Kankanhalli,\n Mohan and Li,\n Xiangdong and Geng,\n Weidong\n},\n title = {\n Learning Causal Representation for Training Cross-Domain Pose Estimator via Generative Interventions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11270-11280\n} \n}" }, { "title": "Learning Compatible Embeddings", @@ -22354,6 +23868,7 @@ "status": "Poster", "track": "main", "pid": 2215, + "author_site": "Qiang Meng; Chixiang Zhang; Xiaoqiang Xu; Feng Zhou", "author": "Qiang Meng; Chixiang Zhang; Xiaoqiang Xu; Feng Zhou", "abstract": "Achieving backward compatibility when rolling out new models can highly reduce costs or even bypass feature re-encoding of existing gallery images for in-production visual retrieval systems. Previous related works usually leverage losses used in knowledge distillation which can cause performance degradations or not guarantee compatibility. To address these issues, we propose a general framework called Learning Compatible Embeddings (LCE) which is applicable for both cross model compatibility and compatible training in direct/forward/backward manners. Our compatibility is achieved by aligning class centers between models directly or via a transformation, and restricting more compact intra-class distributions for the new model. 
Experiments are conducted in extensive scenarios such as changes of training dataset, loss functions, network architectures as well as feature dimensions, and demonstrate that LCE efficiently enables model compatibility with marginal sacrifices of accuracies.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meng_Learning_Compatible_Embeddings_ICCV_2021_paper.pdf", @@ -22368,7 +23883,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meng_Learning_Compatible_Embeddings_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meng_Learning_Compatible_Embeddings_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Meng_2021_ICCV,\n \n author = {\n Meng,\n Qiang and Zhang,\n Chixiang and Xu,\n Xiaoqiang and Zhou,\n Feng\n},\n title = {\n Learning Compatible Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9939-9948\n} \n}" }, { "title": "Learning Conditional Knowledge Distillation for Degraded-Reference Image Quality Assessment", @@ -22376,6 +23892,7 @@ "status": "Poster", "track": "main", "pid": 8243, + "author_site": "Heliang Zheng; Huan Yang; Jianlong Fu; Zheng-Jun Zha; Jiebo Luo", "author": "Heliang Zheng; Huan Yang; Jianlong Fu; Zheng-Jun Zha; Jiebo Luo", "abstract": "An important scenario for image quality assessment (IQA) is to evaluate image restoration (IR) algorithms. The state-of-the-art approaches adopt a full-reference paradigm that compares restored images with their corresponding pristine-quality images. However, pristine-quality images are usually unavailable in blind image restoration tasks and real-world scenarios. In this paper, we propose a practical solution named degraded-reference IQA (DR-IQA), which exploits the inputs of IR models, degraded images, as references. 
Specifically, we extract reference information from degraded images by distilling knowledge from pristine-quality images. The distillation is achieved through learning a reference space, where various degraded images are encouraged to share the same feature statistics with pristine-quality images. And the reference space is optimized to capture deep image priors that are useful for quality assessment. Note that pristine-quality images are only used during training. Our work provides a powerful and differentiable metric for blind IRs, especially for GAN-based methods. Extensive experiments show that our results can even be close to the performance of full-reference settings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Learning_Conditional_Knowledge_Distillation_for_Degraded-Reference_Image_Quality_Assessment_ICCV_2021_paper.pdf", @@ -22392,14 +23909,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Learning_Conditional_Knowledge_Distillation_for_Degraded-Reference_Image_Quality_Assessment_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;2", - "aff_unique_norm": "University of Science and Technology of China;Microsoft;University of Rochester", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Corporation;University of Rochester", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research;https://www.rochester.edu", "aff_unique_abbr": "USTC;MSR;U of R", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Heliang and Yang,\n Huan and Fu,\n Jianlong and Zha,\n Zheng-Jun and Luo,\n Jiebo\n},\n title = {\n Learning Conditional Knowledge Distillation for Degraded-Reference Image Quality Assessment\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10242-10251\n} \n}" }, { "title": "Learning Cross-Modal Contrastive Features for Video Domain Adaptation", @@ -22407,6 +23925,7 @@ "status": "Poster", "track": "main", "pid": 3494, + "author_site": "Donghyun Kim; Yi-Hsuan Tsai; Bingbing Zhuang; Xiang Yu; Stan Sclaroff; Kate Saenko; Manmohan Chandraker", "author": "Donghyun Kim; Yi-Hsuan Tsai; Bingbing Zhuang; Xiang Yu; Stan Sclaroff; Kate Saenko; Manmohan Chandraker", "abstract": "Learning transferable and domain adaptive feature representations from videos is important for video-relevant tasks such as action recognition. Existing video domain adaptation methods mainly rely on adversarial feature alignment, which has been derived from the RGB image space. However, video data is usually associated with multi-modal information, e.g., RGB and optical flow, and thus it remains a challenge to design a better method that considers the crossmodal inputs under the cross-domain adaptation setting. To this end, we propose a unified framework for video domain adaptation, which simultaneously regularizes cross-modal and cross-domain feature representations. Specifically, we treat each modality in a domain as a view and leverage the contrastive learning technique with properly designed sampling strategies. As a result, our objectives regularize feature spaces, which originally lack the connection across modalities or have less alignment across domains. 
We conduct experiments on domain adaptive action recognition benchmark datasets, i.e., UCF, HMDB and EPIC-Kitchens, and demonstrate the effectiveness of our individual components against state-of-the-art algorithms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Learning_Cross-Modal_Contrastive_Features_for_Video_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -22430,7 +23949,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Donghyun and Tsai,\n Yi-Hsuan and Zhuang,\n Bingbing and Yu,\n Xiang and Sclaroff,\n Stan and Saenko,\n Kate and Chandraker,\n Manmohan\n},\n title = {\n Learning Cross-Modal Contrastive Features for Video Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13618-13627\n} \n}" }, { "title": "Learning Deep Local Features With Multiple Dynamic Attentions for Large-Scale Image Retrieval", @@ -22438,6 +23958,7 @@ "status": "Poster", "track": "main", "pid": 3913, + "author_site": "Hui Wu; Min Wang; Wengang Zhou; Houqiang Li", "author": "Hui Wu; Min Wang; Wengang Zhou; Houqiang Li", "abstract": "In image retrieval, learning local features with deep convolutional networks has been demonstrated effective to improve the performance. To discriminate deep local features, some research efforts turn to attention learning. However, existing attention-based methods only generate a single attention map for each image, which limits the exploration of diverse visual patterns. To this end, we propose a novel deep local feature learning architecture to simultaneously focus on multiple discriminative local patterns in an image. 
In our framework, we first adaptively reorganize the channels of activation maps for multiple heads. For each head, a new dynamic attention module is designed to learn the potential attentions. The whole architecture is trained as metric learning of weighted-sum-pooled global image features, with only image-level relevance label. After the architecture training, for each database image, we select local features based on their multi-head dynamic attentions, which are further indexed for efficient retrieval. Extensive experiments show the proposed method outperforms the state-of-the-art methods on the Revisited Oxford and Paris datasets. Besides, it typically achieves competitive results even using local features with lower dimensions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Learning_Deep_Local_Features_With_Multiple_Dynamic_Attentions_for_Large-Scale_ICCV_2021_paper.pdf", @@ -22461,7 +23982,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Hui and Wang,\n Min and Zhou,\n Wengang and Li,\n Houqiang\n},\n title = {\n Learning Deep Local Features With Multiple Dynamic Attentions for Large-Scale Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11416-11425\n} \n}" }, { "title": "Learning Dual Priors for JPEG Compression Artifacts Removal", @@ -22469,6 +23991,7 @@ "status": "Poster", "track": "main", "pid": 1110, + "author_site": "Xueyang Fu; Xi Wang; Aiping Liu; Junwei Han; Zheng-Jun Zha", "author": "Xueyang Fu; Xi Wang; Aiping Liu; Junwei Han; Zheng-Jun Zha", "abstract": "Deep learning (DL)-based methods have achieved great success in solving the ill-posed JPEG compression artifacts removal problem. 
However, as most DL architectures are designed to directly learn pixel-level mapping relationships, they largely ignore semantic-level information and lack sufficient interpretability. To address the above issues, in this work, we propose an interpretable deep network to learn both pixel-level regressive prior and semantic-level discriminative prior. Specifically, we design a variational model to formulate the image de-blocking problem and propose two prior terms for the image content and gradient, respectively. The content-relevant prior is formulated as a DL-based image-to-image regressor to perform as a de-blocker from the pixel-level. The gradient-relevant prior serves as a DL-based classifier to distinguish whether the image is compressed from the semantic-level. To effectively solve the variational model, we design an alternating minimization algorithm and unfold it into a deep network architecture. In this way, not only the interpretability of the deep network is increased, but also the dual priors can be well estimated from training samples. By integrating the two priors into a single framework, the image de-blocking problem can be well-constrained, leading to a better performance. 
Experiments on benchmarks and real-world use cases demonstrate the superiority of our method to the existing state-of-the-art approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fu_Learning_Dual_Priors_for_JPEG_Compression_Artifacts_Removal_ICCV_2021_paper.pdf", @@ -22492,7 +24015,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fu_2021_ICCV,\n \n author = {\n Fu,\n Xueyang and Wang,\n Xi and Liu,\n Aiping and Han,\n Junwei and Zha,\n Zheng-Jun\n},\n title = {\n Learning Dual Priors for JPEG Compression Artifacts Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4086-4095\n} \n}" }, { "title": "Learning Dynamic Interpolation for Extremely Sparse Light Fields With Wide Baselines", @@ -22500,6 +24024,7 @@ "status": "Poster", "track": "main", "pid": 2767, + "author_site": "Mantang Guo; Jing Jin; Hui Liu; Junhui Hou", "author": "Mantang Guo; Jing Jin; Hui Liu; Junhui Hou", "abstract": "In this paper, we tackle the problem of dense light field (LF) reconstruction from sparsely-sampled ones with wide baselines and propose a learnable model, namely dynamic interpolation, to replace the commonly-used geometry warping operation. Specifically, with the estimated geometric relation between input views, we first construct a lightweight neural network to dynamically learn weights for interpolating neighbouring pixels from input views to synthesize each pixel of novel views independently. In contrast to the fixed and content-independent weights employed in the geometry warping operation, the learned interpolation weights implicitly incorporate the correspondences between the source and novel views and adapt to different image content information. 
Then, we recover the spatial correlation between the independently synthesized pixels of each novel view by referring to that of input views using a geometry-based spatial refinement module. We also constrain the angular correlation between the novel views through a disparity-oriented LF structure loss. Experimental results on LF datasets with wide baselines show that the reconstructed LFs achieve much higher PSNR/SSIM and preserve the LF parallax structure better than state-of-the-art methods. The source code is publicly available at https://github.com/MantangGuo/DI4SLF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Learning_Dynamic_Interpolation_for_Extremely_Sparse_Light_Fields_With_Wide_ICCV_2021_paper.pdf", @@ -22523,7 +24048,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Mantang and Jin,\n Jing and Liu,\n Hui and Hou,\n Junhui\n},\n title = {\n Learning Dynamic Interpolation for Extremely Sparse Light Fields With Wide Baselines\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2450-2459\n} \n}" }, { "title": "Learning Efficient Photometric Feature Transform for Multi-View Stereo", @@ -22531,6 +24057,7 @@ "status": "Poster", "track": "main", "pid": 5933, + "author_site": "Kaizhang Kang; Cihui Xie; Ruisheng Zhu; Xiaohe Ma; Ping Tan; Hongzhi Wu; Kun Zhou", "author": "Kaizhang Kang; Cihui Xie; Ruisheng Zhu; Xiaohe Ma; Ping Tan; Hongzhi Wu; Kun Zhou", "abstract": "We present a novel framework to learn to convert the per-pixel photometric information at each view into spatially distinctive and view-invariant low-level features, which can be plugged into existing multi-view stereo pipeline for enhanced 3D reconstruction. 
Both the illumination conditions during acquisition and the subsequent per-pixel feature transform can be jointly optimized in a differentiable fashion. Our framework automatically adapts to and makes efficient use of the geometric information available in different forms of input data. High-quality 3D reconstructions of a variety of challenging objects are demonstrated on the data captured with an illumination multiplexing device, as well as a point light. Our results compare favorably with state-of-the-art techniques.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kang_Learning_Efficient_Photometric_Feature_Transform_for_Multi-View_Stereo_ICCV_2021_paper.pdf", @@ -22554,7 +24081,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0+0;0+0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Kang_2021_ICCV,\n \n author = {\n Kang,\n Kaizhang and Xie,\n Cihui and Zhu,\n Ruisheng and Ma,\n Xiaohe and Tan,\n Ping and Wu,\n Hongzhi and Zhou,\n Kun\n},\n title = {\n Learning Efficient Photometric Feature Transform for Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5956-5965\n} \n}" }, { "title": "Learning Facial Representations From the Cycle-Consistency of Face", @@ -22562,6 +24090,7 @@ "status": "Poster", "track": "main", "pid": 1241, + "author_site": "Jia-Ren Chang; Yong-Sheng Chen; Wei-Chen Chiu", "author": "Jia-Ren Chang; Yong-Sheng Chen; Wei-Chen Chiu", "abstract": "Faces manifest large variations in many aspects, such as identity, expression, pose, and face styling. Therefore, it is a great challenge to disentangle and extract these characteristics from facial images, especially in an unsupervised manner. 
In this work, we introduce cycle-consistency in facial characteristics as free supervisory signal to learn facial representations from unlabeled facial images. The learning is realized by superimposing the facial motion cycle-consistency and identity cycle-consistency constraints. The main idea of the facial motion cycle-consistency is that, given a face with expression, we can perform de-expression to a neutral face via the removal of facial motion and further perform re-expression to reconstruct back to the original face. The main idea of the identity cycle-consistency is to exploit both de-identity into mean face by depriving the given neutral face of its identity via feature re-normalization and re-identity into neutral face by adding the personal attributes to the mean face. At training time, our model learns to disentangle two distinct facial representations to be useful for performing cycle-consistent face reconstruction. At test time, we use the linear protocol scheme for evaluating facial representations on various tasks, including facial expression recognition and head pose regression. We also can directly apply the learnt facial representations to person recognition, frontalization and image-to-image translation. Our experiments show that the results of our approach is competitive with those of existing methods, demonstrating the rich and unique information embedded in the disentangled representations. 
Code is available at https://github.com/JiaRenChang/FaceCycle.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chang_Learning_Facial_Representations_From_the_Cycle-Consistency_of_Face_ICCV_2021_paper.pdf", @@ -22585,7 +24114,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chang_2021_ICCV,\n \n author = {\n Chang,\n Jia-Ren and Chen,\n Yong-Sheng and Chiu,\n Wei-Chen\n},\n title = {\n Learning Facial Representations From the Cycle-Consistency of Face\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9680-9689\n} \n}" }, { "title": "Learning Fast Sample Re-Weighting Without Reward Data", @@ -22593,6 +24123,7 @@ "status": "Poster", "track": "main", "pid": 7961, + "author_site": "Zizhao Zhang; Tomas Pfister", "author": "Zizhao Zhang; Tomas Pfister", "abstract": "Training sample re-weighting is an effective approach for tackling data biases such as imbalanced and corrupted labels. Recent methods develop learning-based algorithms to learn sample re-weighting strategies jointly with model training based on the frameworks of reinforcement learning and meta learning. However, depending on additional unbiased reward data is limiting their general applicability. Furthermore, existing learning-based sample re-weighting methods require nested optimizations of models and weighting parameters, which requires expensive second-order computation. This paper addresses these two problems and presents a novel learning-based fast sample re-weighting (FSR) method that does not require additional reward data. The method is based on two key ideas: learning from history to build proxy reward data and feature sharing to reduce the optimization cost. 
Our experiments show the proposed method achieves competitive results compared to state of the arts on label noise robustness and long-tailed recognition, and does so while achieving significantly improved training efficiency. The source code is publicly available at https://github.com/google-research/google-research/tree/master/ieg.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Learning_Fast_Sample_Re-Weighting_Without_Reward_Data_ICCV_2021_paper.pdf", @@ -22616,7 +24147,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Zizhao and Pfister,\n Tomas\n},\n title = {\n Learning Fast Sample Re-Weighting Without Reward Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 725-734\n} \n}" }, { "title": "Learning Frequency-Aware Dynamic Network for Efficient Super-Resolution", @@ -22624,10 +24156,11 @@ "status": "Poster", "track": "main", "pid": 7181, + "author_site": "Wenbin Xie; Dehua Song; Chang Xu; Chunjing Xu; Hui Zhang; Yunhe Wang", "author": "Wenbin Xie; Dehua Song; Chang Xu; Chunjing Xu; Hui Zhang; Yunhe Wang", "abstract": "Deep learning based methods, especially convolutional neural networks (CNNs) have been successfully applied in the field of single image super-resolution (SISR). To obtain better fidelity and visual quality, most of existing networks are of heavy design with massive computation. However, the computation resources of modern mobile devices are limited, which cannot easily support the expensive cost. To this end, this paper explores a novel frequency-aware dynamic network for dividing the input into multiple parts according to its coefficients in the discrete cosine transform (DCT) domain. 
In practice, the high-frequency part will be processed using expensive operations and the lower-frequency part is assigned with cheap operations to relieve the computation burden. Since pixels or image patches belong to low-frequency areas contain relatively few textural details, this dynamic network will not affect the quality of resulting super-resolution images. In addition, we embed predictors into the proposed dynamic network to end-to-end fine-tune the handcrafted frequency-aware masks. Extensive experiments conducted on benchmark SISR models and datasets show that the frequency-aware dynamic network can be employed for various SISR neural architectures to obtain the better tradeoff between visual quality and computational complexity. For instance, we can reduce the FLOPs of SR models by approximate 50% while preserving the state-of-the-art SISR performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_Learning_Frequency-Aware_Dynamic_Network_for_Efficient_Super-Resolution_ICCV_2021_paper.pdf", - "aff": "Noah\u2019s Ark Lab, Huawei Technologies+School of Software, Tsinghua University; Noah\u2019s Ark Lab, Huawei Technologies+School of Software, Tsinghua University; The University of Sydney; Noah\u2019s Ark Lab, Huawei Technologies; School of Software, Tsinghua University; Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "Noah’s Ark Lab, Huawei Technologies+School of Software, Tsinghua University; Noah’s Ark Lab, Huawei Technologies+School of Software, Tsinghua University; The University of Sydney; Noah’s Ark Lab, Huawei Technologies; School of Software, Tsinghua University; Noah’s Ark Lab, Huawei Technologies", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Xie_Learning_Frequency-Aware_Dynamic_ICCV_2021_supplemental.pdf", @@ -22640,14 +24173,15 @@ "author_num": 6, "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Xie_Learning_Frequency-Aware_Dynamic_Network_for_Efficient_Super-Resolution_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;2;0;1;0", - "aff_unique_norm": "Huawei;Tsinghua University;University of Sydney", - "aff_unique_dep": "Noah\u2019s Ark Lab;School of Software;", + "aff_unique_norm": "Huawei Technologies;Tsinghua University;University of Sydney", + "aff_unique_dep": "Noah’s Ark Lab;School of Software;", "aff_unique_url": "https://www.huawei.com;https://www.tsinghua.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "Huawei;THU;USYD", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Wenbin and Song,\n Dehua and Xu,\n Chang and Xu,\n Chunjing and Zhang,\n Hui and Wang,\n Yunhe\n},\n title = {\n Learning Frequency-Aware Dynamic Network for Efficient Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4308-4317\n} \n}" }, { "title": "Learning From Noisy Data With Robust Representation Learning", @@ -22655,6 +24189,7 @@ "status": "Poster", "track": "main", "pid": 6467, + "author_site": "Junnan Li; Caiming Xiong; Steven C.H. Hoi", "author": "Junnan Li; Caiming Xiong; Steven C.H. Hoi", "abstract": "Learning from noisy data has attracted much attention, where most methods focus on label noise. In this work, we propose a new learning framework which simultaneously addresses three types of noise commonly seen in real-world data: label noise, out-of-distribution input, and input corruption. In contrast to most existing methods, we combat noise by learning robust representation. 
Specifically, we embed images into a low-dimensional subspace, and regularize the geometric structure of the subspace with robust contrastive learning, which includes an unsupervised consistency loss and a supervised mixup prototypical loss. We also propose a new noise cleaning method which leverages the learned representation to enforce a smoothness constraint on neighboring samples. Experiments on multiple benchmarks demonstrate state-of-the-art performance of our method and robustness of the learned representation. Code is available at https://github.com/salesforce/RRL/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Learning_From_Noisy_Data_With_Robust_Representation_Learning_ICCV_2021_paper.pdf", @@ -22678,7 +24213,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Junnan and Xiong,\n Caiming and Hoi,\n Steven C.H.\n},\n title = {\n Learning From Noisy Data With Robust Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9485-9494\n} \n}" }, { "title": "Learning Generative Models of Textured 3D Meshes From Real-World Images", @@ -22686,6 +24222,7 @@ "status": "Poster", "track": "main", "pid": 8981, + "author_site": "Dario Pavllo; Jonas Kohler; Thomas Hofmann; Aurelien Lucchi", "author": "Dario Pavllo; Jonas Kohler; Thomas Hofmann; Aurelien Lucchi", "abstract": "Recent advances in differentiable rendering have sparked an interest in learning generative models of textured 3D meshes from image collections. These models natively disentangle pose and appearance, enable downstream applications in computer graphics, and improve the ability of generative models to understand the concept of image formation. 
Although there has been prior work on learning such models from collections of 2D images, these approaches require a delicate pose estimation step that exploits annotated keypoints, thereby restricting their applicability to a few specific datasets. In this work, we propose a GAN framework for generating textured triangle meshes without relying on such annotations. We show that the performance of our approach is on par with prior work that relies on ground-truth keypoints, and more importantly, we demonstrate the generality of our method by setting new baselines on a larger set of categories from ImageNet - for which keypoints are not available - without any class-specific hyperparameter tuning. We release our code at https://github.com/dariopavllo/textured-3d-gan", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pavllo_Learning_Generative_Models_of_Textured_3D_Meshes_From_Real-World_Images_ICCV_2021_paper.pdf", @@ -22709,7 +24246,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Pavllo_2021_ICCV,\n \n author = {\n Pavllo,\n Dario and Kohler,\n Jonas and Hofmann,\n Thomas and Lucchi,\n Aurelien\n},\n title = {\n Learning Generative Models of Textured 3D Meshes From Real-World Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13879-13889\n} \n}" }, { "title": "Learning Hierarchical Graph Neural Networks for Image Clustering", @@ -22717,6 +24255,7 @@ "status": "Poster", "track": "main", "pid": 8656, + "author_site": "Yifan Xing; Tong He; Tianjun Xiao; Yongxin Wang; Yuanjun Xiong; Wei Xia; David Wipf; Zheng Zhang; Stefano Soatto", "author": "Yifan Xing; Tong He; Tianjun Xiao; Yongxin Wang; Yuanjun Xiong; Wei Xia; David Wipf; Zheng Zhang; Stefano Soatto", "abstract": "We propose a hierarchical graph neural 
network (GNN) model that learns how to cluster a set of images into an unknown number of identities using a training set of images annotated with labels belonging to a disjoint set of identities. Our hierarchical GNN uses a novel approach to merge connected components predicted at each level of the hierarchy to form a new graph at the next level. Unlike fully unsupervised hierarchical clustering, the choice of grouping and complexity criteria stems naturally from supervision in the training set. The resulting method, Hi-LANDER, achieves an average of 49% improvement in F-score and 7% increase in Normalized Mutual Information (NMI) relative to current GNN-based clustering algorithms. Additionally, state-of-the-art GNN-based methods rely on separate models to predict linkage probabilities and node densities as intermediate steps of the clustering process. In contrast, our unified framework achieves a three-fold decrease in computational cost. Our training and inference code are released.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xing_Learning_Hierarchical_Graph_Neural_Networks_for_Image_Clustering_ICCV_2021_paper.pdf", @@ -22733,14 +24272,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xing_Learning_Hierarchical_Graph_Neural_Networks_for_Image_Clustering_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Amazon Web Services", + "aff_unique_norm": "Amazon Web Services", + "aff_unique_dep": "", "aff_unique_url": "https://aws.amazon.com", "aff_unique_abbr": "AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xing_2021_ICCV,\n \n author = {\n Xing,\n Yifan and He,\n Tong and Xiao,\n Tianjun and Wang,\n Yongxin and Xiong,\n Yuanjun and Xia,\n Wei and Wipf,\n David and Zhang,\n Zheng and 
Soatto,\n Stefano\n},\n title = {\n Learning Hierarchical Graph Neural Networks for Image Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3467-3477\n} \n}" }, { "title": "Learning High-Fidelity Face Texture Completion Without Complete Face Texture", @@ -22748,6 +24288,7 @@ "status": "Poster", "track": "main", "pid": 9819, + "author_site": "Jongyoo Kim; Jiaolong Yang; Xin Tong", "author": "Jongyoo Kim; Jiaolong Yang; Xin Tong", "abstract": "For face texture completion, previous methods typically use some complete textures captured by multiview imaging systems or 3D scanners for supervised learning. This paper deals with a new challenging problem -- learning to complete invisible texture in a single face image without using any complete texture. We simply leverage a large corpus of face images of different subjects (e.\\,g., FFHQ) to train a texture completion model in an unsupervised manner. To achieve this, we propose DSD-GAN, a novel deep neural network based method that applies two discriminators in UV map space and image space. These two discriminators work in a complementary manner to learn both facial structures and texture details. We show that their combination is essential to obtain high-fidelity results. 
Despite the network never sees any complete facial appearance, it is able to generate compelling full textures from single images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Learning_High-Fidelity_Face_Texture_Completion_Without_Complete_Face_Texture_ICCV_2021_paper.pdf", @@ -22764,14 +24305,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Learning_High-Fidelity_Face_Texture_Completion_Without_Complete_Face_Texture_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Research", "aff_unique_dep": "Research", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "MSR Asia", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Asia", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Jongyoo and Yang,\n Jiaolong and Tong,\n Xin\n},\n title = {\n Learning High-Fidelity Face Texture Completion Without Complete Face Texture\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13990-13999\n} \n}" }, { "title": "Learning Icosahedral Spherical Probability Map Based on Bingham Mixture Model for Vanishing Point Estimation", @@ -22779,10 +24321,11 @@ "status": "Poster", "track": "main", "pid": 2858, + "author_site": "Haoang Li; Kai Chen; Pyojin Kim; Kuk-Jin Yoon; Zhe Liu; Kyungdon Joo; Yun-Hui Liu", "author": "Haoang Li; Kai Chen; Pyojin Kim; Kuk-Jin Yoon; Zhe Liu; Kyungdon Joo; Yun-Hui Liu", "abstract": "Existing vanishing point (VP) estimation methods rely on pre-extracted image lines and/or prior knowledge of the number of VPs. However, in practice, this information may be insufficient or unavailable. 
To solve this problem, we propose a network that treats a perspective image as input and predicts a spherical probability map of VP. Based on this map, we can detect all the VPs. Our method is reliable thanks to four technical novelties. First, we leverage the icosahedral spherical representation to express our probability map. This representation provides uniform pixel distribution, and thus facilitates estimating arbitrary positions of VPs. Second, we design a loss function that enforces the antipodal symmetry and sparsity of our spherical probability map to prevent over-fitting. Third, we generate the ground truth probability map that reasonably expresses the locations and uncertainties of VPs. This map unnecessarily peaks at noisy annotated VPs, and also exhibits various anisotropic dispersions. Fourth, given a predicted probability map, we detect VPs by fitting a Bingham mixture model. This strategy can robustly handle close VPs and provide the confidence level of VP useful for practical applications. 
Experiments showed that our method achieves the best compromise between generality, accuracy, and efficiency, compared with state-of-the-art approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Learning_Icosahedral_Spherical_Probability_Map_Based_on_Bingham_Mixture_Model_ICCV_2021_paper.pdf", - "aff": "The Chinese University of Hong Kong, Hong Kong, China; The Chinese University of Hong Kong, Hong Kong, China; Sookmyung Women\u2019s University, South Korea; KAIST, South Korea; University of Cambridge, United Kingdom; UNIST, South Korea; The Chinese University of Hong Kong, Hong Kong, China", + "aff": "The Chinese University of Hong Kong, Hong Kong, China; The Chinese University of Hong Kong, Hong Kong, China; Sookmyung Women’s University, South Korea; KAIST, South Korea; University of Cambridge, United Kingdom; UNIST, South Korea; The Chinese University of Hong Kong, Hong Kong, China", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Li_Learning_Icosahedral_Spherical_ICCV_2021_supplemental.pdf", @@ -22795,14 +24338,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Learning_Icosahedral_Spherical_Probability_Map_Based_on_Bingham_Mixture_Model_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;3;4;0", - "aff_unique_norm": "Chinese University of Hong Kong;Sookmyung Women's University;Korea Advanced Institute of Science and Technology;University of Cambridge;Ulsan National Institute of Science and Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Sookmyung Women's University;Korea Advanced Institute of Science and Technology;University of Cambridge;Ulsan National Institute of Science and Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sookmyung.ac.kr;https://www.kaist.ac.kr;https://www.cam.ac.uk;https://www.unist.ac.kr", "aff_unique_abbr": "CUHK;SWU;KAIST;Cambridge;UNIST", 
"aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Hong Kong;;Cambridge", "aff_country_unique_index": "0;0;1;1;2;1;0", - "aff_country_unique": "China;South Korea;United Kingdom" + "aff_country_unique": "China;South Korea;United Kingdom", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Haoang and Chen,\n Kai and Kim,\n Pyojin and Yoon,\n Kuk-Jin and Liu,\n Zhe and Joo,\n Kyungdon and Liu,\n Yun-Hui\n},\n title = {\n Learning Icosahedral Spherical Probability Map Based on Bingham Mixture Model for Vanishing Point Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5661-5670\n} \n}" }, { "title": "Learning Indoor Inverse Rendering With 3D Spatially-Varying Lighting", @@ -22810,6 +24354,7 @@ "status": "Poster", "track": "main", "pid": 5975, + "author_site": "Zian Wang; Jonah Philion; Sanja Fidler; Jan Kautz", "author": "Zian Wang; Jonah Philion; Sanja Fidler; Jan Kautz", "abstract": "In this work, we address the problem of jointly estimating albedo, normals, depth and 3D spatially-varying lighting from a single image. Most existing methods formulate the task as image-to-image translation, ignoring the 3D properties of the scene. However, indoor scenes contain complex 3D light transport where a 2D representation is insufficient. In this paper, we propose a unified, learning-based inverse rendering framework that formulates 3D spatially-varying lighting. Inspired by classic volume rendering techniques, we propose a novel Volumetric Spherical Gaussian representation for lighting, which parameterizes the exitant radiance of the 3D scene surfaces on a voxel grid. We design a physicsbased differentiable renderer that utilizes our 3D lighting representation, and formulates the energy-conserving image formation process that enables joint training of all intrinsic properties with the re-rendering constraint. 
Our model ensures physically correct predictions and avoids the need for ground-truth HDR lighting which is not easily accessible. Experiments show that our method outperforms prior works both quantitatively and qualitatively, and is capable of producing photorealistic results for AR applications such as virtual object insertion even for highly specular objects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Learning_Indoor_Inverse_Rendering_With_3D_Spatially-Varying_Lighting_ICCV_2021_paper.pdf", @@ -22824,7 +24369,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Learning_Indoor_Inverse_Rendering_With_3D_Spatially-Varying_Lighting_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Learning_Indoor_Inverse_Rendering_With_3D_Spatially-Varying_Lighting_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zian and Philion,\n Jonah and Fidler,\n Sanja and Kautz,\n Jan\n},\n title = {\n Learning Indoor Inverse Rendering With 3D Spatially-Varying Lighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12538-12547\n} \n}" }, { "title": "Learning Inner-Group Relations on Point Clouds", @@ -22832,10 +24378,11 @@ "status": "Poster", "track": "main", "pid": 2278, + "author_site": "Haoxi Ran; Wei Zhuo; Jun Liu; Li Lu", "author": "Haoxi Ran; Wei Zhuo; Jun Liu; Li Lu", "abstract": "The prevalence of relation networks in computer vision is in stark contrast to underexplored point-based methods. In this paper, we explore the possibilities of local relation operators and survey their feasibility. We propose a scalable and efficient module, called group relation aggregator. 
The module computes a feature of a group based on the aggregation of the features of the inner-group points weighted by geometric relations and semantic relations. For convenience, we generalize groupwise operations to assemble this module. We adopt this module to design our RPNet. We further verify the expandability of RPNet, in terms of both depth and width, on the tasks of classification and segmentation. Surprisingly, empirical results show that wider RPNet fits for classification, while deeper RPNet works better on segmentation. RPNet achieves state-of-the-art for classification and segmentation on challenging benchmarks. We also compare our local aggregator with PointNet++, with around 30% parameters and 50% computation saving. Finally, we conduct experiments to reveal the robustness of RPNet with regard to rigid transformation and noises.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ran_Learning_Inner-Group_Relations_on_Point_Clouds_ICCV_2021_paper.pdf", - "aff": "Sichuan University* + Tencent\u2020; Tencent\u2020; Sichuan University\u2020; Sichuan University*", + "aff": "Sichuan University* + Tencent†; Tencent†; Sichuan University†; Sichuan University*", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ran_Learning_Inner-Group_Relations_ICCV_2021_supplemental.pdf", @@ -22849,13 +24396,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ran_Learning_Inner-Group_Relations_on_Point_Clouds_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;0;0", "aff_unique_norm": "Sichuan University;Tencent", - "aff_unique_dep": ";Tencent", + "aff_unique_dep": ";", "aff_unique_url": "https://www.scu.edu.cn;https://www.tencent.com", "aff_unique_abbr": "SCU;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ran_2021_ICCV,\n \n author = {\n Ran,\n Haoxi 
and Zhuo,\n Wei and Liu,\n Jun and Lu,\n Li\n},\n title = {\n Learning Inner-Group Relations on Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15477-15487\n} \n}" }, { "title": "Learning Instance-Level Spatial-Temporal Patterns for Person Re-Identification", @@ -22863,6 +24411,7 @@ "status": "Poster", "track": "main", "pid": 3713, + "author_site": "Min Ren; Lingxiao He; Xingyu Liao; Wu Liu; Yunlong Wang; Tieniu Tan", "author": "Min Ren; Lingxiao He; Xingyu Liao; Wu Liu; Yunlong Wang; Tieniu Tan", "abstract": "Person re-identification (Re-ID) aims to match pedestrians under dis-joint cameras. Most Re-ID methods formulate it as visual representation learning and image search, and its accuracy is consequently affected greatly by the search space. Spatial-temporal information has been proven to be efficient to filter irrelevant negative samples and significantly improve Re-ID accuracy. However, existing spatial-temporal person Re-ID methods are still rough and do not exploit spatial-temporal information sufficiently. In this paper, we propose a novel instance-level and spatial-temporal disentangled Re-ID method (InSTD), to improve Re-ID accuracy. In our proposed framework, personalized information such as moving direction is explicitly considered to further narrow down the search space. Besides, the spatial-temporal transferring probability is disentangled from joint distribution to marginal distribution, so that outliers can also be well modeled. Abundant experimental analyses on two datasets are presented, which demonstrates the superiority and provides more insights into our method. The proposed method achieves mAP of 90.8% on Market-1501 and 89.1% on DukeMTMC-reID, improving from the baseline 82.2% and 72.7%, respectively. 
Besides, in order to provide a better benchmark for person re-identification, we release a cleaned data list of DukeMTMC-reID with this paper: https://github.com/RenMin1991/cleaned-DukeMTMC-reID.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ren_Learning_Instance-Level_Spatial-Temporal_Patterns_for_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -22879,14 +24428,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ren_Learning_Instance-Level_Spatial-Temporal_Patterns_for_Person_Re-Identification_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;2;2;1;1", - "aff_unique_norm": "University of Chinese Academy of Sciences;Institute of Automation Chinese Academy of Sciences;JD", - "aff_unique_dep": ";CRIPAC NLPR;JD AI Research", + "aff_unique_norm": "University of Chinese Academy of Sciences;Institute of Automation Chinese Academy of Sciences;JD AI Research", + "aff_unique_dep": ";CRIPAC NLPR;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ia.cas.cn;https://www.jd.com", "aff_unique_abbr": "UCAS;;JD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ren_2021_ICCV,\n \n author = {\n Ren,\n Min and He,\n Lingxiao and Liao,\n Xingyu and Liu,\n Wu and Wang,\n Yunlong and Tan,\n Tieniu\n},\n title = {\n Learning Instance-Level Spatial-Temporal Patterns for Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14930-14939\n} \n}" }, { "title": "Learning Latent Architectural Distribution in Differentiable Neural Architecture Search via Variational Information Maximization", @@ -22894,6 +24444,7 @@ "status": "Poster", "track": "main", "pid": 1203, + "author_site": "Yaoming Wang; Yuchen Liu; Wenrui Dai; Chenglin Li; Junni Zou; Hongkai Xiong", 
"author": "Yaoming Wang; Yuchen Liu; Wenrui Dai; Chenglin Li; Junni Zou; Hongkai Xiong", "abstract": "Existing differentiable neural architecture search approaches simply assume the architectural distribution on each edge is independent of each other, which conflicts with the intrinsic properties of architecture. In this paper, we view the architectural distribution as the latent representation of specific data points. Then we propose Variational Information Maximization Neural Architecture Search (VIM-NAS) to leverage a simple but effective convolutional neural network to model the latent representation, and optimize for a tractable variational lower bound to the mutual information between the data points and the latent representations. VIM-NAS automatically learns a near one-hot distribution from a continuous distribution with extremely fast convergence speed, e.g., converging with one epoch. Experimental results demonstrate VIM-NAS achieves state-of-the-art performance on various search spaces, including DARTS search space, NAS-Bench-1shot1, NAS-Bench-201, and simplified search spaces S1-S4. 
Specifically, VIM-NAS achieves a top-1 error rate of 2.45% and 15.80% within 10 minutes on CIFAR-10 and CIFAR-100, respectively, and a top-1 error rate of 24.0% when transferred to ImageNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Learning_Latent_Architectural_Distribution_in_Differentiable_Neural_Architecture_Search_via_ICCV_2021_paper.pdf", @@ -22917,7 +24468,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yaoming and Liu,\n Yuchen and Dai,\n Wenrui and Li,\n Chenglin and Zou,\n Junni and Xiong,\n Hongkai\n},\n title = {\n Learning Latent Architectural Distribution in Differentiable Neural Architecture Search via Variational Information Maximization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12312-12321\n} \n}" }, { "title": "Learning Meta-Class Memory for Few-Shot Semantic Segmentation", @@ -22925,6 +24477,7 @@ "status": "Poster", "track": "main", "pid": 3247, + "author_site": "Zhonghua Wu; Xiangxi Shi; Guosheng Lin; Jianfei Cai", "author": "Zhonghua Wu; Xiangxi Shi; Guosheng Lin; Jianfei Cai", "abstract": "Currently, the state-of-the-art methods treat few-shot semantic segmentation task as a conditional foreground-background segmentation problem, assuming each class is independent. In this paper, we introduce the concept of meta-class, which is the meta information (e.g. certain middle-level features) shareable among all classes. 
To explicitly learn meta-class representations in few-shot segmentation task, we propose a novel Meta-class Memory based few-shot segmentation method (MM-Net), where we introduce a set of learnable memory embeddings to memorize the meta-class information during the base class training and transfer to novel classes during the inference stage. Moreover, for the k-shot scenario, we propose a novel image quality measurement module to select images from the set of support images. A high-quality class prototype could be obtained with the weighted sum of support image features based on the quality measure. Experiments on both PASCAL-5^i and COCO datasets show that our proposed method is able to achieve state-of-the-art results in both 1-shot and 5-shot settings. Particularly, our proposed MM-Net achieves 37.5% mIoU on the COCO dataset in 1-shot setting, which is 5.1% higher than the previous state-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Learning_Meta-Class_Memory_for_Few-Shot_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -22948,7 +24501,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0+0;2", - "aff_country_unique": "Singapore;United States;Australia" + "aff_country_unique": "Singapore;United States;Australia", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Zhonghua and Shi,\n Xiangxi and Lin,\n Guosheng and Cai,\n Jianfei\n},\n title = {\n Learning Meta-Class Memory for Few-Shot Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 517-526\n} \n}" }, { "title": "Learning Motion Priors for 4D Human Body Capture in 3D Scenes", @@ -22956,10 +24510,11 @@ "status": "Poster", "track": "main", "pid": 4338, + "author_site": "Siwei Zhang; Yan Zhang; Federica Bogo; Marc Pollefeys; Siyu Tang", "author": "Siwei Zhang; Yan Zhang; Federica Bogo; 
Marc Pollefeys; Siyu Tang", "abstract": "Recovering high-quality 3D human motion in complex scenes from monocular videos is important for many applications, ranging from AR/VR to robotics. However, capturing realistic human-scene interactions, while dealing with occlusions and partial views, is challenging; current approaches are still far from achieving compelling results. We address this problem by proposing LEMO: LEarning human MOtion priors for 4D human body capture. By leveraging the large-scale motion capture dataset AMASS, we introduce a novel motion smoothness prior, which strongly reduces the jitters exhibited by poses recovered over a sequence. Furthermore, to handle contacts and occlusions occurring frequently in body-scene interactions, we design a contact friction term and a contact-aware motion infiller obtained via per-instance self-supervised training. To prove the effectiveness of the proposed motion priors, we combine them into a novel pipeline for 4D human body capture in 3D scenes. With our pipeline, we demonstrate high-quality 4D human body capture, reconstructing smooth motions and physically plausible body-scene interactions. 
The code and data are available at https://sanweiliti.github.io/LEMO/LEMO.html.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Learning_Motion_Priors_for_4D_Human_Body_Capture_in_3D_ICCV_2021_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich; Microsoft; ETH Z\u00fcrich+Microsoft; ETH Z\u00fcrich", + "aff": "ETH Zürich; ETH Zürich; Microsoft; ETH Zürich+Microsoft; ETH Zürich", "project": "", "github": "https://sanweiliti.github.io/LEMO/LEMO.html", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhang_Learning_Motion_Priors_ICCV_2021_supplemental.pdf", @@ -22972,14 +24527,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Learning_Motion_Priors_for_4D_Human_Body_Capture_in_3D_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0+1;0", - "aff_unique_norm": "ETH Zurich;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "ETH Zürich;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.microsoft.com", "aff_unique_abbr": "ETHZ;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+1;0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Siwei and Zhang,\n Yan and Bogo,\n Federica and Pollefeys,\n Marc and Tang,\n Siyu\n},\n title = {\n Learning Motion Priors for 4D Human Body Capture in 3D Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11343-11353\n} \n}" }, { "title": "Learning Motion-Appearance Co-Attention for Zero-Shot Video Object Segmentation", @@ -22987,6 +24543,7 @@ "status": "Poster", "track": "main", "pid": 7453, + "author_site": "Shu Yang; Lu Zhang; Jinqing Qi; Huchuan Lu; Shuo Wang; Xiaoxing Zhang", 
"author": "Shu Yang; Lu Zhang; Jinqing Qi; Huchuan Lu; Shuo Wang; Xiaoxing Zhang", "abstract": "How to make the appearance and motion information interact effectively to accommodate complex scenarios is a fundamental issue in flow-based zero-shot video object segmentation. In this paper, we propose an Attentive Multi-Modality Collaboration Network (AMC-Net) to utilize appearance and motion information uniformly. Specifically, AMC-Net fuses robust information from multi-modality features and promotes their collaboration in two stages. First, we propose a Multi-Modality Co-Attention Gate (MCG) on the bilateral encoder branches, in which a gate function is used to formulate co-attention scores for balancing the contributions of multi-modality features and suppressing the redundant and misleading information. Then, we propose a Motion Correction Module (MCM) with a visual-motion attention mechanism, which is constructed to emphasize the features of foreground objects by incorporating the spatio-temporal correspondence between appearance and motion cues. 
Extensive experiments on three public challenging benchmark datasets verify that our proposed network performs favorably against existing state-of-the-art methods via training with fewer data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Learning_Motion-Appearance_Co-Attention_for_Zero-Shot_Video_Object_Segmentation_ICCV_2021_paper.pdf", @@ -23010,7 +24567,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Shu and Zhang,\n Lu and Qi,\n Jinqing and Lu,\n Huchuan and Wang,\n Shuo and Zhang,\n Xiaoxing\n},\n title = {\n Learning Motion-Appearance Co-Attention for Zero-Shot Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1564-1573\n} \n}" }, { "title": "Learning Multi-Scene Absolute Pose Regression With Transformers", @@ -23018,6 +24576,7 @@ "status": "Poster", "track": "main", "pid": 8011, + "author_site": "Yoli Shavit; Ron Ferens; Yosi Keller", "author": "Yoli Shavit; Ron Ferens; Yosi Keller", "abstract": "Absolute camera pose regression methods estimate the position and orientation of a camera by only using the captured image. A convolutional backbone with a multi-layer perceptron head is trained with images and pose labels to embed a single reference scene at a time. Recently, this framework was extended for learning multiple scenes with a single model by adding a multi-layer perceptron head per scene. In this work, we propose to learn multi-scene absolute camera pose regression with transformers, where encoders are used to aggregate activation maps with self-attention and deocoders transform latent features into candidate pose predictions in parallel, each associated with a different scene. 
This formulation allows our model to focus on general features that are informative for localization while embedding multiple scenes at once. We evaluate our method on commonly benchmarked indoor and outdoor datasets and show that it surpasses both multi-scene and single-scene absolute pose regressors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shavit_Learning_Multi-Scene_Absolute_Pose_Regression_With_Transformers_ICCV_2021_paper.pdf", @@ -23034,14 +24593,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shavit_Learning_Multi-Scene_Absolute_Pose_Regression_With_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Bar-Ilan University", + "aff_unique_norm": "Bar Ilan University", "aff_unique_dep": "Faculty of Engineering", "aff_unique_url": "https://www.biu.ac.il", "aff_unique_abbr": "BIU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Ramat-Gan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Shavit_2021_ICCV,\n \n author = {\n Shavit,\n Yoli and Ferens,\n Ron and Keller,\n Yosi\n},\n title = {\n Learning Multi-Scene Absolute Pose Regression With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2733-2742\n} \n}" }, { "title": "Learning Multiple Pixelwise Tasks Based on Loss Scale Balancing", @@ -23049,6 +24609,7 @@ "status": "Poster", "track": "main", "pid": 9166, + "author_site": "Jae-Han Lee; Chul Lee; Chang-Su Kim", "author": "Jae-Han Lee; Chul Lee; Chang-Su Kim", "abstract": "We propose a novel loss weighting algorithm, called loss scale balancing (LSB), for multi-task learning (MTL) of pixelwise vision tasks. 
An MTL model is trained to estimate multiple pixelwise predictions using an overall loss, which is a linear combination of individual task losses. The proposed algorithm dynamically adjusts the linear weights to learn all tasks effectively. Instead of controlling the trend of each loss value directly, we balance the loss scale --- the product of the loss value and its weight --- periodically. In addition, by evaluating the difficulty of each task based on the previous loss record, the proposed algorithm focuses more on difficult tasks during training. Experimental results show that the proposed algorithm outperforms conventional weighting algorithms for MTL of various pixelwise tasks. Codes are available at https://github.com/jaehanlee-mcl/LSB-MTL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Learning_Multiple_Pixelwise_Tasks_Based_on_Loss_Scale_Balancing_ICCV_2021_paper.pdf", @@ -23072,7 +24633,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Jae-Han and Lee,\n Chul and Kim,\n Chang-Su\n},\n title = {\n Learning Multiple Pixelwise Tasks Based on Loss Scale Balancing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5107-5116\n} \n}" }, { "title": "Learning Object-Compositional Neural Radiance Field for Editable Scene Rendering", @@ -23080,6 +24642,7 @@ "status": "Poster", "track": "main", "pid": 5728, + "author_site": "Bangbang Yang; Yinda Zhang; Yinghao Xu; Yijin Li; Han Zhou; Hujun Bao; Guofeng Zhang; Zhaopeng Cui", "author": "Bangbang Yang; Yinda Zhang; Yinghao Xu; Yijin Li; Han Zhou; Hujun Bao; Guofeng Zhang; Zhaopeng Cui", "abstract": "Implicit neural rendering techniques have shown promising results for novel view synthesis. 
However, existing methods usually encode the entire scene as a whole, which is generally not aware of the object identity and limits the ability to the high-level editing tasks such as moving or adding furniture. In this paper, we present a novel neural scene rendering system, which learns an object-compositional neural radiance field and produces realistic rendering with editing capability for a clustered and real-world scene. Specifically, we design a novel two-pathway architecture, in which the scene branch encodes the scene geometry and appearance, and the object branch encodes each standalone object conditioned on learnable object activation codes. To survive the training in heavily cluttered scenes, we propose a scene-guided training strategy to solve the 3D space ambiguity in the occluded regions and learn sharp boundaries for each object. Extensive experiments demonstrate that our system not only achieves competitive performance for static scene novel-view synthesis, but also produces realistic rendering for object-level editing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Learning_Object-Compositional_Neural_Radiance_Field_for_Editable_Scene_Rendering_ICCV_2021_paper.pdf", @@ -23094,7 +24657,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Learning_Object-Compositional_Neural_Radiance_Field_for_Editable_Scene_Rendering_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Learning_Object-Compositional_Neural_Radiance_Field_for_Editable_Scene_Rendering_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Bangbang and Zhang,\n Yinda and Xu,\n Yinghao and Li,\n Yijin and Zhou,\n Han and Bao,\n Hujun and Zhang,\n Guofeng and Cui,\n Zhaopeng\n},\n title = {\n Learning Object-Compositional Neural Radiance Field for Editable Scene Rendering\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13779-13788\n} \n}" }, { "title": "Learning Privacy-Preserving Optics for Human Pose Estimation", @@ -23102,6 +24666,7 @@ "status": "Poster", "track": "main", "pid": 5401, + "author_site": "Carlos Hinojosa; Juan Carlos Niebles; Henry Arguello", "author": "Carlos Hinojosa; Juan Carlos Niebles; Henry Arguello", "abstract": "The widespread use of always-connected digital cameras in our everyday life has led to increasing concerns about the users' privacy and security. How to develop privacy-preserving computer vision systems? In particular, we want to prevent the camera from obtaining detailed visual data that may contain private information. However, we also want the camera to capture useful information to perform computer vision tasks. Inspired by the trend of jointly designing optics and algorithms, we tackle the problem of privacy-preserving human pose estimation by optimizing an optical encoder (hardware-level protection) with a software decoder (convolutional neural network) in an end-to-end framework. We introduce a visual privacy protection layer in our optical encoder that, parametrized appropriately, enables the optimization of the camera lens's point spread function (PSF). We validate our approach with extensive simulations and a prototype camera. 
We show that our privacy-preserving deep optics approach successfully degrades or inhibits private attributes while maintaining important features to perform human pose estimation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hinojosa_Learning_Privacy-Preserving_Optics_for_Human_Pose_Estimation_ICCV_2021_paper.pdf", @@ -23116,7 +24681,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hinojosa_Learning_Privacy-Preserving_Optics_for_Human_Pose_Estimation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hinojosa_Learning_Privacy-Preserving_Optics_for_Human_Pose_Estimation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Hinojosa_2021_ICCV,\n \n author = {\n Hinojosa,\n Carlos and Niebles,\n Juan Carlos and Arguello,\n Henry\n},\n title = {\n Learning Privacy-Preserving Optics for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2573-2582\n} \n}" }, { "title": "Learning RAW-to-sRGB Mappings With Inaccurately Aligned Supervision", @@ -23124,6 +24690,7 @@ "status": "Poster", "track": "main", "pid": 5565, + "author_site": "Zhilu Zhang; Haolin Wang; Ming Liu; Ruohao Wang; Jiawei Zhang; Wangmeng Zuo", "author": "Zhilu Zhang; Haolin Wang; Ming Liu; Ruohao Wang; Jiawei Zhang; Wangmeng Zuo", "abstract": "Learning RAW-to-sRGB mapping has drawn increasing attention in recent years, wherein an input raw image is trained to imitate the target sRGB image captured by another camera. However, the severe color inconsistency makes it very challenging to generate well-aligned training pairs of input raw and target sRGB images. While learning with inaccurately aligned supervision is prone to causing pixel shift and producing blurry results. 
In this paper, we circumvent such issue by presenting a joint learning model for image alignment and RAW-to-sRGB mapping. To diminish the effect of color inconsistency in image alignment, we introduce to use a global color mapping (GCM) module to generate an initial sRGB image given the input raw image, which can keep the spatial location of the pixels unchanged, and the target sRGB image is utilized to guide GCM for converting the color towards it. Then a pre-trained optical flow estimation network (e.g., PWC-Net) is deployed to warp the target sRGB image to align with the GCM output. To alleviate the effect of inaccurately aligned supervision, the warped target sRGB image is leveraged to learn RAW-to-sRGB mapping. When training is done, the GCM module and optical flow network can be detached, thereby bringing no extra computation cost for inference. Experiments show that our method performs favorably against state-of-the-arts on ZRR and SR-RAW datasets. With our joint learning model, a light-weight backbone can achieve better quantitative and qualitative performance on ZRR dataset. 
Codes are available at https://github.com/cszhilu1998/RAW-to-sRGB.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Learning_RAW-to-sRGB_Mappings_With_Inaccurately_Aligned_Supervision_ICCV_2021_paper.pdf", @@ -23147,7 +24714,8 @@ "aff_campus_unique_index": "0;0;0;0;2+0", "aff_campus_unique": "Harbin;;Guangzhou", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Zhilu and Wang,\n Haolin and Liu,\n Ming and Wang,\n Ruohao and Zhang,\n Jiawei and Zuo,\n Wangmeng\n},\n title = {\n Learning RAW-to-sRGB Mappings With Inaccurately Aligned Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4348-4358\n} \n}" }, { "title": "Learning Rare Category Classifiers on a Tight Labeling Budget", @@ -23155,6 +24723,7 @@ "status": "Poster", "track": "main", "pid": 7937, + "author_site": "Ravi Teja Mullapudi; Fait Poms; William R. Mark; Deva Ramanan; Kayvon Fatahalian", "author": "Ravi Teja Mullapudi; Fait Poms; William R. Mark; Deva Ramanan; Kayvon Fatahalian", "abstract": "Many real-world ML deployments face the challenge of training a rare category model with a small labeling bud- get. In these settings, there is often access to large amounts of unlabeled data, therefore it is attractive to consider semi-supervised or active learning approaches to reduce human labeling effort. However, prior approaches make two assumptions that do not often hold in practice; (a) one has access to a modest amount of labeled data to bootstrap learning and (b) every image belongs to a common category of interest. In this paper, we consider the scenario where we start with as-little-as five labeled positives of a rare category and a large amount of unlabeled data of which 99.9% of it is negatives. 
We propose an active semi-supervised method for building accurate models in this challenging setting. Our method leverages two key ideas: (a) Utilize human and machine effort where they are most effective; human labels are used to identify \"needle-in-a-haystack\" positives, while machine-generated pseudo-labels are used to identify negatives. (b) Adapt recently proposed representation learning techniques for handling extremely imbalanced human labeled data to iteratively train models with noisy machine labeled data. We compare our approach with prior active learning and semi-supervised approaches, demonstrating significant improvements in accuracy per unit labeling effort, particularly on a tight labeling budget.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mullapudi_Learning_Rare_Category_Classifiers_on_a_Tight_Labeling_Budget_ICCV_2021_paper.pdf", @@ -23178,7 +24747,8 @@ "aff_campus_unique_index": "1;2;1;;2", "aff_campus_unique": ";Mountain View;Stanford", "aff_country_unique_index": "0+0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mullapudi_2021_ICCV,\n \n author = {\n Mullapudi,\n Ravi Teja and Poms,\n Fait and Mark,\n William R. and Ramanan,\n Deva and Fatahalian,\n Kayvon\n},\n title = {\n Learning Rare Category Classifiers on a Tight Labeling Budget\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8423-8432\n} \n}" }, { "title": "Learning Realistic Human Reposing Using Cyclic Self-Supervision With 3D Shape, Pose, and Appearance Consistency", @@ -23186,10 +24756,11 @@ "status": "Poster", "track": "main", "pid": 3482, + "author_site": "Soubhik Sanyal; Alex Vorobiov; Timo Bolkart; Matthew Loper; Betty Mohler; Larry S. Davis; Javier Romero; Michael J. Black", "author": "Soubhik Sanyal; Alex Vorobiov; Timo Bolkart; Matthew Loper; Betty Mohler; Larry S. 
Davis; Javier Romero; Michael J. Black", "abstract": "Synthesizing images of a person in novel poses from a single image is a highly ambiguous task. Most existing approaches require paired training images; i.e. images of the same person with the same clothing in different poses. However, obtaining sufficiently large datasets with paired data is challenging and costly. Previous methods that forego paired supervision lack realism. We propose a self-supervised framework named SPICE (Self-supervised Person Image CrEation) that closes the image quality gap with supervised methods. The key insight enabling self-supervision is to exploit 3D information about the human body in several ways. First, the 3D body shape must remain unchanged when reposing. Second, representing body pose in 3D enables reasoning about self occlusions. Third, 3D body parts that are visible before and after reposing, should have similar appearance features. Once trained, SPICE takes an image of a person and generates a new image of that person in a new target pose. SPICE achieves state-of-the-art performance on the DeepFashion dataset, improving the FID score from 29.9 to 7.8 compared with previous unsupervised methods, and with performance similar to the state-of-the-art supervised method (6.4). 
SPICE also generates temporally coherent videos given an input image and a sequence of poses, despite being trained on static images only.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sanyal_Learning_Realistic_Human_Reposing_Using_Cyclic_Self-Supervision_With_3D_Shape_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen+Amazon; Amazon; Max Planck Institute for Intelligent Systems, T\u00fcbingen; Amazon; Amazon; Amazon; Amazon; Amazon", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen+Amazon; Amazon; Max Planck Institute for Intelligent Systems, Tübingen; Amazon; Amazon; Amazon; Amazon; Amazon", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Sanyal_Learning_Realistic_Human_ICCV_2021_supplemental.pdf", @@ -23202,14 +24773,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sanyal_Learning_Realistic_Human_Reposing_Using_Cyclic_Self-Supervision_With_3D_Shape_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;0;1;1;1;1;1", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Amazon", - "aff_unique_dep": ";Amazon.com, Inc.", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Amazon.com, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.amazon.com", "aff_unique_abbr": "MPI-IS;Amazon", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+1;1;0;1;1;1;1;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Sanyal_2021_ICCV,\n \n author = {\n Sanyal,\n Soubhik and Vorobiov,\n Alex and Bolkart,\n Timo and Loper,\n Matthew and Mohler,\n Betty and Davis,\n Larry S. 
and Romero,\n Javier and Black,\n Michael J.\n},\n title = {\n Learning Realistic Human Reposing Using Cyclic Self-Supervision With 3D Shape,\n Pose,\n and Appearance Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11138-11147\n} \n}" }, { "title": "Learning Self-Consistency for Deepfake Detection", @@ -23217,6 +24789,7 @@ "status": "Poster", "track": "main", "pid": 1521, + "author_site": "Tianchen Zhao; Xiang Xu; Mingze Xu; Hui Ding; Yuanjun Xiong; Wei Xia", "author": "Tianchen Zhao; Xiang Xu; Mingze Xu; Hui Ding; Yuanjun Xiong; Wei Xia", "abstract": "We propose a new method to detect deepfake images using the cue of the source feature inconsistency within the forged images. It is based on the hypothesis that images' distinct source features can be preserved and extracted after going through state-of-the-art deepfake generation processes. We introduce a novel representation learning approach, called pair-wise self-consistency learning (PCL), for training ConvNets to extract these source features and detect deepfake images. It is accompanied by a new image synthesis approach, called inconsistency image generator (I2G), to provide richly annotated training data for PCL. 
Experimental results on seven popular datasets show that our models improve averaged AUC from 96.45% to 98.05% over the state of the art in the in-dataset evaluation and from 86.03% to 92.18% in the cross-dataset evaluation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Learning_Self-Consistency_for_Deepfake_Detection_ICCV_2021_paper.pdf", @@ -23233,14 +24806,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Learning_Self-Consistency_for_Deepfake_Detection_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;1;1;1;1", - "aff_unique_norm": "University of Michigan;Amazon", + "aff_unique_norm": "University of Michigan;Amazon Web Services", "aff_unique_dep": ";AWS AI", "aff_unique_url": "https://www.umich.edu;https://aws.amazon.com", "aff_unique_abbr": "UM;AWS", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Tianchen and Xu,\n Xiang and Xu,\n Mingze and Ding,\n Hui and Xiong,\n Yuanjun and Xia,\n Wei\n},\n title = {\n Learning Self-Consistency for Deepfake Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15023-15033\n} \n}" }, { "title": "Learning Self-Similarity in Space and Time As Generalized Motion for Video Action Recognition", @@ -23248,6 +24822,7 @@ "status": "Poster", "track": "main", "pid": 6424, + "author_site": "Heeseung Kwon; Manjin Kim; Suha Kwak; Minsu Cho", "author": "Heeseung Kwon; Manjin Kim; Suha Kwak; Minsu Cho", "abstract": "Spatio-temporal convolution often fails to learn motion dynamics in videos and thus an effective motion representation is required for video understanding in the wild. 
In this paper, we propose a rich and robust motion representation based on spatio-temporal self-similarity (STSS). Given a sequence of frames, STSS represents each local region as similarities to its neighbors in space and time. By converting appearance features into relational values, it enables the learner to better recognize structural patterns in space and time. We leverage the whole volume of STSS and let our model learn to extract an effective motion representation from it. The proposed neural block, dubbed SELFY, can be easily inserted into neural architectures and trained end-to-end without additional supervision. With a sufficient volume of the neighborhood in space and time, it effectively captures long-term interaction and fast motion in the video, leading to robust action recognition. Our experimental analysis demonstrates its superiority over previous methods for motion modeling as well as its complementarity to spatio-temporal features from direct convolution. On the standard action recognition benchmarks, SomethingSomething-V1 & V2, Diving-48, and FineGym, the proposed method achieves the state-of-the-art results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kwon_Learning_Self-Similarity_in_Space_and_Time_As_Generalized_Motion_for_ICCV_2021_paper.pdf", @@ -23262,7 +24837,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kwon_Learning_Self-Similarity_in_Space_and_Time_As_Generalized_Motion_for_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kwon_Learning_Self-Similarity_in_Space_and_Time_As_Generalized_Motion_for_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kwon_2021_ICCV,\n \n author = {\n Kwon,\n Heeseung and Kim,\n Manjin and Kwak,\n Suha and Cho,\n Minsu\n},\n title = {\n Learning Self-Similarity in Space and Time As Generalized Motion for Video Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13065-13075\n} \n}" }, { "title": "Learning Signed Distance Field for Multi-View Surface Reconstruction", @@ -23270,6 +24846,7 @@ "status": "Poster", "track": "main", "pid": 7364, + "author_site": "Jingyang Zhang; Yao Yao; Long Quan", "author": "Jingyang Zhang; Yao Yao; Long Quan", "abstract": "Recent works on implicit neural representations have shown promising results for multi-view surface reconstruction. However, most approaches are limited to relatively simple geometries and usually require clean object masks for reconstructing complex and concave objects. In this work, we introduce a novel neural surface reconstruction framework that leverages the knowledge of stereo matching and feature consistency to optimize the implicit surface representation. More specifically, we apply a signed distance field (SDF) and a surface light field to represent the scene geometry and appearance respectively. The SDF is directly supervised by geometry from stereo matching, and is refined by optimizing the multi-view feature consistency and the fidelity of rendered images. Our method is able to improve the robustness of geometry estimation and support reconstruction of complex scene topologies. Extensive experiments have been conducted on DTU, EPFL and Tanks and Temples datasets. 
Compared to previous state-of-the-art methods, our method achieves better mesh reconstruction in wide open scenes without masks as input.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Learning_Signed_Distance_Field_for_Multi-View_Surface_Reconstruction_ICCV_2021_paper.pdf", @@ -23293,7 +24870,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Jingyang and Yao,\n Yao and Quan,\n Long\n},\n title = {\n Learning Signed Distance Field for Multi-View Surface Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6525-6534\n} \n}" }, { "title": "Learning Skeletal Graph Neural Networks for Hard 3D Pose Estimation", @@ -23301,6 +24879,7 @@ "status": "Poster", "track": "main", "pid": 2703, + "author_site": "Ailing Zeng; Xiao Sun; Lei Yang; Nanxuan Zhao; Minhao Liu; Qiang Xu", "author": "Ailing Zeng; Xiao Sun; Lei Yang; Nanxuan Zhao; Minhao Liu; Qiang Xu", "abstract": "Various deep learning techniques have been proposed to solve the single-view 2D-to-3D pose estimation problem. While the average prediction accuracy has been improved significantly over the years, the performance on hard poses with depth ambiguity, self-occlusion, and complex or rare poses is still far from satisfactory. In this work, we target these hard poses and present a novel skeletal GNN learning solution. To be specific, we propose a hop-aware hierarchical channel-squeezing fusion layer to effectively extract relevant information from neighboring nodes while suppressing undesired noises in GNN learning. In addition, we propose a temporal-aware dynamic graph construction procedure that is robust and effective for 3D pose estimation. 
Experimental results on the Human3.6M dataset show that our solution achieves a 10.3% average prediction accuracy improvement and greatly improves on hard poses over state-of-the-art techniques. We further apply the proposed technique on the skeleton-based action recognition task and also achieve state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zeng_Learning_Skeletal_Graph_Neural_Networks_for_Hard_3D_Pose_Estimation_ICCV_2021_paper.pdf", @@ -23317,14 +24896,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zeng_Learning_Skeletal_Graph_Neural_Networks_for_Hard_3D_Pose_Estimation_ICCV_2021_paper.html", "aff_unique_index": "0;1;2+0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;Microsoft;SenseTime Group", + "aff_unique_norm": "The Chinese University of Hong Kong;Microsoft Research;Sensetime Group", "aff_unique_dep": ";Research;", - "aff_unique_url": "https://www.cuhk.edu.hk;https://www.microsoft.com/en-us/research/group/asia;https://www.sensetime.com", + "aff_unique_url": "https://www.cuhk.edu.hk;https://www.microsoft.com/en-us/research/group/asia;https://www.sensetime.com/", "aff_unique_abbr": "CUHK;MSR Asia;SenseTime", "aff_campus_unique_index": "0;1;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;Asia;", "aff_country_unique_index": "0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zeng_2021_ICCV,\n \n author = {\n Zeng,\n Ailing and Sun,\n Xiao and Yang,\n Lei and Zhao,\n Nanxuan and Liu,\n Minhao and Xu,\n Qiang\n},\n title = {\n Learning Skeletal Graph Neural Networks for Hard 3D Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11436-11445\n} \n}" }, { "title": "Learning Spatio-Temporal Transformer for Visual Tracking", @@ -23332,6 +24912,7 @@ "status": "Poster", "track": "main", 
"pid": 2346, + "author_site": "Bin Yan; Houwen Peng; Jianlong Fu; Dong Wang; Huchuan Lu", "author": "Bin Yan; Houwen Peng; Jianlong Fu; Dong Wang; Huchuan Lu", "abstract": "In this paper, we present a new tracking architecture with an encoder-decoder transformer as the key component. The encoder models the global spatio-temporal feature dependencies between target objects and search regions, while the decoder learns a query embedding to predict the spatial positions of the target objects. Our method casts object tracking as a direct bounding box prediction problem, without using any proposals or predefined anchors. With the encoder-decoder transformer, the prediction of objects just uses a simple fully-convolutional network, which estimates the corners of objects directly. The whole method is end-to-end, does not need any postprocessing steps such as cosine window and bounding box smoothing, thus largely simplifying existing tracking pipelines. The proposed tracker achieves state-of-the-art performance on multiple challenging short-term and long-term benchmarks, while running at real-time speed, being 6x faster than Siam R-CNN. 
Code and models are open-sourced at https://github.com/researchmm/Stark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yan_Learning_Spatio-Temporal_Transformer_for_Visual_Tracking_ICCV_2021_paper.pdf", @@ -23348,14 +24929,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yan_Learning_Spatio-Temporal_Transformer_for_Visual_Tracking_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "Dalian University of Technology;Microsoft", + "aff_unique_norm": "Dalian University of Technology;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.dlut.edu.cn/;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "DUT;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2021_ICCV,\n \n author = {\n Yan,\n Bin and Peng,\n Houwen and Fu,\n Jianlong and Wang,\n Dong and Lu,\n Huchuan\n},\n title = {\n Learning Spatio-Temporal Transformer for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10448-10457\n} \n}" }, { "title": "Learning Specialized Activation Functions With the Piecewise Linear Unit", @@ -23363,6 +24945,7 @@ "status": "Poster", "track": "main", "pid": 7482, + "author_site": "Yucong Zhou; Zezhou Zhu; Zhao Zhong", "author": "Yucong Zhou; Zezhou Zhu; Zhao Zhong", "abstract": "The choice of activation functions is crucial for modern deep neural networks. Popular hand-designed activation functions like Rectified Linear Unit(ReLU) and its variants show promising performance in various tasks and models. Swish, the automatically discovered activation function, outperforms ReLU on many challenging datasets. However, it has two main drawbacks. 
First, the tree-based search space is highly discrete and restricted, making it difficult to searching. Second, the sample-based searching method is inefficient, making it infeasible to find specialized activation functions for each dataset or neural architecture. To tackle these drawbacks, we propose a new activation function called Piecewise Linear Unit(PWLU), which incorporates a carefully designed formulation and learning method. It can learn specialized activation functions and achieves SOTA performance on large-scale datasets like ImageNet and COCO. For example, on ImageNet classification dataset, PWLU improves 0.9%/0.53%/1.0%/1.7%/1.0% top-1 accuracy over Swish for ResNet-18/ResNet-50/MobileNet-V2/MobileNet-V3/EfficientNet-B0. PWLU is also easy to implement and efficient at inference, which can be widely applied in real-world applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Learning_Specialized_Activation_Functions_With_the_Piecewise_Linear_Unit_ICCV_2021_paper.pdf", @@ -23379,14 +24962,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Learning_Specialized_Activation_Functions_With_the_Piecewise_Linear_Unit_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Huawei;Beijing University of Posts and Telecommunications", - "aff_unique_dep": "Huawei Technologies Co., Ltd.;", + "aff_unique_norm": "Huawei Technologies Co., Ltd.;Beijing University of Posts and Telecommunications", + "aff_unique_dep": ";", "aff_unique_url": "https://www.huawei.com;http://www.bupt.edu.cn/", "aff_unique_abbr": "Huawei;BUPT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Yucong and Zhu,\n Zezhou and Zhong,\n Zhao\n},\n title = {\n Learning Specialized Activation Functions With the Piecewise Linear Unit\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12095-12104\n} \n}" }, { "title": "Learning Target Candidate Association To Keep Track of What Not To Track", @@ -23394,10 +24978,11 @@ "status": "Poster", "track": "main", "pid": 6479, + "author_site": "Christoph Mayer; Martin Danelljan; Danda Pani Paudel; Luc Van Gool", "author": "Christoph Mayer; Martin Danelljan; Danda Pani Paudel; Luc Van Gool", "abstract": "The presence of objects that are confusingly similar to the tracked target, poses a fundamental challenge in appearance-based visual tracking. Such distractor objects are easily misclassified as the target itself, leading to eventual tracking failure. While most methods strive to suppress distractors through more powerful appearance models, we take an alternative approach. We propose to keep track of distractor objects in order to continue tracking the target. To this end, we introduce a learned association network, allowing us to propagate the identities of all target candidates from frame-to-frame. To tackle the problem of lacking ground-truth correspondences between distractor objects in visual tracking, we propose a training strategy that combines partial annotations with self-supervision. We conduct comprehensive experimental validation and analysis of our approach on several challenging datasets. 
Our tracker sets a new state-of-the-art on six benchmarks, achieving an AUC score of 67.1% on LaSOT and a +5.8% absolute gain on the OxUvA long-term dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mayer_Learning_Target_Candidate_Association_To_Keep_Track_of_What_Not_ICCV_2021_paper.pdf", - "aff": "Computer Vision Lab, D-ITET, ETH Z\u00fcrich, Switzerland; Computer Vision Lab, D-ITET, ETH Z\u00fcrich, Switzerland; Computer Vision Lab, D-ITET, ETH Z\u00fcrich, Switzerland; Computer Vision Lab, D-ITET, ETH Z\u00fcrich, Switzerland", + "aff": "Computer Vision Lab, D-ITET, ETH Zürich, Switzerland; Computer Vision Lab, D-ITET, ETH Zürich, Switzerland; Computer Vision Lab, D-ITET, ETH Zürich, Switzerland; Computer Vision Lab, D-ITET, ETH Zürich, Switzerland", "project": "", "github": "https://github.com/visionml/pytracking", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Mayer_Learning_Target_Candidate_ICCV_2021_supplemental.pdf", @@ -23410,14 +24995,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mayer_Learning_Target_Candidate_Association_To_Keep_Track_of_What_Not_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "ETH Zurich", + "aff_unique_norm": "ETH Zürich", "aff_unique_dep": "Computer Vision Lab, D-ITET", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Z\u00fcrich", + "aff_campus_unique": "Zürich", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Mayer_2021_ICCV,\n \n author = {\n Mayer,\n Christoph and Danelljan,\n Martin and Paudel,\n Danda Pani and Van Gool,\n Luc\n},\n title = {\n Learning Target Candidate Association To Keep Track of What Not To Track\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2021\n},\n pages = {\n 13444-13454\n} \n}" }, { "title": "Learning Temporal Dynamics From Cycles in Narrated Video", @@ -23425,6 +25011,7 @@ "status": "Poster", "track": "main", "pid": 3128, + "author_site": "Dave Epstein; Jiajun Wu; Cordelia Schmid; Chen Sun", "author": "Dave Epstein; Jiajun Wu; Cordelia Schmid; Chen Sun", "abstract": "Learning to model how the world changes as time elapses has proven a challenging problem for the computer vision community. We introduce a self-supervised approach to this problem that solves a multi-modal temporal cycle consistency objective, MMCC, jointly in vision and language. This objective requires a model to learn modality-agnostic functions to predict the future and past that undo each other when composed. We hypothesize that a model trained on this objective will discover long-term temporal dynamics in video. We verify this hypothesis by using the resultant visual representations and predictive models as-is to solve a variety of downstream tasks. 
Our method outperforms state-of-the-art self-supervised video prediction methods on future action anticipation, temporal image ordering, and arrow-of-time classification tasks, without training on target datasets or their labels.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Epstein_Learning_Temporal_Dynamics_From_Cycles_in_Narrated_Video_ICCV_2021_paper.pdf", @@ -23442,13 +25029,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Epstein_Learning_Temporal_Dynamics_From_Cycles_in_Narrated_Video_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2+3", "aff_unique_norm": "University of California, Berkeley;Stanford University;Google;Brown University", - "aff_unique_dep": ";;Google;", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.berkeley.edu;https://www.stanford.edu;https://www.google.com;https://www.brown.edu", "aff_unique_abbr": "UC Berkeley;Stanford;Google;Brown", "aff_campus_unique_index": "0;1;2;2", "aff_campus_unique": "Berkeley;Stanford;Mountain View;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Epstein_2021_ICCV,\n \n author = {\n Epstein,\n Dave and Wu,\n Jiajun and Schmid,\n Cordelia and Sun,\n Chen\n},\n title = {\n Learning Temporal Dynamics From Cycles in Narrated Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1480-1489\n} \n}" }, { "title": "Learning To Adversarially Blur Visual Object Tracking", @@ -23456,6 +25044,7 @@ "status": "Poster", "track": "main", "pid": 1406, + "author_site": "Qing Guo; Ziyi Cheng; Felix Juefei-Xu; Lei Ma; Xiaofei Xie; Yang Liu; Jianjun Zhao", "author": "Qing Guo; Ziyi Cheng; Felix Juefei-Xu; Lei Ma; Xiaofei Xie; Yang Liu; Jianjun Zhao", "abstract": "Motion blur caused by the moving of the object or camera during the exposure can be a key challenge for 
visual object tracking, affecting tracking accuracy significantly. In this work, we explore the robustness of visual object trackers against motion blur from a new angle, i.e., adversarial blur attack (ABA). Our main objective is to online transfer input frames to their natural motion-blurred counterparts while misleading the state-of-the-art trackers during the tracking process. To this end, we first design the motion blur synthesizing method for visual tracking based on the generation principle of motion blur, considering the motion information and the light accumulation process. With this synthetic method, we propose optimization-based ABA (OP-ABA) by iteratively optimizing an adversarial objective function against the tracking w.r.t. the motion and light accumulation parameters. The OP-ABA is able to produce natural adversarial examples but the iteration can cause heavy time cost, making it unsuitable for attacking real-time trackers. To alleviate this issue, we further propose one-step ABA (OS-ABA) where we design and train a joint adversarial motion and accumulation predictive network (JAMANet) with the guidance of OP-ABA, which is able to efficiently estimate the adversarial motion and accumulation parameters in a one-step way. The experiments on four popular datasets (e.g., OTB100, VOT2018, UAV123, and LaSOT) demonstrate that our methods are able to cause significant accuracy drops on four state-of-the-art trackers with high transferability. 
Please find the source code at https://github.com/tsingqguo/ABA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Learning_To_Adversarially_Blur_Visual_Object_Tracking_ICCV_2021_paper.pdf", @@ -23479,7 +25068,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3;4;0;4", - "aff_country_unique": "China;Japan;United States;Canada;Singapore" + "aff_country_unique": "China;Japan;United States;Canada;Singapore", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Qing and Cheng,\n Ziyi and Juefei-Xu,\n Felix and Ma,\n Lei and Xie,\n Xiaofei and Liu,\n Yang and Zhao,\n Jianjun\n},\n title = {\n Learning To Adversarially Blur Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10839-10848\n} \n}" }, { "title": "Learning To Better Segment Objects From Unseen Classes With Unlabeled Videos", @@ -23487,10 +25077,11 @@ "status": "Poster", "track": "main", "pid": 7603, + "author_site": "Yuming Du; Yang Xiao; Vincent Lepetit", "author": "Yuming Du; Yang Xiao; Vincent Lepetit", "abstract": "The ability to localize and segment objects from unseen classes would open the door to new applications, such as autonomous object learning in active vision. Nonetheless, improving the performance on unseen classes requires additional training data, while manually annotating the objects of the unseen classes can be labor-extensive and expensive. In this paper, we explore the use of unlabeled video sequences to automatically generate training data for objects of unseen classes. It is in principle possible to apply existing video segmentation methods to unlabeled videos and automatically obtain object masks, which can then be used as a training set even for classes with no manual labels available. 
However, our experiments show that these methods do not perform well enough for this purpose. We therefore introduce a Bayesian method that is specifically designed to automatically create such a training set: Our method starts from a set of object proposals and relies on (non-realistic) analysis-by-synthesis to select the correct ones by performing an efficient optimization over all the frames simultaneously. Through extensive experiments, we show that our method can generate a high-quality training set which significantly boosts the performance of segmenting objects of unseen classes. We thus believe that our method could open the door for open-world instance segmentation by exploiting abundant Internet videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Du_Learning_To_Better_Segment_Objects_From_Unseen_Classes_With_Unlabeled_ICCV_2021_paper.pdf", - "aff": "LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-vall\u00e9e, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-vall\u00e9e, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-vall\u00e9e, France", + "aff": "LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-vallée, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-vallée, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-vallée, France", "project": "https://dulucas.github.io/gbopt/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Du_Learning_To_Better_ICCV_2021_supplemental.pdf", @@ -23508,9 +25099,10 @@ "aff_unique_url": "https://www.ponts.fr", "aff_unique_abbr": "ENPC", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Marne-la-vall\u00e9e", + "aff_campus_unique": "Marne-la-vallée", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Du_2021_ICCV,\n \n author = {\n Du,\n Yuming and Xiao,\n Yang and Lepetit,\n Vincent\n},\n title = 
{\n Learning To Better Segment Objects From Unseen Classes With Unlabeled Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3375-3384\n} \n}" }, { "title": "Learning To Bundle-Adjust: A Graph Network Approach to Faster Optimization of Bundle Adjustment for Vehicular SLAM", @@ -23518,6 +25110,7 @@ "status": "Poster", "track": "main", "pid": 3206, + "author_site": "Tetsuya Tanaka; Yukihiro Sasagawa; Takayuki Okatani", "author": "Tetsuya Tanaka; Yukihiro Sasagawa; Takayuki Okatani", "abstract": "Bundle adjustment (BA) occupies a large portion of SfM and visual SLAM's total execution time. Local BA over the latest several keyframes plays a crucial role in visual SLAM. Its execution time should be sufficiently short for robust tracking; this is especially critical for embedded systems with a limited computational resource. This study proposes a learning-based method using a graph network that can replace conventional optimization-based BA and works faster. The graph network operates on a graph consisting of the nodes of keyframes and landmarks and the edges of the latter's visibility from the former. The graph network receives the parameters' initial values as inputs and predicts the updates to their optimal values. We design an intermediate representation of inputs inspired by the normal equation of the Levenberg-Marquardt method. We use the sum of reprojection errors as a loss function to train the graph network. 
The experiments show that the proposed method outputs parameter estimates with slightly inferior accuracy in 1/60-1/10 of time compared with the conventional BA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tanaka_Learning_To_Bundle-Adjust_A_Graph_Network_Approach_to_Faster_Optimization_ICCV_2021_paper.pdf", @@ -23537,11 +25130,12 @@ "aff_unique_norm": "Socionext Inc.;Tohoku University", "aff_unique_dep": ";Graduate School of Information Sciences", "aff_unique_url": "https://www.socionext.com;https://www.tohoku.ac.jp", - "aff_unique_abbr": ";Tohoku U", + "aff_unique_abbr": "Socionext;Tohoku U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Tanaka_2021_ICCV,\n \n author = {\n Tanaka,\n Tetsuya and Sasagawa,\n Yukihiro and Okatani,\n Takayuki\n},\n title = {\n Learning To Bundle-Adjust: A Graph Network Approach to Faster Optimization of Bundle Adjustment for Vehicular SLAM\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6250-6259\n} \n}" }, { "title": "Learning To Cut by Watching Movies", @@ -23549,7 +25143,8 @@ "status": "Poster", "track": "main", "pid": 5894, - "author": "Alejandro Pardo; Fabian Caba; Juan L\u00e9on Alc\u00e1zar; Ali K. Thabet; Bernard Ghanem", + "author_site": "Alejandro Pardo; Fabian Caba; Juan Léon Alcázar; Ali K. Thabet; Bernard Ghanem", + "author": "Alejandro Pardo; Fabian Caba; Juan Léon Alcázar; Ali K. Thabet; Bernard Ghanem", "abstract": "Video content creation keeps growing at an incredible pace; yet, creating engaging stories remains challenging and requires non-trivial video editing expertise. Many video editing components are astonishingly hard to automate primarily due to the lack of raw video materials. 
This paper focuses on a new task for computational video editing, namely the task of raking cut plausibility. Our key idea is to leverage content that has already been edited to learn fine-grained audiovisual patterns that trigger cuts. To do this, we first collected a data source of more than 10K videos, from which we extract more than 260K cuts. We devise a model that learns to discriminate between real and artificial cuts via contrastive learning. We set up a new task and a set of baselines to benchmark video cut generation. We observe that our proposed model outperforms the baselines by large margins. To demonstrate our model in real-world applications, we conduct human studies in a collection of unedited videos. The results show that our model does a better job at cutting than random and alternative baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pardo_Learning_To_Cut_by_Watching_Movies_ICCV_2021_paper.pdf", "aff": "King Abdullah University of Science and Technology (KAUST)+Adobe Research; Adobe Research; King Abdullah University of Science and Technology (KAUST); King Abdullah University of Science and Technology (KAUST); King Abdullah University of Science and Technology (KAUST)", @@ -23572,7 +25167,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;0;0", - "aff_country_unique": "Saudi Arabia;United States" + "aff_country_unique": "Saudi Arabia;United States", + "bibtex": "@InProceedings{Pardo_2021_ICCV,\n \n author = {\n Pardo,\n Alejandro and Caba,\n Fabian and Alc\\'azar,\n Juan L\\'eon and Thabet,\n Ali K. 
and Ghanem,\n Bernard\n},\n title = {\n Learning To Cut by Watching Movies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6858-6868\n} \n}" }, { "title": "Learning To Discover Reflection Symmetry via Polar Matching Convolution", @@ -23580,6 +25176,7 @@ "status": "Poster", "track": "main", "pid": 11073, + "author_site": "Ahyun Seo; Woohyeon Shim; Minsu Cho", "author": "Ahyun Seo; Woohyeon Shim; Minsu Cho", "abstract": "The task of reflection symmetry detection remains challenging due to significant variations and ambiguities of symmetry patterns in the wild. Furthermore, since the local regions are required to match in reflection for detecting a symmetry pattern, it is hard for standard convolutional networks, which are not equivariant to rotation and reflection, to learn the task. To address the issue, we introduce a new convolutional technique, dubbed the polar matching convolution, which leverages a polar feature pooling, a self-similarity encoding, and a systematic kernel design for axes of different angles. The proposed high-dimensional kernel convolution network effectively learns to discover symmetry patterns from real-world images, overcoming the limitations of standard convolution. In addition, we present a new dataset and introduce a self-supervised learning strategy by augmenting the dataset with synthesizing images. 
Experiments demonstrate that our method outperforms state-of-the-art methods in terms of accuracy and robustness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Seo_Learning_To_Discover_Reflection_Symmetry_via_Polar_Matching_Convolution_ICCV_2021_paper.pdf", @@ -23594,7 +25191,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Seo_Learning_To_Discover_Reflection_Symmetry_via_Polar_Matching_Convolution_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Seo_Learning_To_Discover_Reflection_Symmetry_via_Polar_Matching_Convolution_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Seo_2021_ICCV,\n \n author = {\n Seo,\n Ahyun and Shim,\n Woohyeon and Cho,\n Minsu\n},\n title = {\n Learning To Discover Reflection Symmetry via Polar Matching Convolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1285-1294\n} \n}" }, { "title": "Learning To Diversify for Single Domain Generalization", @@ -23602,6 +25200,7 @@ "status": "Poster", "track": "main", "pid": 7331, + "author_site": "Zijian Wang; Yadan Luo; Ruihong Qiu; Zi Huang; Mahsa Baktashmotlagh", "author": "Zijian Wang; Yadan Luo; Ruihong Qiu; Zi Huang; Mahsa Baktashmotlagh", "abstract": "Domain generalization (DG) aims to generalize a model trained on multiple source (i.e., training) domains to a distributionally different target (i.e., test) domain. In contrast to the DG setup that strictly requires the availability of multiple source domains, this paper considers a more realistic yet challenging scenario, namely Single Domain Generalization (SDG). In this new setting, there is only one source domain available for training, from which the limited diversity may jeopardize the model generalization on unseen target domains. 
To tackle this problem, we propose a style-complement module to enhance the generalization power of the model by synthesizing images from diverse distributions that are complementary to the source ones. More specifically, we adopt tractable upper and lower bounds of mutual information (MI) between the generated and source samples and perform the two-step optimization iteratively: (1) by minimizing MI upper bound approximation for each pair, the generated images are forced to diversify from the source samples; (2) subsequently, we maximize the lower bound of MI between the samples from the same semantic category, which assists the network to learn discriminative features from diverse-styled images. Extensive experiments on three benchmark datasets demonstrate the superiority of our approach, which surpasses the state-of-the-art single DG methods by up to 25.14%.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Learning_To_Diversify_for_Single_Domain_Generalization_ICCV_2021_paper.pdf", @@ -23625,7 +25224,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zijian and Luo,\n Yadan and Qiu,\n Ruihong and Huang,\n Zi and Baktashmotlagh,\n Mahsa\n},\n title = {\n Learning To Diversify for Single Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 834-843\n} \n}" }, { "title": "Learning To Drive From a World on Rails", @@ -23633,7 +25233,8 @@ "status": "Poster", "track": "main", "pid": 3633, - "author": "Dian Chen; Vladlen Koltun; Philipp Kr\u00e4henb\u00fchl", + "author_site": "Dian Chen; Vladlen Koltun; Philipp Krähenbühl", + "author": "Dian Chen; Vladlen Koltun; Philipp Krähenbühl", "abstract": "We learn an interactive 
vision-based driving policy from pre-recorded driving logs via a model-based approach. A forward model of the world supervises a driving policy that predicts the outcome of any potential driving trajectory. To support learning from pre-recorded logs, we assume that the world is on rails, meaning neither the agent nor its actions influence the environment. This assumption greatly simplifies the learning problem, factorizing the dynamics into a nonreactive world model and a low-dimensional and compact forward model of the ego-vehicle. Our approach computes action-values for each training trajectory using a tabular dynamic-programming evaluation of the Bellman equations; these action-values in turn supervise the final vision-based driving policy. Despite the world-on-rails assumption, the final driving policy acts well in a dynamic and reactive world. It outperforms imitation learning as well as model-based and model-free reinforcement learning on the challenging CARLA NoCrash benchmark. It is also an order of magnitude more sample-efficient than state-of-the-art model-free reinforcement learning techniques on navigational tasks in the ProcGen benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Learning_To_Drive_From_a_World_on_Rails_ICCV_2021_paper.pdf", "aff": ";;", @@ -23647,7 +25248,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Learning_To_Drive_From_a_World_on_Rails_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Learning_To_Drive_From_a_World_on_Rails_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Dian and Koltun,\n Vladlen and Kr\\\"ahenb\\\"uhl,\n Philipp\n},\n title = {\n Learning To Drive From a World on Rails\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = 
{\n 15590-15599\n} \n}" }, { "title": "Learning To Estimate Hidden Motions With Global Motion Aggregation", @@ -23655,6 +25257,7 @@ "status": "Poster", "track": "main", "pid": 1797, + "author_site": "Shihao Jiang; Dylan Campbell; Yao Lu; Hongdong Li; Richard Hartley", "author": "Shihao Jiang; Dylan Campbell; Yao Lu; Hongdong Li; Richard Hartley", "abstract": "Occlusions pose a significant challenge to optical flow algorithms that rely on local evidences. We consider an occluded point to be one that is imaged in the first frame but not in the next, a slight overloading of the standard definition since it also includes points that move out-of-frame. Estimating the motion of these points is extremely difficult, particularly in the two-frame setting. Previous work relies on CNNs to learn occlusions, without much success, or requires multiple frames to reason about occlusions using temporal smoothness. In this paper, we argue that the occlusion problem can be better solved in the two-frame case by modelling image self-similarities. We introduce a global motion aggregation module, a transformer-based approach to find long-range dependencies between pixels in the first image, and perform global aggregation on the corresponding motion features. We demonstrate that the optical flow estimates in the occluded regions can be significantly improved without damaging the performance in non-occluded regions. This approach obtains new state-of-the-art results on the challenging Sintel dataset, improving the average end-point error by 13.6% on Sintel Final and 13.7% on Sintel Clean. At the time of submission, our method ranks first on these benchmarks among all published and unpublished approaches. 
Code is available at https://github.com/zacjiang/GMA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Learning_To_Estimate_Hidden_Motions_With_Global_Motion_Aggregation_ICCV_2021_paper.pdf", @@ -23670,15 +25273,16 @@ "email": ";;;;", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jiang_Learning_To_Estimate_Hidden_Motions_With_Global_Motion_Aggregation_ICCV_2021_paper.html", - "aff_unique_index": "0;2;0;0;0", - "aff_unique_norm": "Australian National University;;University of Oxford", + "aff_unique_index": "0+1;2;0+1;0+1;0+1", + "aff_unique_norm": "Australian National University;Advanced Robotics Centre;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "https://www.anu.edu.au;;https://www.ox.ac.uk", - "aff_unique_abbr": "ANU;;Oxford", + "aff_unique_abbr": "ANU;ACRV;Oxford", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0;0;0", - "aff_country_unique": "Australia;;United Kingdom" + "aff_country_unique": "Australia;;United Kingdom", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Shihao and Campbell,\n Dylan and Lu,\n Yao and Li,\n Hongdong and Hartley,\n Richard\n},\n title = {\n Learning To Estimate Hidden Motions With Global Motion Aggregation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9772-9781\n} \n}" }, { "title": "Learning To Generate Scene Graph From Natural Language Supervision", @@ -23686,6 +25290,7 @@ "status": "Poster", "track": "main", "pid": 2888, + "author_site": "Yiwu Zhong; Jing Shi; Jianwei Yang; Chenliang Xu; Yin Li", "author": "Yiwu Zhong; Jing Shi; Jianwei Yang; Chenliang Xu; Yin Li", "abstract": "Learning from image-text data has demonstrated recent success for many recognition tasks, yet is currently limited to visual features or individual visual concepts such as objects. 
In this paper, we propose one of the first methods that learn from image-sentence pairs to extract a graphical representation of localized objects and their relationships within an image, known as scene graph. To bridge the gap between images and texts, we leverage an off-the-shelf object detector to identify and localize object instances, match labels of detected regions to concepts parsed from captions, and thus create \"pseudo\" labels for learning scene graph. Further, we design a Transformer-based model to predict these \"pseudo\" labels via a masked token prediction task. Learning from only image-sentence pairs, our model achieves 30% relative gain over a latest method trained with human-annotated unlocalized scene graphs. Our model also shows strong results for weakly and fully supervised scene graph generation. In addition, we explore an open-vocabulary setting for detecting scene graphs, and present the first result for open-set scene graph generation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhong_Learning_To_Generate_Scene_Graph_From_Natural_Language_Supervision_ICCV_2021_paper.pdf", @@ -23702,14 +25307,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhong_Learning_To_Generate_Scene_Graph_From_Natural_Language_Supervision_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;1;0", - "aff_unique_norm": "University of Wisconsin-Madison;University of Rochester;Microsoft", + "aff_unique_norm": "University of Wisconsin-Madison;University of Rochester;Microsoft Corporation", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.wisc.edu;https://www.rochester.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UW-Madison;U of R;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhong_2021_ICCV,\n \n author 
= {\n Zhong,\n Yiwu and Shi,\n Jing and Yang,\n Jianwei and Xu,\n Chenliang and Li,\n Yin\n},\n title = {\n Learning To Generate Scene Graph From Natural Language Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1823-1834\n} \n}" }, { "title": "Learning To Hallucinate Examples From Extrinsic and Intrinsic Supervision", @@ -23717,6 +25323,7 @@ "status": "Poster", "track": "main", "pid": 9698, + "author_site": "Liangke Gui; Adrien Bardes; Ruslan Salakhutdinov; Alexander Hauptmann; Martial Hebert; Yu-Xiong Wang", "author": "Liangke Gui; Adrien Bardes; Ruslan Salakhutdinov; Alexander Hauptmann; Martial Hebert; Yu-Xiong Wang", "abstract": "Learning to hallucinate additional examples has recently been shown as a promising direction to address few-shot learning tasks. This work investigates two important yet overlooked natural supervision signals for guiding the hallucination process -- (i) extrinsic: classifiers trained on hallucinated examples should be close to strong classifiers that would be learned from a large amount of real examples; and (ii) intrinsic: clusters of hallucinated and real examples belonging to the same class should be pulled together, while simultaneously pushing apart clusters of hallucinated and real examples from different classes. We achieve (i) by introducing an additional mentor model on data-abundant base classes for directing the hallucinator, and achieve (ii) by performing contrastive learning between hallucinated and real examples. 
As a general, model-agnostic framework, our dual mentor- and self-directed (DMAS) hallucinator significantly improves few-shot learning performance on widely used benchmarks in various scenarios.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gui_Learning_To_Hallucinate_Examples_From_Extrinsic_and_Intrinsic_Supervision_ICCV_2021_paper.pdf", @@ -23733,14 +25340,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gui_Learning_To_Hallucinate_Examples_From_Extrinsic_and_Intrinsic_Supervision_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;0;2", - "aff_unique_norm": "Carnegie Mellon University;Meta;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Carnegie Mellon University;Facebook AI Research;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.cmu.edu;https://research.facebook.com;https://www illinois.edu", "aff_unique_abbr": "CMU;FAIR;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gui_2021_ICCV,\n \n author = {\n Gui,\n Liangke and Bardes,\n Adrien and Salakhutdinov,\n Ruslan and Hauptmann,\n Alexander and Hebert,\n Martial and Wang,\n Yu-Xiong\n},\n title = {\n Learning To Hallucinate Examples From Extrinsic and Intrinsic Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8701-8711\n} \n}" }, { "title": "Learning To Know Where To See: A Visibility-Aware Approach for Occluded Person Re-Identification", @@ -23748,6 +25356,7 @@ "status": "Poster", "track": "main", "pid": 7521, + "author_site": "Jinrui Yang; Jiawei Zhang; Fufu Yu; Xinyang Jiang; Mengdan Zhang; Xing Sun; Ying-Cong Chen; Wei-Shi Zheng", "author": "Jinrui Yang; Jiawei 
Zhang; Fufu Yu; Xinyang Jiang; Mengdan Zhang; Xing Sun; Ying-Cong Chen; Wei-Shi Zheng", "abstract": "Person re-identification (ReID) has gained an impressive progress in recent years. However, the occlusion is still a common and challenging problem for recent ReID methods. Several mainstream methods utilize extra cues (e.g., human pose information) to distinguish human parts from obstacles to alleviate the occlusion problem. Although achieving inspiring progress, these methods severely rely on the fine-grained extra cues, and are sensitive to the estimation error in the extra cues. In this paper, we show that existing methods may degrade if the extra information is sparse or noisy. Thus we propose a simple yet effective method that is robust to sparse and noisy pose information. This is achieved by discretizing pose information to the visibility label of body parts, so as to suppress the influence of occluded regions. We show in our experiments that leveraging pose information in this way is more effective and robust. Besides, our method can be embedded into most person ReID models easily. 
Extensive experiments validate the effectiveness of our model on common occluded person ReID datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Learning_To_Know_Where_To_See_A_Visibility-Aware_Approach_for_ICCV_2021_paper.pdf", @@ -23763,15 +25372,16 @@ "email": "mail2.sysu.edu.cn;mail2.sysu.edu.cn;tencent.com;microsoft.com;tencent.com;tencent.com;gmail.com;ieee.org", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Learning_To_Know_Where_To_See_A_Visibility-Aware_Approach_for_ICCV_2021_paper.html", - "aff_unique_index": "0+1+2;0+1+2;3;4;3;3;5+5;0+1+2", - "aff_unique_norm": "Sun Yat-sen University;Pazhou Lab;Key Laboratory of Machine Intelligence and Advanced Computing;Tencent;Microsoft;Hong Kong University of Science and Technology", - "aff_unique_dep": "School of Computer Science and Engineering;;Ministry of Education;Youtu Lab;Research;", - "aff_unique_url": "http://www.sysu.edu.cn;;;https://www.tencent.com;https://www.microsoft.com/en-us/research/group/asia;https://www.ust.hk", - "aff_unique_abbr": "SYSU;;;Tencent;MSR Asia;HKUST", + "aff_unique_index": "0+1+2;0+1+2;3;4;3;3;5+6;0+1+2", + "aff_unique_norm": "Sun Yat-sen University;Pazhou Lab;Key Laboratory of Machine Intelligence and Advanced Computing;Tencent;Microsoft Research;Hong Kong University of Science and Technology;The Hong Kong University of Science and Technology", + "aff_unique_dep": "School of Computer Science and Engineering;;Ministry of Education;Youtu Lab;Research;;", + "aff_unique_url": "http://www.sysu.edu.cn;;;https://www.tencent.com;https://www.microsoft.com/en-us/research/group/asia;https://www.ust.hk;https://www.ust.hk", + "aff_unique_abbr": "SYSU;;;Tencent;MSR Asia;HKUST;HKUST", "aff_campus_unique_index": ";;1;2+3;", "aff_campus_unique": ";Asia;Hong Kong SAR;Guangzhou", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0+0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": 
"@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Jinrui and Zhang,\n Jiawei and Yu,\n Fufu and Jiang,\n Xinyang and Zhang,\n Mengdan and Sun,\n Xing and Chen,\n Ying-Cong and Zheng,\n Wei-Shi\n},\n title = {\n Learning To Know Where To See: A Visibility-Aware Approach for Occluded Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11885-11894\n} \n}" }, { "title": "Learning To Match Features With Seeded Graph Matching Network", @@ -23791,7 +25401,8 @@ "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5389471615782101924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Learning_To_Match_Features_With_Seeded_Graph_Matching_Network_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Learning_To_Match_Features_With_Seeded_Graph_Matching_Network_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Hongkai and Luo,\n Zixin and Zhang,\n Jiahui and Zhou,\n Lei and Bai,\n Xuyang and Hu,\n Zeyu and Tai,\n Chiew-Lan and Quan,\n Long\n},\n title = {\n Learning To Match Features With Seeded Graph Matching Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6301-6310\n} \n}" }, { "title": "Learning To Reduce Defocus Blur by Realistically Modeling Dual-Pixel Data", @@ -23799,6 +25410,7 @@ "status": "Poster", "track": "main", "pid": 3501, + "author_site": "Abdullah Abuolaim; Mauricio Delbracio; Damien Kelly; Michael S. Brown; Peyman Milanfar", "author": "Abdullah Abuolaim; Mauricio Delbracio; Damien Kelly; Michael S. 
Brown; Peyman Milanfar", "abstract": "Recent work has shown impressive results on data-driven defocus deblurring using the two-image views available on modern dual-pixel (DP) sensors. One significant challenge in this line of research is access to DP data. Despite many cameras having DP sensors, only a limited number provide access to the low-level DP sensor images. In addition, capturing training data for defocus deblurring involves a time-consuming and tedious setup requiring the camera's aperture to be adjusted. Some cameras with DP sensors (e.g., smartphones) do not have adjustable apertures, further limiting the ability to produce the necessary training data. We address the data capture bottleneck by proposing a procedure to generate realistic DP data synthetically. Our synthesis approach mimics the optical image formation found on DP sensors and can be applied to virtual scenes rendered with standard computer software. Leveraging these realistic synthetic DP images, we introduce a recurrent convolutional network (RCN) architecture that improves deblurring results and is suitable for use with single-frame and multi-frame data (e.g., video) captured by DP sensors. Finally, we show that our synthetic DP data is useful for training DNN models targeting video deblurring applications where access to DP data remains challenging.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Abuolaim_Learning_To_Reduce_Defocus_Blur_by_Realistically_Modeling_Dual-Pixel_Data_ICCV_2021_paper.pdf", @@ -23822,7 +25434,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Abuolaim_2021_ICCV,\n \n author = {\n Abuolaim,\n Abdullah and Delbracio,\n Mauricio and Kelly,\n Damien and Brown,\n Michael S. 
and Milanfar,\n Peyman\n},\n title = {\n Learning To Reduce Defocus Blur by Realistically Modeling Dual-Pixel Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2289-2298\n} \n}" }, { "title": "Learning To Regress Bodies From Images Using Differentiable Semantic Rendering", @@ -23830,10 +25443,11 @@ "status": "Poster", "track": "main", "pid": 4330, + "author_site": "Sai Kumar Dwivedi; Nikos Athanasiou; Muhammed Kocabas; Michael J. Black", "author": "Sai Kumar Dwivedi; Nikos Athanasiou; Muhammed Kocabas; Michael J. Black", "abstract": "Learning to regress 3D human body shape and pose (e.g. SMPL parameters) from monocular images typically exploits losses on 2D keypoints, silhouettes, and/or part-segmentation when 3D training data is not available. Such losses, however, are limited because 2D keypoints do not supervise body shape and segmentations of people in clothing do not match projected minimally-clothed SMPL shapes. To exploit richer image information about clothed people, we introduce higher-level semantic information about clothing to penalize clothed and non-clothed regions of the image differently. To do so, we train a body regressor using a novel \"Differentiable Semantic Rendering - DSR\" loss. For Minimally-Clothed regions, we define the DSR-MC loss, which encourages a tight match between a rendered SMPL body and the minimally-clothed regions of the image. For clothed regions, we define the DSR-C loss to encourage the rendered SMPL body to be inside the clothing mask. To ensure end-to-end differentiable training, we learn a semantic clothing prior for SMPL vertices from thousands of clothed human scans. We perform extensive qualitative and quantitative experiments to evaluate the role of clothing semantics on the accuracy of 3D human pose and shape estimation. 
We outperform all previous state-of-the-art methods on 3DPW and Human3.6M and obtain on par results on MPI-INF-3DHP. Code and trained models will be available for research at https://dsr.is.tue.mpg.de/", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dwivedi_Learning_To_Regress_Bodies_From_Images_Using_Differentiable_Semantic_Rendering_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+ETH Zurich; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany+ETH Zurich; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "https://dsr.is.tue.mpg.de/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Dwivedi_Learning_To_Regress_ICCV_2021_supplemental.pdf", @@ -23851,9 +25465,10 @@ "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ethz.ch", "aff_unique_abbr": "MPI-IS;ETHZ", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0;0;0+1;0", - "aff_country_unique": "Germany;Switzerland" + "aff_country_unique": "Germany;Switzerland", + "bibtex": "@InProceedings{Dwivedi_2021_ICCV,\n \n author = {\n Dwivedi,\n Sai Kumar and Athanasiou,\n Nikos and Kocabas,\n Muhammed and Black,\n Michael J.\n},\n title = {\n Learning To Regress Bodies From Images Using Differentiable Semantic Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11250-11259\n} \n}" }, { "title": "Learning To Remove 
Refractive Distortions From Underwater Images", @@ -23861,6 +25476,7 @@ "status": "Poster", "track": "main", "pid": 6296, + "author_site": "Simron Thapa; Nianyi Li; Jinwei Ye", "author": "Simron Thapa; Nianyi Li; Jinwei Ye", "abstract": "The fluctuation of the water surface causes refractive distortions that severely downgrade the image of an underwater scene. Here, we present the distortion-guided network (DG-Net) for restoring distortion-free underwater images. The key idea is to use a distortion map to guide network training. The distortion map models the pixel displacement caused by water refraction. We first use a physically constrained convolutional network to estimate the distortion map from the refracted image. We then use a generative adversarial network guided by the distortion map to restore the sharp distortion-free image. Since the distortion map indicates correspondences between the distorted image and the distortion-free one, it guides the network to make better predictions. We evaluate our network on several real and synthetic underwater image datasets and show that it out-performs the state-of-the-art algorithms, especially in presence of large distortions. 
We also show results of complex scenarios, including outdoor swimming pool images captured by the drone and indoor aquarium images taken by cellphone camera.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Thapa_Learning_To_Remove_Refractive_Distortions_From_Underwater_Images_ICCV_2021_paper.pdf", @@ -23884,7 +25500,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Baton Rouge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Thapa_2021_ICCV,\n \n author = {\n Thapa,\n Simron and Li,\n Nianyi and Ye,\n Jinwei\n},\n title = {\n Learning To Remove Refractive Distortions From Underwater Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5007-5016\n} \n}" }, { "title": "Learning To Resize Images for Computer Vision Tasks", @@ -23892,6 +25509,7 @@ "status": "Poster", "track": "main", "pid": 8207, + "author_site": "Hossein Talebi; Peyman Milanfar", "author": "Hossein Talebi; Peyman Milanfar", "abstract": "For all the ways convolutional neural nets have revolutionized computer vision in recent years, one important aspect has received surprisingly little attention: the effect of image size on the accuracy of tasks being trained for. Typically, to be efficient, the input images are resized to a relatively small spatial resolution (e.g. 224x224), and both training and inference are carried out at this resolution. The actual mechanism for this re-scaling has been an afterthought: Namely, off-the-shelf image resizers such as bilinear and bicubic are commonly used in most machine learning software frameworks. But do these resizers limit the on task performance of the trained networks? The answer is yes. Indeed, we show that the typical linear resizer can be replaced with learned resizers that can substantially improve performance. 
Importantly, while the classical resizers typically result in better perceptual quality of the downscaled images, our proposed learned resizers do not necessarily give better visual quality, but instead improve task performance. Our learned image resizer is jointly trained with a baseline vision model. This learned CNN-based resizer creates machine friendly visual manipulations that lead to a consistent improvement of the end task metric over the baseline model. Specifically, here we focus on the classification task with the ImageNet dataset, and experiment with four different models to learn resizers adapted to each model. Moreover, we show that the proposed resizer can also be useful for fine-tuning the classification baselines for other vision tasks. To this end, we experiment with three different baselines to develop image quality assessment (IQA) models on the AVA dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Talebi_Learning_To_Resize_Images_for_Computer_Vision_Tasks_ICCV_2021_paper.pdf", @@ -23915,7 +25533,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Talebi_2021_ICCV,\n \n author = {\n Talebi,\n Hossein and Milanfar,\n Peyman\n},\n title = {\n Learning To Resize Images for Computer Vision Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 497-506\n} \n}" }, { "title": "Learning To Stylize Novel Views", @@ -23923,6 +25542,7 @@ "status": "Poster", "track": "main", "pid": 1906, + "author_site": "Hsin-Ping Huang; Hung-Yu Tseng; Saurabh Saini; Maneesh Singh; Ming-Hsuan Yang", "author": "Hsin-Ping Huang; Hung-Yu Tseng; Saurabh Saini; Maneesh Singh; Ming-Hsuan Yang", "abstract": "We tackle a 3D scene stylization problem -- generating stylized 
images of a scene from arbitrary novel views given a set of images of the same scene and a reference image of the desired style as inputs. Direct solution of combining novel view synthesis and stylization approaches lead to results that are blurry or not consistent across different views. We propose a point cloud-based method for consistent 3D scene stylization. First, we construct the point cloud by back-projecting the image features to the 3D space. Second, we develop point cloud aggregation modules to gather the style information of the 3D scene, and then modulate the features in the point cloud with a linear transformation matrix. Finally, we project the transformed features to 2D space to obtain the novel views. Experimental results on two diverse datasets of real-world scenes validate that our method generates consistent stylized novel view synthesis results against other alternative approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Learning_To_Stylize_Novel_Views_ICCV_2021_paper.pdf", @@ -23942,11 +25562,12 @@ "aff_unique_norm": "University of California, Merced;Verisk Analytics;Google;Yonsei University", "aff_unique_dep": ";;Google Research;", "aff_unique_url": "https://www.ucmerced.edu;https://www.verisk.com;https://research.google;https://www.yonsei.ac.kr", - "aff_unique_abbr": "UCM;Verisk;Google Research;Yonsei", + "aff_unique_abbr": "UCM;;Google Research;Yonsei", "aff_campus_unique_index": "0;0;0+2", "aff_campus_unique": "Merced;;Mountain View", "aff_country_unique_index": "0;0;0;0;0+0+1", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Hsin-Ping and Tseng,\n Hung-Yu and Saini,\n Saurabh and Singh,\n Maneesh and Yang,\n Ming-Hsuan\n},\n title = {\n Learning To Stylize Novel Views\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2021\n},\n pages = {\n 13869-13878\n} \n}" }, { "title": "Learning To Track Objects From Unlabeled Videos", @@ -23954,6 +25575,7 @@ "status": "Poster", "track": "main", "pid": 7672, + "author_site": "Jilai Zheng; Chao Ma; Houwen Peng; Xiaokang Yang", "author": "Jilai Zheng; Chao Ma; Houwen Peng; Xiaokang Yang", "abstract": "In this paper, we propose to learn an Unsupervised Single Object Tracker (USOT) from scratch. We identify that three major challenges, i.e., moving object discovery, rich temporal variation exploitation, and online update, are the central causes of the performance bottleneck of existing unsupervised trackers. To narrow the gap between unsupervised trackers and supervised counterparts, we propose an effective unsupervised learning approach composed of three stages. First, we sample sequentially moving objects with unsupervised optical flow and dynamic programming, instead of random cropping. Second, we train a naive Siamese tracker from scratch using single-frame pairs. Third, we continue training the tracker with a novel cycle memory learning scheme, which is conducted in longer temporal spans and also enables our tracker to update online. Extensive experiments show that the proposed USOT learned from unlabeled videos performs well over the state-of-the-art unsupervised trackers by large margins, and on par with recent supervised deep trackers. 
Code is available at https://github.com/VISION-SJTU/USOT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Learning_To_Track_Objects_From_Unlabeled_Videos_ICCV_2021_paper.pdf", @@ -23970,14 +25592,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Learning_To_Track_Objects_From_Unlabeled_Videos_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft", + "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft Corporation", "aff_unique_dep": "AI Institute;Microsoft Research", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "SJTU;MSR", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0;0", + "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Jilai and Ma,\n Chao and Peng,\n Houwen and Yang,\n Xiaokang\n},\n title = {\n Learning To Track Objects From Unlabeled Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13546-13555\n} \n}" }, { "title": "Learning To Track With Object Permanence", @@ -23985,6 +25608,7 @@ "status": "Poster", "track": "main", "pid": 2740, + "author_site": "Pavel Tokmakov; Jie Li; Wolfram Burgard; Adrien Gaidon", "author": "Pavel Tokmakov; Jie Li; Wolfram Burgard; Adrien Gaidon", "abstract": "Tracking by detection, the dominant approach for online multi-object tracking, alternates between localization and association steps. As a result, it strongly depends on the quality of instantaneous observations, often failing when objects are not fully visible. 
In contrast, tracking in humans is underlined by the notion of object permanence: once an object is recognized, we are aware of its physical existence and can approximately localize it even under full occlusions. In this work, we introduce an end-to-end trainable approach for joint object detection and tracking that is capable of such reasoning. We build on top of the recent CenterTrack architecture, which takes pairs of frames as input, and extend it to videos of arbitrary length. To this end, we augment the model with a spatio-temporal, recurrent memory module, allowing it to reason about object locations and identities in the current frame using all the previous history. It is, however, not obvious how to train such an approach. We study this question on a new, large-scale, synthetic dataset for multi-object tracking, which provides ground truth annotations for invisible objects, and propose several approaches for supervising tracking behind occlusions. Our model, trained jointly on synthetic and real data, outperforms the state of the art on KITTI and MOT17 datasets thanks to its robustness to occlusions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tokmakov_Learning_To_Track_With_Object_Permanence_ICCV_2021_paper.pdf", @@ -23999,7 +25623,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tokmakov_Learning_To_Track_With_Object_Permanence_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tokmakov_Learning_To_Track_With_Object_Permanence_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Tokmakov_2021_ICCV,\n \n author = {\n Tokmakov,\n Pavel and Li,\n Jie and Burgard,\n Wolfram and Gaidon,\n Adrien\n},\n title = {\n Learning To Track With Object Permanence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10860-10869\n} \n}" }, 
{ "title": "Learning Unsupervised Metaformer for Anomaly Detection", @@ -24007,6 +25632,7 @@ "status": "Poster", "track": "main", "pid": 3960, + "author_site": "Jhih-Ciang Wu; Ding-Jie Chen; Chiou-Shann Fuh; Tyng-Luh Liu", "author": "Jhih-Ciang Wu; Ding-Jie Chen; Chiou-Shann Fuh; Tyng-Luh Liu", "abstract": "Anomaly detection (AD) aims to address the task of classification or localization of image anomalies. This paper addresses two pivotal issues of reconstruction-based approaches to AD in images, namely, model adaptation and reconstruction gap. The former generalizes an AD model to tackling a broad range of object categories, while the latter provides useful clues for localizing abnormal regions. At the core of our method is an unsupervised universal model, termed as Metaformer, which leverages both meta-learned model parameters to achieve high model adaptation capability and instance-aware attention to emphasize the focal regions for localizing abnormal regions, i.e., to explore the reconstruction gap at those regions of interest. 
We justify the effectiveness of our method with SOTA results on the MVTec AD dataset of industrial images and highlight the adaptation flexibility of the universal Metaformer with multi-class and few-shot scenarios.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Learning_Unsupervised_Metaformer_for_Anomaly_Detection_ICCV_2021_paper.pdf", @@ -24021,7 +25647,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Learning_Unsupervised_Metaformer_for_Anomaly_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Learning_Unsupervised_Metaformer_for_Anomaly_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Jhih-Ciang and Chen,\n Ding-Jie and Fuh,\n Chiou-Shann and Liu,\n Tyng-Luh\n},\n title = {\n Learning Unsupervised Metaformer for Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4369-4378\n} \n}" }, { "title": "Learning With Memory-Based Virtual Classes for Deep Metric Learning", @@ -24029,6 +25656,7 @@ "status": "Poster", "track": "main", "pid": 8360, + "author_site": "Byungsoo Ko; Geonmo Gu; Han-Gyu Kim", "author": "Byungsoo Ko; Geonmo Gu; Han-Gyu Kim", "abstract": "The core of deep metric learning (DML) involves learning visual similarities in high-dimensional embedding space. One of the main challenges is to generalize from seen classes of training data to unseen classes of test data. Recent works have focused on exploiting past embeddings to increase the number of instances for the seen classes. Such methods achieve performance improvement via augmentation, while the strong focus on seen classes still remains. This can be undesirable for DML, where training and test data exhibit entirely different classes. 
In this work, we present a novel training strategy for DML called MemVir. Unlike previous works, MemVir memorizes both embedding features and class weights to utilize them as additional virtual classes. The exploitation of virtual classes not only utilizes augmented information for training but also alleviates a strong focus on seen classes for better generalization. Moreover, we embed the idea of curriculum learning by slowly adding virtual classes for a gradual increase in learning difficulty, which improves the learning stability as well as the final performance. MemVir can be easily applied to many existing loss functions without any modification. Extensive experimental results on famous benchmarks demonstrate the superiority of MemVir over state-of-the-art competitors. Code of MemVir is publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ko_Learning_With_Memory-Based_Virtual_Classes_for_Deep_Metric_Learning_ICCV_2021_paper.pdf", @@ -24052,7 +25680,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", - "aff_country_unique": ";South Korea" + "aff_country_unique": ";South Korea", + "bibtex": "@InProceedings{Ko_2021_ICCV,\n \n author = {\n Ko,\n Byungsoo and Gu,\n Geonmo and Kim,\n Han-Gyu\n},\n title = {\n Learning With Memory-Based Virtual Classes for Deep Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11792-11801\n} \n}" }, { "title": "Learning With Noisy Labels for Robust Point Cloud Segmentation", @@ -24060,6 +25689,7 @@ "status": "Poster", "track": "main", "pid": 6600, + "author_site": "Shuquan Ye; Dongdong Chen; Songfang Han; Jing Liao", "author": "Shuquan Ye; Dongdong Chen; Songfang Han; Jing Liao", "abstract": "Point cloud segmentation is a fundamental task in 3D. 
Despite recent progress on point cloud segmentation with the power of deep networks, current deep learning methods based on the clean label assumptions may fail with noisy labels. Yet, object class labels are often mislabeled in real-world point cloud datasets. In this work, we take the lead in solving this issue by proposing a novel Point Noise-Adaptive Learning (PNAL) framework. Compared to existing noise-robust methods on image tasks, our PNAL is noise-rate blind, to cope with the spatially variant noise rate problem specific to point clouds. Specifically, we propose a novel point-wise confidence selection to obtain reliable labels based on the historical predictions of each point. A novel cluster-wise label correction is proposed with a voting strategy to generate the best possible label taking the neighbor point correlations into consideration. We conduct extensive experiments to demonstrate the effectiveness of PNAL on both synthetic and real-world noisy datasets. In particular, even with 60% symmetric noisy labels, our proposed method produces much better results than its baseline counterpart without PNAL and is comparable to the ideal upper bound trained on a completely clean dataset. Moreover, we fully re-labeled the validation set of a popular but noisy real-world scene dataset ScanNetV2 to make it clean, for rigorous experiment and future research. 
Our code and data will be released.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_Learning_With_Noisy_Labels_for_Robust_Point_Cloud_Segmentation_ICCV_2021_paper.pdf", @@ -24083,7 +25713,8 @@ "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Hong Kong SAR;;San Diego", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Shuquan and Chen,\n Dongdong and Han,\n Songfang and Liao,\n Jing\n},\n title = {\n Learning With Noisy Labels for Robust Point Cloud Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6443-6452\n} \n}" }, { "title": "Learning With Noisy Labels via Sparse Regularization", @@ -24091,6 +25722,7 @@ "status": "Poster", "track": "main", "pid": 6058, + "author_site": "Xiong Zhou; Xianming Liu; Chenyang Wang; Deming Zhai; Junjun Jiang; Xiangyang Ji", "author": "Xiong Zhou; Xianming Liu; Chenyang Wang; Deming Zhai; Junjun Jiang; Xiangyang Ji", "abstract": "Learning with noisy labels is an important and challenging task for training accurate deep neural networks. However, some commonly-used loss functions, such as Cross Entropy (CE), always suffer from severe overfitting to noisy labels. Although robust loss functions have been designed, they often encounter underfitting. In this paper, we theoretically prove that any loss will be robust to noisy labels when restricting the output of a network to the set of permutations over any fixed vector. When the fixed vector is one-hot, we only need to constrain the output to be one-hot, which means a discrete image and thus zero gradients almost everywhere. This prohibits gradient-based learning of models. 
In this work, we introduce two sparse regularization strategies to approximate the one-hot constraint: output sharpening and l_p-norm (p\\le 1). Output sharpening directly modifies the output distribution of a network to be sharp by adjusting the \"temperature\" parameter. l_p-norm plays the role of a regularization term to make the output to be sparse. These two simple strategies guarantee the robustness of arbitrary loss functions while not hindering the fitting ability of networks. Experiments on baseline and real-world datasets demonstrate that the sparse regularization can significantly improve the performance of commonly-used loss functions in the presence of noisy labels, and outperform state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Learning_With_Noisy_Labels_via_Sparse_Regularization_ICCV_2021_paper.pdf", @@ -24107,14 +25739,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Learning_With_Noisy_Labels_via_Sparse_Regularization_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0;0;0+1;2", - "aff_unique_norm": "Harbin Institute of Technology;Pengcheng Laboratory;Tsinghua University", - "aff_unique_dep": ";Peng Cheng Laboratory;", + "aff_unique_norm": "Harbin Institute of Technology;Peng Cheng Laboratory;Tsinghua University", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.hit.edu.cn/;http://www.pcl.ac.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HIT;PCL;THU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Xiong and Liu,\n Xianming and Wang,\n Chenyang and Zhai,\n Deming and Jiang,\n Junjun and Ji,\n Xiangyang\n},\n title = {\n Learning With Noisy Labels via Sparse Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference 
on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 72-81\n} \n}" }, { "title": "Learning With Privileged Tasks", @@ -24122,6 +25755,7 @@ "status": "Poster", "track": "main", "pid": 8404, + "author_site": "Yuru Song; Zan Lou; Shan You; Erkun Yang; Fei Wang; Chen Qian; Changshui Zhang; Xiaogang Wang", "author": "Yuru Song; Zan Lou; Shan You; Erkun Yang; Fei Wang; Chen Qian; Changshui Zhang; Xiaogang Wang", "abstract": "Multi-objective multi-task learning aims to boost the performance of all tasks by leveraging their correlation and conflict appropriately. Nevertheless, in real practice, users may have preference for certain tasks, and other tasks simply serve as privileged or auxiliary tasks to assist the training of target tasks. The privileged tasks thus possess less or even no priority in the final task assessment by users. Motivated by this, we propose a privileged multiple descent algorithm to arbitrate the learning of target tasks and privileged tasks. Concretely, we introduce a privileged parameter so that the optimization direction does not necessarily follow the gradient from the privileged tasks, but concentrates more on the target tasks. Besides, we also encourage a priority parameter for the target tasks to control the potential distraction of optimization direction from the privileged tasks. In this way, the optimization direction can be more aggressively determined by weighting the gradients among target and privileged tasks, and thus highlight more the performance of target tasks under the unified multi-task learning context. 
Extensive experiments on synthetic and real-world datasets indicate that our method can achieve versatile Pareto solutions under varying preference for the target tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_Learning_With_Privileged_Tasks_ICCV_2021_paper.pdf", @@ -24136,7 +25770,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Song_Learning_With_Privileged_Tasks_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Song_Learning_With_Privileged_Tasks_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Yuru and Lou,\n Zan and You,\n Shan and Yang,\n Erkun and Wang,\n Fei and Qian,\n Chen and Zhang,\n Changshui and Wang,\n Xiaogang\n},\n title = {\n Learning With Privileged Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10685-10694\n} \n}" }, { "title": "Learning a Single Network for Scale-Arbitrary Super-Resolution", @@ -24144,6 +25779,7 @@ "status": "Poster", "track": "main", "pid": 6676, + "author_site": "Longguang Wang; Yingqian Wang; Zaiping Lin; Jungang Yang; Wei An; Yulan Guo", "author": "Longguang Wang; Yingqian Wang; Zaiping Lin; Jungang Yang; Wei An; Yulan Guo", "abstract": "Recently, the performance of single image super-resolution (SR) has been significantly improved with powerful networks. However, these networks are developed for image SR with specific integer scale factors (e.g., x2/3/4), and cannot handle non-integer and asymmetric SR. In this paper, we propose to learn a scale-arbitrary image SR network from scale-specific networks. Specifically, we develop a plug-in module for existing SR networks to perform scale-arbitrary SR, which consists of multiple scale-aware feature adaption blocks and a scale-aware upsampling layer. 
Moreover, conditional convolution is used in our plug-in module to generate dynamic scale-aware filters, which enables our network to adapt to arbitrary scale factors. Our plug-in module can be easily adapted to existing networks to realize scale-arbitrary SR with a single model. These networks plugged with our module can produce promising results for non-integer and asymmetric SR while maintaining state-of-the-art performance for SR with integer scale factors. Besides, the additional computational and memory cost of our module is very small.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Learning_a_Single_Network_for_Scale-Arbitrary_Super-Resolution_ICCV_2021_paper.pdf", @@ -24167,7 +25803,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Longguang and Wang,\n Yingqian and Lin,\n Zaiping and Yang,\n Jungang and An,\n Wei and Guo,\n Yulan\n},\n title = {\n Learning a Single Network for Scale-Arbitrary Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4801-4810\n} \n}" }, { "title": "Learning a Sketch Tensor Space for Image Inpainting of Man-Made Scenes", @@ -24175,6 +25812,7 @@ "status": "Poster", "track": "main", "pid": 3305, + "author_site": "Chenjie Cao; Yanwei Fu", "author": "Chenjie Cao; Yanwei Fu", "abstract": "This paper studies the task of inpainting man-made scenes. It is very challenging due to the difficulty in preserving the visual patterns of images, such as edges, lines, and junctions. Especially, most previous works are failed to restore the object/building structures for images of man-made scenes. To this end, this paper proposes learning a Sketch Tensor (ST) space for inpainting man-made scenes. 
Such a space is learned to restore the edges, lines, and junctions in images, and thus makes reliable predictions of the holistic image structures. To facilitate the structure refinement, we propose a Multi-scale Sketch Tensor inpainting (MST) network, with a novel encoder-decoder structure. The encoder extracts lines and edges from the input images to project them into an ST space. From this space, the decoder is learned to restore the input images. Extensive experiments validate the efficacy of our model. Furthermore, our model can also achieve competitive performance in inpainting general nature images over the competitors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_Learning_a_Sketch_Tensor_Space_for_Image_Inpainting_of_Man-Made_ICCV_2021_paper.pdf", @@ -24198,7 +25836,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Chenjie and Fu,\n Yanwei\n},\n title = {\n Learning a Sketch Tensor Space for Image Inpainting of Man-Made Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14509-14518\n} \n}" }, { "title": "Learning an Augmented RGB Representation With Cross-Modal Knowledge Distillation for Action Detection", @@ -24206,10 +25845,11 @@ "status": "Poster", "track": "main", "pid": 3291, - "author": "Rui Dai; Srijan Das; Fran\u00e7ois Bremond", + "author_site": "Rui Dai; Srijan Das; François Bremond", + "author": "Rui Dai; Srijan Das; François Bremond", "abstract": "In video understanding, most cross-modal knowledge distillation (KD) methods are tailored for classification tasks, focusing on the discriminative representation of the trimmed videos. 
However, action detection requires not only categorizing actions, but also localizing them in untrimmed videos. Therefore, transferring knowledge pertaining to temporal relations is critical for this task which is missing in the previous cross-modal KD frameworks. To this end, we aim at learning an augmented RGB representation for action detection, taking advantage of additional modalities at training time through KD. We propose a KD framework consisting of two levels of distillation. On one hand, atomic-level distillation encourages the RGB student to learn the sub-representation of the actions from the teacher in a contrastive manner. On the other hand, sequence-level distillation encourages the student to learn the temporal knowledge from the teacher, which consists of transferring the Global Contextual Relations and the action Boundary Saliency. The result is an Augmented-RGB stream that can achieve competitive performance as the two-stream network while using only RGB at inference time. 
Extensive experimental analysis shows that our proposed distillation framework is generic and outperforms other popular cross-modal distillation methods in the action detection task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dai_Learning_an_Augmented_RGB_Representation_With_Cross-Modal_Knowledge_Distillation_for_ICCV_2021_paper.pdf", - "aff": "Inria+Universit\u00b8e C\u02c6ote d\u2019Azur; Stony Brook University; Inria+Universit\u00b8e C\u02c6ote d\u2019Azur", + "aff": "Inria+Université Côte d’Azur; Stony Brook University; Inria+Université Côte d’Azur", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Dai_Learning_an_Augmented_ICCV_2021_supplemental.pdf", @@ -24222,14 +25862,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Learning_an_Augmented_RGB_Representation_With_Cross-Modal_Knowledge_Distillation_for_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;0+1", - "aff_unique_norm": "INRIA;Universit\u00e9 C\u00f4te d\u2019Azur;Stony Brook University", + "aff_unique_norm": "Inria;Université Côte d’Azur;Stony Brook University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.inria.fr;https://www.univ-cotedazur.fr;https://www.stonybrook.edu", "aff_unique_abbr": "Inria;UCA;SBU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0+0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Dai_2021_ICCV,\n \n author = {\n Dai,\n Rui and Das,\n Srijan and Bremond,\n Fran\\c{c}ois\n},\n title = {\n Learning an Augmented RGB Representation With Cross-Modal Knowledge Distillation for Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13053-13064\n} \n}" }, { "title": "Learning by Aligning: Visible-Infrared Person 
Re-Identification Using Cross-Modal Correspondences", @@ -24237,6 +25878,7 @@ "status": "Poster", "track": "main", "pid": 9067, + "author_site": "Hyunjong Park; Sanghoon Lee; Junghyup Lee; Bumsub Ham", "author": "Hyunjong Park; Sanghoon Lee; Junghyup Lee; Bumsub Ham", "abstract": "We address the problem of visible-infrared person re-identification (VI-reID), that is, retrieving a set of person images, captured by visible or infrared cameras, in a cross-modal setting. Two main challenges in VI-reID are intra-class variations across person images, and cross-modal discrepancies between visible and infrared images. Assuming that the person images are roughly aligned, previous approaches attempt to learn coarse image- or rigid part-level person representations that are discriminative and generalizable across different modalities. However, the person images, typically cropped by off-the-shelf object detectors, are not necessarily well-aligned, which distract discriminative person representation learning. In this paper, we introduce a novel feature learning framework that addresses these problems in a unified way. To this end, we propose to exploit dense correspondences between cross-modal person images. This allows to address the cross-modal discrepancies in a pixel-level, suppressing modality-related features from person representations more effectively. This also encourages pixel-wise associations between cross-modal local features, further facilitating discriminative feature learning for VI-reID. 
Extensive experiments and analyses on standard VI-reID benchmarks demonstrate the effectiveness of our approach, which significantly outperforms the state of the art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Learning_by_Aligning_Visible-Infrared_Person_Re-Identification_Using_Cross-Modal_Correspondences_ICCV_2021_paper.pdf", @@ -24260,7 +25902,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Hyunjong and Lee,\n Sanghoon and Lee,\n Junghyup and Ham,\n Bumsub\n},\n title = {\n Learning by Aligning: Visible-Infrared Person Re-Identification Using Cross-Modal Correspondences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12046-12055\n} \n}" }, { "title": "Learning of Visual Relations: The Devil Is in the Tails", @@ -24268,6 +25911,7 @@ "status": "Poster", "track": "main", "pid": 8009, + "author_site": "Alakh Desai; Tz-Ying Wu; Subarna Tripathi; Nuno Vasconcelos", "author": "Alakh Desai; Tz-Ying Wu; Subarna Tripathi; Nuno Vasconcelos", "abstract": "Significant effort has been recently devoted to modeling visual relations. This has mostly addressed the design of architectures, typically by adding parameters and increasing model complexity. However, visual relation learning is a long-tailed problem, due to the combinatorial nature of joint reasoning about groups of objects. Increasing model complexity is, in general, ill-suited for long-tailed problems due to their tendency to overfit. In this paper, we explore an alternative hypothesis, denoted the Devil is in the Tails. Under this hypothesis, better performance is achieved by keeping the model simple but improving its ability to cope with long-tailed distributions. 
To test this hypothesis, we devise a new approach for training visual relationships models, which is inspired by state-of-the-art long-tailed recognition literature. This is based on an iterative decoupled training scheme, denoted Decoupled Training for Devil in the Tails (DT2). DT2 employs a novel sampling approach, Alternating Class-Balanced Sampling (ACBS), to capture the interplay between the long-tailed entity and predicate distributions of visual relations. Results show that, with an extremely simple architecture, DT2-ACBS significantly outperforms much more complex state-of-the-art methods on scene graph generation tasks. This suggests that the development of sophisticated models must be considered in tandem with the long-tailed nature of the problem.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Desai_Learning_of_Visual_Relations_The_Devil_Is_in_the_Tails_ICCV_2021_paper.pdf", @@ -24282,7 +25926,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Desai_Learning_of_Visual_Relations_The_Devil_Is_in_the_Tails_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Desai_Learning_of_Visual_Relations_The_Devil_Is_in_the_Tails_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Desai_2021_ICCV,\n \n author = {\n Desai,\n Alakh and Wu,\n Tz-Ying and Tripathi,\n Subarna and Vasconcelos,\n Nuno\n},\n title = {\n Learning of Visual Relations: The Devil Is in the Tails\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15404-15413\n} \n}" }, { "title": "Let's See Clearly: Contaminant Artifact Removal for Moving Cameras", @@ -24290,6 +25935,7 @@ "status": "Poster", "track": "main", "pid": 5654, + "author_site": "Xiaoyu Li; Bo Zhang; Jing Liao; Pedro V. Sander", "author": "Xiaoyu Li; Bo Zhang; Jing Liao; Pedro V. 
Sander", "abstract": "Contaminants such as dust, dirt and moisture adhering to the camera lens can greatly affect the quality and clarity of the resulting image or video. In this paper, we propose a video restoration method to automatically remove these contaminants and produce a clean video. Our approach first seeks to detect attention maps that indicate the regions that need to be restored. In order to leverage the corresponding clean pixels from adjacent frames, we propose a flow completion module to hallucinate the flow of the background scene to the attention regions degraded by the contaminants. Guided by the attention maps and completed flows, we propose a recurrent technique to restore the input frame by fetching clean pixels from adjacent frames. Finally, a multi-frame processing stage is used to further process the entire video sequence in order to enforce temporal consistency. The entire network is trained on a synthetic dataset that approximates the physical lighting properties of contaminant artifacts. 
This new dataset and our novel framework lead to our method that is able to address different contaminants and outperforms competitive restoration approaches both qualitatively and quantitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Lets_See_Clearly_Contaminant_Artifact_Removal_for_Moving_Cameras_ICCV_2021_paper.pdf", @@ -24304,7 +25950,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Lets_See_Clearly_Contaminant_Artifact_Removal_for_Moving_Cameras_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Lets_See_Clearly_Contaminant_Artifact_Removal_for_Moving_Cameras_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Xiaoyu and Zhang,\n Bo and Liao,\n Jing and Sander,\n Pedro V.\n},\n title = {\n Let's See Clearly: Contaminant Artifact Removal for Moving Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2011-2020\n} \n}" }, { "title": "Leveraging Auxiliary Tasks With Affinity Learning for Weakly Supervised Semantic Segmentation", @@ -24312,6 +25959,7 @@ "status": "Poster", "track": "main", "pid": 3388, + "author_site": "Lian Xu; Wanli Ouyang; Mohammed Bennamoun; Farid Boussaid; Ferdous Sohel; Dan Xu", "author": "Lian Xu; Wanli Ouyang; Mohammed Bennamoun; Farid Boussaid; Ferdous Sohel; Dan Xu", "abstract": "Semantic segmentation is a challenging task in the absence of densely labelled data. Only relying on class activation maps (CAM) with image-level labels provides deficient segmentation supervision. Prior works thus consider pre-trained models to produce coarse saliency maps to guide the generation of pseudo segmentation labels. However, the commonly used off-line heuristic generation process cannot fully exploit the benefits of these coarse saliency maps. 
Motivated by the significant inter-task correlation, we propose a novel weakly supervised multi-task framework termed as AuxSegNet, to leverage saliency detection and multi-label image classification as auxiliary tasks to improve the primary task of semantic segmentation using only image-level ground-truth labels. Inspired by their similar structured semantics, we also propose to learn a cross-task global pixel-level affinity map from the saliency and segmentation representations. The learned cross-task affinity can be used to refine saliency predictions and propagate CAM maps to provide improved pseudo labels for both tasks. The mutual boost between pseudo label updating and cross-task affinity learning enables iterative improvements on segmentation performance. Extensive experiments demonstrate the effectiveness of the proposed auxiliary learning network structure and the cross-task affinity learning method. The proposed approach achieves state-of-the-art weakly supervised segmentation performance on the challenging PASCAL VOC 2012 and MS COCO benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Leveraging_Auxiliary_Tasks_With_Affinity_Learning_for_Weakly_Supervised_Semantic_ICCV_2021_paper.pdf", @@ -24335,7 +25983,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Lian and Ouyang,\n Wanli and Bennamoun,\n Mohammed and Boussaid,\n Farid and Sohel,\n Ferdous and Xu,\n Dan\n},\n title = {\n Leveraging Auxiliary Tasks With Affinity Learning for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6984-6993\n} \n}" }, { "title": "Lifelong Infinite Mixture Model Based on 
Knowledge-Driven Dirichlet Process", @@ -24343,6 +25992,7 @@ "status": "Poster", "track": "main", "pid": 10229, + "author_site": "Fei Ye; Adrian G. Bors", "author": "Fei Ye; Adrian G. Bors", "abstract": "Recent research efforts in lifelong learning propose to grow a mixture of models to adapt to an increasing number of tasks. The proposed methodology shows promising results in overcoming catastrophic forgetting. However, the theory behind these successful models is still not well understood. In this paper, we perform the theoretical analysis for lifelong learning models by deriving the risk bounds based on the discrepancy distance between the probabilistic representation of data generated by the model and that corresponding to the target dataset. Inspired by the theoretical analysis, we introduce a new lifelong learning approach, namely the Lifelong Infinite Mixture (LIMix) model, which can automatically expand its network architectures or choose an appropriate component to adapt its parameters for learning a new task, while preserving its previously learnt information. We propose to incorporate the knowledge by means of Dirichlet processes by using a gating mechanism which computes the dependence between the knowledge learnt previously and stored in each component, and a new set of data. Besides, we train a compact Student model which can accumulate cross-domain representations over time and make quick inferences. 
The code is available at https://github.com/dtuzi123/Lifelong-infinite-mixture-model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_Lifelong_Infinite_Mixture_Model_Based_on_Knowledge-Driven_Dirichlet_Process_ICCV_2021_paper.pdf", @@ -24366,7 +26016,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "York", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Fei and Bors,\n Adrian G.\n},\n title = {\n Lifelong Infinite Mixture Model Based on Knowledge-Driven Dirichlet Process\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10695-10704\n} \n}" }, { "title": "Light Field Saliency Detection With Dual Local Graph Learning and Reciprocative Guidance", @@ -24374,6 +26025,7 @@ "status": "Poster", "track": "main", "pid": 8469, + "author_site": "Nian Liu; Wangbo Zhao; Dingwen Zhang; Junwei Han; Ling Shao", "author": "Nian Liu; Wangbo Zhao; Dingwen Zhang; Junwei Han; Ling Shao", "abstract": "The application of light field data in salient object detection is becoming increasingly popular in recent years. The difficulty lies in how to effectively fuse the features within the focal stack and how to cooperate them with the feature of the all-focus image. Previous methods usually fuse focal stack features via convolution or ConvLSTM, which are both less effective and ill-posed. In this paper, we model the information fusion within focal stack via graph networks. They introduce powerful context propagation from neighbouring nodes and also avoid ill-posed implementations. On the one hand, we construct local graph connections thus avoiding prohibitive computational costs of traditional graph networks. 
On the other hand, instead of processing the two kinds of data separately, we build a novel dual graph model to guide the focal stack fusion process using all-focus patterns. To handle the second difficulty, previous methods usually implement one-shot fusion for focal stack and all-focus features, hence lacking a thorough exploration of their supplements. We introduce a reciprocative guidance scheme and enable mutual guidance between these two kinds of information at multiple steps. As such, both kinds of features can be enhanced iteratively, finally benefiting the saliency prediction. Extensive experimental results show that the proposed models are all beneficial and we achieve significantly better results than state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Light_Field_Saliency_Detection_With_Dual_Local_Graph_Learning_and_ICCV_2021_paper.pdf", @@ -24397,7 +26049,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "United Arab Emirates;China" + "aff_country_unique": "United Arab Emirates;China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Nian and Zhao,\n Wangbo and Zhang,\n Dingwen and Han,\n Junwei and Shao,\n Ling\n},\n title = {\n Light Field Saliency Detection With Dual Local Graph Learning and Reciprocative Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4712-4721\n} \n}" }, { "title": "Light Source Guided Single-Image Flare Removal From Unpaired Data", @@ -24405,6 +26058,7 @@ "status": "Poster", "track": "main", "pid": 1151, + "author_site": "Xiaotian Qiao; Gerhard P. Hancke; Rynson W.H. Lau", "author": "Xiaotian Qiao; Gerhard P. Hancke; Rynson W.H. 
Lau", "abstract": "Causally-taken images often suffer from flare artifacts, due to the unintended reflections and scattering of light inside the camera. However, as flares may appear in a variety of shapes, positions, and colors, detecting and removing them entirely from an image is very challenging. Existing methods rely on predefined intensity and geometry priors of flares, and may fail to distinguish the difference between light sources and flare artifacts. We observe that the conditions of the light source in the image play an important role in the resulting flares. In this paper, we present a deep framework with light source aware guidance for single-image flare removal (SIFR). In particular, we first detect the light source regions and the flare regions separately, and then remove the flare artifacts based on the light source aware guidance. By learning the underlying relationships between the two types of regions, our approach can remove different kinds of flares from the image. In addition, instead of using paired training data which are difficult to collect, we propose the first unpaired flare removal dataset and new cycle-consistency constraints to obtain more diverse examples and avoid manual annotations. Extensive experiments demonstrate that our method outperforms the baselines qualitatively and quantitatively. 
We also show that our model can be applied to flare effect manipulation (e.g., adding or changing image flares).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qiao_Light_Source_Guided_Single-Image_Flare_Removal_From_Unpaired_Data_ICCV_2021_paper.pdf", @@ -24419,7 +26073,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Qiao_Light_Source_Guided_Single-Image_Flare_Removal_From_Unpaired_Data_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Qiao_Light_Source_Guided_Single-Image_Flare_Removal_From_Unpaired_Data_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Qiao_2021_ICCV,\n \n author = {\n Qiao,\n Xiaotian and Hancke,\n Gerhard P. and Lau,\n Rynson W.H.\n},\n title = {\n Light Source Guided Single-Image Flare Removal From Unpaired Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4177-4185\n} \n}" }, { "title": "Lightweight Multi-Person Total Motion Capture Using Sparse Multi-View Cameras", @@ -24427,6 +26082,7 @@ "status": "Poster", "track": "main", "pid": 8736, + "author_site": "Yuxiang Zhang; Zhe Li; Liang An; Mengcheng Li; Tao Yu; Yebin Liu", "author": "Yuxiang Zhang; Zhe Li; Liang An; Mengcheng Li; Tao Yu; Yebin Liu", "abstract": "Multi-person total motion capture is extremely challenging when it comes to handle severe occlusions, different reconstruction granularities from body to face and hands, drastically changing observation scales and fast body movements. To overcome these challenges above, we contribute a lightweight total motion capture system for multi-person interactive scenarios using only sparse multi-view cameras. By contributing a novel hand and face bootstrapping algorithm, our method is capable of efficient localization and accurate association of the hands and faces even on severe occluded occasions. 
We leverage both pose regression and keypoints detection methods and further propose a unified two-stage parametric fitting method for achieving pixel-aligned accuracy. Moreover, for extremely self-occluded poses and close interactions, a novel feedback mechanism is proposed to propagate the pixel-aligned reconstructions into the next frame for more accurate association. Overall, we propose the first light-weight total capture system and achieves fast, robust and accurate multi-person total motion capture performance. The results and experiments show that our method achieves more accurate results than existing methods under sparse-view setups.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Lightweight_Multi-Person_Total_Motion_Capture_Using_Sparse_Multi-View_Cameras_ICCV_2021_paper.pdf", @@ -24450,7 +26106,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yuxiang and Li,\n Zhe and An,\n Liang and Li,\n Mengcheng and Yu,\n Tao and Liu,\n Yebin\n},\n title = {\n Lightweight Multi-Person Total Motion Capture Using Sparse Multi-View Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5560-5569\n} \n}" }, { "title": "Likelihood-Based Diverse Sampling for Trajectory Forecasting", @@ -24458,6 +26115,7 @@ "status": "Poster", "track": "main", "pid": 9291, + "author_site": "Yecheng Jason Ma; Jeevana Priya Inala; Dinesh Jayaraman; Osbert Bastani", "author": "Yecheng Jason Ma; Jeevana Priya Inala; Dinesh Jayaraman; Osbert Bastani", "abstract": "Forecasting complex vehicle and pedestrian multi-modal distributions requires powerful probabilistic approaches. 
Normalizing flows (NF) have recently emerged as an attractive tool to model such distributions. However, a key drawback is that independent samples drawn from a flow model often do not adequately capture all the modes in the underlying distribution. We propose Likelihood-Based Diverse Sampling (LDS), a method for improving the quality and the diversity of trajectory samples from a pre-trained flow model. Rather than producing individual samples, LDS produces a set of trajectories in one shot. Given a pre-trained forecasting flow model, we train LDS using gradients from the model, to optimize an objective function that rewards high likelihood for individual trajectories in the predicted set, together with high spatial separation among trajectories. LDS outperforms state-of-art post-hoc neural diverse forecasting methods for various pre-trained flow models as well as conditional variational autoencoder (CVAE) models. Crucially, it can also be used for transductive trajectory forecasting, where the diverse forecasts are trained on-the-fly on unlabeled test examples. LDS is easy to implement, and we show that it offers a simple plug-in improvement over baselines on two challenging benchmarks. 
Code is at: https://github.com/JasonMa2016/LDS", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jason_Likelihood-Based_Diverse_Sampling_for_Trajectory_Forecasting_ICCV_2021_paper.pdf", @@ -24472,7 +26130,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jason_Likelihood-Based_Diverse_Sampling_for_Trajectory_Forecasting_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jason_Likelihood-Based_Diverse_Sampling_for_Trajectory_Forecasting_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ma_2021_ICCV,\n \n author = {\n Ma,\n Yecheng Jason and Inala,\n Jeevana Priya and Jayaraman,\n Dinesh and Bastani,\n Osbert\n},\n title = {\n Likelihood-Based Diverse Sampling for Trajectory Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13279-13288\n} \n}" }, { "title": "Linguistically Routing Capsule Network for Out-of-Distribution Visual Question Answering", @@ -24480,6 +26139,7 @@ "status": "Poster", "track": "main", "pid": 8260, + "author_site": "Qingxing Cao; Wentao Wan; Keze Wang; Xiaodan Liang; Liang Lin", "author": "Qingxing Cao; Wentao Wan; Keze Wang; Xiaodan Liang; Liang Lin", "abstract": "Generalization on out-of-distribution (OOD) test data is an essential but underexplored topic in visual question answering. Current state-of-the-art VQA models often exploit the biased correlation between data and labels, which results in a large performance drop when the test and training data have different distributions. Inspired by the fact that humans can recognize novel concepts by composing existed concepts and capsule network's ability of representing part-whole hierarchies, we propose to use capsules to represent parts and introduce \"Linguistically Routing\" to merge parts with human-prior hierarchies. 
Specifically, we first fuse visual features with a single question word as atomic parts. Then we introduce the \"Linguistically Routing\" to reweight the capsule connections between two layers such that: 1) the lower layer capsules can transfer their outputs to the most compatible higher capsules, and 2) two capsules can be merged if their corresponding words are merged in the question parse tree. The routing process maximizes the above unary and binary potentials across multiple layers and finally carves a tree structure inside the capsule network. We evaluate our proposed routing method on the CLEVR compositional generation test, the VQA-CP2 dataset and the VQAv2 dataset. The experimental results show that our proposed method can improve current VQA models on OOD split without losing performance on the in-domain test data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_Linguistically_Routing_Capsule_Network_for_Out-of-Distribution_Visual_Question_Answering_ICCV_2021_paper.pdf", @@ -24495,15 +26155,16 @@ "email": "mail.sysu.edu.cn;qq.com;gmail.com;gmail.com;ieee.org", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cao_Linguistically_Routing_Capsule_Network_for_Out-of-Distribution_Visual_Question_Answering_ICCV_2021_paper.html", - "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Sun Yat-sen University", - "aff_unique_dep": "", - "aff_unique_url": "http://www.sysu.edu.cn/", - "aff_unique_abbr": "SYSU", + "aff_unique_index": "0;1;1;0;1", + "aff_unique_norm": "Sun Yat-sen University;Sun Yat-Sen University", + "aff_unique_dep": ";", + "aff_unique_url": "http://www.sysu.edu.cn/;http://www.sysu.edu.cn/", + "aff_unique_abbr": "SYSU;SYSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Qingxing and Wan,\n Wentao and Wang,\n 
Keze and Liang,\n Xiaodan and Lin,\n Liang\n},\n title = {\n Linguistically Routing Capsule Network for Out-of-Distribution Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1614-1623\n} \n}" }, { "title": "Lipschitz Continuity Guided Knowledge Distillation", @@ -24511,6 +26172,7 @@ "status": "Poster", "track": "main", "pid": 8235, + "author_site": "Yuzhang Shang; Bin Duan; Ziliang Zong; Liqiang Nie; Yan Yan", "author": "Yuzhang Shang; Bin Duan; Ziliang Zong; Liqiang Nie; Yan Yan", "abstract": "Knowledge distillation has become one of the most important model compression techniques by distilling knowledge from larger teacher networks to smaller student ones. Although great success has been achieved by prior distillation methods via delicately designing various types of knowledge, they overlook the functional properties of neural networks, which makes the process of applying those techniques to new tasks unreliable and non-trivial. To alleviate such problem, in this paper, we initially leverage Lipschitz continuity to better represent the functional characteristic of neural networks and guide the knowledge distillation process. In particular, we propose a novel Lipschitz Continuity Guided Knowledge Distillation framework to faithfully distill knowledge by minimizing the distance between two neural networks' Lipschitz constants, which enables teacher networks to better regularize student networks and improve the corresponding performance. We derive an explainable approximation algorithm with an explicit theoretical derivation to address the NP-hard problem of calculating the Lipschitz constant. 
Experimental results have shown that our method outperforms other benchmarks over several knowledge distillation tasks (e.g., classification, segmentation and object detection) on CIFAR-100, ImageNet, and PASCAL VOC datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shang_Lipschitz_Continuity_Guided_Knowledge_Distillation_ICCV_2021_paper.pdf", @@ -24534,7 +26196,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Shang_2021_ICCV,\n \n author = {\n Shang,\n Yuzhang and Duan,\n Bin and Zong,\n Ziliang and Nie,\n Liqiang and Yan,\n Yan\n},\n title = {\n Lipschitz Continuity Guided Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10675-10684\n} \n}" }, { "title": "LoFGAN: Fusing Local Representations for Few-Shot Image Generation", @@ -24542,6 +26205,7 @@ "status": "Poster", "track": "main", "pid": 4110, + "author_site": "Zheng Gu; Wenbin Li; Jing Huo; Lei Wang; Yang Gao", "author": "Zheng Gu; Wenbin Li; Jing Huo; Lei Wang; Yang Gao", "abstract": "Given only a few available images for a novel unseen category, few-shot image generation aims to generate more data for this category. Previous works attempt to globally fuse these images by using adjustable weighted coefficients. However, there is a serious semantic misalignment between different images from a global perspective, making these works suffer from poor generation quality and diversity. To tackle this problem, we propose a novel Local-Fusion Generative Adversarial Network (LoFGAN) for few-shot image generation. Instead of using these available images as a whole, we first randomly divide them into a base image and several reference images. 
Next, LoFGAN matches local representations between the base and reference images based on semantic similarities and replaces the local features with the closest related local features. In this way, LoFGAN can produce more realistic and diverse images at a more fine-grained level, and simultaneously enjoy the characteristic of semantic alignment. Furthermore, a local reconstruction loss is also proposed, which can provide better training stability and generation quality. We conduct extensive experiments on three datasets, which successfully demonstrates the effectiveness of our proposed method for few-shot image generation and downstream visual applications with limited data. Code is available at https://github.com/edward3862/LoFGAN-pytorch.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_LoFGAN_Fusing_Local_Representations_for_Few-Shot_Image_Generation_ICCV_2021_paper.pdf", @@ -24565,7 +26229,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Wollongong", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Gu_2021_ICCV,\n \n author = {\n Gu,\n Zheng and Li,\n Wenbin and Huo,\n Jing and Wang,\n Lei and Gao,\n Yang\n},\n title = {\n LoFGAN: Fusing Local Representations for Few-Shot Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8463-8471\n} \n}" }, { "title": "LoOp: Looking for Optimal Hard Negative Embeddings for Deep Metric Learning", @@ -24573,10 +26238,11 @@ "status": "Poster", "track": "main", "pid": 9285, + "author_site": "Bhavya Vasudeva; Puneesh Deora; Saumik Bhattacharya; Umapada Pal; Sukalpa Chanda", "author": "Bhavya Vasudeva; Puneesh Deora; Saumik Bhattacharya; Umapada Pal; Sukalpa Chanda", "abstract": "Deep metric learning has been effectively used to learn distance metrics for different 
visual tasks like image retrieval, clustering, etc. In order to aid the training process, existing methods either use a hard mining strategy to extract the most informative samples or seek to generate hard synthetics using an additional network. Such approaches face different challenges and can lead to biased embeddings in the former case, and (i) harder optimization (ii) slower training speed (iii) higher model complexity in the latter case. In order to overcome these challenges, we propose a novel approach that looks for optimal hard negatives (LoOp) in the embedding space, taking full advantage of each tuple by calculating the minimum distance between a pair of positives and a pair of negatives. Unlike mining-based methods, our approach considers the entire space between pairs of embeddings to calculate the optimal hard negatives. Extensive experiments combining our approach and representative metric learning losses reveal a significant boost in performance on three benchmark datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vasudeva_LoOp_Looking_for_Optimal_Hard_Negative_Embeddings_for_Deep_Metric_ICCV_2021_paper.pdf", - "aff": "Indian Statistical Institute, Kolkata, India; Indian Statistical Institute, Kolkata, India; Indian Institute of Technology, Kharagpur, India; Indian Statistical Institute, Kolkata, India; \u00d8stfold University College, Halden, Norway", + "aff": "Indian Statistical Institute, Kolkata, India; Indian Statistical Institute, Kolkata, India; Indian Institute of Technology, Kharagpur, India; Indian Statistical Institute, Kolkata, India; Østfold University College, Halden, Norway", "project": "", "github": "https://github.com/puneesh00/LoOp", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Vasudeva_LoOp_Looking_for_ICCV_2021_supplemental.pdf", @@ -24589,14 +26255,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Vasudeva_LoOp_Looking_for_Optimal_Hard_Negative_Embeddings_for_Deep_Metric_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;2", - "aff_unique_norm": "Indian Statistical Institute;Indian Institute of Technology;\u00d8stfold University College", + "aff_unique_norm": "Indian Statistical Institute;Indian Institute of Technology;Østfold University College", "aff_unique_dep": ";;", "aff_unique_url": "https://www.isical.ac.in;https://www.iitkgp.ac.in;https://www.hih.no", "aff_unique_abbr": "ISI;IIT Kharagpur;", "aff_campus_unique_index": "0;0;1;0;2", "aff_campus_unique": "Kolkata;Kharagpur;Halden", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "India;Norway" + "aff_country_unique": "India;Norway", + "bibtex": "@InProceedings{Vasudeva_2021_ICCV,\n \n author = {\n Vasudeva,\n Bhavya and Deora,\n Puneesh and Bhattacharya,\n Saumik and Pal,\n Umapada and Chanda,\n Sukalpa\n},\n title = {\n LoOp: Looking for Optimal Hard Negative Embeddings for Deep Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10634-10643\n} \n}" }, { "title": "LocTex: Learning Data-Efficient Visual Representations From Localized Textual Supervision", @@ -24604,6 +26271,7 @@ "status": "Poster", "track": "main", "pid": 7238, + "author_site": "Zhijian Liu; Simon Stent; Jie Li; John Gideon; Song Han", "author": "Zhijian Liu; Simon Stent; Jie Li; John Gideon; Song Han", "abstract": "Computer vision tasks such as object detection and semantic/instance segmentation rely on the painstaking annotation of large training datasets. In this paper, we propose LocTex that takes advantage of the low-cost localized textual annotations (i.e., captions and synchronized mouse-over gestures) to reduce the annotation effort. 
We introduce a contrastive pre-training framework between images and captions and propose to supervise the cross-modal attention map with rendered mouse traces to provide coarse localization signals. Our learned visual features capture rich semantics (from free-form captions) and accurate localization (from mouse traces), which are very effective when transferred to various downstream vision tasks. Compared with ImageNet supervised pre-training, LocTex can reduce the size of the pre-training dataset by 10x or the target dataset by 2x while achieving comparable or even improved performance on COCO instance segmentation. When provided with the same amount of annotations, LocTex achieves around 4% higher accuracy than the previous state-of-the-art \"vision+language\" pre-training approach on the task of PASCAL VOC image classification.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_LocTex_Learning_Data-Efficient_Visual_Representations_From_Localized_Textual_Supervision_ICCV_2021_paper.pdf", @@ -24627,7 +26295,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zhijian and Stent,\n Simon and Li,\n Jie and Gideon,\n John and Han,\n Song\n},\n title = {\n LocTex: Learning Data-Efficient Visual Representations From Localized Textual Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2167-2176\n} \n}" }, { "title": "Local Temperature Scaling for Probability Calibration", @@ -24635,6 +26304,7 @@ "status": "Poster", "track": "main", "pid": 6405, + "author_site": "Zhipeng Ding; Xu Han; Peirong Liu; Marc Niethammer", "author": "Zhipeng Ding; Xu Han; Peirong Liu; Marc Niethammer", "abstract": "For semantic segmentation, label 
probabilities are often uncalibrated as they are typically only the by-product of a segmentation task. Intersection over Union (IoU) and Dice score are often used as criteria for segmentation success, while metrics related to label probabilities are not often explored. However, probability calibration approaches have been studied, which match probability outputs with experimentally observed errors. These approaches mainly focus on classification tasks, but not on semantic segmentation. Thus, we propose a learning-based calibration method that focuses on multi-label semantic segmentation. Specifically, we adopt a convolutional neural network to predict local temperature values for probability calibration. One advantage of our approach is that it does not change prediction accuracy, hence allowing for calibration as a post-processing step. Experiments on the COCO, CamVid, and LPBA40 datasets demonstrate improved calibration performance for a range of different metrics. We also demonstrate the good performance of our method for multi-atlas brain segmentation from magnetic resonance images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_Local_Temperature_Scaling_for_Probability_Calibration_ICCV_2021_paper.pdf", @@ -24658,7 +26328,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Zhipeng and Han,\n Xu and Liu,\n Peirong and Niethammer,\n Marc\n},\n title = {\n Local Temperature Scaling for Probability Calibration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6889-6899\n} \n}" }, { "title": "LocalTrans: A Multiscale Local Transformer Network for Cross-Resolution Homography Estimation", @@ -24666,6 +26337,7 @@ 
"status": "Poster", "track": "main", "pid": 8853, + "author_site": "Ruizhi Shao; Gaochang Wu; Yuemei Zhou; Ying Fu; Lu Fang; Yebin Liu", "author": "Ruizhi Shao; Gaochang Wu; Yuemei Zhou; Ying Fu; Lu Fang; Yebin Liu", "abstract": "Cross-resolution image alignment is a key problem in multiscale gigapixel photography, which requires to estimate homography matrix using images with large resolution gap. Existing deep homography methods concatenate the input images or features, neglecting the explicit formulation of correspondences between them, which leads to degraded accuracy in cross-resolution challenges. In this paper, we consider the cross-resolution homography estimation as a multimodal problem, and propose a local transformer network embedded within a multiscale structure to explicitly learn correspondences between the multimodal inputs, namely, input images with different resolutions. The proposed local transformer adopts a local attention map specifically for each position in the feature. By combining the local transformer with the multiscale structure, the network is able to capture long-short range correspondences efficiently and accurately. 
Experiments on both the MS-COCO dataset and real-captured cross-resolution dataset show that the proposed network outperforms existing state-of-the-art feature-based and deep-learning-based homography estimation methods, and is able to accurately align images under 10x resolution gap.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shao_LocalTrans_A_Multiscale_Local_Transformer_Network_for_Cross-Resolution_Homography_Estimation_ICCV_2021_paper.pdf", @@ -24689,7 +26361,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0;0;0;0", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Shao_2021_ICCV,\n \n author = {\n Shao,\n Ruizhi and Wu,\n Gaochang and Zhou,\n Yuemei and Fu,\n Ying and Fang,\n Lu and Liu,\n Yebin\n},\n title = {\n LocalTrans: A Multiscale Local Transformer Network for Cross-Resolution Homography Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14890-14899\n} \n}" }, { "title": "Localize to Binauralize: Audio Spatialization From Visual Sound Source Localization", @@ -24697,6 +26370,7 @@ "status": "Poster", "track": "main", "pid": 10454, + "author_site": "Kranthi Kumar Rachavarapu; Aakanksha; Vignesh Sundaresha; A. N. Rajagopalan", "author": "Kranthi Kumar Rachavarapu; Aakanksha; Vignesh Sundaresha; A. N. Rajagopalan", "abstract": "Videos with binaural audios provide an immersive viewing experience by enabling 3D sound sensation. Recent works attempt to generate binaural audio in a multimodal learning framework using large quantities of videos with accompanying binaural audio. In contrast, we attempt a more challenging problem -- synthesizing binaural audios for a video with monaural audio in a weakly supervised setting and weakly semi-supervised setting. 
Our key idea is that any down-stream task that can be solved only using binaural audios can be used to provide proxy supervision for binaural audio generation, thereby reducing the reliance on explicit supervision. In this work, as a proxy-task for weak supervision, we use Sound Source Localization with only audio. We design a two-stage architecture called Localize-to-Binauralize Network (L2BNet). The first stage of L2BNet is a Stereo Generation (SG) network employed to generate two-stream audio from monaural audio using visual frame information as guidance. In the second stage, an Audio Localization (AL) network is designed to use the synthesized two-stream audio to localize sound sources in visual frames. The entire network is trained end-to-end so that the AL network provides necessary supervision for the SG network. We experimentally show that our weakly-supervised framework generates two-stream audio containing binaural cues. Through user study, we further validate that our proposed approach generates binaural-quality audio using as little as 10% of explicit binaural supervision data for the SG network.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rachavarapu_Localize_to_Binauralize_Audio_Spatialization_From_Visual_Sound_Source_Localization_ICCV_2021_paper.pdf", @@ -24720,7 +26394,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Rachavarapu_2021_ICCV,\n \n author = {\n Rachavarapu,\n Kranthi Kumar and Aakanksha and Sundaresha,\n Vignesh and Rajagopalan,\n A. 
N.\n},\n title = {\n Localize to Binauralize: Audio Spatialization From Visual Sound Source Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1930-1939\n} \n}" }, { "title": "Localized Simple Multiple Kernel K-Means", @@ -24728,6 +26403,7 @@ "status": "Poster", "track": "main", "pid": 6491, + "author_site": "Xinwang Liu; Sihang Zhou; Li Liu; Chang Tang; Siwei Wang; Jiyuan Liu; Yi Zhang", "author": "Xinwang Liu; Sihang Zhou; Li Liu; Chang Tang; Siwei Wang; Jiyuan Liu; Yi Zhang", "abstract": "As a representative of multiple kernel clustering (MKC), simple multiple kernel k-means (SimpleMKKM) is recently put forward to boosting the clustering performance by optimally fusing a group of pre-specified kernel matrices. Despite achieving significant improvement in a variety of applications, we find out that SimpleMKKM could indiscriminately force all sample pairs to be equally aligned with the same ideal similarity. As a result, it does not sufficiently take the variation of samples into consideration, leading to unsatisfying clustering performance. To address these issues, this paper proposes a novel MKC algorithm with a \"local\" kernel alignment, which only requires that the similarity of a sample to its k-nearest neighbours be aligned with the ideal similarity matrix. Such an alignment helps the clustering algorithm to focus on closer sample pairs that shall stay together and avoids involving unreliable similarity evaluation for farther sample pairs. After that, we theoretically show that the objective of SimpleMKKM is a special case of this local kernel alignment criterion with normalizing each base kernel matrix. Based on this observation, the proposed localized SimpleMKKM can be readily implemented by existing SimpleMKKM package. 
Moreover, we conduct extensive experiments on several widely used benchmark datasets to evaluate the clustering performance of localized SimpleMKKM. The experimental results have demonstrated that our algorithm consistently outperforms the state-of-the-art ones, verifying the effectiveness of the proposed local kernel alignment criterion. The code of Localized SimpleMKKM is publicly available at: https://github.com/xinwangliu/LocalizedSMKKM.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Localized_Simple_Multiple_Kernel_K-Means_ICCV_2021_paper.pdf", @@ -24751,7 +26427,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Xinwang and Zhou,\n Sihang and Liu,\n Li and Tang,\n Chang and Wang,\n Siwei and Liu,\n Jiyuan and Zhang,\n Yi\n},\n title = {\n Localized Simple Multiple Kernel K-Means\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9293-9301\n} \n}" }, { "title": "Location-Aware Single Image Reflection Removal", @@ -24759,6 +26436,7 @@ "status": "Poster", "track": "main", "pid": 7845, + "author_site": "Zheng Dong; Ke Xu; Yin Yang; Hujun Bao; Weiwei Xu; Rynson W.H. Lau", "author": "Zheng Dong; Ke Xu; Yin Yang; Hujun Bao; Weiwei Xu; Rynson W.H. Lau", "abstract": "This paper proposes a novel location-aware deep-learning-based single image reflection removal method. Our network has a reflection detection module to regress a probabilistic reflection confidence map, taking multi-scale Laplacian features as inputs. This probabilistic map tells if a region is reflection-dominated or transmission-dominated, and it is used as a cue for the network to control the feature flow when predicting the reflection and transmission layers. 
We design our network as a recurrent network to progressively refine reflection removal results at each iteration. The novelty is that we leverage Laplacian kernel parameters to emphasize the boundaries of strong reflections. It is beneficial to strong reflection detection and substantially improves the quality of reflection removal results. Extensive experiments verify the superior performance of the proposed method over state-of-the-art approaches. Our code and the pre-trained model can be found at https://github.com/zdlarr/Location-aware-SIRR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dong_Location-Aware_Single_Image_Reflection_Removal_ICCV_2021_paper.pdf", @@ -24782,7 +26460,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dong_2021_ICCV,\n \n author = {\n Dong,\n Zheng and Xu,\n Ke and Yang,\n Yin and Bao,\n Hujun and Xu,\n Weiwei and Lau,\n Rynson W.H.\n},\n title = {\n Location-Aware Single Image Reflection Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5017-5026\n} \n}" }, { "title": "Long Short View Feature Decomposition via Contrastive Video Representation Learning", @@ -24790,6 +26469,7 @@ "status": "Poster", "track": "main", "pid": 6917, + "author_site": "Nadine Behrmann; Mohsen Fayyaz; Juergen Gall; Mehdi Noroozi", "author": "Nadine Behrmann; Mohsen Fayyaz; Juergen Gall; Mehdi Noroozi", "abstract": "Self-supervised video representation methods typically focus on the representation of temporal attributes in videos. However, the role of stationary versus non-stationary attributes is less explored: Stationary features, which remain similar throughout the video, enable the prediction of video-level action classes. 
Non-stationary features, which represent temporally varying attributes, are more beneficial for downstream tasks involving more fine-grained temporal understanding, such as action segmentation. We argue that a single representation to capture both types of features is sub-optimal, and propose to decompose the representation space into stationary and non-stationary features via contrastive learning from long and short views, i.e. long video sequences and their shorter sub-sequences. Stationary features are shared between the short and long views, while non-stationary features aggregate the short views to match the corresponding long view. To empirically verify our approach, we demonstrate that our stationary features work particularly well on an action recognition downstream task, while our non-stationary features perform better on action segmentation. Furthermore, we analyse the learned representations and find that stationary features capture more temporally stable, static attributes, while non-stationary features encompass more temporally varying ones.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Behrmann_Long_Short_View_Feature_Decomposition_via_Contrastive_Video_Representation_Learning_ICCV_2021_paper.pdf", @@ -24813,7 +26493,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Behrmann_2021_ICCV,\n \n author = {\n Behrmann,\n Nadine and Fayyaz,\n Mohsen and Gall,\n Juergen and Noroozi,\n Mehdi\n},\n title = {\n Long Short View Feature Decomposition via Contrastive Video Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9244-9253\n} \n}" }, { "title": "Long-Term Temporally Consistent Unpaired Video Translation From Simulated Surgical 3D Data", @@ -24821,7 
+26502,8 @@ "status": "Poster", "track": "main", "pid": 7515, - "author": "Dominik Rivoir; Micha Pfeiffer; Reuben Docea; Fiona Kolbinger; Carina Riediger; J\u00fcrgen Weitz; Stefanie Speidel", + "author_site": "Dominik Rivoir; Micha Pfeiffer; Reuben Docea; Fiona Kolbinger; Carina Riediger; Jürgen Weitz; Stefanie Speidel", + "author": "Dominik Rivoir; Micha Pfeiffer; Reuben Docea; Fiona Kolbinger; Carina Riediger; Jürgen Weitz; Stefanie Speidel", "abstract": "Research in unpaired video translation has mainly focused on short-term temporal consistency by conditioning on neighboring frames. However for transfer from simulated to photorealistic sequences, available information on the underlying geometry offers potential for achieving global consistency across views. We propose a novel approach which combines unpaired image translation with neural rendering to transfer simulated to photorealistic surgical abdominal scenes. By introducing global learnable textures and a lighting-invariant view-consistency loss, our method produces consistent translations of arbitrary views and thus enables long-term consistent video synthesis. We design and test our model to generate video sequences from minimally-invasive surgical abdominal scenes. Because labeled data is often limited in this domain, photorealistic data where ground truth information from the simulated domain is preserved is especially relevant. By extending existing image-based methods to view-consistent videos, we aim to impact the applicability of simulated training and evaluation environments for surgical applications. 
Code and data: http://opencas.dkfz.de/video-sim2real.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rivoir_Long-Term_Temporally_Consistent_Unpaired_Video_Translation_From_Simulated_Surgical_3D_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -24835,7 +26517,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rivoir_Long-Term_Temporally_Consistent_Unpaired_Video_Translation_From_Simulated_Surgical_3D_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rivoir_Long-Term_Temporally_Consistent_Unpaired_Video_Translation_From_Simulated_Surgical_3D_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rivoir_2021_ICCV,\n \n author = {\n Rivoir,\n Dominik and Pfeiffer,\n Micha and Docea,\n Reuben and Kolbinger,\n Fiona and Riediger,\n Carina and Weitz,\n J\\\"urgen and Speidel,\n Stefanie\n},\n title = {\n Long-Term Temporally Consistent Unpaired Video Translation From Simulated Surgical 3D Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3343-3353\n} \n}" }, { "title": "LookOut: Diverse Multi-Future Prediction and Planning for Self-Driving", @@ -24843,6 +26526,7 @@ "status": "Poster", "track": "main", "pid": 7104, + "author_site": "Alexander Cui; Sergio Casas; Abbas Sadat; Renjie Liao; Raquel Urtasun", "author": "Alexander Cui; Sergio Casas; Abbas Sadat; Renjie Liao; Raquel Urtasun", "abstract": "In this paper, we present LookOut, a novel autonomy system that perceives the environment, predicts a diverse set of futures of how the scene might unroll and estimates the trajectory of the SDV by optimizing a set of contingency plans over these future realizations. 
In particular, we learn a diverse joint distribution over multi-agent future trajectories in a traffic scene that covers a wide range of future modes with high sample efficiency while leveraging the expressive power of generative models. Unlike previous work in diverse motion forecasting, our diversity objective explicitly rewards sampling future scenarios that require distinct reactions from the self-driving vehicle for improved safety. Our contingency planner then finds comfortable and non-conservative trajectories that ensure safe reactions to a wide range of future scenarios. Through extensive evaluations, we show that our model demonstrates significantly more diverse and sample-efficient motion forecasting in a large-scale self-driving dataset as well as safer and less conservative motion plans in long-term closed-loop simulations when compared to current state-of-the-art models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cui_LookOut_Diverse_Multi-Future_Prediction_and_Planning_for_Self-Driving_ICCV_2021_paper.pdf", @@ -24866,7 +26550,8 @@ "aff_campus_unique_index": ";;1;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "1;1;2;1", - "aff_country_unique": ";Canada;United States" + "aff_country_unique": ";Canada;United States", + "bibtex": "@InProceedings{Cui_2021_ICCV,\n \n author = {\n Cui,\n Alexander and Casas,\n Sergio and Sadat,\n Abbas and Liao,\n Renjie and Urtasun,\n Raquel\n},\n title = {\n LookOut: Diverse Multi-Future Prediction and Planning for Self-Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16107-16116\n} \n}" }, { "title": "Looking Here or There? 
Gaze Following in 360-Degree Images", @@ -24874,6 +26559,7 @@ "status": "Poster", "track": "main", "pid": 8499, + "author_site": "Yunhao Li; Wei Shen; Zhongpai Gao; Yucheng Zhu; Guangtao Zhai; Guodong Guo", "author": "Yunhao Li; Wei Shen; Zhongpai Gao; Yucheng Zhu; Guangtao Zhai; Guodong Guo", "abstract": "Gaze following, i.e., detecting the gaze target of a human subject, in 2D images has become an active topic in computer vision. However, it usually suffers from the out of frame issue due to the limited field-of-view (FoV) of 2D images. In this paper, we introduce a novel task, gaze following in 360-degree images which provide an omnidirectional FoV and can alleviate the out of frame issue. We collect the first dataset, \"GazeFollow360\", for this task, containing around 10,000 360-degree images with complex gaze behaviors under various scenes. Existing 2D gaze following methods suffer from performance degradation in 360-degree images since they may use the assumption that a gaze target is in the 2D gaze sight line. However, this assumption is no longer true for long-distance gaze behaviors in 360-degree images, due to the distortion brought by sphere-to-plane projection. To address this challenge, we propose a 3D sight line guided dual-pathway framework, to detect the gaze target within a local region (here) and from a distant region (there), parallelly. Specifically, the local region is obtained as a 2D cone-shaped field along the 2D projection of the sight line starting at the human subject's head position, and the distant region is obtained by searching along the sight line in 3D sphere space. Finally, the location of the gaze target is determined by fusing the estimations from both the local region and the distant region. 
Experimental results show that our method achieves significant improvements over previous 2D gaze following methods on our GazeFollow360 dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Looking_Here_or_There_Gaze_Following_in_360-Degree_Images_ICCV_2021_paper.pdf", @@ -24890,14 +26576,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Looking_Here_or_There_Gaze_Following_in_360-Degree_Images_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0+0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;Baidu", - "aff_unique_dep": "Institute of Image Communication and Network Engineering;Baidu, Inc.", + "aff_unique_norm": "Shanghai Jiao Tong University;Baidu, Inc.", + "aff_unique_dep": "Institute of Image Communication and Network Engineering;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.baidu.com", "aff_unique_abbr": "SJTU;Baidu", - "aff_campus_unique_index": "0;0;0", + "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yunhao and Shen,\n Wei and Gao,\n Zhongpai and Zhu,\n Yucheng and Zhai,\n Guangtao and Guo,\n Guodong\n},\n title = {\n Looking Here or There? Gaze Following in 360-Degree Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3742-3751\n} \n}" }, { "title": "Low Curvature Activations Reduce Overfitting in Adversarial Training", @@ -24905,6 +26592,7 @@ "status": "Poster", "track": "main", "pid": 3632, + "author_site": "Vasu Singla; Sahil Singla; Soheil Feizi; David Jacobs", "author": "Vasu Singla; Sahil Singla; Soheil Feizi; David Jacobs", "abstract": "Adversarial training is one of the most effective defenses against adversarial attacks. 
Previous works suggest that overfitting is a dominant phenomenon in adversarial training leading to a large generalization gap between test and train accuracy in neural networks. In this work, we show that the observed generalization gap is closely related to the choice of the activation function. In particular, we show that using activation functions with low (exact or approximate) curvature values has a regularization effect that significantly reduces both the standard and robust generalization gaps in adversarial training. We observe this effect for both differentiable/smooth activations such as SiLU as well as non-differentiable/non-smooth activations such as LeakyReLU. In the latter case, the \"approximate\" curvature of the activation is low. Finally, we show that for activation functions with low curvature, the double descent phenomenon for adversarially trained models does not occur.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Singla_Low_Curvature_Activations_Reduce_Overfitting_in_Adversarial_Training_ICCV_2021_paper.pdf", @@ -24928,7 +26616,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Singla_2021_ICCV,\n \n author = {\n Singla,\n Vasu and Singla,\n Sahil and Feizi,\n Soheil and Jacobs,\n David\n},\n title = {\n Low Curvature Activations Reduce Overfitting in Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16423-16433\n} \n}" }, { "title": "Low-Rank Tensor Completion by Approximating the Tensor Average Rank", @@ -24936,6 +26625,7 @@ "status": "Poster", "track": "main", "pid": 8614, + "author_site": "Zhanliang Wang; Junyu Dong; Xinguo Liu; Xueying Zeng", "author": "Zhanliang Wang; Junyu Dong; Xinguo Liu; Xueying Zeng", "abstract": "This 
paper focuses on the problem of low-rank tensor completion, the goal of which is to recover an underlying low-rank tensor from incomplete observations. Our method is motivated by the recently proposed t-product based on any invertible linear transforms. First, we define the new tensor average rank under the invertible real linear transforms. We then propose a new tensor completion model using a nonconvex surrogate to approximate the tensor average rank. This surrogate overcomes the discontinuity of the tensor average rank and alleviates the bias problem caused by the convex relaxation. Further, we develop an efficient algorithm to solve the proposed model and establish its convergence. Finally, experimental results on both synthetic and real data demonstrate the superiority of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Low-Rank_Tensor_Completion_by_Approximating_the_Tensor_Average_Rank_ICCV_2021_paper.pdf", @@ -24959,7 +26649,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zhanliang and Dong,\n Junyu and Liu,\n Xinguo and Zeng,\n Xueying\n},\n title = {\n Low-Rank Tensor Completion by Approximating the Tensor Average Rank\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4612-4620\n} \n}" }, { "title": "Low-Shot Validation: Active Importance Sampling for Estimating Classifier Performance on Rare Categories", @@ -24967,6 +26658,7 @@ "status": "Poster", "track": "main", "pid": 8077, + "author_site": "Fait Poms; Vishnu Sarukkai; Ravi Teja Mullapudi; Nimit S. Sohoni; William R. Mark; Deva Ramanan; Kayvon Fatahalian", "author": "Fait Poms; Vishnu Sarukkai; Ravi Teja Mullapudi; Nimit S. Sohoni; William R. 
Mark; Deva Ramanan; Kayvon Fatahalian", "abstract": "For machine learning models trained with limited labeled training data, validation stands to become the main bottleneck to reducing overall annotation costs. We propose a statistical validation algorithm that accurately estimates the F-score of binary classifiers for rare categories, where finding relevant examples to evaluate on is particularly challenging. Our key insight is that simultaneous calibration and importance sampling enables accurate estimates even in the low-sample regime (<300 samples). Critically, we also derive an accurate single-trial estimator of the variance of our method and demonstrate that this estimator is empirically accurate at low sample counts, enabling a practitioner to know how well they can trust a given low-sample estimate. When validating state-of-the-art semi-supervised models on ImageNet and iNaturalist2017, our method achieves the same estimates of model performance with up to 10x fewer labels than competing approaches. 
In particular, we can estimate model F1 scores with a variance of 0.005 using as few as 100 labels.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Poms_Low-Shot_Validation_Active_Importance_Sampling_for_Estimating_Classifier_Performance_on_ICCV_2021_paper.pdf", @@ -24984,13 +26676,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Poms_Low-Shot_Validation_Active_Importance_Sampling_for_Estimating_Classifier_Performance_on_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;2;1+3;0", "aff_unique_norm": "Stanford University;Carnegie Mellon University;Google;Argo AI", - "aff_unique_dep": ";;Google;", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://www.cmu.edu;https://www.google.com;https://www.argo.ai", "aff_unique_abbr": "Stanford;CMU;Google;Argo AI", "aff_campus_unique_index": "0;0;0;2;;0", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Poms_2021_ICCV,\n \n author = {\n Poms,\n Fait and Sarukkai,\n Vishnu and Mullapudi,\n Ravi Teja and Sohoni,\n Nimit S. and Mark,\n William R. 
and Ramanan,\n Deva and Fatahalian,\n Kayvon\n},\n title = {\n Low-Shot Validation: Active Importance Sampling for Estimating Classifier Performance on Rare Categories\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10705-10714\n} \n}" }, { "title": "Lucas-Kanade Reloaded: End-to-End Super-Resolution From Raw Image Bursts", @@ -24998,6 +26691,7 @@ "status": "Poster", "track": "main", "pid": 3479, + "author_site": "Bruno Lecouat; Jean Ponce; Julien Mairal", "author": "Bruno Lecouat; Jean Ponce; Julien Mairal", "abstract": "This presentation addresses the problem of reconstructing a high-resolution image from multiple lower-resolution snapshots captured from slightly different viewpoints in space and time. Key challenges for solving this super-resolution problem include (i) aligning the input pictures with sub-pixel accuracy, (ii) handling raw (noisy) images for maximal faithfulness to native camera data, and (iii) designing/learning an image prior (regularizer) well suited to the task. We address these three challenges with a hybrid algorithm building on the insight from Wronski et al. that aliasing is an ally in this setting, with parameters that can be learned end to end, while retaining the interpretability of classical approaches to inverse problems. 
The effectiveness of our approach is demonstrated on synthetic and real image bursts, setting a new state of the art on several benchmarks and delivering excellent qualitative results on real raw bursts captured by smartphones and prosumer cameras.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lecouat_Lucas-Kanade_Reloaded_End-to-End_Super-Resolution_From_Raw_Image_Bursts_ICCV_2021_paper.pdf", @@ -25014,14 +26708,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lecouat_Lucas-Kanade_Reloaded_End-to-End_Super-Resolution_From_Raw_Image_Bursts_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+2;1", - "aff_unique_norm": "INRIA;Universite Grenoble Alpes;New York University", + "aff_unique_norm": "Inria;Universite Grenoble Alpes;New York University", "aff_unique_dep": ";;Center for Data Science", "aff_unique_url": "https://www.inria.fr;https://www.univ-grenoble-alpes.fr;https://www.nyu.edu", "aff_unique_abbr": "Inria;UGA;NYU", "aff_campus_unique_index": "0+1;0+2;1", "aff_campus_unique": "Paris;Grenoble;New York", "aff_country_unique_index": "0+0;0+1;0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Lecouat_2021_ICCV,\n \n author = {\n Lecouat,\n Bruno and Ponce,\n Jean and Mairal,\n Julien\n},\n title = {\n Lucas-Kanade Reloaded: End-to-End Super-Resolution From Raw Image Bursts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2370-2379\n} \n}" }, { "title": "M3D-VTON: A Monocular-to-3D Virtual Try-On Network", @@ -25029,6 +26724,7 @@ "status": "Poster", "track": "main", "pid": 8611, + "author_site": "Fuwei Zhao; Zhenyu Xie; Michael Kampffmeyer; Haoye Dong; Songfang Han; Tianxiang Zheng; Tao Zhang; Xiaodan Liang", "author": "Fuwei Zhao; Zhenyu Xie; Michael Kampffmeyer; Haoye Dong; Songfang Han; Tianxiang Zheng; Tao Zhang; Xiaodan 
Liang", "abstract": "Virtual 3D try-on can provide an intuitive and realistic view for online shopping and has a huge potential commercial value. However, existing 3D virtual try-on methods mainly rely on annotated 3D human shapes and garment templates, which hinders their applications in practical scenarios. 2D virtual try-on approaches provide a faster alternative to manipulate clothed humans, but lack the rich and realistic 3D representation. In this paper, we propose a novel Monocular-to-3D Virtual Try-On Network (M3D-VTON) that builds on the merits of both 2D and 3D approaches. By integrating 2D information efficiently and learning a mapping that lifts the 2D representation to 3D, we make the first attempt to reconstruct a 3D try-on mesh only taking the target clothing and a person image as inputs. The proposed M3D-VTON includes three modules: 1) The Monocular Prediction Module (MPM) that estimates an initial full-body depth map and accomplishes 2D clothes-person alignment through a novel two-stage warping procedure; 2) The Depth Refinement Module (DRM) that refines the initial body depth to produce more detailed pleat and face characteristics; 3) The Texture Fusion Module (TFM) that fuses the warped clothing with the non-target body part to refine the results. We also construct a high-quality synthesized Monocular-to-3D virtual try-on dataset, in which each person image is associated with a front and a back depth map. 
Extensive experiments demonstrate that the proposed M3D-VTON can manipulate and reconstruct the 3D human body wearing the given clothing with compelling details and is more efficient than other 3D approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_M3D-VTON_A_Monocular-to-3D_Virtual_Try-On_Network_ICCV_2021_paper.pdf", @@ -25045,14 +26741,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_M3D-VTON_A_Monocular-to-3D_Virtual_Try-On_Network_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;2;3;3;0", - "aff_unique_norm": "Sun Yat-sen University;Arctic University of Norway;University of California, San Diego;Momo", + "aff_unique_norm": "Sun Yat-sen University;The Arctic University of Norway;University of California, San Diego;Momo", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.uit.no;https://www.ucsd.edu;", "aff_unique_abbr": "SYSU;UiT;UCSD;", "aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "Shenzhen;;San Diego", "aff_country_unique_index": "0;0;1;0;2;0", - "aff_country_unique": "China;Norway;United States;" + "aff_country_unique": "China;Norway;United States;", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Fuwei and Xie,\n Zhenyu and Kampffmeyer,\n Michael and Dong,\n Haoye and Han,\n Songfang and Zheng,\n Tianxiang and Zhang,\n Tao and Liang,\n Xiaodan\n},\n title = {\n M3D-VTON: A Monocular-to-3D Virtual Try-On Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13239-13249\n} \n}" }, { "title": "MAAS: Multi-Modal Assignation for Active Speaker Detection", @@ -25060,7 +26757,8 @@ "status": "Poster", "track": "main", "pid": 1105, - "author": "Juan L\u00e9on Alc\u00e1zar; Fabian Caba; Ali K. Thabet; Bernard Ghanem", + "author_site": "Juan Léon Alcázar; Fabian Caba; Ali K. 
Thabet; Bernard Ghanem", + "author": "Juan Léon Alcázar; Fabian Caba; Ali K. Thabet; Bernard Ghanem", "abstract": "Active speaker detection requires a solid integration of multi-modal cues. While individual modalities can approximate a solution, accurate predictions can only be achieved by explicitly fusing the audio and visual features and modeling their temporal progression. Despite its inherent muti-modal nature, current methods still focus on modeling and fusing short-term audiovisual features for individual speakers, often at frame level. In this paper we present a novel approach to active speaker detection that directly addresses the multi-modal nature of the problem, and provides a straightforward strategy where independent visual features from potential speakers in the scene are assigned to a previously detected speech event. Our experiments show that, an small graph data structure built from local information, allows to approximate an instantaneous audio-visual assignment problem. Moreover, the temporal extension of this initial graph achieves a new state-of-the-art performance on the AVA-ActiveSpeaker dataset with a mAP of 88.8%.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Alcazar_MAAS_Multi-Modal_Assignation_for_Active_Speaker_Detection_ICCV_2021_paper.pdf", "aff": "King Abdullah University of Science and Technology (KAUST); Adobe Research; King Abdullah University of Science and Technology (KAUST); King Abdullah University of Science and Technology (KAUST)", @@ -25083,7 +26781,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "Saudi Arabia;United States" + "aff_country_unique": "Saudi Arabia;United States", + "bibtex": "@InProceedings{Alcazar_2021_ICCV,\n \n author = {\n Alc\\'azar,\n Juan L\\'eon and Caba,\n Fabian and Thabet,\n Ali K. 
and Ghanem,\n Bernard\n},\n title = {\n MAAS: Multi-Modal Assignation for Active Speaker Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 265-274\n} \n}" }, { "title": "MBA-VO: Motion Blur Aware Visual Odometry", @@ -25091,6 +26790,7 @@ "status": "Poster", "track": "main", "pid": 5917, + "author_site": "Peidong Liu; Xingxing Zuo; Viktor Larsson; Marc Pollefeys", "author": "Peidong Liu; Xingxing Zuo; Viktor Larsson; Marc Pollefeys", "abstract": "Motion blur is one of the major challenges remaining for visual odometry methods. In low-light conditions where longer exposure times are necessary, motion blur can appear even for relatively slow camera motions. In this paper we present a novel hybrid visual odometry pipeline with direct approach that explicitly models and estimates the camera's local trajectory within exposure time. This allows us to actively compensate for any motion blur that occurs due to the camera motion. In addition, we also contribute a novel benchmarking dataset for motion blur aware visual odometry. 
In experiments we show that by directly modeling the image formation process we are able to improve robustness of the visual odometry, while keeping comparable accuracy as that for images without motion blur.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_MBA-VO_Motion_Blur_Aware_Visual_Odometry_ICCV_2021_paper.pdf", @@ -25105,7 +26805,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_MBA-VO_Motion_Blur_Aware_Visual_Odometry_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_MBA-VO_Motion_Blur_Aware_Visual_Odometry_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Peidong and Zuo,\n Xingxing and Larsson,\n Viktor and Pollefeys,\n Marc\n},\n title = {\n MBA-VO: Motion Blur Aware Visual Odometry\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5550-5559\n} \n}" }, { "title": "MDETR - Modulated Detection for End-to-End Multi-Modal Understanding", @@ -25113,6 +26814,7 @@ "status": "Poster", "track": "main", "pid": 6653, + "author_site": "Aishwarya Kamath; Mannat Singh; Yann LeCun; Gabriel Synnaeve; Ishan Misra; Nicolas Carion", "author": "Aishwarya Kamath; Mannat Singh; Yann LeCun; Gabriel Synnaeve; Ishan Misra; Nicolas Carion", "abstract": "Multi-modal reasoning systems rely on a pre-trained object detector to extract regions of interest from the image. However, this crucial module is typically used as a black box, trained independently of the downstream task and on a fixed vocabulary of objects and attributes. This makes it challenging for such systems to capture the long tail of visual concepts expressed in free form text. In this paper we propose MDETR, an end-to-end modulated detector that detects objects in an image conditioned on a raw text query, like a caption or a question. 
We use a transformer-based architecture to reason jointly over text and image by fusing the two modalities at an early stage of the model. We pre-train the network on 1.3M text-image pairs, mined from pre-existing multi-modal datasets having explicit alignment between phrases in text and objects in the image. We then fine-tune on several downstream tasks such as phrase grounding, referring expression comprehension and segmentation, achieving state-of-the-art results on popular benchmarks. We also investigate the utility of our model as an object detector on a given label set when fine-tuned in a few-shot setting. We show that our pre-training approach provides a way to handle the long tail of object categories which have very few labelled instances. Our approach can be easily extended for visual question answering, achieving competitive performance on GQA and CLEVR. The code and models are available at https://github.com/ashkamath/mdetr.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kamath_MDETR_-_Modulated_Detection_for_End-to-End_Multi-Modal_Understanding_ICCV_2021_paper.pdf", @@ -25129,14 +26831,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kamath_MDETR_-_Modulated_Detection_for_End-to-End_Multi-Modal_Understanding_ICCV_2021_paper.html", "aff_unique_index": "0;1;0+1+0;1;1;0", - "aff_unique_norm": "New York University;Meta", + "aff_unique_norm": "New York University;Facebook", "aff_unique_dep": "Center for Data Science;Facebook AI Research", "aff_unique_url": "https://www.nyu.edu;https://research.facebook.com", "aff_unique_abbr": "NYU;FAIR", "aff_campus_unique_index": "0;0+0;0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0;0;0+0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kamath_2021_ICCV,\n \n author = {\n Kamath,\n Aishwarya and Singh,\n Mannat and LeCun,\n Yann and Synnaeve,\n Gabriel and Misra,\n Ishan and 
Carion,\n Nicolas\n},\n title = {\n MDETR - Modulated Detection for End-to-End Multi-Modal Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1780-1790\n} \n}" }, { "title": "ME-PCN: Point Completion Conditioned on Mask Emptiness", @@ -25144,6 +26847,7 @@ "status": "Poster", "track": "main", "pid": 3149, + "author_site": "Bingchen Gong; Yinyu Nie; Yiqun Lin; Xiaoguang Han; Yizhou Yu", "author": "Bingchen Gong; Yinyu Nie; Yiqun Lin; Xiaoguang Han; Yizhou Yu", "abstract": "Point completion refers to completing the missing geometries of an object from incomplete observations. Main-stream methods predict the missing shapes by decoding a global feature learned from the input point cloud, which often leads to deficient results in preserving topology consistency and surface details. In this work, we present ME-PCN, a point completion network that leverages `emptiness' in 3D shape space. Given a single depth scan, previous methods often encode the occupied partial shapes while ignoring the empty regions (e.g. holes) in depth maps. In contrast, we argue that these `emptiness' clues indicate shape boundaries that can be used to improve topology representation and detail granularity on surfaces. Specifically, our ME-PCN encodes both the occupied point cloud and the neighboring `empty points'. It estimates coarse-grained but complete and reasonable surface points in the first stage, followed by a refinement stage to produce fine-grained surface details. Comprehensive experiments verify that our ME-PCN presents better qualitative and quantitative performance against the state-of-the-art. 
Besides, we further prove that our `emptiness' design is lightweight and easy to embed in existing methods, which shows consistent effectiveness in improving the CD and EMD scores.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gong_ME-PCN_Point_Completion_Conditioned_on_Mask_Emptiness_ICCV_2021_paper.pdf", @@ -25160,14 +26864,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gong_ME-PCN_Point_Completion_Conditioned_on_Mask_Emptiness_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2+0;0", - "aff_unique_norm": "University of Hong Kong;Technical University of Munich;Chinese University of Hong Kong, Shenzhen", + "aff_unique_norm": "The University of Hong Kong;Technical University of Munich;The Chinese University of Hong Kong, Shenzhen", "aff_unique_dep": ";;School of Software Engineering", "aff_unique_url": "https://www.hku.hk;https://www.tum.de;https://www.cuhk.edu.cn", "aff_unique_abbr": "HKU;TUM;CUHK(SZ)", "aff_campus_unique_index": "0;2;2+0;0", "aff_campus_unique": "Hong Kong SAR;;Shenzhen", "aff_country_unique_index": "0;1;0;0+0;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Gong_2021_ICCV,\n \n author = {\n Gong,\n Bingchen and Nie,\n Yinyu and Lin,\n Yiqun and Han,\n Xiaoguang and Yu,\n Yizhou\n},\n title = {\n ME-PCN: Point Completion Conditioned on Mask Emptiness\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12488-12497\n} \n}" }, { "title": "MEDIRL: Predicting the Visual Attention of Drivers via Maximum Entropy Deep Inverse Reinforcement Learning", @@ -25175,6 +26880,7 @@ "status": "Poster", "track": "main", "pid": 7007, + "author_site": "Sonia Baee; Erfan Pakdamanian; Inki Kim; Lu Feng; Vicente Ordonez; Laura Barnes", "author": "Sonia Baee; Erfan Pakdamanian; Inki Kim; Lu Feng; Vicente Ordonez; Laura Barnes", "abstract": 
"Inspired by human visual attention, we propose a novel inverse reinforcement learning formulation using Maximum Entropy Deep Inverse Reinforcement Learning (MEDIRL) for predicting the visual attention of drivers in accident-prone situations. MEDIRL predicts fixation locations that lead to maximal rewards by learning a task-sensitive reward function from eye fixation patterns recorded from attentive drivers. Additionally, we introduce EyeCar, a new driver attention dataset in accident-prone situations. We conduct comprehensive experiments to evaluate our proposed model on three common benchmarks: (DR(eye)VE, BDD-A, DADA-2000), and our EyeCar dataset. Results indicate that MEDIRL outperforms existing models for predicting attention and achieves state-of-the-art performance. We present extensive ablation studies to provide more insights into different features of our proposed model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Baee_MEDIRL_Predicting_the_Visual_Attention_of_Drivers_via_Maximum_Entropy_ICCV_2021_paper.pdf", @@ -25191,14 +26897,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Baee_MEDIRL_Predicting_the_Visual_Attention_of_Drivers_via_Maximum_Entropy_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;2;0", - "aff_unique_norm": "University of Virginia;University of Illinois Urbana-Champaign;Rice University", + "aff_unique_norm": "University of Virginia;University of Illinois at Urbana-Champaign;Rice University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.virginia.edu;https://illinois.edu;https://www.rice.edu", "aff_unique_abbr": "UVA;UIUC;Rice", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Baee_2021_ICCV,\n \n author = {\n Baee,\n Sonia and Pakdamanian,\n Erfan and Kim,\n Inki and Feng,\n Lu and Ordonez,\n Vicente 
and Barnes,\n Laura\n},\n title = {\n MEDIRL: Predicting the Visual Attention of Drivers via Maximum Entropy Deep Inverse Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13178-13188\n} \n}" }, { "title": "MFNet: Multi-Filter Directive Network for Weakly Supervised Salient Object Detection", @@ -25206,6 +26913,7 @@ "status": "Poster", "track": "main", "pid": 2755, + "author_site": "Yongri Piao; Jian Wang; Miao Zhang; Huchuan Lu", "author": "Yongri Piao; Jian Wang; Miao Zhang; Huchuan Lu", "abstract": "Weakly supervised salient object detection (WSOD) targets to train a CNNs-based saliency network using only low-cost annotations. Existing WSOD methods take various techniques to pursue single \"high-quality\" pseudo label from low-cost annotations and then develop their saliency networks. Though these methods have achieved good performance, the generated single label is inevitably affected by adopted refinement algorithms and shows prejudiced characteristics which further influence the saliency networks. In this work, we introduce a new multiple-pseudo label framework to integrate more comprehensive and accurate saliency cues from multiple labels, avoiding the aforementioned problem. Specifically, we propose a multi-filer directive network (MFNet) including a saliency network as well as multiple directive filters. The directive filter (DF) is designed to extract and filter more accurate saliency cues from the noisy pseudo labels. The multiple accurate cues from multiple DFs are then simultaneously propagated to the saliency network with a multi-guidance loss. Extensive experiments on five datasets over four metrics demonstrate that our method outperforms all the existing congeneric methods. 
Moreover, it is also worth noting that our framework is flexible enough to apply to existing methods and improve their performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Piao_MFNet_Multi-Filter_Directive_Network_for_Weakly_Supervised_Salient_Object_Detection_ICCV_2021_paper.pdf", @@ -25229,7 +26937,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Piao_2021_ICCV,\n \n author = {\n Piao,\n Yongri and Wang,\n Jian and Zhang,\n Miao and Lu,\n Huchuan\n},\n title = {\n MFNet: Multi-Filter Directive Network for Weakly Supervised Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4136-4145\n} \n}" }, { "title": "MG-GAN: A Multi-Generator Model Preventing Out-of-Distribution Samples in Pedestrian Trajectory Prediction", @@ -25237,7 +26946,8 @@ "status": "Poster", "track": "main", "pid": 7501, - "author": "Patrick Dendorfer; Sven Elflein; Laura Leal-Taix\u00e9", + "author_site": "Patrick Dendorfer; Sven Elflein; Laura Leal-Taixé", + "author": "Patrick Dendorfer; Sven Elflein; Laura Leal-Taixé", "abstract": "Pedestrian trajectory prediction is challenging due to its uncertain and multimodal nature. While generative adversarial networks can learn a distribution over future trajectories, they tend to predict out-of-distribution samples when the distribution of future trajectories is a mixture of multiple, possibly disconnected modes. To address this issue, we propose a multi-generator model for pedestrian trajectory prediction. 
Each generator specializes in learning a distribution over trajectories routing towards one of the primary modes in the scene, while a second network learns a categorical distribution over these generators, conditioned on the dynamics and scene input. This architecture allows us to effectively sample from specialized generators and to significantly reduce the out-of-distribution samples compared to single generator methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dendorfer_MG-GAN_A_Multi-Generator_Model_Preventing_Out-of-Distribution_Samples_in_Pedestrian_Trajectory_ICCV_2021_paper.pdf", "aff": "Technical University Munich; Technical University Munich; Technical University Munich", @@ -25260,7 +26970,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Dendorfer_2021_ICCV,\n \n author = {\n Dendorfer,\n Patrick and Elflein,\n Sven and Leal-Taix\\'e,\n Laura\n},\n title = {\n MG-GAN: A Multi-Generator Model Preventing Out-of-Distribution Samples in Pedestrian Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13158-13167\n} \n}" }, { "title": "MGNet: Monocular Geometric Scene Understanding for Autonomous Driving", @@ -25268,7 +26979,8 @@ "status": "Poster", "track": "main", "pid": 8933, - "author": "Markus Sch\u00f6n; Michael Buchholz; Klaus Dietmayer", + "author_site": "Markus Schön; Michael Buchholz; Klaus Dietmayer", + "author": "Markus Schön; Michael Buchholz; Klaus Dietmayer", "abstract": "We introduce MGNet, a multi-task framework for monocular geometric scene understanding. We define monocular geometric scene understanding as the combination of two known tasks: Panoptic segmentation and self-supervised monocular depth estimation. 
Panoptic segmentation captures the full scene not only semantically, but also on an instance basis. Self-supervised monocular depth estimation uses geometric constraints derived from the camera measurement model in order to measure depth from monocular video sequences only. To the best of our knowledge, we are the first to propose the combination of these two tasks in one single model. Our model is designed with focus on low latency to provide fast inference in real-time on a single consumer-grade GPU. During deployment, our model produces dense 3D point clouds with instance aware semantic labels from single high-resolution camera images. We evaluate our model on two popular autonomous driving benchmarks, i.e., Cityscapes and KITTI, and show competitive performance among other real-time capable methods. Source code is available at https://github.com/markusschoen/MGNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Schon_MGNet_Monocular_Geometric_Scene_Understanding_for_Autonomous_Driving_ICCV_2021_paper.pdf", "aff": "Institute of Measurement, Control and Microtechnology, Ulm University; Institute of Measurement, Control and Microtechnology, Ulm University; Institute of Measurement, Control and Microtechnology, Ulm University", @@ -25291,7 +27003,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Schon_2021_ICCV,\n \n author = {\n Sch\\"on,\n Markus and Buchholz,\n Michael and Dietmayer,\n Klaus\n},\n title = {\n MGNet: Monocular Geometric Scene Understanding for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15804-15815\n} \n}" }, { "title": "MGSampler: An Explainable Sampling Strategy for Video Action Recognition", @@ -25299,6 +27012,7 @@ "status": "Poster", "track": 
"main", "pid": 9731, + "author_site": "Yuan Zhi; Zhan Tong; Limin Wang; Gangshan Wu", "author": "Yuan Zhi; Zhan Tong; Limin Wang; Gangshan Wu", "abstract": "Frame sampling is a fundamental problem in video action recognition due to the essential redundancy in time and limited computation resources. The existing sampling strategy often employs a fixed frame selection and lacks the flexibility to deal with complex variations in videos. In this paper, we present a simple, sparse, and explainable frame sampler, termed as Motion-Guided Sampler (MGSampler). Our basic motivation is that motion is an important and universal signal that can drive us to adaptively select frames from videos. Accordingly, we propose two important properties in our MGSampler design: motion sensitive and motion uniform. First, we present two different motion representations to enable us to efficiently distinguish the motion-salient frames from the background. Then, we devise a motion-uniform sampling strategy based on the cumulative motion distribution to ensure the sampled frames evenly cover all the important segments with high motion salience. Our MGSampler yields a new principled and holistic sample scheme, that could be incorporated into any existing video architecture. 
Experiments on five benchmarks demonstrate the effectiveness of our MGSampler over previous fixed sampling strategies, and its generalization power across different backbones, video models, and datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhi_MGSampler_An_Explainable_Sampling_Strategy_for_Video_Action_Recognition_ICCV_2021_paper.pdf", @@ -25322,7 +27036,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhi_2021_ICCV,\n \n author = {\n Zhi,\n Yuan and Tong,\n Zhan and Wang,\n Limin and Wu,\n Gangshan\n},\n title = {\n MGSampler: An Explainable Sampling Strategy for Video Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1513-1522\n} \n}" }, { "title": "MINE: Towards Continuous Depth MPI With NeRF for Novel View Synthesis", @@ -25330,6 +27045,7 @@ "status": "Poster", "track": "main", "pid": 4168, + "author_site": "Jiaxin Li; Zijian Feng; Qi She; Henghui Ding; Changhu Wang; Gim Hee Lee", "author": "Jiaxin Li; Zijian Feng; Qi She; Henghui Ding; Changhu Wang; Gim Hee Lee", "abstract": "In this paper, we propose MINE to perform novel view synthesis and depth estimation via dense 3D reconstruction from a single image. Our approach is a continuous depth generalization of the Multiplane Images (MPI) by introducing the NEural radiance fields (NeRF). Given a single image as input, MINE predicts a 4-channel image (RGB and volume density) at arbitrary depth values to jointly reconstruct the camera frustum and fill in occluded contents. The reconstructed and inpainted frustum can then be easily rendered into novel RGB or depth views using differentiable rendering. 
Extensive experiments on RealEstate10K, KITTI and Flowers Light Fields show that our MINE outperforms state-of-the-art by a large margin in novel view synthesis. We also achieve competitive results in depth estimation on iBims-1 and NYU-v2 without annotated depth supervision. Our source code is available at https://github.com/vincentfung13/MINE", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_MINE_Towards_Continuous_Depth_MPI_With_NeRF_for_Novel_View_ICCV_2021_paper.pdf", @@ -25353,7 +27069,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Jiaxin and Feng,\n Zijian and She,\n Qi and Ding,\n Henghui and Wang,\n Changhu and Lee,\n Gim Hee\n},\n title = {\n MINE: Towards Continuous Depth MPI With NeRF for Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12578-12588\n} \n}" }, { "title": "MLVSNet: Multi-Level Voting Siamese Network for 3D Visual Tracking", @@ -25361,6 +27078,7 @@ "status": "Poster", "track": "main", "pid": 2850, + "author_site": "Zhoutao Wang; Qian Xie; Yu-Kun Lai; Jing Wu; Kun Long; Jun Wang", "author": "Zhoutao Wang; Qian Xie; Yu-Kun Lai; Jing Wu; Kun Long; Jun Wang", "abstract": "Benefiting from the excellent performance of Siamese-based trackers, huge progress on 2D visual tracking has been achieved. However, 3D visual tracking is still under-explored. Inspired by the idea of Hough voting in 3D object detection, in this paper, we propose a Multi-level Voting Siamese Network (MLVSNet) for 3D visual tracking from outdoor point cloud sequences. 
To deal with sparsity in outdoor 3D point clouds, we propose to perform Hough voting on multi-level features to get more vote centers and retain more useful information, instead of voting only on the final level feature as in previous methods. We also design an efficient and lightweight Target-Guided Attention (TGA) module to transfer the target information and highlight the target points in the search area. Moreover, we propose a Vote-cluster Feature Enhancement (VFE) module to exploit the relationships between different vote clusters. Extensive experiments on the 3D tracking benchmark of KITTI dataset demonstrate that our MLVSNet outperforms state-of-the-art methods with significant margins. Code will be available at https://github.com/CodeWZT/MLVSNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_MLVSNet_Multi-Level_Voting_Siamese_Network_for_3D_Visual_Tracking_ICCV_2021_paper.pdf", @@ -25384,7 +27102,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zhoutao and Xie,\n Qian and Lai,\n Yu-Kun and Wu,\n Jing and Long,\n Kun and Wang,\n Jun\n},\n title = {\n MLVSNet: Multi-Level Voting Siamese Network for 3D Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3101-3110\n} \n}" }, { "title": "MOTSynth: How Can Synthetic Data Help Pedestrian Detection and Tracking?", @@ -25392,7 +27111,8 @@ "status": "Poster", "track": "main", "pid": 1920, - "author": "Matteo Fabbri; Guillem Bras\u00f3; Gianluca Maugeri; Orcun Cetintas; Riccardo Gasparini; Aljo\u0161a O\u0161ep; Simone Calderara; Laura Leal-Taix\u00e9; Rita Cucchiara", + "author_site": "Matteo Fabbri; Guillem Brasó; Gianluca Maugeri; Orcun Cetintas; 
Riccardo Gasparini; Aljoša Ošep; Simone Calderara; Laura Leal-Taixé; Rita Cucchiara", + "author": "Matteo Fabbri; Guillem Brasó; Gianluca Maugeri; Orcun Cetintas; Riccardo Gasparini; Aljoša Ošep; Simone Calderara; Laura Leal-Taixé; Rita Cucchiara", "abstract": "Deep learning-based methods for video pedestrian detection and tracking require large volumes of training data to achieve good performance. However, data acquisition in crowded public environments raises data privacy concerns -- we are not allowed to simply record and store data without the explicit consent of all participants. Furthermore, the annotation of such data for computer vision applications usually requires a substantial amount of manual effort, especially in the video domain. Labeling instances of pedestrians in highly crowded scenarios can be challenging even for human annotators and may introduce errors in the training data. In this paper, we study how we can advance different aspects of multi-person tracking using solely synthetic data. To this end, we generate MOTSynth, a large, highly diverse synthetic dataset for object detection and tracking using a rendering game engine. 
Our experiments show that MOTSynth can be used as a replacement for real data on tasks such as pedestrian detection, re-identification, segmentation, and tracking.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fabbri_MOTSynth_How_Can_Synthetic_Data_Help_Pedestrian_Detection_and_Tracking_ICCV_2021_paper.pdf", "aff": "University of Modena and Reggio Emilia, Italy+GoatAI S.r.l.; Technical University of Munich, Germany; University of Modena and Reggio Emilia, Italy+GoatAI S.r.l.; Technical University of Munich, Germany; University of Modena and Reggio Emilia, Italy+GoatAI S.r.l.; Technical University of Munich, Germany; University of Modena and Reggio Emilia, Italy; Technical University of Munich, Germany; University of Modena and Reggio Emilia, Italy", @@ -25415,7 +27135,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0+0;1;0+0;1;0;1;0", - "aff_country_unique": "Italy;Germany" + "aff_country_unique": "Italy;Germany", + "bibtex": "@InProceedings{Fabbri_2021_ICCV,\n \n author = {\n Fabbri,\n Matteo and Bras\\'o,\n Guillem and Maugeri,\n Gianluca and Cetintas,\n Orcun and Gasparini,\n Riccardo and O\\v{s\n}ep,\n Aljo\\v{s\n}a and Calderara,\n Simone and Leal-Taix\\'e,\n Laura and Cucchiara,\n Rita\n},\n title = {\n MOTSynth: How Can Synthetic Data Help Pedestrian Detection and Tracking?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10849-10859\n} \n}" }, { "title": "MSR-GCN: Multi-Scale Residual Graph Convolution Networks for Human Motion Prediction", @@ -25423,6 +27144,7 @@ "status": "Poster", "track": "main", "pid": 7627, + "author_site": "Lingwei Dang; Yongwei Nie; Chengjiang Long; Qing Zhang; Guiqing Li", "author": "Lingwei Dang; Yongwei Nie; Chengjiang Long; Qing Zhang; Guiqing Li", "abstract": "Human motion prediction is a challenging task due to the stochasticity and 
aperiodicity of future poses. Recently, graph convolutional network has been proven to be very effective to learn dynamic relations among pose joints, which is helpful for pose prediction. On the other hand, one can abstract a human pose recursively to obtain a set of poses at multiple scales. With the increase of the abstraction level, the motion of the pose becomes more stable, which benefits pose prediction too. In this paper, we propose a novel Multi-Scale Residual Graph Convolution Network (MSR-GCN) for human pose prediction task in the manner of end-to-end. The GCNs are used to extract features from fine to coarse scale and then from coarse to fine scale. The extracted features at each scale are then combined and decoded to obtain the residuals between the input and target poses. Intermediate supervisions are imposed on all the predicted poses, which enforces the network to learn more representative features. Our proposed approach is evaluated on two standard benchmark datasets, i.e., the Human3.6M dataset and the CMU Mocap dataset. Experimental results demonstrate that our method outperforms the state-of-the-art approaches. 
Code and pre-trained models are available at https://github.com/Droliven/MSRGCN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dang_MSR-GCN_Multi-Scale_Residual_Graph_Convolution_Networks_for_Human_Motion_Prediction_ICCV_2021_paper.pdf", @@ -25439,14 +27161,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dang_MSR-GCN_Multi-Scale_Residual_Graph_Convolution_Networks_for_Human_Motion_Prediction_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "South China University of Technology;JD;Sun Yat-sen University", - "aff_unique_dep": "School of Computer Science and Engineering;JD Finance America Corporation;School of Computer Science and Engineering", + "aff_unique_norm": "South China University of Technology;JD Finance America Corporation;Sun Yat-sen University", + "aff_unique_dep": "School of Computer Science and Engineering;;School of Computer Science and Engineering", "aff_unique_url": "https://www.scut.edu.cn;;http://www.sysu.edu.cn", "aff_unique_abbr": "SCUT;;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dang_2021_ICCV,\n \n author = {\n Dang,\n Lingwei and Nie,\n Yongwei and Long,\n Chengjiang and Zhang,\n Qing and Li,\n Guiqing\n},\n title = {\n MSR-GCN: Multi-Scale Residual Graph Convolution Networks for Human Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11467-11476\n} \n}" }, { "title": "MT-ORL: Multi-Task Occlusion Relationship Learning", @@ -25454,6 +27177,7 @@ "status": "Poster", "track": "main", "pid": 5569, + "author_site": "Panhe Feng; Qi She; Lei Zhu; Jiaxin Li; Lin Zhang; Zijian Feng; Changhu Wang; Chunpeng Li; Xuejing Kang; Anlong Ming", "author": "Panhe Feng; Qi 
She; Lei Zhu; Jiaxin Li; Lin Zhang; Zijian Feng; Changhu Wang; Chunpeng Li; Xuejing Kang; Anlong Ming", "abstract": "Retrieving occlusion relation among objects in a single image is challenging due to sparsity of boundaries in image. We observe two key issues in existing works: firstly, lack of an architecture which can exploit the limited amount of coupling in the decoder stage between the two subtasks, namely occlusion boundary extraction and occlusion orientation prediction, and secondly, improper representation of occlusion orientation. In this paper, we propose a novel architecture called Occlusion-shared and Path-separated Network (OPNet), which solves the first issue by exploiting rich occlusion cues in shared high-level features and structured spatial information in task-specific low-level features. We then design a simple but effective orthogonal occlusion representation (OOR) to tackle the second issue. Our method surpasses the state-of-the-art methods by 6.1%/8.3% Boundary-AP and 6.5%/10% Orientation-AP on standard PIOD/BSDS ownership datasets. 
Code is available at https://github.com/fengpanhe/MT-ORL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_MT-ORL_Multi-Task_Occlusion_Relationship_Learning_ICCV_2021_paper.pdf", @@ -25477,7 +27201,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Panhe and She,\n Qi and Zhu,\n Lei and Li,\n Jiaxin and Zhang,\n Lin and Feng,\n Zijian and Wang,\n Changhu and Li,\n Chunpeng and Kang,\n Xuejing and Ming,\n Anlong\n},\n title = {\n MT-ORL: Multi-Task Occlusion Relationship Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9364-9373\n} \n}" }, { "title": "MUSIQ: Multi-Scale Image Quality Transformer", @@ -25485,6 +27210,7 @@ "status": "Poster", "track": "main", "pid": 10958, + "author_site": "Junjie Ke; Qifei Wang; Yilin Wang; Peyman Milanfar; Feng Yang", "author": "Junjie Ke; Qifei Wang; Yilin Wang; Peyman Milanfar; Feng Yang", "abstract": "Image quality assessment (IQA) is an important research topic for understanding and improving visual experience. The current state-of-the-art IQA methods are based on convolutional neural networks (CNNs). The performance of CNN-based models is often compromised by the fixed shape constraint in batch training. To accommodate this, the input images are usually resized and cropped to a fixed shape, causing image quality degradation. To address this, we design a multi-scale image quality Transformer (MUSIQ) to process native resolution images with varying sizes and aspect ratios. With a multi-scale image representation, our proposed method can capture image quality at different granularities. 
Furthermore, a novel hash-based 2D spatial embedding and a scale embedding is proposed to support the positional embedding in the multi-scale representation. Experimental results verify that our method can achieve state-of-the-art performance on multiple large scale IQA datasets such as PaQ-2-PiQ, SPAQ and KonIQ-10k.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ke_MUSIQ_Multi-Scale_Image_Quality_Transformer_ICCV_2021_paper.pdf", @@ -25508,7 +27234,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ke_2021_ICCV,\n \n author = {\n Ke,\n Junjie and Wang,\n Qifei and Wang,\n Yilin and Milanfar,\n Peyman and Yang,\n Feng\n},\n title = {\n MUSIQ: Multi-Scale Image Quality Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5148-5157\n} \n}" }, { "title": "MVSNeRF: Fast Generalizable Radiance Field Reconstruction From Multi-View Stereo", @@ -25516,6 +27243,7 @@ "status": "Poster", "track": "main", "pid": 3801, + "author_site": "Anpei Chen; Zexiang Xu; Fuqiang Zhao; Xiaoshuai Zhang; Fanbo Xiang; Jingyi Yu; Hao Su", "author": "Anpei Chen; Zexiang Xu; Fuqiang Zhao; Xiaoshuai Zhang; Fanbo Xiang; Jingyi Yu; Hao Su", "abstract": "We present MVSNeRF, a novel neural rendering approach that can efficiently reconstruct neural radiance fields for view synthesis. Unlike prior works on neural radiance fields that consider per-scene optimization on densely captured images, we propose a generic deep neural network that can reconstruct radiance fields from only three nearby input views via fast network inference. 
Our approach leverages plane-swept cost volumes (widely used in multi-view stereo) for geometry-aware scene reasoning, and combines this with physically based volume rendering for neural radiance field reconstruction. We train our network on real objects in the DTU dataset, and test it on three different datasets to evaluate its effectiveness and generalizability. Our approach can generalize across scenes (even indoor scenes, completely different from our training scenes of objects) and generate realistic view synthesis results using only three input images, significantly outperforming concurrent works on generalizable radiance field reconstruction. Moreover, if dense images are captured, our estimated radiance field representation can be easily fine-tuned; this leads to fast per-scene reconstruction with higher rendering quality and substantially less optimization time than NeRF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_MVSNeRF_Fast_Generalizable_Radiance_Field_Reconstruction_From_Multi-View_Stereo_ICCV_2021_paper.pdf", @@ -25539,7 +27267,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;0;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Anpei and Xu,\n Zexiang and Zhao,\n Fuqiang and Zhang,\n Xiaoshuai and Xiang,\n Fanbo and Yu,\n Jingyi and Su,\n Hao\n},\n title = {\n MVSNeRF: Fast Generalizable Radiance Field Reconstruction From Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14124-14133\n} \n}" }, { "title": "MVTN: Multi-View Transformation Network for 3D Shape Recognition", @@ -25547,6 +27276,7 @@ "status": "Poster", "track": "main", "pid": 1133, + "author_site": "Abdullah Hamdi; Silvio Giancola; Bernard Ghanem", 
"author": "Abdullah Hamdi; Silvio Giancola; Bernard Ghanem", "abstract": "Multi-view projection methods have demonstrated their ability to reach state-of-the-art performance on 3D shape recognition. Those methods learn different ways to aggregate information from multiple views. However, the camera view-points for those views tend to be heuristically set and fixed for all shapes. To circumvent the lack of dynamism of current multi-view methods, we propose to learn those view-points. In particular, we introduce the Multi-View Transformation Network (MVTN) that regresses optimal view-points for 3D shape recognition, building upon advances in differentiable rendering. As a result, MVTN can be trained end-to-end along with any multi-view network for 3D shape classification. We integrate MVTN in a novel adaptive multi-view pipeline that can render either 3D meshes or point clouds. MVTN exhibits clear performance gains in the tasks of 3D shape classification and 3D shape retrieval without the need for extra training supervision. In these tasks, MVTN achieves state-of-the-art performance on ModelNet40, ShapeNet Core55, and the most recent and realistic ScanObjectNN dataset (up to 6 % improvement). 
Interestingly, we also show that MVTN can provide network robustness against rotation and occlusion in the 3D domain.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hamdi_MVTN_Multi-View_Transformation_Network_for_3D_Shape_Recognition_ICCV_2021_paper.pdf", @@ -25570,7 +27300,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Thuwal", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Hamdi_2021_ICCV,\n \n author = {\n Hamdi,\n Abdullah and Giancola,\n Silvio and Ghanem,\n Bernard\n},\n title = {\n MVTN: Multi-View Transformation Network for 3D Shape Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1-11\n} \n}" }, { "title": "Making Higher Order MOT Scalable: An Efficient Approximate Solver for Lifted Disjoint Paths", @@ -25578,10 +27309,11 @@ "status": "Poster", "track": "main", "pid": 8638, + "author_site": "Andrea Hornakova; Timo Kaiser; Paul Swoboda; Michal Rolinek; Bodo Rosenhahn; Roberto Henschel", "author": "Andrea Hornakova; Timo Kaiser; Paul Swoboda; Michal Rolinek; Bodo Rosenhahn; Roberto Henschel", "abstract": "We present an efficient approximate message passing solver for the lifted disjoint paths problem (LDP), a natural but NP-hard model for multiple object tracking (MOT). Our tracker scales to very large instances that come from long and crowded MOT sequences. Our approximate solver enables us to process the MOT15/16/17 benchmarks without sacrificing solution quality and allows for solving MOT20, which has been out of reach up to now for LDP solvers due to its size and complexity. 
On all these four standard MOT benchmarks we achieve performance comparable or better than current state-of-the-art methods including a tracker based on an optimal LDP solver.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hornakova_Making_Higher_Order_MOT_Scalable_An_Efficient_Approximate_Solver_for_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus; Institute for Information Processing, Leibniz University Hannover; Max Planck Institute for Intelligent Systems, T\u00fcbingen; Max Planck Institute for Informatics, Saarland Informatics Campus; Institute for Information Processing, Leibniz University Hannover; Institute for Information Processing, Leibniz University Hannover", + "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus; Institute for Information Processing, Leibniz University Hannover; Max Planck Institute for Intelligent Systems, Tübingen; Max Planck Institute for Informatics, Saarland Informatics Campus; Institute for Information Processing, Leibniz University Hannover; Institute for Information Processing, Leibniz University Hannover", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Hornakova_Making_Higher_Order_ICCV_2021_supplemental.pdf", @@ -25599,9 +27331,10 @@ "aff_unique_url": "https://mpi-inf.mpg.de;https://www.uni-hannover.de;https://www.mpi-is.mpg.de", "aff_unique_abbr": "MPII;LUH;MPI-IS", "aff_campus_unique_index": "0;2;0", - "aff_campus_unique": "Saarland;;T\u00fcbingen", + "aff_campus_unique": "Saarland;;Tübingen", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Hornakova_2021_ICCV,\n \n author = {\n Hornakova,\n Andrea and Kaiser,\n Timo and Swoboda,\n Paul and Rolinek,\n Michal and Rosenhahn,\n Bodo and Henschel,\n Roberto\n},\n title = {\n Making Higher Order MOT Scalable: An Efficient Approximate Solver for Lifted 
Disjoint Paths\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6330-6340\n} \n}" }, { "title": "Manifold Alignment for Semantically Aligned Style Transfer", @@ -25609,6 +27342,7 @@ "status": "Poster", "track": "main", "pid": 6192, + "author_site": "Jing Huo; Shiyin Jin; Wenbin Li; Jing Wu; Yu-Kun Lai; Yinghuan Shi; Yang Gao", "author": "Jing Huo; Shiyin Jin; Wenbin Li; Jing Wu; Yu-Kun Lai; Yinghuan Shi; Yang Gao", "abstract": "Most existing style transfer methods follow the assumption that styles can be represented with global statistics (e.g., Gram matrices or covariance matrices), and thus address the problem by forcing the output and style images to have similar global statistics. An alternative is the assumption of local style patterns, where algorithms are designed to swap similar local features of content and style images. However, the limitation of these existing methods is that they neglect the semantic structure of the content image which may lead to corrupted content structure in the output. In this paper, we make a new assumption that image features from the same semantic region form a manifold and an image with multiple semantic regions follows a multi-manifold distribution. Based on this assumption, the style transfer problem is formulated as aligning two multi-manifold distributions and a Manifold Alignment based Style Transfer (MAST) framework is proposed. The proposed framework allows semantically similar regions between the output and the style image share similar style patterns. Moreover, the proposed manifold alignment method is flexible to allow user editing or using semantic segmentation maps as guidance for style transfer. To allow the method to be applicable to photorealistic style transfer, we propose a new adaptive weight skip connection network structure to preserve the content details. 
Extensive experiments verify the effectiveness of the proposed framework for both artistic and photorealistic style transfer. Code is available at https://github.com/NJUHuoJing/MAST.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huo_Manifold_Alignment_for_Semantically_Aligned_Style_Transfer_ICCV_2021_paper.pdf", @@ -25632,7 +27366,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cardiff", "aff_country_unique_index": "0;0;0;1;1;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Huo_2021_ICCV,\n \n author = {\n Huo,\n Jing and Jin,\n Shiyin and Li,\n Wenbin and Wu,\n Jing and Lai,\n Yu-Kun and Shi,\n Yinghuan and Gao,\n Yang\n},\n title = {\n Manifold Alignment for Semantically Aligned Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14861-14869\n} \n}" }, { "title": "Manifold Matching via Deep Metric Learning for Generative Modeling", @@ -25640,6 +27375,7 @@ "status": "Poster", "track": "main", "pid": 5983, + "author_site": "Mengyu Dai; Haibin Hang", "author": "Mengyu Dai; Haibin Hang", "abstract": "We propose a manifold matching approach to generative models which includes a distribution generator (or data generator) and a metric generator. In our framework, we view the real data set as some manifold embedded in a high-dimensional Euclidean space. The distribution generator aims at generating samples that follow some distribution condensed around the real data manifold. It is achieved by matching two sets of points using their geometric shape descriptors, such as centroid and p-diameter, with learned distance metric; the metric generator utilizes both real data and generated samples to learn a distance metric which is close to some intrinsic geodesic distance on the real data manifold. 
The produced distance metric is further used for manifold matching. The two networks learn simultaneously during the training process. We apply the approach on both unsupervised and supervised learning tasks: in unconditional image generation task, the proposed method obtains competitive results compared with existing generative models; in super-resolution task, we incorporate the framework in perception-based models and improve visual qualities by producing samples with more natural textures. Experiments and analysis demonstrate the feasibility and effectiveness of the proposed framework.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dai_Manifold_Matching_via_Deep_Metric_Learning_for_Generative_Modeling_ICCV_2021_paper.pdf", @@ -25656,14 +27392,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Manifold_Matching_via_Deep_Metric_Learning_for_Generative_Modeling_ICCV_2021_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Microsoft;University of Delaware", - "aff_unique_dep": "Microsoft Corporation;", + "aff_unique_norm": "Microsoft Corporation;University of Delaware", + "aff_unique_dep": ";", "aff_unique_url": "https://www.microsoft.com;https://www.udel.edu", "aff_unique_abbr": "Microsoft;UD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dai_2021_ICCV,\n \n author = {\n Dai,\n Mengyu and Hang,\n Haibin\n},\n title = {\n Manifold Matching via Deep Metric Learning for Generative Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6587-6597\n} \n}" }, { "title": "Matching in the Dark: A Dataset for Matching Image Pairs of Low-Light Scenes", @@ -25671,6 +27408,7 @@ "status": "Poster", "track": "main", "pid": 7818, + "author_site": 
"Wenzheng Song; Masanori Suganuma; Xing Liu; Noriyuki Shimobayashi; Daisuke Maruta; Takayuki Okatani", "author": "Wenzheng Song; Masanori Suganuma; Xing Liu; Noriyuki Shimobayashi; Daisuke Maruta; Takayuki Okatani", "abstract": "This paper considers matching images of low-light scenes, aiming to widen the frontier of SfM and visual SLAM applications. Recent image sensors can record the brightness of scenes with more than eight-bit precision, available in their RAW-format image. We are interested in making full use of such high-precision information to match extremely low-light scene images that conventional methods cannot handle. For extreme low-light scenes, even if some of their brightness information exists in the RAW format images' low bits, the standard raw image processing fails to utilize them properly. As was recently shown by Chen et al., CNNs can learn to produce images with a natural appearance from such RAW-format images. To consider if and how well we can utilize such information stored in RAW-format images for image matching, we have created a new dataset named MID (matching in the dark). Using it, we experimentally evaluated combinations of eight image-enhancing methods and eleven image matching methods consisting of classical/neural local descriptors and classical/neural initial point-matching methods. The results show the advantage of using the RAW-format images and the strengths and weaknesses of the above component methods. 
They also imply there is room for further research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_Matching_in_the_Dark_A_Dataset_for_Matching_Image_Pairs_ICCV_2021_paper.pdf", @@ -25690,11 +27428,12 @@ "aff_unique_norm": "Tohoku University;RIKEN;Socionext Inc.", "aff_unique_dep": "GSIS;Center for AIP;", "aff_unique_url": "https://www.tohoku.ac.jp;https://www.riken.jp;https://www.socionext.com", - "aff_unique_abbr": "Tohoku U;RIKEN;", + "aff_unique_abbr": "Tohoku U;RIKEN;Socionext", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0+0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Wenzheng and Suganuma,\n Masanori and Liu,\n Xing and Shimobayashi,\n Noriyuki and Maruta,\n Daisuke and Okatani,\n Takayuki\n},\n title = {\n Matching in the Dark: A Dataset for Matching Image Pairs of Low-Light Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6029-6038\n} \n}" }, { "title": "Me-Momentum: Extracting Hard Confident Examples From Noisily Labeled Data", @@ -25702,6 +27441,7 @@ "status": "Poster", "track": "main", "pid": 8317, + "author_site": "Yingbin Bai; Tongliang Liu", "author": "Yingbin Bai; Tongliang Liu", "abstract": "Examples that are close to the decision boundary---that we term hard examples, are essential to shape accurate classifiers. Extracting confident examples has been widely studied in the community of learning with noisy labels. However, it remains elusive how to extract hard confident examples from the noisy training data. In this paper, we propose a deep learning paradigm to solve this problem, which is built on the memorization effect of deep neural networks that they would first learn simple patterns, i.e., which are defined by these shared by multiple training examples. 
To extract hard confident examples that contain non-simple patterns and are entangled with the inaccurately labeled examples, we borrow the idea of momentum from physics. Specifically, we alternately update the confident examples and refine the classifier. Note that the extracted confident examples in the previous round can be exploited to learn a better classifier and that the better classifier will help identify better (and hard) confident examples. We call the approach the \"Momentum of Memorization\" (Me-Momentum). Empirical results on benchmark-simulated and real-world label-noise data illustrate the effectiveness of Me-Momentum for extracting hard confident examples, leading to better classification performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bai_Me-Momentum_Extracting_Hard_Confident_Examples_From_Noisily_Labeled_Data_ICCV_2021_paper.pdf", @@ -25725,7 +27465,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Bai_2021_ICCV,\n \n author = {\n Bai,\n Yingbin and Liu,\n Tongliang\n},\n title = {\n Me-Momentum: Extracting Hard Confident Examples From Noisily Labeled Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9312-9321\n} \n}" }, { "title": "Mean Shift for Self-Supervised Learning", @@ -25733,6 +27474,7 @@ "status": "Poster", "track": "main", "pid": 8312, + "author_site": "Soroush Abbasi Koohpayegani; Ajinkya Tejankar; Hamed Pirsiavash", "author": "Soroush Abbasi Koohpayegani; Ajinkya Tejankar; Hamed Pirsiavash", "abstract": "Most recent self-supervised learning (SSL) algorithms learn features by contrasting between instances of images or by clustering the images and then contrasting between the image clusters. 
We introduce a simple mean-shift algorithm that learns representations by grouping images together without contrasting between them or adopting much of prior on the structure or number of the clusters. We simply \"shift\" the embedding of each image to be close to the \"mean\" of the neighbors of its augmentation. Since the closest neighbor is always another augmentation of the same image, our model will be identical to BYOL when using only one nearest neighbor instead of 5 used in our experiments. Our model achieves 72.4% on ImageNet linear evaluation with ResNet50 at 200 epochs outperforming BYOL. Also, our method outperforms the SOTA by a large margin when using weak augmentations only, facilitating the adoption of SSL for other modalities. Our code is available here: https://github.com/UMBCvision/MSF", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Koohpayegani_Mean_Shift_for_Self-Supervised_Learning_ICCV_2021_paper.pdf", @@ -25756,7 +27498,8 @@ "aff_campus_unique_index": "0;0;0+1", "aff_campus_unique": "Baltimore County;Davis", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Koohpayegani_2021_ICCV,\n \n author = {\n Koohpayegani,\n Soroush Abbasi and Tejankar,\n Ajinkya and Pirsiavash,\n Hamed\n},\n title = {\n Mean Shift for Self-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10326-10335\n} \n}" }, { "title": "Membership Inference Attacks Are Easier on Difficult Problems", @@ -25764,6 +27507,7 @@ "status": "Poster", "track": "main", "pid": 7786, + "author_site": "Avital Shafran; Shmuel Peleg; Yedid Hoshen", "author": "Avital Shafran; Shmuel Peleg; Yedid Hoshen", "abstract": "Membership inference attacks (MIA) try to detect if data samples were used to train a neural network model, e.g. 
to detect copyright abuses. We show that models with higher dimensional input and output are more vulnerable to MIA, and address in more detail models for image translation and semantic segmentation, including medical image segmentation. We show that reconstruction-errors can lead to very effective MIA attacks as they are indicative of memorization. Unfortunately, reconstruction error alone is less effective at discriminating between non-predictable images used in training and easy to predict images that were never seen before. To overcome this, we propose using a novel predictability error that can be computed for each sample, and its computation does not require a training set. Our membership error, obtained by subtracting the predictability error from the reconstruction error, is shown to achieve high MIA accuracy on an extensive number of benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shafran_Membership_Inference_Attacks_Are_Easier_on_Difficult_Problems_ICCV_2021_paper.pdf", @@ -25778,7 +27522,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shafran_Membership_Inference_Attacks_Are_Easier_on_Difficult_Problems_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shafran_Membership_Inference_Attacks_Are_Easier_on_Difficult_Problems_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shafran_2021_ICCV,\n \n author = {\n Shafran,\n Avital and Peleg,\n Shmuel and Hoshen,\n Yedid\n},\n title = {\n Membership Inference Attacks Are Easier on Difficult Problems\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14820-14829\n} \n}" }, { "title": "Memory-Augmented Dynamic Neural Relational Inference", @@ -25786,6 +27531,7 @@ "status": "Poster", "track": "main", "pid": 2984, + "author_site": "Dong Gong; Frederic Z. 
Zhang; Javen Qinfeng Shi; Anton van den Hengel", "author": "Dong Gong; Frederic Z. Zhang; Javen Qinfeng Shi; Anton van den Hengel", "abstract": "Dynamic interacting systems are prevalent in vision tasks. These interactions are usually difficult to observe and measure directly, and yet understanding latent interactions is essential for performing inference tasks on dynamic systems like forecasting. Neural relational inference (NRI) techniques are thus introduced to explicitly estimate interpretable relations between the entities in the system for trajectory prediction. However, NRI assumes static relations; thus, dynamic neural relational inference (DNRI) was proposed to handle dynamic relations using LSTM. Unfortunately, the older information will be washed away when the LSTM updates the latent variable as a whole, which is why DNRI struggles with modeling long-term dependences and forecasting long sequences. This motivates us to propose a memory-augmented dynamic neural relational inference method, which maintains two associative memory pools: one for the interactive relations and the other for the individual entities. The two memory pools help retain useful relation features and node features for the estimation in the future steps. Our model dynamically estimates the relations by learning better embeddings and utilizing the long-range information stored in the memory. With the novel memory modules and customized structures, our memory-augmented DNRI can update and access the memory adaptively as required. The memory pools also serve as global latent variables across time to maintain detailed long-term temporal relations readily available for other components to use. 
Experiments on synthetic and real-world datasets show the effectiveness of the proposed method on modeling dynamic relations and forecasting complex trajectories.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gong_Memory-Augmented_Dynamic_Neural_Relational_Inference_ICCV_2021_paper.pdf", @@ -25809,7 +27555,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Gong_2021_ICCV,\n \n author = {\n Gong,\n Dong and Zhang,\n Frederic Z. and Shi,\n Javen Qinfeng and van den Hengel,\n Anton\n},\n title = {\n Memory-Augmented Dynamic Neural Relational Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11843-11852\n} \n}" }, { "title": "Mesh Graphormer", @@ -25817,6 +27564,7 @@ "status": "Poster", "track": "main", "pid": 3473, + "author_site": "Kevin Lin; Lijuan Wang; Zicheng Liu", "author": "Kevin Lin; Lijuan Wang; Zicheng Liu", "abstract": "We present a graph-convolution-reinforced transformer, named Mesh Graphormer, for 3D human pose and mesh reconstruction from a single image. Recently both transformers and graph convolutional neural networks (GCNNs) have shown promising progress in human mesh reconstruction. Transformer-based approaches are effective in modeling non-local interactions among 3D mesh vertices and body joints, whereas GCNNs are good at exploiting neighborhood vertex interactions based on a pre-specified mesh topology. In this paper, we study how to combine graph convolutions and self-attentions in a transformer to model both local and global interactions. Experimental results show that our proposed method, Mesh Graphormer, significantly outperforms the previous state-of-the-art methods on multiple benchmarks, including Human3.6M, 3DPW, and FreiHAND datasets. 
Code and pre-trained models are available at https://github.com/microsoft/MeshGraphormer", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Mesh_Graphormer_ICCV_2021_paper.pdf", @@ -25833,14 +27581,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_Mesh_Graphormer_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Microsoft", - "aff_unique_dep": "Microsoft Corporation", + "aff_unique_norm": "Microsoft Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Kevin and Wang,\n Lijuan and Liu,\n Zicheng\n},\n title = {\n Mesh Graphormer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12939-12948\n} \n}" }, { "title": "MeshTalk: 3D Face Animation From Speech Using Cross-Modality Disentanglement", @@ -25848,7 +27597,8 @@ "status": "Poster", "track": "main", "pid": 2644, - "author": "Alexander Richard; Michael Zollh\u00f6fer; Yandong Wen; Fernando de la Torre; Yaser Sheikh", + "author_site": "Alexander Richard; Michael Zollhöfer; Yandong Wen; Fernando de la Torre; Yaser Sheikh", + "author": "Alexander Richard; Michael Zollhöfer; Yandong Wen; Fernando de la Torre; Yaser Sheikh", "abstract": "This paper presents a generic method for generating full facial 3D animation from speech. Existing approaches to audio-driven facial animation exhibit uncanny or static upper face animation, fail to produce accurate and plausible co-articulation or rely on person-specific models that limit their scalability. 
To improve upon existing models, we propose a generic audio-driven facial animation approach that achieves highly realistic motion synthesis results for the entire face. At the core of our approach is a categorical latent space for facial animation that disentangles audio-correlated and audio-uncorrelated information based on a novel cross-modality loss. Our approach ensures highly accurate lip motion, while also synthesizing plausible animation of the parts of the face that are uncorrelated to the audio signal, such as eye blinks and eye brow motion. We demonstrate that our approach outperforms several baselines and obtains state-of-the-art quality both qualitatively and quantitatively. A perceptual user study demonstrates that our approach is deemed more realistic than the current state-of-the-art in over 75% of cases. We recommend watching the supplemental video before reading the paper: https://github.com/facebookresearch/meshtalk", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Richard_MeshTalk_3D_Face_Animation_From_Speech_Using_Cross-Modality_Disentanglement_ICCV_2021_paper.pdf", "aff": "Facebook Reality Labs; Facebook Reality Labs; Carnegie Mellon University; Carnegie Mellon University; Facebook Reality Labs", @@ -25864,14 +27614,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Richard_MeshTalk_3D_Face_Animation_From_Speech_Using_Cross-Modality_Disentanglement_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;0", - "aff_unique_norm": "Meta;Carnegie Mellon University", + "aff_unique_norm": "Facebook Reality Labs;Carnegie Mellon University", "aff_unique_dep": "Facebook Reality Labs;", "aff_unique_url": "https://www.facebook.com/realitylabs;https://www.cmu.edu", "aff_unique_abbr": "FRL;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Richard_2021_ICCV,\n \n author = {\n Richard,\n Alexander and Zollh\\"ofer,\n Michael and Wen,\n Yandong and de la Torre,\n Fernando and Sheikh,\n Yaser\n},\n title = {\n MeshTalk: 3D Face Animation From Speech Using Cross-Modality Disentanglement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1173-1182\n} \n}" }, { "title": "Meta Gradient Adversarial Attack", @@ -25879,6 +27630,7 @@ "status": "Poster", "track": "main", "pid": 6980, + "author_site": "Zheng Yuan; Jie Zhang; Yunpei Jia; Chuanqi Tan; Tao Xue; Shiguang Shan", "author": "Zheng Yuan; Jie Zhang; Yunpei Jia; Chuanqi Tan; Tao Xue; Shiguang Shan", "abstract": "In recent years, research on adversarial attacks has become a hot spot. Although current literature on the transfer-based adversarial attack has achieved promising results for improving the transferability to unseen black-box models, it still leaves a long way to go. Inspired by the idea of meta-learning, this paper proposes a novel architecture called Meta Gradient Adversarial Attack (MGAA), which is plug-and-play and can be integrated with any existing gradient-based attack method for improving the cross-model transferability. Specifically, we randomly sample multiple models from a model zoo to compose different tasks and iteratively simulate a white-box attack and a black-box attack in each task. By narrowing the gap between the gradient directions in white-box and black-box attacks, the transferability of adversarial examples on the black-box setting can be improved. 
Extensive experiments on the CIFAR10 and ImageNet datasets show that our architecture outperforms the state-of-the-art methods for both black-box and white-box attack settings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_Meta_Gradient_Adversarial_Attack_ICCV_2021_paper.pdf", @@ -25895,14 +27647,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_Meta_Gradient_Adversarial_Attack_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;2;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Tencent", - "aff_unique_dep": "Institute of Computing Technology;;Tencent Holdings Limited", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Tencent Holdings Limited", + "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.ict.ac.cn;http://www.ucas.ac.cn;https://www.tencent.com", "aff_unique_abbr": "CAS;UCAS;Tencent", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Zheng and Zhang,\n Jie and Jia,\n Yunpei and Tan,\n Chuanqi and Xue,\n Tao and Shan,\n Shiguang\n},\n title = {\n Meta Gradient Adversarial Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7748-7757\n} \n}" }, { "title": "Meta Learning on a Sequence of Imbalanced Domains With Difficulty Awareness", @@ -25910,6 +27663,7 @@ "status": "Poster", "track": "main", "pid": 9659, + "author_site": "Zhenyi Wang; Tiehang Duan; Le Fang; Qiuling Suo; Mingchen Gao", "author": "Zhenyi Wang; Tiehang Duan; Le Fang; Qiuling Suo; Mingchen Gao", "abstract": "Recognizing new objects by learning from a few labeled examples in an evolving 
environment is crucial to obtain excellent generalization ability for real-world machine learning systems. A typical setting across current meta learning algorithms assumes a stationary task distribution during meta training. In this paper, we explore a more practical and challenging setting where task distribution changes over time with domain shift. Particularly, we consider realistic scenarios where task distribution is highly imbalanced with domain labels unavailable in nature. We propose a kernel-based method for domain change detection and a difficulty-aware memory management mechanism that jointly considers the imbalanced domain size and domain importance to learn across domains continuously. Furthermore, we introduce an efficient adaptive task sampling method during meta training, which significantly reduces task gradient variance with theoretical guarantees. Finally, we propose a challenging benchmark with imbalanced domain sequences and varied domain difficulty. We have performed extensive evaluations on the proposed benchmark, demonstrating the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Meta_Learning_on_a_Sequence_of_Imbalanced_Domains_With_Difficulty_ICCV_2021_paper.pdf", @@ -25933,7 +27687,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Zhenyi and Duan,\n Tiehang and Fang,\n Le and Suo,\n Qiuling and Gao,\n Mingchen\n},\n title = {\n Meta Learning on a Sequence of Imbalanced Domains With Difficulty Awareness\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8947-8957\n} \n}" }, { "title": "Meta Navigator: Search for a Good Adaptation Policy for Few-Shot Learning", @@ 
-25941,6 +27696,7 @@ "status": "Poster", "track": "main", "pid": 6510, + "author_site": "Chi Zhang; Henghui Ding; Guosheng Lin; Ruibo Li; Changhu Wang; Chunhua Shen", "author": "Chi Zhang; Henghui Ding; Guosheng Lin; Ruibo Li; Changhu Wang; Chunhua Shen", "abstract": "Few-shot learning aims to adapt knowledge learned from previous tasks to novel tasks with only a limited amount of labeled data. Research literature on few-shot learning exhibits great diversity, while different algorithms often excel at different few-shot learning scenarios. It is therefore tricky to decide which learning strategies to use under different task conditions. Inspired by the recent success in Automated Machine Learning literature (AutoML), in this paper, we present Meta Navigator, a framework that attempts to solve the aforementioned limitation in few-shot learning by seeking a higher-level strategy and proffer to automate the selection from various few-shot learning designs. The goal of our work is to search for good parameter adaptation policies that are applied to different stages in the network for few-shot classification. We present a search space that covers many popular few-shot learning algorithms in the literature and develop a differentiable searching and decoding algorithm based on meta-learning that supports gradient-based optimization. We demonstrate the effectiveness of our searching-based method on multiple benchmark datasets. Extensive experiments show that our approach significantly outperforms baselines and demonstrates performance advantages over many state-of-the-art methods. 
Code and models will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Meta_Navigator_Search_for_a_Good_Adaptation_Policy_for_Few-Shot_ICCV_2021_paper.pdf", @@ -25964,7 +27720,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;1;2", - "aff_country_unique": "Singapore;China;Australia" + "aff_country_unique": "Singapore;China;Australia", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Chi and Ding,\n Henghui and Lin,\n Guosheng and Li,\n Ruibo and Wang,\n Changhu and Shen,\n Chunhua\n},\n title = {\n Meta Navigator: Search for a Good Adaptation Policy for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9435-9444\n} \n}" }, { "title": "Meta Pairwise Relationship Distillation for Unsupervised Person Re-Identification", @@ -25972,10 +27729,11 @@ "status": "Poster", "track": "main", "pid": 5765, + "author_site": "Haoxuanye Ji; Le Wang; Sanping Zhou; Wei Tang; Nanning Zheng; Gang Hua", "author": "Haoxuanye Ji; Le Wang; Sanping Zhou; Wei Tang; Nanning Zheng; Gang Hua", "abstract": "Unsupervised person re-identification (Re-ID) remains challenging due to the lack of ground-truth labels. Existing methods often rely on estimated pseudo labels via iterative clustering and classification, and they are unfortunately highly susceptible to performance penalties incurred by the inaccurate estimated number of clusters. Alternatively, we propose the Meta Pairwise Relationship Distillation (MPRD) method to estimate the pseudo labels of sample pairs for unsupervised person Re-ID. 
Specifically, it consists of a Convolutional Neural Network (CNN) and Graph Convolutional Network (GCN), in which the GCN estimates the pseudo labels of sample pairs based on the current features extracted by CNN, and the CNN learns better features by involving high-fidelity positive and negative sample pairs imposed by GCN. To achieve this goal, a small amount of labeled samples are used to guide GCN training, which can distill meta knowledge to judge the difference in the neighborhood structure between positive and negative sample pairs. Extensive experiments on Market-1501, DukeMTMC-reID and MSMT17 datasets show that our method outperforms the state-of-the-art approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ji_Meta_Pairwise_Relationship_Distillation_for_Unsupervised_Person_Re-Identification_ICCV_2021_paper.pdf", - "aff": "Institute of Arti\ufb01cial Intelligence and Robotics, Xi\u2019an Jiaotong University; Institute of Arti\ufb01cial Intelligence and Robotics, Xi\u2019an Jiaotong University; Institute of Arti\ufb01cial Intelligence and Robotics, Xi\u2019an Jiaotong University; University of Illinois at Chicago; Institute of Arti\ufb01cial Intelligence and Robotics, Xi\u2019an Jiaotong University; Wormpex AI Research", + "aff": "Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; University of Illinois at Chicago; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Wormpex AI Research", "project": "", "github": "", "supp": "", @@ -25988,14 +27746,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ji_Meta_Pairwise_Relationship_Distillation_for_Unsupervised_Person_Re-Identification_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0;2", - "aff_unique_norm": "Xi'an Jiao Tong University;University of 
Illinois at Chicago;Wormpex AI Research", - "aff_unique_dep": "Institute of Arti\ufb01cial Intelligence and Robotics;;AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;University of Illinois at Chicago;Wormpex AI Research", + "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;;AI Research", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.uic.edu;", "aff_unique_abbr": "XJTU;UIC;Wormpex AI", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Xi'an;Chicago;", "aff_country_unique_index": "0;0;0;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ji_2021_ICCV,\n \n author = {\n Ji,\n Haoxuanye and Wang,\n Le and Zhou,\n Sanping and Tang,\n Wei and Zheng,\n Nanning and Hua,\n Gang\n},\n title = {\n Meta Pairwise Relationship Distillation for Unsupervised Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3661-3670\n} \n}" }, { "title": "Meta-Aggregator: Learning To Aggregate for 1-Bit Graph Neural Networks", @@ -26003,6 +27762,7 @@ "status": "Poster", "track": "main", "pid": 4000, + "author_site": "Yongcheng Jing; Yiding Yang; Xinchao Wang; Mingli Song; Dacheng Tao", "author": "Yongcheng Jing; Yiding Yang; Xinchao Wang; Mingli Song; Dacheng Tao", "abstract": "In this paper, we study a novel meta aggregation scheme towards binarizing graph neural networks (GNNs). We begin by developing a vanilla 1-bit GNN framework that binarizes both the GNN parameters and the graph features. Despite the lightweight architecture, we observed that this vanilla framework suffered from insufficient discriminative power in distinguishing graph topologies, leading to a dramatic drop in performance. 
This discovery motivates us to devise meta aggregators to improve the expressive power of vanilla binarized GNNs, of which the aggregation schemes can be adaptively changed in a learnable manner based on the binarized features. Towards this end, we propose two dedicated forms of meta neighborhood aggregators, an exclusive meta aggregator termed as Greedy Gumbel Neighborhood Aggregator (GNA), and a diffused meta aggregator termed as Adaptable Hybrid Neighborhood Aggregator (ANA). GNA learns to exclusively pick one single optimal aggregator from a pool of candidates, while ANA learns a hybrid aggregation behavior to simultaneously retain the benefits of several individual aggregators. Furthermore, the proposed meta aggregators may readily serve as a generic plugin module into existing full-precision GNNs. Experiments across various domains demonstrate that the proposed method yields results superior to the state of the art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jing_Meta-Aggregator_Learning_To_Aggregate_for_1-Bit_Graph_Neural_Networks_ICCV_2021_paper.pdf", @@ -26019,14 +27779,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jing_Meta-Aggregator_Learning_To_Aggregate_for_1-Bit_Graph_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4+0", - "aff_unique_norm": "University of Sydney;Stevens Institute of Technology;National University of Singapore;Zhejiang University;JD", - "aff_unique_dep": ";;;;JD Explore Academy", + "aff_unique_norm": "The University of Sydney;Stevens Institute of Technology;National University of Singapore;Zhejiang University;JD Explore Academy", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.stevens.edu;https://www.nus.edu.sg;https://www.zju.edu.cn;", "aff_unique_abbr": "USYD;SIT;NUS;ZJU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;3;3+0", - "aff_country_unique": "Australia;United 
States;Singapore;China" + "aff_country_unique": "Australia;United States;Singapore;China", + "bibtex": "@InProceedings{Jing_2021_ICCV,\n \n author = {\n Jing,\n Yongcheng and Yang,\n Yiding and Wang,\n Xinchao and Song,\n Mingli and Tao,\n Dacheng\n},\n title = {\n Meta-Aggregator: Learning To Aggregate for 1-Bit Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5301-5310\n} \n}" }, { "title": "Meta-Attack: Class-Agnostic and Model-Agnostic Physical Adversarial Attack", @@ -26034,6 +27795,7 @@ "status": "Poster", "track": "main", "pid": 7154, + "author_site": "Weiwei Feng; Baoyuan Wu; Tianzhu Zhang; Yong Zhang; Yongdong Zhang", "author": "Weiwei Feng; Baoyuan Wu; Tianzhu Zhang; Yong Zhang; Yongdong Zhang", "abstract": "Modern deep neural networks are often vulnerable to adversarial examples. Most exist attack methods focus on crafting adversarial examples in the digital domain, while only limited works study physical adversarial attack. However, it is more challenging to generate effective adversarial examples in the physical world due to many uncontrollable physical dynamics. Most current physical attack methods aim to generate robust physical adversarial examples by simulating all possible physical dynamics. When attacking new images or new DNN models, they require expensive manually efforts for simulating physical dynamics and considerable time for iteratively optimizing for each image. To tackle these issues, we propose a class-agnostic and model-agnostic physical adversarial attack model (Meta-Attack), which is able to not only generate robust physical adversarial examples by simulating color and shape distortions, but also generalize to attacking novel images and novel DNN models by accessing a few digital and physical images. 
To the best of our knowledge, this is the first work to formulate the physical attack as a few-shot learning problem. Here, the training task is redefined as the composition of a support set, a query set, and a target DNN model. Under the few- shot setting, we design a novel class-agnostic and model-agnostic meta-learning algorithm to enhance the generalization ability of our method. Extensive experimental results on two benchmark datasets with four challenging experimental settings verify the superior robustness and generalization of our method by comparing to state-of-the-art physical attack methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_Meta-Attack_Class-Agnostic_and_Model-Agnostic_Physical_Adversarial_Attack_ICCV_2021_paper.pdf", @@ -26048,7 +27810,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Feng_Meta-Attack_Class-Agnostic_and_Model-Agnostic_Physical_Adversarial_Attack_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Feng_Meta-Attack_Class-Agnostic_and_Model-Agnostic_Physical_Adversarial_Attack_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Weiwei and Wu,\n Baoyuan and Zhang,\n Tianzhu and Zhang,\n Yong and Zhang,\n Yongdong\n},\n title = {\n Meta-Attack: Class-Agnostic and Model-Agnostic Physical Adversarial Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7787-7796\n} \n}" }, { "title": "Meta-Baseline: Exploring Simple Meta-Learning for Few-Shot Learning", @@ -26056,6 +27819,7 @@ "status": "Poster", "track": "main", "pid": 7065, + "author_site": "Yinbo Chen; Zhuang Liu; Huijuan Xu; Trevor Darrell; Xiaolong Wang", "author": "Yinbo Chen; Zhuang Liu; Huijuan Xu; Trevor Darrell; Xiaolong Wang", "abstract": "Meta-learning has been the most common framework 
for few-shot learning in recent years. It learns the model from collections of few-shot classification tasks, which is believed to have a key advantage of making the training objective consistent with the testing objective. However, some recent works report that by training for whole-classification, i.e. classification on the whole label-set, it can get comparable or even better embedding than many meta-learning algorithms. The edge between these two lines of works has yet been underexplored, and the effectiveness of meta-learning in few-shot learning remains unclear. In this paper, we explore a simple process: meta-learning over a whole-classification pre-trained model on its evaluation metric. We observe this simple method achieves competitive performance to state-of-the-art methods on standard benchmarks. Our further analysis shed some light on understanding the trade-offs between the meta-learning objective and the whole-classification objective in few-shot learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Meta-Baseline_Exploring_Simple_Meta-Learning_for_Few-Shot_Learning_ICCV_2021_paper.pdf", @@ -26079,7 +27843,8 @@ "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "San Diego;Berkeley;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Yinbo and Liu,\n Zhuang and Xu,\n Huijuan and Darrell,\n Trevor and Wang,\n Xiaolong\n},\n title = {\n Meta-Baseline: Exploring Simple Meta-Learning for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9062-9071\n} \n}" }, { "title": "Meta-Learning With Task-Adaptive Loss Function for Few-Shot Learning", @@ -26087,6 +27852,7 @@ "status": "Poster", "track": "main", "pid": 10177, + "author_site": "Sungyong Baik; 
Janghoon Choi; Heewon Kim; Dohee Cho; Jaesik Min; Kyoung Mu Lee", "author": "Sungyong Baik; Janghoon Choi; Heewon Kim; Dohee Cho; Jaesik Min; Kyoung Mu Lee", "abstract": "In few-shot learning scenarios, the challenge is to generalize and perform well on new unseen examples when only very few labeled examples are available for each task. Model-agnostic meta-learning (MAML) has gained the popularity as one of the representative few-shot learning methods for its flexibility and applicability to diverse problems. However, MAML and its variants often resort to a simple loss function without any auxiliary loss function or regularization terms that can help achieve better generalization. The problem lies in that each application and task may require different auxiliary loss function, especially when tasks are diverse and distinct. Instead of attempting to hand-design an auxiliary loss function for each application and task, we introduce a new meta-learning framework with a loss function that adapts to each task. 
Our proposed framework, named Meta-Learning with Task-Adaptive Loss Function (MeTAL), demonstrates the effectiveness and the flexibility across various domains, such as few-shot classification and few-shot regression.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Baik_Meta-Learning_With_Task-Adaptive_Loss_Function_for_Few-Shot_Learning_ICCV_2021_paper.pdf", @@ -26110,7 +27876,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Baik_2021_ICCV,\n \n author = {\n Baik,\n Sungyong and Choi,\n Janghoon and Kim,\n Heewon and Cho,\n Dohee and Min,\n Jaesik and Lee,\n Kyoung Mu\n},\n title = {\n Meta-Learning With Task-Adaptive Loss Function for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9465-9474\n} \n}" }, { "title": "MicroNet: Improving Image Recognition With Extremely Low FLOPs", @@ -26118,6 +27885,7 @@ "status": "Poster", "track": "main", "pid": 3217, + "author_site": "Yunsheng Li; Yinpeng Chen; Xiyang Dai; Dongdong Chen; Mengchen Liu; Lu Yuan; Zicheng Liu; Lei Zhang; Nuno Vasconcelos", "author": "Yunsheng Li; Yinpeng Chen; Xiyang Dai; Dongdong Chen; Mengchen Liu; Lu Yuan; Zicheng Liu; Lei Zhang; Nuno Vasconcelos", "abstract": "This paper aims at addressing the problem of substantial performance degradation at extremely low computational cost (e.g. 5M FLOPs on ImageNet classification). We found that two factors, sparse connectivity and dynamic activation function, are effective to improve the accuracy. The former avoids the significant reduction of network width, while the latter mitigates the detriment of reduction in network depth. 
Technically, we propose micro-factorized convolution, which factorizes a convolution matrix into low rank matrices, to integrate sparse connectivity into convolution. We also present a new dynamic activation function, named Dynamic Shift Max, to improve the non-linearity via maxing out multiple dynamic fusions between an input feature map and its circular channel shift. Building upon these two new operators, we arrive at a family of networks, named MicroNet, that achieves significant performance gains over the state of the art in the low FLOP regime. For instance, under the constraint of 12M FLOPs, MicroNet achieves 59.4% top-1 accuracy on ImageNet classification, outperforming MobileNetV3 by 9.6%. Source code is at https://github.com/liyunsheng13/micronet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_MicroNet_Improving_Image_Recognition_With_Extremely_Low_FLOPs_ICCV_2021_paper.pdf", @@ -26134,14 +27902,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_MicroNet_Improving_Image_Recognition_With_Extremely_Low_FLOPs_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;1;1;1;1;0", - "aff_unique_norm": "University of California, San Diego;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "University of California, San Diego;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://ucsd.edu;https://www.microsoft.com", "aff_unique_abbr": "UCSD;Microsoft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yunsheng and Chen,\n Yinpeng and Dai,\n Xiyang and Chen,\n Dongdong and Liu,\n Mengchen and Yuan,\n Lu and Liu,\n Zicheng and Zhang,\n Lei and Vasconcelos,\n Nuno\n},\n title = {\n MicroNet: Improving Image Recognition With Extremely Low FLOPs\n},\n booktitle 
= {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 468-477\n} \n}" }, { "title": "Minimal Adversarial Examples for Deep Learning on 3D Point Clouds", @@ -26149,6 +27918,7 @@ "status": "Poster", "track": "main", "pid": 8118, + "author_site": "Jaeyeon Kim; Binh-Son Hua; Thanh Nguyen; Sai-Kit Yeung", "author": "Jaeyeon Kim; Binh-Son Hua; Thanh Nguyen; Sai-Kit Yeung", "abstract": "With recent developments of convolutional neural networks, deep learning for 3D point clouds has shown significant progress in various 3D scene understanding tasks, e.g., object recognition, object detection. In a safety-critical environment, it is however not well understood how such deep learning models are vulnerable to adversarial examples. In this work, we explore adversarial attacks for point cloud-based neural networks. We propose a new formulation for adversarial point cloud generation that can generalise two different attack strategies. Our method generates adversarial examples by attacking the classification ability of point cloud-based networks while considering the perceptibility of the examples and ensuring the minimal level of point manipulations. 
Experimental results show that our method achieves the state-of-the-art performance with higher than 89% and 90% of attack success rate on synthetic and real-world data respectively, while manipulating only about 4% of the total points.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Minimal_Adversarial_Examples_for_Deep_Learning_on_3D_Point_Clouds_ICCV_2021_paper.pdf", @@ -26163,7 +27933,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Minimal_Adversarial_Examples_for_Deep_Learning_on_3D_Point_Clouds_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Minimal_Adversarial_Examples_for_Deep_Learning_on_3D_Point_Clouds_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Jaeyeon and Hua,\n Binh-Son and Nguyen,\n Thanh and Yeung,\n Sai-Kit\n},\n title = {\n Minimal Adversarial Examples for Deep Learning on 3D Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7797-7806\n} \n}" }, { "title": "Minimal Cases for Computing the Generalized Relative Pose Using Affine Correspondences", @@ -26171,6 +27942,7 @@ "status": "Poster", "track": "main", "pid": 5442, + "author_site": "Banglei Guan; Ji Zhao; Daniel Barath; Friedrich Fraundorfer", "author": "Banglei Guan; Ji Zhao; Daniel Barath; Friedrich Fraundorfer", "abstract": "We propose three novel solvers for estimating the relative pose of a multi-camera system from affine correspondences (ACs). A new constraint is derived interpreting the relationship of ACs and the generalized camera model. Using the constraint, we demonstrate efficient solvers for two types of motions assumed. Considering that the cameras undergo planar motion, we propose a minimal solution using a single AC and a solver with two ACs to overcome the degenerate case. 
Also, we propose a minimal solution using two ACs with known vertical direction, e.g., from an IMU. Since the proposed methods require significantly fewer correspondences than state-of-the-art algorithms, they can be efficiently used within RANSAC for outlier removal and initial motion estimation. The solvers are tested both on synthetic data and on real-world scenes from the KITTI odometry benchmark. It is shown that the accuracy of the estimated poses is superior to the state-of-the-art techniques.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guan_Minimal_Cases_for_Computing_the_Generalized_Relative_Pose_Using_Affine_ICCV_2021_paper.pdf", @@ -26185,7 +27957,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guan_Minimal_Cases_for_Computing_the_Generalized_Relative_Pose_Using_Affine_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guan_Minimal_Cases_for_Computing_the_Generalized_Relative_Pose_Using_Affine_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Guan_2021_ICCV,\n \n author = {\n Guan,\n Banglei and Zhao,\n Ji and Barath,\n Daniel and Fraundorfer,\n Friedrich\n},\n title = {\n Minimal Cases for Computing the Generalized Relative Pose Using Affine Correspondences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6068-6077\n} \n}" }, { "title": "Minimal Solutions for Panoramic Stitching Given Gravity Prior", @@ -26193,10 +27966,11 @@ "status": "Poster", "track": "main", "pid": 9407, + "author_site": "Yaqing Ding; Daniel Barath; Zuzana Kukelova", "author": "Yaqing Ding; Daniel Barath; Zuzana Kukelova", "abstract": "When capturing panoramas, people tend to align their cameras with the vertical axis, i.e., the direction of gravity. Moreover, modern devices, e.g. 
smartphones and tablets, are equipped with an IMU (Inertial Measurement Unit) that can measure the gravity vector accurately. Using this prior, the y-axes of the cameras can be aligned or assumed to be already aligned, reducing the relative orientation to 1-DOF (degree of freedom). Exploiting this assumption, we propose new minimal solutions to panoramic stitching of images taken by cameras with coinciding optical centers, i.e. undergoing pure rotation. We consider six practical camera configurations, from fully calibrated ones up to a camera with unknown fixed or varying focal length and with or without radial distortion. The solvers are tested both on synthetic scenes, on more than 500k real image pairs from the Sun360 dataset, and from scenes captured by us using two smartphones equipped with IMUs. The new solvers have similar or better accuracy than the state-of-the-art ones and outperform them in terms of processing time.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_Minimal_Solutions_for_Panoramic_Stitching_Given_Gravity_Prior_ICCV_2021_paper.pdf", - "aff": "School of Computer Science and Engineering, Nanjing University of Science and Technology; Computer Vision and Geometry Group, Department of Computer Science, ETH Z\u00fcrich; Visual Recognition Group, Faculty of Electrical Engineering, Czech Technical University in Prague", + "aff": "School of Computer Science and Engineering, Nanjing University of Science and Technology; Computer Vision and Geometry Group, Department of Computer Science, ETH Zürich; Visual Recognition Group, Faculty of Electrical Engineering, Czech Technical University in Prague", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ding_Minimal_Solutions_for_ICCV_2021_supplemental.pdf", @@ -26209,14 +27983,15 @@ "author_num": 3, "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Ding_Minimal_Solutions_for_Panoramic_Stitching_Given_Gravity_Prior_ICCV_2021_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Nanjing University of Science and Technology;ETH Zurich;Czech Technical University in Prague", + "aff_unique_norm": "Nanjing University of Science and Technology;ETH Zürich;Czech Technical University in Prague", "aff_unique_dep": "School of Computer Science and Engineering;Department of Computer Science;Faculty of Electrical Engineering", "aff_unique_url": "http://www.nust.edu.cn;https://www.ethz.ch;https://www.cvut.cz", "aff_unique_abbr": "NUST;ETHZ;CTU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Nanjing;;Prague", "aff_country_unique_index": "0;1;2", - "aff_country_unique": "China;Switzerland;Czech Republic" + "aff_country_unique": "China;Switzerland;Czech Republic", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Yaqing and Barath,\n Daniel and Kukelova,\n Zuzana\n},\n title = {\n Minimal Solutions for Panoramic Stitching Given Gravity Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5579-5588\n} \n}" }, { "title": "Mining Contextual Information Beyond Image for Semantic Segmentation", @@ -26224,6 +27999,7 @@ "status": "Poster", "track": "main", "pid": 2363, + "author_site": "Zhenchao Jin; Tao Gong; Dongdong Yu; Qi Chu; Jian Wang; Changhu Wang; Jie Shao", "author": "Zhenchao Jin; Tao Gong; Dongdong Yu; Qi Chu; Jian Wang; Changhu Wang; Jie Shao", "abstract": "This paper studies the context aggregation problem in semantic image segmentation. The existing researches focus on improving the pixel representations by aggregating the contextual information within individual images. 
Though impressive, these methods neglect the significance of the representations of the pixels of the corresponding class beyond the input image. To address this, this paper proposes to mine the contextual information beyond individual images to further augment the pixel representations. We first set up a feature memory module, which is updated dynamically during training, to store the dataset-level representations of various categories. Then, we learn class probability distribution of each pixel representation under the supervision of the ground-truth segmentation. At last, the representation of each pixel is augmented by aggregating the dataset-level representations based on the corresponding class probability distribution. Furthermore, by utilizing the stored dataset-level representations, we also propose a representation consistent learning strategy to make the classification head better address intra-class compactness and inter-class dispersion. The proposed method could be effortlessly incorporated into existing segmentation frameworks (e.g., FCN, PSPNet, OCRNet and DeepLabV3) and brings consistent performance improvements. 
Mining contextual information beyond image allows us to report state-of-the-art performance on various benchmarks: ADE20K, LIP, Cityscapes and COCO-Stuff.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jin_Mining_Contextual_Information_Beyond_Image_for_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -26247,7 +28023,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jin_2021_ICCV,\n \n author = {\n Jin,\n Zhenchao and Gong,\n Tao and Yu,\n Dongdong and Chu,\n Qi and Wang,\n Jian and Wang,\n Changhu and Shao,\n Jie\n},\n title = {\n Mining Contextual Information Beyond Image for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7231-7241\n} \n}" }, { "title": "Mining Latent Classes for Few-Shot Segmentation", @@ -26255,6 +28032,7 @@ "status": "Poster", "track": "main", "pid": 2244, + "author_site": "Lihe Yang; Wei Zhuo; Lei Qi; Yinghuan Shi; Yang Gao", "author": "Lihe Yang; Wei Zhuo; Lei Qi; Yinghuan Shi; Yang Gao", "abstract": "Few-shot segmentation (FSS) aims to segment unseen classes given only a few annotated samples. Existing methods suffer the problem of feature undermining, i.e. potential novel classes are treated as background during training phase. Our method aims to alleviate this problem and enhance the feature embedding on latent novel classes. In our work, we propose a novel joint-training framework. Based on conventional episodic training on support-query pairs, we add an additional mining branch that exploits latent novel classes via transferable sub-clusters, and a new rectification technique on both background and foreground categories to enforce more stable prototypes. 
Over and above that, our transferable sub-cluster has the ability to leverage extra unlabeled data for further feature enhancement. Extensive experiments on two FSS benchmarks demonstrate that our method outperforms previous state-of-the-art by a large margin of 3.7% mIOU on PASCAL-5i and 7.0% mIOU on COCO-20i at the cost of 74% fewer parameters and 2.5x faster inference speed. The source code is available at https://github.com/LiheYoung/MiningFSS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Mining_Latent_Classes_for_Few-Shot_Segmentation_ICCV_2021_paper.pdf", @@ -26269,7 +28047,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Mining_Latent_Classes_for_Few-Shot_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Mining_Latent_Classes_for_Few-Shot_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Lihe and Zhuo,\n Wei and Qi,\n Lei and Shi,\n Yinghuan and Gao,\n Yang\n},\n title = {\n Mining Latent Classes for Few-Shot Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8721-8730\n} \n}" }, { "title": "Mip-NeRF: A Multiscale Representation for Anti-Aliasing Neural Radiance Fields", @@ -26277,6 +28056,7 @@ "status": "Poster", "track": "main", "pid": 4397, + "author_site": "Jonathan T. Barron; Ben Mildenhall; Matthew Tancik; Peter Hedman; Ricardo Martin-Brualla; Pratul P. Srinivasan", "author": "Jonathan T. Barron; Ben Mildenhall; Matthew Tancik; Peter Hedman; Ricardo Martin-Brualla; Pratul P. 
Srinivasan", "abstract": "The rendering procedure used by neural radiance fields (NeRF) samples a scene with a single ray per pixel and may therefore produce renderings that are excessively blurred or aliased when training or testing images observe scene content at different resolutions. The straightforward solution of supersampling by rendering with multiple rays per pixel is impractical for NeRF, because rendering each ray requires querying a multilayer perceptron hundreds of times. Our solution, which we call \"mip-NeRF\" (a la \"mipmap\"), extends NeRF to represent the scene at a continuously-valued scale. By efficiently rendering anti-aliased conical frustums instead of rays, mip-NeRF reduces objectionable aliasing artifacts and significantly improves NeRF's ability to represent fine details, while also being 7% faster than NeRF and half the size. Compared to NeRF, mip-NeRF reduces average error rates by 17% on the dataset presented with NeRF and by 60% on a challenging multiscale variant of that dataset that we present. Mip-NeRF is also able to match the accuracy of a brute-force supersampled NeRF on our multiscale dataset while being 22x faster.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Barron_Mip-NeRF_A_Multiscale_Representation_for_Anti-Aliasing_Neural_Radiance_Fields_ICCV_2021_paper.pdf", @@ -26291,7 +28071,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Barron_Mip-NeRF_A_Multiscale_Representation_for_Anti-Aliasing_Neural_Radiance_Fields_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Barron_Mip-NeRF_A_Multiscale_Representation_for_Anti-Aliasing_Neural_Radiance_Fields_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Barron_2021_ICCV,\n \n author = {\n Barron,\n Jonathan T. 
and Mildenhall,\n Ben and Tancik,\n Matthew and Hedman,\n Peter and Martin-Brualla,\n Ricardo and Srinivasan,\n Pratul P.\n},\n title = {\n Mip-NeRF: A Multiscale Representation for Anti-Aliasing Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5855-5864\n} \n}" }, { "title": "Mitigating Intensity Bias in Shadow Detection via Feature Decomposition and Reweighting", @@ -26299,6 +28080,7 @@ "status": "Poster", "track": "main", "pid": 7556, + "author_site": "Lei Zhu; Ke Xu; Zhanghan Ke; Rynson W.H. Lau", "author": "Lei Zhu; Ke Xu; Zhanghan Ke; Rynson W.H. Lau", "abstract": "While CNNs achieved remarkable progress in shadow detection, they tend to make mistakes in dark non-shadow regions and relatively bright shadow regions. They are also susceptible to brightness change. These two phenomenons reveal that deep shadow detectors heavily depend on the intensity cue, which we refer to as intensity bias. In this paper, we propose a novel feature decomposition and reweighting scheme to mitigate this intensity bias, in which multi-level integrated features are decomposed into intensity-variant and intensity-invariant components through self-supervision. By reweighting these two types of features, our method can reallocate the attention to the corresponding latent semantics and achieves balanced exploitation of them. 
Extensive experiments on three popular datasets show that the proposed method outperforms state-of-the-art shadow detectors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Mitigating_Intensity_Bias_in_Shadow_Detection_via_Feature_Decomposition_and_ICCV_2021_paper.pdf", @@ -26322,7 +28104,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Lei and Xu,\n Ke and Ke,\n Zhanghan and Lau,\n Rynson W.H.\n},\n title = {\n Mitigating Intensity Bias in Shadow Detection via Feature Decomposition and Reweighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4702-4711\n} \n}" }, { "title": "MixMix: All You Need for Data-Free Compression Are Feature and Data Mixing", @@ -26330,6 +28113,7 @@ "status": "Poster", "track": "main", "pid": 2916, + "author_site": "Yuhang Li; Feng Zhu; Ruihao Gong; Mingzhu Shen; Xin Dong; Fengwei Yu; Shaoqing Lu; Shi Gu", "author": "Yuhang Li; Feng Zhu; Ruihao Gong; Mingzhu Shen; Xin Dong; Fengwei Yu; Shaoqing Lu; Shi Gu", "abstract": "User data confidentiality protection is becoming a rising challenge in the present deep learning research. Without access to data, conventional data-driven model compression faces a higher risk of performance degradation. Recently, some works propose to generate images from a specific pretrained model to serve as training data. However, the inversion process only utilizes biased feature statistics stored in one model and is from low-dimension to high-dimension. As a consequence, it inevitably encounters the difficulties of generalizability and inexact inversion, which leads to unsatisfactory performance. 
To address these problems, we propose MixMix based on two simple yet effective techniques: (1) Feature Mixing: utilizes various models to construct a universal feature space for generalized inversion; (2) Data Mixing: mixes the synthesized images and labels to generate exact label information. We prove the effectiveness of MixMix from both theoretical and empirical perspectives. Extensive experiments show that MixMix outperforms existing methods on the mainstream compression tasks, including quantization, knowledge distillation and pruning. Specifically, MixMix achieves up to 4% and 20% accuracy uplift on quantization and pruning, respectively, compared to existing data-free compression work.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_MixMix_All_You_Need_for_Data-Free_Compression_Are_Feature_and_ICCV_2021_paper.pdf", @@ -26344,7 +28128,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_MixMix_All_You_Need_for_Data-Free_Compression_Are_Feature_and_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_MixMix_All_You_Need_for_Data-Free_Compression_Are_Feature_and_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yuhang and Zhu,\n Feng and Gong,\n Ruihao and Shen,\n Mingzhu and Dong,\n Xin and Yu,\n Fengwei and Lu,\n Shaoqing and Gu,\n Shi\n},\n title = {\n MixMix: All You Need for Data-Free Compression Are Feature and Data Mixing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4410-4419\n} \n}" }, { "title": "MixMo: Mixing Multiple Inputs for Multiple Outputs via Deep Subnetworks", @@ -26352,10 +28137,11 @@ "status": "Poster", "track": "main", "pid": 3306, - "author": "Alexandre Ram\u00e9; R\u00e9my Sun; Matthieu Cord", + "author_site": "Alexandre Ramé; Rémy Sun; Matthieu Cord", 
+ "author": "Alexandre Ramé; Rémy Sun; Matthieu Cord", "abstract": "Recent strategies achieved ensembling \"\"for free\"\" by fitting concurrently diverse subnetworks inside a single base network. The main idea during training is that each subnetwork learns to classify only one of the multiple inputs simultaneously provided. However, the question of how to best mix these multiple inputs has not been studied so far. In this paper, we introduce MixMo, a new generalized framework for learning multi-input multi-output deep subnetworks. Our key motivation is to replace the suboptimal summing operation hidden in previous approaches by a more appropriate mixing mechanism. For that purpose, we draw inspiration from successful mixed sample data augmentations. We show that binary mixing in features - particularly with rectangular patches from CutMix - enhances results by making subnetworks stronger and more diverse. We improve state of the art for image classification on CIFAR-100 and Tiny ImageNet datasets. Our easy to implement models notably outperform data augmented deep ensembles, without the inference and memory overheads. 
As we operate in features and simply better leverage the expressiveness of large networks, we open a new line of research complementary to previous works.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rame_MixMo_Mixing_Multiple_Inputs_for_Multiple_Outputs_via_Deep_Subnetworks_ICCV_2021_paper.pdf", - "aff": "Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France; Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France + Optronics & Missile Electronics, Land & Air Systems, Thales; Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France + Valeo.ai", + "aff": "Sorbonne Université, CNRS, LIP6, Paris, France; Sorbonne Université, CNRS, LIP6, Paris, France + Optronics & Missile Electronics, Land & Air Systems, Thales; Sorbonne Université, CNRS, LIP6, Paris, France + Valeo.ai", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Rame_MixMo_Mixing_Multiple_ICCV_2021_supplemental.pdf", @@ -26368,14 +28154,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rame_MixMo_Mixing_Multiple_Inputs_for_Multiple_Outputs_via_Deep_Subnetworks_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;0+2", - "aff_unique_norm": "Sorbonne Universit\u00e9;Thales;Valeo", + "aff_unique_norm": "Sorbonne Université;Thales;Valeo", "aff_unique_dep": "LIP6;Optronics & Missile Electronics, Land & Air Systems;Valeo.ai", "aff_unique_url": "https://www.sorbonne-universite.fr;https://www.thalesgroup.com;https://www.valeo.com", "aff_unique_abbr": "SU;Thales;Valeo", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;0+0;0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Rame_2021_ICCV,\n \n author = {\n Ram\\'e,\n Alexandre and Sun,\n R\\'emy and Cord,\n Matthieu\n},\n title = {\n MixMo: Mixing Multiple Inputs for Multiple Outputs via Deep Subnetworks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 823-833\n} \n}" }, { "title": "Mixed SIGNals: Sign Language Production via a Mixture of Motion Primitives", @@ -26383,6 +28170,7 @@ "status": "Poster", "track": "main", "pid": 3894, + "author_site": "Ben Saunders; Necati Cihan Camgoz; Richard Bowden", "author": "Ben Saunders; Necati Cihan Camgoz; Richard Bowden", "abstract": "It is common practice to represent spoken languages at their phonetic level. However, for sign languages, this implies breaking motion into its constituent motion primitives. Avatar based Sign Language Production (SLP) has traditionally done just this, building up animation from sequences of hand motions, shapes and facial expressions. However, more recent deep learning based solutions to SLP have tackled the problem using a single network that estimates the full skeletal structure. We propose splitting the SLP task into two distinct jointly-trained sub-tasks. The first translation sub-task translates from spoken language to a latent sign language representation, with gloss supervision. Subsequently, the animation sub-task aims to produce expressive sign language sequences that closely resemble the learnt spatio-temporal representation. Using a progressive transformer for the translation sub-task, we propose a novel Mixture of Motion Primitives (MoMP) architecture for sign language animation. A set of distinct motion primitives are learnt during training, that can be temporally combined at inference to animate continuous sign language sequences. We evaluate on the challenging RWTH-PHOENIX-Weather-2014T(PHOENIX14T) dataset, presenting extensive ablation studies and showing that MoMP outperforms baselines in user evaluations. We achieve state-of-the-art back translation performance with an 11% improvement over competing results. 
Importantly, and for the first time, we showcase stronger performance for a full translation pipeline going from spoken language to sign, than from gloss to sign.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Saunders_Mixed_SIGNals_Sign_Language_Production_via_a_Mixture_of_Motion_ICCV_2021_paper.pdf", @@ -26406,7 +28194,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Saunders_2021_ICCV,\n \n author = {\n Saunders,\n Ben and Camgoz,\n Necati Cihan and Bowden,\n Richard\n},\n title = {\n Mixed SIGNals: Sign Language Production via a Mixture of Motion Primitives\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1919-1929\n} \n}" }, { "title": "Mixture-Based Feature Space Learning for Few-Shot Image Classification", @@ -26414,10 +28203,11 @@ "status": "Poster", "track": "main", "pid": 5579, - "author": "Arman Afrasiyabi; Jean-Fran\u00e7ois Lalonde; Christian Gagn\u00e9", + "author_site": "Arman Afrasiyabi; Jean-François Lalonde; Christian Gagné", + "author": "Arman Afrasiyabi; Jean-François Lalonde; Christian Gagné", "abstract": "We introduce Mixture-based Feature Space Learning (MixtFSL) for obtaining a rich and robust feature representation in the context of few-shot image classification. Previous works have proposed to model each base class either with a single point or with a mixture model by relying on offline clustering algorithms. In contrast, we propose to model base classes with mixture models by simultaneously training the feature extractor and learning the mixture model parameters in an online manner. This results in a richer and more discriminative feature space which can be employed to classify novel examples from very few samples. 
Two main stages are proposed to train the MixtFSL model. First, the multimodal mixtures for each base class and the feature extractor parameters are learned using a combination of two loss functions. Second, the resulting network and mixture models are progressively refined through a leader-follower learning procedure, which uses the current estimate as a \"target\" network. This target network is used to make a consistent assignment of instances to mixture components, which increases performance and stabilizes training. The effectiveness of our end-to-end feature space learning approach is demonstrated with extensive experiments on four standard datasets and four backbones. Notably, we demonstrate that when we combine our robust representation with recent alignment-based approaches, we achieve new state-of-the-art results in the inductive setting, with an absolute accuracy for 5-shot classification of 82.45 on miniImageNet, 88.20 with tieredImageNet, and 60.70 in FC100 using the ResNet-12 backbone.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Afrasiyabi_Mixture-Based_Feature_Space_Learning_for_Few-Shot_Image_Classification_ICCV_2021_paper.pdf", - "aff": "Universit\u00b4e Laval+Canada CIFAR AI Chair+Mila; Universit\u00b4e Laval+Canada CIFAR AI Chair+Mila; Universit\u00b4e Laval+Canada CIFAR AI Chair+Mila", + "aff": "Universit´e Laval+Canada CIFAR AI Chair+Mila; Universit´e Laval+Canada CIFAR AI Chair+Mila; Universit´e Laval+Canada CIFAR AI Chair+Mila", "project": "https://lvsn.github.io/MixtFSL/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Afrasiyabi_Mixture-Based_Feature_Space_ICCV_2021_supplemental.pdf", @@ -26430,14 +28220,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Afrasiyabi_Mixture-Based_Feature_Space_Learning_for_Few-Shot_Image_Classification_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", - "aff_unique_norm": "Universit\u00e9 Laval;Canadian 
Institute for Advanced Research;Mila", + "aff_unique_norm": "Université Laval;Canadian Institute for Advanced Research;Mila", "aff_unique_dep": ";AI Chair;Quebec Artificial Intelligence Institute", "aff_unique_url": "https://www.ulaval.ca;https://www.cifar.ca;https://mila.quebec", "aff_unique_abbr": "UL;CIFAR;Mila", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Afrasiyabi_2021_ICCV,\n \n author = {\n Afrasiyabi,\n Arman and Lalonde,\n Jean-Fran\\c{c\n}ois and Gagn\\'e,\n Christian\n},\n title = {\n Mixture-Based Feature Space Learning for Few-Shot Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9041-9051\n} \n}" }, { "title": "Modelling Neighbor Relation in Joint Space-Time Graph for Video Correspondence Learning", @@ -26445,6 +28236,7 @@ "status": "Poster", "track": "main", "pid": 5526, + "author_site": "Zixu Zhao; Yueming Jin; Pheng-Ann Heng", "author": "Zixu Zhao; Yueming Jin; Pheng-Ann Heng", "abstract": "This paper presents a self-supervised method for learning reliable visual correspondence from unlabeled videos. We formulate the correspondence as finding paths in a joint space-time graph, where nodes are grid patches sampled from frames, and are linked by two type of edges: (i) neighbor relations that determine the aggregation strength from intra-frame neighbors in space, and (ii) similarity relations that indicate the transition probability of inter-frame paths across time. Leveraging the cycle-consistency in videos, our contrastive learning objective discriminates dynamic objects from both their neighboring views and temporal views. 
Compared with prior works, our approach actively explores the neighbor relations of central instances to learn a latent association between center-neighbor pairs (eg, \"hand -- arm\") across time, thus improving the instance discrimination. Without fine-tuning, our learned representation outperforms the state-of-the-art self-supervised methods on a variety of visual tasks including video object propagation, part propagation, and pose keypoint tracking. Our self-supervised method also surpasses some fully supervised algorithms designed for the specific tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Modelling_Neighbor_Relation_in_Joint_Space-Time_Graph_for_Video_Correspondence_ICCV_2021_paper.pdf", @@ -26461,14 +28253,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Modelling_Neighbor_Relation_in_Joint_Space-Time_Graph_for_Video_Correspondence_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Zixu and Jin,\n Yueming and Heng,\n Pheng-Ann\n},\n title = {\n Modelling Neighbor Relation in Joint Space-Time Graph for Video Correspondence Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9960-9969\n} \n}" }, { "title": "Modulated Graph Convolutional Network for 3D Human Pose Estimation", @@ -26476,6 +28269,7 @@ "status": "Poster", "track": "main", "pid": 8250, + "author_site": "Zhiming Zou; Wei Tang", "author": "Zhiming Zou; Wei 
Tang", "abstract": "The graph convolutional network (GCN) has recently achieved promising performance of 3D human pose estimation (HPE) by modeling the relationship among body parts. However, most prior GCN approaches suffer from two main drawbacks. First, they share a feature transformation for each node within a graph convolution layer. This prevents them from learning different relations between different body joints. Second, the graph is usually defined according to the human skeleton and is suboptimal because human activities often exhibit motion patterns beyond the natural connections of body joints. To address these limitations, we introduce a novel Modulated GCN for 3D HPE. It consists of two main components: weight modulation and affinity modulation. Weight modulation learns different modulation vectors for different nodes so that the feature transformations of different nodes are disentangled while retaining a small model size. Affinity modulation adjusts the graph structure in a GCN so that it can model additional edges beyond the human skeleton. We investigate several affinity modulation methods as well as the impact of regularizations. Rigorous ablation study indicates both types of modulation improve performance with negligible overhead. Compared with state-of-the-art GCNs for 3D HPE, our approach either significantly reduces the estimation errors, e.g., by around 10%, while retaining a small model size or drastically reduces the model size, e.g., from 4.22M to 0.29M (a 14.5 X reduction), while achieving comparable performance. Results on two benchmarks show our Modulated GCN outperforms some recent states of the art. 
Our code is available at https://github.com/ZhimingZo/Modulated-GCN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zou_Modulated_Graph_Convolutional_Network_for_3D_Human_Pose_Estimation_ICCV_2021_paper.pdf", @@ -26499,7 +28293,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chicago", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zou_2021_ICCV,\n \n author = {\n Zou,\n Zhiming and Tang,\n Wei\n},\n title = {\n Modulated Graph Convolutional Network for 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11477-11487\n} \n}" }, { "title": "Modulated Periodic Activations for Generalizable Local Functional Representations", @@ -26507,7 +28302,8 @@ "status": "Poster", "track": "main", "pid": 7133, - "author": "Ishit Mehta; Micha\u00ebl Gharbi; Connelly Barnes; Eli Shechtman; Ravi Ramamoorthi; Manmohan Chandraker", + "author_site": "Ishit Mehta; Michaël Gharbi; Connelly Barnes; Eli Shechtman; Ravi Ramamoorthi; Manmohan Chandraker", + "author": "Ishit Mehta; Michaël Gharbi; Connelly Barnes; Eli Shechtman; Ravi Ramamoorthi; Manmohan Chandraker", "abstract": "Multi-Layer Perceptrons (MLPs) make powerful functional representations for sampling and reconstruction problems involving low-dimensional signals like images,shapes and light fields. Recent works have significantly improved their ability to represent high-frequency content by using periodic activations or positional encodings. This often came at the expense of generalization: modern methods are typically optimized for a single signal. We present a new representation that generalizes to multiple instances and achieves state-of-the-art fidelity. We use a dual-MLP architecture to encode the signals. 
A synthesis network creates a functional mapping from a low-dimensional input(e.g. pixel-position) to the output domain (e.g. RGB color).A modulation network maps a latent code corresponding to the target signal to parameters that modulate the periodic activations of the synthesis network. We also propose a local-functional representation which enables generalization. The signal's domain is partitioned into a regular grid,with each tile represented by a latent code. At test time, the signal is encoded with high-fidelity by inferring (or directly optimizing) the latent code-book. Our approach produces generalizable functional representations of images, videos and shapes, and achieves higher reconstruction quality than prior works that are optimized for a single signal.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mehta_Modulated_Periodic_Activations_for_Generalizable_Local_Functional_Representations_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -26521,7 +28317,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mehta_Modulated_Periodic_Activations_for_Generalizable_Local_Functional_Representations_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mehta_Modulated_Periodic_Activations_for_Generalizable_Local_Functional_Representations_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Mehta_2021_ICCV,\n \n author = {\n Mehta,\n Ishit and Gharbi,\n Micha\\"el and Barnes,\n Connelly and Shechtman,\n Eli and Ramamoorthi,\n Ravi and Chandraker,\n Manmohan\n},\n title = {\n Modulated Periodic Activations for Generalizable Local Functional Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14214-14223\n} \n}" }, { "title": "MonoIndoor: Towards Good Practice of Self-Supervised Monocular Depth Estimation for Indoor Environments", 
@@ -26529,6 +28326,7 @@ "status": "Poster", "track": "main", "pid": 5842, + "author_site": "Pan Ji; Runze Li; Bir Bhanu; Yi Xu", "author": "Pan Ji; Runze Li; Bir Bhanu; Yi Xu", "abstract": "Self-supervised depth estimation for indoor environments is more challenging than its outdoor counterpart in at least the following two aspects: (i) the depth range of indoor sequences varies a lot across different frames, making it difficult for the depth network to induce consistent depth cues, whereas the maximum distance in outdoor scenes mostly stays the same as the camera usually sees the sky; (ii) the indoor sequences contain much more rotational motions, which cause difficulties for the pose network, while the motions of outdoor sequences are pre-dominantly translational, especially for driving datasets such as KITTI. In this paper, special considerations are given to those challenges and a set of good practices are consolidated for improving the performance of self-supervised monocular depth estimation in indoor environments. The proposed method mainly consists of two novel modules, i.e., a depth factorization module and a residual pose estimation module, each of which is designed to respectively tackle the aforementioned challenges. 
The effectiveness of each module is shown through a carefully conducted ablation study and the demonstration of the state-of-the-art performance on three indoor datasets, i.e., EuRoC, NYUv2 and 7-Scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ji_MonoIndoor_Towards_Good_Practice_of_Self-Supervised_Monocular_Depth_Estimation_for_ICCV_2021_paper.pdf", @@ -26552,7 +28350,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Riverside", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ji_2021_ICCV,\n \n author = {\n Ji,\n Pan and Li,\n Runze and Bhanu,\n Bir and Xu,\n Yi\n},\n title = {\n MonoIndoor: Towards Good Practice of Self-Supervised Monocular Depth Estimation for Indoor Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12787-12796\n} \n}" }, { "title": "Monocular, One-Stage, Regression of Multiple 3D People", @@ -26560,10 +28359,11 @@ "status": "Poster", "track": "main", "pid": 1355, + "author_site": "Yu Sun; Qian Bao; Wu Liu; Yili Fu; Michael J. Black; Tao Mei", "author": "Yu Sun; Qian Bao; Wu Liu; Yili Fu; Michael J. Black; Tao Mei", "abstract": "This paper focuses on the regression of multiple 3D people from a single RGB image. Existing approaches predominantly follow a multi-stage pipeline that first detects people in bounding boxes and then independently regresses their 3D body meshes. In contrast, we propose to Regress all meshes in a One-stage fashion for Multiple 3D People (termed ROMP). The approach is conceptually simple, bounding box-free, and able to learn a per-pixel representation in an end-to-end manner. Our method simultaneously predicts a Body Center heatmap and a Mesh Parameter map, which can jointly describe the 3D body mesh on the pixel level. 
Through a body-center-guided sampling process, the body mesh parameters of all people in the image are easily extracted from the Mesh Parameter map. Equipped with such a fine-grained representation, our one-stage framework is free of the complex multi-stage process and more robust to occlusion. Compared with state-of-the-art methods, ROMP achieves superior performance on the challenging multi-person benchmarks, including 3DPW and CMU Panoptic. Experiments on crowded/occluded datasets demonstrate the robustness under various types of occlusion. The code, released at https://github.com/Arthur151/ROMP, is the first real-time implementation of monocular multi-person 3D mesh regression.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Monocular_One-Stage_Regression_of_Multiple_3D_People_ICCV_2021_paper.pdf", - "aff": "Harbin Institute of Technology+JD AI Research; JD AI Research; JD AI Research; Harbin Institute of Technology+JD AI Research; Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany; JD AI Research", + "aff": "Harbin Institute of Technology+JD AI Research; JD AI Research; JD AI Research; Harbin Institute of Technology+JD AI Research; Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany; JD AI Research", "project": "", "github": "https://github.com/Arthur151/ROMP", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Sun_Monocular_One-Stage_Regression_ICCV_2021_supplemental.pdf", @@ -26576,14 +28376,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sun_Monocular_One-Stage_Regression_of_Multiple_3D_People_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;1;0+1;2;1", - "aff_unique_norm": "Harbin Institute of Technology;JD;Max Planck Institute for Intelligent Systems", - "aff_unique_dep": ";JD AI Research;", + "aff_unique_norm": "Harbin Institute of Technology;JD AI Research;Max Planck Institute for Intelligent Systems", + "aff_unique_dep": ";;", "aff_unique_url": 
"http://www.hit.edu.cn/;https://www.jd.com;https://www.mpi-is.mpg.de", "aff_unique_abbr": "HIT;JD AI;MPI-IS", "aff_campus_unique_index": "0;0;2", - "aff_campus_unique": "Harbin;;T\u00fcbingen", + "aff_campus_unique": "Harbin;;Tübingen", "aff_country_unique_index": "0+0;0;0;0+0;1;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Yu and Bao,\n Qian and Liu,\n Wu and Fu,\n Yili and Black,\n Michael J. and Mei,\n Tao\n},\n title = {\n Monocular,\n One-Stage,\n Regression of Multiple 3D People\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11179-11188\n} \n}" }, { "title": "MonteFloor: Extending MCTS for Reconstructing Accurate Large-Scale Floor Plans", @@ -26591,10 +28392,11 @@ "status": "Poster", "track": "main", "pid": 2641, + "author_site": "Sinisa Stekovic; Mahdi Rad; Friedrich Fraundorfer; Vincent Lepetit", "author": "Sinisa Stekovic; Mahdi Rad; Friedrich Fraundorfer; Vincent Lepetit", "abstract": "We propose a novel method for reconstructing floor plans from noisy 3D point clouds. Our main contribution is a principled approach that relies on the Monte Carlo Tree Search (MCTS) algorithm to maximize a suitable objective function efficiently despite the complexity of the problem. Like previous work, we first project the input point cloud to a top view to create a density map and extract room proposals from it. Our method selects and optimizes the polygonal shapes of these room proposals jointly to fit the density map and outputs an accurate vectorized floor map even for large complex scenes. 
To do this, we adapted MCTS, an algorithm originally designed to learn to play games, to select the room proposals by maximizing an objective function combining the fitness with the density map as predicted by a deep network and regularizing terms on the room shapes. We also introduce a refinement step to MCTS that adjusts the shape of the room proposals. For this step, we propose a novel differentiable method for rendering the polygonal shapes of these proposals. We evaluate our method on the recent and challenging Structured3D and Floor-SP datasets and show a significant improvement over the state-of-the-art, without imposing any hard constraints nor assumptions on the floor plan configurations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Stekovic_MonteFloor_Extending_MCTS_for_Reconstructing_Accurate_Large-Scale_Floor_Plans_ICCV_2021_paper.pdf", - "aff": "Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria; Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria; Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria; Universit \u00b4e Paris-Est, \u00b4Ecole des Ponts ParisTech, Paris, France + Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria", + "aff": "Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria; Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria; Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria; Université Paris-Est, École des Ponts ParisTech, Paris, France + Institute for Computer Graphics and Vision, Graz University of Technology, Graz, Austria", "project": "https://www.tugraz.at/index.php?id=52770", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Stekovic_MonteFloor_Extending_MCTS_ICCV_2021_supplemental.zip", @@ -26607,14 +28409,15 @@ 
"author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Stekovic_MonteFloor_Extending_MCTS_for_Reconstructing_Accurate_Large-Scale_Floor_Plans_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1+0", - "aff_unique_norm": "Graz University of Technology;Universit\u00e9 Paris-Est", - "aff_unique_dep": "Institute for Computer Graphics and Vision;\u00c9cole des Ponts ParisTech", + "aff_unique_norm": "Graz University of Technology;Université Paris-Est", + "aff_unique_dep": "Institute for Computer Graphics and Vision;École des Ponts ParisTech", "aff_unique_url": "https://www.tugraz.at;https://www.ponts.org", "aff_unique_abbr": "TU Graz;UPE", "aff_campus_unique_index": "0;0;0;1+0", "aff_campus_unique": "Graz;Paris", "aff_country_unique_index": "0;0;0;1+0", - "aff_country_unique": "Austria;France" + "aff_country_unique": "Austria;France", + "bibtex": "@InProceedings{Stekovic_2021_ICCV,\n \n author = {\n Stekovic,\n Sinisa and Rad,\n Mahdi and Fraundorfer,\n Friedrich and Lepetit,\n Vincent\n},\n title = {\n MonteFloor: Extending MCTS for Reconstructing Accurate Large-Scale Floor Plans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16034-16043\n} \n}" }, { "title": "Morphable Detector for Object Detection on Demand", @@ -26622,6 +28425,7 @@ "status": "Poster", "track": "main", "pid": 2865, + "author_site": "Xiangyun Zhao; Xu Zou; Ying Wu", "author": "Xiangyun Zhao; Xu Zou; Ying Wu", "abstract": "Many emerging applications of intelligent robots need to explore and understand new environments, where it is desirable to detect objects of novel categories on the fly with minimum online efforts. This is an object detection on demand (ODOD) task. It is challenging, because it is impossible to annotate large data on the fly, and the embedded systems are usually unable to perform back-propagation which is essential for training. 
Most existing few-shot detection methods are confronted here as they need extra training. We propose a novel morphable detector (MD), that simply \"morphs\"\" some of its changeable parameters online estimated from the few samples, so as to detect novel categories without any extra training. The MD has two sets of parameters, one for the feature embedding and the other for category representation(called \"prototypes\"\"). Each category is associated with a hidden prototype to be learned by integrating the visual and semantic embeddings. The learning of the MD is based on the alternate learning of the feature embedding and the prototypes in an EM-like approach which allows the recovery of an unknown prototype from a few samples of a novel category. Once an MD is learned, it is able to use a few samples of a novel category to directly compute its prototype to fulfill the online morphing process. We have shown the superiority of the MD in Pascal, COCO and FSOD datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Morphable_Detector_for_Object_Detection_on_Demand_ICCV_2021_paper.pdf", @@ -26636,7 +28440,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Morphable_Detector_for_Object_Detection_on_Demand_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Morphable_Detector_for_Object_Detection_on_Demand_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Xiangyun and Zou,\n Xu and Wu,\n Ying\n},\n title = {\n Morphable Detector for Object Detection on Demand\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4771-4780\n} \n}" }, { "title": "MosaicOS: A Simple and Effective Use of Object-Centric Images for Long-Tailed Object Detection", @@ -26644,6 +28449,7 @@ "status": "Poster", 
"track": "main", "pid": 10419, + "author_site": "Cheng Zhang; Tai-Yu Pan; Yandong Li; Hexiang Hu; Dong Xuan; Soravit Changpinyo; Boqing Gong; Wei-Lun Chao", "author": "Cheng Zhang; Tai-Yu Pan; Yandong Li; Hexiang Hu; Dong Xuan; Soravit Changpinyo; Boqing Gong; Wei-Lun Chao", "abstract": "Many objects do not appear frequently enough in complex scenes (e.g., certain handbags in living rooms) for training an accurate object detector, but are often found frequently by themselves (e.g., in product images). Yet, these object-centric images are not effectively leveraged for improving object detection in scene-centric images. In this paper, we propose Mosaic of Object-centric images as Scene-centric images (MosaicOS), a simple and novel framework that is surprisingly effective at tackling the challenges of long-tailed object detection. Keys to our approach are three-fold: (i) pseudo scene-centric image construction from object-centric images for mitigating domain differences, (ii) high-quality bounding box imputation using the object-centric images' class labels, and (iii) a multi-stage training procedure. On LVIS object detection (and instance segmentation), MosaicOS leads to a massive 60% (and 23%) relative improvement in average precision for rare object categories. We also show that our framework can be compatibly used with other existing approaches to achieve even further gains. 
Our pre-trained models are publicly available at https://github.com/czhang0528/MosaicOS/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_MosaicOS_A_Simple_and_Effective_Use_of_Object-Centric_Images_for_ICCV_2021_paper.pdf", @@ -26660,14 +28466,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_MosaicOS_A_Simple_and_Effective_Use_of_Object-Centric_Images_for_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0;1;1;0", - "aff_unique_norm": "Ohio State University;Google;University of Southern California", + "aff_unique_norm": "The Ohio State University;Google;University of Southern California", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.osu.edu;https://research.google;https://www.usc.edu", "aff_unique_abbr": "OSU;Google Research;USC", "aff_campus_unique_index": "1;2;1;1", "aff_campus_unique": ";Mountain View;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Cheng and Pan,\n Tai-Yu and Li,\n Yandong and Hu,\n Hexiang and Xuan,\n Dong and Changpinyo,\n Soravit and Gong,\n Boqing and Chao,\n Wei-Lun\n},\n title = {\n MosaicOS: A Simple and Effective Use of Object-Centric Images for Long-Tailed Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 417-427\n} \n}" }, { "title": "Motion Adaptive Pose Estimation From Compressed Videos", @@ -26675,6 +28482,7 @@ "status": "Poster", "track": "main", "pid": 6742, + "author_site": "Zhipeng Fan; Jun Liu; Yao Wang", "author": "Zhipeng Fan; Jun Liu; Yao Wang", "abstract": "Human pose estimation from videos has many real-world applications. 
Existing methods focus on applying models with a uniform computation profile on fully de- coded frames, ignoring the freely available motion signals and motion-compensation residuals from the compressed stream. A novel model, called Motion Adaptive Pose Net is proposed to exploit the compressed streams to efficiently decode pose sequences from videos. The model incorporates a Motion Compensated ConvLSTM to propagate the spatially aligned features, along with an adaptive gate to dynamically determine if the computationally expensive features should be extracted from fully decoded frames to compensate the motion-warped features, solely based on the residual errors. Leveraging the informative yet readily available signals from compressed streams, we propagate the latent features through our Motion Adaptive Pose Net efficiently. Our model outperforms the state-of-the-art models in pose- estimation accuracy on two widely used datasets with only around half of the computation complexity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_Motion_Adaptive_Pose_Estimation_From_Compressed_Videos_ICCV_2021_paper.pdf", @@ -26689,7 +28497,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Motion_Adaptive_Pose_Estimation_From_Compressed_Videos_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Motion_Adaptive_Pose_Estimation_From_Compressed_Videos_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Zhipeng and Liu,\n Jun and Wang,\n Yao\n},\n title = {\n Motion Adaptive Pose Estimation From Compressed Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11719-11728\n} \n}" }, { "title": "Motion Basis Learning for Unsupervised Deep Homography Estimation With Subspace Projection", @@ -26697,6 +28506,7 
@@ "status": "Poster", "track": "main", "pid": 8332, + "author_site": "Nianjin Ye; Chuan Wang; Haoqiang Fan; Shuaicheng Liu", "author": "Nianjin Ye; Chuan Wang; Haoqiang Fan; Shuaicheng Liu", "abstract": "In this paper, we introduce a new framework for unsupervised deep homography estimation. Our contributions are 3 folds. First, unlike previous methods that regress 4 offsets for a homography, we propose a homography flow representation, which can be estimated by a weighted sum of 8 pre-defined homography flow bases. Second, considering a homography contains 8 Degree-of-Freedoms (DOFs) that is much less than the rank of the network features, we propose a Low Rank Representation (LRR) block that reduces the feature rank, so that features corresponding to the dominant motions are retained while others are rejected. Last, we propose a Feature Identity Loss (FIL) to enforce the learned image feature warp-equivariant, meaning that the result should be identical if the order of warp operation and feature extraction is swapped. With this constraint, the unsupervised optimization is achieved more effectively and more stable features are learned. Extensive experiments are conducted to demonstrate the effectiveness of all the newly proposed components, and results show that our approach outperforms the state-of-the-art on the homography benchmark datasets both qualitatively and quantitatively. 
Code is available at https://github.com/megvii-research/BasesHomo", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_Motion_Basis_Learning_for_Unsupervised_Deep_Homography_Estimation_With_Subspace_ICCV_2021_paper.pdf", @@ -26711,7 +28521,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_Motion_Basis_Learning_for_Unsupervised_Deep_Homography_Estimation_With_Subspace_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_Motion_Basis_Learning_for_Unsupervised_Deep_Homography_Estimation_With_Subspace_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Nianjin and Wang,\n Chuan and Fan,\n Haoqiang and Liu,\n Shuaicheng\n},\n title = {\n Motion Basis Learning for Unsupervised Deep Homography Estimation With Subspace Projection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13117-13125\n} \n}" }, { "title": "Motion Deblurring With Real Events", @@ -26719,10 +28530,11 @@ "status": "Poster", "track": "main", "pid": 7727, + "author_site": "Fang Xu; Lei Yu; Bishan Wang; Wen Yang; Gui-Song Xia; Xu Jia; Zhendong Qiao; Jianzhuang Liu", "author": "Fang Xu; Lei Yu; Bishan Wang; Wen Yang; Gui-Song Xia; Xu Jia; Zhendong Qiao; Jianzhuang Liu", "abstract": "In this paper, we propose an end-to-end learning framework for event-based motion deblurring in a self-supervised manner, where real-world events are exploited to alleviate the performance degradation caused by data inconsistency. To achieve this end, optical flows are predicted from events, with which the blurry consistency and photometric consistency are exploited to enable self-supervision on the deblurring network with real-world data. 
Furthermore, a piece-wise linear motion model is proposed to take into account motion non-linearities and thus leads to an accurate model for the physical formation of motion blurs in the real-world scenario. Extensive evaluation on both synthetic and real motion blur datasets demonstrates that the proposed algorithm bridges the gap between simulated and real-world motion blurs and shows remarkable performance for event-based motion deblurring in real-world scenarios.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Motion_Deblurring_With_Real_Events_ICCV_2021_paper.pdf", - "aff": "School of Electronic Information, Wuhan University; School of Electronic Information, Wuhan University; School of Electronic Information, Wuhan University; School of Electronic Information, Wuhan University; School of Computer Science, Wuhan University; School of Artificial Intelligence, Dalian University of Technology; Noah\u2019s Ark Lab, Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "School of Electronic Information, Wuhan University; School of Electronic Information, Wuhan University; School of Electronic Information, Wuhan University; School of Electronic Information, Wuhan University; School of Computer Science, Wuhan University; School of Artificial Intelligence, Dalian University of Technology; Noah’s Ark Lab, Huawei Technologies; Noah’s Ark Lab, Huawei Technologies", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Xu_Motion_Deblurring_With_ICCV_2021_supplemental.pdf", @@ -26735,14 +28547,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Motion_Deblurring_With_Real_Events_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1;2;2", - "aff_unique_norm": "Wuhan University;Dalian University of Technology;Huawei", - "aff_unique_dep": "School of Electronic Information;School of Artificial Intelligence;Noah\u2019s Ark Lab", + "aff_unique_norm": "Wuhan 
University;Dalian University of Technology;Huawei Technologies", + "aff_unique_dep": "School of Electronic Information;School of Artificial Intelligence;Noah’s Ark Lab", "aff_unique_url": "http://www.whu.edu.cn;http://www.dlut.edu.cn/;https://www.huawei.com", "aff_unique_abbr": "WHU;DUT;Huawei", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Wuhan;Dalian", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Fang and Yu,\n Lei and Wang,\n Bishan and Yang,\n Wen and Xia,\n Gui-Song and Jia,\n Xu and Qiao,\n Zhendong and Liu,\n Jianzhuang\n},\n title = {\n Motion Deblurring With Real Events\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2583-2592\n} \n}" }, { "title": "Motion Guided Attention Fusion To Recognize Interactions From Videos", @@ -26750,6 +28563,7 @@ "status": "Poster", "track": "main", "pid": 7911, + "author_site": "Tae Soo Kim; Jonathan Jones; Gregory D. Hager", "author": "Tae Soo Kim; Jonathan Jones; Gregory D. Hager", "abstract": "We present a dual-pathway approach for recognizing fine-grained interactions from videos. We build on the success of prior dual-stream approaches, but make a distinction between the static and dynamic representations of objects and their interactions explicit by introducing separate motion and object detection pathways. Then, using our new Motion-Guided Attention Fusion module, we fuse the bottom-up features in the motion pathway with features captured from object detections to learn the temporal aspects of an action. We show that our approach can generalize across appearance effectively and recognize actions where an actor interacts with previously unseen objects. 
We validate our approach using the compositional action recognition task from the Something-Something-v2 dataset where we outperform existing state-of-the-art methods. We also show that our method can generalize well to real world tasks by showing state-of-the-art performance on recognizing humans assembling various IKEA furniture on the IKEA-ASM dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Motion_Guided_Attention_Fusion_To_Recognize_Interactions_From_Videos_ICCV_2021_paper.pdf", @@ -26764,7 +28578,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Motion_Guided_Attention_Fusion_To_Recognize_Interactions_From_Videos_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Motion_Guided_Attention_Fusion_To_Recognize_Interactions_From_Videos_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Tae Soo and Jones,\n Jonathan and Hager,\n Gregory D.\n},\n title = {\n Motion Guided Attention Fusion To Recognize Interactions From Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13076-13086\n} \n}" }, { "title": "Motion Guided Region Message Passing for Video Captioning", @@ -26772,6 +28587,7 @@ "status": "Poster", "track": "main", "pid": 4028, + "author_site": "Shaoxiang Chen; Yu-Gang Jiang", "author": "Shaoxiang Chen; Yu-Gang Jiang", "abstract": "Video captioning is an important vision task and has been intensively studied in the computer vision community. Existing methods that utilize the fine-grained spatial information have achieved significant improvements, however, they either rely on costly external object detectors or do not sufficiently model the spatial/temporal relations. 
In this paper, we aim at designing a spatial information extraction and aggregation method for video captioning without the need of external object detectors. For this purpose, we propose a Recurrent Region Attention module to better extract diverse spatial features, and by employing Motion-Guided Cross-frame Message Passing, our model is aware of the temporal structure and able to establish high-order relations among the diverse regions across frames. They jointly encourage information communication and produce compact and powerful video representations. Furthermore, an Adjusted Temporal Graph Decoder is proposed to flexibly update video features and model high-order temporal relations during decoding. Experimental results on three benchmark datasets: MSVD, MSR-VTT, and VATEX demonstrate that our proposed method can outperform state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Motion_Guided_Region_Message_Passing_for_Video_Captioning_ICCV_2021_paper.pdf", @@ -26795,7 +28611,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Shaoxiang and Jiang,\n Yu-Gang\n},\n title = {\n Motion Guided Region Message Passing for Video Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1543-1552\n} \n}" }, { "title": "Motion Prediction Using Trajectory Cues", @@ -26803,6 +28620,7 @@ "status": "Poster", "track": "main", "pid": 4228, + "author_site": "Zhenguang Liu; Pengxiang Su; Shuang Wu; Xuanjing Shen; Haipeng Chen; Yanbin Hao; Meng Wang", "author": "Zhenguang Liu; Pengxiang Su; Shuang Wu; Xuanjing Shen; Haipeng Chen; Yanbin Hao; Meng Wang", "abstract": "Predicting human motion from a historical pose sequence is at the 
core of many applications in computer vision. Current state-of-the-art methods concentrate on learning motion contexts in the pose space, however, the high dimensionality and complex nature of human pose invoke inherent difficulties in extracting such contexts. In this paper, we instead advocate to model motion contexts in the joint trajectory space, as the trajectory of a joint is smooth, vectorial, and gives sufficient information to the model. Moreover, most existing methods consider only the dependencies between skeletal connected joints, disregarding prior knowledge and the hidden connections between geometrically separated joints. Motivated by this, we present a semi-constrained graph to explicitly encode skeletal connections and prior knowledge, while adaptively learn implicit dependencies between joints. We also explore the applications of our approach to a range of objects including human, fish, and mouse. Surprisingly, our method sets the new state-of-the-art performance on 4 different benchmark datasets, a remarkable highlight is that it achieves a 19.1% accuracy improvement over current state-of-the-art in average. 
To facilitate future research, we have released our code at https://github.com/Pose-Group/MPT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Motion_Prediction_Using_Trajectory_Cues_ICCV_2021_paper.pdf", @@ -26821,12 +28639,13 @@ "aff_unique_index": "0;1;2;1;1;3;4", "aff_unique_norm": "Zhejiang University;Jilin University;Nanyang Technological University;University of Science and Technology of China;Hefei University of Technology", "aff_unique_dep": ";;;;", - "aff_unique_url": "https://www.zju.edu.cn;http://www.jlu.edu.cn;https://www.ntu.edu.sg;http://www.ustc.edu.cn;http://www.hfut.edu.cn/", + "aff_unique_url": "https://www.zju.edu.cn;http://www.jlu.edu.cn;https://www.ntu.edu.sg;http://www.ustc.edu.cn;http://www.hfut.edu.cn", "aff_unique_abbr": "ZJU;JLU;NTU;USTC;HUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zhenguang and Su,\n Pengxiang and Wu,\n Shuang and Shen,\n Xuanjing and Chen,\n Haipeng and Hao,\n Yanbin and Wang,\n Meng\n},\n title = {\n Motion Prediction Using Trajectory Cues\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13299-13308\n} \n}" }, { "title": "Motion-Augmented Self-Training for Video Recognition at Smaller Scale", @@ -26834,6 +28653,7 @@ "status": "Poster", "track": "main", "pid": 5769, + "author_site": "Kirill Gavrilyuk; Mihir Jain; Ilia Karmanov; Cees G. M. Snoek", "author": "Kirill Gavrilyuk; Mihir Jain; Ilia Karmanov; Cees G. M. Snoek", "abstract": "The goal of this paper is to self-train a 3D convolutional neural network on an unlabeled video collection for deployment on small-scale video collections. 
As smaller video datasets benefit more from motion than appearance, we strive to train our network using optical flow, but avoid its computation during inference. We propose the first motion-augmented self-training regime, we call MotionFit. We start with supervised training of a motion model on a small, and labeled, video collection. With the motion model we generate pseudo-labels for a large unlabeled video collection, which enables us to transfer knowledge by learning to predict these pseudo-labels with an appearance model. Moreover, we introduce a multi-clip loss as a simple yet efficient way to improve the quality of the pseudo-labeling, even without additional auxiliary tasks. We also take into consideration the temporal granularity of videos during self-training of the appearance model, which was missed in previous works. As a result we obtain a strong motion-augmented representation model suited for video downstream tasks like action recognition and clip retrieval. On small-scale video datasets, MotionFit outperforms alternatives for knowledge transfer by 5%-8%, video-only self-supervision by 1%-7% and semisupervised learning by 9%-18% using the same amount of class labels.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gavrilyuk_Motion-Augmented_Self-Training_for_Video_Recognition_at_Smaller_Scale_ICCV_2021_paper.pdf", @@ -26857,7 +28677,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "Netherlands;United States" + "aff_country_unique": "Netherlands;United States", + "bibtex": "@InProceedings{Gavrilyuk_2021_ICCV,\n \n author = {\n Gavrilyuk,\n Kirill and Jain,\n Mihir and Karmanov,\n Ilia and Snoek,\n Cees G. 
M.\n},\n title = {\n Motion-Augmented Self-Training for Video Recognition at Smaller Scale\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10429-10438\n} \n}" }, { "title": "Motion-Aware Dynamic Architecture for Efficient Frame Interpolation", @@ -26865,6 +28686,7 @@ "status": "Poster", "track": "main", "pid": 10166, + "author_site": "Myungsub Choi; Suyoung Lee; Heewon Kim; Kyoung Mu Lee", "author": "Myungsub Choi; Suyoung Lee; Heewon Kim; Kyoung Mu Lee", "abstract": "Video frame interpolation aims to synthesize accurate intermediate frames given a low-frame-rate video. While the quality of the generated frames is increasingly getting better, state-of-the-art models have become more and more computationally expensive. However, local regions with small or no motion can be easily interpolated with simple models and do not require such heavy compute, whereas some regions may not be correct even after inference through a large model. Thus, we propose an effective framework that assigns varying amounts of computation for different regions. Our dynamic architecture first calculates the approximate motion magnitude to use as a proxy for the difficulty levels for each region, and decides the depth of the model and the scale of the input. Experimental results show that static regions pass through a smaller number of layers, while the regions with larger motion are downscaled for better motion reasoning. 
In doing so, we demonstrate that the proposed framework can significantly reduce the computation cost (FLOPs) while maintaining the performance, often up to 50% when interpolating a 2K resolution video.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Choi_Motion-Aware_Dynamic_Architecture_for_Efficient_Frame_Interpolation_ICCV_2021_paper.pdf", @@ -26879,7 +28701,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choi_Motion-Aware_Dynamic_Architecture_for_Efficient_Frame_Interpolation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choi_Motion-Aware_Dynamic_Architecture_for_Efficient_Frame_Interpolation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Choi_2021_ICCV,\n \n author = {\n Choi,\n Myungsub and Lee,\n Suyoung and Kim,\n Heewon and Lee,\n Kyoung Mu\n},\n title = {\n Motion-Aware Dynamic Architecture for Efficient Frame Interpolation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13839-13848\n} \n}" }, { "title": "Motion-Focused Contrastive Learning of Video Representations", @@ -26887,6 +28710,7 @@ "status": "Poster", "track": "main", "pid": 8998, + "author_site": "Rui Li; Yiheng Zhang; Zhaofan Qiu; Ting Yao; Dong Liu; Tao Mei", "author": "Rui Li; Yiheng Zhang; Zhaofan Qiu; Ting Yao; Dong Liu; Tao Mei", "abstract": "Motion, as the most distinct phenomenon in a video to involve the changes over time, has been unique and critical to the development of video representation learning. In this paper, we ask the question: how important is the motion particularly for self-supervised video representation learning. To this end, we compose a duet of exploiting the motion for data augmentation and feature learning in the regime of contrastive learning. 
Specifically, we present a Motion-focused Contrastive Learning (MCL) method that regards such duet as the foundation. On one hand, MCL capitalizes on optical flow of each frame in a video to temporally and spatially sample the tubelets (i.e., sequences of associated frame patches across time) as data augmentations. On the other hand, MCL further aligns gradient maps of the convolutional layers to optical flow maps from spatial, temporal and spatio-temporal perspectives, in order to ground motion information in feature learning. Extensive experiments conducted on R(2+1)D backbone demonstrate the effectiveness of our MCL. On UCF101, the linear classifier trained on the representations learnt by MCL achieves 81.91% top-1 accuracy, outperforming ImageNet supervised pre-training by 6.78%. On Kinetics-400, MCL achieves 66.62% top-1 accuracy under the linear protocol.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Motion-Focused_Contrastive_Learning_of_Video_Representations_ICCV_2021_paper.pdf", @@ -26903,14 +28727,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Motion-Focused_Contrastive_Learning_of_Video_Representations_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;1;1;0+1;1", - "aff_unique_norm": "University of Science and Technology of China;JD", - "aff_unique_dep": ";JD AI Research", + "aff_unique_norm": "University of Science and Technology of China;JD AI Research", + "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;", "aff_unique_abbr": "USTC;", "aff_campus_unique_index": "0+1;1;1;1;0+1;1", "aff_campus_unique": "Hefei;Beijing", "aff_country_unique_index": "0+0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Rui and Zhang,\n Yiheng and Qiu,\n Zhaofan and Yao,\n Ting and Liu,\n Dong and Mei,\n Tao\n},\n title = {\n Motion-Focused Contrastive Learning of Video Representations\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2105-2114\n} \n}" }, { "title": "Move2Hear: Active Audio-Visual Source Separation", @@ -26918,6 +28743,7 @@ "status": "Poster", "track": "main", "pid": 3843, + "author_site": "Sagnik Majumder; Ziad Al-Halah; Kristen Grauman", "author": "Sagnik Majumder; Ziad Al-Halah; Kristen Grauman", "abstract": "We introduce the active audio-visual source separation problem, where an agent must move intelligently in order to better isolate the sounds coming from an object of interest in its environment. The agent hears multiple audio sources simultaneously (e.g., a person speaking down the hall in a noisy household) and it must use its eyes and ears to automatically separate out the sounds originating from a target object within a limited time budget. Towards this goal, we introduce a reinforcement learning approach that trains movement policies controlling the agent's camera and microphone placement over time, guided by the improvement in predicted audio separation quality. We demonstrate our approach in scenarios motivated by both augmented reality (system is already co-located with the target object) and mobile robotics (agent begins arbitrarily far from the target object). Using state-of-the-art realistic audio-visual simulations in 3D environments, we demonstrate our model's ability to find minimal movement sequences with maximal payoff for audio source separation. 
Project: http://vision.cs.utexas.edu/projects/move2hear", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Majumder_Move2Hear_Active_Audio-Visual_Source_Separation_ICCV_2021_paper.pdf", @@ -26934,14 +28760,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Majumder_Move2Hear_Active_Audio-Visual_Source_Separation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "University of Texas at Austin;Meta", + "aff_unique_norm": "University of Texas at Austin;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.utexas.edu;https://research.facebook.com", "aff_unique_abbr": "UT Austin;FAIR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Majumder_2021_ICCV,\n \n author = {\n Majumder,\n Sagnik and Al-Halah,\n Ziad and Grauman,\n Kristen\n},\n title = {\n Move2Hear: Active Audio-Visual Source Separation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 275-285\n} \n}" }, { "title": "Multi-Anchor Active Domain Adaptation for Semantic Segmentation", @@ -26949,6 +28776,7 @@ "status": "Poster", "track": "main", "pid": 8161, + "author_site": "Munan Ning; Donghuan Lu; Dong Wei; Cheng Bian; Chenglang Yuan; Shuang Yu; Kai Ma; Yefeng Zheng", "author": "Munan Ning; Donghuan Lu; Dong Wei; Cheng Bian; Chenglang Yuan; Shuang Yu; Kai Ma; Yefeng Zheng", "abstract": "Unsupervised domain adaption has proven to be an effective approach for alleviating the intensive workload of manual annotation by aligning the synthetic source-domain data and the real-world target-domain samples. 
Unfortunately, mapping the target-domain distribution to the source-domain unconditionally may distort the essential structural information of the target-domain data. To this end, we firstly propose to introduce a novel multi-anchor based active learning strategy to assist domain adaptation regarding the semantic segmentation task. By innovatively adopting multiple anchors instead of a single centroid, the source domain can be better characterized as a multimodal distribution, thus more representative and complimentary samples are selected from the target domain. With little workload to manually annotate these active samples, the distortion of the target-domain distribution can be effectively alleviated, resulting in a large performance gain. The multi-anchor strategy is additionally employed to model the target-distribution. By regularizing the latent representation of the unlabeled target samples compact around multiple anchors through a novel soft alignment loss, more precise segmentation can be achieved. 
Extensive experiments are conducted on public datasets to demonstrate that the proposed approach outperforms state-of-the-art methods significantly, along with thorough ablation study to verify the effectiveness of each component.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ning_Multi-Anchor_Active_Domain_Adaptation_for_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -26972,7 +28800,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ning_2021_ICCV,\n \n author = {\n Ning,\n Munan and Lu,\n Donghuan and Wei,\n Dong and Bian,\n Cheng and Yuan,\n Chenglang and Yu,\n Shuang and Ma,\n Kai and Zheng,\n Yefeng\n},\n title = {\n Multi-Anchor Active Domain Adaptation for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9112-9122\n} \n}" }, { "title": "Multi-Class Cell Detection Using Spatial Context Representation", @@ -26980,6 +28809,7 @@ "status": "Poster", "track": "main", "pid": 10587, + "author_site": "Shahira Abousamra; David Belinsky; John Van Arnam; Felicia Allard; Eric Yee; Rajarsi Gupta; Tahsin Kurc; Dimitris Samaras; Joel Saltz; Chao Chen", "author": "Shahira Abousamra; David Belinsky; John Van Arnam; Felicia Allard; Eric Yee; Rajarsi Gupta; Tahsin Kurc; Dimitris Samaras; Joel Saltz; Chao Chen", "abstract": "In digital pathology, both detection and classification of cells are important for automatic diagnostic and prognostic tasks. Classifying cells into subtypes, such as tumor cells, lymphocytes or stromal cells is particularly challenging. Existing methods focus on morphological appearance of individual cells, whereas in practice pathologists often infer cell classes through their spatial context. 
In this paper, we propose a novel method for both detection and classification that explicitly incorporates spatial contextual information. We use the spatial statistical function to describe local density in both a multi-class and a multi-scale manner. Through representation learning and deep clustering techniques, we learn advanced cell representation with both appearance and spatial context. On various benchmarks, our method achieves better performance than state-of-the-arts, especially on the classification task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Abousamra_Multi-Class_Cell_Detection_Using_Spatial_Context_Representation_ICCV_2021_paper.pdf", @@ -27003,7 +28833,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Abousamra_2021_ICCV,\n \n author = {\n Abousamra,\n Shahira and Belinsky,\n David and Van Arnam,\n John and Allard,\n Felicia and Yee,\n Eric and Gupta,\n Rajarsi and Kurc,\n Tahsin and Samaras,\n Dimitris and Saltz,\n Joel and Chen,\n Chao\n},\n title = {\n Multi-Class Cell Detection Using Spatial Context Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4005-4014\n} \n}" }, { "title": "Multi-Class Multi-Instance Count Conditioned Adversarial Image Generation", @@ -27011,6 +28842,7 @@ "status": "Poster", "track": "main", "pid": 4258, + "author_site": "Amrutha Saseendran; Kathrin Skubch; Margret Keuper", "author": "Amrutha Saseendran; Kathrin Skubch; Margret Keuper", "abstract": "Image generation has rapidly evolved in recent years. Modern architectures for adversarial training allow to generate even high resolution images with remarkable quality. 
At the same time, more and more effort is dedicated towards controlling the content of generated images. In this paper, we take one further step in this direction and propose a conditional generative adversarial network (GAN) that generates images with a defined number of objects from given classes. This entails two fundamental abilities (1) being able to generate high-quality images given a complex constraint and (2) being able to count object instances per class in a given image. Our proposed model modularly extends the successful StyleGAN2 architecture with a count-based conditioning as well as with a regression sub-network to count the number of generated objects per class during training. In experiments on three different datasets, we show that the proposed model learns to generate images according to the given multiple-class count condition even in the presence of complex backgrounds. In particular, we propose a new dataset, CityCount, which is derived from the Cityscapes street scenes dataset, to evaluate our approach in a challenging and practically relevant scenario. 
An implementation is available at https://github.com/boschresearch/MCCGAN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Saseendran_Multi-Class_Multi-Instance_Count_Conditioned_Adversarial_Image_Generation_ICCV_2021_paper.pdf", @@ -27034,7 +28866,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Saseendran_2021_ICCV,\n \n author = {\n Saseendran,\n Amrutha and Skubch,\n Kathrin and Keuper,\n Margret\n},\n title = {\n Multi-Class Multi-Instance Count Conditioned Adversarial Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6762-6771\n} \n}" }, { "title": "Multi-Echo LiDAR for 3D Object Detection", @@ -27042,6 +28875,7 @@ "status": "Poster", "track": "main", "pid": 6294, + "author_site": "Yunze Man; Xinshuo Weng; Prasanna Kumar Sivakumar; Matthew O'Toole; Kris M. Kitani", "author": "Yunze Man; Xinshuo Weng; Prasanna Kumar Sivakumar; Matthew O'Toole; Kris M. Kitani", "abstract": "LiDAR sensors can be used to obtain a wide range of measurement signals other than a simple 3D point cloud, and those signals can be leveraged to improve perception tasks like 3D object detection. A single laser pulse can be partially reflected by multiple objects along its path, resulting in multiple measurements called echoes. Multi-echo measurement can provide information about object contours and semi-transparent surfaces which can be used to better identify and locate objects. LiDAR can also measure surface reflectance (intensity of laser pulse return), as well as ambient light of the scene (sunlight reflected by objects). These signals are already available in commercial LiDAR devices but have not been used in most LiDAR-based detection models. 
We present a 3D object detection model which leverages the full spectrum of measurement signals provided by LiDAR. First, we propose a multi-signal fusion (MSF) module to combine (1) the reflectance and ambient features extracted with a 2D CNN, and (2) point cloud features extracted using a 3D graph neural network (GNN). Second, we propose a multi-echo aggregation (MEA) module to combine the information encoded in different set of echo points. Compared with traditional single echo point cloud methods, our proposed multi-signal LiDAR Detector (MSLiD) extracts richer context information from a wider range of sensing measurements and achieves more accurate 3D object detection. Experiments show that by incorporating the multi-modality of LiDAR, our method outperforms the state-of-the-art by up to relatively 9.1%.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Man_Multi-Echo_LiDAR_for_3D_Object_Detection_ICCV_2021_paper.pdf", @@ -27065,7 +28899,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Man_2021_ICCV,\n \n author = {\n Man,\n Yunze and Weng,\n Xinshuo and Sivakumar,\n Prasanna Kumar and O'Toole,\n Matthew and Kitani,\n Kris M.\n},\n title = {\n Multi-Echo LiDAR for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3763-3772\n} \n}" }, { "title": "Multi-Expert Adversarial Attack Detection in Person Re-Identification Using Context Inconsistency", @@ -27073,6 +28908,7 @@ "status": "Poster", "track": "main", "pid": 9788, + "author_site": "Xueping Wang; Shasha Li; Min Liu; Yaonan Wang; Amit K. Roy-Chowdhury", "author": "Xueping Wang; Shasha Li; Min Liu; Yaonan Wang; Amit K. 
Roy-Chowdhury", "abstract": "The success of deep neural networks (DNNs) has promoted the widespread applications of person re-identification (ReID). However, ReID systems inherit the vulnerability of DNNs to malicious attacks of visually inconspicuous adversarial perturbations. Detection of adversarial attacks is, therefore, a fundamental requirement for robust ReID systems. In this work, we propose a Multi-Expert Adversarial Attack Detection (MEAAD) approach to achieve this goal by checking context inconsistency, which is suitable for any DNNs-based ReID systems. Specifically, three kinds of context inconsistencies caused by adversarial attacks are employed to learn a detector for detecting adversarial attacks, i.e., a) the embedding distances between a perturbed query person image and its top-K retrievals are generally larger than those between a benign query image and its top-K retrievals, b) the embedding distances among the top-K retrievals of a perturbed query image are larger than those of a benign query image, c) the top-K retrievals of a benign query image obtained with multiple expert ReID models tend to be consistent, which is not preserved when attacks are present. 
Extensive experiments on the Market1501 and DukeMTMC-ReID datasets show that, as the first adversarial attack detection approach for ReID, MEAAD effectively detects various adversarial attacks and achieves high ROC-AUC (over 97.5%).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Multi-Expert_Adversarial_Attack_Detection_in_Person_Re-Identification_Using_Context_Inconsistency_ICCV_2021_paper.pdf", @@ -27096,7 +28932,8 @@ "aff_campus_unique_index": ";1;;;1", "aff_campus_unique": ";Riverside", "aff_country_unique_index": "0+0;1;0+0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Xueping and Li,\n Shasha and Liu,\n Min and Wang,\n Yaonan and Roy-Chowdhury,\n Amit K.\n},\n title = {\n Multi-Expert Adversarial Attack Detection in Person Re-Identification Using Context Inconsistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15097-15107\n} \n}" }, { "title": "Multi-Instance Pose Networks: Rethinking Top-Down Pose Estimation", @@ -27104,6 +28941,7 @@ "status": "Poster", "track": "main", "pid": 6754, + "author_site": "Rawal Khirodkar; Visesh Chari; Amit Agrawal; Ambrish Tyagi", "author": "Rawal Khirodkar; Visesh Chari; Amit Agrawal; Ambrish Tyagi", "abstract": "A key assumption of top-down human pose estimation approaches is their expectation of having a single person/instance present in the input bounding box. This often leads to failures in crowded scenes with occlusions. We propose a novel solution to overcome the limitations of this fundamental assumption. Our Multi-Instance Pose Network (MIPNet) allows for predicting multiple 2D pose instances within a given bounding box. 
We introduce a Multi-Instance Modulation Block (MIMB) that can adaptively modulate channel-wise feature responses for each instance and is parameter efficient. We demonstrate the efficacy of our approach by evaluating on COCO, CrowdPose, and OCHuman datasets. Specifically, we achieve 70.0 AP on CrowdPose and 42.5 AP on OCHuman test sets, a significant improvement of 2.4 AP and 6.5 AP over the prior art, respectively. When using ground truth bounding boxes for inference, MIPNet achieves an improvement of 0.7 AP on COCO, 0.9 AP on CrowdPose, and 9.1 AP on OCHuman validation sets compared to HRNet. Interestingly, when fewer, high confidence bounding boxes are used, HRNet's performance degrades (by 5 AP) on OCHuman, whereas MIPNet maintains a relatively stable performance (drop of 1 AP) for the same inputs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Khirodkar_Multi-Instance_Pose_Networks_Rethinking_Top-Down_Pose_Estimation_ICCV_2021_paper.pdf", @@ -27121,13 +28959,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Khirodkar_Multi-Instance_Pose_Networks_Rethinking_Top-Down_Pose_Estimation_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;1;1", "aff_unique_norm": "Carnegie Mellon University;Amazon;Waymo", - "aff_unique_dep": ";Amazon;", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.amazon.com;https://www.waymo.com", "aff_unique_abbr": "CMU;Amazon;Waymo", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Lab 126", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khirodkar_2021_ICCV,\n \n author = {\n Khirodkar,\n Rawal and Chari,\n Visesh and Agrawal,\n Amit and Tyagi,\n Ambrish\n},\n title = {\n Multi-Instance Pose Networks: Rethinking Top-Down Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2021\n},\n pages = {\n 3122-3131\n} \n}" }, { "title": "Multi-Level Curriculum for Training a Distortion-Aware Barrel Distortion Rectification Model", @@ -27135,6 +28974,7 @@ "status": "Poster", "track": "main", "pid": 2566, + "author_site": "Kang Liao; Chunyu Lin; Lixin Liao; Yao Zhao; Weiyao Lin", "author": "Kang Liao; Chunyu Lin; Lixin Liao; Yao Zhao; Weiyao Lin", "abstract": "Barrel distortion rectification aims at removing the radial distortion in a distorted image captured by a wide-angle lens. Previous deep learning methods mainly solve this problem by learning the implicit distortion parameters or the nonlinear rectified mapping function in a direct manner. However, this type of manner results in an indistinct learning process of rectification and thus limits the deep perception of distortion. In this paper, inspired by the curriculum learning, we analyze the barrel distortion rectification task in a progressive and meaningful manner. By considering the relationship among different construction levels in an image, we design a multi-level curriculum that disassembles the rectification task into three levels, structure recovery, semantics embedding, and texture rendering. With the guidance of the curriculum that corresponds to the construction of images, the proposed hierarchical architecture enables a progressive rectification and achieves more accurate results. Moreover, we present a novel distortion-aware pre-training strategy to facilitate the initial learning of neural networks, promoting the model to converge faster and better. 
Experimental results on the synthesized and real-world distorted image datasets show that the proposed approach significantly outperforms other learning methods, both qualitatively and quantitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liao_Multi-Level_Curriculum_for_Training_a_Distortion-Aware_Barrel_Distortion_Rectification_Model_ICCV_2021_paper.pdf", @@ -27151,14 +28991,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liao_Multi-Level_Curriculum_for_Training_a_Distortion-Aware_Barrel_Distortion_Rectification_Model_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1;2", - "aff_unique_norm": "Beijing Jiao Tong University;Beijing Key Laboratory of Advanced Information Science and Network;Shanghai Jiao Tong University", + "aff_unique_norm": "Beijing Jiaotong University;Beijing Key Laboratory of Advanced Information Science and Network;Shanghai Jiaotong University", "aff_unique_dep": "Institute of Information Science;Advanced Information Science and Network;Department of Electronic Engineering", "aff_unique_url": "http://www.bjtu.edu.cn;;https://www.sjtu.edu.cn", "aff_unique_abbr": "BJTU;;SJTU", - "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Beijing;", + "aff_campus_unique_index": ";;;", + "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liao_2021_ICCV,\n \n author = {\n Liao,\n Kang and Lin,\n Chunyu and Liao,\n Lixin and Zhao,\n Yao and Lin,\n Weiyao\n},\n title = {\n Multi-Level Curriculum for Training a Distortion-Aware Barrel Distortion Rectification Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4389-4398\n} \n}" }, { "title": "Multi-Modal Multi-Action Video Recognition", @@ -27166,6 +29007,7 @@ "status": "Poster", "track": "main", 
"pid": 7363, + "author_site": "Zhensheng Shi; Ju Liang; Qianqian Li; Haiyong Zheng; Zhaorui Gu; Junyu Dong; Bing Zheng", "author": "Zhensheng Shi; Ju Liang; Qianqian Li; Haiyong Zheng; Zhaorui Gu; Junyu Dong; Bing Zheng", "abstract": "Multi-action video recognition is much more challenging due to the requirement to recognize multiple actions co-occurring simultaneously or sequentially. Modeling multi-action relations is beneficial and crucial to understand videos with multiple actions, and actions in a video are usually presented in multiple modalities. In this paper, we propose a novel multi-action relation model for videos, by leveraging both relational graph convolutional networks (GCNs) and video multi-modality. We first build multi-modal GCNs to explore modality-aware multi-action relations, fed by modality-specific action representation as node features, e.g., spatiotemporal features learned by 3D convolutional neural network (CNN), audio and textual embeddings queried from respective feature lexicons. We then joint both multi-modal CNN-GCN models and multi-modal feature representations for learning better relational action predictions. Ablation study, multi-action relation visualization, and boosts analysis, all show efficacy of our multi-modal multi-action relation modeling. Also our method achieves state-of-the-art performance on large-scale multi-action M-MiT benchmark. 
Our code is made publicly available at https://github.com/zhenglab/multi-action-video.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shi_Multi-Modal_Multi-Action_Video_Recognition_ICCV_2021_paper.pdf", @@ -27189,7 +29031,8 @@ "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Sanya", "aff_country_unique_index": "0+0;0;0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2021_ICCV,\n \n author = {\n Shi,\n Zhensheng and Liang,\n Ju and Li,\n Qianqian and Zheng,\n Haiyong and Gu,\n Zhaorui and Dong,\n Junyu and Zheng,\n Bing\n},\n title = {\n Multi-Modal Multi-Action Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13678-13687\n} \n}" }, { "title": "Multi-Modality Associative Bridging Through Memory: Speech Sound Recollected From Face Video", @@ -27197,6 +29040,7 @@ "status": "Poster", "track": "main", "pid": 11373, + "author_site": "Minsu Kim; Joanna Hong; Se Jin Park; Yong Man Ro", "author": "Minsu Kim; Joanna Hong; Se Jin Park; Yong Man Ro", "abstract": "In this paper, we introduce a novel audio-visual multi-modal bridging framework that can utilize both audio and visual information, even with uni-modal inputs. We exploit a memory network that stores source (i.e., visual) and target (i.e., audio) modal representations, where source modal representation is what we are given, and target modal representations are what we want to obtain from the memory network. We then construct an associative bridge between source and target memories that considers the interrelationship between the two memories. 
By learning the interrelationship through the associative bridge, the proposed bridging framework is able to obtain the target modal representations inside the memory network, even with the source modal input only, and it provides rich information for its downstream tasks. We apply the proposed framework to two tasks: lip reading and speech reconstruction from silent video. Through the proposed associative bridge and modality-specific memories, each task knowledge is enriched with the recalled audio context, achieving state-of-the-art performance. We also verify that the associative bridge properly relates the source and target memories.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Multi-Modality_Associative_Bridging_Through_Memory_Speech_Sound_Recollected_From_Face_ICCV_2021_paper.pdf", @@ -27220,7 +29064,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Minsu and Hong,\n Joanna and Park,\n Se Jin and Ro,\n Yong Man\n},\n title = {\n Multi-Modality Associative Bridging Through Memory: Speech Sound Recollected From Face Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 296-306\n} \n}" }, { "title": "Multi-Scale Matching Networks for Semantic Correspondence", @@ -27228,6 +29073,7 @@ "status": "Poster", "track": "main", "pid": 3552, + "author_site": "Dongyang Zhao; Ziyang Song; Zhenghao Ji; Gangming Zhao; Weifeng Ge; Yizhou Yu", "author": "Dongyang Zhao; Ziyang Song; Zhenghao Ji; Gangming Zhao; Weifeng Ge; Yizhou Yu", "abstract": "Deep features have been proven powerful in building accurate dense semantic correspondences in various previous works. 
However, the multi-scale and pyramidal hierarchy of convolutional neural networks has not been well studied to learn discriminative pixel-level features for semantic correspondence. In this paper, we propose a multiscale matching network that is sensitive to tiny semantic differences between neighboring pixels. We follow the coarse-to-fine matching strategy, and build a top-down feature and matching enhancement scheme that is coupled with the multi-scale hierarchy of deep convolutional neural networks. During feature enhancement, intra-scale enhancement fuses same-resolution feature maps from multiple layers together via local self-attention, and cross-scale enhancement hallucinates higher resolution feature maps along the top-down hierarchy. Besides, we learn complementary matching details at different scales, and thus the overall matching score is refined by features at different semantic levels gradually. Our multi-scale matching network can be trained end-to-end easily with few additional learnable parameters. 
Experimental results demonstrate the proposed method achieves state-of-the-art performance on three popular benchmarks with high computational efficiency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Multi-Scale_Matching_Networks_for_Semantic_Correspondence_ICCV_2021_paper.pdf", @@ -27244,14 +29090,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Multi-Scale_Matching_Networks_for_Semantic_Correspondence_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;2;0+1;2", - "aff_unique_norm": "Fudan University;Shanghai Key Lab of Intelligent Information Processing;University of Hong Kong", + "aff_unique_norm": "Fudan University;Shanghai Key Lab of Intelligent Information Processing;The University of Hong Kong", "aff_unique_dep": "School of Computer Science;Intelligent Information Processing;Department of Computer Science", "aff_unique_url": "https://www.fudan.edu.cn;;https://www.hku.hk", "aff_unique_abbr": "Fudan;;HKU", "aff_campus_unique_index": ";1;;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Dongyang and Song,\n Ziyang and Ji,\n Zhenghao and Zhao,\n Gangming and Ge,\n Weifeng and Yu,\n Yizhou\n},\n title = {\n Multi-Scale Matching Networks for Semantic Correspondence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3354-3364\n} \n}" }, { "title": "Multi-Scale Separable Network for Ultra-High-Definition Video Deblurring", @@ -27259,10 +29106,11 @@ "status": "Poster", "track": "main", "pid": 3233, + "author_site": "Senyou Deng; Wenqi Ren; Yanyang Yan; Tao Wang; Fenglong Song; Xiaochun Cao", "author": "Senyou Deng; Wenqi Ren; Yanyang Yan; Tao Wang; Fenglong Song; Xiaochun Cao", "abstract": "Although recent research has 
witnessed a significant progress on the video deblurring task, these methods struggle to reconcile inference efficiency and visual quality simultaneously, especially on ultra-high-definition (UHD) videos (e.g., 4K resolution). To address the problem, we propose a novel deep model for fast and accurate UHD Video Deblurring (UHDVD). The proposed UHDVD is achieved by a separable-patch architecture, which collaborates with a multi-scale integration scheme to achieve a large receptive field without adding the number of generic convolutional layers and kernels. Additionally, we design a residual channel-spatial attention (RCSA) module to improve accuracy and reduce the depth of the network appropriately. The proposed UHDVD is the first real-time deblurring model for 4K videos at 35 fps. To train the proposed model, we build a new dataset comprised of 4K blurry videos and corresponding sharp frames using three different smartphones. Comprehensive experimental results show that our network performs favorably against the state-ofthe-art methods on both the 4K dataset and public benchmarks in terms of accuracy, speed, and model size.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Deng_Multi-Scale_Separable_Network_for_Ultra-High-Definition_Video_Deblurring_ICCV_2021_paper.pdf", - "aff": "Institute of Information Engineering, Chinese Academy of Sciences; Institute of Information Engineering, Chinese Academy of Sciences + State Key Lab. for Novel Software Technology, Nanjing University; Institute of Information Engineering, Chinese Academy of Sciences; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Institute of Information Engineering, Chinese Academy of Sciences", + "aff": "Institute of Information Engineering, Chinese Academy of Sciences; Institute of Information Engineering, Chinese Academy of Sciences + State Key Lab. 
for Novel Software Technology, Nanjing University; Institute of Information Engineering, Chinese Academy of Sciences; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Institute of Information Engineering, Chinese Academy of Sciences", "project": "", "github": "", "supp": "", @@ -27276,13 +29124,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Deng_Multi-Scale_Separable_Network_for_Ultra-High-Definition_Video_Deblurring_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;0;2;2;0", "aff_unique_norm": "Chinese Academy of Sciences;Nanjing University;Huawei", - "aff_unique_dep": "Institute of Information Engineering;State Key Lab. for Novel Software Technology;Noah\u2019s Ark Lab", + "aff_unique_dep": "Institute of Information Engineering;State Key Lab. for Novel Software Technology;Noah’s Ark Lab", "aff_unique_url": "http://www.cas.cn;http://www.nju.edu.cn;https://www.huawei.com", "aff_unique_abbr": "CAS;Nanjing U;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Deng_2021_ICCV,\n \n author = {\n Deng,\n Senyou and Ren,\n Wenqi and Yan,\n Yanyang and Wang,\n Tao and Song,\n Fenglong and Cao,\n Xiaochun\n},\n title = {\n Multi-Scale Separable Network for Ultra-High-Definition Video Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14030-14039\n} \n}" }, { "title": "Multi-Scale Vision Longformer: A New Vision Transformer for High-Resolution Image Encoding", @@ -27290,6 +29139,7 @@ "status": "Poster", "track": "main", "pid": 11316, + "author_site": "Pengchuan Zhang; Xiyang Dai; Jianwei Yang; Bin Xiao; Lu Yuan; Lei Zhang; Jianfeng Gao", "author": "Pengchuan Zhang; Xiyang Dai; Jianwei Yang; Bin Xiao; Lu Yuan; Lei Zhang; Jianfeng Gao", "abstract": "This paper presents a new Vision 
Transformer (ViT) architecture Multi-Scale Vision Longformer, which significantly enhances the ViT of [??] for encoding high-resolution images using two techniques. The first is the multi-scale model structure, which provides image encodings at multiple scales with manageable computational cost. The second is the attention mechanism of Vision Longformer, which is a variant of Longformer [??], originally developed for natural language processing, and achieves a linear complexity w.r.t. the number of input tokens. A comprehensive empirical study shows that the new ViT significantly outperforms several strong baselines, including the existing ViT models and their ResNet counterparts, and the Pyramid Vision Transformer from a concurrent work [??], on a range of vision tasks, including image classification, object detection, and segmentation. The models and source code are released at https://github.com/microsoft/vision-longformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Multi-Scale_Vision_Longformer_A_New_Vision_Transformer_for_High-Resolution_Image_ICCV_2021_paper.pdf", @@ -27306,14 +29156,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Multi-Scale_Vision_Longformer_A_New_Vision_Transformer_for_High-Resolution_Image_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1;0", - "aff_unique_norm": "Microsoft;International Digital Economy Academy", - "aff_unique_dep": "Microsoft Corporation;", + "aff_unique_norm": "Microsoft Corporation;International Digital Economy Academy", + "aff_unique_dep": ";", "aff_unique_url": "https://www.microsoft.com;", "aff_unique_abbr": "Microsoft;IDEA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Pengchuan and Dai,\n Xiyang and Yang,\n Jianwei and Xiao,\n Bin and 
Yuan,\n Lu and Zhang,\n Lei and Gao,\n Jianfeng\n},\n title = {\n Multi-Scale Vision Longformer: A New Vision Transformer for High-Resolution Image Encoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2998-3008\n} \n}" }, { "title": "Multi-Source Domain Adaptation for Object Detection", @@ -27321,6 +29172,7 @@ "status": "Poster", "track": "main", "pid": 5714, + "author_site": "Xingxu Yao; Sicheng Zhao; Pengfei Xu; Jufeng Yang", "author": "Xingxu Yao; Sicheng Zhao; Pengfei Xu; Jufeng Yang", "abstract": "To reduce annotation labor associated with object detection, an increasing number of studies focus on transferring the learned knowledge from a labeled source domain to another unlabeled target domain. However, existing methods assume that the labeled data are sampled from a single source domain, which ignores a more generalized scenario, where labeled data are from multiple source domains. For the more challenging task, we propose a unified Faster RCNN based framework, termed Divide-and-Merge Spindle Network (DMSN), which can simultaneously enhance domain invariance and preserve discriminative power. Specifically, the framework contains multiple source subnets and a pseudo target subnet. First, we propose a hierarchical feature alignment strategy to conduct strong and weak alignments for low- and high-level features, respectively, considering their different effects for object detection. Second, we develop a novel pseudo subnet learning algorithm to approximate optimal parameters of pseudo target subset by weighted combination of parameters in different source subnets. Finally, a consistency regularization for region proposal network is proposed to facilitate each subnet to learn more abstract invariances. 
Extensive experiments on different adaptation scenarios demonstrate the effectiveness of the proposed model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yao_Multi-Source_Domain_Adaptation_for_Object_Detection_ICCV_2021_paper.pdf", @@ -27344,7 +29196,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yao_2021_ICCV,\n \n author = {\n Yao,\n Xingxu and Zhao,\n Sicheng and Xu,\n Pengfei and Yang,\n Jufeng\n},\n title = {\n Multi-Source Domain Adaptation for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3273-3282\n} \n}" }, { "title": "Multi-Target Adversarial Frameworks for Domain Adaptation in Semantic Segmentation", @@ -27352,7 +29205,8 @@ "status": "Poster", "track": "main", "pid": 2624, - "author": "Antoine Saporta; Tuan-Hung Vu; Matthieu Cord; Patrick P\u00e9rez", + "author_site": "Antoine Saporta; Tuan-Hung Vu; Matthieu Cord; Patrick Pérez", + "author": "Antoine Saporta; Tuan-Hung Vu; Matthieu Cord; Patrick Pérez", "abstract": "In this work, we address the task of unsupervised domain adaptation (UDA) for semantic segmentation in presence of multiple target domains: the objective is to train a single model that can handle all these domains at test time. Such a multi-target adaptation is crucial for a variety of scenarios that real-world autonomous systems must handle. It is a challenging set-up since one faces not only the domain gap between the labeled source set and the unlabeled target set, but also the distribution shifts existing within the latter among the different target domains. 
To this end, we introduce two adversarial frameworks: (i) multi-discriminator, which explicitly aligns each target domain to its counterparts, and (ii) multi-target knowledge transfer, which learns a target-agnostic model thanks to a multi-teacher/single-student distillation mechanism. The evaluation is done on four newly proposed multi-target benchmarks for UDA in semantic segmentation. In all tested scenarios, our approaches consistently outperform baselines, setting competitive standards for the novel task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Saporta_Multi-Target_Adversarial_Frameworks_for_Domain_Adaptation_in_Semantic_Segmentation_ICCV_2021_paper.pdf", "aff": "Sorbonne University+Valeo.ai; Valeo.ai; Sorbonne University+Valeo.ai; Valeo.ai", @@ -27375,7 +29229,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Saporta_2021_ICCV,\n \n author = {\n Saporta,\n Antoine and Vu,\n Tuan-Hung and Cord,\n Matthieu and P\\'erez,\n Patrick\n},\n title = {\n Multi-Target Adversarial Frameworks for Domain Adaptation in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9072-9081\n} \n}" }, { "title": "Multi-Task Self-Training for Learning General Representations", @@ -27383,6 +29238,7 @@ "status": "Poster", "track": "main", "pid": 9639, + "author_site": "Golnaz Ghiasi; Barret Zoph; Ekin D. Cubuk; Quoc V. Le; Tsung-Yi Lin", "author": "Golnaz Ghiasi; Barret Zoph; Ekin D. Cubuk; Quoc V. Le; Tsung-Yi Lin", "abstract": "Despite the fast progress in training specialized models for various tasks, learning a single general model that works well for many tasks is still challenging for computer vision. 
Here we introduce multi-task self-training (MuST), which harnesses the knowledge in independent specialized teacher models (e.g., ImageNet model on classification) to train a single general student model. Our approach has three steps. First, we train specialized teachers independently on labeled datasets. We then use the specialized teachers to label an unlabeled dataset to create a multi-task pseudo labeled dataset. Finally, the dataset, which now contains pseudo labels from teacher models trained on different datasets/tasks, is then used to train a student model with multi-task learning. We evaluate the feature representations of the student model on 6 vision tasks including image recognition (classification, detection, segmentation) and 3D geometry estimation (depth and surface normal estimation). MuST is scalable with unlabeled or partially labeled datasets and outperforms both specialized supervised models and self-supervised models when training on large scale datasets. Lastly, we show MuST can improve upon already strong checkpoints trained with billions of examples. The results suggest self-training is a promising direction to aggregate labeled and unlabeled training data for learning general feature representations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ghiasi_Multi-Task_Self-Training_for_Learning_General_Representations_ICCV_2021_paper.pdf", @@ -27406,7 +29262,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ghiasi_2021_ICCV,\n \n author = {\n Ghiasi,\n Golnaz and Zoph,\n Barret and Cubuk,\n Ekin D. and Le,\n Quoc V. 
and Lin,\n Tsung-Yi\n},\n title = {\n Multi-Task Self-Training for Learning General Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8856-8865\n} \n}" }, { "title": "Multi-VAE: Learning Disentangled View-Common and View-Peculiar Visual Representations for Multi-View Clustering", @@ -27414,6 +29271,7 @@ "status": "Poster", "track": "main", "pid": 4263, + "author_site": "Jie Xu; Yazhou Ren; Huayi Tang; Xiaorong Pu; Xiaofeng Zhu; Ming Zeng; Lifang He", "author": "Jie Xu; Yazhou Ren; Huayi Tang; Xiaorong Pu; Xiaofeng Zhu; Ming Zeng; Lifang He", "abstract": "Multi-view clustering, a long-standing and important research problem, focuses on mining complementary information from diverse views. However, existing works often fuse multiple views' representations or handle clustering in a common feature space, which may result in their entanglement especially for visual representations. To address this issue, we present a novel VAE-based multi-view clustering framework (Multi-VAE) by learning disentangled visual representations. Concretely, we define a view-common variable and multiple view-peculiar variables in the generative model. The prior of view-common variable obeys approximately discrete Gumbel Softmax distribution, which is introduced to extract the common cluster factor of multiple views. Meanwhile, the prior of view-peculiar variable follows continuous Gaussian distribution, which is used to represent each view's peculiar visual factors. By controlling the mutual information capacity to disentangle the view-common and view-peculiar representations, continuous visual information of multiple views can be separated so that their common discrete cluster information can be effectively mined. 
Experimental results demonstrate that Multi-VAE enjoys the disentangled and explainable visual representations, while obtaining superior clustering performance compared with state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Multi-VAE_Learning_Disentangled_View-Common_and_View-Peculiar_Visual_Representations_for_Multi-View_ICCV_2021_paper.pdf", @@ -27437,7 +29295,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Jie and Ren,\n Yazhou and Tang,\n Huayi and Pu,\n Xiaorong and Zhu,\n Xiaofeng and Zeng,\n Ming and He,\n Lifang\n},\n title = {\n Multi-VAE: Learning Disentangled View-Common and View-Peculiar Visual Representations for Multi-View Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9234-9243\n} \n}" }, { "title": "Multi-View 3D Reconstruction With Transformers", @@ -27445,6 +29304,7 @@ "status": "Poster", "track": "main", "pid": 3945, + "author_site": "Dan Wang; Xinrui Cui; Xun Chen; Zhengxia Zou; Tianyang Shi; Septimiu Salcudean; Z. Jane Wang; Rabab Ward", "author": "Dan Wang; Xinrui Cui; Xun Chen; Zhengxia Zou; Tianyang Shi; Septimiu Salcudean; Z. Jane Wang; Rabab Ward", "abstract": "Deep CNN-based methods have so far achieved the state of the art results in multi-view 3D object reconstruction. Despite the considerable progress, the two core modules of these methods - view feature extraction and multi-view fusion, are usually investigated separately, and the relations among multiple input views are rarely explored. 
Inspired by the recent great success in Transformer models, we reformulate the multi-view 3D reconstruction as a sequence-to-sequence prediction problem and propose a framework named 3D Volume Transformer. Unlike previous CNN-based methods using a separate design, we unify the feature extraction and view fusion in a single Transformer network. A natural advantage of our design lies in the exploration of view-to-view relationships using self-attention among multiple unordered inputs. On ShapeNet - a large-scale 3D reconstruction benchmark, our method achieves a new state-of-the-art accuracy in multi-view reconstruction with fewer parameters (70% less) than CNN-based methods. Experimental results also suggest the strong scaling capability of our method. Our code will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Multi-View_3D_Reconstruction_With_Transformers_ICCV_2021_paper.pdf", @@ -27461,14 +29321,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Multi-View_3D_Reconstruction_With_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;2;3;0;0;0", - "aff_unique_norm": "University of British Columbia;University of Science and Technology of China;University of Michigan;Netease", + "aff_unique_norm": "University of British Columbia;University of Science and Technology of China;University of Michigan;NetEase", "aff_unique_dep": ";;;Fuxi AI Lab", "aff_unique_url": "https://www.ubc.ca;http://www.ustc.edu.cn;https://www.umich.edu;https://www.163.com", "aff_unique_abbr": "UBC;USTC;UM;NetEase", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;1;1;2;1;0;0;0", - "aff_country_unique": "Canada;China;United States" + "aff_country_unique": "Canada;China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Dan and Cui,\n Xinrui and Chen,\n Xun and Zou,\n Zhengxia and Shi,\n Tianyang and Salcudean,\n Septimiu 
and Wang,\n Z. Jane and Ward,\n Rabab\n},\n title = {\n Multi-View 3D Reconstruction With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5722-5731\n} \n}" }, { "title": "Multi-View Radar Semantic Segmentation", @@ -27476,10 +29337,11 @@ "status": "Poster", "track": "main", "pid": 9024, - "author": "Arthur Ouaknine; Alasdair Newson; Patrick P\u00e9rez; Florence Tupin; Julien Rebut", + "author_site": "Arthur Ouaknine; Alasdair Newson; Patrick Pérez; Florence Tupin; Julien Rebut", + "author": "Arthur Ouaknine; Alasdair Newson; Patrick Pérez; Florence Tupin; Julien Rebut", "abstract": "Understanding the scene around the ego-vehicle is key to assisted and autonomous driving. Nowadays, this is mostly conducted using cameras and laser scanners, despite their reduced performances in adverse weather conditions. Automotive radars are low-cost active sensors that measure properties of surrounding objects, including their relative speed, and have the key advantage of not being impacted by rain, snow or fog. However, they are seldom used for scene understanding due to the size and complexity of radar raw data and the lack of annotated datasets. Fortunately, recent open-sourced datasets have opened up research on classification, object detection and semantic segmentation with raw radar signals using end-to-end trainable models. In this work, we propose several novel architectures, and their associated losses, which analyse multiple \"views\" of the range-angle-Doppler radar tensor to segment it semantically. Experiments conducted on the recent CARRADA dataset demonstrate that our best model outperforms alternative models, derived either from the semantic segmentation of natural images or from radar scene understanding, while requiring significantly fewer parameters. 
Both our code and trained models are available at https://github.com/valeoai/MVRSS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ouaknine_Multi-View_Radar_Semantic_Segmentation_ICCV_2021_paper.pdf", - "aff": "LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris, Palaiseau, France+valeo.ai, Paris, France; LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris, Palaiseau, France; valeo.ai, Paris, France; LTCI, T \u00b4el\u00b4ecom Paris, Institut Polytechnique de Paris, Palaiseau, France; valeo.ai, Paris, France", + "aff": "LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris, Palaiseau, France+valeo.ai, Paris, France; LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris, Palaiseau, France; valeo.ai, Paris, France; LTCI, T ´el´ecom Paris, Institut Polytechnique de Paris, Palaiseau, France; valeo.ai, Paris, France", "project": "", "github": "https://github.com/valeoai/MVRSS", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ouaknine_Multi-View_Radar_Semantic_ICCV_2021_supplemental.pdf", @@ -27492,14 +29354,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ouaknine_Multi-View_Radar_Semantic_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;1;0;1", - "aff_unique_norm": "T\u00e9l\u00e9com Paris;Valeo.ai", + "aff_unique_norm": "Télécom Paris;valeo.ai", "aff_unique_dep": "LTCI;", "aff_unique_url": "https://www.telecom-paris.fr;https://www.valeo.ai", - "aff_unique_abbr": "T\u00e9l\u00e9com Paris;", + "aff_unique_abbr": "Télécom Paris;", "aff_campus_unique_index": "0+0;0;0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Ouaknine_2021_ICCV,\n \n author = {\n Ouaknine,\n Arthur and Newson,\n Alasdair and P\\'erez,\n Patrick and Tupin,\n Florence and Rebut,\n Julien\n},\n title = {\n Multi-View Radar Semantic Segmentation\n},\n booktitle = 
{\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15671-15680\n} \n}" }, { "title": "MultiSiam: Self-Supervised Multi-Instance Siamese Representation Learning for Autonomous Driving", @@ -27507,10 +29370,11 @@ "status": "Poster", "track": "main", "pid": 6427, + "author_site": "Kai Chen; Lanqing Hong; Hang Xu; Zhenguo Li; Dit-Yan Yeung", "author": "Kai Chen; Lanqing Hong; Hang Xu; Zhenguo Li; Dit-Yan Yeung", "abstract": "Autonomous driving has attracted much attention over the years but turns out to be harder than expected, probably due to the difficulty of labeled data collection for model training. Self-supervised learning (SSL), which leverages unlabeled data only for representation learning, might be a promising way to improve model performance. Existing SSL methods, however, usually rely on the single-centric-object guarantee, which may not be applicable for multi-instance datasets such as street scenes. To alleviate this limitation, we raise two issues to solve: (1) how to define positive samples for cross-view consistency and (2) how to measure similarity in multi-instance circumstances. We first adopt an IoU threshold during random cropping to transfer global-inconsistency to local-consistency. Then, we propose two feature alignment methods to enable 2D feature maps for multi-instance similarity measurement. Additionally, we adopt intra-image clustering with self-attention for further mining intra-image similarity and translation-invariance. Experiments show that, when pre-trained on Waymo dataset, our method called Multi-instance Siamese Network (MultiSiam) remarkably improves generalization ability and achieves state-of-the-art transfer performance on autonomous driving benchmarks, including Cityscapes and BDD100K, while existing SSL counterparts like MoCo, MoCo-v2, and BYOL show significant performance drop. 
By pre-training on SODA10M, a large-scale autonomous driving dataset, MultiSiam exceeds the ImageNet pre-trained MoCo-v2, demonstrating the potential of domain-specific pre-training. Code will be available at https://github.com/KaiChen1998/MultiSiam .", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_MultiSiam_Self-Supervised_Multi-Instance_Siamese_Representation_Learning_for_Autonomous_Driving_ICCV_2021_paper.pdf", - "aff": "Hong Kong University of Science and Technology; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Hong Kong University of Science and Technology", + "aff": "Hong Kong University of Science and Technology; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Hong Kong University of Science and Technology", "project": "", "github": "https://github.com/KaiChen1998/MultiSiam", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Chen_MultiSiam_Self-Supervised_Multi-Instance_ICCV_2021_supplemental.pdf", @@ -27524,13 +29388,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_MultiSiam_Self-Supervised_Multi-Instance_Siamese_Representation_Learning_for_Autonomous_Driving_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com", "aff_unique_abbr": "HKUST;Huawei", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Kai and Hong,\n Lanqing and Xu,\n Hang and Li,\n Zhenguo and Yeung,\n Dit-Yan\n},\n title = {\n MultiSiam: Self-Supervised Multi-Instance Siamese Representation Learning for Autonomous Driving\n},\n booktitle = {\n Proceedings of 
the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7546-7554\n} \n}" }, { "title": "MultiSports: A Multi-Person Video Dataset of Spatio-Temporally Localized Sports Actions", @@ -27538,6 +29403,7 @@ "status": "Poster", "track": "main", "pid": 7750, + "author_site": "Yixuan Li; Lei Chen; Runyu He; Zhenzhi Wang; Gangshan Wu; Limin Wang", "author": "Yixuan Li; Lei Chen; Runyu He; Zhenzhi Wang; Gangshan Wu; Limin Wang", "abstract": "Spatio-temporal action detection is an important and challenging problem in video understanding. The existing action detection benchmarks are limited in aspects of small numbers of instances in a trimmed video or low-level atomic actions. This paper aims to present a new multi-person dataset of spatio-temporal localized sports actions, coined as MultiSports. We first analyze the important ingredients of constructing a realistic and challenging dataset for spatio-temporal action detection by proposing three criteria: (1) multi-person scenes and motion dependent identification, (2) with well-defined boundaries, (3) relatively fine-grained classes of high complexity. Based on these guidelines, we build the dataset of MultiSports v1.0 by selecting 4 sports classes, collecting 3200 video clips, and annotating 37701 action instances with 902k bounding boxes. Our datasets are characterized with important properties of high diversity, dense annotation, and high quality. Our MultiSports, with its realistic setting and detailed annotations, exposes the intrinsic challenges of spatio-temporal action detection. To benchmark this, we adapt several baseline methods to our dataset and give an in-depth analysis on the action detection results in our dataset. We hope our MultiSports can serve as a standard benchmark for spatio-temporal action detection in the future. 
Our dataset website is at https://deeperaction.github.io/multisports/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_MultiSports_A_Multi-Person_Video_Dataset_of_Spatio-Temporally_Localized_Sports_Actions_ICCV_2021_paper.pdf", @@ -27561,7 +29427,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yixuan and Chen,\n Lei and He,\n Runyu and Wang,\n Zhenzhi and Wu,\n Gangshan and Wang,\n Limin\n},\n title = {\n MultiSports: A Multi-Person Video Dataset of Spatio-Temporally Localized Sports Actions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13536-13545\n} \n}" }, { "title": "Multimodal Clustering Networks for Self-Supervised Learning From Unlabeled Videos", @@ -27569,6 +29436,7 @@ "status": "Poster", "track": "main", "pid": 2965, + "author_site": "Brian Chen; Andrew Rouditchenko; Kevin Duarte; Hilde Kuehne; Samuel Thomas; Angie Boggust; Rameswar Panda; Brian Kingsbury; Rogerio Feris; David Harwath; James Glass; Michael Picheny; Shih-Fu Chang", "author": "Brian Chen; Andrew Rouditchenko; Kevin Duarte; Hilde Kuehne; Samuel Thomas; Angie Boggust; Rameswar Panda; Brian Kingsbury; Rogerio Feris; David Harwath; James Glass; Michael Picheny; Shih-Fu Chang", "abstract": "Multimodal self-supervised learning is getting more and more attention as it allows not only to train large networks without human supervision but also to search and retrieve data across various modalities. In this context, this paper proposes a framework that, starting from a pre-trained backbone, learns a common multimodal embedding space that, in addition to sharing representations across different modalities, enforces a grouping of semantically similar instances. 
To this end, we extend the concept of instance-level contrastive learning with a multimodal clustering step in the training pipeline to capture semantic similarities across modalities. The resulting embedding space enables retrieval of samples across all modalities, even from unseen datasets and different domains. To evaluate our approach, we train our model on the HowTo100M dataset and evaluate its zero-shot retrieval capabilities in two challenging domains, namely text-to-video retrieval, and temporal action localization, showing state-of-the-art results on four different datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Multimodal_Clustering_Networks_for_Self-Supervised_Learning_From_Unlabeled_Videos_ICCV_2021_paper.pdf", @@ -27585,14 +29453,15 @@ "author_num": 13, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Multimodal_Clustering_Networks_for_Self-Supervised_Learning_From_Unlabeled_Videos_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3+1;4+1;4+1;4+1;4+1;4+1;5;1;6;0", - "aff_unique_norm": "Columbia University;Massachusetts Institute of Technology;University of Central Florida;Goethe University Frankfurt;IBM;University of Texas at Austin;New York University", + "aff_unique_norm": "Columbia University;Massachusetts Institute of Technology;University of Central Florida;Goethe University Frankfurt;IBM Research;University of Texas at Austin;New York University", "aff_unique_dep": ";Computer Science and Artificial Intelligence Laboratory;;;AI;;Computer Science and Data Science", "aff_unique_url": "https://www.columbia.edu;https://www.csail.mit.edu;https://www.ucf.edu;https://www.uni-frankfurt.de;https://www.ibm.com/research;https://www.utexas.edu;https://www.courant.nyu.edu", "aff_unique_abbr": "Columbia;MIT CSAIL;UCF;GU Frankfurt;IBM;UT Austin;NYU", "aff_campus_unique_index": "1;2;;;;;;3;1;4", "aff_campus_unique": ";Cambridge;Frankfurt;Austin;Courant Institute", "aff_country_unique_index": 
"0;0;0;1+0;0+0;0+0;0+0;0+0;0+0;0;0;0;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Brian and Rouditchenko,\n Andrew and Duarte,\n Kevin and Kuehne,\n Hilde and Thomas,\n Samuel and Boggust,\n Angie and Panda,\n Rameswar and Kingsbury,\n Brian and Feris,\n Rogerio and Harwath,\n David and Glass,\n James and Picheny,\n Michael and Chang,\n Shih-Fu\n},\n title = {\n Multimodal Clustering Networks for Self-Supervised Learning From Unlabeled Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8012-8021\n} \n}" }, { "title": "Multimodal Co-Attention Transformer for Survival Prediction in Gigapixel Whole Slide Images", @@ -27600,10 +29469,11 @@ "status": "Poster", "track": "main", "pid": 10972, + "author_site": "Richard J. Chen; Ming Y. Lu; Wei-Hung Weng; Tiffany Y. Chen; Drew F.K. Williamson; Trevor Manz; Maha Shady; Faisal Mahmood", "author": "Richard J. Chen; Ming Y. Lu; Wei-Hung Weng; Tiffany Y. Chen; Drew F.K. Williamson; Trevor Manz; Maha Shady; Faisal Mahmood", "abstract": "Survival outcome prediction is a challenging weakly-supervised and ordinal regression task in computational pathology that involves modeling complex interactions within the tumor microenvironment in gigapixel whole slide images (WSIs). Despite recent progress in formulating WSIs as bags for multiple instance learning (MIL), representation learning of entire WSIs remains an open and challenging problem, especially in overcoming: 1) the computational complexity of feature aggregation in large bags, and 2) the data heterogeneity gap in incorporating biological priors such as genomic measurements. 
In this work, we present a Multimodal Co-Attention Transformer (MCAT) framework that learns an interpretable, dense co-attention mapping between WSIs and genomic features formulated in an embedding space. Inspired by approaches in Visual Question Answering (VQA) that can attribute how word embeddings attend to salient objects in an image when answering a question, MCAT learns how histology patches attend to genes when predicting patient survival. In addition to visualizing multimodal interactions, our co-attention transformation also reduces the space complexity of WSI bags, which enables the adaptation of Transformer layers as a general encoder backbone in MIL. We apply our proposed method on five different cancer datasets (4,730 WSIs, 67 million patches). Our experimental results demonstrate that the proposed method consistently achieves superior performance compared to the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Multimodal_Co-Attention_Transformer_for_Survival_Prediction_in_Gigapixel_Whole_Slide_ICCV_2021_paper.pdf", - "aff": "Department of Pathology, Brigham and Women\u2019s Hospital+Department of Biomedical Informatics, Harvard Medical School+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women\u2019s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Computer Science and Arti\ufb01cial Intelligence Laboratory, MIT; Department of Pathology, Brigham and Women\u2019s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women\u2019s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women\u2019s Hospital+Department of Biomedical 
Informatics, Harvard Medical School; Department of Pathology, Brigham and Women\u2019s Hospital+Department of Biomedical Informatics, Harvard Medical School+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women\u2019s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute", + "aff": "Department of Pathology, Brigham and Women’s Hospital+Department of Biomedical Informatics, Harvard Medical School+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women’s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Computer Science and Artificial Intelligence Laboratory, MIT; Department of Pathology, Brigham and Women’s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women’s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women’s Hospital+Department of Biomedical Informatics, Harvard Medical School; Department of Pathology, Brigham and Women’s Hospital+Department of Biomedical Informatics, Harvard Medical School+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute; Department of Pathology, Brigham and Women’s Hospital+Cancer Program, Broad Institute of Harvard and MIT+Cancer Data Science Program, Dana-Farber Cancer Institute", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Chen_Multimodal_Co-Attention_Transformer_ICCV_2021_supplemental.pdf", @@ -27617,13 +29487,14 @@ "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Multimodal_Co-Attention_Transformer_for_Survival_Prediction_in_Gigapixel_Whole_Slide_ICCV_2021_paper.html", "aff_unique_index": "0+1+2+3;0+2+3;4;0+2+3;0+2+3;0+1;0+1+2+3;0+2+3", "aff_unique_norm": "Brigham and Women's Hospital;Harvard Medical School;Broad Institute of Harvard and MIT;Dana-Farber Cancer Institute;Massachusetts Institute of Technology", - "aff_unique_dep": "Department of Pathology;Department of Biomedical Informatics;Cancer Program;Cancer Data Science Program;Computer Science and Arti\ufb01cial Intelligence Laboratory", + "aff_unique_dep": "Department of Pathology;Department of Biomedical Informatics;Cancer Program;Cancer Data Science Program;Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.brighamandwomens.org;https://hms.harvard.edu;https://www.broadinstitute.org;https://www.dana-farber.org;https://www.csail.mit.edu", "aff_unique_abbr": "BWH;HMS;Broad Institute;Dana-Farber;MIT", "aff_campus_unique_index": "1;;2;;;1;1;", "aff_campus_unique": ";Boston;Cambridge", "aff_country_unique_index": "0+0+0+0;0+0+0;0;0+0+0;0+0+0;0+0;0+0+0+0;0+0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Richard J. and Lu,\n Ming Y. and Weng,\n Wei-Hung and Chen,\n Tiffany Y. and Williamson,\n Drew F.K. 
and Manz,\n Trevor and Shady,\n Maha and Mahmood,\n Faisal\n},\n title = {\n Multimodal Co-Attention Transformer for Survival Prediction in Gigapixel Whole Slide Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4015-4025\n} \n}" }, { "title": "Multimodal Knowledge Expansion", @@ -27631,6 +29502,7 @@ "status": "Poster", "track": "main", "pid": 3771, + "author_site": "Zihui Xue; Sucheng Ren; Zhengqi Gao; Hang Zhao", "author": "Zihui Xue; Sucheng Ren; Zhengqi Gao; Hang Zhao", "abstract": "The popularity of multimodal sensors and the accessibility of the Internet have brought us a massive amount of unlabeled multimodal data. Since existing datasets and well-trained models are primarily unimodal, the modality gap between a unimodal network and unlabeled multimodal data poses an interesting problem: how to transfer a pre-trained unimodal network to perform the same task on unlabeled multimodal data? In this work, we propose multimodal knowledge expansion (MKE), a knowledge distillation-based framework to effectively utilize multimodal data without requiring labels. Opposite to traditional knowledge distillation, where the student is designed to be lightweight and inferior to the teacher, we observe that the multimodal student model consistently rectifies pseudo labels and generalizes better than its teacher. Extensive experiments on four tasks and different modalities verify this finding. 
Furthermore, we connect the mechanism of MKE to semi-supervised learning and offer both empirical and theoretical explanations to understand the expansion capability of a multimodal student.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xue_Multimodal_Knowledge_Expansion_ICCV_2021_paper.pdf", @@ -27654,7 +29526,8 @@ "aff_campus_unique_index": "1;;;", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0+1;0+0;0+1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xue_2021_ICCV,\n \n author = {\n Xue,\n Zihui and Ren,\n Sucheng and Gao,\n Zhengqi and Zhao,\n Hang\n},\n title = {\n Multimodal Knowledge Expansion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 854-863\n} \n}" }, { "title": "Multiple Heads Are Better Than One: Few-Shot Font Generation With Multiple Localized Experts", @@ -27662,6 +29535,7 @@ "status": "Poster", "track": "main", "pid": 1304, + "author_site": "Song Park; Sanghyuk Chun; Junbum Cha; Bado Lee; Hyunjung Shim", "author": "Song Park; Sanghyuk Chun; Junbum Cha; Bado Lee; Hyunjung Shim", "abstract": "A few-shot font generation (FFG) method has to satisfy two objectives: the generated images should preserve the underlying global structure of the target character and present the diverse local reference style. Existing FFG methods aim to disentangle content and style either by extracting a universal representation style or extracting multiple component-wise style representations. However, previous methods either fail to capture diverse local styles or cannot be generalized to a character with unseen components, e.g., unseen language systems. To mitigate the issues, we propose a novel FFG method, named Multiple Localized Experts Few-shot Font Generation Network (MX-Font). 
MX-Font extracts multiple style features not explicitly conditioned on component labels, but automatically by multiple experts to represent different local concepts, e.g., left-side sub-glyph. Owing to the multiple experts, MX-Font can capture diverse local concepts and show the generalizability to unseen languages. During training, we utilize component labels as weak supervision to guide each expert to be specialized for different local concepts. We formulate the component assign problem to each expert as the graph matching problem, and solve it by the Hungarian algorithm. We also employ the independence loss and the content-style adversarial loss to impose the content-style disentanglement. In our experiments, MX-Font outperforms previous state-of-the-art FFG methods in the Chinese generation and cross-lingual, e.g., Chinese to Korean, generation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Multiple_Heads_Are_Better_Than_One_Few-Shot_Font_Generation_With_ICCV_2021_paper.pdf", @@ -27685,7 +29559,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea;" + "aff_country_unique": "South Korea;", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Song and Chun,\n Sanghyuk and Cha,\n Junbum and Lee,\n Bado and Shim,\n Hyunjung\n},\n title = {\n Multiple Heads Are Better Than One: Few-Shot Font Generation With Multiple Localized Experts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13900-13909\n} \n}" }, { "title": "Multiple Pairwise Ranking Networks for Personalized Video Summarization", @@ -27693,6 +29568,7 @@ "status": "Poster", "track": "main", "pid": 7824, + "author_site": "Yassir Saquil; Da Chen; Yuan He; Chuan Li; Yong-Liang Yang", "author": "Yassir Saquil; Da Chen; Yuan He; Chuan Li; Yong-Liang Yang", "abstract": "In this 
paper, we investigate video summarization in the supervised setting. Since video summarization is subjective to the preference of the end-user, the design of a unique model is limited. In this work, we propose a model that provides personalized video summaries by conditioning the summarization process with predefined categorical user labels referred to as preferences. The underlying method is based on multiple pairwise rankers (called Multi-ranker), where the rankers are trained jointly to provide local summaries as well as a global summarization of a given video. In order to demonstrate the relevance and applications of our method in contrast with a classical global summarizer, we conduct experiments on multiple benchmark datasets, notably through a user study and comparisons with the state-of-art methods in the global video summarization task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Saquil_Multiple_Pairwise_Ranking_Networks_for_Personalized_Video_Summarization_ICCV_2021_paper.pdf", @@ -27716,7 +29592,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;2;0", - "aff_country_unique": "United Kingdom;China;Canada" + "aff_country_unique": "United Kingdom;China;Canada", + "bibtex": "@InProceedings{Saquil_2021_ICCV,\n \n author = {\n Saquil,\n Yassir and Chen,\n Da and He,\n Yuan and Li,\n Chuan and Yang,\n Yong-Liang\n},\n title = {\n Multiple Pairwise Ranking Networks for Personalized Video Summarization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1718-1727\n} \n}" }, { "title": "Multiresolution Deep Implicit Functions for 3D Shape Representation", @@ -27724,7 +29601,8 @@ "status": "Poster", "track": "main", "pid": 6006, - "author": "Zhang Chen; Yinda Zhang; Kyle Genova; Sean Fanello; Sofien Bouaziz; Christian H\u00e4ne; Ruofei Du; Cem Keskin; Thomas Funkhouser; Danhang Tang", + 
"author_site": "Zhang Chen; Yinda Zhang; Kyle Genova; Sean Fanello; Sofien Bouaziz; Christian Häne; Ruofei Du; Cem Keskin; Thomas Funkhouser; Danhang Tang", + "author": "Zhang Chen; Yinda Zhang; Kyle Genova; Sean Fanello; Sofien Bouaziz; Christian Häne; Ruofei Du; Cem Keskin; Thomas Funkhouser; Danhang Tang", "abstract": "We introduce Multiresolution Deep Implicit Functions (MDIF), a hierarchical representation that can recover fine geometry detail, while being able to perform global operations such as shape completion. Our model represents a complex 3D shape with a hierarchy of latent grids, which can be decoded into different levels of detail and also achieve better accuracy. For shape completion, we propose latent grid dropout to simulate partial data in the latent space and therefore defer the completing functionality to the decoder side.This along with our multires design significantly improves the shape completion quality under decoder-only latent optimization. To the best of our knowledge, MDIF is the first deep implicit function model that can at the same time (1) represent different levels of detail and allow progressive decoding; (2) support both encoder-decoder inference and decoder-only latent optimization, and fulfill multiple applications; (3) perform detailed decoder-only shape completion. 
Experiments demonstrate its superior performance against prior art in various 3D reconstruction tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Multiresolution_Deep_Implicit_Functions_for_3D_Shape_Representation_ICCV_2021_paper.pdf", "aff": "Google+ShanghaiTech University; Google; Google; Google; Google; Google; Google; Google; Google; Google", @@ -27741,13 +29619,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Multiresolution_Deep_Implicit_Functions_for_3D_Shape_Representation_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Google;ShanghaiTech University", - "aff_unique_dep": "Google;", + "aff_unique_dep": ";", "aff_unique_url": "https://www.google.com;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "Google;ShanghaiTech", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Zhang and Zhang,\n Yinda and Genova,\n Kyle and Fanello,\n Sean and Bouaziz,\n Sofien and H\\\"ane,\n Christian and Du,\n Ruofei and Keskin,\n Cem and Funkhouser,\n Thomas and Tang,\n Danhang\n},\n title = {\n Multiresolution Deep Implicit Functions for 3D Shape Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13087-13096\n} \n}" }, { "title": "Multiscale Vision Transformers", @@ -27755,6 +29634,7 @@ "status": "Poster", "track": "main", "pid": 2065, + "author_site": "Haoqi Fan; Bo Xiong; Karttikeya Mangalam; Yanghao Li; Zhicheng Yan; Jitendra Malik; Christoph Feichtenhofer", "author": "Haoqi Fan; Bo Xiong; Karttikeya Mangalam; Yanghao Li; Zhicheng Yan; Jitendra Malik; Christoph Feichtenhofer", "abstract": "We present 
Multiscale Vision Transformers (MViT) for video and image recognition, by connecting the seminal idea of multiscale feature hierarchies with transformer models. Multiscale Transformers have several channel-resolution scale stages. Starting from the input resolution and a small channel dimension, the stages hierarchically expand the channel capacity while reducing the spatial resolution. This creates a multiscale pyramid of features with early layers operating at high spatial resolution to model simple low-level visual information, and deeper layers at spatially coarse, but complex, high-dimensional features. We evaluate this fundamental architectural prior for modeling the dense nature of visual signals for a variety of video recognition tasks where it outperforms concurrent vision transformers that rely on large scale external pre-training and are 5-10 more costly in computation and parameters. We further remove the temporal dimension and apply our model for image classification where it outperforms prior work on vision transformers. 
Code is available at: https://github.com/facebookresearch/SlowFast.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_Multiscale_Vision_Transformers_ICCV_2021_paper.pdf", @@ -27769,7 +29649,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Multiscale_Vision_Transformers_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Multiscale_Vision_Transformers_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Haoqi and Xiong,\n Bo and Mangalam,\n Karttikeya and Li,\n Yanghao and Yan,\n Zhicheng and Malik,\n Jitendra and Feichtenhofer,\n Christoph\n},\n title = {\n Multiscale Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6824-6835\n} \n}" }, { "title": "Multispectral Illumination Estimation Using Deep Unrolling Network", @@ -27777,6 +29658,7 @@ "status": "Poster", "track": "main", "pid": 8625, + "author_site": "Yuqi Li; Qiang Fu; Wolfgang Heidrich", "author": "Yuqi Li; Qiang Fu; Wolfgang Heidrich", "abstract": "This paper examines the problem of illumination spectra estimation in multispectral images. We cast the problem into a constrained matrix factorization problem and present a method for both single-global and multiple illumination estimation in which a deep unrolling network is constructed from the alternating direction method of multipliers(ADMM) optimization for solving the matrix factorization problem. To alleviate the lack of multispectral training data, we build a large multispectral reflectance image dataset for generating synthesized data and use them for training and evaluating our model. 
The results of simulations and real experiments demonstrate that the proposed method is able to outperform state-of-the-art spectral illumination estimation methods, and that it generalizes well to a wide variety of scenes and spectra.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Multispectral_Illumination_Estimation_Using_Deep_Unrolling_Network_ICCV_2021_paper.pdf", @@ -27800,7 +29682,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yuqi and Fu,\n Qiang and Heidrich,\n Wolfgang\n},\n title = {\n Multispectral Illumination Estimation Using Deep Unrolling Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2672-2681\n} \n}" }, { "title": "Multitask AET With Orthogonal Tangent Regularity for Dark Object Detection", @@ -27808,6 +29691,7 @@ "status": "Poster", "track": "main", "pid": 3124, + "author_site": "Ziteng Cui; Guo-Jun Qi; Lin Gu; Shaodi You; Zenghui Zhang; Tatsuya Harada", "author": "Ziteng Cui; Guo-Jun Qi; Lin Gu; Shaodi You; Zenghui Zhang; Tatsuya Harada", "abstract": "Dark environment becomes a challenge for computer vision algorithms owing to insufficient photons and undesirable noises. Most of the existing studies tackle this by either targeting human vision for better visual perception or improving the machine vision for specific high-level tasks. In addition, these methods rely on data argumentation and directly train their models based on real-world or over-simplified synthetic datasets without exploring the intrinsic pattern behind illumination translation. 
Here, we propose a novel multitask auto encoding transformation (MAET) model that combines human vision and machine vision tasks to enhance object detection in a dark environment. With a self-supervision learning, the MAET learns an intrinsic visual structure by encoding and decoding the realistic illumination-degrading transformation considering the physical noise model and image signal processing (ISP). Based on this representation, we achieve object detection task by decoding the bounding box coordinates and classes. To avoid the over-entanglement of two tasks, our MAET disentangles the object and degrading features by imposing an orthogonal tangent regularity. This forms a parametric manifold along which multitask predictions can be geometrically formulated by maximizing the orthogonality between the tangents along the outputs of respective tasks. Our framework can be implemented based on the mainstream object detection architecture and directly trained end-to-end using the normal target detection datasets, such as COCO and VOC. 
We have achieved the state-of-the-art performance using synthetic and real-world datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cui_Multitask_AET_With_Orthogonal_Tangent_Regularity_for_Dark_Object_Detection_ICCV_2021_paper.pdf", @@ -27824,14 +29708,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cui_Multitask_AET_With_Orthogonal_Tangent_Regularity_for_Dark_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;1;2+3;4;0;2+3", - "aff_unique_norm": "Shanghai Jiao Tong University;InnoPeak Technology;RIKEN;University of Tokyo;University of Amsterdam", + "aff_unique_norm": "Shanghai Jiao Tong University;Innopeak Technology;RIKEN;University of Tokyo;University of Amsterdam", "aff_unique_dep": ";;Advanced Institute for Computational Science;;", "aff_unique_url": "https://www.sjtu.edu.cn;;https://www.aip.riken.jp;https://www.u-tokyo.ac.jp;https://www.uva.nl", "aff_unique_abbr": "SJTU;;RIKEN AIP;UTokyo;UvA", "aff_campus_unique_index": "1;;", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;1;2+2;3;0;2+2", - "aff_country_unique": "China;United States;Japan;Netherlands" + "aff_country_unique": "China;United States;Japan;Netherlands", + "bibtex": "@InProceedings{Cui_2021_ICCV,\n \n author = {\n Cui,\n Ziteng and Qi,\n Guo-Jun and Gu,\n Lin and You,\n Shaodi and Zhang,\n Zenghui and Harada,\n Tatsuya\n},\n title = {\n Multitask AET With Orthogonal Tangent Regularity for Dark Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2553-2562\n} \n}" }, { "title": "Multiview Pseudo-Labeling for Semi-Supervised Learning From Video", @@ -27839,6 +29724,7 @@ "status": "Poster", "track": "main", "pid": 2125, + "author_site": "Bo Xiong; Haoqi Fan; Kristen Grauman; Christoph Feichtenhofer", "author": "Bo Xiong; Haoqi Fan; Kristen Grauman; Christoph Feichtenhofer", "abstract": 
"We present a multiview pseudo-labeling approach to video learning, a novel framework that uses complementary views in the form of appearance and motion information for semi-supervised learning in video. The complementary views help obtain more reliable \"pseudo-labels\"\" on unlabeled video, to learn stronger video representations than from purely supervised data. Though our method capitalizes on multiple views, it nonetheless trains a model that is shared across appearance and motion input and thus, by design, incurs no additional computation overhead at inference time. On multiple video recognition datasets, our method substantially outperforms its supervised counterpart, and compares favorably to previous work on standard benchmarks in self-supervised video representation learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiong_Multiview_Pseudo-Labeling_for_Semi-Supervised_Learning_From_Video_ICCV_2021_paper.pdf", @@ -27853,7 +29739,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xiong_Multiview_Pseudo-Labeling_for_Semi-Supervised_Learning_From_Video_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xiong_Multiview_Pseudo-Labeling_for_Semi-Supervised_Learning_From_Video_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xiong_2021_ICCV,\n \n author = {\n Xiong,\n Bo and Fan,\n Haoqi and Grauman,\n Kristen and Feichtenhofer,\n Christoph\n},\n title = {\n Multiview Pseudo-Labeling for Semi-Supervised Learning From Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7209-7219\n} \n}" }, { "title": "Mutual Affine Network for Spatially Variant Kernel Estimation in Blind Image Super-Resolution", @@ -27861,6 +29748,7 @@ "status": "Poster", "track": "main", "pid": 1158, + "author_site": "Jingyun Liang; Guolei Sun; Kai Zhang; 
Luc Van Gool; Radu Timofte", "author": "Jingyun Liang; Guolei Sun; Kai Zhang; Luc Van Gool; Radu Timofte", "abstract": "Existing blind image super-resolution (SR) methods mostly assume blur kernels are spatially invariant across the whole image. However, such an assumption is rarely applicable for real images whose blur kernels are usually spatially variant due to factors such as object motion and out-of-focus. Hence, existing blind SR methods would inevitably give rise to poor performance in real applications. To address this issue, this paper proposes a mutual affine network (MANet) for spatially variant kernel estimation. Specifically, MANet has two distinctive features. First, it has a moderate receptive field so as to keep the locality of degradation. Second, it involves a new mutual affine convolution (MAConv) layer that enhances feature expressiveness without increasing receptive field, model size and computation burden. This is made possible through exploiting channel interdependence, which applies each channel split with an affine transformation module whose input are the rest channel splits. 
Extensive experiments on synthetic and real images show that the proposed MANet not only performs favorably for both spatially variant and invariant kernel estimation, but also leads to state-of-the-art blind SR performance when combined with non-blind SR methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Mutual_Affine_Network_for_Spatially_Variant_Kernel_Estimation_in_Blind_ICCV_2021_paper.pdf", @@ -27884,7 +29772,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1;0", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Jingyun and Sun,\n Guolei and Zhang,\n Kai and Van Gool,\n Luc and Timofte,\n Radu\n},\n title = {\n Mutual Affine Network for Spatially Variant Kernel Estimation in Blind Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4096-4105\n} \n}" }, { "title": "Mutual Supervision for Dense Object Detection", @@ -27892,6 +29781,7 @@ "status": "Poster", "track": "main", "pid": 7800, + "author_site": "Ziteng Gao; Limin Wang; Gangshan Wu", "author": "Ziteng Gao; Limin Wang; Gangshan Wu", "abstract": "The classification and regression head are both indispensable components to build up a dense object detector, which are usually supervised by the same training samples and thus expected to have consistency with each other for detecting objects accurately in final detection pipelines. In this paper, we break the convention of the same training samples for these two heads in dense detectors and explore a novel supervisory paradigm, termed as Mutual Supervision (MuSu), to respectively and mutually assign training samples for the classification and regression head to ensure this consistency. 
MuSu defines training samples for the regression head mainly based on classification predicting scores and in turn, defines samples for the classification head based on localization scores from the regression head. Experimental results show that the convergence of detectors trained by this mutual supervision is guaranteed and the effectiveness of the proposed method is verified on the challenging MS COCO benchmark. We also find that tiling more anchors at the same location benefits detectors and leads to further improvements under this training scheme. We hope this work can inspire further researches on the interaction of the classification and regression task in detection and the supervision paradigm for detectors, especially separately for these two heads.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Mutual_Supervision_for_Dense_Object_Detection_ICCV_2021_paper.pdf", @@ -27915,7 +29805,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Ziteng and Wang,\n Limin and Wu,\n Gangshan\n},\n title = {\n Mutual Supervision for Dense Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3641-3650\n} \n}" }, { "title": "Mutual-Complementing Framework for Nuclei Detection and Segmentation in Pathology Image", @@ -27923,6 +29814,7 @@ "status": "Poster", "track": "main", "pid": 5992, + "author_site": "Zunlei Feng; Zhonghua Wang; Xinchao Wang; Yining Mao; Thomas Li; Jie Lei; Yuexuan Wang; Mingli Song", "author": "Zunlei Feng; Zhonghua Wang; Xinchao Wang; Yining Mao; Thomas Li; Jie Lei; Yuexuan Wang; Mingli Song", "abstract": "Detection and segmentation of nuclei are fundamental analysis operations in pathology images, the assessments derived from 
which serve as the gold standard for cancer diagnosis. Manual segmenting nuclei is expensive and time-consuming. What's more, accurate segmentation detection of nuclei can be challenging due to the large appearance variation, conjoined and overlapping nuclei, and serious degeneration of histological structures. Supervised methods highly rely on massive annotated samples. The existing two unsupervised methods are prone to failure on degenerated samples. This paper proposes a Mutual-Complementing Framework (MCF) for nuclei detection and segmentation in pathology images. Two branches of MCF are trained in the mutual-complementing manner, where the detection branch complements the pseudo mask of the segmentation branch, while the progressive trained segmentation branch complements the missing nucleus templates through calculating the mask residual between the predicted mask and detected result. In the detection branch, two response map fusion strategies and gradient direction based postprocessing are devised to obtain the optimal detection response. Furthermore, the confidence loss combined with the synthetic samples and self-finetuning is adopted to train the segmentation network with only high confidence areas. Extensive experiments demonstrate that MCF achieves comparable performance with only a few nucleus patches as supervision. 
Especially, MCF possesses good robustness (only dropping by about 6%) on degenerated samples, which are critical and common cases in clinical diagnosis.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_Mutual-Complementing_Framework_for_Nuclei_Detection_and_Segmentation_in_Pathology_Image_ICCV_2021_paper.pdf", @@ -27937,7 +29829,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Feng_Mutual-Complementing_Framework_for_Nuclei_Detection_and_Segmentation_in_Pathology_Image_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Feng_Mutual-Complementing_Framework_for_Nuclei_Detection_and_Segmentation_in_Pathology_Image_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Zunlei and Wang,\n Zhonghua and Wang,\n Xinchao and Mao,\n Yining and Li,\n Thomas and Lei,\n Jie and Wang,\n Yuexuan and Song,\n Mingli\n},\n title = {\n Mutual-Complementing Framework for Nuclei Detection and Segmentation in Pathology Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4036-4045\n} \n}" }, { "title": "N-ImageNet: Towards Robust, Fine-Grained Object Recognition With Event Cameras", @@ -27945,6 +29838,7 @@ "status": "Poster", "track": "main", "pid": 3861, + "author_site": "Junho Kim; Jaehyeok Bae; Gangin Park; Dongsu Zhang; Young Min Kim", "author": "Junho Kim; Jaehyeok Bae; Gangin Park; Dongsu Zhang; Young Min Kim", "abstract": "We introduce N-ImageNet, a large-scale dataset targeted for robust, fine-grained object recognition with event cameras. The dataset is collected using programmable hardware in which an event camera consistently moves around a monitor displaying images from ImageNet. 
N-ImageNet serves as a challenging benchmark for event-based object recognition, due to its large number of classes and samples. We empirically show that pretraining on N-ImageNet improves the performance of event-based classifiers and helps them learn with few labeled data. In addition, we present several variants of N-ImageNet to test the robustness of event-based classifiers under diverse camera trajectories and severe lighting conditions, and propose a novel event representation to alleviate the performance degradation. To the best of our knowledge, we are the first to quantitatively investigate the consequences caused by various environmental conditions on event-based object recognition algorithms. N-ImageNet and its variants are expected to guide practical implementations for deploying event-based object recognition algorithms in the real world.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_N-ImageNet_Towards_Robust_Fine-Grained_Object_Recognition_With_Event_Cameras_ICCV_2021_paper.pdf", @@ -27968,7 +29862,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Junho and Bae,\n Jaehyeok and Park,\n Gangin and Zhang,\n Dongsu and Kim,\n Young Min\n},\n title = {\n N-ImageNet: Towards Robust,\n Fine-Grained Object Recognition With Event Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2146-2156\n} \n}" }, { "title": "NAS-OoD: Neural Architecture Search for Out-of-Distribution Generalization", @@ -27976,10 +29871,11 @@ "status": "Poster", "track": "main", "pid": 5647, + "author_site": "Haoyue Bai; Fengwei Zhou; Lanqing Hong; Nanyang Ye; S.-H. 
Gary Chan; Zhenguo Li", "author": "Haoyue Bai; Fengwei Zhou; Lanqing Hong; Nanyang Ye; S.-H. Gary Chan; Zhenguo Li", "abstract": "Recent advances on Out-of-Distribution (OoD) generalization reveal the robustness of deep learning models against distribution shifts. However, existing works focus on OoD algorithms, such as invariant risk minimization, domain generalization, or stable learning, without considering the influence of deep model architectures on OoD generalization, which may lead to sub-optimal performance. Neural Architecture Search (NAS) methods search for architecture based on its performance on the training data, which may result in poor generalization for OoD tasks. In this work, we propose robust Neural Architecture Search for OoD generalization (NAS-OoD), which optimizes the architecture with respect to its performance on generated OoD data by gradient descent. Specifically, a data generator is learned to synthesize OoD data by maximizing losses computed by different neural architectures, while the goal for architecture search is to find the optimal architecture parameters that minimize the synthetic OoD data losses. The data generator and the neural architecture are jointly optimized in an end-to-end manner, and the minimax training process effectively discovers robust architectures that generalize well for different distribution shifts. Extensive experimental results show that NAS-OoD achieves superior performance on various OoD generalization benchmarks with deep models having a much fewer number of parameters. 
In addition, on a real industry dataset, the proposed NAS-OoD method reduces the error rate by more than 70% compared with the state-of-the-art method, demonstrating the proposed method's practicality for real applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bai_NAS-OoD_Neural_Architecture_Search_for_Out-of-Distribution_Generalization_ICCV_2021_paper.pdf", - "aff": "The Hong Kong University of Science and Technology; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Shanghai Jiao Tong University; The Hong Kong University of Science and Technology; Huawei Noah\u2019s Ark Lab", + "aff": "The Hong Kong University of Science and Technology; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Shanghai Jiao Tong University; The Hong Kong University of Science and Technology; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -27993,13 +29889,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bai_NAS-OoD_Neural_Architecture_Search_for_Out-of-Distribution_Generalization_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;2;0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei;Shanghai Jiao Tong University", - "aff_unique_dep": ";Noah\u2019s Ark Lab;", + "aff_unique_dep": ";Noah’s Ark Lab;", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "HKUST;Huawei;SJTU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Bai_2021_ICCV,\n \n author = {\n Bai,\n Haoyue and Zhou,\n Fengwei and Hong,\n Lanqing and Ye,\n Nanyang and Chan,\n S.-H. 
Gary and Li,\n Zhenguo\n},\n title = {\n NAS-OoD: Neural Architecture Search for Out-of-Distribution Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8320-8329\n} \n}" }, { "title": "NASOA: Towards Faster Task-Oriented Online Fine-Tuning With a Zoo of Models", @@ -28007,10 +29904,11 @@ "status": "Poster", "track": "main", "pid": 2129, + "author_site": "Hang Xu; Ning Kang; Gengwei Zhang; Chuanlong Xie; Xiaodan Liang; Zhenguo Li", "author": "Hang Xu; Ning Kang; Gengwei Zhang; Chuanlong Xie; Xiaodan Liang; Zhenguo Li", "abstract": "Fine-tuning from pre-trained ImageNet models has been a simple, effective, and popular approach for various computer vision tasks. The common practice of fine-tuning is to adopt a default hyperparameter setting with a fixed pre-trained model, while both of them are not optimized for specific tasks and time constraints. Moreover, in cloud computing or GPU clusters where the tasks arrive sequentially in a stream, faster online fine-tuning is a more desired and realistic strategy for saving money, energy consumption, and CO2 emission. In this paper, we propose a joint Neural Architecture Search and Online Adaption framework named NASOA towards a faster task-oriented fine-tuning upon the request of users. Specifically, NASOA first adopts an offline NAS to identify a group of training-efficient networks to form a pretrained model zoo. We propose a novel joint block and macro level search space to enable a flexible and efficient search. Then, by estimating fine-tuning performance via an adaptive model by accumulating experience from the past tasks, an online schedule generator is proposed to pick up the most suitable model and generate a personalized training regime with respect to each desired task in a one-shot fashion. The resulting model zoo is more training efficient than SOTA NAS models, e.g. 
6x faster than RegNetY-16GF, and 1.7x faster than EfficientNetB3. Experiments on multiple datasets also show that NASOA achieves much better fine-tuning results, i.e. improving around 2.1% accuracy than the best performance in RegNet series under various time constraints and tasks; 40x faster compared to the BOHB method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_NASOA_Towards_Faster_Task-Oriented_Online_Fine-Tuning_With_a_Zoo_of_ICCV_2021_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Sun Yat-sen University; Huawei Noah\u2019s Ark Lab; Sun Yat-sen University; Huawei Noah\u2019s Ark Lab", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Sun Yat-sen University; Huawei Noah’s Ark Lab; Sun Yat-sen University; Huawei Noah’s Ark Lab", "project": "", "github": "https://github.com/NAS-OA/NASOA", "supp": "", @@ -28024,13 +29922,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_NASOA_Towards_Faster_Task-Oriented_Online_Fine-Tuning_With_a_Zoo_of_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;1;0", "aff_unique_norm": "Huawei;Sun Yat-sen University", - "aff_unique_dep": "Noah\u2019s Ark Lab;", + "aff_unique_dep": "Noah’s Ark Lab;", "aff_unique_url": "https://www.huawei.com;http://www.sysu.edu.cn/", "aff_unique_abbr": "Huawei;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Hang and Kang,\n Ning and Zhang,\n Gengwei and Xie,\n Chuanlong and Liang,\n Xiaodan and Li,\n Zhenguo\n},\n title = {\n NASOA: Towards Faster Task-Oriented Online Fine-Tuning With a Zoo of Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5097-5106\n} \n}" }, { "title": "NEAT: Neural Attention Fields for End-to-End 
Autonomous Driving", @@ -28038,10 +29937,11 @@ "status": "Poster", "track": "main", "pid": 3370, + "author_site": "Kashyap Chitta; Aditya Prakash; Andreas Geiger", "author": "Kashyap Chitta; Aditya Prakash; Andreas Geiger", "abstract": "Efficient reasoning about the semantic, spatial, and temporal structure of a scene is a crucial prerequisite for autonomous driving. We present NEural ATtention fields (NEAT), a novel representation that enables such reasoning for end-to-end imitation learning models. NEAT is a continuous function which maps locations in Bird's Eye View (BEV) scene coordinates to waypoints and semantics, using intermediate attention maps to iteratively compress high-dimensional 2D image features into a compact representation. This allows our model to selectively attend to relevant regions in the input while ignoring information irrelevant to the driving task, effectively associating the images with the BEV representation. In a new evaluation setting involving adverse environmental conditions and challenging scenarios, NEAT outperforms several strong baselines and achieves driving scores on par with the privileged CARLA expert used to generate its training data. 
Furthermore, visualizing the attention maps for models with NEAT intermediate representations provides improved interpretability.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chitta_NEAT_Neural_Attention_Fields_for_End-to-End_Autonomous_Driving_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen+University of T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen+University of T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen+University of T\u00fcbingen", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen+University of Tübingen; Max Planck Institute for Intelligent Systems, Tübingen+University of Tübingen; Max Planck Institute for Intelligent Systems, Tübingen+University of Tübingen", "project": "", "github": "", "supp": "", @@ -28054,14 +29954,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chitta_NEAT_Neural_Attention_Fields_for_End-to-End_Autonomous_Driving_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of T\u00fcbingen", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Tübingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "MPI-IS;Uni T\u00fcbingen", + "aff_unique_abbr": "MPI-IS;Uni Tübingen", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Chitta_2021_ICCV,\n \n author = {\n Chitta,\n Kashyap and Prakash,\n Aditya and Geiger,\n Andreas\n},\n title = {\n NEAT: Neural Attention Fields for End-to-End Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15793-15803\n} \n}" }, { "title": "NGC: A Unified Framework for Learning With Open-World Noisy Data", @@ -28069,6 +29970,7 @@ "status": "Poster", "track": "main", "pid": 2395, + "author_site": "Zhi-Fan Wu; Tong Wei; Jianwen Jiang; Chaojie Mao; Mingqian Tang; Yu-Feng Li", "author": "Zhi-Fan Wu; Tong Wei; Jianwen Jiang; Chaojie Mao; Mingqian Tang; Yu-Feng Li", "abstract": "The existence of noisy data is prevalent in both the training and testing phases of machine learning systems, which inevitably leads to the degradation of model performance. There have been plenty of works concentrated on learning with in-distribution (IND) noisy labels in the last decade, i.e., some training samples are assigned incorrect labels that do not correspond to their true classes. Nonetheless, in real application scenarios, it is necessary to consider the influence of out-of-distribution (OOD) samples, i.e., samples that do not belong to any known classes, which has not been sufficiently explored yet. To remedy this, we study a new problem setup, namely Learning with Open-world Noisy Data (LOND). The goal of LOND is to simultaneously learn a classifier and an OOD detector from datasets with mixed IND and OOD noise. In this paper, we propose a new graph-based framework, namely Noisy Graph Cleaning (NGC), which collects clean samples by leveraging geometric structure of data and model predictive confidence. Without any additional training effort, NGC can detect and reject the OOD samples based on the learned class prototypes directly in testing phase. 
We conduct experiments on multiple benchmarks with different types of noise and the results demonstrate the superior performance of our method against state of the arts.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_NGC_A_Unified_Framework_for_Learning_With_Open-World_Noisy_Data_ICCV_2021_paper.pdf", @@ -28092,7 +29994,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0+0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Zhi-Fan and Wei,\n Tong and Jiang,\n Jianwen and Mao,\n Chaojie and Tang,\n Mingqian and Li,\n Yu-Feng\n},\n title = {\n NGC: A Unified Framework for Learning With Open-World Noisy Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 62-71\n} \n}" }, { "title": "NPMs: Neural Parametric Models for 3D Deformable Shapes", @@ -28100,10 +30003,11 @@ "status": "Poster", "track": "main", "pid": 4333, - "author": "Pablo Palafox; Alja\u017e Bo\u017ei\u010d; Justus Thies; Matthias Nie\u00dfner; Angela Dai", + "author_site": "Pablo Palafox; Aljaž Božič; Justus Thies; Matthias Nießner; Angela Dai", + "author": "Pablo Palafox; Aljaž Božič; Justus Thies; Matthias Nießner; Angela Dai", "abstract": "Parametric 3D models have enabled a wide variety of tasks in computer graphics and vision, such as modeling human bodies, faces, and hands. However, the construction of these parametric models is often tedious, as it requires heavy manual tweaking, and they struggle to represent additional complexity and details such as wrinkles or clothing. To this end, we propose Neural Parametric Models (NPMs), a novel, learned alternative to traditional, parametric 3D models, which does not require hand-crafted, object-specific constraints. 
In particular, we learn to disentangle 4D dynamics into latent-space representations of shape and pose, leveraging the flexibility of recent developments in learned implicit functions. Crucially, once learned, our neural parametric models of shape and pose enable optimization over the learned spaces to fit to new observations, similar to the fitting of a traditional parametric model, e.g., SMPL. This enables NPMs to achieve a significantly more accurate and detailed representation of observed deformable sequences. We show that NPMs improve notably over both parametric and non-parametric state of the art in reconstruction and tracking of monocular depth sequences of clothed humans and hands. Latent-space interpolation as well as shape / pose transfer experiments further demonstrate the usefulness of NPMs. Code is publicly available at https://pablopalafox.github.io/npms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Palafox_NPMs_Neural_Parametric_Models_for_3D_Deformable_Shapes_ICCV_2021_paper.pdf", - "aff": "Technical University of Munich; Technical University of Munich; Technical University of Munich + Max Planck Institute for Intelligent Systems, T \u00a8ubingen; Technical University of Munich; Technical University of Munich", + "aff": "Technical University of Munich; Technical University of Munich; Technical University of Munich + Max Planck Institute for Intelligent Systems, T ¨ubingen; Technical University of Munich; Technical University of Munich", "project": "https://pablopalafox.github.io/npms", "github": "https://github.com/pablopalafox/npms", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Palafox_NPMs_Neural_Parametric_ICCV_2021_supplemental.pdf", @@ -28121,9 +30025,10 @@ "aff_unique_url": "https://www.tum.de;https://www.mpi-is.mpg.de", "aff_unique_abbr": "TUM;MPI-IS", "aff_campus_unique_index": "1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": 
"0;0;0+0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Palafox_2021_ICCV,\n \n author = {\n Palafox,\n Pablo and Bo\\v{z\n}i\\v{c\n},\n Alja\\v{z\n} and Thies,\n Justus and Nie{\\ss\n}ner,\n Matthias and Dai,\n Angela\n},\n title = {\n NPMs: Neural Parametric Models for 3D Deformable Shapes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12695-12705\n} \n}" }, { "title": "Naturalistic Physical Adversarial Patch for Object Detectors", @@ -28131,6 +30036,7 @@ "status": "Poster", "track": "main", "pid": 8298, + "author_site": "Yu-Chih-Tuan Hu; Bo-Han Kung; Daniel Stanley Tan; Jun-Cheng Chen; Kai-Lung Hua; Wen-Huang Cheng", "author": "Yu-Chih-Tuan Hu; Bo-Han Kung; Daniel Stanley Tan; Jun-Cheng Chen; Kai-Lung Hua; Wen-Huang Cheng", "abstract": "Most prior works on physical adversarial attacks mainly focus on the attack performance but seldom enforce any restrictions over the appearance of the generated adversarial patches. This leads to conspicuous and attention-grabbing patterns for the generated patches which can be easily identified by humans. To address this issue, we propose a method to craft physical adversarial patches for object detectors by leveraging the learned image manifold of a pretrained generative adversarial network (GAN) (e.g., BigGAN and StyleGAN) upon real-world images. Through sampling the optimal image from the GAN, our method can generate natural looking adversarial patches while maintaining high attack performance. 
With extensive experiments on both digital and physical domains and several independent subjective surveys, the results show that our proposed method produces significantly more realistic and natural looking patches than several state-of-the-art baselines while achieving competitive attack performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_Naturalistic_Physical_Adversarial_Patch_for_Object_Detectors_ICCV_2021_paper.pdf", @@ -28145,7 +30051,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_Naturalistic_Physical_Adversarial_Patch_for_Object_Detectors_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_Naturalistic_Physical_Adversarial_Patch_for_Object_Detectors_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Yu-Chih-Tuan and Kung,\n Bo-Han and Tan,\n Daniel Stanley and Chen,\n Jun-Cheng and Hua,\n Kai-Lung and Cheng,\n Wen-Huang\n},\n title = {\n Naturalistic Physical Adversarial Patch for Object Detectors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7848-7857\n} \n}" }, { "title": "NeRD: Neural Reflectance Decomposition From Image Collections", @@ -28153,6 +30060,7 @@ "status": "Poster", "track": "main", "pid": 2350, + "author_site": "Mark Boss; Raphael Braun; Varun Jampani; Jonathan T. Barron; Ce Liu; Hendrik P.A. Lensch", "author": "Mark Boss; Raphael Braun; Varun Jampani; Jonathan T. Barron; Ce Liu; Hendrik P.A. Lensch", "abstract": "Decomposing a scene into its shape, reflectance, and illumination is a challenging but important problem in computer vision and graphics. This problem is inherently more challenging when the illumination is not a single light source under laboratory conditions but is instead an unconstrained environmental illumination. 
Though recent work has shown that implicit representations can be used to model the radiance field of an object, most of these techniques only enable view synthesis and not relighting. Additionally, evaluating these radiance fields is resource and time-intensive. We propose a neural reflectance decomposition (NeRD) technique that uses physically-based rendering to decompose the scene into spatially varying BRDF material properties. In contrast to existing techniques, our input images can be captured under different illumination conditions. In addition, we also propose techniques to convert the learned reflectance volume into a relightable textured mesh enabling fast real-time rendering with novel illuminations. We demonstrate the potential of the proposed approach with experiments on both synthetic and real datasets, where we are able to obtain high-quality relightable 3D assets from image collections. The datasets and code are available at the project page: https://markboss.me/publication/2021-nerd/", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Boss_NeRD_Neural_Reflectance_Decomposition_From_Image_Collections_ICCV_2021_paper.pdf", @@ -28167,7 +30075,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Boss_NeRD_Neural_Reflectance_Decomposition_From_Image_Collections_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Boss_NeRD_Neural_Reflectance_Decomposition_From_Image_Collections_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Boss_2021_ICCV,\n \n author = {\n Boss,\n Mark and Braun,\n Raphael and Jampani,\n Varun and Barron,\n Jonathan T. 
and Liu,\n Ce and Lensch,\n Hendrik P.A.\n},\n title = {\n NeRD: Neural Reflectance Decomposition From Image Collections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12684-12694\n} \n}" }, { "title": "Nerfies: Deformable Neural Radiance Fields", @@ -28175,6 +30084,7 @@ "status": "Poster", "track": "main", "pid": 5882, + "author_site": "Keunhong Park; Utkarsh Sinha; Jonathan T. Barron; Sofien Bouaziz; Dan B Goldman; Steven M. Seitz; Ricardo Martin-Brualla", "author": "Keunhong Park; Utkarsh Sinha; Jonathan T. Barron; Sofien Bouaziz; Dan B Goldman; Steven M. Seitz; Ricardo Martin-Brualla", "abstract": "We present the first method capable of photorealistically reconstructing deformable scenes using photos/videos captured casually from mobile phones. Our approach augments neural radiance fields (NeRF) by optimizing an additional continuous volumetric deformation field that warps each observed point into a canonical 5D NeRF. We observe that these NeRF-like deformation fields are prone to local minima, and propose a coarse-to-fine optimization method for coordinate-based models that allows for more robust optimization. By adapting principles from geometry processing and physical simulation to NeRF-like models, we propose an elastic regularization of the deformation field that further improves robustness. We show that our method can turn casually captured selfie photos/videos into deformable NeRF models that allow for photorealistic renderings of the subject from arbitrary viewpoints, which we dub \"nerfies.\" We evaluate our method by collecting time-synchronized data using a rig with two mobile phones, yielding train/validation images of the same pose at different viewpoints. 
We show that our method faithfully reconstructs non-rigidly deforming scenes and reproduces unseen views with high fidelity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Nerfies_Deformable_Neural_Radiance_Fields_ICCV_2021_paper.pdf", @@ -28198,7 +30108,8 @@ "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Keunhong and Sinha,\n Utkarsh and Barron,\n Jonathan T. and Bouaziz,\n Sofien and Goldman,\n Dan B and Seitz,\n Steven M. and Martin-Brualla,\n Ricardo\n},\n title = {\n Nerfies: Deformable Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5865-5874\n} \n}" }, { "title": "NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-View Stereo", @@ -28206,6 +30117,7 @@ "status": "Poster", "track": "main", "pid": 4076, + "author_site": "Yi Wei; Shaohui Liu; Yongming Rao; Wang Zhao; Jiwen Lu; Jie Zhou", "author": "Yi Wei; Shaohui Liu; Yongming Rao; Wang Zhao; Jiwen Lu; Jie Zhou", "abstract": "In this work, we present a new multi-view depth estimation method that utilizes both conventional SfM reconstruction and learning-based priors over the recently proposed neural radiance fields (NeRF). Unlike existing neural network based optimization method that relies on estimated correspondences, our method directly optimizes over implicit volumes, eliminating the challenging step of matching pixels in indoor scenes. The key to our approach is to utilize the learning-based priors to guide the optimization process of NeRF. Our system firstly adapts a monocular depth network over the target scene by finetuning on its sparse SfM reconstruction. 
Then, we show that the shape-radiance ambiguity of NeRF still exists in indoor environments and propose to address the issue by employing the adapted depth priors to monitor the sampling process of volume rendering. Finally, a per-pixel confidence map acquired by error computation on the rendered image can be used to further improve the depth quality. Experiments show that our proposed framework significantly outperforms state-of-the-art methods on indoor scenes, with surprising findings presented on the effectiveness of correspondence-based optimization and NeRF-based optimization over the adapted depth priors. In addition, we show that the guided optimization scheme does not sacrifice the original synthesis capability of neural radiance fields, improving the rendering quality on both seen and novel views. Code is available at https://github.com/weiyithu/NerfingMVS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wei_NerfingMVS_Guided_Optimization_of_Neural_Radiance_Fields_for_Indoor_Multi-View_ICCV_2021_paper.pdf", @@ -28229,7 +30141,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0+0;0;0+0;0+0", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Wei_2021_ICCV,\n \n author = {\n Wei,\n Yi and Liu,\n Shaohui and Rao,\n Yongming and Zhao,\n Wang and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5610-5619\n} \n}" }, { "title": "NeuSpike-Net: High Speed Video Reconstruction via Bio-Inspired Neuromorphic Cameras", @@ -28237,6 +30150,7 @@ "status": "Poster", "track": "main", "pid": 1634, + "author_site": "Lin Zhu; Jianing Li; Xiao Wang; Tiejun Huang; Yonghong Tian", "author": "Lin Zhu; 
Jianing Li; Xiao Wang; Tiejun Huang; Yonghong Tian", "abstract": "Neuromorphic vision sensor is a new bio-inspired imaging paradigm that emerged in recent years, which continuously sensing luminance intensity and firing asynchronous spikes (events) with high temporal resolution. Typically, there are two types of neuromorphic vision sensors, namely dynamic vision sensor (DVS) and spike camera. From the perspective of bio-inspired sampling, DVS only perceives movement by imitating the retinal periphery, while the spike camera was developed to perceive fine textures by simulating the fovea. It is meaningful to explore how to combine two types of neuromorphic cameras to reconstruct high quality image like human vision. In this paper, we propose a NeuSpike-Net to learn both the high dynamic range and high motion sensitivity of DVS and the full texture sampling of spike camera to achieve high-speed and high dynamic image reconstruction. We propose a novel representation to effectively extract the temporal information of spike and event data. By introducing the feature fusion module, the two types of neuromorphic data achieve complementary to each other. 
The experimental results on the simulated and real datasets demonstrate that the proposed approach is effective to reconstruct high-speed and high dynamic range images via the combination of spike and event data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_NeuSpike-Net_High_Speed_Video_Reconstruction_via_Bio-Inspired_Neuromorphic_Cameras_ICCV_2021_paper.pdf", @@ -28253,14 +30167,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_NeuSpike-Net_High_Speed_Video_Reconstruction_via_Bio-Inspired_Neuromorphic_Cameras_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;1;0;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory", - "aff_unique_dep": "Department of Computer Science and Technology;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory", + "aff_unique_dep": "Department of Computer Science and Technology;", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "Peking U;", "aff_campus_unique_index": "0+1;0+1;1;0;0+1", "aff_campus_unique": "Beijing;Shenzhen", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Lin and Li,\n Jianing and Wang,\n Xiao and Huang,\n Tiejun and Tian,\n Yonghong\n},\n title = {\n NeuSpike-Net: High Speed Video Reconstruction via Bio-Inspired Neuromorphic Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2400-2409\n} \n}" }, { "title": "Neural Architecture Search for Joint Human Parsing and Pose Estimation", @@ -28268,6 +30183,7 @@ "status": "Poster", "track": "main", "pid": 2507, + "author_site": "Dan Zeng; Yuhang Huang; Qian Bao; Junjie Zhang; Chi Su; Wu Liu", "author": "Dan Zeng; Yuhang Huang; Qian Bao; Junjie Zhang; Chi Su; Wu Liu", "abstract": "Human parsing and pose 
estimation are crucial for the understanding of human behaviors. Since these tasks are closely related, employing one unified model to perform two tasks simultaneously allows them to benefit from each other. However, since human parsing is a pixel-wise classification process while pose estimation is usually a regression task, it is non-trivial to extract discriminative features for both tasks while modeling their correlation in the joint learning fashion. Recent studies have shown that Neural Architecture Search (NAS) has the ability to allocate efficient feature connections for specific tasks automatically. With the spirit of NAS, we propose to search for an efficient network architecture (NPPNet) to tackle two tasks at the same time. On the one hand, to extract task-specific features for the two tasks and lay the foundation for the further searching of feature interaction, we propose to search their encoder-decoder architectures, respectively. On the other hand, to ensure two tasks fully communicate with each other, we propose to embed NAS units in both multi-scale feature interaction and high-level feature fusion to establish optimal connections between two tasks. 
Experimental results on both parsing and pose estimation benchmark datasets have demonstrated that the searched model achieves state-of-the-art performances on both tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zeng_Neural_Architecture_Search_for_Joint_Human_Parsing_and_Pose_Estimation_ICCV_2021_paper.pdf", @@ -28284,14 +30200,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zeng_Neural_Architecture_Search_for_Joint_Human_Parsing_and_Pose_Estimation_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;1;0;2;1", - "aff_unique_norm": "Shanghai University;JD.com;Kingsoft Cloud", - "aff_unique_dep": ";AI Research;", + "aff_unique_norm": "Shanghai University;JD.com;Kingsoft Corporation", + "aff_unique_dep": ";AI Research;Cloud Services", "aff_unique_url": "https://www.shu.edu.cn;https://www.jd.com;https://www.ksyun.com", - "aff_unique_abbr": "SHU;JD;KSC", + "aff_unique_abbr": "SHU;JD;Kingsoft Cloud", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zeng_2021_ICCV,\n \n author = {\n Zeng,\n Dan and Huang,\n Yuhang and Bao,\n Qian and Zhang,\n Junjie and Su,\n Chi and Liu,\n Wu\n},\n title = {\n Neural Architecture Search for Joint Human Parsing and Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11385-11394\n} \n}" }, { "title": "Neural Articulated Radiance Field", @@ -28299,6 +30216,7 @@ "status": "Poster", "track": "main", "pid": 11311, + "author_site": "Atsuhiro Noguchi; Xiao Sun; Stephen Lin; Tatsuya Harada", "author": "Atsuhiro Noguchi; Xiao Sun; Stephen Lin; Tatsuya Harada", "abstract": "We present Neural Articulated Radiance Field (NARF), a novel deformable 3D representation for articulated objects learned from images. 
While recent advances in 3D implicit representation have made it possible to learn models of complex objects, learning pose-controllable representations of articulated objects remains a challenge, as current methods require 3D shape supervision and are unable to render appearance. In formulating an implicit representation of 3D articulated objects, our method considers only the rigid transformation of the most relevant object part in solving for the radiance field at each 3D location. In this way, the proposed method represents pose-dependent changes without significantly increasing the computational complexity. NARF is fully differentiable and can be trained from images with pose annotations. Moreover, through the use of an autoencoder, it can learn appearance variations over multiple instances of an object class. Experiments show that the proposed method is efficient and can generalize well to novel poses. The code is available for research purposes at https://github.com/nogu-atsu/NARF", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Noguchi_Neural_Articulated_Radiance_Field_ICCV_2021_paper.pdf", @@ -28313,7 +30231,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Noguchi_Neural_Articulated_Radiance_Field_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Noguchi_Neural_Articulated_Radiance_Field_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Noguchi_2021_ICCV,\n \n author = {\n Noguchi,\n Atsuhiro and Sun,\n Xiao and Lin,\n Stephen and Harada,\n Tatsuya\n},\n title = {\n Neural Articulated Radiance Field\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5762-5772\n} \n}" }, { "title": "Neural Image Compression via Attentional Multi-Scale Back Projection and Frequency Decomposition", @@ -28321,10 +30240,11 @@ "status": "Poster", 
"track": "main", "pid": 9529, + "author_site": "Ge Gao; Pei You; Rong Pan; Shunyuan Han; Yuanyuan Zhang; Yuchao Dai; Hojae Lee", "author": "Ge Gao; Pei You; Rong Pan; Shunyuan Han; Yuanyuan Zhang; Yuchao Dai; Hojae Lee", "abstract": "In recent years, neural image compression emerges as a rapidly developing topic in computer vision, where the state-of-the-art approaches now exhibit superior compression performance than their conventional counterparts. Despite the great progress, current methods still have limitations in preserving fine spatial details for optimal reconstruction, especially at low compression rates. We make three contributions in tackling this issue. First, we develop a novel back projection method with attentional and multi-scale feature fusion for augmented representation power. Our back projection method recalibrates the current estimation by establishing feedback connections between high-level and low-level attributes in an attentional and discriminative manner. Second, we propose to decompose the input image and separately process the distinct frequency components, whose derived latents are recombined using a novel dual attention module, so that details inside regions of interest could be explicitly manipulated. Third, we propose a novel training scheme for reducing the latent rounding residual. Experimental results show that, when measured in PSNR, our model reduces BD-rate by 9.88% and 10.32% over the state-of-the-art method, and 4.12% and 4.32% over the latest coding standard Versatile Video Coding (VVC) on the Kodak and CLIC2020 Professional Validation dataset, respectively. Our approach also produces more visually pleasant images when optimized for MS-SSIM. 
The significant improvement upon existing methods shows the effectiveness of our method in preserving and remedying spatial information for enhanced compression quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Neural_Image_Compression_via_Attentional_Multi-Scale_Back_Projection_and_Frequency_ICCV_2021_paper.pdf", - "aff": "Samsung R&D Institute China Xi\u2019an, China+Northwestern Polytechnical University, Xi\u2019an, China; Samsung R&D Institute China Xi\u2019an, China+Northwestern Polytechnical University, Xi\u2019an, China; Samsung R&D Institute China Xi\u2019an, China+Northwestern Polytechnical University, Xi\u2019an, China; Samsung R&D Institute China Xi\u2019an, China+Northwestern Polytechnical University, Xi\u2019an, China; Samsung R&D Institute China Xi\u2019an, China+Northwestern Polytechnical University, Xi\u2019an, China; Northwestern Polytechnical University, Xi\u2019an, China; Samsung R&D Institute China Xi\u2019an, China+Northwestern Polytechnical University, Xi\u2019an, China", + "aff": "Samsung R&D Institute China Xi’an, China+Northwestern Polytechnical University, Xi’an, China; Samsung R&D Institute China Xi’an, China+Northwestern Polytechnical University, Xi’an, China; Samsung R&D Institute China Xi’an, China+Northwestern Polytechnical University, Xi’an, China; Samsung R&D Institute China Xi’an, China+Northwestern Polytechnical University, Xi’an, China; Samsung R&D Institute China Xi’an, China+Northwestern Polytechnical University, Xi’an, China; Northwestern Polytechnical University, Xi’an, China; Samsung R&D Institute China Xi’an, China+Northwestern Polytechnical University, Xi’an, China", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Gao_Neural_Image_Compression_ICCV_2021_supplemental.pdf", @@ -28337,14 +30257,15 @@ "author_num": 7, "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Gao_Neural_Image_Compression_via_Attentional_Multi-Scale_Back_Projection_and_Frequency_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1;0+1;1;0+1", - "aff_unique_norm": "Samsung;Northwestern Polytechnical University", - "aff_unique_dep": "Samsung R&D Institute China;", + "aff_unique_norm": "Samsung R&D Institute China;Northwestern Polytechnical University", + "aff_unique_dep": ";", "aff_unique_url": "https://www.samsung.com/cn;http://www.nwpu.edu.cn", - "aff_unique_abbr": "SRC;NPU", + "aff_unique_abbr": "SRC;NWPU", "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0+0;0;0+0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Ge and You,\n Pei and Pan,\n Rong and Han,\n Shunyuan and Zhang,\n Yuanyuan and Dai,\n Yuchao and Lee,\n Hojae\n},\n title = {\n Neural Image Compression via Attentional Multi-Scale Back Projection and Frequency Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14677-14686\n} \n}" }, { "title": "Neural Photofit: Gaze-Based Mental Image Reconstruction", @@ -28352,7 +30273,8 @@ "status": "Poster", "track": "main", "pid": 9418, - "author": "Florian Strohm; Ekta Sood; Sven Mayer; Philipp M\u00fcller; Mihai B\u00e2ce; Andreas Bulling", + "author_site": "Florian Strohm; Ekta Sood; Sven Mayer; Philipp Müller; Mihai Bâce; Andreas Bulling", + "author": "Florian Strohm; Ekta Sood; Sven Mayer; Philipp Müller; Mihai Bâce; Andreas Bulling", "abstract": "We propose a novel method that leverages human fixations to visually decode the image a person has in mind into a photofit (facial composite). Our method combines three neural networks: An encoder, a scoring network, and a decoder. 
The encoder extracts image features and predicts a neural activation map for each face looked at by a human observer. A neural scoring network compares the human and neural attention and predicts a relevance score for each extracted image feature. Finally, image features are aggregated into a single feature vector as a linear combination of all features weighted by relevance which a decoder decodes into the final photofit. We train the neural scoring network on a novel dataset containing gaze data of 19 participants looking at collages of synthetic faces. We show that our method significantly outperforms a mean baseline predictor and report on a human study that shows that we can decode photofits that are visually plausible and close to the observer's mental image.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Strohm_Neural_Photofit_Gaze-Based_Mental_Image_Reconstruction_ICCV_2021_paper.pdf", "aff": "University of Stuttgart; University of Stuttgart; LMU Munich; German Research Center for Artificial Intelligence (DFKI); University of Stuttgart; University of Stuttgart", @@ -28375,7 +30297,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Munich", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Strohm_2021_ICCV,\n \n author = {\n Strohm,\n Florian and Sood,\n Ekta and Mayer,\n Sven and M\\"uller,\n Philipp and B\\^ace,\n Mihai and Bulling,\n Andreas\n},\n title = {\n Neural Photofit: Gaze-Based Mental Image Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 245-254\n} \n}" }, { "title": "Neural Radiance Flow for 4D View Synthesis and Video Processing", @@ -28383,6 +30306,7 @@ "status": "Poster", "track": "main", "pid": 7261, + "author_site": "Yilun Du; Yinan Zhang; Hong-Xing Yu; Joshua B. 
Tenenbaum; Jiajun Wu", "author": "Yilun Du; Yinan Zhang; Hong-Xing Yu; Joshua B. Tenenbaum; Jiajun Wu", "abstract": "We present a method, Neural Radiance Flow (NeRFlow), to learn a 4D spatial-temporal representation of a dynamic scene from a set of RGB images. Key to our approach is the use of a neural implicit representation that learns to capture the 3D occupancy, radiance, and dynamics of the scene. By enforcing consistency across different modalities, our representation enables multi-view rendering in diverse dynamic scenes, including water pouring, robotic interaction, and real images, outperforming state-of-the-art methods for spatial-temporal view synthesis. Our approach works even when being provided only a single monocular real video. We further demonstrate that the learned representation can serve as an implicit scene prior, enabling video processing tasks such as image super-resolution and de-noising without any additional supervision.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Du_Neural_Radiance_Flow_for_4D_View_Synthesis_and_Video_Processing_ICCV_2021_paper.pdf", @@ -28406,7 +30330,8 @@ "aff_campus_unique_index": "0;1;1;0;1", "aff_campus_unique": "Cambridge;Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Du_2021_ICCV,\n \n author = {\n Du,\n Yilun and Zhang,\n Yinan and Yu,\n Hong-Xing and Tenenbaum,\n Joshua B. 
and Wu,\n Jiajun\n},\n title = {\n Neural Radiance Flow for 4D View Synthesis and Video Processing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14324-14334\n} \n}" }, { "title": "Neural Strokes: Stylized Line Drawing of 3D Shapes", @@ -28414,6 +30339,7 @@ "status": "Poster", "track": "main", "pid": 1861, + "author_site": "Difan Liu; Matthew Fisher; Aaron Hertzmann; Evangelos Kalogerakis", "author": "Difan Liu; Matthew Fisher; Aaron Hertzmann; Evangelos Kalogerakis", "abstract": "This paper introduces a model for producing stylized line drawings from 3D shapes. The model takes a 3D shape and a viewpoint as input, and outputs a drawing with textured strokes, with variations in stroke thickness, deformation, and color learned from an artist's style. The model is fully differentiable. We train its parameters from a single training drawing of another 3D shape. We show that, in contrast to previous image-based methods, the use of a geometric representation of 3D shape and 2D strokes allows the model to transfer important aspects of shape and texture style while preserving contours. 
Our method outputs the resulting drawing in a vector representation, enabling richer downstream analysis or editing in interactive applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Neural_Strokes_Stylized_Line_Drawing_of_3D_Shapes_ICCV_2021_paper.pdf", @@ -28428,7 +30354,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Neural_Strokes_Stylized_Line_Drawing_of_3D_Shapes_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Neural_Strokes_Stylized_Line_Drawing_of_3D_Shapes_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Difan and Fisher,\n Matthew and Hertzmann,\n Aaron and Kalogerakis,\n Evangelos\n},\n title = {\n Neural Strokes: Stylized Line Drawing of 3D Shapes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14204-14213\n} \n}" }, { "title": "Neural TMDlayer: Modeling Instantaneous Flow of Features via SDE Generators", @@ -28436,6 +30363,7 @@ "status": "Poster", "track": "main", "pid": 8010, + "author_site": "Zihang Meng; Vikas Singh; Sathya N. Ravi", "author": "Zihang Meng; Vikas Singh; Sathya N. Ravi", "abstract": "We study how stochastic differential equation (SDE) based ideas can inspire new modifications to existing algorithms for a set of problems in computer vision. Loosely speaking, our formulation is related to both explicit and implicit strategies for data augmentation and group equivariance, but is derived from new results in the SDE literature on estimating infinitesimal generators of a class of stochastic processes. 
If and when there is nominal agreement between the needs of an application/task and the inherent properties and behavior of the types of processes that we can efficiently handle, we obtain a very simple and efficient plug-in layer that can be incorporated within any existing network architecture, with minimal modification and only a few additional parameters. We show promising experiments on a number of vision tasks including few shot learning, point cloud transformers and deep variational segmentation obtaining efficiency or performance improvements.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meng_Neural_TMDlayer_Modeling_Instantaneous_Flow_of_Features_via_SDE_Generators_ICCV_2021_paper.pdf", @@ -28459,7 +30387,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Madison;Chicago", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Meng_2021_ICCV,\n \n author = {\n Meng,\n Zihang and Singh,\n Vikas and Ravi,\n Sathya N.\n},\n title = {\n Neural TMDlayer: Modeling Instantaneous Flow of Features via SDE Generators\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11635-11644\n} \n}" }, { "title": "Neural Video Portrait Relighting in Real-Time via Consistency Modeling", @@ -28467,6 +30396,7 @@ "status": "Poster", "track": "main", "pid": 2424, + "author_site": "Longwen Zhang; Qixuan Zhang; Minye Wu; Jingyi Yu; Lan Xu", "author": "Longwen Zhang; Qixuan Zhang; Minye Wu; Jingyi Yu; Lan Xu", "abstract": "Video portraits relighting is critical in user-facing human photography, especially for immersive VR/AR experience. Recent advances still fail to recover consistent relit result under dynamic illuminations from monocular RGB stream, suffering from the lack of video consistency supervision. 
In this paper, we propose a neural approach for real-time, high-quality and coherent video portrait relighting, which jointly models the semantic, temporal and lighting consistency using a new dynamic OLAT dataset. We propose a hybrid structure and lighting disentanglement in an encoder-decoder architecture, which combines a multi-task and adversarial training strategy for semantic-aware consistency modeling. We adopt a temporal modeling scheme via flow-based supervision to encode the conjugated temporal consistency in a cross manner. We also propose a lighting sampling strategy to model the illumination consistency and mutation for natural portrait light manipulation in real-world. Extensive experiments demonstrate the effectiveness of our approach for consistent video portrait light-editing and relighting, even using mobile computing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Neural_Video_Portrait_Relighting_in_Real-Time_via_Consistency_Modeling_ICCV_2021_paper.pdf", @@ -28481,7 +30411,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Neural_Video_Portrait_Relighting_in_Real-Time_via_Consistency_Modeling_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Neural_Video_Portrait_Relighting_in_Real-Time_via_Consistency_Modeling_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Longwen and Zhang,\n Qixuan and Wu,\n Minye and Yu,\n Jingyi and Xu,\n Lan\n},\n title = {\n Neural Video Portrait Relighting in Real-Time via Consistency Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 802-812\n} \n}" }, { "title": "Neural-GIF: Neural Generalized Implicit Functions for Animating People in Clothing", @@ -28489,10 +30420,11 @@ "status": "Poster", "track": "main", 
"pid": 6488, + "author_site": "Garvita Tiwari; Nikolaos Sarafianos; Tony Tung; Gerard Pons-Moll", "author": "Garvita Tiwari; Nikolaos Sarafianos; Tony Tung; Gerard Pons-Moll", "abstract": "We present Neural Generalized Implicit Functions(Neural-GIF), to animate people in clothing as a function of the body pose. Given a sequence of scans of a subject in various poses, we learn to animate the character for new poses. Existing methods have relied on template-based representations of the human body(or clothing). However such models usually have fixed and limited resolutions, and require difficult data pre-processing steps, and cannot be used for complex clothing. We draw inspiration from template-based methods, which factorize motion into articulation and non-rigid deformation, but generalize this concept for implicit shape learning to obtain a more flexible model. We learn to map every point in the space to a canonical space, where a learned deformation field is applied to model non-rigid effects, before evaluating the signed distance field. Our formulation allows the learning of complex and non-rigid deformations of clothing and soft tissue, without computing a template registration as it is common with current approaches. Neural-GIF can be trained on raw 3D scans and reconstructs detailed complex surface geometry and deformations. Moreover, the model can generalize to new poses. We evaluate our method on a variety of characters from different public datasets in diverse clothing styles and show significant improvement over baseline methods, quantitatively and qualitatively. We also extend our model to multiple shape setting. 
To stimulate further research, we will make the model, code, and data publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tiwari_Neural-GIF_Neural_Generalized_Implicit_Functions_for_Animating_People_in_Clothing_ICCV_2021_paper.pdf", - "aff": "University of T\u00fcbingen, Germany+Max Planck Institute for Informatics, Saarland Informatics Campus, Germany; Facebook Reality Labs, Sausalito, USA; Facebook Reality Labs, Sausalito, USA; University of T\u00fcbingen, Germany+Max Planck Institute for Informatics, Saarland Informatics Campus, Germany", + "aff": "University of Tübingen, Germany+Max Planck Institute for Informatics, Saarland Informatics Campus, Germany; Facebook Reality Labs, Sausalito, USA; Facebook Reality Labs, Sausalito, USA; University of Tübingen, Germany+Max Planck Institute for Informatics, Saarland Informatics Campus, Germany", "project": "", "github": "", "supp": "", @@ -28505,14 +30437,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tiwari_Neural-GIF_Neural_Generalized_Implicit_Functions_for_Animating_People_in_Clothing_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;2;0+1", - "aff_unique_norm": "University of T\u00fcbingen;Max Planck Institute for Informatics;Meta", - "aff_unique_dep": ";;Facebook Reality Labs", + "aff_unique_norm": "University of Tübingen;Max Planck Institute for Informatics;Facebook Reality Labs", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://mpi-inf.mpg.de;https://www.facebook.com/realitylabs", - "aff_unique_abbr": "Uni T\u00fcbingen;MPII;FRL", + "aff_unique_abbr": "Uni Tübingen;MPII;FRL", "aff_campus_unique_index": "1;2;2;1", "aff_campus_unique": ";Saarland;Sausalito", "aff_country_unique_index": "0+0;1;1;0+0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Tiwari_2021_ICCV,\n \n author = {\n Tiwari,\n Garvita and Sarafianos,\n Nikolaos and Tung,\n 
Tony and Pons-Moll,\n Gerard\n},\n title = {\n Neural-GIF: Neural Generalized Implicit Functions for Animating People in Clothing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11708-11718\n} \n}" }, { "title": "Non-Rigid Neural Radiance Fields: Reconstruction and Novel View Synthesis of a Dynamic Scene From Monocular Video", @@ -28520,7 +30453,8 @@ "status": "Poster", "track": "main", "pid": 8626, - "author": "Edgar Tretschk; Ayush Tewari; Vladislav Golyanik; Michael Zollh\u00f6fer; Christoph Lassner; Christian Theobalt", + "author_site": "Edgar Tretschk; Ayush Tewari; Vladislav Golyanik; Michael Zollhöfer; Christoph Lassner; Christian Theobalt", + "author": "Edgar Tretschk; Ayush Tewari; Vladislav Golyanik; Michael Zollhöfer; Christoph Lassner; Christian Theobalt", "abstract": "We present Non-Rigid Neural Radiance Fields (NR-NeRF), a reconstruction and novel view synthesis approach for general non-rigid dynamic scenes. Our approach takes RGB images of a dynamic scene as input (e.g., from a monocular video recording), and creates a high-quality space-time geometry and appearance representation. We show that a single handheld consumer-grade camera is sufficient to synthesize sophisticated renderings of a dynamic scene from novel virtual camera views, e.g. a `bullet-time' video effect. NR-NeRF disentangles the dynamic scene into a canonical volume and its deformation. Scene deformation is implemented as ray bending, where straight rays are deformed non-rigidly. We also propose a novel rigidity network to better constrain rigid regions of the scene, leading to more stable results. The ray bending and rigidity network are trained without explicit supervision. Our formulation enables dense correspondence estimation across views and time, and compelling video editing applications such as motion exaggeration. 
Our code will be open sourced.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tretschk_Non-Rigid_Neural_Radiance_Fields_Reconstruction_and_Novel_View_Synthesis_of_ICCV_2021_paper.pdf", "aff": ";;;;;", @@ -28534,7 +30468,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tretschk_Non-Rigid_Neural_Radiance_Fields_Reconstruction_and_Novel_View_Synthesis_of_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tretschk_Non-Rigid_Neural_Radiance_Fields_Reconstruction_and_Novel_View_Synthesis_of_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Tretschk_2021_ICCV,\n \n author = {\n Tretschk,\n Edgar and Tewari,\n Ayush and Golyanik,\n Vladislav and Zollh\\"ofer,\n Michael and Lassner,\n Christoph and Theobalt,\n Christian\n},\n title = {\n Non-Rigid Neural Radiance Fields: Reconstruction and Novel View Synthesis of a Dynamic Scene From Monocular Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12959-12970\n} \n}" }, { "title": "Normalization Matters in Weakly Supervised Object Localization", @@ -28542,6 +30477,7 @@ "status": "Poster", "track": "main", "pid": 4086, + "author_site": "Jeesoo Kim; Junsuk Choe; Sangdoo Yun; Nojun Kwak", "author": "Jeesoo Kim; Junsuk Choe; Sangdoo Yun; Nojun Kwak", "abstract": "Weakly-supervised object localization (WSOL) enables finding an object using a dataset without any localization information. By simply training a classification model using only image-level annotations, the feature map of a model can be utilized as a score map for localization. In spite of many WSOL methods proposing novel strategies, there has not been any de facto standards about how to normalize the class activation map (CAM). 
Consequently, many WSOL methods have failed to fully exploit their own capacity because of the misuse of a normalization method. In this paper, we review many existing normalization methods and point out that they should be used according to the property of the given dataset. Additionally, we propose a new normalization method which substantially enhances the performance of any CAM-based WSOL methods. Using the proposed normalization method, we provide a comprehensive evaluation over three datasets (CUB, ImageNet and OpenImages) on three different architectures and observe significant performance gains over the conventional normalization methods in all the evaluated cases.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Normalization_Matters_in_Weakly_Supervised_Object_Localization_ICCV_2021_paper.pdf", @@ -28565,7 +30501,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Jeesoo and Choe,\n Junsuk and Yun,\n Sangdoo and Kwak,\n Nojun\n},\n title = {\n Normalization Matters in Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3427-3436\n} \n}" }, { "title": "Normalized Human Pose Features for Human Action Video Alignment", @@ -28573,6 +30510,7 @@ "status": "Poster", "track": "main", "pid": 7038, + "author_site": "Jingyuan Liu; Mingyi Shi; Qifeng Chen; Hongbo Fu; Chiew-Lan Tai", "author": "Jingyuan Liu; Mingyi Shi; Qifeng Chen; Hongbo Fu; Chiew-Lan Tai", "abstract": "We present a novel approach for extracting human pose features from human action videos. 
The goal is to let the pose features capture only the poses of the action while being invariant to other factors, including video backgrounds, the video subject's anthropometric characteristics and viewpoints. Such human pose features facilitate the comparison of pose similarity and can be used for down-stream tasks, such as human action video alignment and pose retrieval. The key to our approach is to first normalize the poses in the video frames by retargeting the poses onto a pre-defined 3D skeleton to not only disentangle subject physical features, such as bone lengths and ratios, but also to unify global orientations of the poses. Then the normalized poses are mapped to a pose embedding space of high-level features, learned via unsupervised metric learning. We evaluate the effectiveness of our normalized features both qualitatively by visualizations, and quantitatively by a video alignment task on the Human3.6M dataset and an action recognition task on the Penn Action dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Normalized_Human_Pose_Features_for_Human_Action_Video_Alignment_ICCV_2021_paper.pdf", @@ -28589,14 +30527,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Normalized_Human_Pose_Features_for_Human_Action_Video_Alignment_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;University of Hong Kong;City University of Hong Kong", + "aff_unique_norm": "Hong Kong University of Science and Technology;The University of Hong Kong;City University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ust.hk;https://www.hku.hk;https://www.cityu.edu.hk", "aff_unique_abbr": "HKUST;HKU;CityU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Jingyuan and Shi,\n Mingyi and Chen,\n Qifeng and Fu,\n Hongbo and Tai,\n Chiew-Lan\n},\n title = {\n Normalized Human Pose Features for Human Action Video Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11521-11531\n} \n}" }, { "title": "Not All Operations Contribute Equally: Hierarchical Operation-Adaptive Predictor for Neural Architecture Search", @@ -28604,6 +30543,7 @@ "status": "Poster", "track": "main", "pid": 6895, + "author_site": "Ziye Chen; Yibing Zhan; Baosheng Yu; Mingming Gong; Bo Du", "author": "Ziye Chen; Yibing Zhan; Baosheng Yu; Mingming Gong; Bo Du", "abstract": "Graph-based predictors have recently shown promising results on neural architecture search (NAS). Despite their efficiency, current graph-based predictors treat all operations equally, resulting in biased topological knowledge of cell architectures. Intuitively, not all operations are equally significant during forwarding propagation when aggregating information from these operations to another operation. To address the above issue, we propose a Hierarchical Operation-adaptive Predictor (HOP) for NAS. HOP contains an operation-adaptive attention module (OAM) to capture the diverse knowledge between operations by learning the relative significance of operations in cell architectures during aggregation over iterations. In addition, a cell-hierarchical gated module (CGM) further refines and enriches the obtained topological knowledge of cell architectures, by integrating cell information from each iteration of OAM. The experimental results compared with state-of-the-art predictors demonstrate the capability of our proposed HOP. 
In specific, only using 0.1% training data, HOP improves kendall's Tau by 3.45%, N@5 by 20 places on NASBech-101; only using 1% training data, HOP improves kendall's Tau by 2.12%, N@5 by 18 places on NASBench-201, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Not_All_Operations_Contribute_Equally_Hierarchical_Operation-Adaptive_Predictor_for_Neural_ICCV_2021_paper.pdf", @@ -28620,14 +30560,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Not_All_Operations_Contribute_Equally_Hierarchical_Operation-Adaptive_Predictor_for_Neural_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;2;3;0", - "aff_unique_norm": "Wuhan University;JD;University of Sydney;University of Melbourne", - "aff_unique_dep": "School of Computer Science;JD Explore Academy;;School of Mathematics and Statistics", + "aff_unique_norm": "Wuhan University;JD Explore Academy;The University of Sydney;University of Melbourne", + "aff_unique_dep": "School of Computer Science;;;School of Mathematics and Statistics", "aff_unique_url": "http://www.whu.edu.cn/;;https://www.sydney.edu.au;https://www.unimelb.edu.au", "aff_unique_abbr": "WHU;;USYD;UniMelb", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Wuhan;;Melbourne", "aff_country_unique_index": "0+0;0;1;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Ziye and Zhan,\n Yibing and Yu,\n Baosheng and Gong,\n Mingming and Du,\n Bo\n},\n title = {\n Not All Operations Contribute Equally: Hierarchical Operation-Adaptive Predictor for Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10508-10517\n} \n}" }, { "title": "ODAM: Object Detection, Association, and Mapping Using Posed RGB Video", @@ -28635,6 +30576,7 @@ "status": "Poster", 
"track": "main", "pid": 2885, + "author_site": "Kejie Li; Daniel DeTone; Yu Fan (Steven) Chen; Minh Vo; Ian Reid; Hamid Rezatofighi; Chris Sweeney; Julian Straub; Richard Newcombe", "author": "Kejie Li; Daniel DeTone; Yu Fan (Steven) Chen; Minh Vo; Ian Reid; Hamid Rezatofighi; Chris Sweeney; Julian Straub; Richard Newcombe", "abstract": "Localizing objects and estimating their extent in 3D is an important step towards high-level 3D scene understanding, which has many applications in Augmented Reality and Robotics. We present ODAM, a system for 3D Object Detection, Association, and Mapping using posed RGB videos. The proposed system relies on a deep-learning-based front-end to detect 3D objects from a given RGB frame and associate them to a global object-based map using a graph neural network (GNN). Based on these frame-to-model associations, our back-end optimizes object bounding volumes, represented as super-quadrics, under multi-view geometry constraints and the object scale prior. We validate the proposed system on ScanNet where we show a significant improvement over existing RGB-only methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_ODAM_Object_Detection_Association_and_Mapping_Using_Posed_RGB_Video_ICCV_2021_paper.pdf", @@ -28651,14 +30593,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_ODAM_Object_Detection_Association_and_Mapping_Using_Posed_RGB_Video_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0;2;1;1;1", - "aff_unique_norm": "University of Adelaide;Meta;Monash University", + "aff_unique_norm": "University of Adelaide;Facebook Reality Labs;Monash University", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.adelaide.edu.au;https://www.facebook.com/realitylabs;https://www.monash.edu", "aff_unique_abbr": "Adelaide;FRL;Monash", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;0;1;1;1", - "aff_country_unique": "Australia;United 
States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Kejie and DeTone,\n Daniel and Chen,\n Yu Fan (Steven) and Vo,\n Minh and Reid,\n Ian and Rezatofighi,\n Hamid and Sweeney,\n Chris and Straub,\n Julian and Newcombe,\n Richard\n},\n title = {\n ODAM: Object Detection,\n Association,\n and Mapping Using Posed RGB Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5998-6008\n} \n}" }, { "title": "OMNet: Learning Overlapping Mask for Partial-to-Partial Point Cloud Registration", @@ -28666,6 +30609,7 @@ "status": "Poster", "track": "main", "pid": 7711, + "author_site": "Hao Xu; Shuaicheng Liu; Guangfu Wang; Guanghui Liu; Bing Zeng", "author": "Hao Xu; Shuaicheng Liu; Guangfu Wang; Guanghui Liu; Bing Zeng", "abstract": "Point cloud registration is a key task in many computational fields. Previous correspondence matching based methods require the inputs to have distinctive geometric structures to fit a 3D rigid transformation according to point-wise sparse feature matches. However, the accuracy of transformation heavily relies on the quality of extracted features, which are prone to errors with respect to partiality and noise. In addition, they can not utilize the geometric knowledge of all the overlapping regions. On the other hand, previous global feature based approaches can utilize the entire point cloud for the registration, however they ignore the negative effect of non-overlapping points when aggregating global features. In this paper, we present OMNet, a global feature based iterative network for partial-to-partial point cloud registration. We learn overlapping masks to reject non-overlapping regions, which converts the partial-to-partial registration to the registration of the same shape. 
Moreover, the previously used data is sampled only once from the CAD models for each object, resulting in the same point clouds for the source and reference. We propose a more practical manner of data generation where a CAD model is sampled twice for the source and reference, avoiding the previously prevalent over-fitting issue. Experimental results show that our method achieves state-of-the-art performance compared to traditional and deep learning based methods. Code is available at https://github.com/megvii-research/OMNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_OMNet_Learning_Overlapping_Mask_for_Partial-to-Partial_Point_Cloud_Registration_ICCV_2021_paper.pdf", @@ -28689,7 +30633,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Hao and Liu,\n Shuaicheng and Wang,\n Guangfu and Liu,\n Guanghui and Zeng,\n Bing\n},\n title = {\n OMNet: Learning Overlapping Mask for Partial-to-Partial Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3132-3141\n} \n}" }, { "title": "ORBIT: A Real-World Few-Shot Dataset for Teachable Object Recognition", @@ -28697,6 +30642,7 @@ "status": "Poster", "track": "main", "pid": 2101, + "author_site": "Daniela Massiceti; Luisa Zintgraf; John Bronskill; Lida Theodorou; Matthew Tobias Harris; Edward Cutrell; Cecily Morrison; Katja Hofmann; Simone Stumpf", "author": "Daniela Massiceti; Luisa Zintgraf; John Bronskill; Lida Theodorou; Matthew Tobias Harris; Edward Cutrell; Cecily Morrison; Katja Hofmann; Simone Stumpf", "abstract": "Object recognition has made great advances in the last decade, but predominately still relies on many high-quality training examples per object category. 
In contrast, learning new objects from only a few examples could enable many impactful applications from robotics to user personalization. Most few-shot learning research, however, has been driven by benchmark datasets that lack the high variation that these applications will face when deployed in the real-world. To close this gap, we present the ORBIT dataset and benchmark, grounded in the real-world application of teachable object recognizers for people who are blind/low-vision. The dataset contains 3,822 videos of 486 objects recorded by people who are blind/low-vision on their mobile phones. The benchmark reflects a realistic, highly challenging recognition problem, providing a rich playground to drive research in robustness to few-shot, high-variation conditions. We set the benchmark's first state-of-the-art and show there is massive scope for further innovation, holding the potential to impact a broad range of real-world vision applications including tools for the blind/low-vision community. 
We release the dataset at https://doi.org/10.25383/city.14294597 and benchmark code at https://github.com/microsoft/ORBIT-Dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Massiceti_ORBIT_A_Real-World_Few-Shot_Dataset_for_Teachable_Object_Recognition_ICCV_2021_paper.pdf", @@ -28713,14 +30659,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Massiceti_ORBIT_A_Real-World_Few-Shot_Dataset_for_Teachable_Object_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;3;0;0;0;3", - "aff_unique_norm": "Microsoft;University of Oxford;University of Cambridge;City, University of London", + "aff_unique_norm": "Microsoft Corporation;University of Oxford;University of Cambridge;City, University of London", "aff_unique_dep": "Microsoft Research;;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ox.ac.uk;https://www.cam.ac.uk;https://www.city.ac.uk", "aff_unique_abbr": "MSR;Oxford;Cambridge;City, University of London", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;1;1;1;0;0;0;1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Massiceti_2021_ICCV,\n \n author = {\n Massiceti,\n Daniela and Zintgraf,\n Luisa and Bronskill,\n John and Theodorou,\n Lida and Harris,\n Matthew Tobias and Cutrell,\n Edward and Morrison,\n Cecily and Hofmann,\n Katja and Stumpf,\n Simone\n},\n title = {\n ORBIT: A Real-World Few-Shot Dataset for Teachable Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10818-10828\n} \n}" }, { "title": "OSCAR-Net: Object-Centric Scene Graph Attention for Image Attribution", @@ -28728,10 +30675,11 @@ "status": "Poster", "track": "main", "pid": 3468, + "author_site": "Eric Nguyen; Tu Bui; Viswanathan 
Swaminathan; John Collomosse", "author": "Eric Nguyen; Tu Bui; Viswanathan Swaminathan; John Collomosse", "abstract": "Images tell powerful stories but cannot always be trusted. Matching images back to trusted sources (attribution) enables users to make a more informed judgment of the images they encounter online. We propose a robust image hashing algorithm to perform such matching. Our hash is sensitive to manipulation of subtle, salient visual details that can substantially change the story told by an image. Yet the hash is invariant to benign transformations (changes in quality, codecs, sizes, shapes, etc.) experienced by images during online redistribution. Our key contribution is OSCAR-Net (Object-centric Scene Graph Attention for Image Attribution Network); a robust image hashing model inspired by recent successes of Transformers in the visual domain. OSCAR-Net constructs a scene graph representation that attends to fine-grained changes of every object's visual appearance and their spatial relationships. 
The network is trained via contrastive learning on a dataset of original and manipulated images yielding a state of the art image hash for content fingerprinting that scales to millions of images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nguyen_OSCAR-Net_Object-Centric_Scene_Graph_Attention_for_Image_Attribution_ICCV_2021_paper.pdf", - "aff": "Adobe Research, Creative Intelligence Lab \u2014 San Jose, CA; CVSSP, University of Surrey \u2014 Guildford, UK; Adobe Research, Creative Intelligence Lab \u2014 San Jose, CA; Adobe Research, Creative Intelligence Lab \u2014 San Jose, CA + CVSSP, University of Surrey \u2014 Guildford, UK", + "aff": "Adobe Research, Creative Intelligence Lab — San Jose, CA; CVSSP, University of Surrey — Guildford, UK; Adobe Research, Creative Intelligence Lab — San Jose, CA; Adobe Research, Creative Intelligence Lab — San Jose, CA + CVSSP, University of Surrey — Guildford, UK", "project": "https://exnx.github.io/oscar/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Nguyen_OSCAR-Net_Object-Centric_Scene_ICCV_2021_supplemental.pdf", @@ -28744,14 +30692,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Nguyen_OSCAR-Net_Object-Centric_Scene_Graph_Attention_for_Image_Attribution_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0+1", - "aff_unique_norm": "Adobe;University of Surrey", + "aff_unique_norm": "Adobe Research;University of Surrey", "aff_unique_dep": "Creative Intelligence Lab;CVSSP", "aff_unique_url": "https://research.adobe.com;https://www.surrey.ac.uk", "aff_unique_abbr": "Adobe;Surrey", "aff_campus_unique_index": "0;1;0;0+1", "aff_campus_unique": "San Jose;Guildford", "aff_country_unique_index": "0;1;0;0+1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Nguyen_2021_ICCV,\n \n author = {\n Nguyen,\n Eric and Bui,\n Tu and Swaminathan,\n Viswanathan and 
Collomosse,\n John\n},\n title = {\n OSCAR-Net: Object-Centric Scene Graph Attention for Image Attribution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14499-14508\n} \n}" }, { "title": "OVANet: One-vs-All Network for Universal Domain Adaptation", @@ -28759,6 +30708,7 @@ "status": "Poster", "track": "main", "pid": 5753, + "author_site": "Kuniaki Saito; Kate Saenko", "author": "Kuniaki Saito; Kate Saenko", "abstract": "Universal Domain Adaptation (UNDA) aims to handle both domain-shift and category-shift between two datasets, where the main challenge is to transfer knowledge while rejecting \"unknown\" classes which are absent in the labeled source data but present in the unlabeled target data. Existing methods manually set a threshold to reject \"unknown\" samples based on validation or a pre-defined ratio of \"unknown\" samples, but this strategy is not practical. In this paper, we propose a method to learn the threshold using source samples and to adapt it to the target domain. Our idea is that a minimum inter-class distance in the source domain should be a good threshold to decide between \"known\" or \"unknown\" in the target. To learn the inter- and intra-class distance, we propose to train a one-vs-all classifier for each class using labeled source data. Then, we adapt the open-set classifier to the target domain by minimizing class entropy. 
The resulting framework is the simplest of all baselines of UNDA and is insensitive to the value of a hyper-parameter, yet outperforms baselines with a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Saito_OVANet_One-vs-All_Network_for_Universal_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -28782,7 +30732,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Saito_2021_ICCV,\n \n author = {\n Saito,\n Kuniaki and Saenko,\n Kate\n},\n title = {\n OVANet: One-vs-All Network for Universal Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9000-9009\n} \n}" }, { "title": "OadTR: Online Action Detection With Transformers", @@ -28790,6 +30741,7 @@ "status": "Poster", "track": "main", "pid": 1322, + "author_site": "Xiang Wang; Shiwei Zhang; Zhiwu Qing; Yuanjie Shao; Zhengrong Zuo; Changxin Gao; Nong Sang", "author": "Xiang Wang; Shiwei Zhang; Zhiwu Qing; Yuanjie Shao; Zhengrong Zuo; Changxin Gao; Nong Sang", "abstract": "Most recent approaches for online action detection tend to apply Recurrent Neural Network (RNN) to capture long-range temporal structure. However, RNN suffers from non-parallelism and gradient vanishing, hence it is hard to be optimized. In this paper, we propose a new encoder-decoder framework based on Transformers, named OadTR, to tackle these problems. The encoder attached with a task token aims to capture the relationships and global interactions between historical observations. The decoder extracts auxiliary information by aggregating anticipated future clip representations. Therefore, OadTR can recognize current actions by encoding historical information and predicting future context simultaneously. 
We extensively evaluate the proposed OadTR on three challenging datasets: HDD, TVSeries, and THUMOS14. The experimental results show that OadTR achieves higher training and inference speeds than current RNN based approaches, and significantly outperforms the state-of-the-art methods in terms of both mAP and mcAP. Code is available at https: //github.com/wangxiang1230/OadTR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_OadTR_Online_Action_Detection_With_Transformers_ICCV_2021_paper.pdf", @@ -28813,7 +30765,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Xiang and Zhang,\n Shiwei and Qing,\n Zhiwu and Shao,\n Yuanjie and Zuo,\n Zhengrong and Gao,\n Changxin and Sang,\n Nong\n},\n title = {\n OadTR: Online Action Detection With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7565-7575\n} \n}" }, { "title": "Object Tracking by Jointly Exploiting Frame and Event Domain", @@ -28821,6 +30774,7 @@ "status": "Poster", "track": "main", "pid": 8103, + "author_site": "Jiqing Zhang; Xin Yang; Yingkai Fu; Xiaopeng Wei; Baocai Yin; Bo Dong", "author": "Jiqing Zhang; Xin Yang; Yingkai Fu; Xiaopeng Wei; Baocai Yin; Bo Dong", "abstract": "Inspired by the complementarity between conventional frame-based and bio-inspired event-based cameras, we propose a multi-modal based approach to fuse visual cues from the frame- and event-domain to enhance the single object tracking performance, especially in degraded conditions (e.g., scenes with high dynamic range, low light, and fast-motion objects). The proposed approach can effectively and adaptively combine meaningful information from both domains. 
Our approach's effectiveness is enforced by a novel designed cross-domain attention schemes, which can effectively enhance features based on self- and cross-domain attention schemes; The adaptiveness is guarded by a specially designed weighting scheme, which can adaptively balance the contribution of the two domains. To exploit event-based visual cues in single-object tracking, we construct a large-scale frame-event-based dataset, which we subsequently employ to train a novel frame-event fusion based model. Extensive experiments show that the proposed approach outperforms state-of-the-art frame-based tracking methods by at least 10.4% and 11.9% in terms of representative success rate and precision rate, respectively. Besides, the effectiveness of each key component of our approach is evidenced by our thorough ablation study.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Object_Tracking_by_Jointly_Exploiting_Frame_and_Event_Domain_ICCV_2021_paper.pdf", @@ -28844,7 +30798,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Jiqing and Yang,\n Xin and Fu,\n Yingkai and Wei,\n Xiaopeng and Yin,\n Baocai and Dong,\n Bo\n},\n title = {\n Object Tracking by Jointly Exploiting Frame and Event Domain\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13043-13052\n} \n}" }, { "title": "Objects As Cameras: Estimating High-Frequency Illumination From Shadows", @@ -28852,6 +30807,7 @@ "status": "Poster", "track": "main", "pid": 10992, + "author_site": "Tristan Swedish; Connor Henley; Ramesh Raskar", "author": "Tristan Swedish; Connor Henley; Ramesh Raskar", "abstract": "We recover high-frequency information encoded in the 
shadows cast by an object to estimate a hemispherical photograph from the viewpoint of the object, effectively turning objects into cameras. Estimating environment maps is useful for advanced image editing tasks such as relighting, object insertion or removal, and material parameter estimation. Because the problem is ill-posed, recent works in illumination recovery have tackled the problem of low-frequency lighting for object insertion, rely upon specular surface materials, or make use of data-driven methods that are susceptible to hallucination without physically plausible constraints. We incorporate an optimization scheme to update scene parameters that could enable practical capture of real-world scenes. Furthermore, we develop a methodology for evaluating expected recovery performance for different types and shapes of objects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Swedish_Objects_As_Cameras_Estimating_High-Frequency_Illumination_From_Shadows_ICCV_2021_paper.pdf", @@ -28866,7 +30822,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Swedish_Objects_As_Cameras_Estimating_High-Frequency_Illumination_From_Shadows_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Swedish_Objects_As_Cameras_Estimating_High-Frequency_Illumination_From_Shadows_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Swedish_2021_ICCV,\n \n author = {\n Swedish,\n Tristan and Henley,\n Connor and Raskar,\n Ramesh\n},\n title = {\n Objects As Cameras: Estimating High-Frequency Illumination From Shadows\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2593-2602\n} \n}" }, { "title": "Occlude Them All: Occlusion-Aware Attention Network for Occluded Person Re-ID", @@ -28874,10 +30831,11 @@ "status": "Poster", "track": "main", "pid": 8635, - "author": 
"Peixian Chen; Wenfeng Liu; Pingyang Dai; Jianzhuang Liu; Qixiang Ye; Mingliang Xu; Qi\u2019an Chen; Rongrong Ji", + "author_site": "Peixian Chen; Wenfeng Liu; Pingyang Dai; Jianzhuang Liu; Qixiang Ye; Mingliang Xu; Qi’an Chen; Rongrong Ji", + "author": "Peixian Chen; Wenfeng Liu; Pingyang Dai; Jianzhuang Liu; Qixiang Ye; Mingliang Xu; Qi’an Chen; Rongrong Ji", "abstract": "Person Re-Identification (ReID) has achieved remarkable performance along with the deep learning era. However, most approaches carry out ReID only based upon holistic pedestrian regions. In contrast, real-world scenarios involve occluded pedestrians, which provide partial visual appearances and destroy the ReID accuracy. A common strategy is to locate visible body parts by auxiliary model, which however suffers from significant domain gaps and data bias issues. To avoid such problematic models in occluded person ReID, we propose the Occlusion-Aware Mask Network (OAMN). In particular, we incorporate an attention-guided mask module, which requires guidance from labeled occlusion data. To this end, we propose a novel occlusion augmentation scheme that produces diverse and precisely labeled occlusion for any holistic dataset. The proposed scheme suits real-world scenarios better than existing schemes, which consider only limited types of occlusions. We also offer a novel occlusion unification scheme to tackle ambiguity information at the test phase. The above three components enable existing attention mechanisms to precisely capture body parts regardless of the occlusion. 
Comprehensive experiments on a variety of person ReID benchmarks demonstrate the superiority of OAMN over state-of-the-arts.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Occlude_Them_All_Occlusion-Aware_Attention_Network_for_Occluded_Person_Re-ID_ICCV_2021_paper.pdf", - "aff": "Xiamen University, China+Tencent YouTu Lab; Xiamen University, China; Xiamen University, China; Noah\u2019s Ark Lab, Huawei Technologies; University of Chinese Academy of Sciences, China; Zhengzhou University, China; Xiamen University, China; Xiamen University, China", + "aff": "Xiamen University, China+Tencent YouTu Lab; Xiamen University, China; Xiamen University, China; Noah’s Ark Lab, Huawei Technologies; University of Chinese Academy of Sciences, China; Zhengzhou University, China; Xiamen University, China; Xiamen University, China", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Chen_Occlude_Them_All_ICCV_2021_supplemental.pdf", @@ -28890,14 +30848,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Occlude_Them_All_Occlusion-Aware_Attention_Network_for_Occluded_Person_Re-ID_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;2;3;4;0;0", - "aff_unique_norm": "Xiamen University;Tencent;Huawei;University of Chinese Academy of Sciences;Zhengzhou University", - "aff_unique_dep": ";YouTu Lab;Noah\u2019s Ark Lab;;", + "aff_unique_norm": "Xiamen University;Tencent;Huawei Technologies;University of Chinese Academy of Sciences;Zhengzhou University", + "aff_unique_dep": ";YouTu Lab;Noah’s Ark Lab;;", "aff_unique_url": "https://www.xmu.edu.cn;https://www.tencent.com;https://www.huawei.com;http://www.ucas.ac.cn;http://www.zzu.edu.cn", "aff_unique_abbr": "XMU;Tencent;Huawei;UCAS;ZZU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Peixian and Liu,\n Wenfeng and Dai,\n Pingyang and Liu,\n Jianzhuang and Ye,\n Qixiang and Xu,\n Mingliang and Chen,\n Qi{\\textquoteright\n}an and Ji,\n Rongrong\n},\n title = {\n Occlude Them All: Occlusion-Aware Attention Network for Occluded Person Re-ID\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11833-11842\n} \n}" }, { "title": "Occluded Person Re-Identification With Single-Scale Global Representations", @@ -28905,6 +30864,7 @@ "status": "Poster", "track": "main", "pid": 5681, + "author_site": "Cheng Yan; Guansong Pang; Jile Jiao; Xiao Bai; Xuetao Feng; Chunhua Shen", "author": "Cheng Yan; Guansong Pang; Jile Jiao; Xiao Bai; Xuetao Feng; Chunhua Shen", "abstract": "Occluded person re-identification (ReID) aims at re-identifying occluded pedestrians from occluded or holistic images taken across multiple cameras. Current state-of-the-art (SOTA) occluded ReID models rely on some auxiliary modules, including pose estimation, feature pyramid and graph matching modules, to learn multi-scale and/or part-level features to tackle the occlusion challenges. This unfortunately leads to complex ReID models that (i) fail to generalize to challenging occlusions of diverse appearance, shape or size, and (ii) become ineffective in handling non-occluded pedestrians. However, real-world ReID applications typically have highly diverse occlusions and involve a hybrid of occluded and non-occluded pedestrians. To address these two issues, we introduce a novel ReID model that learns discriminative single-scale global-level pedestrian features by enforcing a novel exponentially sensitive yet bounded distance loss on occlusion-based augmented data. 
We show for the first time that learning single-scale global features without using these auxiliary modules is able to outperform those SOTA multi-scale and/or part-level feature-based models. Further, our simple model can achieve new SOTA performance in both occluded and non-occluded ReID, as shown by extensive results on three occluded and two general ReID benchmarks. Additionally, we create a large-scale occluded person ReID dataset with both indoor and outdoor occlusions in different scenes, which is significantly larger and contains substantially more diverse occlusions and pedestrian dressings than existing occluded ReID datasets, providing a more faithful occluded ReID benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yan_Occluded_Person_Re-Identification_With_Single-Scale_Global_Representations_ICCV_2021_paper.pdf", @@ -28928,7 +30888,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Yan_2021_ICCV,\n \n author = {\n Yan,\n Cheng and Pang,\n Guansong and Jiao,\n Jile and Bai,\n Xiao and Feng,\n Xuetao and Shen,\n Chunhua\n},\n title = {\n Occluded Person Re-Identification With Single-Scale Global Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11875-11884\n} \n}" }, { "title": "Occlusion-Aware Video Object Inpainting", @@ -28936,6 +30897,7 @@ "status": "Poster", "track": "main", "pid": 4194, + "author_site": "Lei Ke; Yu-Wing Tai; Chi-Keung Tang", "author": "Lei Ke; Yu-Wing Tai; Chi-Keung Tang", "abstract": "Conventional video inpainting is neither object-oriented nor occlusion-aware, making it liable to obvious artifacts when large occluded object regions are inpainted. 
This paper presents occlusion-aware video object inpainting, which recovers both the complete shape and appearance for occluded objects in videos given their visible mask segmentation. To facilitate this new research, we construct the first large-scale video object inpainting benchmark YouTube-VOI to provide realistic occlusion scenarios with both occluded and visible object masks available. Our technical contribution VOIN jointly performs video object shape completion and occluded texture generation. In particular, the shape completion module models long-range object coherence while the flow completion module recovers accurate flow with sharp motion boundary, for propagating temporally-consistent texture to the same moving object across frames. For more realistic results, VOIN is optimized using both T-PatchGAN and a new spatio-temporal attention-based multi-class discriminator. Finally, we compare VOIN and strong baselines on YouTube-VOI. Experimental results clearly demonstrate the efficacy of our method including inpainting complex and dynamic objects. 
VOIN degrades gracefully with inaccurate input visible mask.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ke_Occlusion-Aware_Video_Object_Inpainting_ICCV_2021_paper.pdf", @@ -28959,7 +30921,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ke_2021_ICCV,\n \n author = {\n Ke,\n Lei and Tai,\n Yu-Wing and Tang,\n Chi-Keung\n},\n title = {\n Occlusion-Aware Video Object Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14468-14478\n} \n}" }, { "title": "Omni-GAN: On the Secrets of cGANs and Beyond", @@ -28967,6 +30930,7 @@ "status": "Poster", "track": "main", "pid": 2425, + "author_site": "Peng Zhou; Lingxi Xie; Bingbing Ni; Cong Geng; Qi Tian", "author": "Peng Zhou; Lingxi Xie; Bingbing Ni; Cong Geng; Qi Tian", "abstract": "The conditional generative adversarial network (cGAN) is a powerful tool of generating high-quality images, but existing approaches mostly suffer unsatisfying performance or the risk of mode collapse. This paper presents Omni-GAN, a variant of cGAN that reveals the devil in designing a proper discriminator for training the model. The key is to ensure that the discriminator receives strong supervision to perceive the concepts and moderate regularization to avoid collapse. Omni-GAN is easily implemented and freely integrated with off-the-shelf encoding methods (e.g., implicit neural representation, INR). Experiments validate the superior performance of Omni-GAN and Omni-INR-GAN in a wide range of image generation and restoration tasks. 
In particular, Omni-INR-GAN sets new records on the ImageNet dataset with impressive Inception scores of 262.85 and 343.22 for the image sizes of 128 and 256, respectively, surpassing the previous records by 100+ points. Moreover, leveraging the generator prior, Omni-INR-GAN can extrapolate low-resolution images to arbitrary resolution, even up to x60+ higher resolution. Code is available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Omni-GAN_On_the_Secrets_of_cGANs_and_Beyond_ICCV_2021_paper.pdf", @@ -28984,13 +30948,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Omni-GAN_On_the_Secrets_of_cGANs_and_Beyond_ICCV_2021_paper.html", "aff_unique_index": "0;1;0+1;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei", - "aff_unique_dep": ";Huawei", + "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com", "aff_unique_abbr": "SJTU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Peng and Xie,\n Lingxi and Ni,\n Bingbing and Geng,\n Cong and Tian,\n Qi\n},\n title = {\n Omni-GAN: On the Secrets of cGANs and Beyond\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14061-14071\n} \n}" }, { "title": "Omnidata: A Scalable Pipeline for Making Multi-Task Mid-Level Vision Datasets From 3D Scans", @@ -28998,6 +30963,7 @@ "status": "Poster", "track": "main", "pid": 11164, + "author_site": "Ainaz Eftekhar; Alexander Sax; Jitendra Malik; Amir Zamir", "author": "Ainaz Eftekhar; Alexander Sax; Jitendra Malik; Amir Zamir", "abstract": "Computer vision now relies on data, but we know surprisingly little about what factors in the data affect performance. 
We argue that this stems from the way data is collected. Designing and collecting static datasets of images (or videos) locks us in to specific design choices and limits us to post-hoc analyses. In practice, vision datasets only include specific domains and tasks. This not only makes it necessary and difficult to combine datsets, but leads to scattershot overall coverage that frustrates systemic research into the interaction of tasks, data, models, and learning algorithms. For example, if a model trained for ImageNet classification on ImageNet transfers better to CoCo than does a model trained for Kitti depth estimation--is that due to the difference in tasks or the different training data? We note that one way to do this is to use a comprehensive, standardized scene representation that contains extra information about the scene, and then to use that to create a specific dataset of study. We introduce a platform for doing this. Specifically, we provide a pipeline that takes as input a 3D scans and generates multi-task datasets of mid-level cues. The pipeline exposes complete control over the generation process, is implemented in mostly python, and we provide ecosystem tools such as a Docker and PyTorch dataloaders. We also provide a starter dataset of several recent 3D scan datasets, processed into standard static datasets of mid-level cues. We show that this starter dataset (generated from the annotator pipeline) is reliable; it yields models that provide state-of-the-art performance for several tasks. It yields human-level surface normal estimation performance on OASIS, despite having never seen OASIS data during training. With the proliferation of cheaper 3D sensors (e.g. on the newest iPhone), we anticipate that releasing an automated tool for this processing pipeline will allow the starter set to continue to expand and cover more domains. 
We examine a few small examples of using this procedure to analyze the relationship of data, tasks, models and learning algorithms, and suggest several exciting directions that are well out of the scope of this paper.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Eftekhar_Omnidata_A_Scalable_Pipeline_for_Making_Multi-Task_Mid-Level_Vision_Datasets_ICCV_2021_paper.pdf", @@ -29012,7 +30978,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Eftekhar_Omnidata_A_Scalable_Pipeline_for_Making_Multi-Task_Mid-Level_Vision_Datasets_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Eftekhar_Omnidata_A_Scalable_Pipeline_for_Making_Multi-Task_Mid-Level_Vision_Datasets_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Eftekhar_2021_ICCV,\n \n author = {\n Eftekhar,\n Ainaz and Sax,\n Alexander and Malik,\n Jitendra and Zamir,\n Amir\n},\n title = {\n Omnidata: A Scalable Pipeline for Making Multi-Task Mid-Level Vision Datasets From 3D Scans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10786-10796\n} \n}" }, { "title": "Omniscient Video Super-Resolution", @@ -29020,6 +30987,7 @@ "status": "Poster", "track": "main", "pid": 2592, + "author_site": "Peng Yi; Zhongyuan Wang; Kui Jiang; Junjun Jiang; Tao Lu; Xin Tian; Jiayi Ma", "author": "Peng Yi; Zhongyuan Wang; Kui Jiang; Junjun Jiang; Tao Lu; Xin Tian; Jiayi Ma", "abstract": "Most recent video super-resolution (SR) methods either adopt an iterative manner to deal with low-resolution (LR) frames from a temporally sliding window, or leverage the previously estimated SR output to help reconstruct the current frame recurrently. A few studies try to combine these two structures to form a hybrid framework but have failed to give full play to it. 
In this paper, we propose an omniscient framework to not only utilize the preceding SR output, but also leverage the SR outputs from the present and future. The omniscient framework is more generic because the iterative, recurrent and hybrid frameworks can be regarded as its special cases. The proposed omniscient framework enables a generator to behave better than its counterparts under other frameworks. Abundant experiments on public datasets show that our method is superior to the state-of-the-art methods in objective metrics, subjective visual effects and complexity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yi_Omniscient_Video_Super-Resolution_ICCV_2021_paper.pdf", @@ -29036,14 +31004,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yi_Omniscient_Video_Super-Resolution_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1+2;3;0;0", - "aff_unique_norm": "Wuhan University;Harbin Institute of Technology;Pengcheng Laboratory;Wuhan Institute of Technology", - "aff_unique_dep": "School of Computer Science;School of Computer Science and Technology;Peng Cheng Laboratory;School of Computer Science and Engineering", + "aff_unique_norm": "Wuhan University;Harbin Institute of Technology;Peng Cheng Laboratory;Wuhan Institute of Technology", + "aff_unique_dep": "School of Computer Science;School of Computer Science and Technology;;School of Computer Science and Engineering", "aff_unique_url": "http://www.whu.edu.cn/;http://www.hit.edu.cn/;http://www.pcl.ac.cn;http://www.wit.edu.cn/", "aff_unique_abbr": "WHU;HIT;PCL;WIT", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Harbin;Wuhan", "aff_country_unique_index": "0;0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yi_2021_ICCV,\n \n author = {\n Yi,\n Peng and Wang,\n Zhongyuan and Jiang,\n Kui and Jiang,\n Junjun and Lu,\n Tao and Tian,\n Xin and Ma,\n Jiayi\n},\n title = {\n Omniscient Video 
Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4429-4438\n} \n}" }, { "title": "On Compositions of Transformations in Contrastive Self-Supervised Learning", @@ -29051,7 +31020,8 @@ "status": "Poster", "track": "main", "pid": 1009, - "author": "Mandela Patrick; Yuki M. Asano; Polina Kuznetsova; Ruth Fong; Jo\u00e3o F. Henriques; Geoffrey Zweig; Andrea Vedaldi", + "author_site": "Mandela Patrick; Yuki M. Asano; Polina Kuznetsova; Ruth Fong; João F. Henriques; Geoffrey Zweig; Andrea Vedaldi", + "author": "Mandela Patrick; Yuki M. Asano; Polina Kuznetsova; Ruth Fong; João F. Henriques; Geoffrey Zweig; Andrea Vedaldi", "abstract": "In the image domain, excellent representations can be learned by inducing invariance to content-preserving transformations via noise contrastive learning. In this paper, we generalize contrastive learning to a wider set of transformations, and their compositions, for which either invariance or distinctiveness is sought. We show that it is not immediately obvious how existing methods such as SimCLR can be extended to do so. Instead, we introduce a number of formal requirements that all contrastive formulations must satisfy, and propose a practical construction which satisfies these requirements. In order to maximise the reach of this analysis, we express all components of noise contrastive formulations as the choice of certain generalized transformations of the data (GDTs), including data sampling. We then consider videos as an example of data in which a large variety of transformations are applicable, accounting for the extra modalities -- for which we analyze audio and text -- and the dimension of time. 
We find that being invariant to certain transformations and distinctive to others is critical to learning effective video representations, improving the state-of-the-art for multiple benchmarks by a large margin, and even surpassing supervised pretraining.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Patrick_On_Compositions_of_Transformations_in_Contrastive_Self-Supervised_Learning_ICCV_2021_paper.pdf", "aff": "Facebook AI Research + Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford; Facebook AI Research; Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford; Facebook AI Research; Facebook AI Research + Visual Geometry Group, University of Oxford", @@ -29067,14 +31037,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Patrick_On_Compositions_of_Transformations_in_Contrastive_Self-Supervised_Learning_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;0;1;1;0;0+1", - "aff_unique_norm": "Meta;University of Oxford", + "aff_unique_norm": "Facebook;University of Oxford", "aff_unique_dep": "Facebook AI Research;Visual Geometry Group", "aff_unique_url": "https://research.facebook.com;https://www.ox.ac.uk", "aff_unique_abbr": "FAIR;Oxford", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0+1;1;0;1;1;0;0+1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Patrick_2021_ICCV,\n \n author = {\n Patrick,\n Mandela and Asano,\n Yuki M. and Kuznetsova,\n Polina and Fong,\n Ruth and Henriques,\n Jo\\~ao F. 
and Zweig,\n Geoffrey and Vedaldi,\n Andrea\n},\n title = {\n On Compositions of Transformations in Contrastive Self-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9577-9587\n} \n}" }, { "title": "On Equivariant and Invariant Learning of Object Landmark Representations", @@ -29082,6 +31053,7 @@ "status": "Poster", "track": "main", "pid": 3080, + "author_site": "Zezhou Cheng; Jong-Chyi Su; Subhransu Maji", "author": "Zezhou Cheng; Jong-Chyi Su; Subhransu Maji", "abstract": "Given a collection of images, humans are able to discover landmarks by modeling the shared geometric structure across instances. This idea of geometric equivariance has been widely used for the unsupervised discovery of object landmark representations. In this paper, we develop a simple and effective approach by combining instance-discriminative and spatially-discriminative contrastive learning. We show that when a deep network is trained to be invariant to geometric and photometric transformations, representations emerge from its intermediate layers that are highly predictive of object landmarks. Stacking these across layers in a \"hypercolumn\" and projecting them using spatially-contrastive learning further improves their performance on matching and few-shot landmark regression tasks. We also present a unified view of existing equivariant and invariant representation learning approaches through the lens of contrastive learning, shedding light on the nature of invariances learned. 
Experiments on standard benchmarks for landmark learning, as well as a new challenging one we propose, show that the proposed approach surpasses prior state-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_On_Equivariant_and_Invariant_Learning_of_Object_Landmark_Representations_ICCV_2021_paper.pdf", @@ -29105,7 +31077,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Zezhou and Su,\n Jong-Chyi and Maji,\n Subhransu\n},\n title = {\n On Equivariant and Invariant Learning of Object Landmark Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9897-9906\n} \n}" }, { "title": "On Exposing the Challenging Long Tail in Future Prediction of Traffic Actors", @@ -29113,7 +31086,8 @@ "status": "Poster", "track": "main", "pid": 5794, - "author": "Osama Makansi; \u00d6zg\u00fcn \u00c7i\u00e7ek; Yassine Marrakchi; Thomas Brox", + "author_site": "Osama Makansi; Özgün Çiçek; Yassine Marrakchi; Thomas Brox", + "author": "Osama Makansi; Özgün Çiçek; Yassine Marrakchi; Thomas Brox", "abstract": "Predicting the future states of dynamic traffic actors enables autonomous systems to avoid accidents and operate safely. Remarkably, the most critical scenarios are much less frequent and more complex than the uncritical ones. Therefore, uncritical cases dominate the prediction. In this paper, we address specifically the challenging scenarios at the long tail of the dataset distribution. Our analysis shows that the common losses tend to place challenging cases sub-optimally in the embedding space. 
As a consequence, we propose to supplement the usual loss with a loss that places challenging cases closer to each other in the embedding space. This triggers sharing information among challenging cases and learning specific predictive features. We show on four public datasets that this leads to improved performance on the hard scenarios while the overall performance stays stable. The approach is agnostic w.r.t. the used network architecture, input modality or viewpoint, and can be integrated into existing solutions easily.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Makansi_On_Exposing_the_Challenging_Long_Tail_in_Future_Prediction_of_ICCV_2021_paper.pdf", "aff": "University of Freiburg; University of Freiburg; University of Freiburg; University of Freiburg", @@ -29136,7 +31110,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Makansi_2021_ICCV,\n \n author = {\n Makansi,\n Osama and \\c{C\n}i\\c{c\n}ek,\n \\"Ozg\\"un and Marrakchi,\n Yassine and Brox,\n Thomas\n},\n title = {\n On Exposing the Challenging Long Tail in Future Prediction of Traffic Actors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13147-13157\n} \n}" }, { "title": "On Feature Decorrelation in Self-Supervised Learning", @@ -29144,6 +31119,7 @@ "status": "Poster", "track": "main", "pid": 4204, + "author_site": "Tianyu Hua; Wenxiao Wang; Zihui Xue; Sucheng Ren; Yue Wang; Hang Zhao", "author": "Tianyu Hua; Wenxiao Wang; Zihui Xue; Sucheng Ren; Yue Wang; Hang Zhao", "abstract": "In self-supervised representation learning, a common idea behind most of the state-of-the-art approaches is to enforce the robustness of the representations to predefined augmentations. 
A potential issue of this idea is the existence of completely collapsed solutions (i.e., constant features), which are typically avoided implicitly by carefully chosen implementation details. In this work, we study a relatively concise framework containing the most common components from recent approaches. We verify the existence of complete collapse and discover another reachable collapse pattern that is usually overlooked, namely dimensional collapse. We connect dimensional collapse with strong correlations between axes and consider such connection as a strong motivation for feature decorrelation (i.e., standardizing the covariance matrix). The gains from feature decorrelation are verified empirically to highlight the importance and the potential of this insight.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hua_On_Feature_Decorrelation_in_Self-Supervised_Learning_ICCV_2021_paper.pdf", @@ -29167,7 +31143,8 @@ "aff_campus_unique_index": ";;1;;", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0+0;0+0;1+0+0;0+0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Hua_2021_ICCV,\n \n author = {\n Hua,\n Tianyu and Wang,\n Wenxiao and Xue,\n Zihui and Ren,\n Sucheng and Wang,\n Yue and Zhao,\n Hang\n},\n title = {\n On Feature Decorrelation in Self-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9598-9608\n} \n}" }, { "title": "On Generating Transferable Targeted Perturbations", @@ -29175,10 +31152,11 @@ "status": "Poster", "track": "main", "pid": 10534, + "author_site": "Muzammal Naseer; Salman Khan; Munawar Hayat; Fahad Shahbaz Khan; Fatih Porikli", "author": "Muzammal Naseer; Salman Khan; Munawar Hayat; Fahad Shahbaz Khan; Fatih Porikli", "abstract": "While the untargeted black-box transferability of adversarial 
perturbations has been extensively studied before, changing an unseen model's decisions to a specific `targeted' class remains a challenging feat. In this paper, we propose a new generative approach for highly transferable targeted perturbations (\\ours). We note that the existing methods are less suitable for this task due to their reliance on class-boundary information that changes from one model to another, thus reducing transferability. In contrast, our approach matches perturbed image `distribution' with that of the target class, leading to high targeted transferability rates. To this end, we propose a new objective function that not only aligns the global distributions of source and target images, but also matches the local neighbourhood structure between the two domains. Based on the proposed objective, we train a generator function that can adaptively synthesize perturbations specific to a given input. Our generative approach is independent of the source or target domain labels, while consistently performs well against state-of-the-art methods on a wide range of attack settings. As an example, we achieve 32.63% target transferability from (an adversarially weak) VGG19_ BN to (a strong) WideResNet on ImageNet val. 
set, which is 4xhigher than the previous best generative attack and 16xbetter than instance-specific iterative attack.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Naseer_On_Generating_Transferable_Targeted_Perturbations_ICCV_2021_paper.pdf", - "aff": "Australian National University, Australia+Mohamed bin Zayed University of Artificial Intelligence, UAE+Link\u00f6ping University, Sweden; Mohamed bin Zayed University of Artificial Intelligence, UAE+Link\u00f6ping University, Sweden; Monash University, Australia; Mohamed bin Zayed University of Artificial Intelligence, UAE+Link\u00f6ping University, Sweden; Qualcomm, USA", + "aff": "Australian National University, Australia+Mohamed bin Zayed University of Artificial Intelligence, UAE+Linköping University, Sweden; Mohamed bin Zayed University of Artificial Intelligence, UAE+Linköping University, Sweden; Monash University, Australia; Mohamed bin Zayed University of Artificial Intelligence, UAE+Linköping University, Sweden; Qualcomm, USA", "project": "", "github": "https://github.com/Muzammal-Naseer/TTP", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Naseer_On_Generating_Transferable_ICCV_2021_supplemental.pdf", @@ -29191,14 +31169,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Naseer_On_Generating_Transferable_Targeted_Perturbations_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;1+2;3;1+2;4", - "aff_unique_norm": "Australian National University;Mohamed bin Zayed University of Artificial Intelligence;Link\u00f6ping University;Monash University;Qualcomm", + "aff_unique_norm": "Australian National University;Mohamed bin Zayed University of Artificial Intelligence;Linköping University;Monash University;Qualcomm", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.anu.edu.au;https://mbzuai.ac.ae;https://www.liu.se;https://www.monash.edu;https://www.qualcomm.com", "aff_unique_abbr": "ANU;MBZUAI;LiU;Monash;QCOM", "aff_campus_unique_index": 
";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1+2;1+2;0;1+2;3", - "aff_country_unique": "Australia;United Arab Emirates;Sweden;United States" + "aff_country_unique": "Australia;United Arab Emirates;Sweden;United States", + "bibtex": "@InProceedings{Naseer_2021_ICCV,\n \n author = {\n Naseer,\n Muzammal and Khan,\n Salman and Hayat,\n Munawar and Khan,\n Fahad Shahbaz and Porikli,\n Fatih\n},\n title = {\n On Generating Transferable Targeted Perturbations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7708-7717\n} \n}" }, { "title": "On the Hidden Treasure of Dialog in Video Question Answering", @@ -29206,7 +31185,8 @@ "status": "Poster", "track": "main", "pid": 9161, - "author": "Deniz Engin; Fran\u00e7ois Schnitzler; Ngoc Q. K. Duong; Yannis Avrithis", + "author_site": "Deniz Engin; François Schnitzler; Ngoc Q. K. Duong; Yannis Avrithis", + "author": "Deniz Engin; François Schnitzler; Ngoc Q. K. Duong; Yannis Avrithis", "abstract": "High-level understanding of stories in video such as movies and TV shows from raw data is extremely challenging. Modern video question answering (VideoQA) systems often use additional human-made sources like plot synopses, scripts, video descriptions or knowledge bases. In this work, we present a new approach to understand the whole story without such external sources. The secret lies in the dialog: unlike any prior work, we treat dialog as a noisy source to be converted into text description via dialog summarization, much like recent methods treat video. The input of each modality is encoded by transformers independently, and a simple fusion method combines all modalities, using soft temporal attention for localization over long inputs. Our model outperforms the state of the art on the KnowIT VQA dataset by a large margin, without using question-specific human annotation or human-made plot summaries. 
It even outperforms human evaluators who have never watched any whole episode before. Code is available at https://engindeniz.github.io/dialogsummary-videoqa", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Engin_On_the_Hidden_Treasure_of_Dialog_in_Video_Question_Answering_ICCV_2021_paper.pdf", "aff": "Inria, Univ Rennes, CNRS, IRISA+InterDigital; InterDigital; InterDigital; Inria, Univ Rennes, CNRS, IRISA", @@ -29222,14 +31202,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Engin_On_the_Hidden_Treasure_of_Dialog_in_Video_Question_Answering_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;1;0", - "aff_unique_norm": "INRIA;InterDigital", + "aff_unique_norm": "Inria;InterDigital", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://www.interdigital.com", "aff_unique_abbr": "Inria;InterDigital", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;0", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Engin_2021_ICCV,\n \n author = {\n Engin,\n Deniz and Schnitzler,\n Fran\\c{c\n}ois and Duong,\n Ngoc Q. K. and Avrithis,\n Yannis\n},\n title = {\n On the Hidden Treasure of Dialog in Video Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2064-2073\n} \n}" }, { "title": "On the Importance of Distractors for Few-Shot Classification", @@ -29237,7 +31218,8 @@ "status": "Poster", "track": "main", "pid": 2657, - "author": "Rajshekhar Das; Yu-Xiong Wang; Jos\u00e9 M. F. Moura", + "author_site": "Rajshekhar Das; Yu-Xiong Wang; José M. F. Moura", + "author": "Rajshekhar Das; Yu-Xiong Wang; José M. F. Moura", "abstract": "Few-shot classification aims at classifying categories of a novel task by learning from just a few (typically, 1 to 5) labeled examples. 
An effective approach to few-shot classification involves a prior model trained on a large-sample base domain, which is then finetuned over the novel few-shot task to yield generalizable representations. However, task-specific finetuning is prone to overfitting due to the lack of enough training examples. To alleviate this issue, we propose a new finetuning approach based on contrastive learning that reuses unlabelled examples from the base domain in the form of distractors. Unlike the nature of unlabelled data used in prior works, distractors belong to classes that do not overlap with the novel categories. We demonstrate for the first time that the inclusion of such distractors can significantly boost few-shot generalization. Our technical novelty includes a stochastic pairing of examples sharing the same category in the few-shot task and a weighting term that controls the relative influence of task-specific negatives and distractors. An important aspect of our finetuning objective is that it is agnostic to distractor labels and hence applicable to various base domain settings. More precisely, compared to state-of-the-art approaches, our method shows accuracy gains of up to 12% in cross-domain and up to 5% in unsupervised prior-learning settings. 
Our code is available at https://github.com/quantacode/Contrastive-Finetuning.git", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Das_On_the_Importance_of_Distractors_for_Few-Shot_Classification_ICCV_2021_paper.pdf", "aff": "Carnegie Mellon University; University of Illinois at Urbana-Champaign; Carnegie Mellon University", @@ -29253,14 +31235,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Das_On_the_Importance_of_Distractors_for_Few-Shot_Classification_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Carnegie Mellon University;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Carnegie Mellon University;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://illinois.edu", "aff_unique_abbr": "CMU;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Das_2021_ICCV,\n \n author = {\n Das,\n Rajshekhar and Wang,\n Yu-Xiong and Moura,\n Jos\\'e M. F.\n},\n title = {\n On the Importance of Distractors for Few-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9030-9040\n} \n}" }, { "title": "On the Limits of Pseudo Ground Truth in Visual Camera Re-Localisation", @@ -29268,6 +31251,7 @@ "status": "Poster", "track": "main", "pid": 8079, + "author_site": "Eric Brachmann; Martin Humenberger; Carsten Rother; Torsten Sattler", "author": "Eric Brachmann; Martin Humenberger; Carsten Rother; Torsten Sattler", "abstract": "Benchmark datasets that measure camera pose accuracy have driven progress in visual re-localisation research. 
To obtain poses for thousands of images, it is common to use a reference algorithm to generate pseudo ground truth. Popular choices include Structure-from-Motion (SfM) and Simultaneous-Localisation-and-Mapping (SLAM) using additional sensors like depth cameras if available. Re-localisation benchmarks thus measure how well each method replicates the results of the reference algorithm. This begs the question whether the choice of the reference algorithm favours a certain family of re-localisation methods. This paper analyzes two widely used re-localisation datasets and shows that evaluation outcomes indeed vary with the choice of the reference algorithm. We thus question common beliefs in the re-localisation literature, namely that learning-based scene coordinate regression outperforms classical feature-based methods, and that RGB-D- based methods outperform RGB-based methods. We argue that any claims on ranking re-localisation methods should take the type of the reference algorithm, and the similarity of the methods to the reference algorithm, into account.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Brachmann_On_the_Limits_of_Pseudo_Ground_Truth_in_Visual_Camera_ICCV_2021_paper.pdf", @@ -29282,7 +31266,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Brachmann_On_the_Limits_of_Pseudo_Ground_Truth_in_Visual_Camera_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Brachmann_On_the_Limits_of_Pseudo_Ground_Truth_in_Visual_Camera_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Brachmann_2021_ICCV,\n \n author = {\n Brachmann,\n Eric and Humenberger,\n Martin and Rother,\n Carsten and Sattler,\n Torsten\n},\n title = {\n On the Limits of Pseudo Ground Truth in Visual Camera Re-Localisation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n 
pages = {\n 6218-6228\n} \n}" }, { "title": "On the Robustness of Vision Transformers to Adversarial Examples", @@ -29290,6 +31275,7 @@ "status": "Poster", "track": "main", "pid": 8047, + "author_site": "Kaleel Mahmood; Rigel Mahmood; Marten van Dijk", "author": "Kaleel Mahmood; Rigel Mahmood; Marten van Dijk", "abstract": "Recent advances in attention-based networks have shown that Vision Transformers can achieve state-of-the-art or near state-of-the-art results on many image classification tasks. This puts transformers in the unique position of being a promising alternative to traditional convolutional neural networks (CNNs). While CNNs have been carefully studied with respect to adversarial attacks, the same cannot be said of Vision Transformers. In this paper, we study the robustness of Vision Transformers to adversarial examples. Our analyses of transformer security is divided into three parts. First, we test the transformer under standard white-box and black-box attacks. Second, we study the transferability of adversarial examples between CNNs and transformers. We show that adversarial examples do not readily transfer between CNNs and transformers. Based on this finding, we analyze the security of a simple ensemble defense of CNNs and transformers. By creating a new attack, the self-attention blended gradient attack, we show that such an ensemble is not secure under a white-box adversary. However, under a black-box adversary, we show that an ensemble can achieve unprecedented robustness without sacrificing clean accuracy. Our analysis for this work is done using six types of white-box attacks and two types of black-box attacks. 
Our study encompasses multiple Vision Transformers, Big Transfer Models and CNN architectures trained on CIFAR-10, CIFAR-100 and ImageNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mahmood_On_the_Robustness_of_Vision_Transformers_to_Adversarial_Examples_ICCV_2021_paper.pdf", @@ -29313,7 +31299,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Connecticut;Amsterdam", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United States;Netherlands" + "aff_country_unique": "United States;Netherlands", + "bibtex": "@InProceedings{Mahmood_2021_ICCV,\n \n author = {\n Mahmood,\n Kaleel and Mahmood,\n Rigel and van Dijk,\n Marten\n},\n title = {\n On the Robustness of Vision Transformers to Adversarial Examples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7838-7847\n} \n}" }, { "title": "Once Quantization-Aware Training: High Performance Extremely Low-Bit Architecture Search", @@ -29321,6 +31308,7 @@ "status": "Poster", "track": "main", "pid": 3597, + "author_site": "Mingzhu Shen; Feng Liang; Ruihao Gong; Yuhang Li; Chuming Li; Chen Lin; Fengwei Yu; Junjie Yan; Wanli Ouyang", "author": "Mingzhu Shen; Feng Liang; Ruihao Gong; Yuhang Li; Chuming Li; Chen Lin; Fengwei Yu; Junjie Yan; Wanli Ouyang", "abstract": "Quantization Neural Networks (QNN) have attracted a lot of attention due to their high efficiency. To enhance the quantization accuracy, prior works mainly focus on designing advanced quantization algorithms but still fail to achieve satisfactory results under the extremely low-bit case. In this work, we take an architecture perspective to investigate the potential of high-performance QNN. Therefore, we propose to combine Network Architecture Search methods with quantization to enjoy the merits of the two sides. 
However, a naive combination inevitably faces unacceptable time consumption or unstable training problem. To alleviate these problems, we first propose the joint training of architecture and quantization with a shared step size to acquire a large number of quantized models. Then a bit-inheritance scheme is introduced to transfer the quantized models to the lower bit, which further reduces the time cost and meanwhile improves the quantization accuracy. Equipped with this overall framework, dubbed as Once Quantization-Aware Training (OQAT), our searched model family, OQATNets, achieves a new state-of-the-art compared with various architectures under different bit-widths. In particular, OQAT-2bit-M achieves 61.6% ImageNet Top-1 accuracy, outperforming 2-bit counterpart MobileNetV3 by a large margin of 9% with 10% less computation cost. A series of quantization-friendly architectures are identified easily and extensive analysis can be made to summarize the interaction between quantization and neural architectures. 
Codes and models are released at https://github.com/LaVieEnRoseSMZ/OQA", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shen_Once_Quantization-Aware_Training_High_Performance_Extremely_Low-Bit_Architecture_Search_ICCV_2021_paper.pdf", @@ -29337,14 +31325,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shen_Once_Quantization-Aware_Training_High_Performance_Extremely_Low-Bit_Architecture_Search_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1;0;0;2", - "aff_unique_norm": "SenseTime;University of Oxford;University of Sydney", + "aff_unique_norm": "Sensetime;University of Oxford;University of Sydney", "aff_unique_dep": "Research;;", "aff_unique_url": "https://www.sensetime.com/;https://www.ox.ac.uk;https://www.sydney.edu.au", "aff_unique_abbr": "SenseTime;Oxford;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0;0;2", - "aff_country_unique": "China;United Kingdom;Australia" + "aff_country_unique": "China;United Kingdom;Australia", + "bibtex": "@InProceedings{Shen_2021_ICCV,\n \n author = {\n Shen,\n Mingzhu and Liang,\n Feng and Gong,\n Ruihao and Li,\n Yuhang and Li,\n Chuming and Lin,\n Chen and Yu,\n Fengwei and Yan,\n Junjie and Ouyang,\n Wanli\n},\n title = {\n Once Quantization-Aware Training: High Performance Extremely Low-Bit Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5340-5349\n} \n}" }, { "title": "One-Pass Multi-View Clustering for Large-Scale Data", @@ -29352,6 +31341,7 @@ "status": "Poster", "track": "main", "pid": 2185, + "author_site": "Jiyuan Liu; Xinwang Liu; Yuexiang Yang; Li Liu; Siqi Wang; Weixuan Liang; Jiangyong Shi", "author": "Jiyuan Liu; Xinwang Liu; Yuexiang Yang; Li Liu; Siqi Wang; Weixuan Liang; Jiangyong Shi", "abstract": "Existing non-negative matrix factorization based multi-view 
clustering algorithms compute multiple coefficient matrices respect to different data views, and learn a common consensus concurrently. The final partition is always obtained from the consensus with classical clustering techniques, such as k-means. However, the non-negativity constraint prevents from obtaining a more discriminative embedding. Meanwhile, this two-step procedure fails to unify multi-view matrix factorization with partition generation closely, resulting in unpromising performance. Therefore, we propose an one-pass multi-view clustering algorithm by removing the non-negativity constraint and jointly optimize the aforementioned two steps. In this way, the generated partition can guide multi-view matrix factorization to produce more purposive coefficient matrix which, as a feedback, improves the quality of partition. To solve the resultant optimization problem, we design an alternate strategy which is guaranteed to be convergent theoretically. Moreover, the proposed algorithm is free of parameter and of linear complexity, making it practical in applications. 
In addition, the proposed algorithm is compared with recent advances in literature on benchmarks, demonstrating its effectiveness, superiority and efficiency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_One-Pass_Multi-View_Clustering_for_Large-Scale_Data_ICCV_2021_paper.pdf", @@ -29375,7 +31365,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Jiyuan and Liu,\n Xinwang and Yang,\n Yuexiang and Liu,\n Li and Wang,\n Siqi and Liang,\n Weixuan and Shi,\n Jiangyong\n},\n title = {\n One-Pass Multi-View Clustering for Large-Scale Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12344-12353\n} \n}" }, { "title": "Online Continual Learning With Natural Distribution Shifts: An Empirical Study With Visual Data", @@ -29383,6 +31374,7 @@ "status": "Poster", "track": "main", "pid": 3390, + "author_site": "Zhipeng Cai; Ozan Sener; Vladlen Koltun", "author": "Zhipeng Cai; Ozan Sener; Vladlen Koltun", "abstract": "Continual learning is the problem of learning and retaining knowledge through time over multiple tasks and environments. Research has primarily focused on the incremental classification setting, where new tasks/classes are added at discrete time intervals. Such an \"offline\" setting does not evaluate the ability of agents to learn effectively and efficiently, since an agent can perform multiple learning epochs without any time limitation when a task is added. We argue that \"online\" continual learning, where data is a single continuous stream without task boundaries, enables evaluating both information retention and online learning efficacy. 
In online continual learning, each incoming small batch of data is first used for testing and then added to the training set, making the problem truly online. Trained models are later evaluated on historical data to assess information retention. We introduce a new benchmark for online continual visual learning that exhibits large scale and natural distribution shifts. Through a large-scale analysis, we identify critical and previously unobserved phenomena of gradient-based optimization in continual learning, and propose effective strategies for improving gradient-based online continual learning with real data. The source code and dataset are available in: https://github.com/ IntelLabs/continuallearning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cai_Online_Continual_Learning_With_Natural_Distribution_Shifts_An_Empirical_Study_ICCV_2021_paper.pdf", @@ -29397,7 +31389,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cai_Online_Continual_Learning_With_Natural_Distribution_Shifts_An_Empirical_Study_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cai_Online_Continual_Learning_With_Natural_Distribution_Shifts_An_Empirical_Study_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Cai_2021_ICCV,\n \n author = {\n Cai,\n Zhipeng and Sener,\n Ozan and Koltun,\n Vladlen\n},\n title = {\n Online Continual Learning With Natural Distribution Shifts: An Empirical Study With Visual Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8281-8290\n} \n}" }, { "title": "Online Knowledge Distillation for Efficient Pose Estimation", @@ -29405,6 +31398,7 @@ "status": "Poster", "track": "main", "pid": 9595, + "author_site": "Zheng Li; Jingwen Ye; Mingli Song; Ying Huang; Zhigeng Pan", "author": "Zheng Li; Jingwen Ye; Mingli Song; Ying Huang; Zhigeng 
Pan", "abstract": "Existing state-of-the-art human pose estimation methods require heavy computational resources for accurate predictions. One promising technique to obtain an accurate yet lightweight pose estimator is knowledge distillation, which distills the pose knowledge from a powerful teacher model to a less-parameterized student model. However, existing pose distillation works rely on a heavy pre-trained estimator to perform knowledge transfer and require a complex two-stage learning procedure. In this work, we investigate a novel Online Knowledge Distillation framework by distilling Human Pose structure knowledge in a one-stage manner to guarantee the distillation efficiency, termed OKDHP. Specifically, OKDHP trains a single multi-branch network and acquires the predicted heatmaps from each, which are then assembled by a Feature Aggregation Unit (FAU) as the target heatmaps to teach each branch in reverse. Instead of simply averaging the heatmaps, FAU which consists of multiple parallel transformations with different receptive fields, leverages the multi-scale information, thus obtains target heatmaps with higher-quality. Specifically, the pixel-wise Kullback-Leibler (KL) divergence is utilized to minimize the discrepancy between the target heatmaps and the predicted ones, which enables the student network to learn the implicit keypoint relationship. Besides, an unbalanced OKDHP scheme is introduced to customize the student networks with different compression rates. 
The effectiveness of our approach is demonstrated by extensive experiments on two common benchmark datasets, MPII and COCO.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Online_Knowledge_Distillation_for_Efficient_Pose_Estimation_ICCV_2021_paper.pdf", @@ -29428,7 +31422,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Zheng and Ye,\n Jingwen and Song,\n Mingli and Huang,\n Ying and Pan,\n Zhigeng\n},\n title = {\n Online Knowledge Distillation for Efficient Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11740-11750\n} \n}" }, { "title": "Online Multi-Granularity Distillation for GAN Compression", @@ -29436,6 +31431,7 @@ "status": "Poster", "track": "main", "pid": 8240, + "author_site": "Yuxi Ren; Jie Wu; Xuefeng Xiao; Jianchao Yang", "author": "Yuxi Ren; Jie Wu; Xuefeng Xiao; Jianchao Yang", "abstract": "Generative Adversarial Networks (GANs) have witnessed prevailing success in yielding outstanding images, however, they are burdensome to deploy on resource-constrained devices due to ponderous computational costs and hulking memory usage. Although recent efforts on compressing GANs have acquired remarkable results, they still exist potential model redundancies and can be further compressed. To solve this issue, we propose a novel online multi-granularity distillation (OMGD) scheme to obtain lightweight GANs, which contributes to generating high-fidelity images with low computational demands. We offer the first attempt to popularize single-stage online distillation for GAN-oriented compression, where the progressively promoted teacher generator helps to refine the discriminator-free based student generator. 
Complementary teacher generators and network layers provide comprehensive and multi-granularity concepts to enhance visual fidelity from diverse dimensions. Experimental results on four benchmark datasets demonstrate that OMGD successes to compress 40xMACs and 82.5xparameters on Pix2Pix and CycleGAN, without loss of image quality. It reveals that OMGD provides a feasible solution for the deployment of real-time image translation on resource-constrained devices. Our code and models are made public at: https://github.com/bytedance/OMGD", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ren_Online_Multi-Granularity_Distillation_for_GAN_Compression_ICCV_2021_paper.pdf", @@ -29459,7 +31455,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ren_2021_ICCV,\n \n author = {\n Ren,\n Yuxi and Wu,\n Jie and Xiao,\n Xuefeng and Yang,\n Jianchao\n},\n title = {\n Online Multi-Granularity Distillation for GAN Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6793-6803\n} \n}" }, { "title": "Online Pseudo Label Generation by Hierarchical Cluster Dynamics for Adaptive Person Re-Identification", @@ -29467,6 +31464,7 @@ "status": "Poster", "track": "main", "pid": 2775, + "author_site": "Yi Zheng; Shixiang Tang; Guolong Teng; Yixiao Ge; Kaijian Liu; Jing Qin; Donglian Qi; Dapeng Chen", "author": "Yi Zheng; Shixiang Tang; Guolong Teng; Yixiao Ge; Kaijian Liu; Jing Qin; Donglian Qi; Dapeng Chen", "abstract": "Adaptive person re-identification (adaptive ReID) targets at transferring learned knowledge from the labeled source domain to the unlabeled target domain. 
Pseudo-label-based methods that alternatively generate pseudo labels and optimize the training model have demonstrated great effectiveness in this field. However, the generated pseudo labels are inaccurate and cannot reflect the true semantic meaning of the unlabeled samples. We consider such inaccuracy stems from both the lagged update of the pseudo labels as well as the simple criterion of the employed clustering method. To tackle the problem, we propose an online pseudo label generation by hierarchical cluster dynamics for adaptive ReID. In particular, hierarchical label banks are constructed for all the samples in the dataset, and we update the pseudo labels of the sample in each coming mini-batch, performing the model optimization and the label generation simultaneously. A new hierarchical cluster dynamics is built for the label update, where cluster merge and cluster split are driven by a possibility computed by the label propagation. Our method can achieve better pseudo labels and higher reid accuracy. 
Extensive experiments on Market-to-Duke, Duke-to-Market, MSMT-to-Market, MSMT-to-Duke, Market-to-MSMT, and Duke-to-MSMT verify the effectiveness of our proposed method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Online_Pseudo_Label_Generation_by_Hierarchical_Cluster_Dynamics_for_Adaptive_ICCV_2021_paper.pdf", @@ -29483,14 +31481,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Online_Pseudo_Label_Generation_by_Hierarchical_Cluster_Dynamics_for_Adaptive_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;2;0;3;0;4", - "aff_unique_norm": "Zhejiang University;University of Sydney;Chinese University of Hong Kong;Hong Kong Polytechnic University;Huawei", - "aff_unique_dep": ";;;;Huawei", + "aff_unique_norm": "Zhejiang University;University of Sydney;The Chinese University of Hong Kong;The Hong Kong Polytechnic University;Huawei", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.sydney.edu.au;https://www.cuhk.edu.hk;https://www.polyu.edu.hk;https://www.huawei.com", "aff_unique_abbr": "ZJU;USYD;CUHK;PolyU;Huawei", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Yi and Tang,\n Shixiang and Teng,\n Guolong and Ge,\n Yixiao and Liu,\n Kaijian and Qin,\n Jing and Qi,\n Donglian and Chen,\n Dapeng\n},\n title = {\n Online Pseudo Label Generation by Hierarchical Cluster Dynamics for Adaptive Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8371-8381\n} \n}" }, { "title": "Online Refinement of Low-Level Feature Based Activation Map for Weakly Supervised Object Localization", @@ -29498,6 +31497,7 @@ "status": "Poster", 
"track": "main", "pid": 6804, + "author_site": "Jinheng Xie; Cheng Luo; Xiangping Zhu; Ziqi Jin; Weizeng Lu; Linlin Shen", "author": "Jinheng Xie; Cheng Luo; Xiangping Zhu; Ziqi Jin; Weizeng Lu; Linlin Shen", "abstract": "We present a two-stage learning framework for weakly supervised object localization (WSOL). While most previous efforts rely on high-level feature based CAMs (Class Activation Maps), this paper proposes to localize objects using the low-level feature based activation maps. In the first stage, an activation map generator produces activation maps based on the low-level feature maps in the classifier, such that rich contextual object information is included in an online manner. In the second stage, we employ an evaluator to evaluate the activation maps predicted by the activation map generator. Based on this, we further propose a weighted entropy loss, an attentive erasing, and an area loss to drive the activation map generator to substantially reduce the uncertainty of activations between object and background, and explore less discriminative regions. Based on the low-level object information preserved in the first stage, the second stage model gradually generates a well-separated, complete, and compact activation map of object in the image, which can be easily thresholded for accurate localization. Extensive experiments on CUB-200-2011 and ImageNet-1K datasets show that our framework surpasses previous methods by a large margin, which sets a new state-of-the-art for WSOL. 
Code will be available soon.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_Online_Refinement_of_Low-Level_Feature_Based_Activation_Map_for_Weakly_ICCV_2021_paper.pdf", @@ -29521,7 +31521,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Jinheng and Luo,\n Cheng and Zhu,\n Xiangping and Jin,\n Ziqi and Lu,\n Weizeng and Shen,\n Linlin\n},\n title = {\n Online Refinement of Low-Level Feature Based Activation Map for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 132-141\n} \n}" }, { "title": "Online-Trained Upsampler for Deep Low Complexity Video Compression", @@ -29529,6 +31530,7 @@ "status": "Poster", "track": "main", "pid": 5403, + "author_site": "Jan P. Klopp; Keng-Chi Liu; Shao-Yi Chien; Liang-Gee Chen", "author": "Jan P. Klopp; Keng-Chi Liu; Shao-Yi Chien; Liang-Gee Chen", "abstract": "Deep learning for image and video compression has demonstrated promising results both as a standalone technology and a hybrid combination with existing codecs. However, these systems still come with high computational costs. Deep learning models are typically applied directly in pixel space, making them expensive when resolutions become large. In this work, we propose an online-trained upsampler to augment an existing codec. The upsampler is a small neural network trained on an isolated group of frames. Its parameters are signalled to the decoder. This hybrid solution has a small scope of only 10s or 100s of frames and allows for a low complexity both on the encoding and the decoding side. Our algorithm works in offline and in zero-latency settings. 
Our evaluation employs the popular x265 codec on several high-resolution datasets ranging from Full HD to 8K. We demonstrate rate savings between 8.6% and 27.5% and provide ablation studies to show the impact of our design decisions. In comparison to similar works, our approach performs favourably.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Klopp_Online-Trained_Upsampler_for_Deep_Low_Complexity_Video_Compression_ICCV_2021_paper.pdf", @@ -29552,7 +31554,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Klopp_2021_ICCV,\n \n author = {\n Klopp,\n Jan P. and Liu,\n Keng-Chi and Chien,\n Shao-Yi and Chen,\n Liang-Gee\n},\n title = {\n Online-Trained Upsampler for Deep Low Complexity Video Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7929-7938\n} \n}" }, { "title": "OpenForensics: Large-Scale Challenging Dataset for Multi-Face Forgery Detection and Segmentation In-the-Wild", @@ -29560,6 +31563,7 @@ "status": "Poster", "track": "main", "pid": 2285, + "author_site": "Trung-Nghia Le; Huy H. Nguyen; Junichi Yamagishi; Isao Echizen", "author": "Trung-Nghia Le; Huy H. Nguyen; Junichi Yamagishi; Isao Echizen", "abstract": "The proliferation of deepfake media is raising concerns among the public and relevant authorities. It has become essential to develop countermeasures against forged faces in social media. This paper presents a comprehensive study on two new countermeasure tasks: multi-face forgery detection and segmentation in-the-wild. Localizing forged faces among multiple human faces in unrestricted natural scenes is far more challenging than the traditional deepfake recognition task. 
To promote these new tasks, we have created the first large-scale dataset posing a high level of challenges that is designed with face-wise rich annotations explicitly for face forgery detection and segmentation, namely OpenForensics. With its rich annotations, our OpenForensics dataset has great potentials for research in both deepfake prevention and general human face detection. We have also developed a suite of benchmarks for these tasks by conducting an extensive evaluation of state-of-the-art instance detection and segmentation methods on our newly constructed dataset in various scenarios.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Le_OpenForensics_Large-Scale_Challenging_Dataset_for_Multi-Face_Forgery_Detection_and_Segmentation_ICCV_2021_paper.pdf", @@ -29574,7 +31578,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Le_OpenForensics_Large-Scale_Challenging_Dataset_for_Multi-Face_Forgery_Detection_and_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Le_OpenForensics_Large-Scale_Challenging_Dataset_for_Multi-Face_Forgery_Detection_and_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Le_2021_ICCV,\n \n author = {\n Le,\n Trung-Nghia and Nguyen,\n Huy H. 
and Yamagishi,\n Junichi and Echizen,\n Isao\n},\n title = {\n OpenForensics: Large-Scale Challenging Dataset for Multi-Face Forgery Detection and Segmentation In-the-Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10117-10127\n} \n}" }, { "title": "OpenGAN: Open-Set Recognition via Open Data Generation", @@ -29582,6 +31587,7 @@ "status": "Poster", "track": "main", "pid": 3014, + "author_site": "Shu Kong; Deva Ramanan", "author": "Shu Kong; Deva Ramanan", "abstract": "Real-world machine learning systems need to analyze novel testing data that differs from the training data. In K-way classification, this is crisply formulated as open-set recognition, core to which is the ability to discriminate open-set data outside the K closed-set classes. Two conceptually elegant ideas for open-set discrimination are: 1) discriminatively learning an open-vs-closed binary discriminator by exploiting some outlier data as the open-set, and 2) unsupervised learning the closed-set data distribution with a GAN and using its discriminator as the open-set likelihood function. However, the former generalizes poorly to diverse open test data due to overfitting to the training outliers, which unlikely exhaustively span the open-world. The latter does not work well, presumably due to the instable training of GANs. Motivated by the above, we propose OpenGAN, which addresses the limitation of each approach by combining them with several technical insights. First, we show that a carefully selected GAN-discriminator on some real outlier data already achieves the state-of-the-art. Second, we augment the available set of real open training examples with adversarially synthesized \"\"fake\"\" data. Third and most importantly, we build the discriminator over the features computed by the closed-world K-way networks. 
Extensive experiments show that OpenGAN significantly outperforms prior open-set methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kong_OpenGAN_Open-Set_Recognition_via_Open_Data_Generation_ICCV_2021_paper.pdf", @@ -29605,7 +31611,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kong_2021_ICCV,\n \n author = {\n Kong,\n Shu and Ramanan,\n Deva\n},\n title = {\n OpenGAN: Open-Set Recognition via Open Data Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 813-822\n} \n}" }, { "title": "Oriented R-CNN for Object Detection", @@ -29613,10 +31620,11 @@ "status": "Poster", "track": "main", "pid": 6580, + "author_site": "Xingxing Xie; Gong Cheng; Jiabao Wang; Xiwen Yao; Junwei Han", "author": "Xingxing Xie; Gong Cheng; Jiabao Wang; Xiwen Yao; Junwei Han", "abstract": "Current state-of-the-art two-stage detectors generate oriented proposals through time-consuming schemes. This diminishes the detectors' speed, thereby becoming the computational bottleneck in advanced oriented object detection systems. This work proposes an effective and simple oriented object detection framework, termed Oriented R-CNN, which is a general two-stage oriented detector with promising accuracy and efficiency. To be specific, in the first stage, we propose an oriented Region Proposal Network (oriented RPN) that directly generates high-quality oriented proposals in a nearly cost-free manner. The second stage is oriented R-CNN head for refining oriented Regions of Interest (oriented RoIs) and recognizing them. 
Without tricks, oriented R-CNN with ResNet50 achieves state-of-the-art detection accuracy on two commonly-used datasets for oriented object detection including DOTA (75.87% mAP) and HRSC2016 (96.50% mAP), while having a speed of 15.1 FPS with the image size of 1024x1024 on a single RTX 2080Ti. We hope our work could inspire rethinking the design of oriented detectors and serve as a baseline for oriented object detection. Code is available at https: //github.com/jbwang1997/OBBDetection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_Oriented_R-CNN_for_Object_Detection_ICCV_2021_paper.pdf", - "aff": "School of Automation, Northwestern Ploytechnical University, Xi\u2019an, China; School of Automation, Northwestern Ploytechnical University, Xi\u2019an, China; School of Automation, Northwestern Ploytechnical University, Xi\u2019an, China; School of Automation, Northwestern Ploytechnical University, Xi\u2019an, China; School of Automation, Northwestern Ploytechnical University, Xi\u2019an, China", + "aff": "School of Automation, Northwestern Ploytechnical University, Xi’an, China; School of Automation, Northwestern Ploytechnical University, Xi’an, China; School of Automation, Northwestern Ploytechnical University, Xi’an, China; School of Automation, Northwestern Ploytechnical University, Xi’an, China; School of Automation, Northwestern Ploytechnical University, Xi’an, China", "project": "", "github": "https://github.com/jbwang1997/OBBDetection", "supp": "", @@ -29632,11 +31640,12 @@ "aff_unique_norm": "Northwestern Polytechnical University", "aff_unique_dep": "School of Automation", "aff_unique_url": "http://www.nwpu.edu.cn", - "aff_unique_abbr": "NWPU", + "aff_unique_abbr": "NPU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Xingxing and Cheng,\n Gong 
and Wang,\n Jiabao and Yao,\n Xiwen and Han,\n Junwei\n},\n title = {\n Oriented R-CNN for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3520-3529\n} \n}" }, { "title": "Orthogonal Jacobian Regularization for Unsupervised Disentanglement in Image Generation", @@ -29644,6 +31653,7 @@ "status": "Poster", "track": "main", "pid": 6321, + "author_site": "Yuxiang Wei; Yupeng Shi; Xiao Liu; Zhilong Ji; Yuan Gao; Zhongqin Wu; Wangmeng Zuo", "author": "Yuxiang Wei; Yupeng Shi; Xiao Liu; Zhilong Ji; Yuan Gao; Zhongqin Wu; Wangmeng Zuo", "abstract": "Unsupervised disentanglement learning is a crucial issue for understanding and exploiting deep generative models. Recently, SeFa tries to find latent disentangled directions by performing SVD on the first projection of a pre-trained GAN. However, it is only applied to the first layer and works in a post-processing way. Hessian Penalty minimizes the off-diagonal entries of the output's Hessian matrix to facilitate disentanglement, and can be applied to multi-layers. However, it constrains each entry of output independently, making it not sufficient in disentangling the latent directions (e.g., shape, size, rotation, etc.) of spatially correlated variations. In this paper, we propose a simple Orthogonal Jacobian Regularization (OroJaR) to encourage deep generative model to learn disentangled representations. It simply encourages the variation of output caused by perturbations on different latent dimensions to be orthogonal, and the Jacobian with respect to the input is calculated to represent this variation. We show that our OroJaR also encourages the output's Hessian matrix to be diagonal in an indirect manner. 
In contrast to the Hessian Penalty, our OroJaR constrains the output in a holistic way, making it very effective in disentangling latent dimensions corresponding to spatially correlated variations. Quantitative and qualitative experimental results show that our method is effective in disentangled and controllable image generation, and performs favorably against the state-of-the-art methods. Our code is available at https://github.com/csyxwei/OroJaR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wei_Orthogonal_Jacobian_Regularization_for_Unsupervised_Disentanglement_in_Image_Generation_ICCV_2021_paper.pdf", @@ -29667,7 +31677,8 @@ "aff_campus_unique_index": "0;0;0+2", "aff_campus_unique": "Harbin;;Guangzhou", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wei_2021_ICCV,\n \n author = {\n Wei,\n Yuxiang and Shi,\n Yupeng and Liu,\n Xiao and Ji,\n Zhilong and Gao,\n Yuan and Wu,\n Zhongqin and Zuo,\n Wangmeng\n},\n title = {\n Orthogonal Jacobian Regularization for Unsupervised Disentanglement in Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6721-6730\n} \n}" }, { "title": "Orthogonal Projection Loss", @@ -29675,10 +31686,11 @@ "status": "Poster", "track": "main", "pid": 2044, + "author_site": "Kanchana Ranasinghe; Muzammal Naseer; Munawar Hayat; Salman Khan; Fahad Shahbaz Khan", "author": "Kanchana Ranasinghe; Muzammal Naseer; Munawar Hayat; Salman Khan; Fahad Shahbaz Khan", "abstract": "Deep neural networks have achieved remarkable performance on a range of classification tasks, with softmax cross-entropy (CE) loss emerging as the de-facto objective function. The CE loss encourages features of a class to have a higher projection score on the true class-vector compared to the negative classes. 
However, this is a relative constraint and does not explicitly force different class features to be well-separated. Motivated by the observation that ground-truth class representations in CE loss are orthogonal (one-hot encoded vectors), we develop a novel loss function termed `Orthogonal Projection Loss' (OPL) which imposes orthogonality in the feature space. OPL augments the properties of CE loss and directly enforces inter-class separation alongside intra-class clustering in the feature space through orthogonality constraints on the mini-batch level. As compared to other alternatives of CE, OPL offers unique advantages e.g., no additional learnable parameters, does not require careful negative mining and is not sensitive to the batch size. Given the plug-and-play nature of OPL, we evaluate it on a diverse range of tasks including image recognition (CIFAR-100), large-scale classification (ImageNet), domain generalization (PACS) and few-shot learning (miniImageNet, CIFAR-FS, tiered-ImageNet and Meta-dataset) and demonstrate its effectiveness across the board. Furthermore, OPL offers better robustness against practical nuisances such as adversarial attacks and label noise. 
Our code will be publicly released.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ranasinghe_Orthogonal_Projection_Loss_ICCV_2021_paper.pdf", - "aff": "Mohamed bin Zayed University of AI, UAE+Stony Brook University, USA; Mohamed bin Zayed University of AI, UAE+Australian National University, Australia; Monash University, Australia; Mohamed bin Zayed University of AI, UAE+Australian National University, Australia; Mohamed bin Zayed University of AI, UAE+Link \u00a8oping University, Sweden", + "aff": "Mohamed bin Zayed University of AI, UAE+Stony Brook University, USA; Mohamed bin Zayed University of AI, UAE+Australian National University, Australia; Monash University, Australia; Mohamed bin Zayed University of AI, UAE+Australian National University, Australia; Mohamed bin Zayed University of AI, UAE+Link ¨oping University, Sweden", "project": "", "github": "https://github.com/kahnchana/opl", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ranasinghe_Orthogonal_Projection_Loss_ICCV_2021_supplemental.pdf", @@ -29691,14 +31703,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ranasinghe_Orthogonal_Projection_Loss_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+2;3;0+2;0+4", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Stony Brook University;Australian National University;Monash University;Link\u00f6ping University", + "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Stony Brook University;Australian National University;Monash University;Linköping University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://mbzuai.ac.ae;https://www.stonybrook.edu;https://www.anu.edu.au;https://www.monash.edu;https://www.liu.se", "aff_unique_abbr": "MBZUAI;SBU;ANU;Monash;LiU", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+2;2;0+2;0+3", - "aff_country_unique": "United Arab Emirates;United 
States;Australia;Sweden" + "aff_country_unique": "United Arab Emirates;United States;Australia;Sweden", + "bibtex": "@InProceedings{Ranasinghe_2021_ICCV,\n \n author = {\n Ranasinghe,\n Kanchana and Naseer,\n Muzammal and Hayat,\n Munawar and Khan,\n Salman and Khan,\n Fahad Shahbaz\n},\n title = {\n Orthogonal Projection Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12333-12343\n} \n}" }, { "title": "Orthographic-Perspective Epipolar Geometry", @@ -29706,6 +31719,7 @@ "status": "Poster", "track": "main", "pid": 9055, + "author_site": "Viktor Larsson; Marc Pollefeys; Magnus Oskarsson", "author": "Viktor Larsson; Marc Pollefeys; Magnus Oskarsson", "abstract": "In this paper we consider the epipolar geometry between orthographic and perspective cameras. We generalize many of the classical results for the perspective essential matrix to this setting and derive novel minimal solvers, not only for the calibrated case, but also for partially calibrated and non-central camera setups. While orthographic cameras might seem exotic, they occur naturally in many applications. They can e.g. model 2D maps (such as floor plans), aerial/satellite photography and even approximate narrow field-of-view cameras (e.g. from telephoto lenses). 
In our experiments we highlight various applications of the developed theory and solvers, including Radar-Camera calibration and aligning Structure-from-Motion models to aerial or satellite images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Larsson_Orthographic-Perspective_Epipolar_Geometry_ICCV_2021_paper.pdf", @@ -29720,7 +31734,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Larsson_Orthographic-Perspective_Epipolar_Geometry_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Larsson_Orthographic-Perspective_Epipolar_Geometry_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Larsson_2021_ICCV,\n \n author = {\n Larsson,\n Viktor and Pollefeys,\n Marc and Oskarsson,\n Magnus\n},\n title = {\n Orthographic-Perspective Epipolar Geometry\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5570-5578\n} \n}" }, { "title": "Out-of-Boundary View Synthesis Towards Full-Frame Video Stabilization", @@ -29728,6 +31743,7 @@ "status": "Poster", "track": "main", "pid": 3748, + "author_site": "Yufei Xu; Jing Zhang; Dacheng Tao", "author": "Yufei Xu; Jing Zhang; Dacheng Tao", "abstract": "Warping-based video stabilizers smooth camera trajectory by constraining each pixel's displacement and warp stabilized frames from unstable ones accordingly. However, since the view outside the boundary is not available during warping, the resulting holes around the boundary of the stabilized frame must be discarded (i.e., cropping) to maintain visual consistency, and thus does leads to a tradeoff between stability and cropping ratio. In this paper, we make a first attempt to address this issue by proposing a new Out-of-boundary View Synthesis (OVS) method. 
By the nature of spatial coherence between adjacent frames and within each frame, OVS extrapolates the out-of-boundary view by aligning adjacent frames to each reference one. Technically, it first calculates the optical flow and propagates it to the outer boundary region according to the affinity, and then warps pixels accordingly. OVS can be integrated into existing warping-based stabilizers as a plug-and-play pre-processing module to significantly improve the cropping ratio of the stabilized results. In addition, stability is improved because the jitter amplification effect caused by cropping and resizing is reduced. Experimental results on the NUS benchmark show that OVS can improve the performance of five representative state-of-the-art methods in terms of objective metrics and subjective visual quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Out-of-Boundary_View_Synthesis_Towards_Full-Frame_Video_Stabilization_ICCV_2021_paper.pdf", @@ -29742,7 +31758,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Out-of-Boundary_View_Synthesis_Towards_Full-Frame_Video_Stabilization_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Out-of-Boundary_View_Synthesis_Towards_Full-Frame_Video_Stabilization_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Yufei and Zhang,\n Jing and Tao,\n Dacheng\n},\n title = {\n Out-of-Boundary View Synthesis Towards Full-Frame Video Stabilization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4842-4851\n} \n}" }, { "title": "Out-of-Core Surface Reconstruction via Global TGV Minimization", @@ -29750,6 +31767,7 @@ "status": "Poster", "track": "main", "pid": 1218, + "author_site": "Nikolai Poliarnyi", "author": "Nikolai Poliarnyi", "abstract": "We present an 
out-of-core variational approach for surface reconstruction from a set of aligned depth maps. Input depth maps are supposed to be reconstructed from regular photos or/and can be a representation of terrestrial LIDAR point clouds. Our approach is based on surface reconstruction via total generalized variation minimization (TGV) because of its strong visibility-based noise-filtering properties and GPU-friendliness. Our main contribution is an out-of-core OpenCL-accelerated adaptation of this numerical algorithm which can handle arbitrarily large real-world scenes with scale diversity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Poliarnyi_Out-of-Core_Surface_Reconstruction_via_Global_TGV_Minimization_ICCV_2021_paper.pdf", @@ -29771,7 +31789,8 @@ "aff_unique_url": "", "aff_unique_abbr": "", "aff_country_unique_index": "0", - "aff_country_unique": "Russian Federation" + "aff_country_unique": "Russia", + "bibtex": "@InProceedings{Poliarnyi_2021_ICCV,\n \n author = {\n Poliarnyi,\n Nikolai\n},\n title = {\n Out-of-Core Surface Reconstruction via Global TGV Minimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5641-5650\n} \n}" }, { "title": "Overfitting the Data: Compact Neural Video Delivery via Content-Aware Feature Modulation", @@ -29779,6 +31798,7 @@ "status": "Poster", "track": "main", "pid": 3163, + "author_site": "Jiaming Liu; Ming Lu; Kaixin Chen; Xiaoqi Li; Shizun Wang; Zhaoqing Wang; Enhua Wu; Yurong Chen; Chuang Zhang; Ming Wu", "author": "Jiaming Liu; Ming Lu; Kaixin Chen; Xiaoqi Li; Shizun Wang; Zhaoqing Wang; Enhua Wu; Yurong Chen; Chuang Zhang; Ming Wu", "abstract": "Internet video delivery has undergone a tremendous explosion of growth over the past few years. However, the quality of video delivery system greatly depends on the Internet bandwidth. 
Deep Neural Networks (DNNs) are utilized to improve the quality of video delivery recently. These methods divide a video into chunks, and stream LR video chunks and corresponding content-aware models to the client. The client runs the inference of models to super-resolve the LR chunks. Consequently, a large number of models are streamed in order to deliver a video. In this paper, we first carefully study the relation between models of different chunks, then we tactfully design a joint training framework along with the Content-aware Feature Modulation (CaFM) layer to compress these models for neural video delivery. With our method, each video chunk only requires less than 1% of original parameters to be streamed, achieving even better SR performance. We conduct extensive experiments across various SR backbones, video time length, and scaling factors to demonstrate the advantages of our method. Besides, our method can be also viewed as a new approach of video coding. Our primary experiments achieve better video quality compared with the commercial H.264 and H.265 standard under the same storage cost, showing the great potential of the proposed method. 
Code is available at: https://github.com/Neural-video-delivery/CaFM-Pytorch-ICCV2021", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Overfitting_the_Data_Compact_Neural_Video_Delivery_via_Content-Aware_Feature_ICCV_2021_paper.pdf", @@ -29795,14 +31815,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Overfitting_the_Data_Compact_Neural_Video_Delivery_via_Content-Aware_Feature_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;0;0;2;1;0;0", - "aff_unique_norm": "Beijing University of Posts and Telecommunications;Intel;University of Macau", + "aff_unique_norm": "Beijing University of Posts and Telecommunications;Intel Corporation;University of Macau", "aff_unique_dep": ";Intel Labs;State Key Lab of Computer Science", "aff_unique_url": "http://www.bupt.edu.cn/;https://www.intel.cn;https://www.um.edu.mo", "aff_unique_abbr": "BUPT;Intel;UM", "aff_campus_unique_index": "0;0;0;0;0;2;0;0", "aff_campus_unique": "Beijing;;Macau SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Jiaming and Lu,\n Ming and Chen,\n Kaixin and Li,\n Xiaoqi and Wang,\n Shizun and Wang,\n Zhaoqing and Wu,\n Enhua and Chen,\n Yurong and Zhang,\n Chuang and Wu,\n Ming\n},\n title = {\n Overfitting the Data: Compact Neural Video Delivery via Content-Aware Feature Modulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4631-4640\n} \n}" }, { "title": "P2-Net: Joint Description and Detection of Local Features for Pixel and Point Matching", @@ -29810,6 +31831,7 @@ "status": "Poster", "track": "main", "pid": 4191, + "author_site": "Bing Wang; Changhao Chen; Zhaopeng Cui; Jie Qin; Chris Xiaoxuan Lu; Zhengdi Yu; Peijun Zhao; Zhen Dong; Fan Zhu; Niki Trigoni; Andrew Markham", "author": "Bing 
Wang; Changhao Chen; Zhaopeng Cui; Jie Qin; Chris Xiaoxuan Lu; Zhengdi Yu; Peijun Zhao; Zhen Dong; Fan Zhu; Niki Trigoni; Andrew Markham", "abstract": "Accurately describing and detecting 2D and 3D keypoints is crucial to establishing correspondences across images and point clouds. Despite a plethora of learning-based 2D or 3D local feature descriptors and detectors having been proposed, the derivation of a shared descriptor and joint keypoint detector that directly matches pixels and points remains under-explored by the community. This work takes the initiative to establish fine-grained correspondences between 2D images and 3D point clouds. In order to directly match pixels and points, a dual fully convolutional framework is presented that maps 2D and 3D inputs into a shared latent representation space to simultaneously describe and detect keypoints. Furthermore, an ultra-wide reception mechanism and a novel loss function are designed to mitigate the intrinsic information variations between pixel and point local regions. Extensive experimental results demonstrate that our framework shows competitive performance in fine-grained matching between images and point clouds and achieves state-of-the-art results for the task of indoor visual localization. 
Our source code will be available at [no-name-for-blind-review].", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_P2-Net_Joint_Description_and_Detection_of_Local_Features_for_Pixel_ICCV_2021_paper.pdf", @@ -29824,7 +31846,8 @@ "aff_domain": ";;;;;;;;;;", "email": ";;;;;;;;;;", "author_num": 11, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_P2-Net_Joint_Description_and_Detection_of_Local_Features_for_Pixel_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_P2-Net_Joint_Description_and_Detection_of_Local_Features_for_Pixel_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Bing and Chen,\n Changhao and Cui,\n Zhaopeng and Qin,\n Jie and Lu,\n Chris Xiaoxuan and Yu,\n Zhengdi and Zhao,\n Peijun and Dong,\n Zhen and Zhu,\n Fan and Trigoni,\n Niki and Markham,\n Andrew\n},\n title = {\n P2-Net: Joint Description and Detection of Local Features for Pixel and Point Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16004-16013\n} \n}" }, { "title": "PARE: Part Attention Regressor for 3D Human Body Estimation", @@ -29832,10 +31855,11 @@ "status": "Poster", "track": "main", "pid": 1275, + "author_site": "Muhammed Kocabas; Chun-Hao P. Huang; Otmar Hilliges; Michael J. Black", "author": "Muhammed Kocabas; Chun-Hao P. Huang; Otmar Hilliges; Michael J. Black", "abstract": "Despite significant progress, we show that state of the art 3D human pose and shape estimation methods remain sensitive to partial occlusion and can produce dramatically wrong predictions although much of the body is observable. To address this, we introduce a soft attention mechanism, called the Part Attention REgressor (PARE), that learns to predict body-part-guided attention masks. 
We observe that state-of-the-art methods rely on global feature representations, making them sensitive to even small occlusions. In contrast, PARE's part-guided attention mechanism overcomes these issues by exploiting information about the visibility of individual body parts while leveraging information from neighboring body-parts to predict occluded parts. We show qualitatively that PARE learns sensible attention masks, and quantitative evaluation confirms that PARE achieves more accurate and robust reconstruction results than existing approaches on both occlusion-specific and standard benchmarks. The code and data are available for research purposes at https://pare.is.tue.mpg.de/", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kocabas_PARE_Part_Attention_Regressor_for_3D_Human_Body_Estimation_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+ETH Zurich; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; ETH Zurich; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen, Germany+ETH Zurich; Max Planck Institute for Intelligent Systems, Tübingen, Germany; ETH Zurich; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "https://pare.is.tue.mpg.de/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Kocabas_PARE_Part_Attention_ICCV_2021_supplemental.pdf", @@ -29853,9 +31877,10 @@ "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ethz.ch", "aff_unique_abbr": "MPI-IS;ETHZ", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+1;0;1;0", - "aff_country_unique": "Germany;Switzerland" + "aff_country_unique": "Germany;Switzerland", + "bibtex": "@InProceedings{Kocabas_2021_ICCV,\n \n author = {\n Kocabas,\n Muhammed and Huang,\n Chun-Hao P. 
and Hilliges,\n Otmar and Black,\n Michael J.\n},\n title = {\n PARE: Part Attention Regressor for 3D Human Body Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11127-11137\n} \n}" }, { "title": "PARTS: Unsupervised Segmentation With Slots, Attention and Independence Maximization", @@ -29863,6 +31888,7 @@ "status": "Poster", "track": "main", "pid": 10369, + "author_site": "Daniel Zoran; Rishabh Kabra; Alexander Lerchner; Danilo J. Rezende", "author": "Daniel Zoran; Rishabh Kabra; Alexander Lerchner; Danilo J. Rezende", "abstract": "From an early age, humans perceive the visual world as composed of coherent objects with distinctive properties such as shape, size, and color. There is great interest in building models that are able to learn similar structure, ideally in an unsupervised manner. Learning such structure from complex 3D scenes that include clutter, occlusions, interactions, and camera motion is still an open challenge. We present a model that is able to segment visual scenes from complex 3D environments into distinct objects, learn disentangled representations of individual objects, and form consistent and coherent predictions of future frames, in a fully unsupervised manner. Our model (named PARTS) builds on recent approaches that utilize iterative amortized inference and transition dynamics for deep generative models. We achieve dramatic improvements in performance by introducing several novel contributions. We introduce a recurrent slot-attention like encoder which allows for top-down influence during inference. Unlike prior work, we eschew using an auto-regressive prior when modeling image sequences, and demonstrate that a fixed frame-independent prior is superior for the purpose of scene segmentation and representation learning. 
We demonstrate our model's success on three different video datasets (the popular benchmark CLEVRER; a simulated 3D Playroom environment; and a real-world Robotics Arm dataset). Finally, we analyze the contributions of the various model components and the representations learned by the model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zoran_PARTS_Unsupervised_Segmentation_With_Slots_Attention_and_Independence_Maximization_ICCV_2021_paper.pdf", @@ -29886,7 +31912,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zoran_2021_ICCV,\n \n author = {\n Zoran,\n Daniel and Kabra,\n Rishabh and Lerchner,\n Alexander and Rezende,\n Danilo J.\n},\n title = {\n PARTS: Unsupervised Segmentation With Slots,\n Attention and Independence Maximization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10439-10447\n} \n}" }, { "title": "PASS: Protected Attribute Suppression System for Mitigating Bias in Face Recognition", @@ -29894,6 +31921,7 @@ "status": "Poster", "track": "main", "pid": 9984, + "author_site": "Prithviraj Dhar; Joshua Gleason; Aniket Roy; Carlos D. Castillo; Rama Chellappa", "author": "Prithviraj Dhar; Joshua Gleason; Aniket Roy; Carlos D. Castillo; Rama Chellappa", "abstract": "Face recognition networks encode information about sensitive attributes while being trained for identity classification. Such encoding has two major issues: (a) it makes the face representations susceptible to privacy leakage (b) it appears to contribute to bias in face recognition. However, existing bias mitigation approaches generally require end-to-end training and are unable to achieve high verification accuracy. 
Therefore, we present a descriptor-based adversarial de-biasing approach called `Protected Attribute Suppression System (PASS)'. PASS can be trained on top of descriptors obtained from any previously trained high-performing network to classify identities and simultaneously reduce encoding of sensitive attributes. This eliminates the need for end-to-end training. As a component of PASS, we present a novel discriminator training strategy that discourages a network from encoding protected attribute information. We show the efficacy of PASS to reduce gender and skintone information in descriptors from SOTA face recognition networks like Arcface. As a result, PASS descriptors outperform existing baselines in reducing gender and skintone bias on the IJB-C dataset, while maintaining a high verification accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dhar_PASS_Protected_Attribute_Suppression_System_for_Mitigating_Bias_in_Face_ICCV_2021_paper.pdf", @@ -29917,7 +31945,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dhar_2021_ICCV,\n \n author = {\n Dhar,\n Prithviraj and Gleason,\n Joshua and Roy,\n Aniket and Castillo,\n Carlos D. 
and Chellappa,\n Rama\n},\n title = {\n PASS: Protected Attribute Suppression System for Mitigating Bias in Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15087-15096\n} \n}" }, { "title": "PCAM: Product of Cross-Attention Matrices for Rigid Registration of Point Clouds", @@ -29925,10 +31954,11 @@ "status": "Poster", "track": "main", "pid": 10077, + "author_site": "Anh-Quan Cao; Gilles Puy; Alexandre Boulch; Renaud Marlet", "author": "Anh-Quan Cao; Gilles Puy; Alexandre Boulch; Renaud Marlet", "abstract": "Rigid registration of point clouds with partial overlaps is a longstanding problem usually solved in two steps: (a) finding correspondences between the point clouds; (b) filtering these correspondences to keep only the most reliable ones to estimate the transformation. Recently, several deep nets have been proposed to solve these steps jointly. We built upon these works and propose PCAM: a neural network whose key element is a pointwise product of cross-attention matrices that permits to mix both low-level geometric and high-level contextual information to find point correspondences. These cross-attention matrices also permits the exchange of context information between the point clouds, at each layer, allowing the network construct better matching features within the overlapping regions. 
The experiments show that PCAM achieves state-of-the-art results among methods which, like us, solve steps (a) and (b) jointly via deepnets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_PCAM_Product_of_Cross-Attention_Matrices_for_Rigid_Registration_of_Point_ICCV_2021_paper.pdf", - "aff": "Valeo.ai, Paris, France+Inria, Paris, France; Valeo.ai, Paris, France; Valeo.ai, Paris, France; Inria, Paris, France+LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vall\u00e9e, France", + "aff": "Valeo.ai, Paris, France+Inria, Paris, France; Valeo.ai, Paris, France; Valeo.ai, Paris, France; Inria, Paris, France+LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vallée, France", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Cao_PCAM_Product_of_ICCV_2021_supplemental.pdf", @@ -29941,14 +31971,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cao_PCAM_Product_of_Cross-Attention_Matrices_for_Rigid_Registration_of_Point_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;1+2", - "aff_unique_norm": "Valeo.ai;INRIA;Ecole des Ponts ParisTech", + "aff_unique_norm": "Valeo.ai;Inria;Ecole des Ponts ParisTech", "aff_unique_dep": ";;LIGM", "aff_unique_url": "https://www.valeo.ai;https://www.inria.fr;https://www.ponts.fr", "aff_unique_abbr": ";Inria;ENPC", "aff_campus_unique_index": "0+0;0;0;0+1", - "aff_campus_unique": "Paris;Marne-la-Vall\u00e9e", + "aff_campus_unique": "Paris;Marne-la-Vallée", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Anh-Quan and Puy,\n Gilles and Boulch,\n Alexandre and Marlet,\n Renaud\n},\n title = {\n PCAM: Product of Cross-Attention Matrices for Rigid Registration of Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2021\n},\n pages = {\n 13229-13238\n} \n}" }, { "title": "PIAP-DF: Pixel-Interested and Anti Person-Specific Facial Action Unit Detection Net With Discrete Feedback Learning", @@ -29956,6 +31987,7 @@ "status": "Poster", "track": "main", "pid": 5829, + "author_site": "Yang Tang; Wangding Zeng; Dafei Zhao; Honggang Zhang", "author": "Yang Tang; Wangding Zeng; Dafei Zhao; Honggang Zhang", "abstract": "Facial Action Units (AUs) are of great significance in communication. Automatic AU detection can improve the understanding of psychological conditions and emotional status. Recently, several deep learning methods have been proposed to detect AUs automatically. However, several challenges, such as poor extraction of fine-grained and robust local AUs information, model overfitting on person-specific features, as well as the limitation of datasets with wrong labels, remain to be addressed. In this paper, we propose a joint strategy called PIAP-DF to solve these problems, which involves 1) a multi-stage Pixel-Interested learning method with pixel-level attention for each AU; 2) an Anti Person-Specific method aiming to eliminate features associated with any individual as much as possible; 3) a semi-supervised learning method with Discrete Feedback, designed to effectively utilize unlabeled data and mitigate the negative impacts of wrong labels. Experimental results on the two popular AU detection datasets BP4D and DISFA prove that PIAP-DF can be the new state-of-the-art method. Compared with the current best method, PIAP-DF improves the average F1 score by 3.2% on BP4D and by 0.5% on DISFA. 
All modules of PIAP-DF can be easily removed after training to obtain a lightweight model for practical application.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tang_PIAP-DF_Pixel-Interested_and_Anti_Person-Specific_Facial_Action_Unit_Detection_Net_ICCV_2021_paper.pdf", @@ -29979,7 +32011,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Tang_2021_ICCV,\n \n author = {\n Tang,\n Yang and Zeng,\n Wangding and Zhao,\n Dafei and Zhang,\n Honggang\n},\n title = {\n PIAP-DF: Pixel-Interested and Anti Person-Specific Facial Action Unit Detection Net With Discrete Feedback Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12899-12908\n} \n}" }, { "title": "PICCOLO: Point Cloud-Centric Omnidirectional Localization", @@ -29987,6 +32020,7 @@ "status": "Poster", "track": "main", "pid": 3864, + "author_site": "Junho Kim; Changwoon Choi; Hojun Jang; Young Min Kim", "author": "Junho Kim; Changwoon Choi; Hojun Jang; Young Min Kim", "abstract": "We present PICCOLO, a simple and efficient algorithm for omnidirectional localization. Given a colored point cloud and a 360 panorama image of a scene, our objective is to recover the camera pose at which the panorama image is taken. Our pipeline works in an off-the-shelf manner with a single image given as a query and does not require any training of neural networks or collecting ground-truth poses of images. Instead, we match each point cloud color to the holistic view of the panorama image with gradient-descent optimization to find the camera pose. Our loss function, called sampling loss, is point cloud-centric, evaluated at the projected location of every point in the point cloud. 
In contrast, conventional photometric loss is image-centric, comparing colors at each pixel location. With a simple change in the compared entities, sampling loss effectively overcomes the severe visual distortion of omnidirectional images, and enjoys the global context of the 360 view to handle challenging scenarios for visual localization. PICCOLO outperforms existing omnidirectional localization algorithms in both accuracy and stability when evaluated in various environments.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_PICCOLO_Point_Cloud-Centric_Omnidirectional_Localization_ICCV_2021_paper.pdf", @@ -30010,7 +32044,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Junho and Choi,\n Changwoon and Jang,\n Hojun and Kim,\n Young Min\n},\n title = {\n PICCOLO: Point Cloud-Centric Omnidirectional Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3313-3323\n} \n}" }, { "title": "PIRenderer: Controllable Portrait Image Generation via Semantic Neural Rendering", @@ -30018,6 +32053,7 @@ "status": "Poster", "track": "main", "pid": 1996, + "author_site": "Yurui Ren; Ge Li; Yuanqi Chen; Thomas H. Li; Shan Liu", "author": "Yurui Ren; Ge Li; Yuanqi Chen; Thomas H. Li; Shan Liu", "abstract": "Generating portrait images by controlling the motions of existing faces is an important task of great consequence to social media industries. For easy use and intuitive control, semantically meaningful and fully disentangled parameters should be used as modifications. However, many existing techniques do not provide such fine-grained controls or use indirect editing methods i.e. mimic motions of other individuals. 
In this paper, a Portrait Image Neural Renderer (PIRenderer) is proposed to control the face motions with the parameters of three-dimensional morphable face models (3DMMs). The proposed model can generate photo-realistic portrait images with accurate movements according to intuitive modifications. Experiments on both direct and indirect editing tasks demonstrate the superiority of this model. Meanwhile, we further extend this model to tackle the audio-driven facial reenactment task by extracting sequential motions from audio inputs. We show that our model can generate coherent videos with convincing movements from only a single reference image and a driving audio stream. Our source code is available at https://github.com/RenYurui/PIRender.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ren_PIRenderer_Controllable_Portrait_Image_Generation_via_Semantic_Neural_Rendering_ICCV_2021_paper.pdf", @@ -30034,14 +32070,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ren_PIRenderer_Controllable_Portrait_Image_Generation_via_Semantic_Neural_Rendering_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0+0;1", - "aff_unique_norm": "Peking University;Tencent", - "aff_unique_dep": "School of Electronics and Computer Engineering;Tencent America", + "aff_unique_norm": "Peking University;Tencent America", + "aff_unique_dep": "School of Electronics and Computer Engineering;", "aff_unique_url": "http://www.pku.edu.cn;https://www.tencent.com/en-us", "aff_unique_abbr": "PKU;Tencent America", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ren_2021_ICCV,\n \n author = {\n Ren,\n Yurui and Li,\n Ge and Chen,\n Yuanqi and Li,\n Thomas H. 
and Liu,\n Shan\n},\n title = {\n PIRenderer: Controllable Portrait Image Generation via Semantic Neural Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13759-13768\n} \n}" }, { "title": "PIT: Position-Invariant Transform for Cross-FoV Domain Adaptation", @@ -30049,6 +32086,7 @@ "status": "Poster", "track": "main", "pid": 1982, + "author_site": "Qiqi Gu; Qianyu Zhou; Minghao Xu; Zhengyang Feng; Guangliang Cheng; Xuequan Lu; Jianping Shi; Lizhuang Ma", "author": "Qiqi Gu; Qianyu Zhou; Minghao Xu; Zhengyang Feng; Guangliang Cheng; Xuequan Lu; Jianping Shi; Lizhuang Ma", "abstract": "Cross-domain object detection and semantic segmentation have witnessed impressive progress recently. Existing approaches mainly consider the domain shift resulting from external environments including the changes of background, illumination or weather, while distinct camera intrinsic parameters appear commonly in different domains and their influence for domain adaptation has been very rarely explored. In this paper, we observe that the Field of View (FoV) gap induces noticeable instance appearance differences between the source and target domains. We further discover that the FoV gap between two domains impairs domain adaptation performance under both the FoV-increasing (source FoV < target FoV) and FoV-decreasing cases. Motivated by the observations, we propose the Position-Invariant Transform (PIT) to better align images in different domains. We also introduce a reverse PIT for mapping the transformed/aligned images back to the original image space, and design a loss re-weighting strategy to accelerate the training process. Our method can be easily plugged into existing cross-domain detection/segmentation frameworks, while bringing about negligible computational overhead. 
Extensive experiments demonstrate that our method can soundly boost the performance on both cross-domain object detection and segmentation for state-of-the-art techniques. Our code is available at https://github.com/sheepooo/PIT-Position-Invariant-Transform.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_PIT_Position-Invariant_Transform_for_Cross-FoV_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -30072,7 +32110,8 @@ "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Qing Yuan", "aff_country_unique_index": "0;0;0;0;0+0;1;0+0;0+0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Gu_2021_ICCV,\n \n author = {\n Gu,\n Qiqi and Zhou,\n Qianyu and Xu,\n Minghao and Feng,\n Zhengyang and Cheng,\n Guangliang and Lu,\n Xuequan and Shi,\n Jianping and Ma,\n Lizhuang\n},\n title = {\n PIT: Position-Invariant Transform for Cross-FoV Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8761-8770\n} \n}" }, { "title": "PR-GCN: A Deep Graph Convolutional Network With Point Refinement for 6D Pose Estimation", @@ -30080,6 +32119,7 @@ "status": "Poster", "track": "main", "pid": 9066, + "author_site": "Guangyuan Zhou; Huiqun Wang; Jiaxin Chen; Di Huang", "author": "Guangyuan Zhou; Huiqun Wang; Jiaxin Chen; Di Huang", "abstract": "RGB-D based 6D pose estimation has recently achieved remarkable progress, but still suffers from two major limitations: (1) ineffective representation of depth data and (2) insufficient integration of different modalities. This paper proposes a novel deep learning approach, namely Graph Convolutional Network with Point Refinement (PR-GCN), to simultaneously address the issues above in a unified way. It first introduces the Point Refinement Network (PRN) to polish 3D point clouds, recovering missing parts with noise removed. 
Subsequently, the Multi-Modal Fusion Graph Convolutional Network (MMF-GCN) is presented to strengthen RGB-D combination, which captures geometry-aware inter-modality correlation through local information propagation in the graph convolutional network. Extensive experiments are conducted on three widely used benchmarks, and state-of-the-art performance is reached. Besides, it is also shown that the proposed PRN and MMF-GCN modules are well generalized to other frameworks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_PR-GCN_A_Deep_Graph_Convolutional_Network_With_Point_Refinement_for_ICCV_2021_paper.pdf", @@ -30103,7 +32143,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Guangyuan and Wang,\n Huiqun and Chen,\n Jiaxin and Huang,\n Di\n},\n title = {\n PR-GCN: A Deep Graph Convolutional Network With Point Refinement for 6D Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2793-2802\n} \n}" }, { "title": "PR-Net: Preference Reasoning for Personalized Video Highlight Detection", @@ -30111,6 +32152,7 @@ "status": "Poster", "track": "main", "pid": 5544, + "author_site": "Runnan Chen; Penghao Zhou; Wenzhe Wang; Nenglun Chen; Pai Peng; Xing Sun; Wenping Wang", "author": "Runnan Chen; Penghao Zhou; Wenzhe Wang; Nenglun Chen; Pai Peng; Xing Sun; Wenping Wang", "abstract": "Personalized video highlight detection aims to shorten a long video to interesting moments according to a user's preference, which has recently raised the community's attention. 
Current methods regard the user's history as holistic information to predict the user's preference but negating the inherent diversity of the user's interests, resulting in vague preference representation. In this paper, we propose a simple yet efficient preference reasoning framework (PR-Net) to explicitly take the diverse interests into account for frame-level highlight prediction. Specifically, distinct user-specific preferences for each input query frame are produced, presented as the similarity weighted sum of history highlights to the corresponding query frame. Next, distinct comprehensive preferences are formed by the user-specific preferences and a learnable generic preference for more overall highlight measurement. Lastly, the degree of highlight and non-highlight for each query frame is calculated as semantic similarity to its comprehensive and non-highlight preferences, respectively. Besides, to alleviate the ambiguity due to the incomplete annotation, a new bi-directional contrastive loss is proposed to ensure a compact and differentiable metric space. 
In this way, our method significantly outperforms state-of-the-art methods with a relative improvement of 12% in mean accuracy precision.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_PR-Net_Preference_Reasoning_for_Personalized_Video_Highlight_Detection_ICCV_2021_paper.pdf", @@ -30127,14 +32169,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_PR-Net_Preference_Reasoning_for_Personalized_Video_Highlight_Detection_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;2;0;1;1;0", - "aff_unique_norm": "University of Hong Kong;Tencent;Zhejiang University", + "aff_unique_norm": "The University of Hong Kong;Tencent;Zhejiang University", "aff_unique_dep": ";Youtu Lab;", "aff_unique_url": "https://www.hku.hk;https://www.tencent.com;https://www.zju.edu.cn", "aff_unique_abbr": "HKU;Tencent;ZJU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Runnan and Zhou,\n Penghao and Wang,\n Wenzhe and Chen,\n Nenglun and Peng,\n Pai and Sun,\n Xing and Wang,\n Wenping\n},\n title = {\n PR-Net: Preference Reasoning for Personalized Video Highlight Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7980-7989\n} \n}" }, { "title": "PR-RRN: Pairwise-Regularized Residual-Recursive Networks for Non-Rigid Structure-From-Motion", @@ -30142,6 +32185,7 @@ "status": "Poster", "track": "main", "pid": 2139, + "author_site": "Haitian Zeng; Yuchao Dai; Xin Yu; Xiaohan Wang; Yi Yang", "author": "Haitian Zeng; Yuchao Dai; Xin Yu; Xiaohan Wang; Yi Yang", "abstract": "We propose PR-RRN, a novel neural-network based method for Non-rigid Structure-from-Motion (NRSfM). 
PR-RRN consists of Residual-Recursive Networks (RRN) and two extra regularization losses. RRN is designed to effectively recover 3D shape and camera from 2D keypoints with novel residual-recursive structure. As NRSfM is a highly under-constrained problem, we propose two new pairwise regularization to further regularize the reconstruction. The Rigidity-based Pairwise Contrastive Loss regularizes the shape representation by encouraging higher similarity between the representations of high-rigidity pairs of frames than low-rigidity pairs. We propose minimum singular-value ratio to measure the pairwise rigidity. The Pairwise Consistency Loss enforces the reconstruction to be consistent when the estimated shapes and cameras are exchanged between pairs. Our approach achieves state-of-the-art performance on CMU MOCAP and PASCAL3D+ dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zeng_PR-RRN_Pairwise-Regularized_Residual-Recursive_Networks_for_Non-Rigid_Structure-From-Motion_ICCV_2021_paper.pdf", @@ -30156,7 +32200,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zeng_PR-RRN_Pairwise-Regularized_Residual-Recursive_Networks_for_Non-Rigid_Structure-From-Motion_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zeng_PR-RRN_Pairwise-Regularized_Residual-Recursive_Networks_for_Non-Rigid_Structure-From-Motion_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zeng_2021_ICCV,\n \n author = {\n Zeng,\n Haitian and Dai,\n Yuchao and Yu,\n Xin and Wang,\n Xiaohan and Yang,\n Yi\n},\n title = {\n PR-RRN: Pairwise-Regularized Residual-Recursive Networks for Non-Rigid Structure-From-Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5600-5609\n} \n}" }, { "title": "PT-CapsNet: A Novel Prediction-Tuning Capsule Network Suitable for Deeper 
Architectures", @@ -30164,6 +32209,7 @@ "status": "Poster", "track": "main", "pid": 6732, + "author_site": "Chenbin Pan; Senem Velipasalar", "author": "Chenbin Pan; Senem Velipasalar", "abstract": "Capsule Networks (CapsNets) create internal representations by parsing inputs into various instances at different resolution levels via a two-phase process -- part-whole transformation and hierarchical component routing. Since both of these internal phases are computationally expensive, CapsNets have not found wider use. Existing variations of CapsNets mainly focus on performance comparison with the original CapsNet, and have not outperformed CNN-based models on complex tasks. To address the limitations of the existing CapsNet structures, we propose a novel Prediction-Tuning Capsule Network (PT-CapsNet), and also introduce fully connected PT-Capsules (FC-PT-Caps) and locally connected PT-Capsules (LC-PT-Caps). Different from existing CapsNet structures, our proposed model (i) allows the use of capsules for more difficult vision tasks and provides wider applicability; and (ii) provides better than or comparable performance to CNN-based baselines on these complex tasks. In our experiments, we show robustness to affine transformations, as well as the lightweight and scalability of PT-CapsNet via constructing larger and deeper networks and performing comparisons on classification, semantic segmentation and object detection tasks. The results show consistent performance improvement and significant parameter reduction compared to various baseline models. 
Code is available at https://github.com/Christinepan881/PT-CapsNet.git.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pan_PT-CapsNet_A_Novel_Prediction-Tuning_Capsule_Network_Suitable_for_Deeper_Architectures_ICCV_2021_paper.pdf", @@ -30187,7 +32233,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Syracuse", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pan_2021_ICCV,\n \n author = {\n Pan,\n Chenbin and Velipasalar,\n Senem\n},\n title = {\n PT-CapsNet: A Novel Prediction-Tuning Capsule Network Suitable for Deeper Architectures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11996-12005\n} \n}" }, { "title": "PU-EVA: An Edge-Vector Based Approximation Solution for Flexible-Scale Point Cloud Upsampling", @@ -30195,6 +32242,7 @@ "status": "Poster", "track": "main", "pid": 11396, + "author_site": "Luqing Luo; Lulu Tang; Wanyi Zhou; Shizheng Wang; Zhi-Xin Yang", "author": "Luqing Luo; Lulu Tang; Wanyi Zhou; Shizheng Wang; Zhi-Xin Yang", "abstract": "High-quality point clouds have practical significance for point-based rendering, semantic understanding, and surface reconstruction. Upsampling sparse, noisy and non-uniform point clouds for a denser and more regular approximation of target objects is a desirable but challenging task. Most existing methods duplicate point features for upsampling, constraining the upsampling scales at a fixed rate. In this work, the arbitrary point clouds upsampling rates are achieved via edge-vector based affine combinations, and a novel design of Edge-Vector based Approximation for Flexible-scale Point clouds Upsampling (PU-EVA) is proposed. 
The edge-vector based approximation encodes neighboring connectivity via affine combinations based on edge vectors, and restricts the approximation error within a second-order term of Taylor's Expansion. Moreover, the EVA upsampling decouples the upsampling scales with network architecture, achieving the arbitrary upsampling rates in one-time training. Qualitative and quantitative evaluations demonstrate that the proposed PU-EVA outperforms the state-of-the-arts in terms of proximity-to-surface, distribution uniformity, and geometric details preservation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Luo_PU-EVA_An_Edge-Vector_Based_Approximation_Solution_for_Flexible-Scale_Point_Cloud_ICCV_2021_paper.pdf", @@ -30218,7 +32266,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Macau SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2021_ICCV,\n \n author = {\n Luo,\n Luqing and Tang,\n Lulu and Zhou,\n Wanyi and Wang,\n Shizheng and Yang,\n Zhi-Xin\n},\n title = {\n PU-EVA: An Edge-Vector Based Approximation Solution for Flexible-Scale Point Cloud Upsampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16208-16217\n} \n}" }, { "title": "PX-NET: Simple and Efficient Pixel-Wise Training of Photometric Stereo Networks", @@ -30226,6 +32275,7 @@ "status": "Poster", "track": "main", "pid": 7980, + "author_site": "Fotios Logothetis; Ignas Budvytis; Roberto Mecca; Roberto Cipolla", "author": "Fotios Logothetis; Ignas Budvytis; Roberto Mecca; Roberto Cipolla", "abstract": "Retrieving accurate 3D reconstructions of objects from the way they reflect light is a very challenging task in computer vision. 
Despite more than four decades since the definition of the Photometric Stereo problem, most of the literature has had limited success when global illumination effects such as cast shadows, self-reflections and ambient light come into play, especially for specular surfaces. Recent approaches have leveraged the capabilities of deep learning in conjunction with computer graphics in order to cope with the need of a vast number of training data to invert the image irradiance equation and retrieve the geometry of the object. However, rendering global illumination effects is a slow process which can limit the amount of training data that can be generated. In this work we propose a novel pixel-wise training procedure for normal prediction by replacing the training data (observation maps) of globally rendered images with independent per-pixel generated data. We show that global physical effects can be approximated on the observation map domain and this simplifies and speeds up the data creation procedure. 
Our network, PX-NET, achieves state-of-the-art performance compared to other pixelwise methods on synthetic datasets, as well as the DiLiGenT real dataset on both dense and sparse light settings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Logothetis_PX-NET_Simple_and_Efficient_Pixel-Wise_Training_of_Photometric_Stereo_Networks_ICCV_2021_paper.pdf", @@ -30249,7 +32299,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Logothetis_2021_ICCV,\n \n author = {\n Logothetis,\n Fotios and Budvytis,\n Ignas and Mecca,\n Roberto and Cipolla,\n Roberto\n},\n title = {\n PX-NET: Simple and Efficient Pixel-Wise Training of Photometric Stereo Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12757-12766\n} \n}" }, { "title": "Paint Transformer: Feed Forward Neural Painting With Stroke Prediction", @@ -30257,6 +32308,7 @@ "status": "Poster", "track": "main", "pid": 7244, + "author_site": "Songhua Liu; Tianwei Lin; Dongliang He; Fu Li; Ruifeng Deng; Xin Li; Errui Ding; Hao Wang", "author": "Songhua Liu; Tianwei Lin; Dongliang He; Fu Li; Ruifeng Deng; Xin Li; Errui Ding; Hao Wang", "abstract": "Neural painting refers to the procedure of producing a series of strokes for a given image and non-photo-realistically recreating it using neural networks. While reinforcement learning (RL) based agents can generate a stroke sequence step by step for this task, it is not easy to train a stable RL agent. On the other hand, stroke optimization methods search for a set of stroke parameters iteratively in a large search space; such low efficiency significantly limits their prevalence and practicality. 
Different from previous methods, in this paper, we formulate the task as a set prediction problem and propose a novel Transformer-based framework, dubbed Paint Transformer, to predict the parameters of a stroke set with a feed forward network. This way, our model can generate a set of strokes in parallel and obtain the final painting of size 512x512 in near real time. More importantly, since there is no dataset available for training the Paint Transformer, we devise a self-training pipeline such that it can be trained without any off-the-shelf dataset while still achieving excellent generalization capability. Experiments demonstrate that our method achieves better painting performance than previous ones with cheaper training and inference costs. Codes and models will be available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Paint_Transformer_Feed_Forward_Neural_Painting_With_Stroke_Prediction_ICCV_2021_paper.pdf", @@ -30271,7 +32323,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Paint_Transformer_Feed_Forward_Neural_Painting_With_Stroke_Prediction_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Paint_Transformer_Feed_Forward_Neural_Painting_With_Stroke_Prediction_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Songhua and Lin,\n Tianwei and He,\n Dongliang and Li,\n Fu and Deng,\n Ruifeng and Li,\n Xin and Ding,\n Errui and Wang,\n Hao\n},\n title = {\n Paint Transformer: Feed Forward Neural Painting With Stroke Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6598-6607\n} \n}" }, { "title": "Painting From Part", @@ -30279,6 +32332,7 @@ "status": "Poster", "track": "main", "pid": 7379, + "author_site": "Dongsheng Guo; Haoru Zhao; Yunhao Cheng; Haiyong 
Zheng; Zhaorui Gu; Bing Zheng", "author": "Dongsheng Guo; Haoru Zhao; Yunhao Cheng; Haiyong Zheng; Zhaorui Gu; Bing Zheng", "abstract": "This paper studies the problem of painting the whole image from part of it, namely painting from part or part-painting for short, involving both inpainting and outpainting. To address the challenge of taking full advantage of both information from local domain (part) and knowledge from global domain (dataset), we propose a novel part-painting method according to the observations of relationship between part and whole, which consists of three stages: part-noise restarting, part-feature repainting, and part-patch refining, to paint the whole image by leveraging both feature-level and patch-level part as well as powerful representation ability of generative adversarial network. Extensive ablation studies show efficacy of each stage, and our method achieves state-of-the-art performance on both inpainting and outpainting benchmarks with free-form parts, including our new mask dataset for irregular outpainting. 
Our code and dataset are available at https://github.com/zhenglab/partpainting.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Painting_From_Part_ICCV_2021_paper.pdf", @@ -30302,7 +32356,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Sanya", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Dongsheng and Zhao,\n Haoru and Cheng,\n Yunhao and Zheng,\n Haiyong and Gu,\n Zhaorui and Zheng,\n Bing\n},\n title = {\n Painting From Part\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14779-14788\n} \n}" }, { "title": "Pano-AVQA: Grounded Audio-Visual Question Answering on 360deg Videos", @@ -30310,6 +32365,7 @@ "status": "Poster", "track": "main", "pid": 8766, + "author_site": "Heeseung Yun; Youngjae Yu; Wonsuk Yang; Kangil Lee; Gunhee Kim", "author": "Heeseung Yun; Youngjae Yu; Wonsuk Yang; Kangil Lee; Gunhee Kim", "abstract": "360deg videos convey holistic views for the surroundings of a scene. It provides audio-visual cues beyond predetermined normal field of views and displays distinctive spatial relations on a sphere. However, previous benchmark tasks for panoramic videos are still limited to evaluate the semantic understanding of audio-visual relationships or spherical spatial property in surroundings. We propose a novel benchmark named Pano-AVQA as a large-scale grounded audio-visual question answering dataset on panoramic videos. Using 5.4K 360deg video clips harvested online, we collect two types of novel question-answer pairs with bounding-box grounding: spherical spatial relation QAs and audio-visual relation QAs. 
We train several transformer-based models from Pano-AVQA, where the results suggest that our proposed spherical spatial embeddings and multimodal training objectives fairly contribute to better semantic understanding of the panoramic surroundings on the dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yun_Pano-AVQA_Grounded_Audio-Visual_Question_Answering_on_360deg_Videos_ICCV_2021_paper.pdf", @@ -30333,7 +32389,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;0", - "aff_country_unique": "South Korea;United States;United Kingdom" + "aff_country_unique": "South Korea;United States;United Kingdom", + "bibtex": "@InProceedings{Yun_2021_ICCV,\n \n author = {\n Yun,\n Heeseung and Yu,\n Youngjae and Yang,\n Wonsuk and Lee,\n Kangil and Kim,\n Gunhee\n},\n title = {\n Pano-AVQA: Grounded Audio-Visual Question Answering on 360deg Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2031-2041\n} \n}" }, { "title": "Panoptic Narrative Grounding", @@ -30341,7 +32398,8 @@ "status": "Poster", "track": "main", "pid": 1106, - "author": "Cristina Gonz\u00e1lez; Nicol\u00e1s Ayobi; Isabela Hern\u00e1ndez; Jos\u00e9 Hern\u00e1ndez; Jordi Pont-Tuset; Pablo Arbel\u00e1ez", + "author_site": "Cristina González; Nicolás Ayobi; Isabela Hernández; José Hernández; Jordi Pont-Tuset; Pablo Arbeláez", + "author": "Cristina González; Nicolás Ayobi; Isabela Hernández; José Hernández; Jordi Pont-Tuset; Pablo Arbeláez", "abstract": "This paper proposes Panoptic Narrative Grounding, a spatially fine and general formulation of the natural language visual grounding problem. We establish an experimental framework for the study of this new task, including new ground truth and metrics, and we propose a strong baseline method to serve as stepping stone for future work. 
We exploit the intrinsic semantic richness in an image by including panoptic categories, and we approach visual grounding at a fine-grained level by using segmentations. In terms of ground truth, we propose an algorithm to automatically transfer Localized Narratives annotations to specific regions in the panoptic segmentations of the MS COCO dataset. To guarantee the quality of our annotations, we take advantage of the semantic structure contained in WordNet to exclusively incorporate noun phrases that are grounded to a meaningfully related panoptic segmentation region. The proposed baseline achieves a performance of 55.4 absolute Average Recall points. This result is a suitable foundation to push the envelope further in the development of methods for Panoptic Narrative Grounding.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gonzalez_Panoptic_Narrative_Grounding_ICCV_2021_paper.pdf", "aff": "Center for Research and Formation in Artificial Intelligence, Universidad de los Andes, Colombia; Center for Research and Formation in Artificial Intelligence, Universidad de los Andes, Colombia; Center for Research and Formation in Artificial Intelligence, Universidad de los Andes, Colombia; Center for Research and Formation in Artificial Intelligence, Universidad de los Andes, Colombia; Google Research, Switzerland; Center for Research and Formation in Artificial Intelligence, Universidad de los Andes, Colombia", @@ -30357,14 +32415,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gonzalez_Panoptic_Narrative_Grounding_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;0", - "aff_unique_norm": "Universidad de los Andes;Google", + "aff_unique_norm": "Universidad de los Andes;Google Research", "aff_unique_dep": "Center for Research and Formation in Artificial Intelligence;Google Research", "aff_unique_url": "https://www.uniandes.edu.co;https://research.google", "aff_unique_abbr": ";Google Research", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "Colombia;Switzerland" + "aff_country_unique": "Colombia;Switzerland", + "bibtex": "@InProceedings{Gonzalez_2021_ICCV,\n \n author = {\n Gonz\\'alez,\n Cristina and Ayobi,\n Nicol\\'as and Hern\\'andez,\n Isabela and Hern\\'andez,\n Jos\\'e and Pont-Tuset,\n Jordi and Arbel\\'aez,\n Pablo\n},\n title = {\n Panoptic Narrative Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1364-1373\n} \n}" }, { "title": "Panoptic Segmentation of Satellite Image Time Series With Convolutional Temporal Attention Networks", @@ -30372,6 +32431,7 @@ "status": "Poster", "track": "main", "pid": 5698, + "author_site": "Vivien Sainte Fare Garnot; Loic Landrieu", "author": "Vivien Sainte Fare Garnot; Loic Landrieu", "abstract": "Unprecedented access to multi-temporal satellite imagery has opened new perspectives for a variety of Earth observation tasks. Among them, pixel-precise panoptic segmentation of agricultural parcels has major economic and environmental implications. While researchers have explored this problem for single images, we argue that the complex temporal patterns of crop phenology are better addressed with temporal sequences of images. In this paper, we present the first end-to-end, single-stage method for panoptic segmentation of Satellite Image Time Series (SITS). This module can be combined with our novel image sequence encoding network which relies on temporal self-attention to extract rich and adaptive multi-scale spatio-temporal features. We also introduce PASTIS, the first open-access SITS dataset with panoptic annotations. We demonstrate the superiority of our encoder for semantic segmentation against multiple competing network architectures, and set up the first state-of-the-art of panoptic segmentation of SITS. 
Our implementation and the PASTIS dataset are publicly available at (link-upon-publication).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Garnot_Panoptic_Segmentation_of_Satellite_Image_Time_Series_With_Convolutional_Temporal_ICCV_2021_paper.pdf", @@ -30388,14 +32448,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Garnot_Panoptic_Segmentation_of_Satellite_Image_Time_Series_With_Convolutional_Temporal_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University Gustave Eiffel", + "aff_unique_norm": "Universite Gustave Eiffel", "aff_unique_dep": "LASTIG", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Garnot_2021_ICCV,\n \n author = {\n Garnot,\n Vivien Sainte Fare and Landrieu,\n Loic\n},\n title = {\n Panoptic Segmentation of Satellite Image Time Series With Convolutional Temporal Attention Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4872-4881\n} \n}" }, { "title": "Parallel Detection-and-Segmentation Learning for Weakly Supervised Instance Segmentation", @@ -30403,6 +32464,7 @@ "status": "Poster", "track": "main", "pid": 1056, + "author_site": "Yunhang Shen; Liujuan Cao; Zhiwei Chen; Baochang Zhang; Chi Su; Yongjian Wu; Feiyue Huang; Rongrong Ji", "author": "Yunhang Shen; Liujuan Cao; Zhiwei Chen; Baochang Zhang; Chi Su; Yongjian Wu; Feiyue Huang; Rongrong Ji", "abstract": "Weakly supervised instance segmentation (WSIS) with only image-level labels has recently drawn much attention. To date, bottom-up WSIS methods refine discriminative cues from classifiers with sophisticated multi-stage training procedures, which also suffer from inconsistent object boundaries. 
And top-down WSIS methods are formulated as cascade detection-to-segmentation pipeline, in which the quality of segmentation learning heavily depends on pseudo masks generated from detectors. In this paper, we propose a unified parallel detection-and-segmentation learning (PDSL) framework to learn instance segmentation with only image-level labels, which draws inspiration from both top-down and bottom-up instance segmentation approaches. The detection module is the same as the typical design of any weakly supervised object detection, while the segmentation module leverages self-supervised learning to model class-agnostic foreground extraction, following by self-training to refine class-specific segmentation. We further design instance-activation correlation module to improve the coherence between detection and segmentation branches. Extensive experiments verify that the proposed method outperforms baselines and achieves the state-of-the-art results on PASCAL VOC and MS COCO.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shen_Parallel_Detection-and-Segmentation_Learning_for_Weakly_Supervised_Instance_Segmentation_ICCV_2021_paper.pdf", @@ -30419,14 +32481,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shen_Parallel_Detection-and-Segmentation_Learning_for_Weakly_Supervised_Instance_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;2;3;1;1;0+0+4", - "aff_unique_norm": "Xiamen University;Tencent;Beihang University;Kingsoft Cloud;Pengcheng Laboratory", - "aff_unique_dep": "Department of Artificial Intelligence;Youtu Lab;Institute of Artificial Intelligence;;Peng Cheng Laboratory", + "aff_unique_norm": "Xiamen University;Tencent;Beihang University;KingSoft Cloud;Peng Cheng Laboratory", + "aff_unique_dep": "Department of Artificial Intelligence;Youtu Lab;Institute of Artificial Intelligence;;", "aff_unique_url": "https://www.xmu.edu.cn;https://www.tencent.com;http://www.buaa.edu.cn;https://www.ksyun.com;", 
"aff_unique_abbr": "XMU;Tencent;BUAA;KSC;", "aff_campus_unique_index": "1;2;2;1;1;3+4", "aff_campus_unique": ";Shanghai;Beijing;Xiamen;Shenzhen", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shen_2021_ICCV,\n \n author = {\n Shen,\n Yunhang and Cao,\n Liujuan and Chen,\n Zhiwei and Zhang,\n Baochang and Su,\n Chi and Wu,\n Yongjian and Huang,\n Feiyue and Ji,\n Rongrong\n},\n title = {\n Parallel Detection-and-Segmentation Learning for Weakly Supervised Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8198-8208\n} \n}" }, { "title": "Parallel Multi-Resolution Fusion Network for Image Inpainting", @@ -30434,6 +32497,7 @@ "status": "Poster", "track": "main", "pid": 3384, + "author_site": "Wentao Wang; Jianfu Zhang; Li Niu; Haoyu Ling; Xue Yang; Liqing Zhang", "author": "Wentao Wang; Jianfu Zhang; Li Niu; Haoyu Ling; Xue Yang; Liqing Zhang", "abstract": "Conventional deep image inpainting methods are based on auto-encoder architecture, in which the spatial details of images will be lost in the down-sampling process, leading to the degradation of generated results. Also, the structure information in deep layers and texture information in shallow layers of the auto-encoder architecture can not be well integrated. Differing from the conventional image inpainting architecture, we design a parallel multi-resolution inpainting network with multi-resolution partial convolution, in which low-resolution branches focus on the global structure while high-resolution branches focus on the local texture details. All these high- and low-resolution streams are in parallel and fused repeatedly with multi-resolution masked representation fusion so that the reconstructed images are semantically robust and textually plausible. 
Experimental results show that our method can effectively fuse structure and texture information, producing more realistic results than state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Parallel_Multi-Resolution_Fusion_Network_for_Image_Inpainting_ICCV_2021_paper.pdf", @@ -30457,7 +32521,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Wentao and Zhang,\n Jianfu and Niu,\n Li and Ling,\n Haoyu and Yang,\n Xue and Zhang,\n Liqing\n},\n title = {\n Parallel Multi-Resolution Fusion Network for Image Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14559-14568\n} \n}" }, { "title": "Parallel Rectangle Flip Attack: A Query-Based Black-Box Attack Against Object Detection", @@ -30465,6 +32530,7 @@ "status": "Poster", "track": "main", "pid": 6660, + "author_site": "Siyuan Liang; Baoyuan Wu; Yanbo Fan; Xingxing Wei; Xiaochun Cao", "author": "Siyuan Liang; Baoyuan Wu; Yanbo Fan; Xingxing Wei; Xiaochun Cao", "abstract": "Object detection has been widely used in many safety-critical tasks, such as autonomous driving. However, its vulnerability to adversarial examples has not been sufficiently studied, especially under the practical scenario of black-box attacks, where the attacker can only access the query feedback of predicted bounding-boxes and top-1 scores returned by the attacked model. Compared with black-box attack to image classification, there are two main challenges in black-box attack to detection. Firstly, even if one bounding-box is successfully attacked, another sub-optimal bounding-box may be detected near the attacked bounding-box. 
Secondly, there are multiple bounding-boxes, leading to very high attack cost. To address these challenges, we propose a Parallel Rectangle Flip Attack (PRFA) via random search. Specifically, we generate perturbations in each rectangle patch to avoid sub-optimal detection near the attacked region. Besides, utilizing the observation that adversarial perturbations mainly locate around objects' contours and critical points under white-box attacks, the search space of attacked rectangles is reduced to improve the attack efficiency. Moreover, we develop a parallel mechanism of attacking multiple rectangles simultaneously to further accelerate the attack process. Extensive experiments demonstrate that our method can effectively and efficiently attack various popular object detectors, including anchor-based and anchor-free, and generate transferable adversarial examples.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Parallel_Rectangle_Flip_Attack_A_Query-Based_Black-Box_Attack_Against_Object_ICCV_2021_paper.pdf", @@ -30479,7 +32545,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liang_Parallel_Rectangle_Flip_Attack_A_Query-Based_Black-Box_Attack_Against_Object_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liang_Parallel_Rectangle_Flip_Attack_A_Query-Based_Black-Box_Attack_Against_Object_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Siyuan and Wu,\n Baoyuan and Fan,\n Yanbo and Wei,\n Xingxing and Cao,\n Xiaochun\n},\n title = {\n Parallel Rectangle Flip Attack: A Query-Based Black-Box Attack Against Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7697-7707\n} \n}" }, { "title": "Parametric Contrastive Learning", @@ -30487,6 +32554,7 @@ "status": "Poster", 
"track": "main", "pid": 2894, + "author_site": "Jiequan Cui; Zhisheng Zhong; Shu Liu; Bei Yu; Jiaya Jia", "author": "Jiequan Cui; Zhisheng Zhong; Shu Liu; Bei Yu; Jiaya Jia", "abstract": "In this paper, we propose Parametric Contrastive Learning (PaCo) to tackle long-tailed recognition. Based on theoretical analysis, we observe supervised contrastive loss tends to bias on high-frequency classes and thus increases the difficulty of imbalanced learning. We introduce a set of parametric class-wise learnable centers to rebalance from an optimization perspective. Further, we analyze our PaCo loss under a balanced setting. Our analysis demonstrates that PaCo can adaptively enhance the intensity of pushing samples of the same class close as more samples are pulled together with their corresponding centers and benefit hard example learning. Experiments on long-tailed CIFAR, ImageNet, Places, and iNaturalist 2018 manifest the new state-of-the-art for long-tailed recognition. On full ImageNet, models trained with PaCo loss surpass supervised contrastive learning across various ResNet backbones, e.g., our ResNet-200 achieves 81.8% top-1 accuracy. 
Our code is available at https://github.com/dvlab-research/Parametric-Contrastive-Learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cui_Parametric_Contrastive_Learning_ICCV_2021_paper.pdf", @@ -30503,14 +32571,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cui_Parametric_Contrastive_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Cui_2021_ICCV,\n \n author = {\n Cui,\n Jiequan and Zhong,\n Zhisheng and Liu,\n Shu and Yu,\n Bei and Jia,\n Jiaya\n},\n title = {\n Parametric Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 715-724\n} \n}" }, { "title": "Parsing Table Structures in the Wild", @@ -30518,6 +32587,7 @@ "status": "Poster", "track": "main", "pid": 5634, + "author_site": "Rujiao Long; Wen Wang; Nan Xue; Feiyu Gao; Zhibo Yang; Yongpan Wang; Gui-Song Xia", "author": "Rujiao Long; Wen Wang; Nan Xue; Feiyu Gao; Zhibo Yang; Yongpan Wang; Gui-Song Xia", "abstract": "This paper tackles the problem of table structure pars-ing (TSP) from images in the wild. In contrast to existingstudies that mainly focus on parsing well-aligned tabularimages with simple layouts from scanned PDF documents,we aim to establish a practical table structure parsing sys-tem for real-world scenarios where tabular input imagesare taken or scanned with severe deformation, bending orocclusions. 
For designing such a system, we propose anapproach named Cycle-CenterNet on the top of CenterNetwith a novel cycle-pairing module to simultaneously detectand group tabular cells into structured tables. In the cycle-pairing module, a new pairing loss function is proposed forthe network training. Alongside with our Cycle-CenterNet,we also present a large-scale dataset, named Wired Tablein the Wild (WTW), which includes well-annotated structureparsing of multiple style tables in several scenes like photo,scanning files, web pages,etc.. In experiments, we demon-strate that our Cycle-CenterNet consistently achieves thebest accuracy of table structure parsing on the new WTWdataset by 24.6% absolute improvement evaluated by theTEDS metric. A more comprehensive experimental analysisalso validates the advantages of our proposed methods forthe TSP task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Long_Parsing_Table_Structures_in_the_Wild_ICCV_2021_paper.pdf", @@ -30541,7 +32611,8 @@ "aff_campus_unique_index": "1+1;1;1+1", "aff_campus_unique": ";Wuhan", "aff_country_unique_index": "0;0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Long_2021_ICCV,\n \n author = {\n Long,\n Rujiao and Wang,\n Wen and Xue,\n Nan and Gao,\n Feiyu and Yang,\n Zhibo and Wang,\n Yongpan and Xia,\n Gui-Song\n},\n title = {\n Parsing Table Structures in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 944-952\n} \n}" }, { "title": "Partial Off-Policy Learning: Balance Accuracy and Diversity for Human-Oriented Image Captioning", @@ -30549,6 +32620,7 @@ "status": "Poster", "track": "main", "pid": 8108, + "author_site": "Jiahe Shi; Yali Li; Shengjin Wang", "author": "Jiahe Shi; Yali Li; Shengjin Wang", "abstract": "Human-oriented image captioning with both high diversity and accuracy is a 
challenging task in vision+language modeling. The reinforcement learning (RL) based frameworks promote the accuracy of image captioning, yet seriously hurt the diversity. In contrast, other methods based on variational auto-encoder (VAE) or generative adversarial network (GAN) can produce diverse yet less accurate captions. In this work, we devote our attention to promote the diversity of RL-based image captioning. To be specific, we devise a partial off-policy learning scheme to balance accuracy and diversity. First, we keep the model exposed to varied candidate captions by sampling from the initial state before RL launched. Second, a novel criterion named max-CIDEr is proposed to serve as the reward for promoting diversity. We combine the above-mentioned off-policy strategy with the on-policy one to moderate the exploration effect, further balancing the diversity and accuracy for human-like image captioning. Experiments show that our method locates the closest to human performance in the diversity-accuracy space, and achieves the highest Pearson correlation as 0.337 with human performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shi_Partial_Off-Policy_Learning_Balance_Accuracy_and_Diversity_for_Human-Oriented_Image_ICCV_2021_paper.pdf", @@ -30572,7 +32644,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2021_ICCV,\n \n author = {\n Shi,\n Jiahe and Li,\n Yali and Wang,\n Shengjin\n},\n title = {\n Partial Off-Policy Learning: Balance Accuracy and Diversity for Human-Oriented Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2187-2196\n} \n}" }, { "title": "Partial Video Domain Adaptation With Partial Adversarial Temporal Attentive Network", @@ -30580,6 
+32653,7 @@ "status": "Poster", "track": "main", "pid": 7404, + "author_site": "Yuecong Xu; Jianfei Yang; Haozhi Cao; Zhenghua Chen; Qi Li; Kezhi Mao", "author": "Yuecong Xu; Jianfei Yang; Haozhi Cao; Zhenghua Chen; Qi Li; Kezhi Mao", "abstract": "Partial Domain Adaptation (PDA) is a practical and general domain adaptation scenario, which relaxes the fully shared label space assumption such that the source label space subsumes the target one. The key challenge of PDA is the issue of negative transfer caused by source-only classes. For videos, such negative transfer could be triggered by both spatial and temporal features, which leads to a more challenging Partial Video Domain Adaptation (PVDA) problem. In this paper, we propose a novel Partial Adversarial Temporal Attentive Network (PATAN) to address the PVDA problem by utilizing both spatial and temporal features for filtering source-only classes. Besides, PATAN constructs effective overall temporal features by attending to local temporal features that contribute more toward the class filtration process. We further introduce new benchmarks to facilitate research on PVDA problems, covering a wide range of PVDA scenarios. 
Empirical results demonstrate the state-of-the-art performance of our proposed PATAN across the multiple PVDA benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Partial_Video_Domain_Adaptation_With_Partial_Adversarial_Temporal_Attentive_Network_ICCV_2021_paper.pdf", @@ -30603,7 +32677,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0+0;0+0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Yuecong and Yang,\n Jianfei and Cao,\n Haozhi and Chen,\n Zhenghua and Li,\n Qi and Mao,\n Kezhi\n},\n title = {\n Partial Video Domain Adaptation With Partial Adversarial Temporal Attentive Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9332-9341\n} \n}" }, { "title": "Partner-Assisted Learning for Few-Shot Image Classification", @@ -30611,6 +32686,7 @@ "status": "Poster", "track": "main", "pid": 8070, + "author_site": "Jiawei Ma; Hanchen Xie; Guangxing Han; Shih-Fu Chang; Aram Galstyan; Wael Abd-Almageed", "author": "Jiawei Ma; Hanchen Xie; Guangxing Han; Shih-Fu Chang; Aram Galstyan; Wael Abd-Almageed", "abstract": "Few-shot Learning has been studied to mimic human visual capabilities and learn effective models without the need of exhaustive human annotation. Even though the idea of meta-learning for adaptation has dominated the few-shot learning methods, how to train a feature extractor is still a challenge. In this paper, we focus on the design of training strategy to obtain an elemental representation such that the prototype of each novel class can be estimated from a few labeled samples. 
We propose a two-stage training scheme, Partner-Assisted Learning (PAL), which first trains a partner encoder to model pair-wise similarities and extract features serving as soft-anchors, and then trains a main encoder by aligning its outputs with soft-anchors while attempting to maximize classification performance. Two alignment constraints from logit-level and feature-level are designed individually. For each few-shot task, we perform prototype classification. Our method consistently outperforms the state-of-the-art method on four benchmarks. Detailed ablation studies of PAL are provided to justify the selection of each component involved in training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ma_Partner-Assisted_Learning_for_Few-Shot_Image_Classification_ICCV_2021_paper.pdf", @@ -30634,7 +32710,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ma_2021_ICCV,\n \n author = {\n Ma,\n Jiawei and Xie,\n Hanchen and Han,\n Guangxing and Chang,\n Shih-Fu and Galstyan,\n Aram and Abd-Almageed,\n Wael\n},\n title = {\n Partner-Assisted Learning for Few-Shot Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10573-10582\n} \n}" }, { "title": "Patch Craft: Video Denoising by Deep Modeling and Patch Matching", @@ -30642,6 +32719,7 @@ "status": "Poster", "track": "main", "pid": 7804, + "author_site": "Gregory Vaksman; Michael Elad; Peyman Milanfar", "author": "Gregory Vaksman; Michael Elad; Peyman Milanfar", "abstract": "The non-local self-similarity property of natural images has been exploited extensively for solving various image processing problems. 
When it comes to video sequences, harnessing this force is even more beneficial due to the temporal redundancy. In the context of image and video denoising, many classically-oriented algorithms employ self-similarity, splitting the data into overlapping patches, gathering groups of similar ones and processing these together somehow. With the emergence of convolutional neural networks (CNN), the patch-based framework has been abandoned. Most CNN denoisers operate on the whole image, leveraging non-local relations only implicitly by using a large receptive field. This work proposes a novel approach for leveraging self-similarity in the context of video denoising, while still relying on a regular convolutional architecture. We introduce a concept of patch-craft frames - artificial frames that are similar to the real ones, built by tiling matched patches. Our algorithm augments video sequences with patch-craft frames and feeds them to a CNN. We demonstrate the substantial boost in denoising performance obtained with the proposed approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vaksman_Patch_Craft_Video_Denoising_by_Deep_Modeling_and_Patch_Matching_ICCV_2021_paper.pdf", @@ -30658,14 +32736,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Vaksman_Patch_Craft_Video_Denoising_by_Deep_Modeling_and_Patch_Matching_ICCV_2021_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "Technion;Google", + "aff_unique_norm": "The Technion;Google", "aff_unique_dep": "CS Department;Google Research", "aff_unique_url": "https://www.technion.ac.il;https://research.google", "aff_unique_abbr": "Technion;Google Research", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Vaksman_2021_ICCV,\n \n author = {\n Vaksman,\n Gregory and Elad,\n Michael 
and Milanfar,\n Peyman\n},\n title = {\n Patch Craft: Video Denoising by Deep Modeling and Patch Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2157-2166\n} \n}" }, { "title": "Patch2CAD: Patchwise Embedding Learning for In-the-Wild Shape Retrieval From a Single Image", @@ -30673,6 +32752,7 @@ "status": "Poster", "track": "main", "pid": 9152, + "author_site": "Weicheng Kuo; Anelia Angelova; Tsung-Yi Lin; Angela Dai", "author": "Weicheng Kuo; Anelia Angelova; Tsung-Yi Lin; Angela Dai", "abstract": "3D perception of object shapes from RGB image input is fundamental towards semantic scene understanding, grounding image-based perception in our spatially 3-dimensional real-world environments. To achieve a mapping between image views of objects and 3D shapes, we leverage CAD model priors from existing large-scale databases, and propose a novel approach towards constructing a joint embedding space between 2D images and 3D CAD models in a patch-wise fashion -- establishing correspondences between patches of an image view of an object and patches of CAD geometry. This enables part similarity reasoning for retrieving similar CADs to a new image view without exact matches in the database. Our patch embedding provides more robust CAD retrieval for shape estimation in our end-to-end estimation of CAD model shape and pose for detected objects in a single input image. 
Experiments on in-the-wild, complex imagery from ScanNet show that our approach is more robust than state of the art in real-world scenarios without any exact CAD matches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kuo_Patch2CAD_Patchwise_Embedding_Learning_for_In-the-Wild_Shape_Retrieval_From_a_ICCV_2021_paper.pdf", @@ -30696,7 +32776,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Kuo_2021_ICCV,\n \n author = {\n Kuo,\n Weicheng and Angelova,\n Anelia and Lin,\n Tsung-Yi and Dai,\n Angela\n},\n title = {\n Patch2CAD: Patchwise Embedding Learning for In-the-Wild Shape Retrieval From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12589-12599\n} \n}" }, { "title": "PatchMatch-RL: Deep MVS With Pixelwise Depth, Normal, and Visibility", @@ -30704,6 +32785,7 @@ "status": "Poster", "track": "main", "pid": 3244, + "author_site": "Jae Yong Lee; Joseph DeGol; Chuhang Zou; Derek Hoiem", "author": "Jae Yong Lee; Joseph DeGol; Chuhang Zou; Derek Hoiem", "abstract": "Recent learning-based multi-view stereo (MVS) methods show excellent performance with dense cameras and small depth ranges. However, non-learning based approaches still outperform for scenes with large depth ranges and sparser wide-baseline views, in part due to their PatchMatch optimization over pixelwise estimates of depth, normals, and visibility. In this paper, we propose an end-to-end trainable PatchMatch-based MVS approach that combines advantages of trainable costs and regularizations with pixelwise estimates. 
To overcome the challenge of the non-differentiable PatchMatch optimization that involves iterative sampling and hard decisions, we use reinforcement learning to minimize expected photometric cost and maximize likelihood of ground truth depth and normals. We incorporate normal estimation by using dilated patch kernels, and propose a recurrent cost regularization that applies beyond frontal plane-sweep algorithms to our pixelwise depth/normal estimates. We evaluate our method on widely used MVS benchmarks, ETH3D and Tanks and Temples (TnT), and compare to other state of the art learning based MVS models. On ETH3D, our method outperforms other recent learning-based approaches and performs comparably on advanced TnT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_PatchMatch-RL_Deep_MVS_With_Pixelwise_Depth_Normal_and_Visibility_ICCV_2021_paper.pdf", @@ -30720,14 +32802,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_PatchMatch-RL_Deep_MVS_With_Pixelwise_Depth_Normal_and_Visibility_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft;Amazon", - "aff_unique_dep": ";Microsoft Corporation;Amazon Go", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Microsoft Corporation;Amazon", + "aff_unique_dep": ";;Amazon Go", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com;https://www.amazon.com", "aff_unique_abbr": "UIUC;Microsoft;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Jae Yong and DeGol,\n Joseph and Zou,\n Chuhang and Hoiem,\n Derek\n},\n title = {\n PatchMatch-RL: Deep MVS With Pixelwise Depth,\n Normal,\n and Visibility\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6158-6167\n} \n}" }, { "title": "Pathdreamer: A World Model for Indoor Navigation", @@ -30735,6 +32818,7 @@ "status": "Poster", "track": "main", "pid": 9191, + "author_site": "Jing Yu Koh; Honglak Lee; Yinfei Yang; Jason Baldridge; Peter Anderson", "author": "Jing Yu Koh; Honglak Lee; Yinfei Yang; Jason Baldridge; Peter Anderson", "abstract": "People navigating in unfamiliar buildings take advantage of myriad visual, spatial and semantic cues to efficiently achieve their navigation goals. Towards equipping computational agents with similar capabilities, we introduce Pathdreamer, a visual world model for agents navigating in novel indoor environments. Given one or more previous visual observations, Pathdreamer generates plausible high-resolution 360deg visual observations (RGB, semantic segmentation and depth) for viewpoints that have not been visited, in buildings not seen during training. In regions of high uncertainty (e.g. predicting around corners, imagining the contents of an unseen room), Pathdreamer can predict diverse scenes, allowing an agent to sample multiple realistic outcomes for a given trajectory. We demonstrate that Pathdreamer encodes useful and accessible visual, spatial and semantic knowledge about human environments by using it in the downstream task of Vision-and-Language Navigation (VLN). Specifically, we show that planning ahead with Pathdreamer brings about half the benefit of looking ahead at actual observations from unobserved parts of the environment. 
We hope that Pathdreamer will help unlock model-based approaches to challenging embodied navigation tasks such as navigating to specified objects and VLN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Koh_Pathdreamer_A_World_Model_for_Indoor_Navigation_ICCV_2021_paper.pdf", @@ -30758,7 +32842,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Koh_2021_ICCV,\n \n author = {\n Koh,\n Jing Yu and Lee,\n Honglak and Yang,\n Yinfei and Baldridge,\n Jason and Anderson,\n Peter\n},\n title = {\n Pathdreamer: A World Model for Indoor Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14738-14748\n} \n}" }, { "title": "Perception-Aware Multi-Sensor Fusion for 3D LiDAR Semantic Segmentation", @@ -30766,6 +32851,7 @@ "status": "Poster", "track": "main", "pid": 3293, + "author_site": "Zhuangwei Zhuang; Rong Li; Kui Jia; Qicheng Wang; Yuanqing Li; Mingkui Tan", "author": "Zhuangwei Zhuang; Rong Li; Kui Jia; Qicheng Wang; Yuanqing Li; Mingkui Tan", "abstract": "3D LiDAR (light detection and ranging) semantic segmentation is important in scene understanding for many applications, such as auto-driving and robotics. For example, for autonomous cars equipped with RGB cameras and LiDAR, it is crucial to fuse complementary information from different sensors for robust and accurate segmentation. Existing fusion-based methods, however, may not achieve promising performance due to the vast difference between the two modalities. 
In this work, we investigate a collaborative fusion scheme called perception-aware multi-sensor fusion (PMF) to exploit perceptual information from two modalities, namely, appearance information from RGB images and spatio-depth information from point clouds. To this end, we first project point clouds to the camera coordinates to provide spatio-depth information for RGB images. Then, we propose a two-stream network to extract features from the two modalities, separately, and fuse the features by effective residual-based fusion modules. Moreover, we propose additional perception-aware losses to measure the perceptual difference between the two modalities. Extensive experiments on two benchmark data sets show the superiority of our method. For example, on nuScenes, our PMF outperforms the state-of-the-art method by 0.8% in mIoU.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhuang_Perception-Aware_Multi-Sensor_Fusion_for_3D_LiDAR_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -30789,7 +32875,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhuang_2021_ICCV,\n \n author = {\n Zhuang,\n Zhuangwei and Li,\n Rong and Jia,\n Kui and Wang,\n Qicheng and Li,\n Yuanqing and Tan,\n Mingkui\n},\n title = {\n Perception-Aware Multi-Sensor Fusion for 3D LiDAR Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16280-16290\n} \n}" }, { "title": "Perceptual Variousness Motion Deblurring With Light Global Context Refinement", @@ -30797,6 +32884,7 @@ "status": "Poster", "track": "main", "pid": 8839, + "author_site": "Jichun Li; Weimin Tan; Bo Yan", "author": "Jichun Li; Weimin Tan; Bo Yan", "abstract": "Deep learning algorithms have made significant progress in dynamic scene 
deblurring. However, several challenges are still unsettled: 1) The degree and scale of blur in different regions of a blurred image can have a considerable variation in a large range. However, the traditional input pyramid or downscaling-upscaling, is designed to have limited and inflexible perceptual variousness to cope with large blur scale variation. 2) The nonlocal block is proved to be effective in the image enhancement tasks, but it requires high computation and memory cost. In this paper, we are the first to propose a light-weight globally-analyzing module into the image deblurring field, named Light Global Context Refinement (LGCR) module. With exponentially lower cost, it achieves even better performance than the nonlocal unit. Moreover, we propose the Perceptual Variousness Block (PVB) and PVB-piling strategy. By placing PVB repeatedly, the whole method possesses abundant reception field spectrum to be aware of the blur with various degrees and scales. Comprehensive experimental results from the different benchmarks and assessment metrics show that our method achieves excellent performance to set a new state-of-the-art in motion deblurring.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Perceptual_Variousness_Motion_Deblurring_With_Light_Global_Context_Refinement_ICCV_2021_paper.pdf", @@ -30820,7 +32908,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Jichun and Tan,\n Weimin and Yan,\n Bo\n},\n title = {\n Perceptual Variousness Motion Deblurring With Light Global Context Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4116-4125\n} \n}" }, { "title": "Persistent Homology Based Graph Convolution Network for 
Fine-Grained 3D Shape Segmentation", @@ -30828,6 +32917,7 @@ "status": "Poster", "track": "main", "pid": 6896, + "author_site": "Chi-Chong Wong; Chi-Man Vong", "author": "Chi-Chong Wong; Chi-Man Vong", "abstract": "Fine-grained 3D segmentation is an important task in 3D object understanding, especially in applications such as intelligent manufacturing or parts analysis for 3D objects. However, many challenges involved in such problem are yet to be solved, such as i) interpreting the complex structures located in different regions for 3D objects; ii) capturing fine-grained structures with sufficient topology correctness. Current deep learning and graph machine learning methods fail to tackle such challenges and thus provide inferior performance in fine-grained 3D analysis. In this work, methods in topological data analysis are incorporated with geometric deep learning model for the task of fine-grained segmentation for 3D objects. We propose a novel neural network model called Persistent Homology based Graph Convolution Network (PHGCN), which i) integrates persistent homology into graph convolution network to capture multi-scale structural information that can accurately represent complex structures for 3D objects; ii) applies a novel Persistence Diagram Loss that provides sufficient topology correctness for segmentation over the fine-grained structures. 
Extensive experiments on fine-grained 3D segmentation validate the effectiveness of the proposed PHGCN model and show significant improvements over current state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wong_Persistent_Homology_Based_Graph_Convolution_Network_for_Fine-Grained_3D_Shape_ICCV_2021_paper.pdf", @@ -30851,7 +32941,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Macau SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wong_2021_ICCV,\n \n author = {\n Wong,\n Chi-Chong and Vong,\n Chi-Man\n},\n title = {\n Persistent Homology Based Graph Convolution Network for Fine-Grained 3D Shape Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7098-7107\n} \n}" }, { "title": "Personalized Image Semantic Segmentation", @@ -30859,6 +32950,7 @@ "status": "Poster", "track": "main", "pid": 6768, + "author_site": "Yu Zhang; Chang-Bin Zhang; Peng-Tao Jiang; Ming-Ming Cheng; Feng Mao", "author": "Yu Zhang; Chang-Bin Zhang; Peng-Tao Jiang; Ming-Ming Cheng; Feng Mao", "abstract": "Semantic segmentation models trained on public datasets have achieved great success in recent years. However, these models didn't consider the personalization issue of segmentation though it is important in practice. In this paper, we address the problem of personalized image segmentation. The objective is to generate more accurate segmentation results on unlabeled personalized images by investigating the data's personalized traits. To open up future research in this area, we collect a large dataset containing various users' personalized images called PSS (Personalized Semantic Segmentation). We also survey some recent researches related to this problem and report their performance on our dataset. 
Furthermore, by observing the correlation among a user's personalized images, we propose a baseline method that incorporates the inter-image context when segmenting certain images. Extensive experiments show that our method outperforms the existing methods on the proposed dataset. The code and the PSS dataset are available at https://mmcheng.net/pss/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Personalized_Image_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -30882,7 +32974,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yu and Zhang,\n Chang-Bin and Jiang,\n Peng-Tao and Cheng,\n Ming-Ming and Mao,\n Feng\n},\n title = {\n Personalized Image Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10549-10559\n} \n}" }, { "title": "Personalized Trajectory Prediction via Distribution Discrimination", @@ -30890,6 +32983,7 @@ "status": "Poster", "track": "main", "pid": 2586, + "author_site": "Guangyi Chen; Junlong Li; Nuoxing Zhou; Liangliang Ren; Jiwen Lu", "author": "Guangyi Chen; Junlong Li; Nuoxing Zhou; Liangliang Ren; Jiwen Lu", "abstract": "Trajectory prediction is confronted with the dilemma to capture the multi-modal nature of future dynamics with both diversity and accuracy. In this paper, we propose a distribution discrimination method (DisDis) to predict personalized motion pattern by distinguishing the potential distributions in a self-supervised manner. The key motivation of DisDis is the observation that the motion pattern of each person is personalized due to his/her habit, character, or goal. 
Specifically, we learn the latent distribution to represent different motion patterns and optimize it by contrastive discrimination. The contrastive distribution discrimination encourages latent distributions to be discriminative. Our method could be seamlessly integrated with existing multi-modal stochastic predictive models as a plug-and-play module to learn the more discriminative latent distribution. To evaluate the latent distribution, we further propose a new metric, probability cumulative minimum distance (PCMD) curve, which cumulatively calculates the minimum distance on the sorted probabilities. Experimental results on the ETH and UCY datasets show the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Personalized_Trajectory_Prediction_via_Distribution_Discrimination_ICCV_2021_paper.pdf", @@ -30913,7 +33007,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Guangyi and Li,\n Junlong and Zhou,\n Nuoxing and Ren,\n Liangliang and Lu,\n Jiwen\n},\n title = {\n Personalized Trajectory Prediction via Distribution Discrimination\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15580-15589\n} \n}" }, { "title": "Personalized and Invertible Face De-Identification by Disentangled Identity Information Manipulation", @@ -30921,6 +33016,7 @@ "status": "Poster", "track": "main", "pid": 9676, + "author_site": "Jingyi Cao; Bo Liu; Yunqian Wen; Rong Xie; Li Song", "author": "Jingyi Cao; Bo Liu; Yunqian Wen; Rong Xie; Li Song", "abstract": "The popularization of intelligent devices including smartphones and surveillance cameras results in more serious privacy issues. 
De-identification is regarded as an effective tool for visual privacy protection with the process of concealing or replacing identity information. Most of the existing de-identification methods suffer from some limitations since they mainly focus on the protection process and are usually non-reversible. In this paper, we propose a personalized and invertible de-identification method based on the deep generative model, where the main idea is introducing a user-specific password and an adjustable parameter to control the direction and degree of identity variation. Extensive experiments demonstrate the effectiveness and generalization of our proposed framework for both face de-identification and recovery.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_Personalized_and_Invertible_Face_De-Identification_by_Disentangled_Identity_Information_Manipulation_ICCV_2021_paper.pdf", @@ -30944,7 +33040,8 @@ "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Shanghai;Sydney", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Jingyi and Liu,\n Bo and Wen,\n Yunqian and Xie,\n Rong and Song,\n Li\n},\n title = {\n Personalized and Invertible Face De-Identification by Disentangled Identity Information Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3334-3342\n} \n}" }, { "title": "Perturbed Self-Distillation: Weakly Supervised Large-Scale Point Cloud Semantic Segmentation", @@ -30952,6 +33049,7 @@ "status": "Poster", "track": "main", "pid": 9518, + "author_site": "Yachao Zhang; Yanyun Qu; Yuan Xie; Zonghao Li; Shanshan Zheng; Cuihua Li", "author": "Yachao Zhang; Yanyun Qu; Yuan Xie; Zonghao Li; Shanshan Zheng; Cuihua Li", "abstract": "Large-scale point cloud semantic segmentation 
has wide applications. Current popular researches mainly focus on fully supervised learning which demands expensive and tedious manual point-wise annotation. Weakly supervised learning is an alternative way to avoid this exhausting annotation. However, for large-scale point clouds with few labeled points, the network is difficult to extract discriminative features for unlabeled points, as well as the regularization of topology between labeled and unlabeled points is usually ignored, resulting in incorrect segmentation results. To address this problem, we propose a perturbed self-distillation (PSD) framework. Specifically, inspired by self-supervised learning, we construct the perturbed branch and enforce the predictive consistency among the perturbed branch and original branch. In this way, the graph topology of the whole point cloud can be effectively established by the introduced auxiliary supervision, such that the information propagation between the labeled and unlabeled points will be realized. Besides point-level supervision, we present a well-integrated context-aware module to explicitly regularize the affinity correlation of labeled points. Therefore, the graph topology of the point cloud can be further refined. 
The experimental results evaluated on three large-scale datasets show the large gain (3.0% on average) against recent weakly supervised methods and comparable results to some fully supervised methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Perturbed_Self-Distillation_Weakly_Supervised_Large-Scale_Point_Cloud_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -30975,7 +33073,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yachao and Qu,\n Yanyun and Xie,\n Yuan and Li,\n Zonghao and Zheng,\n Shanshan and Li,\n Cuihua\n},\n title = {\n Perturbed Self-Distillation: Weakly Supervised Large-Scale Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15520-15528\n} \n}" }, { "title": "Photon-Starved Scene Inference Using Single Photon Cameras", @@ -30983,6 +33082,7 @@ "status": "Poster", "track": "main", "pid": 4042, + "author_site": "Bhavya Goyal; Mohit Gupta", "author": "Bhavya Goyal; Mohit Gupta", "abstract": "Scene understanding under low-light conditions is a challenging problem. This is due to the small number of photons captured by the camera and the resulting low signal-to-noise ratio (SNR). Single-photon cameras (SPCs) are an emerging sensing modality that are capable of capturing images with high sensitivity. Despite having minimal read-noise, images captured by SPCs in photon-starved conditions still suffer from strong shot noise, preventing reliable scene inference. We propose photon scale-space -- a collection of high-SNR images spanning a wide range of photons-per-pixel (PPP) levels (but same scene content) as guides to train inference model on low photon flux images. 
We develop training techniques that push images with different illumination levels closer to each other in feature representation space. The key idea is that having a spectrum of different brightness levels during training enables effective guidance, and increases robustness to shot noise even in extreme noise cases. Based on the proposed approach, we demonstrate, via simulations and real experiments with a SPAD camera, high-performance on various inference tasks such as image classification and monocular depth estimation under ultra low-light, down to <1 PPP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Goyal_Photon-Starved_Scene_Inference_Using_Single_Photon_Cameras_ICCV_2021_paper.pdf", @@ -31006,7 +33106,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Goyal_2021_ICCV,\n \n author = {\n Goyal,\n Bhavya and Gupta,\n Mohit\n},\n title = {\n Photon-Starved Scene Inference Using Single Photon Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2512-2521\n} \n}" }, { "title": "Physics-Based Differentiable Depth Sensor Simulation", @@ -31014,6 +33115,7 @@ "status": "Poster", "track": "main", "pid": 6240, + "author_site": "Benjamin Planche; Rajat Vikram Singh", "author": "Benjamin Planche; Rajat Vikram Singh", "abstract": "Gradient-based algorithms are crucial to modern computer-vision and graphics applications, enabling learning-based optimization and inverse problems. For example, photorealistic differentiable rendering pipelines for color images have been proven highly valuable to applications aiming to map 2D and 3D domains. 
However, to the best of our knowledge, no effort has been made so far towards extending these gradient-based methods to the generation of depth (2.5D) images, as simulating structured-light depth sensors implies solving complex light transport and stereo-matching problems. In this paper, we introduce a novel end-to-end differentiable simulation pipeline for the generation of realistic 2.5D scans, built on physics-based 3D rendering and custom block-matching algorithms. Each module can be differentiated w.r.t sensor and scene parameters; e.g., to automatically tune the simulation for new devices over some provided scans or to leverage the pipeline as a 3D-to-2.5D transformer within larger computer-vision applications. Applied to the training of deep-learning methods for various depth-based recognition tasks (classification, pose estimation, semantic segmentation), our simulation greatly improves the performance of the resulting models on real scans, thereby demonstrating the fidelity and value of its synthetic depth data compared to previous static simulations and learning-based domain adaptation schemes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Planche_Physics-Based_Differentiable_Depth_Sensor_Simulation_ICCV_2021_paper.pdf", @@ -31030,14 +33132,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Planche_Physics-Based_Differentiable_Depth_Sensor_Simulation_ICCV_2021_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "Siemens AG;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "Siemens AG;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.siemens.com;https://www.nvidia.com", "aff_unique_abbr": "Siemens;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Planche_2021_ICCV,\n \n 
author = {\n Planche,\n Benjamin and Singh,\n Rajat Vikram\n},\n title = {\n Physics-Based Differentiable Depth Sensor Simulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14387-14397\n} \n}" }, { "title": "Physics-Based Human Motion Estimation and Synthesis From Videos", @@ -31045,6 +33148,7 @@ "status": "Poster", "track": "main", "pid": 9477, + "author_site": "Kevin Xie; Tingwu Wang; Umar Iqbal; Yunrong Guo; Sanja Fidler; Florian Shkurti", "author": "Kevin Xie; Tingwu Wang; Umar Iqbal; Yunrong Guo; Sanja Fidler; Florian Shkurti", "abstract": "Human motion synthesis is an important problem for applications in graphics and gaming, and even in simulation environments for robotics. Existing methods require accurate motion capture data for training, which is costly to obtain. Instead, we propose a framework for training generative models of physically plausible human motion directly from monocular RGB videos, which are much more widely available. At the core of our method is a novel optimization formulation that aims to correct imperfect image-based pose estimations by enforcing physics constraints and reasons about contacts in a differentiable way. This optimization yields corrected 3D poses and motions, as well as their corresponding contact forces. Results show that our physically-correct motions significantly outperform prior work on pose estimation. We then train a generative model to synthesize both future motion and contact forces. We demonstrate both qualitatively and quantitatively significantly improved motion synthesis quality and physical plausibility achieved by our method on the large scale Human3.6m dataset as compared to prior learning-based kinematic and physics-based methods. 
By learning directly from video, our method paves the way for large-scale, realistic and diverse motion synthesis not previously possible.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_Physics-Based_Human_Motion_Estimation_and_Synthesis_From_Videos_ICCV_2021_paper.pdf", @@ -31061,14 +33165,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xie_Physics-Based_Human_Motion_Estimation_and_Synthesis_From_Videos_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;1;1;0+1;0", - "aff_unique_norm": "University of Toronto;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "University of Toronto;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://www.nvidia.com", "aff_unique_abbr": "U of T;NVIDIA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0+1;0+1;1;1;0+1;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Kevin and Wang,\n Tingwu and Iqbal,\n Umar and Guo,\n Yunrong and Fidler,\n Sanja and Shkurti,\n Florian\n},\n title = {\n Physics-Based Human Motion Estimation and Synthesis From Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11532-11541\n} \n}" }, { "title": "Physics-Enhanced Machine Learning for Virtual Fluorescence Microscopy", @@ -31076,6 +33181,7 @@ "status": "Poster", "track": "main", "pid": 6268, + "author_site": "Colin L. Cooke; Fanjie Kong; Amey Chaware; Kevin C. Zhou; Kanghyun Kim; Rong Xu; D. Michael Ando; Samuel J. Yang; Pavan Chandra Konda; Roarke Horstmeyer", "author": "Colin L. Cooke; Fanjie Kong; Amey Chaware; Kevin C. Zhou; Kanghyun Kim; Rong Xu; D. Michael Ando; Samuel J. 
Yang; Pavan Chandra Konda; Roarke Horstmeyer", "abstract": "This paper introduces a new method of data-driven microscope design for virtual fluorescence microscopy. We use a deep neural network (DNN) to effectively design optical patterns for specimen illumination that substantially improve upon the ability to infer fluorescence image information from unstained microscope images. To achieve this design, we include an illumination model within the DNN's first layers that is jointly optimized during network training. We validated our method on two different experimental setups, with different magnifications and sample types, to show a consistent improvement in performance as compared to conventional microscope imaging methods. Additionally, to understand the importance of learned illumination on the inference task, we varied the number of illumination patterns being optimized (and thus the number of unique images captured) and analyzed how the structure of the patterns changed as their number increased. This work demonstrates the power of programmable optical elements at enabling better machine learning algorithm performance and at providing physical insight into next generation of machine-controlled imaging systems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cooke_Physics-Enhanced_Machine_Learning_for_Virtual_Fluorescence_Microscopy_ICCV_2021_paper.pdf", @@ -31099,7 +33205,8 @@ "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cooke_2021_ICCV,\n \n author = {\n Cooke,\n Colin L. and Kong,\n Fanjie and Chaware,\n Amey and Zhou,\n Kevin C. and Kim,\n Kanghyun and Xu,\n Rong and Ando,\n D. Michael and Yang,\n Samuel J. 
and Konda,\n Pavan Chandra and Horstmeyer,\n Roarke\n},\n title = {\n Physics-Enhanced Machine Learning for Virtual Fluorescence Microscopy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3803-3813\n} \n}" }, { "title": "Pi-NAS: Improving Neural Architecture Search by Reducing Supernet Training Consistency Shift", @@ -31107,6 +33214,7 @@ "status": "Poster", "track": "main", "pid": 6080, + "author_site": "Jiefeng Peng; Jiqi Zhang; Changlin Li; Guangrun Wang; Xiaodan Liang; Liang Lin", "author": "Jiefeng Peng; Jiqi Zhang; Changlin Li; Guangrun Wang; Xiaodan Liang; Liang Lin", "abstract": "Recently proposed neural architecture search (NAS) methods co-train billions of architectures in a supernet and estimate their potential accuracy using the network weights detached from the supernet. However, the ranking correlation between the architectures' predicted accuracy and their actual capability is incorrect, which causes the existing NAS methods' dilemma. We attribute this ranking correlation problem to the supernet training consistency shift, including feature shift and parameter shift. Feature shift is identified as dynamic input distributions of a hidden layer due to random path sampling. The input distribution dynamic affects the loss descent and finally affects architecture ranking. Parameter shift is identified as contradictory parameter updates for a shared layer lay in different paths in different training steps. The rapidly-changing parameter could not preserve architecture ranking. We address these two shifts simultaneously using a nontrivial supernet-\\Pi model, called \\Pi-NAS. Specifically, we employ a supernet-\\Pi model that contains cross-path learning to reduce the feature consistency shift between different paths. 
Meanwhile, we adopt a novel nontrivial mean teacher containing negative samples to overcome parameter shift and model collision. Furthermore, our \\Pi-NAS runs in an unsupervised manner, which can search for more transferable architectures. Extensive experiments on ImageNet and a wide range of downstream tasks (e.g., COCO 2017, ADE20K, and Cityscapes) demonstrate the effectiveness and universality of our \\Pi-NAS compared to supervised NAS. See Codes: https://github.com/Ernie1/Pi-NAS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Peng_Pi-NAS_Improving_Neural_Architecture_Search_by_Reducing_Supernet_Training_Consistency_ICCV_2021_paper.pdf", @@ -31130,7 +33238,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;2;3;0;0", - "aff_country_unique": "China;United States;Australia;United Kingdom" + "aff_country_unique": "China;United States;Australia;United Kingdom", + "bibtex": "@InProceedings{Peng_2021_ICCV,\n \n author = {\n Peng,\n Jiefeng and Zhang,\n Jiqi and Li,\n Changlin and Wang,\n Guangrun and Liang,\n Xiaodan and Lin,\n Liang\n},\n title = {\n Pi-NAS: Improving Neural Architecture Search by Reducing Supernet Training Consistency Shift\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12354-12364\n} \n}" }, { "title": "Pixel Contrastive-Consistent Semi-Supervised Semantic Segmentation", @@ -31138,6 +33247,7 @@ "status": "Poster", "track": "main", "pid": 9394, + "author_site": "Yuanyi Zhong; Bodi Yuan; Hong Wu; Zhiqiang Yuan; Jian Peng; Yu-Xiong Wang", "author": "Yuanyi Zhong; Bodi Yuan; Hong Wu; Zhiqiang Yuan; Jian Peng; Yu-Xiong Wang", "abstract": "We present a novel semi-supervised semantic segmentation method which jointly achieves two desiderata of segmentation model regularities: the label-space consistency property between image augmentations and the feature-space contrastive 
property among different pixels. We leverage the pixel-level L2 loss and the pixel contrastive loss for the two purposes respectively. To address the computational efficiency issue and the false negative noise issue involved in the pixel contrastive loss, we further introduce and investigate several negative sampling techniques. Extensive experiments demonstrate the state-of-the-art performance of our method (PC2Seg) with the DeepLab-v3+ architecture, in several challenging semi-supervised settings derived from the VOC, Cityscapes, and COCO datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhong_Pixel_Contrastive-Consistent_Semi-Supervised_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -31154,14 +33264,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhong_Pixel_Contrastive-Consistent_Semi-Supervised_Semantic_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;X Development LLC", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;X Development LLC", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://xdevllc.com", "aff_unique_abbr": "UIUC;X", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhong_2021_ICCV,\n \n author = {\n Zhong,\n Yuanyi and Yuan,\n Bodi and Wu,\n Hong and Yuan,\n Zhiqiang and Peng,\n Jian and Wang,\n Yu-Xiong\n},\n title = {\n Pixel Contrastive-Consistent Semi-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7273-7282\n} \n}" }, { "title": "Pixel Difference Networks for Efficient Edge Detection", @@ -31169,7 +33280,8 @@ "status": "Poster", "track": 
"main", "pid": 9187, - "author": "Zhuo Su; Wenzhe Liu; Zitong Yu; Dewen Hu; Qing Liao; Qi Tian; Matti Pietik\u00e4inen; Li Liu", + "author_site": "Zhuo Su; Wenzhe Liu; Zitong Yu; Dewen Hu; Qing Liao; Qi Tian; Matti Pietikäinen; Li Liu", + "author": "Zhuo Su; Wenzhe Liu; Zitong Yu; Dewen Hu; Qing Liao; Qi Tian; Matti Pietikäinen; Li Liu", "abstract": "Recently, deep Convolutional Neural Networks (CNNs) can achieve human-level performance in edge detection with the rich and abstract edge representation capacities. However, the high performance of CNN based edge detection is achieved with a large pretrained CNN backbone, which is memory and energy consuming. In addition, it is surprising that the previous wisdom from the traditional edge detectors, such as Canny, Sobel, and LBP are rarely investigated in the rapid-developing deep learning era. To address these issues, we propose a simple, lightweight yet effective architecture named Pixel Difference Network (PiDiNet) for efficient edge detection. Extensive experiments on BSDS500, NYUD, and Multicue are provided to demonstrate its effectiveness, and its high training and inference efficiency. Surprisingly, when training from scratch with only the BSDS500 and VOC datasets, PiDiNet can surpass the recorded result of human perception (0.807 vs. 0.803 in ODS F-measure) on the BSDS500 dataset with 100 FPS and less than 1M parameters. A faster version of PiDiNet with less than 0.1M parameters can still achieve comparable performance among state of the arts with 200 FPS. Results on the NYUD and Multicue datasets show similar observations. 
The codes are available at https://github.com/zhuoinoulu/pidinet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Su_Pixel_Difference_Networks_for_Efficient_Edge_Detection_ICCV_2021_paper.pdf", "aff": "Center for Machine Vision and Signal Analysis, University of Oulu, Finland; National University of Defense Technology, China; Center for Machine Vision and Signal Analysis, University of Oulu, Finland+National University of Defense Technology, China; National University of Defense Technology, China; Harbin Institute of Technology (Shenzhen), China; Xidian University, China; Center for Machine Vision and Signal Analysis, University of Oulu, Finland; Center for Machine Vision and Signal Analysis, University of Oulu, Finland+National University of Defense Technology, China", @@ -31192,7 +33304,8 @@ "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;0+1;1;1;1;0;0+1", - "aff_country_unique": "Finland;China" + "aff_country_unique": "Finland;China", + "bibtex": "@InProceedings{Su_2021_ICCV,\n \n author = {\n Su,\n Zhuo and Liu,\n Wenzhe and Yu,\n Zitong and Hu,\n Dewen and Liao,\n Qing and Tian,\n Qi and Pietik\\\"ainen,\n Matti and Liu,\n Li\n},\n title = {\n Pixel Difference Networks for Efficient Edge Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5117-5127\n} \n}" }, { "title": "Pixel-Perfect Structure-From-Motion With Featuremetric Refinement", @@ -31200,6 +33313,7 @@ "status": "Poster", "track": "main", "pid": 10386, + "author_site": "Philipp Lindenberger; Paul-Edouard Sarlin; Viktor Larsson; Marc Pollefeys", "author": "Philipp Lindenberger; Paul-Edouard Sarlin; Viktor Larsson; Marc Pollefeys", "abstract": "Finding local features that are repeatable across multiple views is a cornerstone of sparse 3D reconstruction.
The classical image matching paradigm detects keypoints per-image once and for all, which can yield poorly-localized features and propagate large errors to the final geometry. In this paper, we refine two key steps of structure-from-motion by a direct alignment of low-level image information from multiple views: we first adjust the initial keypoint locations prior to any geometric estimation, and subsequently refine points and camera poses as a post-processing. This refinement is robust to large detection noise and appearance changes, as it optimizes a featuremetric error based on dense features predicted by a neural network. This significantly improves the accuracy of camera poses and scene geometry for a wide range of keypoint detectors, challenging viewing conditions, and off-the-shelf deep features. Our system easily scales to large image collections, enabling pixel-perfect crowd-sourced localization at scale. Our code is publicly available at https://github.com/cvg/pixel-perfect-sfm as an add-on to the popular SfM software COLMAP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lindenberger_Pixel-Perfect_Structure-From-Motion_With_Featuremetric_Refinement_ICCV_2021_paper.pdf", @@ -31214,7 +33328,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lindenberger_Pixel-Perfect_Structure-From-Motion_With_Featuremetric_Refinement_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lindenberger_Pixel-Perfect_Structure-From-Motion_With_Featuremetric_Refinement_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lindenberger_2021_ICCV,\n \n author = {\n Lindenberger,\n Philipp and Sarlin,\n Paul-Edouard and Larsson,\n Viktor and Pollefeys,\n Marc\n},\n title = {\n Pixel-Perfect Structure-From-Motion With Featuremetric Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2021\n},\n pages = {\n 5987-5997\n} \n}" }, { "title": "PixelPyramids: Exact Inference Models From Lossless Image Pyramids", @@ -31222,6 +33337,7 @@ "status": "Poster", "track": "main", "pid": 6285, + "author_site": "Shweta Mahajan; Stefan Roth", "author": "Shweta Mahajan; Stefan Roth", "abstract": "Autoregressive models are a class of exact inference approaches with highly flexible functional forms, yielding state-of-the-art density estimates for natural images. Yet, the sequential ordering on the dimensions makes these models computationally expensive and limits their applicability to low-resolution imagery. In this work, we propose Pixel-Pyramids, a block-autoregressive approach employing a lossless pyramid decomposition with scale-specific representations to encode the joint distribution of image pixels. Crucially, it affords a sparser dependency structure compared to fully autoregressive approaches. Our PixelPyramids yield state-of-the-art results for density estimation on various image datasets, especially for high-resolution data. 
For CelebA-HQ 1024 x 1024, we observe that the density estimates (in terms of bits/dim) are improved to 44% of the baseline despite sampling speeds superior even to easily parallelizable flow-based models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mahajan_PixelPyramids_Exact_Inference_Models_From_Lossless_Image_Pyramids_ICCV_2021_paper.pdf", @@ -31238,14 +33354,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mahajan_PixelPyramids_Exact_Inference_Models_From_Lossless_Image_Pyramids_ICCV_2021_paper.html", "aff_unique_index": "0;0+1", - "aff_unique_norm": "Technische Universit\u00e4t Darmstadt;Hessian.AI", + "aff_unique_norm": "Technische Universität Darmstadt;hessian.AI", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.tu-darmstadt.de;https://www.hessian.ai", "aff_unique_abbr": "TU Darmstadt;hessian.AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Darmstadt;", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Mahajan_2021_ICCV,\n \n author = {\n Mahajan,\n Shweta and Roth,\n Stefan\n},\n title = {\n PixelPyramids: Exact Inference Models From Lossless Image Pyramids\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6639-6648\n} \n}" }, { "title": "PixelSynth: Generating a 3D-Consistent Experience From a Single Image", @@ -31253,6 +33370,7 @@ "status": "Poster", "track": "main", "pid": 1632, + "author_site": "Chris Rockwell; David F. Fouhey; Justin Johnson", "author": "Chris Rockwell; David F. Fouhey; Justin Johnson", "abstract": "Recent advancements in differentiable rendering and 3D reasoning have driven exciting results in novel view synthesis from a single image. Despite realistic results, methods are limited to relatively small view change. 
In order to synthesize immersive scenes, models must also be able to extrapolate. We present an approach that fuses 3D reasoning with autoregressive modeling to outpaint large view changes in a 3D-consistent manner, which enables scene synthesis. We demonstrate considerable improvement in single-image large-angle view synthesis results compared to a variety of methods and possible variants across simulated and real datasets. In addition, we show increased 3D consistency compared to alternative accumulation methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rockwell_PixelSynth_Generating_a_3D-Consistent_Experience_From_a_Single_Image_ICCV_2021_paper.pdf", @@ -31267,7 +33385,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rockwell_PixelSynth_Generating_a_3D-Consistent_Experience_From_a_Single_Image_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rockwell_PixelSynth_Generating_a_3D-Consistent_Experience_From_a_Single_Image_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rockwell_2021_ICCV,\n \n author = {\n Rockwell,\n Chris and Fouhey,\n David F. and Johnson,\n Justin\n},\n title = {\n PixelSynth: Generating a 3D-Consistent Experience From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14104-14113\n} \n}" }, { "title": "Planar Surface Reconstruction From Sparse Views", @@ -31275,6 +33394,7 @@ "status": "Poster", "track": "main", "pid": 4394, + "author_site": "Linyi Jin; Shengyi Qian; Andrew Owens; David F. Fouhey", "author": "Linyi Jin; Shengyi Qian; Andrew Owens; David F. Fouhey", "abstract": "The paper studies planar surface reconstruction of indoor scenes from two views with unknown camera poses. 
While prior approaches have successfully created object-centric reconstructions of many scenes, they fail to exploit other structures, such as planes, which are typically the dominant components of indoor scenes. In this paper, we reconstruct planar surfaces from multiple views, while jointly estimating camera pose. Our experiments demonstrate that our method is able to advance the state of the art of reconstruction from sparse views, on challenging scenes from Matterport3D.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jin_Planar_Surface_Reconstruction_From_Sparse_Views_ICCV_2021_paper.pdf", @@ -31298,7 +33418,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jin_2021_ICCV,\n \n author = {\n Jin,\n Linyi and Qian,\n Shengyi and Owens,\n Andrew and Fouhey,\n David F.\n},\n title = {\n Planar Surface Reconstruction From Sparse Views\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12991-13000\n} \n}" }, { "title": "PlaneTR: Structure-Guided Transformers for 3D Plane Recovery", @@ -31306,6 +33427,7 @@ "status": "Poster", "track": "main", "pid": 1178, + "author_site": "Bin Tan; Nan Xue; Song Bai; Tianfu Wu; Gui-Song Xia", "author": "Bin Tan; Nan Xue; Song Bai; Tianfu Wu; Gui-Song Xia", "abstract": "This paper presents a neural network built upon Transformers, namely PlaneTR, to simultaneously detect and reconstruct planes from a single image. Different from previous methods, PlaneTR jointly leverages the context information and the geometric structures in a sequence-to-sequence way to holistically detect plane instances in one forward pass. 
Specifically, we represent the geometric structures as line segments and conduct the network with three main components: (i) context and line segments encoders, (ii) a structure-guided plane decoder, (iii) a pixel-wise plane embedding decoder. Given an image and its detected line segments, PlaneTR generates the context and line segment sequences via two specially designed encoders and then feeds them into a Transformers-based decoder to directly predict a sequence of plane instances by simultaneously considering the context and global structure cues. Finally, the pixel-wise embeddings are computed to assign each pixel to one predicted plane instance which is nearest to it in embedding space. Comprehensive experiments demonstrate that PlaneTR achieves state-of-the-art performance on the ScanNet and NYUv2 datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tan_PlaneTR_Structure-Guided_Transformers_for_3D_Plane_Recovery_ICCV_2021_paper.pdf", @@ -31329,7 +33451,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Wuhan;", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tan_2021_ICCV,\n \n author = {\n Tan,\n Bin and Xue,\n Nan and Bai,\n Song and Wu,\n Tianfu and Xia,\n Gui-Song\n},\n title = {\n PlaneTR: Structure-Guided Transformers for 3D Plane Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4186-4195\n} \n}" }, { "title": "PlenOctrees for Real-Time Rendering of Neural Radiance Fields", @@ -31337,6 +33460,7 @@ "status": "Poster", "track": "main", "pid": 8084, + "author_site": "Alex Yu; Ruilong Li; Matthew Tancik; Hao Li; Ren Ng; Angjoo Kanazawa", "author": "Alex Yu; Ruilong Li; Matthew Tancik; Hao Li; Ren Ng; Angjoo Kanazawa", "abstract": "We introduce a method to render Neural Radiance Fields 
(NeRFs) in real time using PlenOctrees, an octree-based 3D representation which supports view-dependent effects. Our method can render 800x800 images at more than 150 FPS, which is over 3000 times faster than conventional NeRFs. We do so without sacrificing quality while preserving the ability of NeRFs to perform free-viewpoint rendering of scenes with arbitrary geometry and view-dependent effects. Real-time performance is achieved by pre-tabulating the NeRF into a PlenOctree. In order to preserve view-dependent effects such as specularities, we factorize the appearance via closed-form spherical basis functions. Specifically, we show that it is possible to train NeRFs to predict a spherical harmonic representation of radiance, removing the viewing direction as an input to the neural network. Furthermore, we show that PlenOctrees can be directly optimized to further minimize the reconstruction loss, which leads to equal or better quality compared to competing methods. Moreover, this octree optimization step can be used to reduce the training time, as we no longer need to wait for the NeRF training to converge fully. Our real-time neural rendering approach may potentially enable new applications such as 6-DOF industrial and product visualizations, as well as next generation AR/VR systems. 
PlenOctrees are amenable to in-browser rendering as well; please visit the project page for the interactive online demo, as well as video and code: https://alexyu.net/plenoctrees.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_PlenOctrees_for_Real-Time_Rendering_of_Neural_Radiance_Fields_ICCV_2021_paper.pdf", @@ -31351,7 +33475,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_PlenOctrees_for_Real-Time_Rendering_of_Neural_Radiance_Fields_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_PlenOctrees_for_Real-Time_Rendering_of_Neural_Radiance_Fields_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Alex and Li,\n Ruilong and Tancik,\n Matthew and Li,\n Hao and Ng,\n Ren and Kanazawa,\n Angjoo\n},\n title = {\n PlenOctrees for Real-Time Rendering of Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5752-5761\n} \n}" }, { "title": "PnP-DETR: Towards Efficient Visual Analysis With Transformers", @@ -31359,6 +33484,7 @@ "status": "Poster", "track": "main", "pid": 6159, + "author_site": "Tao Wang; Li Yuan; Yunpeng Chen; Jiashi Feng; Shuicheng Yan", "author": "Tao Wang; Li Yuan; Yunpeng Chen; Jiashi Feng; Shuicheng Yan", "abstract": "Recently, DETR pioneered the solution of vision tasks with transformers, it directly translates the image feature map into the object detection result. Though effective, translating the full feature map can be costly due to redundant computation on some area like the background. In this work, we encapsulate the idea of reducing spatial redundancy into a novel poll and pool (PnP) sampling module, with which we build an end-to-end PnP-DETR architecture that adaptively allocates its computation spatially to be more efficient. 
Concretely, the PnP module abstracts the image feature map into fine foreground object feature vectors and a small number of coarse background contextual feature vectors. The transformer models information interaction within the fine-coarse feature space and translates the features into the detection result. Moreover, the PnP-augmented model can instantly achieve various desired trade-offs between performance and computation with a single model by varying the sampled feature length, without requiring to train multiple models as existing methods. Thus it offers greater flexibility for deployment in diverse scenarios with varying computation constraint. We further validate the generalizability of the PnP module on panoptic segmentation and the recent transformer-based image recognition model ViT and show consistent efficiency gain. We believe our method makes a step for efficient visual analysis with transformers, wherein spatial redundancy is commonly observed. Code and models will be available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_PnP-DETR_Towards_Efficient_Visual_Analysis_With_Transformers_ICCV_2021_paper.pdf", @@ -31375,14 +33501,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_PnP-DETR_Towards_Efficient_Visual_Analysis_With_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "National University of Singapore;YITU Technology", + "aff_unique_norm": "National University of Singapore;Yitu Technology", "aff_unique_dep": "Institute of Data Science;", "aff_unique_url": "https://www.nus.edu.sg;https://www.yITU.cn", "aff_unique_abbr": "NUS;YITU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Tao and Yuan,\n Li and Chen,\n Yunpeng and Feng,\n Jiashi and Yan,\n 
Shuicheng\n},\n title = {\n PnP-DETR: Towards Efficient Visual Analysis With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4661-4670\n} \n}" }, { "title": "PoGO-Net: Pose Graph Optimization With Graph Neural Networks", @@ -31390,6 +33517,7 @@ "status": "Poster", "track": "main", "pid": 2646, + "author_site": "Xinyi Li; Haibin Ling", "author": "Xinyi Li; Haibin Ling", "abstract": "Accurate camera pose estimation or global camera re-localization is a core component in Structure-from-Motion (SfM) and SLAM systems. Given pair-wise relative camera poses, pose-graph optimization (PGO) involves solving for an optimized set of globally-consistent absolute camera poses. In this work, we propose a novel PGO scheme fueled by graph neural networks (GNN), namely PoGO-Net, to conduct the absolute camera pose regression leveraging multiple rotation averaging (MRA). Specifically, PoGO-Net takes a noisy view-graph as the input, where the nodes and edges are designed to encode the geometric constraints and local graph consistency. Besides, we address the outlier edge removal by exploiting an implicit edge-dropping scheme where the noisy or corrupted edges are effectively filtered out with parameterized networks. Furthermore, we introduce a joint loss function embedding MRA formulation such that the robust inference is capable of achieving real-time performances even for large-scale scenes. 
Our proposed network is trained end-to-end on public benchmarks, outperforming state-of-the-art approaches in extensive experiments that demonstrate the efficiency and robustness of our proposed network.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_PoGO-Net_Pose_Graph_Optimization_With_Graph_Neural_Networks_ICCV_2021_paper.pdf", @@ -31413,7 +33541,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "Sunnyvale;;Stony Brook", "aff_country_unique_index": "0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Xinyi and Ling,\n Haibin\n},\n title = {\n PoGO-Net: Pose Graph Optimization With Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5895-5905\n} \n}" }, { "title": "PoinTr: Diverse Point Cloud Completion With Geometry-Aware Transformers", @@ -31421,6 +33550,7 @@ "status": "Poster", "track": "main", "pid": 7661, + "author_site": "Xumin Yu; Yongming Rao; Ziyi Wang; Zuyan Liu; Jiwen Lu; Jie Zhou", "author": "Xumin Yu; Yongming Rao; Ziyi Wang; Zuyan Liu; Jiwen Lu; Jie Zhou", "abstract": "Point clouds captured in real-world applications are often incomplete due to the limited sensor resolution, single viewpoint, and occlusion. Therefore, recovering the complete point clouds from partial ones becomes an indispensable task in many practical applications. In this paper, we present a new method that reformulates point cloud completion as a set-to-set translation problem and design a new model, called PoinTr that adopts a transformer encoder-decoder architecture for point cloud completion. By representing the point cloud as a set of unordered groups of points with position embeddings, we convert the point cloud to a sequence of point proxies and employ the transformers for point cloud generation. 
To facilitate transformers to better leverage the inductive bias about 3D geometric structures of point clouds, we further devise a geometry-aware block that models the local geometric relationships explicitly. The migration of transformers enables our model to better learn structural knowledge and preserve detailed information for point cloud completion. Furthermore, we propose two more challenging benchmarks with more diverse incomplete point clouds that can better reflect the real-world scenarios to promote future research. Experimental results show that our method outperforms state-of-the-art methods by a large margin on both the new benchmarks and the existing ones.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_PoinTr_Diverse_Point_Cloud_Completion_With_Geometry-Aware_Transformers_ICCV_2021_paper.pdf", @@ -31444,7 +33574,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Xumin and Rao,\n Yongming and Wang,\n Ziyi and Liu,\n Zuyan and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n PoinTr: Diverse Point Cloud Completion With Geometry-Aware Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12498-12507\n} \n}" }, { "title": "Point Cloud Augmentation With Weighted Local Transformations", @@ -31452,6 +33583,7 @@ "status": "Poster", "track": "main", "pid": 11129, + "author_site": "Sihyeon Kim; Sanghyeok Lee; Dasol Hwang; Jaewon Lee; Seong Jae Hwang; Hyunwoo J. Kim", "author": "Sihyeon Kim; Sanghyeok Lee; Dasol Hwang; Jaewon Lee; Seong Jae Hwang; Hyunwoo J. Kim", "abstract": "Despite the extensive usage of point clouds in 3D vision, relatively limited data are available for training deep neural networks. 
Although data augmentation is a standard approach to compensate for the scarcity of data, it has been less explored in the point cloud literature. In this paper, we propose a simple and effective augmentation method called PointWOLF for point cloud augmentation. The proposed method produces smoothly varying non-rigid deformations by locally weighted transformations centered at multiple anchor points. The smooth deformations allow diverse and realistic augmentations. Furthermore, in order to minimize the manual efforts to search the optimal hyperparameters for augmentation, we present AugTune, which generates augmented samples of desired difficulties producing targeted confidence scores. Our experiments show that our framework consistently improves the performance for both shape classification and part segmentation tasks. In particular, with PointNet++, PointWOLF achieves the state-of-the-art 89.7 accuracy on shape classification with the real-world ScanObjectNN dataset. The code is available at https://github.com/mlvlab/PointWOLF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Point_Cloud_Augmentation_With_Weighted_Local_Transformations_ICCV_2021_paper.pdf", @@ -31475,7 +33607,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Sihyeon and Lee,\n Sanghyeok and Hwang,\n Dasol and Lee,\n Jaewon and Hwang,\n Seong Jae and Kim,\n Hyunwoo J.\n},\n title = {\n Point Cloud Augmentation With Weighted Local Transformations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 548-557\n} \n}" }, { "title": "Point Transformer", @@ -31483,6 +33616,7 @@ "status": "Poster", "track": "main", "pid": 2068, + "author_site": 
"Hengshuang Zhao; Li Jiang; Jiaya Jia; Philip H.S. Torr; Vladlen Koltun", "author": "Hengshuang Zhao; Li Jiang; Jiaya Jia; Philip H.S. Torr; Vladlen Koltun", "abstract": "Self-attention networks have revolutionized natural language processing and are making impressive strides in image analysis tasks such as image classification and object detection. Inspired by this success, we investigate the application of self-attention networks to 3D point cloud processing. We design self-attention layers for point clouds and use these to construct self-attention networks for tasks such as semantic scene segmentation, object part segmentation, and object classification. Our Point Transformer design improves upon prior work across domains and tasks. For example, on the challenging S3DIS dataset for large-scale semantic scene segmentation, the Point Transformer attains an mIoU of 70.4% on Area 5, outperforming the strongest prior model by 3.3 absolute percentage points and crossing the 70% mIoU threshold for the first time.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Point_Transformer_ICCV_2021_paper.pdf", @@ -31499,14 +33633,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Point_Transformer_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;2;0;3", - "aff_unique_norm": "University of Oxford;University of Hong Kong;Chinese University of Hong Kong;Intel", + "aff_unique_norm": "University of Oxford;The University of Hong Kong;The Chinese University of Hong Kong;Intel Corporation", "aff_unique_dep": ";;;Intel Labs", "aff_unique_url": "https://www.ox.ac.uk;https://www.hku.hk;https://www.cuhk.edu.hk;https://www.intel.com", "aff_unique_abbr": "Oxford;HKU;CUHK;Intel", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+1;1;1;0;2", - "aff_country_unique": "United Kingdom;China;United States" + "aff_country_unique": "United Kingdom;China;United States", + "bibtex": 
"@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Hengshuang and Jiang,\n Li and Jia,\n Jiaya and Torr,\n Philip H.S. and Koltun,\n Vladlen\n},\n title = {\n Point Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16259-16268\n} \n}" }, { "title": "Point-Based Modeling of Human Clothing", @@ -31514,6 +33649,7 @@ "status": "Poster", "track": "main", "pid": 8840, + "author_site": "Ilya Zakharkin; Kirill Mazur; Artur Grigorev; Victor Lempitsky", "author": "Ilya Zakharkin; Kirill Mazur; Artur Grigorev; Victor Lempitsky", "abstract": "We propose a new approach to human clothing modeling based on point clouds. Within this approach, we learn a deep model that can predict point clouds of various outfits, for various human poses, and for various human body shapes. Notably, outfits of various types and topologies can be handled by the same model. Using the learned model, we can infer the geometry of new outfits from as little as a single image, and perform outfit retargeting to new bodies in new poses. We complement our geometric model with appearance modeling that uses the point cloud geometry as a geometric scaffolding and employs neural point-based graphics to capture outfit appearance from videos and to re-render the captured outfits. 
We validate both geometric modeling and appearance modeling aspects of the proposed approach against recently proposed methods and establish the viability of point-based clothing modeling.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zakharkin_Point-Based_Modeling_of_Human_Clothing_ICCV_2021_paper.pdf", @@ -31528,7 +33664,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zakharkin_Point-Based_Modeling_of_Human_Clothing_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zakharkin_Point-Based_Modeling_of_Human_Clothing_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zakharkin_2021_ICCV,\n \n author = {\n Zakharkin,\n Ilya and Mazur,\n Kirill and Grigorev,\n Artur and Lempitsky,\n Victor\n},\n title = {\n Point-Based Modeling of Human Clothing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14718-14727\n} \n}" }, { "title": "Point-Set Distances for Learning Representations of 3D Point Clouds", @@ -31536,6 +33673,7 @@ "status": "Poster", "track": "main", "pid": 8592, + "author_site": "Trung Nguyen; Quang-Hieu Pham; Tam Le; Tung Pham; Nhat Ho; Binh-Son Hua", "author": "Trung Nguyen; Quang-Hieu Pham; Tam Le; Tung Pham; Nhat Ho; Binh-Son Hua", "abstract": "Learning an effective representation of 3D point clouds requires a good metric to measure the discrepancy between two 3D point sets, which is non-trivial due to their irregularity. Most of the previous works resort to using the Chamfer discrepancy or Earth Mover's distance, but those metrics are either ineffective in measuring the differences between point clouds or computationally expensive. In this paper, we conduct a systematic study with extensive experiments on distance metrics for 3D point clouds. 
From this study, we propose to use sliced Wasserstein distance and its variants for learning representations of 3D point clouds. In addition, we introduce a new algorithm to estimate sliced Wasserstein distance that guarantees that the estimated value is close enough to the true one. Experiments show that the sliced Wasserstein distance and its variants allow the neural network to learn a more efficient representation compared to the Chamfer discrepancy. We demonstrate the efficiency of the sliced Wasserstein metric and its variants on several tasks in 3D computer vision including training a point cloud autoencoder, generative modeling, transfer learning, and point cloud registration.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nguyen_Point-Set_Distances_for_Learning_Representations_of_3D_Point_Clouds_ICCV_2021_paper.pdf", @@ -31559,7 +33697,8 @@ "aff_campus_unique_index": ";;1;", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0+0;1+2;3;0+0", - "aff_country_unique": "Vietnam;Unknown;Japan;United States" + "aff_country_unique": "Vietnam;Unknown;Japan;United States", + "bibtex": "@InProceedings{Nguyen_2021_ICCV,\n \n author = {\n Nguyen,\n Trung and Pham,\n Quang-Hieu and Le,\n Tam and Pham,\n Tung and Ho,\n Nhat and Hua,\n Binh-Son\n},\n title = {\n Point-Set Distances for Learning Representations of 3D Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10478-10487\n} \n}" }, { "title": "PointBA: Towards Backdoor Attacks in 3D Point Cloud", @@ -31567,10 +33706,11 @@ "status": "Poster", "track": "main", "pid": 6145, + "author_site": "Xinke Li; Zhirui Chen; Yue Zhao; Zekun Tong; Yabang Zhao; Andrew Lim; Joey Tianyi Zhou", "author": "Xinke Li; Zhirui Chen; Yue Zhao; Zekun Tong; Yabang Zhao; Andrew Lim; Joey Tianyi Zhou", "abstract": "3D deep learning has been increasingly more popular for a variety of tasks 
including many safety-critical applications. However, recently several works raise the security issues of 3D deep models. Although most of them consider adversarial attacks, we identify that backdoor attack is indeed a more serious threat to 3D deep learning systems but remains unexplored. We present the backdoor attacks in 3D point cloud with a unified framework that exploits the unique properties of 3D data and networks. In particular, we design two attack approaches on point cloud: the poison-label backdoor attack (PointPBA) and the clean-label backdoor attack (PointCBA). The first one is straightforward and effective in practice, while the latter is more sophisticated assuming there are certain data inspections. The attack algorithms are mainly motivated and developed by 1) the recent discovery of 3D adversarial samples suggesting the vulnerability of deep models under spatial transformation; 2) the proposed feature disentanglement technique that manipulates the feature of the data through optimization methods and its potential to embed a new task. Extensive experiments show the efficacy of the PointPBA with over 95% success rate across various 3D datasets and models, and the more stealthy PointCBA with around 50% success rate. 
Our proposed backdoor attack in 3D point cloud is expected to perform as a baseline for improving the robustness of 3D deep models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_PointBA_Towards_Backdoor_Attacks_in_3D_Point_Cloud_ICCV_2021_paper.pdf", - "aff": "National University of Singapore; National University of Singapore; National University of Singapore\u2020; National University of Singapore; National University of Singapore; Southwest Jiaotong University; Institute of High Performance Computing, A*STAR", + "aff": "National University of Singapore; National University of Singapore; National University of Singapore†; National University of Singapore; National University of Singapore; Southwest Jiaotong University; Institute of High Performance Computing, A*STAR", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Li_PointBA_Towards_Backdoor_ICCV_2021_supplemental.pdf", @@ -31583,14 +33723,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_PointBA_Towards_Backdoor_Attacks_in_3D_Point_Cloud_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1;2", - "aff_unique_norm": "National University of Singapore;Southwest Jiao Tong University;A*STAR Institute of High Performance Computing", + "aff_unique_norm": "National University of Singapore;Southwest Jiaotong University;A*STAR Institute of High Performance Computing", "aff_unique_dep": ";;Institute of High Performance Computing", "aff_unique_url": "https://www.nus.edu.sg;https://www.swjtu.edu.cn;https://www.ihpc.a-star.edu.sg", "aff_unique_abbr": "NUS;SWJTU;IHPC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Xinke and Chen,\n Zhirui and Zhao,\n Yue and Tong,\n Zekun and Zhao,\n Yabang and Lim,\n Andrew and 
Zhou,\n Joey Tianyi\n},\n title = {\n PointBA: Towards Backdoor Attacks in 3D Point Cloud\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16492-16501\n} \n}" }, { "title": "Polarimetric Helmholtz Stereopsis", @@ -31598,6 +33739,7 @@ "status": "Poster", "track": "main", "pid": 5470, + "author_site": "Yuqi Ding; Yu Ji; Mingyuan Zhou; Sing Bing Kang; Jinwei Ye", "author": "Yuqi Ding; Yu Ji; Mingyuan Zhou; Sing Bing Kang; Jinwei Ye", "abstract": "Helmholtz stereopsis (HS) exploits the reciprocity principle of light propagation (i.e., the Helmholtz reciprocity) for 3D reconstruction of surfaces with arbitrary reflectance. In this paper, we present the polarimetric Helmholtz stereopsis (polar-HS), which extends the classical HS by considering the polarization state of light in the reciprocal paths. With the additional phase information from polarization, polar-HS requires only one reciprocal image pair. We formulate new reciprocity and diffuse/specular polarimetric constraints to recover surface depths and normals using an optimization framework. 
Using a hardware prototype, we show that our approach produces high-quality 3D reconstruction for different types of surfaces, ranging from diffuse to highly specular.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_Polarimetric_Helmholtz_Stereopsis_ICCV_2021_paper.pdf", @@ -31621,7 +33763,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Yuqi and Ji,\n Yu and Zhou,\n Mingyuan and Kang,\n Sing Bing and Ye,\n Jinwei\n},\n title = {\n Polarimetric Helmholtz Stereopsis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5037-5046\n} \n}" }, { "title": "Poly-NL: Linear Complexity Non-Local Layers With 3rd Order Polynomials", @@ -31629,6 +33772,7 @@ "status": "Poster", "track": "main", "pid": 7944, + "author_site": "Francesca Babiloni; Ioannis Marras; Filippos Kokkinos; Jiankang Deng; Grigorios Chrysos; Stefanos Zafeiriou", "author": "Francesca Babiloni; Ioannis Marras; Filippos Kokkinos; Jiankang Deng; Grigorios Chrysos; Stefanos Zafeiriou", "abstract": "Spatial self-attention layers, in the form of Non-Local blocks, introduce long-range dependencies in Convolutional Neural Networks by computing pairwise similarities among all possible positions. Such pairwise functions underpin the effectiveness of non-local layers, but also determine a complexity that scales quadratically with respect to the input size both in space and time. This is a severely limiting factor that practically hinders the applicability of non-local blocks to even moderately sized inputs. 
Previous works focused on reducing the complexity by modifying the underlying matrix operations, however in this work we aim to retain full expressiveness of non-local layers while keeping complexity linear. We overcome the efficiency limitation of non-local blocks by framing them as special cases of 3rd order polynomial functions. This fact enables us to formulate novel fast Non-Local blocks, capable of reducing the complexity from quadratic to linear with no loss in performance, by replacing any direct computation of pairwise similarities with element-wise multiplications. The proposed method, which we dub as \"Poly-NL\", is competitive with state-of-the-art performance across image recognition, instance segmentation, and face detection tasks, while having considerably less computational overhead.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Babiloni_Poly-NL_Linear_Complexity_Non-Local_Layers_With_3rd_Order_Polynomials_ICCV_2021_paper.pdf", @@ -31643,7 +33787,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Babiloni_Poly-NL_Linear_Complexity_Non-Local_Layers_With_3rd_Order_Polynomials_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Babiloni_Poly-NL_Linear_Complexity_Non-Local_Layers_With_3rd_Order_Polynomials_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Babiloni_2021_ICCV,\n \n author = {\n Babiloni,\n Francesca and Marras,\n Ioannis and Kokkinos,\n Filippos and Deng,\n Jiankang and Chrysos,\n Grigorios and Zafeiriou,\n Stefanos\n},\n title = {\n Poly-NL: Linear Complexity Non-Local Layers With 3rd Order Polynomials\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10518-10528\n} \n}" }, { "title": "Pose Correction for Highly Accurate Visual Localization in Large-Scale Indoor Spaces", @@ -31651,6 +33796,7 @@ 
"status": "Poster", "track": "main", "pid": 8670, + "author_site": "Janghun Hyeon; Joohyung Kim; Nakju Doh", "author": "Janghun Hyeon; Joohyung Kim; Nakju Doh", "abstract": "Indoor visual localization is significant for various applications such as autonomous robots, augmented reality, and mixed reality. Recent advances in visual localization have demonstrated their feasibility in large-scale indoor spaces through coarse-to-fine methods that typically employ three steps: image retrieval, pose estimation, and pose selection. However, further research is needed to improve the accuracy of large-scale indoor visual localization. We demonstrate that the limitations in the previous methods can be attributed to the sparsity of image positions in the database, which causes view-differences between a query and a retrieved image from the database. In this paper, to address this problem, we propose a novel module, named pose correction, that enables re-estimation of the pose with local feature matching in a similar view by reorganizing the local features. This module enhances the accuracy of the initially estimated pose and assigns more reliable ranks. 
Furthermore, the proposed method achieves a new state-of-the-art performance with an accuracy of more than 90% within 1.0m in the challenging indoor benchmark dataset InLoc for the first time.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hyeon_Pose_Correction_for_Highly_Accurate_Visual_Localization_in_Large-Scale_Indoor_ICCV_2021_paper.pdf", @@ -31674,7 +33820,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea;" + "aff_country_unique": "South Korea;", + "bibtex": "@InProceedings{Hyeon_2021_ICCV,\n \n author = {\n Hyeon,\n Janghun and Kim,\n Joohyung and Doh,\n Nakju\n},\n title = {\n Pose Correction for Highly Accurate Visual Localization in Large-Scale Indoor Spaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15974-15983\n} \n}" }, { "title": "Pose Invariant Topological Memory for Visual Navigation", @@ -31682,6 +33829,7 @@ "status": "Poster", "track": "main", "pid": 2576, + "author_site": "Asuto Taniguchi; Fumihiro Sasaki; Ryota Yamashina", "author": "Asuto Taniguchi; Fumihiro Sasaki; Ryota Yamashina", "abstract": "Planning for visual navigation using topological memory, a memory graph consisting of nodes and edges, has been recently well-studied. The nodes correspond to past observations of a robot, and the edges represent the reachability predicted by a neural network (NN). Most prior methods, however, often fail to predict the reachability when the robot takes different poses, i.e. the direction the robot faces, at close positions. This is because the methods observe first-person view images, which significantly changes when the robot changes its pose, and thus it is fundamentally difficult to correctly predict the reachability from them. In this paper, we propose pose invariant topological memory (POINT) to address the problem. 
POINT observes omnidirectional images and predicts the reachability by using a spherical convolutional NN, which has a rotation invariance property and enables planning regardless of the robot's pose. Additionally, we train the NN by contrastive learning with data augmentation to enable POINT to plan with robustness to changes in environmental conditions, such as light conditions and the presence of unseen objects. Our experimental results show that POINT outperforms conventional methods under both the same and different environmental conditions. In addition, the results with the KITTI-360 dataset show that POINT is more applicable to real-world environments than conventional methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Taniguchi_Pose_Invariant_Topological_Memory_for_Visual_Navigation_ICCV_2021_paper.pdf", @@ -31705,7 +33853,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Taniguchi_2021_ICCV,\n \n author = {\n Taniguchi,\n Asuto and Sasaki,\n Fumihiro and Yamashina,\n Ryota\n},\n title = {\n Pose Invariant Topological Memory for Visual Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15384-15393\n} \n}" }, { "title": "Practical Relative Order Attack in Deep Ranking", @@ -31713,10 +33862,11 @@ "status": "Poster", "track": "main", "pid": 2210, + "author_site": "Mo Zhou; Le Wang; Zhenxing Niu; Qilin Zhang; Yinghui Xu; Nanning Zheng; Gang Hua", "author": "Mo Zhou; Le Wang; Zhenxing Niu; Qilin Zhang; Yinghui Xu; Nanning Zheng; Gang Hua", "abstract": "Recent studies unveil the vulnerabilities of deep ranking models, where an imperceptible perturbation can trigger dramatic changes in the ranking result. 
While previous attempts focus on manipulating absolute ranks of certain candidates, the possibility of adjusting their relative order remains under-explored. In this paper, we formulate a new adversarial attack against deep ranking systems, i.e., the Order Attack, which covertly alters the relative order among a selected set of candidates according to an attacker-specified permutation, with limited interference to other unrelated candidates. Specifically, it is formulated as a triplet-style loss imposing an inequality chain reflecting the specified permutation. However, direct optimization of such white-box objective is infeasible in a real-world attack scenario due to various black-box limitations. To cope with them, we propose a Short-range Ranking Correlation metric as a surrogate objective for black-box Order Attack to approximate the white-box method. The Order Attack is evaluated on the Fashion-MNIST and Stanford-Online-Products datasets under both white-box and black-box threat models. The black-box attack is also successfully implemented on a major e-commerce platform. 
Comprehensive experimental evaluations demonstrate the effectiveness of the proposed methods, revealing a new type of ranking model vulnerability.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Practical_Relative_Order_Attack_in_Deep_Ranking_ICCV_2021_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; Alibaba Group; HERE Technologies; Alibaba Group; Xi\u2019an Jiaotong University; Wormpex AI Research", + "aff": "Xi’an Jiaotong University; Xi’an Jiaotong University; Alibaba Group; HERE Technologies; Alibaba Group; Xi’an Jiaotong University; Wormpex AI Research", "project": "", "github": "", "supp": "", @@ -31729,14 +33879,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Practical_Relative_Order_Attack_in_Deep_Ranking_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;1;0;3", - "aff_unique_norm": "Xi'an Jiao Tong University;Alibaba Group;HERE Technologies;Wormpex AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;Alibaba Group;HERE Technologies;Wormpex AI Research", "aff_unique_dep": ";;;AI Research", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.alibaba.com;https://www.here.com;", "aff_unique_abbr": "XJTU;Alibaba;HERE;Wormpex AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;2", - "aff_country_unique": "China;Finland;United States" + "aff_country_unique": "China;Finland;United States", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Mo and Wang,\n Le and Niu,\n Zhenxing and Zhang,\n Qilin and Xu,\n Yinghui and Zheng,\n Nanning and Hua,\n Gang\n},\n title = {\n Practical Relative Order Attack in Deep Ranking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16413-16422\n} \n}" }, { "title": "PreDet: Large-Scale Weakly Supervised Pre-Training for Detection", @@ 
-31744,6 +33895,7 @@ "status": "Poster", "track": "main", "pid": 2121, + "author_site": "Vignesh Ramanathan; Rui Wang; Dhruv Mahajan", "author": "Vignesh Ramanathan; Rui Wang; Dhruv Mahajan", "abstract": "State-of-the-art object detection approaches typically rely on pre-trained classification models to achieve better performance and faster convergence. We hypothesize that classification pre-training strives to achieve translation invariance, and consequently ignores the localization aspect of the problem. We propose a new large-scale pre-training strategy for detection, where noisy class labels are available for all images, but not bounding-boxes. In this setting, we augment standard classification pre-training with a new detection-specific pretext task. Motivated by the noise-contrastive learning based self-supervised approaches, we design a task that forces bounding boxes with high-overlap to have similar representations in different views of an image, compared to non-overlapping boxes. We redesign Faster R-CNN modules to perform this task efficiently. 
Our experimental results show significant improvements over existing weakly-supervised and self-supervised pre-training approaches in both detection accuracy as well as fine-tuning speed.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ramanathan_PreDet_Large-Scale_Weakly_Supervised_Pre-Training_for_Detection_ICCV_2021_paper.pdf", @@ -31760,14 +33912,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ramanathan_PreDet_Large-Scale_Weakly_Supervised_Pre-Training_for_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI", "aff_unique_url": "https://www.facebook.com", "aff_unique_abbr": "Facebook AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ramanathan_2021_ICCV,\n \n author = {\n Ramanathan,\n Vignesh and Wang,\n Rui and Mahajan,\n Dhruv\n},\n title = {\n PreDet: Large-Scale Weakly Supervised Pre-Training for Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2865-2875\n} \n}" }, { "title": "Predicting With Confidence on Unseen Distributions", @@ -31775,6 +33928,7 @@ "status": "Poster", "track": "main", "pid": 9498, + "author_site": "Devin Guillory; Vaishaal Shankar; Sayna Ebrahimi; Trevor Darrell; Ludwig Schmidt", "author": "Devin Guillory; Vaishaal Shankar; Sayna Ebrahimi; Trevor Darrell; Ludwig Schmidt", "abstract": "Recent work has shown that the accuracy of machine learning models can vary substantially when evaluated on a distribution that even slightly differs from that of the training data. 
As a result, predicting model performance on previously unseen distributions without access to labeled data is an important challenge with implications for increasing the reliability of machine learning models. In the context of distribution shift, distance measures are often used to adapt models and improve their performance on new domains, however accuracy estimation is seldom explored in these investigations. Our investigation determines that common distributional distances such as Frechet distance or Maximum Mean Discrepancy, fail to induce reliable estimates of performance under distribution shift. On the other hand, we find that our proposed difference of confidences (DoC) approach yields successful estimates of a classifier's performance over a variety of shifts and model architectures. Despite its simplicity, we observe that DoC outperforms other methods across synthetic, natural, and adversarial distribution shifts, reducing error by (>46%) on several realistic and challenging datasets such as ImageNet-Vid-Robust and ImageNet-Rendition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guillory_Predicting_With_Confidence_on_Unseen_Distributions_ICCV_2021_paper.pdf", @@ -31791,14 +33945,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guillory_Predicting_With_Confidence_on_Unseen_Distributions_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0+2;0+2", - "aff_unique_norm": "University of California, Berkeley;Amazon;Toyota Research Institute", - "aff_unique_dep": ";Amazon.com, Inc.;", + "aff_unique_norm": "University of California, Berkeley;Amazon.com, Inc.;Toyota Research Institute", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.amazon.com;https://www.tri.global", "aff_unique_abbr": "UC Berkeley;Amazon;TRI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "United States" + 
"aff_country_unique": "United States", + "bibtex": "@InProceedings{Guillory_2021_ICCV,\n \n author = {\n Guillory,\n Devin and Shankar,\n Vaishaal and Ebrahimi,\n Sayna and Darrell,\n Trevor and Schmidt,\n Ludwig\n},\n title = {\n Predicting With Confidence on Unseen Distributions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1134-1144\n} \n}" }, { "title": "Prediction by Anticipation: An Action-Conditional Prediction Method Based on Interaction Learning", @@ -31806,10 +33961,11 @@ "status": "Poster", "track": "main", "pid": 7796, + "author_site": "Ershad Banijamali; Mohsen Rohani; Elmira Amirloo; Jun Luo; Pascal Poupart", "author": "Ershad Banijamali; Mohsen Rohani; Elmira Amirloo; Jun Luo; Pascal Poupart", "abstract": "In autonomous driving (AD), accurately predicting changes in the environment can effectively improve safety and comfort. Due to complex interactions among traffic participants, however, it is very hard to achieve accurate prediction for a long horizon. To address this challenge, we propose prediction by anticipation, which views interaction in terms of a latent probabilistic generative process wherein some vehicles move partly in response to the anticipated motion of other vehicles. Under this view, consecutive data frames can be factorized into sequential samples from an action-conditional distribution that effectively generalizes to a wider range of actions and driving situations. Our proposed prediction model, variational Bayesian in nature, is trained to maximize the evidence lower bound (ELBO) of the log-likelihood of this conditional distribution. 
Evaluations of our approach with prominent AD datasets NGSIM I-80 and Argoverse show significant improvement over current state-of-the-art in both accuracy and generalization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Banijamali_Prediction_by_Anticipation_An_Action-Conditional_Prediction_Method_Based_on_Interaction_ICCV_2021_paper.pdf", - "aff": "Noah\u2019s Ark Laboratory, Huawei, Markham, Canada; Noah\u2019s Ark Laboratory, Huawei, Markham, Canada; Noah\u2019s Ark Laboratory, Huawei, Markham, Canada; Noah\u2019s Ark Laboratory, Huawei, Markham, Canada; School of Computer Science, University of Waterloo, Waterloo, Canada", + "aff": "Noah’s Ark Laboratory, Huawei, Markham, Canada; Noah’s Ark Laboratory, Huawei, Markham, Canada; Noah’s Ark Laboratory, Huawei, Markham, Canada; Noah’s Ark Laboratory, Huawei, Markham, Canada; School of Computer Science, University of Waterloo, Waterloo, Canada", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Banijamali_Prediction_by_Anticipation_ICCV_2021_supplemental.pdf", @@ -31823,13 +33979,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Banijamali_Prediction_by_Anticipation_An_Action-Conditional_Prediction_Method_Based_on_Interaction_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Huawei;University of Waterloo", - "aff_unique_dep": "Noah\u2019s Ark Laboratory;School of Computer Science", + "aff_unique_dep": "Noah’s Ark Laboratory;School of Computer Science", "aff_unique_url": "https://www.huawei.com;https://uwaterloo.ca", - "aff_unique_abbr": "Huawei;UW", + "aff_unique_abbr": "Huawei;UWaterloo", "aff_campus_unique_index": "0;0;0;0;1", "aff_campus_unique": "Markham;Waterloo", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Banijamali_2021_ICCV,\n \n author = {\n Banijamali,\n Ershad and Rohani,\n Mohsen and Amirloo,\n Elmira and 
Luo,\n Jun and Poupart,\n Pascal\n},\n title = {\n Prediction by Anticipation: An Action-Conditional Prediction Method Based on Interaction Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15621-15630\n} \n}" }, { "title": "Predictive Feature Learning for Future Segmentation Prediction", @@ -31837,6 +33994,7 @@ "status": "Poster", "track": "main", "pid": 7667, + "author_site": "Zihang Lin; Jiangxin Sun; Jian-Fang Hu; Qizhi Yu; Jian-Huang Lai; Wei-Shi Zheng", "author": "Zihang Lin; Jiangxin Sun; Jian-Fang Hu; Qizhi Yu; Jian-Huang Lai; Wei-Shi Zheng", "abstract": "Future segmentation prediction aims to predict the segmentation masks for unobserved future frames. Most existing works addressed it by directly predicting the intermediate features extracted by existing segmentation models. However, these segmentation features are learned to be local discriminative (with rich details) and are always of high resolution/dimension. Hence, the complicated spatio-temporal variations of these features are difficult to predict, which motivates us to learn a more predictive representation. In this work, we develop a novel framework called Predictive Feature Autoencoder. In the proposed framework, we construct an autoencoder which serves as a bridge between the segmentation features and the predictor. In the latent feature learned by the autoencoder, global structures are enhanced and local details are suppressed so that it is more predictive. In order to reduce the risk of vanishing the suppressed details during recurrent feature prediction, we further introduce a reconstruction constraint in the prediction module. 
Extensive experiments show the effectiveness of the proposed approach and our method outperforms state-of-the-arts by a considerable margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Predictive_Feature_Learning_for_Future_Segmentation_Prediction_ICCV_2021_paper.pdf", @@ -31860,7 +34018,8 @@ "aff_campus_unique_index": "1;1;1;1;", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0+0+0;0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Zihang and Sun,\n Jiangxin and Hu,\n Jian-Fang and Yu,\n Qizhi and Lai,\n Jian-Huang and Zheng,\n Wei-Shi\n},\n title = {\n Predictive Feature Learning for Future Segmentation Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7365-7374\n} \n}" }, { "title": "Preservational Learning Improves Self-Supervised Medical Image Models by Reconstructing Diverse Contexts", @@ -31868,6 +34027,7 @@ "status": "Poster", "track": "main", "pid": 8732, + "author_site": "Hong-Yu Zhou; Chixiang Lu; Sibei Yang; Xiaoguang Han; Yizhou Yu", "author": "Hong-Yu Zhou; Chixiang Lu; Sibei Yang; Xiaoguang Han; Yizhou Yu", "abstract": "Preserving maximal information is the basic principle of designing self-supervised learning methodologies. To reach this goal, contrastive learning adopts an implicit way which is contrasting image pairs. However, we believe it is not fully optimal to simply use the contrastive estimation for preservation. Moreover, it is necessary and complemental to introduce an explicit solution to preserve more information. From this perspective, we introduce Preservational Learning to reconstruct diverse image contexts in order to preserve more information in learned representations. 
Together with the contrastive loss, we present Preservational Contrastive Representation Learning (PCRL) for learning self-supervised medical representations. PCRL provides very competitive results under the pretraining-finetuning protocol, outperforming both self-supervised and supervised counterparts in 5 classification/segmentation tasks substantially.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Preservational_Learning_Improves_Self-Supervised_Medical_Image_Models_by_Reconstructing_Diverse_ICCV_2021_paper.pdf", @@ -31882,7 +34042,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Preservational_Learning_Improves_Self-Supervised_Medical_Image_Models_by_Reconstructing_Diverse_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Preservational_Learning_Improves_Self-Supervised_Medical_Image_Models_by_Reconstructing_Diverse_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Hong-Yu and Lu,\n Chixiang and Yang,\n Sibei and Han,\n Xiaoguang and Yu,\n Yizhou\n},\n title = {\n Preservational Learning Improves Self-Supervised Medical Image Models by Reconstructing Diverse Contexts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3499-3509\n} \n}" }, { "title": "Pri3D: Can 3D Priors Help 2D Representation Learning?", @@ -31890,7 +34051,8 @@ "status": "Poster", "track": "main", "pid": 1398, - "author": "Ji Hou; Saining Xie; Benjamin Graham; Angela Dai; Matthias Nie\u00dfner", + "author_site": "Ji Hou; Saining Xie; Benjamin Graham; Angela Dai; Matthias Nießner", + "author": "Ji Hou; Saining Xie; Benjamin Graham; Angela Dai; Matthias Nießner", "abstract": "Recent advances in 3D perception have shown impressive progress in understanding geometric structures of 3D shapes and even 
scenes. Inspired by these advances in geometric understanding, we aim to imbue image-based perception with representations learned under geometric constraints. We introduce an approach to learn view-invariant, geometry-aware representations for network pre-training, based on multi-view RGB-D data, that can then be effectively transferred to downstream 2D tasks. We propose to employ contrastive learning under both multi-view image constraints and image-geometry constraints to encode 3D priors into learned 2D representations. This results not only in improvement over 2D-only representation learning on the image-based tasks of semantic segmentation, instance segmentation, and object detection on real-world indoor datasets, but moreover, provides significant improvement in the low data regime. We show a significant improvement of 6.0% on semantic segmentation on full data as well as 11.9% on 20% data against our baselines on ScanNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hou_Pri3D_Can_3D_Priors_Help_2D_Representation_Learning_ICCV_2021_paper.pdf", "aff": "Technical University of Munich; Facebook AI Research; Facebook AI Research; Technical University of Munich; Technical University of Munich", @@ -31906,14 +34068,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hou_Pri3D_Can_3D_Priors_Help_2D_Representation_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "Technical University of Munich;Meta", + "aff_unique_norm": "Technical University of Munich;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.tum.de;https://research.facebook.com", "aff_unique_abbr": "TUM;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Hou_2021_ICCV,\n \n author = {\n Hou,\n Ji and Xie,\n 
Saining and Graham,\n Benjamin and Dai,\n Angela and Nie{\\ss\n}ner,\n Matthias\n},\n title = {\n Pri3D: Can 3D Priors Help 2D Representation Learning?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5693-5702\n} \n}" }, { "title": "PrimitiveNet: Primitive Instance Segmentation With Local Primitive Embedding Under Adversarial Metric", @@ -31921,6 +34084,7 @@ "status": "Poster", "track": "main", "pid": 1503, + "author_site": "Jingwei Huang; Yanfeng Zhang; Mingwei Sun", "author": "Jingwei Huang; Yanfeng Zhang; Mingwei Sun", "abstract": "We present PrimitiveNet, a novel approach for high-resolution primitive instance segmentation from point clouds on a large scale. Our key idea is to transform the global segmentation problem into easier local tasks. We train a high-resolution primitive embedding network to predict explicit geometry features and implicit latent features for each point. The embedding is jointly trained with an adversarial network as a primitive discriminator to decide whether points are from the same primitive instance in local neighborhoods. Such local supervision encourages the learned embedding and discriminator to describe local surface properties and robustly distinguish different instances. At inference time, network predictions are followed by a region growing method to finalize the segmentation. Experiments show that our method outperforms existing state-of-the-arts based on mean average precision by a significant margin (46.3%) on ABC dataset [??]. We can process extremely large real scenes covering more than 0.1km^2. Ablation studies highlight the contribution of our core designs. 
Finally, our method can improve geometry processing algorithms to abstract scans as lightweight models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_PrimitiveNet_Primitive_Instance_Segmentation_With_Local_Primitive_Embedding_Under_Adversarial_ICCV_2021_paper.pdf", @@ -31937,14 +34101,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_PrimitiveNet_Primitive_Instance_Segmentation_With_Local_Primitive_Embedding_Under_Adversarial_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "Huawei;Wuhan University", + "aff_unique_norm": "Huawei Technologies;Wuhan University", "aff_unique_dep": "Riemann Lab;", "aff_unique_url": "https://www.huawei.com;http://www.whu.edu.cn/", "aff_unique_abbr": "Huawei;WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Jingwei and Zhang,\n Yanfeng and Sun,\n Mingwei\n},\n title = {\n PrimitiveNet: Primitive Instance Segmentation With Local Primitive Embedding Under Adversarial Metric\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15343-15353\n} \n}" }, { "title": "Prior to Segment: Foreground Cues for Weakly Annotated Classes in Partially Supervised Instance Segmentation", @@ -31952,7 +34117,8 @@ "status": "Poster", "track": "main", "pid": 9342, - "author": "David Biertimpel; Sindi Shkodrani; Anil S. Baslamisli; N\u00f3ra Baka", + "author_site": "David Biertimpel; Sindi Shkodrani; Anil S. Baslamisli; Nóra Baka", + "author": "David Biertimpel; Sindi Shkodrani; Anil S. Baslamisli; Nóra Baka", "abstract": "Instance segmentation methods require large datasets with expensive and thus limited instance-level mask labels. 
Partially supervised instance segmentation aims to improve mask prediction with limited mask labels by utilizing the more abundant weak box labels. In this work, we show that a class agnostic mask head, commonly used in partially supervised instance segmentation, has difficulties learning a general concept of foreground for the weakly annotated classes using box supervision only. To resolve this problem, we introduce an object mask prior (OMP) that provides the mask head with the general concept of foreground implicitly learned by the box classification head under the supervision of all classes. This helps the class agnostic mask head to focus on the primary object in a region of interest (RoI) and improves generalization to the weakly annotated classes. We test our approach on the COCO dataset using different splits of strongly and weakly supervised classes. Our approach significantly improves over the Mask R-CNN baseline and obtains competitive performance with the state-of-the-art, while offering a much simpler architecture.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Biertimpel_Prior_to_Segment_Foreground_Cues_for_Weakly_Annotated_Classes_in_ICCV_2021_paper.pdf", "aff": "University of Amsterdam + TomTom; TomTom; University of Amsterdam; TomTom", @@ -31975,7 +34141,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Biertimpel_2021_ICCV,\n \n author = {\n Biertimpel,\n David and Shkodrani,\n Sindi and Baslamisli,\n Anil S. 
and Baka,\n N\\'ora\n},\n title = {\n Prior to Segment: Foreground Cues for Weakly Annotated Classes in Partially Supervised Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2824-2833\n} \n}" }, { "title": "ProFlip: Targeted Trojan Attack With Progressive Bit Flips", @@ -31983,6 +34150,7 @@ "status": "Poster", "track": "main", "pid": 10676, + "author_site": "Huili Chen; Cheng Fu; Jishen Zhao; Farinaz Koushanfar", "author": "Huili Chen; Cheng Fu; Jishen Zhao; Farinaz Koushanfar", "abstract": "The security of Deep Neural Networks (DNNs) is of great importance due to their employment in various safety-critical applications. DNNs are shown to be vulnerable against the Trojan attack that manipulates the model parameters via poisoned training and gets activated by the pre-defined trigger in inputs during inference. In this work, we present ProFlip, the first targeted Trojan attack framework that can divert the prediction of the DNN to the target class by progressively identifying and flipping a small set of bits in model parameters. At its core, ProFlip consists of three key phases: (i) Determining significant neurons in the last layer; (ii) Generating an effective trigger pattern for the target class; (iii) Identifying a sequence of susceptible bits of DNN parameters stored in the main memory (e.g., DRAM). After model deployment, the adversary can insert the Trojan by flipping the critical bits found by ProFlip using bit flip techniques such as Row Hammer or laser beams. As the result, the altered DNN predicts the target class when the trigger pattern is present in any inputs. We perform extensive evaluations of ProFlip on CIFAR10, SVHN, and ImageNet datasets with ResNet-18 and VGG-16 architectures. 
Empirical results show that, to reach an attack success rate (ASR) of over 94%, ProFlip requires only 12 bit flips out of 88 million parameter bits for ResNet-18 with CIFAR-10, and 15 bit flips for ResNet-18 with ImageNet. Compared to the SOTA, ProFlip reduces the number of required bits flips by 28x 34x while reaching the same level of ASR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_ProFlip_Targeted_Trojan_Attack_With_Progressive_Bit_Flips_ICCV_2021_paper.pdf", @@ -32006,7 +34174,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Huili and Fu,\n Cheng and Zhao,\n Jishen and Koushanfar,\n Farinaz\n},\n title = {\n ProFlip: Targeted Trojan Attack With Progressive Bit Flips\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7718-7727\n} \n}" }, { "title": "Probabilistic Modeling for Human Mesh Recovery", @@ -32014,6 +34183,7 @@ "status": "Poster", "track": "main", "pid": 3952, + "author_site": "Nikos Kolotouros; Georgios Pavlakos; Dinesh Jayaraman; Kostas Daniilidis", "author": "Nikos Kolotouros; Georgios Pavlakos; Dinesh Jayaraman; Kostas Daniilidis", "abstract": "This paper focuses on the problem of 3D human reconstruction from 2D evidence. Although this is an inherently ambiguous problem, the majority of recent works avoid the uncertainty modeling and typically regress a single estimate for a given input. In contrast to that, in this work, we propose to embrace the reconstruction ambiguity and we recast the problem as learning a mapping from the input to a distribution of plausible 3D poses. Our approach is based on the normalizing flows model and offers a series of advantages. 
For conventional applications, where a single 3D estimate is required, our formulation allows for efficient mode computation. Using the mode leads to performance that is comparable with the state of the art among deterministic unimodal regression models. Simultaneously, since we have access to the likelihood of each sample, we demonstrate that our model is useful in a series of downstream tasks, where we leverage the probabilistic nature of the prediction as a tool for more accurate estimation. These tasks include reconstruction from multiple uncalibrated views, as well as human model fitting, where our model acts as a powerful image-based prior for mesh recovery. Our results validate the importance of probabilistic modeling, and indicate state-of-the-art performance across a variety of settings. Code and models are available at: https://www.seas.upenn.edu/ nkolot/projects/prohmr.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kolotouros_Probabilistic_Modeling_for_Human_Mesh_Recovery_ICCV_2021_paper.pdf", @@ -32028,7 +34198,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kolotouros_Probabilistic_Modeling_for_Human_Mesh_Recovery_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kolotouros_Probabilistic_Modeling_for_Human_Mesh_Recovery_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kolotouros_2021_ICCV,\n \n author = {\n Kolotouros,\n Nikos and Pavlakos,\n Georgios and Jayaraman,\n Dinesh and Daniilidis,\n Kostas\n},\n title = {\n Probabilistic Modeling for Human Mesh Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11605-11614\n} \n}" }, { "title": "Probabilistic Monocular 3D Human Pose Estimation With Normalizing Flows", @@ -32036,6 +34207,7 @@ "status": "Poster", "track": "main", "pid": 2001, + "author_site": "Tom 
Wehrbein; Marco Rudolph; Bodo Rosenhahn; Bastian Wandt", "author": "Tom Wehrbein; Marco Rudolph; Bodo Rosenhahn; Bastian Wandt", "abstract": "3D human pose estimation from monocular images is a highly ill-posed problem due to depth ambiguities and occlusions. Nonetheless, most existing works ignore these ambiguities and only estimate a single solution. In contrast, we generate a diverse set of hypotheses that represents the full posterior distribution of feasible 3D poses. To this end, we propose a normalizing flow based method that exploits the deterministic 3D-to-2D mapping to solve the ambiguous inverse 2D-to-3D problem. Additionally, uncertain detections and occlusions are effectively modeled by incorporating uncertainty information of the 2D detector as condition. Further keys to success are a learned 3D pose prior and a generalization of the best-of-M loss. We evaluate our approach on the two benchmark datasets Human3.6M and MPI-INF-3DHP, outperforming all comparable methods in most metrics. 
The implementation is available on GitHub.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wehrbein_Probabilistic_Monocular_3D_Human_Pose_Estimation_With_Normalizing_Flows_ICCV_2021_paper.pdf", @@ -32059,7 +34231,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Germany;Canada" + "aff_country_unique": "Germany;Canada", + "bibtex": "@InProceedings{Wehrbein_2021_ICCV,\n \n author = {\n Wehrbein,\n Tom and Rudolph,\n Marco and Rosenhahn,\n Bodo and Wandt,\n Bastian\n},\n title = {\n Probabilistic Monocular 3D Human Pose Estimation With Normalizing Flows\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11199-11208\n} \n}" }, { "title": "Procedure Planning in Instructional Videos via Contextual Modeling and Model-Based Policy Learning", @@ -32067,6 +34240,7 @@ "status": "Poster", "track": "main", "pid": 5878, + "author_site": "Jing Bi; Jiebo Luo; Chenliang Xu", "author": "Jing Bi; Jiebo Luo; Chenliang Xu", "abstract": "Learning new skills by observing humans' behaviors is an essential capability of AI. In this work, we leverage instructional videos to study humans' decision-making processes, focusing on learning a model to plan goal-directed actions in real-life videos. In contrast to conventional action recognition, goal-directed actions are based on expectations of their outcomes requiring causal knowledge of potential consequences of actions. Thus, integrating the environment structure with goals is critical for solving this task. Previous works learn a single world model will fail to distinguish various tasks, resulting in an ambiguous latent space; planning through it will gradually neglect the desired outcomes since the global information of the future goal degrades quickly as the procedure evolves. 
We address these limitations with a new formulation of procedure planning and propose novel algorithms to model human behaviors through Bayesian Inference and model-based Imitation Learning. Experiments conducted on real-world instructional videos show that our method can achieve state-of-the-art performance in reaching the indicated goals. Furthermore, the learned contextual information presents interesting features for planning in a latent space.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bi_Procedure_Planning_in_Instructional_Videos_via_Contextual_Modeling_and_Model-Based_ICCV_2021_paper.pdf", @@ -32090,7 +34264,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bi_2021_ICCV,\n \n author = {\n Bi,\n Jing and Luo,\n Jiebo and Xu,\n Chenliang\n},\n title = {\n Procedure Planning in Instructional Videos via Contextual Modeling and Model-Based Policy Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15611-15620\n} \n}" }, { "title": "Procrustean Training for Imbalanced Deep Learning", @@ -32098,6 +34273,7 @@ "status": "Poster", "track": "main", "pid": 11183, + "author_site": "Han-Jia Ye; De-Chuan Zhan; Wei-Lun Chao", "author": "Han-Jia Ye; De-Chuan Zhan; Wei-Lun Chao", "abstract": "Neural networks trained with class-imbalanced data are known to perform poorly on minor classes of scarce training data. Several recent works attribute this to over-fitting to minor classes. In this paper, we provide a novel explanation of this issue. We found that a neural network tends to first under-fit the minor classes by classifying most of their data into the major classes in early training epochs. 
To correct these wrong predictions, the neural network then must focus on pushing features of minor class data across the decision boundaries between major and minor classes, leading to much larger gradients for features of minor classes. We argue that such an under-fitting phase over-emphasizes the competition between major and minor classes, hinders the neural network from learning the discriminative knowledge that can be generalized to test data, and eventually results in over-fitting. To address this issue, we propose a novel learning strategy to equalize the training progress across classes. We mix features of the major class data with those of other data in a mini-batch, intentionally weakening their features to prevent a neural network from fitting them first. We show that this strategy can largely balance the training accuracy and feature gradients across classes, effectively mitigating the under-fitting then over-fitting problem for minor class data. On several benchmark datasets, our approach achieves the state-of-the-art accuracy, especially for the challenging step-imbalanced cases.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_Procrustean_Training_for_Imbalanced_Deep_Learning_ICCV_2021_paper.pdf", @@ -32114,14 +34290,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_Procrustean_Training_for_Imbalanced_Deep_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Nanjing University;Ohio State University", + "aff_unique_norm": "Nanjing University;The Ohio State University", "aff_unique_dep": "State Key Laboratory for Novel Software Technology;", "aff_unique_url": "http://www.nju.edu.cn;https://www.osu.edu", "aff_unique_abbr": "Nanjing U;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n 
author = {\n Ye,\n Han-Jia and Zhan,\n De-Chuan and Chao,\n Wei-Lun\n},\n title = {\n Procrustean Training for Imbalanced Deep Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 92-102\n} \n}" }, { "title": "Product Quantizer Aware Inverted Index for Scalable Nearest Neighbor Search", @@ -32129,6 +34306,7 @@ "status": "Poster", "track": "main", "pid": 8694, + "author_site": "Haechan Noh; Taeho Kim; Jae-Pil Heo", "author": "Haechan Noh; Taeho Kim; Jae-Pil Heo", "abstract": "The inverted index is one of the most commonly used structures for non-exhaustive nearest neighbor search on large-scale datasets. It allows a significant factor of acceleration by a reduced number of distance computations with only a small fraction of the database. In particular, the inverted index enables the product quantization (PQ) to learn their codewords in the residual vector space. The quantization error of the PQ can be substantially improved in such combination since the residual vector space is much more quantization-friendly thanks to their compact distribution compared to the original data. In this paper, we first raise an unremarked but crucial question; why the inverted index and the product quantizer are optimized separately even though they are closely related? For instance, changes on the inverted index distort the whole residual vector space. To address the raised question, we suggest a joint optimization of the coarse and fine quantizers by substituting the original objective of the coarse quantizer to end-to-end quantization distortion. 
Moreover, our method is generic and applicable to different combinations of coarse and fine quantizers such as inverted multi-index and optimized PQ.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Noh_Product_Quantizer_Aware_Inverted_Index_for_Scalable_Nearest_Neighbor_Search_ICCV_2021_paper.pdf", @@ -32152,7 +34330,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Noh_2021_ICCV,\n \n author = {\n Noh,\n Haechan and Kim,\n Taeho and Heo,\n Jae-Pil\n},\n title = {\n Product Quantizer Aware Inverted Index for Scalable Nearest Neighbor Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12210-12218\n} \n}" }, { "title": "Product1M: Towards Weakly Supervised Instance-Level Product Retrieval via Cross-Modal Pretraining", @@ -32160,10 +34339,11 @@ "status": "Poster", "track": "main", "pid": 8338, + "author_site": "Xunlin Zhan; Yangxin Wu; Xiao Dong; Yunchao Wei; Minlong Lu; Yichi Zhang; Hang Xu; Xiaodan Liang", "author": "Xunlin Zhan; Yangxin Wu; Xiao Dong; Yunchao Wei; Minlong Lu; Yichi Zhang; Hang Xu; Xiaodan Liang", "abstract": "Nowadays, customer's demands for E-commerce are more diversified, which introduces more complications to the product retrieval industry. Previous methods are either subject to single-modal input or perform supervised image-level product retrieval, thus fail to accommodate real-life scenarios where enormous weakly annotated multi-modal data are present. In this paper, we investigate a more realistic setting that aims to perform weakly-supervised multi-modal instance-level product retrieval among fine-grained product categories. 
To promote the study of this challenging task, we contribute Product1M, one of the largest multi-modal cosmetic datasets for real-world instance-level retrieval. Notably, Product1M contains over 1 million image-caption pairs and consists of two sample types, i.e., single-product and multi-product samples, which encompass a wide variety of cosmetics brands. In addition to the great diversity, Product1M enjoys several appealing characteristics including fine-grained categories, complex combinations, and fuzzy correspondence that well mimic the real-world scenes. Moreover, we propose a novel model named Cross-modal contrAstive Product Transformer for instance-level prodUct REtrieval (CAPTURE), that excels in capturing the potential synergy between multi-modal inputs via a hybrid-stream transformer in a self-supervised manner. CAPTURE generates discriminative instance features via masked multi-modal learning as well as cross-modal contrastive pretraining and it outperforms several SOTA cross-modal baselines. 
Extensive ablation studies well demonstrate the effectiveness and the generalization capacity of our model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhan_Product1M_Towards_Weakly_Supervised_Instance-Level_Product_Retrieval_via_Cross-Modal_Pretraining_ICCV_2021_paper.pdf", - "aff": "Sun Yat-sen University; Sun Yat-sen University; Sun Yat-sen University; Beijing Jiaotong University; Alibaba Group; Alibaba Group; Huawei Noah\u2019s Ark Lab; Sun Yat-sen University", + "aff": "Sun Yat-sen University; Sun Yat-sen University; Sun Yat-sen University; Beijing Jiaotong University; Alibaba Group; Alibaba Group; Huawei Noah’s Ark Lab; Sun Yat-sen University", "project": "", "github": "https://github.com/zhanxlin/Product1M", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhan_Product1M_Towards_Weakly_ICCV_2021_supplemental.pdf", @@ -32176,14 +34356,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhan_Product1M_Towards_Weakly_Supervised_Instance-Level_Product_Retrieval_via_Cross-Modal_Pretraining_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;2;2;3;0", - "aff_unique_norm": "Sun Yat-sen University;Beijing Jiao Tong University;Alibaba Group;Huawei", - "aff_unique_dep": ";;;Noah\u2019s Ark Lab", + "aff_unique_norm": "Sun Yat-sen University;Beijing Jiaotong University;Alibaba Group;Huawei", + "aff_unique_dep": ";;;Noah’s Ark Lab", "aff_unique_url": "http://www.sysu.edu.cn/;http://www.njtu.edu.cn/en;https://www.alibaba.com;https://www.huawei.com", "aff_unique_abbr": "SYSU;BJTU;Alibaba;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhan_2021_ICCV,\n \n author = {\n Zhan,\n Xunlin and Wu,\n Yangxin and Dong,\n Xiao and Wei,\n Yunchao and Lu,\n Minlong and Zhang,\n Yichi and Xu,\n Hang and Liang,\n Xiaodan\n},\n title = {\n Product1M: Towards Weakly 
Supervised Instance-Level Product Retrieval via Cross-Modal Pretraining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11782-11791\n} \n}" }, { "title": "Progressive Correspondence Pruning by Consensus Learning", @@ -32191,10 +34372,11 @@ "status": "Poster", "track": "main", "pid": 8059, + "author_site": "Chen Zhao; Yixiao Ge; Feng Zhu; Rui Zhao; Hongsheng Li; Mathieu Salzmann", "author": "Chen Zhao; Yixiao Ge; Feng Zhu; Rui Zhao; Hongsheng Li; Mathieu Salzmann", "abstract": "Correspondence pruning aims to correctly remove false matches (outliers) from an initial set of putative correspondences. The selection is challenging since putative matches are typically extremely unbalanced, largely dominated by outliers, and the random distribution of such outliers further complicates the learning process for learning-based methods. To address this issue, we propose to progressively prune the correspondences via a local-to-global consensus learning procedure. We introduce a \"pruning\" block that lets us identify reliable candidates among the initial matches according to consensus scores estimated using local-to-global dynamic graphs. We then achieve progressive pruning by stacking multiple pruning blocks sequentially. 
Our method outperforms state-of-the-arts on robust line fitting, camera pose estimation and retrieval-based image localization benchmarks by significant margins and shows promising generalization ability to different datasets and detector/descriptor combinations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Progressive_Correspondence_Pruning_by_Consensus_Learning_ICCV_2021_paper.pdf", - "aff": "\u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL)+SenseTime Research; The Chinese University of Hong Kong; SenseTime Research+Qing Yuan Research Institute, Shanghai Jiao Tong University; SenseTime Research+Qing Yuan Research Institute, Shanghai Jiao Tong University; The Chinese University of Hong Kong; \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL)", + "aff": "´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL)+SenseTime Research; The Chinese University of Hong Kong; SenseTime Research+Qing Yuan Research Institute, Shanghai Jiao Tong University; SenseTime Research+Qing Yuan Research Institute, Shanghai Jiao Tong University; The Chinese University of Hong Kong; ´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL)", "project": "", "github": "", "supp": "", @@ -32207,14 +34389,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Progressive_Correspondence_Pruning_by_Consensus_Learning_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;1+3;1+3;2;0", - "aff_unique_norm": "EPFL;SenseTime;Chinese University of Hong Kong;Shanghai Jiao Tong University", + "aff_unique_norm": "Ecole Polytechnique Fédérale de Lausanne;SenseTime;The Chinese University of Hong Kong;Shanghai Jiao Tong University", "aff_unique_dep": ";SenseTime Research;;Qing Yuan Research Institute", "aff_unique_url": "https://www.epfl.ch;https://www.sensetime.com;https://www.cuhk.edu.hk;https://www.sjtu.edu.cn", "aff_unique_abbr": "EPFL;SenseTime;CUHK;SJTU", "aff_campus_unique_index": "0;2;3;3;2;0", "aff_campus_unique": 
"Lausanne;;Hong Kong SAR;Shanghai", "aff_country_unique_index": "0+1;1;1+1;1+1;1;0", - "aff_country_unique": "Switzerland;China" + "aff_country_unique": "Switzerland;China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Chen and Ge,\n Yixiao and Zhu,\n Feng and Zhao,\n Rui and Li,\n Hongsheng and Salzmann,\n Mathieu\n},\n title = {\n Progressive Correspondence Pruning by Consensus Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6464-6473\n} \n}" }, { "title": "Progressive Seed Generation Auto-Encoder for Unsupervised Point Cloud Learning", @@ -32222,6 +34405,7 @@ "status": "Poster", "track": "main", "pid": 4148, + "author_site": "Juyoung Yang; Pyunghwan Ahn; Doyeon Kim; Haeil Lee; Junmo Kim", "author": "Juyoung Yang; Pyunghwan Ahn; Doyeon Kim; Haeil Lee; Junmo Kim", "abstract": "With the development of 3D scanning technologies, 3D vision tasks have become a popular research area. Owing to the large amount of data acquired by sensors, unsupervised learning is essential for understanding and utilizing point clouds without an expensive annotation process. In this paper, we propose a novel framework and an effective auto-encoder architecture named \"PSG-Net\" for reconstruction-based learning of point clouds. Unlike existing studies that used fixed or random 2D points, our framework generates input-dependent point-wise features for the latent point set. PSG-Net uses the encoded input to produce point-wise features through the seed generation module and extracts richer features in multiple stages with gradually increasing resolution by applying the seed feature propagation module progressively. 
We prove the effectiveness of PSG-Net experimentally; PSG-Net shows state-of-the-art performances in point cloud reconstruction and unsupervised classification, and achieves comparable performance to counterpart methods in supervised completion.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Progressive_Seed_Generation_Auto-Encoder_for_Unsupervised_Point_Cloud_Learning_ICCV_2021_paper.pdf", @@ -32245,7 +34429,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Juyoung and Ahn,\n Pyunghwan and Kim,\n Doyeon and Lee,\n Haeil and Kim,\n Junmo\n},\n title = {\n Progressive Seed Generation Auto-Encoder for Unsupervised Point Cloud Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6413-6422\n} \n}" }, { "title": "Prototypical Matching and Open Set Rejection for Zero-Shot Semantic Segmentation", @@ -32253,6 +34438,7 @@ "status": "Poster", "track": "main", "pid": 1596, + "author_site": "Hui Zhang; Henghui Ding", "author": "Hui Zhang; Henghui Ding", "abstract": "The deep learning methods in addressing semantic segmentation typically demand vast amount of pixel-wise annotated training samples. In this work, we present zero-shot semantic segmentation, which aims to identify not only the seen classes contained in training but also the novel classes that have never been seen. We adopt a stringent inductive setting in which only the instances of seen classes are accessible during training. We propose an open-aware prototypical matching approach to accomplish the segmentation. The prototypical way extracts the visual representations by a set of prototypes, making it convenient and flexible to add new unseen classes. 
A prototype projection is trained to map the semantic representations towards prototypes based on seen instances, and will generate prototypes for unseen classes. Moreover, an open-set rejection is utilized to detect the objects that do not belong to any seen classes, which greatly reduces the misclassifications of unseen objects as seen classes caused by the lack of unseen training instances. We apply the framework on two segmentation datasets, Pascal VOC 2012 and Pascal Context, and achieve impressively state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Prototypical_Matching_and_Open_Set_Rejection_for_Zero-Shot_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -32276,7 +34462,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Hui and Ding,\n Henghui\n},\n title = {\n Prototypical Matching and Open Set Rejection for Zero-Shot Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6974-6983\n} \n}" }, { "title": "Provably Approximated Point Cloud Registration", @@ -32284,6 +34471,7 @@ "status": "Poster", "track": "main", "pid": 10463, + "author_site": "Ibrahim Jubran; Alaa Maalouf; Ron Kimmel; Dan Feldman", "author": "Ibrahim Jubran; Alaa Maalouf; Ron Kimmel; Dan Feldman", "abstract": "The goal of the alignment problem is to align a (given) point cloud P = \\ p_1,\\cdots,p_n\\ to another (observed) point cloud Q = \\ q_1,\\cdots,q_n\\ . That is, to compute a rotation matrix R \\in \\mathbb R ^ 3 x3 and a translation vector t \\in \\mathbb R ^ 3 that minimize the sum of paired distances between every transformed point Rp_i-t, to its corresponding point q_i, over every i\\in \\br 1,\\cdots,n . 
A harder version is the registration problem, where the correspondence is unknown, and the minimum is also over all possible correspondence functions from P to Q. Algorithms such as the Iterative Closest Point (ICP) and its variants were suggested for these problems, but none yield a provable non-trivial approximation for the global optimum. We prove that there always exists a \"witness\" set of 3 pairs in P xQ that, via novel alignment algorithm, defines a constant factor approximation (in the worst case) to this global optimum. We then provide algorithms that recover this witness set and yield the first provable constant factor approximation for the: (i) alignment problem in O(n) expected time, and (ii) registration problem in polynomial time. Such small witness sets exist for many variants including points in d-dimensional space, outlier-resistant cost functions, and different correspondence types. Extensive experimental results on real and synthetic datasets show that, in practice, our approximation constants are close to 1 and our error is up to x10 times smaller than state-of-the-art algorithms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jubran_Provably_Approximated_Point_Cloud_Registration_ICCV_2021_paper.pdf", @@ -32307,7 +34495,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Jubran_2021_ICCV,\n \n author = {\n Jubran,\n Ibrahim and Maalouf,\n Alaa and Kimmel,\n Ron and Feldman,\n Dan\n},\n title = {\n Provably Approximated Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13269-13278\n} \n}" }, { "title": "Pseudo-Loss Confidence Metric for Semi-Supervised Few-Shot Learning", @@ -32315,6 +34504,7 @@ "status": "Poster", "track": "main", "pid": 2817, + 
"author_site": "Kai Huang; Jie Geng; Wen Jiang; Xinyang Deng; Zhe Xu", "author": "Kai Huang; Jie Geng; Wen Jiang; Xinyang Deng; Zhe Xu", "abstract": "Semi-supervised few-shot learning is developed to train a classifier that can adapt to new tasks with limited labeled data and a fixed quantity of unlabeled data. Most semi-supervised few-shot learning methods select pseudo-labeled data of unlabeled set by task-specific confidence estimation. This work presents a task-unified confidence estimation approach for semi-supervised few-shot learning, named pseudo-loss confidence metric (PLCM). It measures the data credibility by the loss distribution of pseudo-labels, which is synthetical considered multi-tasks. Specifically, pseudo-labeled data of different tasks are mapped to a unified metric space by mean of the pseudo-loss model, making it possible to learn the prior pseudo-loss distribution. Then, confidence of pseudo-labeled data is estimated according to the distribution component confidence of its pseudo-loss. Thus highly reliable pseudo-labeled data are selected to strengthen the classifier. Moreover, to overcome the pseudo-loss distribution shift and improve the effectiveness of classifier, we advance the multi-step training strategy coordinated with the class balance measures of class-apart selection and class weight. 
Experimental results on four popular benchmark datasets demonstrate that the proposed approach can effectively select pseudo-labeled data and achieve the state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Pseudo-Loss_Confidence_Metric_for_Semi-Supervised_Few-Shot_Learning_ICCV_2021_paper.pdf", @@ -32338,7 +34528,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Kai and Geng,\n Jie and Jiang,\n Wen and Deng,\n Xinyang and Xu,\n Zhe\n},\n title = {\n Pseudo-Loss Confidence Metric for Semi-Supervised Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8671-8680\n} \n}" }, { "title": "Pseudo-Mask Matters in Weakly-Supervised Semantic Segmentation", @@ -32346,6 +34537,7 @@ "status": "Poster", "track": "main", "pid": 7532, + "author_site": "Yi Li; Zhanghui Kuang; Liyang Liu; Yimin Chen; Wayne Zhang", "author": "Yi Li; Zhanghui Kuang; Liyang Liu; Yimin Chen; Wayne Zhang", "abstract": "Most weakly supervised semantic segmentation (WSSS) methods follow the pipeline that generates pseudo-masks initially and trains the segmentation model with the pseudo-masks in fully supervised manner after. However, we find some matters related to the pseudo-masks, including high quality pseudo-masks generation from class activation maps (CAMs), and training with noisy pseudo-mask supervision. 
For these matters, we propose the following designs to push the performance to new state-of-art: (i) Coefficient of Variation Smoothing to smooth the CAMs adaptively; (ii) Proportional Pseudo-mask Generation to project the expanded CAMs to pseudo-mask based on a new metric indicating the importance of each class on each location, instead of the scores trained from binary classifiers. (iii) Pretended Under-Fitting strategy to suppress the influence of noise in pseudo-mask; (iv) Cyclic Pseudo-mask to boost the pseudo-masks during training of fully supervised semantic segmentation (FSSS). Experiments based on our methods achieve new state-of-art results on two changeling weakly supervised semantic segmentation datasets, pushing the mIoU to 70.0% and 40.2% on PAS-CAL VOC 2012 and MS COCO 2014 respectively. Codes including segmentation framework are released at https://github.com/Eli-YiLi/PMM", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Pseudo-Mask_Matters_in_Weakly-Supervised_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -32369,7 +34561,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0+0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yi and Kuang,\n Zhanghui and Liu,\n Liyang and Chen,\n Yimin and Zhang,\n Wayne\n},\n title = {\n Pseudo-Mask Matters in Weakly-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6964-6973\n} \n}" }, { "title": "Putting NeRF on a Diet: Semantically Consistent Few-Shot View Synthesis", @@ -32377,6 +34570,7 @@ "status": "Poster", "track": "main", "pid": 11265, + "author_site": "Ajay Jain; Matthew Tancik; Pieter Abbeel", "author": "Ajay Jain; Matthew Tancik; Pieter Abbeel", "abstract": "We present DietNeRF, a 3D neural 
scene representation estimated from a few images. Neural Radiance Fields (NeRF) learn a continuous volumetric representation of a scene through multi-view consistency, and can be rendered from novel viewpoints by ray casting. While NeRF has an impressive ability to reconstruct geometry and fine details given many images, up to 100 for challenging 360 degree scenes, it often finds a degenerate solution to its image reconstruction objective when only a few input views are available. To improve few-shot quality, we propose DietNeRF. We introduce an auxiliary semantic consistency loss that encourages realistic renderings at novel poses. DietNeRF is trained on individual scenes to (1) correctly render given input views from the same pose, and (2) match high-level semantic attributes across different, random poses. Our semantic loss allows us to supervise DietNeRF from arbitrary poses. We extract these semantics using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse single-view, 2D photographs mined from the web with natural language supervision. In experiments, DietNeRF improves the perceptual quality of few-shot view synthesis when learned from scratch, can render novel views with as few as one observed image when pre-trained on a multi-view dataset, and produces plausible completions of completely unobserved regions. 
Our project website is available at https://www.ajayj.com/dietnerf.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jain_Putting_NeRF_on_a_Diet_Semantically_Consistent_Few-Shot_View_Synthesis_ICCV_2021_paper.pdf", @@ -32400,7 +34594,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jain_2021_ICCV,\n \n author = {\n Jain,\n Ajay and Tancik,\n Matthew and Abbeel,\n Pieter\n},\n title = {\n Putting NeRF on a Diet: Semantically Consistent Few-Shot View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5885-5894\n} \n}" }, { "title": "PyMAF: 3D Human Pose and Shape Regression With Pyramidal Mesh Alignment Feedback Loop", @@ -32408,6 +34603,7 @@ "status": "Poster", "track": "main", "pid": 3358, + "author_site": "Hongwen Zhang; Yating Tian; Xinchi Zhou; Wanli Ouyang; Yebin Liu; Limin Wang; Zhenan Sun", "author": "Hongwen Zhang; Yating Tian; Xinchi Zhou; Wanli Ouyang; Yebin Liu; Limin Wang; Zhenan Sun", "abstract": "Regression-based methods have recently shown promising results in reconstructing human meshes from monocular images. By directly mapping raw pixels to model parameters, these methods can produce parametric models in a feed-forward manner via neural networks. However, minor deviation in parameters may lead to noticeable misalignment between the estimated meshes and image evidences. To address this issue, we propose a Pyramidal Mesh Alignment Feedback (PyMAF) loop to leverage a feature pyramid and rectify the predicted parameters explicitly based on the mesh-image alignment status in our deep regressor. 
In PyMAF, given the currently predicted parameters, mesh-aligned evidences will be extracted from finer-resolution features accordingly and fed back for parameter rectification. To reduce noise and enhance the reliability of these evidences, an auxiliary pixel-wise supervision is imposed on the feature encoder, which provides mesh-image correspondence guidance for our network to preserve the most related information in spatial features. The efficacy of our approach is validated on several benchmarks, including Human3.6M, 3DPW, LSP, and COCO, where experimental results show that our approach consistently improves the mesh-image alignment of the reconstruction. The project page with code and video results can be found at https://hongwenzhang.github.io/pymaf.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_PyMAF_3D_Human_Pose_and_Shape_Regression_With_Pyramidal_Mesh_ICCV_2021_paper.pdf", @@ -32422,7 +34618,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_PyMAF_3D_Human_Pose_and_Shape_Regression_With_Pyramidal_Mesh_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_PyMAF_3D_Human_Pose_and_Shape_Regression_With_Pyramidal_Mesh_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Hongwen and Tian,\n Yating and Zhou,\n Xinchi and Ouyang,\n Wanli and Liu,\n Yebin and Wang,\n Limin and Sun,\n Zhenan\n},\n title = {\n PyMAF: 3D Human Pose and Shape Regression With Pyramidal Mesh Alignment Feedback Loop\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11446-11456\n} \n}" }, { "title": "Pyramid Architecture Search for Real-Time Image Deblurring", @@ -32430,6 +34627,7 @@ "status": "Poster", "track": "main", "pid": 3516, + "author_site": "Xiaobin Hu; Wenqi Ren; Kaicheng Yu; Kaihao 
Zhang; Xiaochun Cao; Wei Liu; Bjoern Menze", "author": "Xiaobin Hu; Wenqi Ren; Kaicheng Yu; Kaihao Zhang; Xiaochun Cao; Wei Liu; Bjoern Menze", "abstract": "Multi-scale and multi-patch deep models have been shown effective in removing blurs of dynamic scenes. However, these methods still have one major obstacle: manually designing a lightweight and high-efficiency network is challenging and time-consuming. To tackle this problem, we propose a novel deblurring method, dubbed PyNAS (pyramid neural architecture search network), towards automatically designing hyper-parameters including the scales, patches, and standard cell operators. The proposed PyNAS adopts gradient-based search strategies and innovatively searches the hierarchy patch and scale scheme not limited to the cell searching. Specifically, we introduce a hierarchical search strategy tailored for the multi-scale and multi-patch deblurring task. The strategy follows the principle that the first distinguishes between the top-level (pyramid-scales and pyramid-patches) and bottom-level variables (cell operators) and then searches multi-scale variables using the top-to-bottom principle. During the search stage, PyNAS employs an early stopping strategy to avoid the collapse and computational issue. Furthermore, we use a path-level binarization mechanism for multi-scale cell searching to save memory consumption. 
Our model is a real-time deblurring algorithm (around 58 fps) for 720p images while achieves state-of-the-art deblurring performance on the GoPro and Video Deblurring dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_Pyramid_Architecture_Search_for_Real-Time_Image_Deblurring_ICCV_2021_paper.pdf", @@ -32446,14 +34644,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_Pyramid_Architecture_Search_for_Real-Time_Image_Deblurring_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;3+4;5;6;4;0+1", - "aff_unique_norm": "Technical University of Munich;University of Zurich;Institute of Information Engineering, Chinese Academy of Sciences;EPFL;Abacus.AI;Australian National University;Tencent", + "aff_unique_norm": "Technical University of Munich;University of Zurich;Institute of Information Engineering, Chinese Academy of Sciences;École Polytechnique Fédérale de Lausanne;Abacus.AI;Australian National University;Tencent", "aff_unique_dep": "Department of Informatics;Quantitative Biomedicine;SKLOIS (State Key Laboratory of Information Security);CVLab;;;Data Platform", "aff_unique_url": "https://www.tum.de;https://www.uzh.ch;http://www.iie.cas.cn;https://cvlab.epfl.ch;https://www.abacus.ai;https://www.anu.edu.au;https://www.tencent.com", "aff_unique_abbr": "TUM;UZH;IIE, CAS;EPFL;Abacus.AI;ANU;Tencent", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Munich;", "aff_country_unique_index": "0+1;2;1+3;4;2;3;0+1", - "aff_country_unique": "Germany;Switzerland;China;United States;Australia" + "aff_country_unique": "Germany;Switzerland;China;United States;Australia", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Xiaobin and Ren,\n Wenqi and Yu,\n Kaicheng and Zhang,\n Kaihao and Cao,\n Xiaochun and Liu,\n Wei and Menze,\n Bjoern\n},\n title = {\n Pyramid Architecture Search for Real-Time Image Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4298-4307\n} \n}" }, { "title": "Pyramid Point Cloud Transformer for Large-Scale Place Recognition", @@ -32461,6 +34660,7 @@ "status": "Poster", "track": "main", "pid": 2967, + "author_site": "Le Hui; Hang Yang; Mingmei Cheng; Jin Xie; Jian Yang", "author": "Le Hui; Hang Yang; Mingmei Cheng; Jin Xie; Jian Yang", "abstract": "Recently, deep learning based point cloud descriptors have achieved impressive results in the place recognition task. Nonetheless, due to the sparsity of point clouds, how to extract discriminative local features of point clouds to efficiently form a global descriptor is still a challenging problem. In this paper, we propose a pyramid point cloud transformer network (PPT-Net) to learn the discriminative global descriptors from point clouds for efficient retrieval. Specifically, we first develop a pyramid point transformer module that adaptively learns the spatial relationship of the different local k-NN graphs of point clouds, where the grouped self-attention is proposed to extract discriminative local features of the point clouds. Furthermore, the grouped self-attention not only enhances long-term dependencies of the point clouds, but also reduces the computational cost. In order to obtain discriminative global descriptors, we construct a pyramid VLAD module to aggregate the multi-scale feature maps of point clouds into the global descriptors. By applying VLAD pooling on multi-scale feature maps, we utilize the context gating mechanism on the multiple global descriptors to adaptively weight the multi-scale global context information into the final global descriptor. Experimental results on the Oxford dataset and three in-house datasets show that our method achieves the state-of-the-art on the point cloud based place recognition task. 
Code is available at https://github.com/fpthink/PPT-Net.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hui_Pyramid_Point_Cloud_Transformer_for_Large-Scale_Place_Recognition_ICCV_2021_paper.pdf", @@ -32484,7 +34684,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hui_2021_ICCV,\n \n author = {\n Hui,\n Le and Yang,\n Hang and Cheng,\n Mingmei and Xie,\n Jin and Yang,\n Jian\n},\n title = {\n Pyramid Point Cloud Transformer for Large-Scale Place Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6098-6107\n} \n}" }, { "title": "Pyramid R-CNN: Towards Better Performance and Adaptability for 3D Object Detection", @@ -32492,10 +34693,11 @@ "status": "Poster", "track": "main", "pid": 3822, + "author_site": "Jiageng Mao; Minzhe Niu; Haoyue Bai; Xiaodan Liang; Hang Xu; Chunjing Xu", "author": "Jiageng Mao; Minzhe Niu; Haoyue Bai; Xiaodan Liang; Hang Xu; Chunjing Xu", "abstract": "We present a flexible and high-performance framework, named Pyramid R-CNN, for two-stage 3D object detection from point clouds. Current approaches generally rely on the points or voxels of interest for RoI feature extraction on the second stage, but cannot effectively handle the sparsity and non-uniform distribution of those points, and this may result in failures in detecting objects that are far away. To resolve the problems, we propose a novel second-stage module, named pyramid RoI head, to adaptively learn the features from the sparse points of interest. The pyramid RoI head consists of three key components. Firstly, we propose the RoI-grid Pyramid, which addresses the sparsity problem by extensively collecting points of interest for each RoI in a pyramid manner. 
Secondly, we propose RoI-grid Attention, a new operation that can encode richer information from sparse points by incorporating conventional attention-based and graph-based point operators into a unified formulation. Thirdly, we propose the Density-Aware Radius Prediction (DARP) module, which can adapt to different point density levels by dynamically adjusting the focusing range of RoIs. Combining the three components, our pyramid RoI head is robust to the sparse and imbalanced circumstances, and can be applied upon various 3D backbones to consistently boost the detection performance. Extensive experiments show that Pyramid R-CNN outperforms the state-of-the-art 3D detection models by a large margin on both the KITTI dataset and the Waymo Open dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mao_Pyramid_R-CNN_Towards_Better_Performance_and_Adaptability_for_3D_Object_ICCV_2021_paper.pdf", - "aff": "The Chinese University of Hong Kong; Huawei Noah\u2019s Ark Lab; HKUST; Sun Yat-Sen University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "The Chinese University of Hong Kong; Huawei Noah’s Ark Lab; HKUST; Sun Yat-Sen University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -32508,14 +34710,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mao_Pyramid_R-CNN_Towards_Better_Performance_and_Adaptability_for_3D_Object_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;1;1", - "aff_unique_norm": "Chinese University of Hong Kong;Huawei;Hong Kong University of Science and Technology;Sun Yat-sen University", - "aff_unique_dep": ";Noah\u2019s Ark Lab;;", + "aff_unique_norm": "The Chinese University of Hong Kong;Huawei;Hong Kong University of Science and Technology;Sun Yat-Sen University", + "aff_unique_dep": ";Noah’s Ark Lab;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.huawei.com;https://www.ust.hk;http://www.sysu.edu.cn/", "aff_unique_abbr": 
"CUHK;Huawei;HKUST;SYSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Mao_2021_ICCV,\n \n author = {\n Mao,\n Jiageng and Niu,\n Minzhe and Bai,\n Haoyue and Liang,\n Xiaodan and Xu,\n Hang and Xu,\n Chunjing\n},\n title = {\n Pyramid R-CNN: Towards Better Performance and Adaptability for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2723-2732\n} \n}" }, { "title": "Pyramid Spatial-Temporal Aggregation for Video-Based Person Re-Identification", @@ -32523,6 +34726,7 @@ "status": "Poster", "track": "main", "pid": 5590, + "author_site": "Yingquan Wang; Pingping Zhang; Shang Gao; Xia Geng; Hu Lu; Dong Wang", "author": "Yingquan Wang; Pingping Zhang; Shang Gao; Xia Geng; Hu Lu; Dong Wang", "abstract": "Video-based person re-identification aims to associate the video clips of the same person across multiple non-overlapping cameras. Spatial-temporal representations can provide richer and complementary information between frames, which are crucial to distinguish the target person when occlusion occurs. This paper proposes a novel Pyramid Spatial-Temporal Aggregation (PSTA) framework to aggregate the frame-level features progressively and fuse the hierarchical temporal features into a final video-level representation. Thus, short-term and long-term temporal information could be well exploited by different hierarchies. Furthermore, a Spatial-Temporal Aggregation Module (STAM) is proposed to enhance the aggregation capability of PSTA. It mainly consists of two novel attention blocks: Spatial Reference Attention (SRA) and Temporal Reference Attention (TRA). SRA explores the spatial correlations within a frame to determine the attention weight of each location. 
While TRA extends SRA with the correlations between adjacent frames, temporal consistency information can be fully explored to suppress the interference features and strengthen the discriminative ones. Extensive experiments on several challenging benchmarks demonstrate the effectiveness of the proposed PSTA, and our full model reaches 91.5% and 98.3% Rank-1 accuracy on MARS and DukeMTMC-VID benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Pyramid_Spatial-Temporal_Aggregation_for_Video-Based_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -32546,7 +34750,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Dalian", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yingquan and Zhang,\n Pingping and Gao,\n Shang and Geng,\n Xia and Lu,\n Hu and Wang,\n Dong\n},\n title = {\n Pyramid Spatial-Temporal Aggregation for Video-Based Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12026-12035\n} \n}" }, { "title": "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction Without Convolutions", @@ -32554,6 +34759,7 @@ "status": "Poster", "track": "main", "pid": 2578, + "author_site": "Wenhai Wang; Enze Xie; Xiang Li; Deng-Ping Fan; Kaitao Song; Ding Liang; Tong Lu; Ping Luo; Ling Shao", "author": "Wenhai Wang; Enze Xie; Xiang Li; Deng-Ping Fan; Kaitao Song; Ding Liang; Tong Lu; Ping Luo; Ling Shao", "abstract": "Although convolutional neural networks (CNNs) have achieved great success in computer vision, this work investigates a simpler, convolution-free backbone network useful for many dense prediction tasks. 
Unlike the recently-proposed Vision Transformer (ViT) that was designed for image classification specifically, we introduce the Pyramid Vision Transformer (PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. PVT has several merits compared to current state of the arts. (1) Different from ViT that typically yields low-resolution outputs and incurs high computational and memory costs, PVT not only can be trained on dense partitions of an image to achieve high output resolution, which is important for dense prediction, but also uses a progressive shrinking pyramid to reduce the computations of large feature maps. (2) PVT inherits the advantages of both CNN and Transformer, making it a unified backbone for various vision tasks without convolutions, where it can be used as a direct replacement for CNN backbones. (3) We validate PVT through extensive experiments, showing that it boosts the performance of many downstream tasks, including object detection, instance and semantic segmentation. For example, with a comparable number of parameters, PVT+RetinaNet achieves 40.4 AP on the COCO dataset, surpassing ResNet50+RetinNet (36.3 AP) by 4.1 absolute AP. 
We hope that PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Pyramid_Vision_Transformer_A_Versatile_Backbone_for_Dense_Prediction_Without_ICCV_2021_paper.pdf", @@ -32570,14 +34776,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Pyramid_Vision_Transformer_A_Versatile_Backbone_for_Dense_Prediction_Without_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4;4;0;1;3", - "aff_unique_norm": "Nanjing University;University of Hong Kong;Nanjing University of Science and Technology;International Institute of Artificial Intelligence;SenseTime", + "aff_unique_norm": "Nanjing University;The University of Hong Kong;Nanjing University of Science and Technology;International Institute of Artificial Intelligence;SenseTime", "aff_unique_dep": ";;;;SenseTime Research", "aff_unique_url": "https://www.nju.edu.cn;https://www.hku.hk;http://www.nust.edu.cn/;https://www.iiai.org;https://www.sensetime.com", "aff_unique_abbr": "Nanjing U;HKU;NUST;IIAI;SenseTime", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;1;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Wenhai and Xie,\n Enze and Li,\n Xiang and Fan,\n Deng-Ping and Song,\n Kaitao and Liang,\n Ding and Lu,\n Tong and Luo,\n Ping and Shao,\n Ling\n},\n title = {\n Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction Without Convolutions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 568-578\n} \n}" }, { "title": "Q-Match: Iterative Shape Matching via Quantum Annealing", @@ -32585,7 +34792,8 @@ "status": "Poster", "track": "main", "pid": 10365, - 
"author": "Marcel Seelbach Benkner; Zorah L\u00e4hner; Vladislav Golyanik; Christof Wunderlich; Christian Theobalt; Michael Moeller", + "author_site": "Marcel Seelbach Benkner; Zorah Lähner; Vladislav Golyanik; Christof Wunderlich; Christian Theobalt; Michael Moeller", + "author": "Marcel Seelbach Benkner; Zorah Lähner; Vladislav Golyanik; Christof Wunderlich; Christian Theobalt; Michael Moeller", "abstract": "Finding shape correspondences can be formulated as an NP-hard quadratic assignment problem (QAP) that becomes infeasible for shapes with high sampling density. A promising research direction is to tackle such quadratic optimization problems over binary variables with quantum annealing, which allows for some problems a more efficient search in the solution space. Unfortunately, enforcing the linear equality constraints in QAPs via a penalty significantly limits the success probability of such methods on currently available quantum hardware. To address this limitation, this paper proposes Q-Match, i.e., a new iterative quantum method for QAPs inspired by the alpha-expansion algorithm, which allows solving problems of an order of magnitude larger than current quantum methods. It implicitly enforces the QAP constraints by updating the current estimates in a cyclic fashion. Further, Q-Match can be applied iteratively, on a subset of well-chosen correspondences, allowing us to scale to real-world problems. 
Using the latest quantum annealer, the D-Wave Advantage, we evaluate the proposed method on a subset of QAPLIB as well as on isometric shape matching problems from the FAUST dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Benkner_Q-Match_Iterative_Shape_Matching_via_Quantum_Annealing_ICCV_2021_paper.pdf", "aff": "University of Siegen; University of Siegen; MPI for Informatics, SIC; University of Siegen; MPI for Informatics, SIC; University of Siegen", @@ -32602,13 +34810,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Benkner_Q-Match_Iterative_Shape_Matching_via_Quantum_Annealing_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;1;0", "aff_unique_norm": "University of Siegen;Max Planck Institute for Informatics", - "aff_unique_dep": ";SIC", + "aff_unique_dep": ";MPI for Informatics", "aff_unique_url": "https://www.uni-siegen.de;https://www.mpi-inf.mpg.de", "aff_unique_abbr": "Uni Siegen;MPII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Benkner_2021_ICCV,\n \n author = {\n Benkner,\n Marcel Seelbach and L\\"ahner,\n Zorah and Golyanik,\n Vladislav and Wunderlich,\n Christof and Theobalt,\n Christian and Moeller,\n Michael\n},\n title = {\n Q-Match: Iterative Shape Matching via Quantum Annealing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7586-7596\n} \n}" }, { "title": "Query Adaptive Few-Shot Object Detection With Heterogeneous Graph Convolutional Networks", @@ -32616,6 +34825,7 @@ "status": "Poster", "track": "main", "pid": 3620, + "author_site": "Guangxing Han; Yicheng He; Shiyuan Huang; Jiawei Ma; Shih-Fu Chang", "author": "Guangxing Han; Yicheng He; Shiyuan Huang; Jiawei Ma; Shih-Fu Chang", "abstract": "Few-shot object detection (FSOD) aims to 
detect never-seen objects using few examples. This field sees recent improvement owing to the meta-learning techniques by learning how to match between the query image and few-shot class examples, such that the learned model can generalize to few-shot novel classes. However, currently, most of the meta-learning-based methods perform parwise matching between query image regions (usually proposals) and novel classes separately, therefore failing to take into account multiple relationships among them. In this paper, we propose a novel FSOD model using heterogeneous graph convolutional networks. Through efficient message passing among all the proposal and class nodes with three different types of edges, we could obtain context-aware proposal features and query-adaptive, multiclass-enhanced prototype representations for each class, which could help promote the pairwise matching and improve final FSOD accuracy. Extensive experimental results show that our proposed model, denoted as QA-FewDet, outperforms the current state-of-the-art approaches on the PASCAL VOC and MSCOCO FSOD benchmarks under different shots and evaluation metrics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Han_Query_Adaptive_Few-Shot_Object_Detection_With_Heterogeneous_Graph_Convolutional_Networks_ICCV_2021_paper.pdf", @@ -32639,7 +34849,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Han_2021_ICCV,\n \n author = {\n Han,\n Guangxing and He,\n Yicheng and Huang,\n Shiyuan and Ma,\n Jiawei and Chang,\n Shih-Fu\n},\n title = {\n Query Adaptive Few-Shot Object Detection With Heterogeneous Graph Convolutional Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3263-3272\n} \n}" }, { "title": "R-MSFM: 
Recurrent Multi-Scale Feature Modulation for Monocular Depth Estimating", @@ -32647,6 +34858,7 @@ "status": "Poster", "track": "main", "pid": 2602, + "author_site": "Zhongkai Zhou; Xinnan Fan; Pengfei Shi; Yuanxue Xin", "author": "Zhongkai Zhou; Xinnan Fan; Pengfei Shi; Yuanxue Xin", "abstract": "In this paper, we propose Recurrent Multi-Scale Feature Modulation (R-MSFM), a new deep network architecture for self-supervised monocular depth estimation. R-MSFM extracts per-pixel features, builds a multi-scale feature modulation module, and iteratively updates an inverse depth through a parameter-shared decoder at the fixed resolution. This architecture enables our R-MSFM to maintain semantically richer while spatially more precise representations and avoid the error propagation caused by the traditional U-Net-like coarse-to-fine architecture widely used in this domain, resulting in strong generalization and efficient parameter count. Experimental results demonstrate the superiority of our proposed R-MSFM both at model size and inference speed, and show the state-of-the-art results on the KITTI benchmark. 
Code is available at https://github.com/jsczzzk/R-MSFM", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_R-MSFM_Recurrent_Multi-Scale_Feature_Modulation_for_Monocular_Depth_Estimating_ICCV_2021_paper.pdf", @@ -32661,7 +34873,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_R-MSFM_Recurrent_Multi-Scale_Feature_Modulation_for_Monocular_Depth_Estimating_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_R-MSFM_Recurrent_Multi-Scale_Feature_Modulation_for_Monocular_Depth_Estimating_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Zhongkai and Fan,\n Xinnan and Shi,\n Pengfei and Xin,\n Yuanxue\n},\n title = {\n R-MSFM: Recurrent Multi-Scale Feature Modulation for Monocular Depth Estimating\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12777-12786\n} \n}" }, { "title": "R-SLAM: Optimizing Eye Tracking From Rolling Shutter Video of the Retina", @@ -32669,6 +34882,7 @@ "status": "Poster", "track": "main", "pid": 9822, + "author_site": "Jay Shenoy; James Fong; Jeffrey Tan; Austin Roorda; Ren Ng", "author": "Jay Shenoy; James Fong; Jeffrey Tan; Austin Roorda; Ren Ng", "abstract": "We present a method for optimization-based recovery of eye motion from rolling shutter video of the retina. Our approach formulates eye tracking as an optimization problem that jointly estimates the retina's motion and appearance using convex optimization and a constrained version of gradient descent. By incorporating the rolling shutter imaging model into the formulation of our joint optimization, we achieve state-of-the-art accuracy both offline and in real-time. 
We apply our method to retina video captured with an adaptive optics scanning laser ophthalmoscope (AOSLO), demonstrating eye tracking at 1 kHz with accuracies below one arcminute -- over an order of magnitude higher than conventional eye tracking systems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shenoy_R-SLAM_Optimizing_Eye_Tracking_From_Rolling_Shutter_Video_of_the_ICCV_2021_paper.pdf", @@ -32692,7 +34906,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shenoy_2021_ICCV,\n \n author = {\n Shenoy,\n Jay and Fong,\n James and Tan,\n Jeffrey and Roorda,\n Austin and Ng,\n Ren\n},\n title = {\n R-SLAM: Optimizing Eye Tracking From Rolling Shutter Video of the Retina\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4852-4861\n} \n}" }, { "title": "RAIN: Reinforced Hybrid Attention Inference Network for Motion Forecasting", @@ -32700,6 +34915,7 @@ "status": "Poster", "track": "main", "pid": 6650, + "author_site": "Jiachen Li; Fan Yang; Hengbo Ma; Srikanth Malla; Masayoshi Tomizuka; Chiho Choi", "author": "Jiachen Li; Fan Yang; Hengbo Ma; Srikanth Malla; Masayoshi Tomizuka; Chiho Choi", "abstract": "Motion forecasting plays a significant role in various domains (e.g., autonomous driving, human-robot interaction), which aims to predict future motion sequences given a set of historical observations. However, the observed elements may be of different levels of importance. Some information may be irrelevant or even distracting to the forecasting in certain situations. To address this issue, we propose a generic motion forecasting framework (named RAIN) with dynamic key information selection and ranking based on a hybrid attention mechanism. 
The general framework is instantiated to handle multi-agent trajectory prediction and human motion forecasting tasks, respectively. In the former task, the model learns to recognize the relations between agents with a graph representation and to determine their relative significance. In the latter task, the model learns to capture the temporal proximity and dependency in long-term human motions. We also propose an effective double-stage training pipeline with an alternating training strategy to optimize the parameters in different modules of the framework. We validate the framework on both synthetic simulations and motion forecasting benchmarks in different domains, demonstrating that our method not only achieves state-of-the-art forecasting performance but also provides interpretable and reasonable hybrid attention weights.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_RAIN_Reinforced_Hybrid_Attention_Inference_Network_for_Motion_Forecasting_ICCV_2021_paper.pdf", @@ -32723,7 +34939,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Jiachen and Yang,\n Fan and Ma,\n Hengbo and Malla,\n Srikanth and Tomizuka,\n Masayoshi and Choi,\n Chiho\n},\n title = {\n RAIN: Reinforced Hybrid Attention Inference Network for Motion Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16096-16106\n} \n}" }, { "title": "RANK-NOSH: Efficient Predictor-Based Architecture Search via Non-Uniform Successive Halving", @@ -32731,6 +34948,7 @@ "status": "Poster", "track": "main", "pid": 9419, + "author_site": "Ruochen Wang; Xiangning Chen; Minhao Cheng; Xiaocheng Tang; Cho-Jui Hsieh", "author": "Ruochen Wang; Xiangning Chen; Minhao 
Cheng; Xiaocheng Tang; Cho-Jui Hsieh", "abstract": "Predictor-based algorithms have achieved remarkable performance in the Neural Architecture Search (NAS) tasks. However, these methods suffer from high computation costs, as training the performance predictor usually requires training and evaluating hundreds of architectures from scratch. Previous works along this line mainly focus on reducing the number of architectures required to fit the predictor. In this work, we tackle this challenge from a different perspective - improve search efficiency by cutting down the computation budget of architecture training. We propose NOn-uniform Successive Halving (NOSH), a hierarchical scheduling algorithm that terminates the training of underperforming architectures early to avoid wasting budget. To effectively leverage the non-uniform supervision signals produced by NOSH, we formulate predictor-based architecture search as learning to rank with pairwise comparisons. The resulting method - RANK-NOSH, reduces the search budget by 5x while achieving competitive or even better performance than previous state-of-the-art predictor-based methods on various spaces and datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_RANK-NOSH_Efficient_Predictor-Based_Architecture_Search_via_Non-Uniform_Successive_Halving_ICCV_2021_paper.pdf", @@ -32754,7 +34972,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Ruochen and Chen,\n Xiangning and Cheng,\n Minhao and Tang,\n Xiaocheng and Hsieh,\n Cho-Jui\n},\n title = {\n RANK-NOSH: Efficient Predictor-Based Architecture Search via Non-Uniform Successive Halving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2021\n},\n pages = {\n 10377-10386\n} \n}" }, { "title": "RDA: Robust Domain Adaptation via Fourier Adversarial Attacking", @@ -32762,6 +34981,7 @@ "status": "Poster", "track": "main", "pid": 2530, + "author_site": "Jiaxing Huang; Dayan Guan; Aoran Xiao; Shijian Lu", "author": "Jiaxing Huang; Dayan Guan; Aoran Xiao; Shijian Lu", "abstract": "Unsupervised domain adaptation (UDA) involves a supervised loss in a labeled source domain and an unsupervised loss in an unlabeled target domain, which often faces more severe overfitting (than classical supervised learning) as the supervised source loss has clear domain gap and the unsupervised target loss is often noisy due to the lack of annotations. This paper presents RDA, a robust domain adaptation technique that introduces adversarial attacking to mitigate overfitting in UDA. We achieve robust domain adaptation by a novel Fourier adversarial attacking (FAA) method that allows large magnitude of perturbation noises but has minimal modification of image semantics, the former is critical to the effectiveness of its generated adversarial samples due to the existence of domain gaps. Specifically, FAA decomposes images into multiple frequency components (FCs) and generates adversarial samples by just perturbating certain FCs that capture little semantic information. With FAA-generated samples, the training can continue the random walk and drift into an area with a flat loss landscape, leading to more robust domain adaptation. 
Extensive experiments over multiple domain adaptation tasks show that RDA can work with different computer vision tasks with superior performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_RDA_Robust_Domain_Adaptation_via_Fourier_Adversarial_Attacking_ICCV_2021_paper.pdf", @@ -32785,7 +35005,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Jiaxing and Guan,\n Dayan and Xiao,\n Aoran and Lu,\n Shijian\n},\n title = {\n RDA: Robust Domain Adaptation via Fourier Adversarial Attacking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8988-8999\n} \n}" }, { "title": "RDI-Net: Relational Dynamic Inference Networks", @@ -32793,6 +35014,7 @@ "status": "Poster", "track": "main", "pid": 2913, + "author_site": "Huanyu Wang; Songyuan Li; Shihao Su; Zequn Qin; Xi Li", "author": "Huanyu Wang; Songyuan Li; Shihao Su; Zequn Qin; Xi Li", "abstract": "Dynamic inference networks, aimed at promoting computational efficiency, go along an adaptive executing path for a given sample. Prevalent methods typically assign a router for each convolutional block and sequentially make block-by-block executing decisions, without considering the relations during the dynamic inference. In this paper, we model the relations for dynamic inference from two aspects: the routers and the samples. We design a novel type of router called the relational router to model the relations among routers for a given sample. In principle, the current relational router aggregates the contextual features of preceding routers by graph convolution and propagates its router features to subsequent ones, making the executing decision for the current block in a long-range manner. 
Furthermore, we model the relation between samples by introducing a Sample Relation Module (SRM), encouraging correlated samples to go along correlated executing paths. As a whole, we call our method the Relational Dynamic Inference Network (RDI-Net). Extensive experiments on CIFAR-10/100 and ImageNet show that RDI-Net achieves state-of-the-art performance and computational cost reduction. Our code and models will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_RDI-Net_Relational_Dynamic_Inference_Networks_ICCV_2021_paper.pdf", @@ -32816,7 +35038,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Huanyu and Li,\n Songyuan and Su,\n Shihao and Qin,\n Zequn and Li,\n Xi\n},\n title = {\n RDI-Net: Relational Dynamic Inference Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4621-4630\n} \n}" }, { "title": "RECALL: Replay-Based Continual Learning in Semantic Segmentation", @@ -32824,6 +35047,7 @@ "status": "Poster", "track": "main", "pid": 1687, + "author_site": "Andrea Maracani; Umberto Michieli; Marco Toldo; Pietro Zanuttigh", "author": "Andrea Maracani; Umberto Michieli; Marco Toldo; Pietro Zanuttigh", "abstract": "Deep networks allow to obtain outstanding results in semantic segmentation, however they need to be trained in a single shot with a large amount of data. Continual learning settings where new classes are learned in incremental steps and previous training data is no longer available are challenging due to the catastrophic forgetting phenomenon. Existing approaches typically fail when several incremental steps are performed or in presence of a distribution shift of the background class. 
We tackle these issues by recreating no longer available data for the old classes and outlining a content inpainting scheme on the background class. We propose two sources for replay data. The first resorts to a generative adversarial network to sample from the class space of past learning steps. The second relies on web-crawled data to retrieve images containing examples of old classes from online databases. In both scenarios no samples of past steps are stored, thus avoiding privacy concerns. Replay data are then blended with new samples during the incremental steps. Our approach, RECALL, outperforms state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Maracani_RECALL_Replay-Based_Continual_Learning_in_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -32847,7 +35071,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Maracani_2021_ICCV,\n \n author = {\n Maracani,\n Andrea and Michieli,\n Umberto and Toldo,\n Marco and Zanuttigh,\n Pietro\n},\n title = {\n RECALL: Replay-Based Continual Learning in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7026-7035\n} \n}" }, { "title": "RFNet: Recurrent Forward Network for Dense Point Cloud Completion", @@ -32855,6 +35080,7 @@ "status": "Poster", "track": "main", "pid": 7825, + "author_site": "Tianxin Huang; Hao Zou; Jinhao Cui; Xuemeng Yang; Mengmeng Wang; Xiangrui Zhao; Jiangning Zhang; Yi Yuan; Yifan Xu; Yong Liu", "author": "Tianxin Huang; Hao Zou; Jinhao Cui; Xuemeng Yang; Mengmeng Wang; Xiangrui Zhao; Jiangning Zhang; Yi Yuan; Yifan Xu; Yong Liu", "abstract": "Point cloud completion is an interesting and challenging task in 3D vision, aiming to recover complete shapes from sparse and incomplete point 
clouds. Existing learning-based methods often require vast computation cost to achieve excellent performance, which limits their practical applications. In this paper, we propose a novel Recurrent Forward Network (RFNet), which is composed of three modules: Recurrent Feature Extraction (RFE), Forward Dense Completion (FDC) and Raw Shape Protection (RSP). The RFE extracts multiple global features from the incomplete point clouds for different recurrent levels, and the FDC generates point clouds in a coarse-to-fine pipeline. The RSP introduces details from the original incomplete models to refine the completion results. Besides, we propose a Sampling Chamfer Distance to better capture the shapes of models and a new Balanced Expansion Constraint to restrict the expansion distances from coarse to fine. According to the experiments on ShapeNet and KITTI, our network can achieve the state-of-the-art with lower memory cost and faster convergence.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_RFNet_Recurrent_Forward_Network_for_Dense_Point_Cloud_Completion_ICCV_2021_paper.pdf", @@ -32871,14 +35097,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_RFNet_Recurrent_Forward_Network_for_Dense_Point_Cloud_Completion_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;1;1;0+0", - "aff_unique_norm": "Zhejiang University;Netease", + "aff_unique_norm": "Zhejiang University;NetEase", "aff_unique_dep": ";Fuxi AI Lab", "aff_unique_url": "https://www.zju.edu.cn;https://www.163.com", "aff_unique_abbr": "ZJU;NetEase", "aff_campus_unique_index": "1", "aff_campus_unique": ";Huzhou", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Tianxin and Zou,\n Hao and Cui,\n Jinhao and Yang,\n Xuemeng and Wang,\n Mengmeng and Zhao,\n Xiangrui and Zhang,\n Jiangning and Yuan,\n Yi and Xu,\n Yifan 
and Liu,\n Yong\n},\n title = {\n RFNet: Recurrent Forward Network for Dense Point Cloud Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12508-12517\n} \n}" }, { "title": "RFNet: Region-Aware Fusion Network for Incomplete Multi-Modal Brain Tumor Segmentation", @@ -32886,6 +35113,7 @@ "status": "Poster", "track": "main", "pid": 2077, + "author_site": "Yuhang Ding; Xin Yu; Yi Yang", "author": "Yuhang Ding; Xin Yu; Yi Yang", "abstract": "Most existing brain tumor segmentation methods usually exploit multi-modal magnetic resonance imaging (MRI) images to achieve high segmentation performance. However, the problem of missing certain modality images often happens in clinical practice, thus leading to severe segmentation performance degradation. In this work, we propose a Region-aware Fusion Network (RFNet) that is able to exploit different combinations of multi-modal data adaptively and effectively for tumor segmentation. Considering different modalities are sensitive to different brain tumor regions, we design a Region-aware Fusion Module (RFM) in RFNet to conduct modal feature fusion from available image modalities according to disparate regions. Benefiting from RFM, RFNet can adaptively segment tumor regions from an incomplete set of multi-modal images by effectively aggregating modal features. Furthermore, we also develop a segmentation-based regularizer to prevent RFNet from the insufficient and unbalanced training caused by the incomplete multi-modal data. Specifically, apart from obtaining segmentation results from fused modal features, we also segment each image modality individually from the corresponding encoded features. In this manner, each modal encoder is forced to learn discriminative features, thus improving the representation ability of the fused features. 
Remarkably, extensive experiments on BRATS2020, BRATS2018 and BRATS2015 datasets demonstrate that our RFNet outperforms the state-of-the-art significantly.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_RFNet_Region-Aware_Fusion_Network_for_Incomplete_Multi-Modal_Brain_Tumor_Segmentation_ICCV_2021_paper.pdf", @@ -32909,7 +35137,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+1;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Yuhang and Yu,\n Xin and Yang,\n Yi\n},\n title = {\n RFNet: Region-Aware Fusion Network for Incomplete Multi-Modal Brain Tumor Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3975-3984\n} \n}" }, { "title": "RGB-D Saliency Detection via Cascaded Mutual Information Minimization", @@ -32917,6 +35146,7 @@ "status": "Poster", "track": "main", "pid": 3167, + "author_site": "Jing Zhang; Deng-Ping Fan; Yuchao Dai; Xin Yu; Yiran Zhong; Nick Barnes; Ling Shao", "author": "Jing Zhang; Deng-Ping Fan; Yuchao Dai; Xin Yu; Yiran Zhong; Nick Barnes; Ling Shao", "abstract": "Existing RGB-D saliency detection models do not explicitly encourage RGB and depth to achieve effective multi-modal learning. In this paper, we introduce a novel multi-stage cascaded learning framework via mutual information minimization to explicitly model the multi-modal information between RGB image and depth data. Specifically, we first map the feature of each mode to a lower dimensional feature vector, and adopt mutual information minimization as a regularizer to reduce the redundancy between appearance features from RGB and geometric features from depth. 
We then perform multi-stage cascaded learning to impose the mutual information minimization constraint at every stage of the network. Extensive experiments on benchmark RGB-D saliency datasets illustrate the effectiveness of our framework. Further, to prosper the development of this field, we contribute the largest (7x larger than NJU2K) COME20K dataset, which contains 15,625 image pairs with high quality polygon-/scribble-/object-/instance-/rank-level annotations. Based on these rich labels, we additionally construct four new benchmarks (Code, results, and benchmarks will be made publicly available.) with strong baselines and observe some interesting phenomena, which can motivate future model design.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_RGB-D_Saliency_Detection_via_Cascaded_Mutual_Information_Minimization_ICCV_2021_paper.pdf", @@ -32940,7 +35170,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;2;0;1", - "aff_country_unique": "Australia;United States;China" + "aff_country_unique": "Australia;United States;China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Jing and Fan,\n Deng-Ping and Dai,\n Yuchao and Yu,\n Xin and Zhong,\n Yiran and Barnes,\n Nick and Shao,\n Ling\n},\n title = {\n RGB-D Saliency Detection via Cascaded Mutual Information Minimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4338-4347\n} \n}" }, { "title": "RINDNet: Edge Detection for Discontinuity in Reflectance, Illumination, Normal and Depth", @@ -32948,6 +35179,7 @@ "status": "Poster", "track": "main", "pid": 2989, + "author_site": "Mengyang Pu; Yaping Huang; Qingji Guan; Haibin Ling", "author": "Mengyang Pu; Yaping Huang; Qingji Guan; Haibin Ling", "abstract": "As a fundamental building block in computer vision, edges can be categorised into four types 
according to the discontinuity in surface-Reflectance, Illumination, surface-Normal or Depth. While great progress has been made in detecting generic or individual types of edges, it remains under-explored to comprehensively study all four edge types together. In this paper, we propose a novel neural network solution, RINDNet, to jointly detect all four types of edges. Taking into consideration the distinct attributes of each type of edges and the relationship between them, RINDNet learns effective representations for each of them and works in three stages. In stage I, RINDNet uses a common backbone to extract features shared by all edges. Then in stage II it branches to prepare discriminative features for each edge type by the corresponding decoder. In stage III, an independent decision head for each type aggregates the features from previous stages to predict the initial results. Additionally, an attention module learns attention maps for all types to capture the underlying relations between them, and these maps are combined with initial results to generate the final edge detection results. For training and evaluation, we construct the first public benchmark, BSDS-RIND, with all four types of edges carefully annotated. In our experiments, RINDNet yields promising results in comparison with state-of-the-art methods. 
Additional analysis is presented in supplementary material.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pu_RINDNet_Edge_Detection_for_Discontinuity_in_Reflectance_Illumination_Normal_and_ICCV_2021_paper.pdf", @@ -32964,14 +35196,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Pu_RINDNet_Edge_Detection_for_Discontinuity_in_Reflectance_Illumination_Normal_and_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Beijing Jiao Tong University;Stony Brook University", + "aff_unique_norm": "Beijing Jiaotong University;Stony Brook University", "aff_unique_dep": "Beijing Key Laboratory of Traffic Data Analysis and Mining;Department of Computer Science", "aff_unique_url": "http://www.bjtu.edu.cn;https://www.stonybrook.edu", "aff_unique_abbr": ";SBU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stony Brook", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Pu_2021_ICCV,\n \n author = {\n Pu,\n Mengyang and Huang,\n Yaping and Guan,\n Qingji and Ling,\n Haibin\n},\n title = {\n RINDNet: Edge Detection for Discontinuity in Reflectance,\n Illumination,\n Normal and Depth\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6879-6888\n} \n}" }, { "title": "RMSMP: A Novel Deep Neural Network Quantization Framework With Row-Wise Mixed Schemes and Multiple Precisions", @@ -32979,6 +35212,7 @@ "status": "Poster", "track": "main", "pid": 7280, + "author_site": "Sung-En Chang; Yanyu Li; Mengshu Sun; Weiwen Jiang; Sijia Liu; Yanzhi Wang; Xue Lin", "author": "Sung-En Chang; Yanyu Li; Mengshu Sun; Weiwen Jiang; Sijia Liu; Yanzhi Wang; Xue Lin", "abstract": "This work proposes a novel Deep Neural Network (DNN) quantization framework, namely RMSMP, with a \\underline R ow-wise 
\\underline M ixed-\\underline S cheme and \\underline M ulti-\\underline P recision approach. Specifically, this is the first effort to assign mixed quantization schemes and multiple precisions within layers -- among rows of the DNN weight matrix, for simplified operations in hardware inference, while preserving accuracy. Furthermore, this paper makes a different observation from the prior work that the quantization error does not necessarily exhibit the layer-wise sensitivity, and actually can be mitigated as long as a certain portion of the weights in every layer are in higher precisions. This observation enables layer-wise uniformality in the hardware implementation towards guaranteed inference acceleration, while still enjoying row-wise flexibility of mixed schemes and multiple precisions to boost accuracy. The candidates of schemes and precisions are derived practically and effectively with a highly hardware-informative strategy to reduce the problem search space. With the offline determined ratio of different quantization schemes and precisions for all the layers, the RMSMP quantization algorithm uses Hessian and variance based method to effectively assign schemes and precisions for each row. The proposed RMSMP is tested for the image classification and natural language processing (BERT) applications, and achieves the best accuracy performance among state-of-the-arts under the same equivalent precisions. 
The RMSMP is implemented on FPGA devices, achieving 3.65xspeedup in the end-to-end inference time for ResNet-18 on ImageNet, comparing with the 4-bit Fixed-point baseline.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chang_RMSMP_A_Novel_Deep_Neural_Network_Quantization_Framework_With_Row-Wise_ICCV_2021_paper.pdf", @@ -33002,7 +35236,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chang_2021_ICCV,\n \n author = {\n Chang,\n Sung-En and Li,\n Yanyu and Sun,\n Mengshu and Jiang,\n Weiwen and Liu,\n Sijia and Wang,\n Yanzhi and Lin,\n Xue\n},\n title = {\n RMSMP: A Novel Deep Neural Network Quantization Framework With Row-Wise Mixed Schemes and Multiple Precisions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5251-5260\n} \n}" }, { "title": "RPVNet: A Deep and Efficient Range-Point-Voxel Fusion Network for LiDAR Point Cloud Segmentation", @@ -33010,6 +35245,7 @@ "status": "Poster", "track": "main", "pid": 7430, + "author_site": "Jianyun Xu; Ruixiang Zhang; Jian Dou; Yushi Zhu; Jie Sun; Shiliang Pu", "author": "Jianyun Xu; Ruixiang Zhang; Jian Dou; Yushi Zhu; Jie Sun; Shiliang Pu", "abstract": "Point clouds can be represented in many forms (views), typically, point-based sets, voxel-based cells or range-based images(i.e., panoramic view). The point-based view is geometrically accurate, but it is disordered, which makes it difficult to find local neighbors efficiently. The voxel-based view is regular, but sparse, and computation grows cubicly when voxel resolution increases. The range-based view is regular and generally dense, however spherical projection makes physical dimensions distorted. 
Both voxel- and range-based views suffer from quantization loss, especially for voxels when facing large-scale scenes. In order to utilize different view's advantages and alleviate their own shortcomings in fine-grained segmentation task, we propose a novel range-point-voxel fusion network, namely RPVNet. In this network, we devise a deep fusion framework with multiple and mutual information interactions among these three views, and propose a gated fusion module (termed as GFM), which can adaptively merge the three features based on concurrent inputs. Moreover, the proposed RPV interaction mechanism is highly efficient, and we summarize it to a more general formulation. By leveraging this efficient interaction and relatively lower voxel resolution, our method is also proved to be more efficient. Finally, we evaluated the proposed model on two large-scale datasets, i.e., SemanticKITTI and nuScenes, and it shows state-of-the-art performance on both of them. Note that, our method currently ranks 1st on SemanticKITTI leaderboard without any extra tricks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_RPVNet_A_Deep_and_Efficient_Range-Point-Voxel_Fusion_Network_for_LiDAR_ICCV_2021_paper.pdf", @@ -33033,7 +35269,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Jianyun and Zhang,\n Ruixiang and Dou,\n Jian and Zhu,\n Yushi and Sun,\n Jie and Pu,\n Shiliang\n},\n title = {\n RPVNet: A Deep and Efficient Range-Point-Voxel Fusion Network for LiDAR Point Cloud Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16024-16033\n} \n}" }, { "title": "Radial Distortion Invariant Factorization for Structure From Motion", @@ -33041,7 +35278,8 @@ 
"status": "Poster", "track": "main", "pid": 5817, - "author": "Jos\u00e9 Pedro Iglesias; Carl Olsson", + "author_site": "José Pedro Iglesias; Carl Olsson", + "author": "José Pedro Iglesias; Carl Olsson", "abstract": "Factorization methods are frequently used for structure from motion problems (SfM). In the presence of noise they are able to jointly estimate camera matrices and scene points in overdetermined settings, without the need for accurate initial solutions. While the early formulations were restricted to affine models, recent approaches have been show to work with pinhole cameras by minimizing object space errors. In this paper we propose a factorization approach using the so called radial camera, which is invariant to radial distortion and changes in focal length. Assuming a known principal point our approach can reconstruct the 3D scene in settings with unknown and varying radial distortion and focal length. We show on both real and synthetic data that our approach outperforms state-of-the-art factorization methods under these conditions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Iglesias_Radial_Distortion_Invariant_Factorization_for_Structure_From_Motion_ICCV_2021_paper.pdf", "aff": "Department of Electrical Engineering, Chalmers University of Technology; Department of Electrical Engineering, Chalmers University of Technology + Centre for Mathematical Sciences, Lund University", @@ -33064,7 +35302,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Lund", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Iglesias_2021_ICCV,\n \n author = {\n Iglesias,\n Jos\\'e Pedro and Olsson,\n Carl\n},\n title = {\n Radial Distortion Invariant Factorization for Structure From Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 
5906-5915\n} \n}" }, { "title": "RandomRooms: Unsupervised Pre-Training From Synthetic Shapes and Randomized Layouts for 3D Object Detection", @@ -33072,6 +35311,7 @@ "status": "Poster", "track": "main", "pid": 7576, + "author_site": "Yongming Rao; Benlin Liu; Yi Wei; Jiwen Lu; Cho-Jui Hsieh; Jie Zhou", "author": "Yongming Rao; Benlin Liu; Yi Wei; Jiwen Lu; Cho-Jui Hsieh; Jie Zhou", "abstract": "3D point cloud understanding has made great progress in recent years. However, one major bottleneck is the scarcity of annotated real datasets, especially compared to 2D object detection tasks, since a large amount of labor is involved in annotating the real scans of a scene. A promising solution to this problem is to make better use of the synthetic dataset, which consists of CAD object models, to boost the learning on real datasets. This can be achieved by the pre-training and fine-tuning procedure. However, recent work on 3D pre-training exhibits failure when transfer features learned on synthetic objects to other real-world applications. In this work, we put forward a new method called RandomRooms to accomplish this objective. In particular, we propose to generate random layouts of a scene by making use of the objects in the synthetic CAD dataset and learn the 3D scene representation by applying object-level contrastive learning on two random scenes generated from the same set of synthetic objects. The model pre-trained in this way can serve as a better initialization when later fine-tuning on the 3D object detection task. Empirically, we show consistent improvement in downstream 3D detection tasks on several base models, especially when less training data are used, which strongly demonstrates the effectiveness and generalization of our method. Benefiting from the rich semantic knowledge and diverse objects from synthetic data, our method establishes the new state-of-the-art on widely-used 3D detection benchmarks ScanNetV2 and SUN RGB-D. 
We expect our attempt to provide a new perspective for bridging object and scene-level 3D understanding.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rao_RandomRooms_Unsupervised_Pre-Training_From_Synthetic_Shapes_and_Randomized_Layouts_for_ICCV_2021_paper.pdf", @@ -33095,7 +35335,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1+1;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Rao_2021_ICCV,\n \n author = {\n Rao,\n Yongming and Liu,\n Benlin and Wei,\n Yi and Lu,\n Jiwen and Hsieh,\n Cho-Jui and Zhou,\n Jie\n},\n title = {\n RandomRooms: Unsupervised Pre-Training From Synthetic Shapes and Randomized Layouts for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3283-3292\n} \n}" }, { "title": "RangeDet: In Defense of Range View for LiDAR-Based 3D Object Detection", @@ -33103,10 +35344,11 @@ "status": "Poster", "track": "main", "pid": 3356, + "author_site": "Lue Fan; Xuan Xiong; Feng Wang; Naiyan Wang; ZhaoXiang Zhang", "author": "Lue Fan; Xuan Xiong; Feng Wang; Naiyan Wang; ZhaoXiang Zhang", "abstract": "In this paper, we propose an anchor-free single-stage LiDAR-based 3D object detector -- RangeDet. The most notable difference with previous works is that our method is purely based on the range view representation. Compared with the commonly used voxelized or Bird's Eye View (BEV) representations, the range view representation is more compact and without quantization error. Although there are works adopting it for semantic segmentation, its performance in object detection is largely behind voxelized or BEV counterparts. 
We first analyze the existing range-view-based methods and find two issues overlooked by previous works: 1) the scale variation between nearby and far away objects; 2) the inconsistency between the 2D range image coordinates used in feature extraction and the 3D Cartesian coordinates used in output. Then we deliberately design three components to address these issues in our RangeDet. We test our RangeDet in the large-scale Waymo Open Dataset (WOD). Our best model achieves 72.9/75.9/65.8 3D AP on vehicle/pedestrian/cyclist. These results outperform other range-view-based methods by a large margin, and are overall comparable with the state-of-the-art multi-view-based methods. Codes will be released at https://github.com/TuSimple/RangeDet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_RangeDet_In_Defense_of_Range_View_for_LiDAR-Based_3D_Object_ICCV_2021_paper.pdf", - "aff": "Institute of Automation, Chinese Academy of Sciences (CASIA)+University of Chinese Academy of Sciences (UCAS)+National Laboratory of Pattern Recognition (NLPR)+Centre for Arti\ufb01cial Intelligence and Robotics, HKISI CAS+School of Future Technology, UCAS; TuSimple; TuSimple; TuSimple; Institute of Automation, Chinese Academy of Sciences (CASIA)+University of Chinese Academy of Sciences (UCAS)+National Laboratory of Pattern Recognition (NLPR)+Centre for Arti\ufb01cial Intelligence and Robotics, HKISI CAS", + "aff": "Institute of Automation, Chinese Academy of Sciences (CASIA)+University of Chinese Academy of Sciences (UCAS)+National Laboratory of Pattern Recognition (NLPR)+Centre for Artificial Intelligence and Robotics, HKISI CAS+School of Future Technology, UCAS; TuSimple; TuSimple; TuSimple; Institute of Automation, Chinese Academy of Sciences (CASIA)+University of Chinese Academy of Sciences (UCAS)+National Laboratory of Pattern Recognition (NLPR)+Centre for Artificial Intelligence and Robotics, HKISI CAS", "project": "", "github": "https://github.com/TuSimple/RangeDet", 
"supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Fan_RangeDet_In_Defense_ICCV_2021_supplemental.pdf", @@ -33120,13 +35362,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_RangeDet_In_Defense_of_Range_View_for_LiDAR-Based_3D_Object_ICCV_2021_paper.html", "aff_unique_index": "0+1+2+3+1;4;4;4;0+1+2+3", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;National Laboratory of Pattern Recognition;Hong Kong Institute of Science and Technology;TuSimple", - "aff_unique_dep": "Institute of Automation;;;Centre for Arti\ufb01cial Intelligence and Robotics;", + "aff_unique_dep": "Institute of Automation;;;Centre for Artificial Intelligence and Robotics;", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;http://www.nlpr.com/;;https://www.tusimple.com", "aff_unique_abbr": "CASIA;UCAS;NLPR;HKISI;TuSimple", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0+0+0+0;1;1;1;0+0+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Lue and Xiong,\n Xuan and Wang,\n Feng and Wang,\n Naiyan and Zhang,\n ZhaoXiang\n},\n title = {\n RangeDet: In Defense of Range View for LiDAR-Based 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2918-2927\n} \n}" }, { "title": "Rank & Sort Loss for Object Detection and Instance Segmentation", @@ -33134,6 +35377,7 @@ "status": "Poster", "track": "main", "pid": 2289, + "author_site": "Kemal Oksuz; Baris Can Cam; Emre Akbas; Sinan Kalkan", "author": "Kemal Oksuz; Baris Can Cam; Emre Akbas; Sinan Kalkan", "abstract": "We propose Rank & Sort (RS) Loss, a ranking-based loss function to train deep object detection and instance segmentation methods (i.e. visual detectors). 
RS Loss supervises the classifier, a sub-network of these methods, to rank each positive above all negatives as well as to sort positives among themselves with respect to (wrt.) their localisation qualities (e.g. Intersection-over-Union - IoU). To tackle the non-differentiable nature of ranking and sorting, we reformulate the incorporation of error-driven update with backpropagation as Identity Update, which enables us to model our novel sorting error among positives. With RS Loss, we significantly simplify training: (i) Thanks to our sorting objective, the positives are prioritized by the classifier without an additional auxiliary head (e.g. for centerness, IoU, mask-IoU), (ii) due to its ranking-based nature, RS Loss is robust to class imbalance, and thus, no sampling heuristic is required, and (iii) we address the multi-task nature of visual detectors using tuning-free task-balancing coefficients. Using RS Loss, we train seven diverse visual detectors only by tuning the learning rate, and show that it consistently outperforms baselines: e.g. our RS Loss improves (i) Faster R-CNN by 3 box AP and aLRP Loss (ranking-based baseline) by 2 box AP on COCO dataset, (ii) Mask R-CNN with repeat factor sampling (RFS) by 3.5 mask AP ( 7 AP for rare classes) on LVIS dataset; and also outperforms all counterparts. 
Code is available at: https://github.com/kemaloksuz/RankSortLoss.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Oksuz_Rank__Sort_Loss_for_Object_Detection_and_Instance_Segmentation_ICCV_2021_paper.pdf", @@ -33157,7 +35401,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Ankara", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "T\u00fcrkiye" + "aff_country_unique": "Turkey", + "bibtex": "@InProceedings{Oksuz_2021_ICCV,\n \n author = {\n Oksuz,\n Kemal and Cam,\n Baris Can and Akbas,\n Emre and Kalkan,\n Sinan\n},\n title = {\n Rank \\& Sort Loss for Object Detection and Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3009-3018\n} \n}" }, { "title": "Ranking Models in Unlabeled New Environments", @@ -33165,6 +35410,7 @@ "status": "Poster", "track": "main", "pid": 1553, + "author_site": "Xiaoxiao Sun; Yunzhong Hou; Weijian Deng; Hongdong Li; Liang Zheng", "author": "Xiaoxiao Sun; Yunzhong Hou; Weijian Deng; Hongdong Li; Liang Zheng", "abstract": "Consider a scenario where we are supplied with a number of ready-to-use models trained on a certain source domain and hope to directly apply the most appropriate ones to different target domains based on the models' relative performance. Ideally we should annotate a validation set for model performance assessment on each new target environment, but such annotations are often very expensive. Under this circumstance, we introduce the problem of ranking models in unlabeled new environments. For this problem, we propose to adopt a proxy dataset that 1) is fully labeled and 2) well reflects the true model rankings in a given target environment, and use the performance rankings on the proxy sets as surrogates. We first select labeled datasets as the proxy. 
Specifically, datasets that are more similar to the unlabeled target domain are found to better preserve the relative performance rankings. Motivated by this, we further propose to search the proxy set by sampling images from various datasets that have similar distributions as the target. We analyze the problem and its solutions on the person re-identification (re-ID) task, for which sufficient datasets are publicly available, and show that a carefully constructed proxy set effectively captures relative performance ranking in new environments. Code is avalible at https://github.com/sxzrt/Proxy-Set.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Ranking_Models_in_Unlabeled_New_Environments_ICCV_2021_paper.pdf", @@ -33188,7 +35434,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Xiaoxiao and Hou,\n Yunzhong and Deng,\n Weijian and Li,\n Hongdong and Zheng,\n Liang\n},\n title = {\n Ranking Models in Unlabeled New Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11761-11771\n} \n}" }, { "title": "Rational Polynomial Camera Model Warping for Deep Learning Based Satellite Multi-View Stereo Matching", @@ -33196,6 +35443,7 @@ "status": "Poster", "track": "main", "pid": 2980, + "author_site": "Jian Gao; Jin Liu; Shunping Ji", "author": "Jian Gao; Jin Liu; Shunping Ji", "abstract": "Satellite multi-view stereo (MVS) imagery is particularly suited for large-scale Earth surface reconstruction. Differing from the perspective camera model (pin-hole model) that is commonly used for close-range and aerial cameras, the cubic rational polynomial camera (RPC) model is the mainstream model for push-broom linear-array satellite cameras. 
However, the homography warping used in the prevailing learning based MVS methods is only applicable to pin-hole cameras. In order to apply the SOTA learning based MVS technology to the satellite MVS taskfor large-scale Earth surface reconstruction, RPC warping should be considered. In this work, we propose, for the first time, a rigorous RPC warping module. The rational polynomial coefficients are recorded as a tensor, and the RPC warping is formulated as a series of tensor transformations. Based on the RPC warping, we propose the deep learning based satellite MVS (SatMVS) framework for large-scale and wide depth range Earth surface reconstruction. We also introduce a large-scale satellite image dataset consisting of 519 5120x5120 images, which we call the TLC SatMVS dataset. The satellite images were acquired from a three-line camera (TLC) that catches triple-view images simultaneously, forming a valuable supplement to the existing open-source WorldView-3 datasets with single-scanline images. Experiments show that the proposed RPC warping module and the SatMVS framework can achieve a superior reconstruction accuracy compared to the pin-hole fitting method and conventional MVS methods. 
Code and data are available at https://github.com/WHU-GPCV/SatMVS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Rational_Polynomial_Camera_Model_Warping_for_Deep_Learning_Based_Satellite_ICCV_2021_paper.pdf", @@ -33219,7 +35467,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Jian and Liu,\n Jin and Ji,\n Shunping\n},\n title = {\n Rational Polynomial Camera Model Warping for Deep Learning Based Satellite Multi-View Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6148-6157\n} \n}" }, { "title": "Re-Aging GAN: Toward Personalized Face Age Transformation", @@ -33227,6 +35476,7 @@ "status": "Poster", "track": "main", "pid": 8363, + "author_site": "Farkhod Makhmudkhujaev; Sungeun Hong; In Kyu Park", "author": "Farkhod Makhmudkhujaev; Sungeun Hong; In Kyu Park", "abstract": "Face age transformation aims to synthesize past or future face images by reflecting the age factor on given faces. Ideally, this task should synthesize natural-looking faces across various age groups while maintaining identity. However, most of the existing work has focused on only one of these or is difficult to train while unnatural artifacts still appear. In this work, we propose Re-Aging GAN (RAGAN), a novel single framework considering all the critical factors in age transformation. Our framework achieves state-of-the-art personalized face age transformation by compelling the input identity to perform the self-guidance of the generation process. Specifically, RAGAN can learn the personalized age features by using high-order interactions between given identity and target age. 
Learned personalized age features are identity information that is recalibrated according to the target age. Hence, such features encompass identity and target age information that provides important clues on how an input identity should be at a certain age. Experimental result shows the lowest FID and KID scores and the highest age recognition accuracy compared to previous methods. The proposed method also demonstrates the visual superiority with fewer artifacts, identity preservation, and natural transformation across various age groups.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Makhmudkhujaev_Re-Aging_GAN_Toward_Personalized_Face_Age_Transformation_ICCV_2021_paper.pdf", @@ -33250,7 +35500,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Incheon", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Makhmudkhujaev_2021_ICCV,\n \n author = {\n Makhmudkhujaev,\n Farkhod and Hong,\n Sungeun and Park,\n In Kyu\n},\n title = {\n Re-Aging GAN: Toward Personalized Face Age Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3908-3917\n} \n}" }, { "title": "Re-Distributing Biased Pseudo Labels for Semi-Supervised Semantic Segmentation: A Baseline Investigation", @@ -33258,6 +35509,7 @@ "status": "Poster", "track": "main", "pid": 2978, + "author_site": "Ruifei He; Jihan Yang; Xiaojuan Qi", "author": "Ruifei He; Jihan Yang; Xiaojuan Qi", "abstract": "While self-training has advanced semi-supervised semantic segmentation, it severely suffers from the long-tailed class distribution on real-world semantic segmentation datasets that make the pseudo-labeled data bias toward majority classes. 
In this paper, we present a simple and yet effective Distribution Alignment and Random Sampling (DARS) method to produce unbiased pseudo labels that match the true class distribution estimated from the labeled data. Besides, we also contribute a progressive data augmentation and labeling strategy to facilitate model training with pseudo-labeled data. Experiments on both Cityscapes and PASCAL VOC 2012 datasets demonstrate the effectiveness of our approach. Albeit simple, our method performs favorably in comparison with state-of-the-art approaches. Code will be available at https://github.com/CVMI-Lab/DARS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_Re-Distributing_Biased_Pseudo_Labels_for_Semi-Supervised_Semantic_Segmentation_A_Baseline_ICCV_2021_paper.pdf", @@ -33274,14 +35526,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/He_Re-Distributing_Biased_Pseudo_Labels_for_Semi-Supervised_Semantic_Segmentation_A_Baseline_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0", - "aff_unique_norm": "University of Hong Kong;Zhejiang University", + "aff_unique_norm": "The University of Hong Kong;Zhejiang University", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.zju.edu.cn", "aff_unique_abbr": "HKU;ZJU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Ruifei and Yang,\n Jihan and Qi,\n Xiaojuan\n},\n title = {\n Re-Distributing Biased Pseudo Labels for Semi-Supervised Semantic Segmentation: A Baseline Investigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6930-6940\n} \n}" }, { "title": "Re-Energizing Domain Discriminator With Sample Relabeling for Adversarial Domain 
Adaptation", @@ -33289,6 +35542,7 @@ "status": "Poster", "track": "main", "pid": 3055, + "author_site": "Xin Jin; Cuiling Lan; Wenjun Zeng; Zhibo Chen", "author": "Xin Jin; Cuiling Lan; Wenjun Zeng; Zhibo Chen", "abstract": "Many unsupervised domain adaptation (UDA) methods exploit domain adversarial training to align the features to reduce domain gap, where a feature extractor is trained to fool a domain discriminator in order to have aligned feature distributions. The discrimination capability of the domain classifier w.r.t. the increasingly aligned feature distributions deteriorates as training goes on, thus cannot effectively further drive the training of feature extractor. In this work, we propose an efficient optimization strategy named Re-enforceable Adversarial Domain Adaptation (RADA) which aims to re-energize the domain discriminator during the training by using dynamic domain labels. Particularly, we relabel the well aligned target domain samples as source domain samples on the fly. Such relabeling makes the less separable distributions more separable, and thus leads to a more powerful domain classifier w.r.t. the new data distributions, which in turn further drives feature alignment. 
Extensive experiments on multiple UDA benchmarks demonstrate the effectiveness and superiority of our RADA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jin_Re-Energizing_Domain_Discriminator_With_Sample_Relabeling_for_Adversarial_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -33305,14 +35559,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jin_Re-Energizing_Domain_Discriminator_With_Sample_Relabeling_for_Adversarial_Domain_Adaptation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "USTC;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jin_2021_ICCV,\n \n author = {\n Jin,\n Xin and Lan,\n Cuiling and Zeng,\n Wenjun and Chen,\n Zhibo\n},\n title = {\n Re-Energizing Domain Discriminator With Sample Relabeling for Adversarial Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9174-9183\n} \n}" }, { "title": "ReCU: Reviving the Dead Weights in Binary Neural Networks", @@ -33320,10 +35575,11 @@ "status": "Poster", "track": "main", "pid": 3372, + "author_site": "Zihan Xu; Mingbao Lin; Jianzhuang Liu; Jie Chen; Ling Shao; Yue Gao; Yonghong Tian; Rongrong Ji", "author": "Zihan Xu; Mingbao Lin; Jianzhuang Liu; Jie Chen; Ling Shao; Yue Gao; Yonghong Tian; Rongrong Ji", "abstract": "Binary neural networks (BNNs) have received increasing attention due to their superior reductions of computation and memory. 
Most existing works focus on either lessening the quantization error by minimizing the gap between the full-precision weights and their binarization or designing a gradient approximation to mitigate the gradient mismatch, while leaving the \"dead weights\" untouched. This leads to slow convergence when training BNNs. In this paper, for the first time, we explore the influence of \"dead weights\" which refer to a group of weights that are barely updated during the training of BNNs, and then introduce rectified clamp unit (ReCU) to revive the \"dead weights\" for updating. We prove that reviving the \"dead weights\" by ReCU can result in a smaller quantization error. Besides, we also take into account the information entropy of the weights, and then mathematically analyze why the weight standardization can benefit BNNs. We demonstrate the inherent contradiction between minimizing the quantization error and maximizing the information entropy, and then propose an adaptive exponential scheduler to identify the range of the \"dead weights\". By considering the \"dead weights\", our method offers not only faster BNN training, but also state-of-the-art performance on CIFAR-10 and ImageNet, compared with recent methods. 
Code can be available at https://github.com/z-hXu/ReCU.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_ReCU_Reviving_the_Dead_Weights_in_Binary_Neural_Networks_ICCV_2021_paper.pdf", - "aff": "MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; Noah\u2019s Ark Lab, Huawei Technologies; School of Electronic and Computer Engineering, Peking University + Peng Cheng Lab; Inception Institute of Arti\ufb01cial Intelligence; School of Software, THUIBCS, BNRist, Tsinghua University; School of Electronic and Computer Engineering, Peking University + Peng Cheng Lab; MAC Lab, School of Informatics, Xiamen University + Institute of Arti\ufb01cial Intelligence, Xiamen University + Peng Cheng Lab", + "aff": "MAC Lab, School of Informatics, Xiamen University; MAC Lab, School of Informatics, Xiamen University; Noah’s Ark Lab, Huawei Technologies; School of Electronic and Computer Engineering, Peking University + Peng Cheng Lab; Inception Institute of Artificial Intelligence; School of Software, THUIBCS, BNRist, Tsinghua University; School of Electronic and Computer Engineering, Peking University + Peng Cheng Lab; MAC Lab, School of Informatics, Xiamen University + Institute of Artificial Intelligence, Xiamen University + Peng Cheng Lab", "project": "", "github": "https://github.com/z-hXu/ReCU", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Xu_ReCU_Reviving_the_ICCV_2021_supplemental.pdf", @@ -33336,14 +35592,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_ReCU_Reviving_the_Dead_Weights_in_Binary_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2+3;4;5;2+3;0+0+3", - "aff_unique_norm": "Xiamen University;Huawei;Peking University;Pengcheng Laboratory;Inception Institute of Artificial Intelligence;Tsinghua University", - "aff_unique_dep": "School of Informatics;Noah\u2019s Ark Lab;School of Electronic and Computer Engineering;Peng Cheng 
Lab;;School of Software", + "aff_unique_norm": "Xiamen University;Huawei Technologies;Peking University;Peng Cheng Lab;Inception Institute of Artificial Intelligence;Tsinghua University", + "aff_unique_dep": "School of Informatics;Noah’s Ark Lab;School of Electronic and Computer Engineering;;;School of Software", "aff_unique_url": "https://www.xmu.edu.cn;https://www.huawei.com;http://www.pku.edu.cn;;https://www.inceptionai.org;https://www.tsinghua.edu.cn", "aff_unique_abbr": "XMU;Huawei;PKU;;;THU", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0;0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Zihan and Lin,\n Mingbao and Liu,\n Jianzhuang and Chen,\n Jie and Shao,\n Ling and Gao,\n Yue and Tian,\n Yonghong and Ji,\n Rongrong\n},\n title = {\n ReCU: Reviving the Dead Weights in Binary Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5198-5208\n} \n}" }, { "title": "ReDAL: Region-Based and Diversity-Aware Active Learning for Point Cloud Semantic Segmentation", @@ -33351,6 +35608,7 @@ "status": "Poster", "track": "main", "pid": 8729, + "author_site": "Tsung-Han Wu; Yueh-Cheng Liu; Yu-Kai Huang; Hsin-Ying Lee; Hung-Ting Su; Ping-Chia Huang; Winston H. Hsu", "author": "Tsung-Han Wu; Yueh-Cheng Liu; Yu-Kai Huang; Hsin-Ying Lee; Hung-Ting Su; Ping-Chia Huang; Winston H. Hsu", "abstract": "Despite the success of deep learning on supervised point cloud semantic segmentation, obtaining large-scale point-by-point manual annotations is still a significant challenge. 
To reduce the huge annotation burden, we propose a Region-based and Diversity-aware Active Learning (ReDAL), a general framework for many deep learning approaches, aiming to automatically select only informative and diverse sub-scene regions for label acquisition. Observing that only a small portion of annotated regions are sufficient for 3D scene understanding with deep learning, we use softmax entropy, color discontinuity, and structural complexity to measure the information of sub-scene regions. A diversity-aware selection algorithm is also developed to avoid redundant annotations resulting from selecting informative but similar regions in a querying batch. Extensive experiments show that our method highly outperforms previous active learning strategies, and we achieve the performance of 90% fully supervised learning, while less than 15% and 5% annotations are required on S3DIS and SemanticKITTI datasets, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_ReDAL_Region-Based_and_Diversity-Aware_Active_Learning_for_Point_Cloud_Semantic_ICCV_2021_paper.pdf", @@ -33374,7 +35632,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Tsung-Han and Liu,\n Yueh-Cheng and Huang,\n Yu-Kai and Lee,\n Hsin-Ying and Su,\n Hung-Ting and Huang,\n Ping-Chia and Hsu,\n Winston H.\n},\n title = {\n ReDAL: Region-Based and Diversity-Aware Active Learning for Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15510-15519\n} \n}" }, { "title": "RePOSE: Fast 6D Object Pose Refinement via Deep Texture Rendering", @@ -33382,6 +35641,7 @@ "status": "Poster", "track": "main", "pid": 3480, + "author_site": "Shun 
Iwase; Xingyu Liu; Rawal Khirodkar; Rio Yokota; Kris M. Kitani", "author": "Shun Iwase; Xingyu Liu; Rawal Khirodkar; Rio Yokota; Kris M. Kitani", "abstract": "We present RePOSE, a fast iterative refinement method for 6D object pose estimation. Prior methods perform refinement by feeding zoomed-in input and rendered RGB images into a CNN and directly regressing an update of a refined pose. Their runtime is slow due to the computational cost of CNN, which is especially prominent in multiple-object pose refinement. To overcome this problem, RePOSE leverages image rendering for fast feature extraction using a 3D model with a learnable texture. We call this deep texture rendering, which uses a shallow multi-layer perceptron to directly regress a view-invariant image representation of an object. Furthermore, we utilize differentiable Levenberg-Marquard (LM) optimization to refine a pose fast and accurately by minimizing the feature-metric error between the input and rendered image representations without the need of zooming in. These image representations are trained such that differentiable LM optimization converges within few iterations. Consequently, RePOSE runs at 92 FPS and achieves state-of-the-art accuracy of 51.6% on the Occlusion LineMOD dataset - a 4.1% absolute improvement over the prior art, and comparable result on the YCB-Video dataset with a much faster runtime. 
The code is available at https://github.com/sh8/repose.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Iwase_RePOSE_Fast_6D_Object_Pose_Refinement_via_Deep_Texture_Rendering_ICCV_2021_paper.pdf", @@ -33405,7 +35665,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Iwase_2021_ICCV,\n \n author = {\n Iwase,\n Shun and Liu,\n Xingyu and Khirodkar,\n Rawal and Yokota,\n Rio and Kitani,\n Kris M.\n},\n title = {\n RePOSE: Fast 6D Object Pose Refinement via Deep Texture Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3303-3312\n} \n}" }, { "title": "ReStyle: A Residual-Based StyleGAN Encoder via Iterative Refinement", @@ -33413,6 +35674,7 @@ "status": "Poster", "track": "main", "pid": 3929, + "author_site": "Yuval Alaluf; Or Patashnik; Daniel Cohen-Or", "author": "Yuval Alaluf; Or Patashnik; Daniel Cohen-Or", "abstract": "Recently, the power of unconditional image synthesis has significantly advanced through the use of Generative Adversarial Networks (GANs). The task of inverting an image into its corresponding latent code of the trained GAN is of utmost importance as it allows for the manipulation of real images, leveraging the rich semantics learned by the network. Recognizing the limitations of current inversion approaches, in this work we present a novel inversion scheme that extends current encoder-based inversion methods by introducing an iterative refinement mechanism. Instead of directly predicting the latent code of a given real image using a single pass, the encoder is tasked with predicting a residual with respect to the current estimate of the inverted latent code in a self-correcting manner. 
Our residual-based encoder, named ReStyle, attains improved accuracy compared to current state-of-the-art encoder-based methods with a negligible increase in inference time. We analyze the behavior of ReStyle to gain valuable insights into its iterative nature. We then evaluate the performance of our residual encoder and analyze its robustness compared to optimization-based inversion and state-of-the-art encoders.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Alaluf_ReStyle_A_Residual-Based_StyleGAN_Encoder_via_Iterative_Refinement_ICCV_2021_paper.pdf", @@ -33427,7 +35689,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Alaluf_ReStyle_A_Residual-Based_StyleGAN_Encoder_via_Iterative_Refinement_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Alaluf_ReStyle_A_Residual-Based_StyleGAN_Encoder_via_Iterative_Refinement_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Alaluf_2021_ICCV,\n \n author = {\n Alaluf,\n Yuval and Patashnik,\n Or and Cohen-Or,\n Daniel\n},\n title = {\n ReStyle: A Residual-Based StyleGAN Encoder via Iterative Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6711-6720\n} \n}" }, { "title": "Real-Time Image Enhancer via Learnable Spatial-Aware 3D Lookup Tables", @@ -33435,10 +35698,11 @@ "status": "Poster", "track": "main", "pid": 7223, + "author_site": "Tao Wang; Yong Li; Jingyang Peng; Yipeng Ma; Xian Wang; Fenglong Song; Youliang Yan", "author": "Tao Wang; Yong Li; Jingyang Peng; Yipeng Ma; Xian Wang; Fenglong Song; Youliang Yan", "abstract": "Recently, deep learning-based image enhancement algorithms achieved state-of-the-art (SOTA) performance on several publicly available datasets. 
However, most existing methods fail to meet practical requirements either for visual perception or for computation efficiency, especially for high-resolution images. In this paper, we propose a novel real-time image enhancer via learnable spatial-aware 3-dimentional lookup tables(3D LUTs), which well considers global scenario and local spatial information. Specifically, we introduce a light weight two-head weight predictor that has two outputs. One is a 1D weight vector used for image-level scenario adaptation, the other is a 3D weight map aimed for pixel-wise category fusion. We learn the spatial-aware 3D LUTs and fuse them according to the aforementioned weights in an end-to-end manner. The fused LUT is then used to transform the source image into the target tone in an efficient way. Extensive results show that our model outperforms SOTA image enhancement methods on public datasets both subjectively and objectively, and that our model only takes about 4ms to process a 4K resolution image on one NVIDIA V100 GPU.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Real-Time_Image_Enhancer_via_Learnable_Spatial-Aware_3D_Lookup_Tables_ICCV_2021_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab\u2020; Huawei Noah\u2019s Ark Lab\u2020", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab†; Huawei Noah’s Ark Lab†", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Wang_Real-Time_Image_Enhancer_ICCV_2021_supplemental.pdf", @@ -33452,13 +35716,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Real-Time_Image_Enhancer_via_Learnable_Spatial-Aware_3D_Lookup_Tables_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Huawei", - 
"aff_unique_dep": "Noah\u2019s Ark Lab", + "aff_unique_dep": "Noah’s Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Tao and Li,\n Yong and Peng,\n Jingyang and Ma,\n Yipeng and Wang,\n Xian and Song,\n Fenglong and Yan,\n Youliang\n},\n title = {\n Real-Time Image Enhancer via Learnable Spatial-Aware 3D Lookup Tables\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2471-2480\n} \n}" }, { "title": "Real-Time Instance Segmentation With Discriminative Orientation Maps", @@ -33466,6 +35731,7 @@ "status": "Poster", "track": "main", "pid": 7336, + "author_site": "Wentao Du; Zhiyu Xiang; Shuya Chen; Chengyu Qiao; Yiman Chen; Tingming Bai", "author": "Wentao Du; Zhiyu Xiang; Shuya Chen; Chengyu Qiao; Yiman Chen; Tingming Bai", "abstract": "Although instance segmentation has made considerable advancement over recent years, it's still a challenge to design high accuracy algorithms with real-time performance. In this paper, we propose a real-time instance segmentation framework termed OrienMask. Upon the one-stage object detector YOLOv3, a mask head is added to predict some discriminative orientation maps, which are explicitly defined as spatial offset vectors for both foreground and background pixels. Thanks to the discrimination ability of orientation maps, masks can be recovered without the need for extra foreground segmentation. All instances that match with the same anchor size share a common orientation map. This special sharing strategy reduces the amortized memory utilization for mask predictions but without loss of mask granularity. 
Given the surviving box predictions after NMS, instance masks can be concurrently constructed from the corresponding orientation maps with low complexity. Owing to the concise design for mask representation and its effective integration with the anchor-based object detector, our method is qualified under real-time conditions while maintaining competitive accuracy. Experiments on COCO benchmark show that OrienMask achieves 34.8 mask AP at the speed of 42.7 fps evaluated with a single RTX 2080 Ti. Code is available at github.com/duwt/OrienMask.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Du_Real-Time_Instance_Segmentation_With_Discriminative_Orientation_Maps_ICCV_2021_paper.pdf", @@ -33489,7 +35755,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Du_2021_ICCV,\n \n author = {\n Du,\n Wentao and Xiang,\n Zhiyu and Chen,\n Shuya and Qiao,\n Chengyu and Chen,\n Yiman and Bai,\n Tingming\n},\n title = {\n Real-Time Instance Segmentation With Discriminative Orientation Maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7314-7323\n} \n}" }, { "title": "Real-Time Vanishing Point Detector Integrating Under-Parameterized RANSAC and Hough Transform", @@ -33497,6 +35764,7 @@ "status": "Poster", "track": "main", "pid": 6673, + "author_site": "Jianping Wu; Liang Zhang; Ye Liu; Ke Chen", "author": "Jianping Wu; Liang Zhang; Ye Liu; Ke Chen", "abstract": "We propose a novel approach that integrates under-parameterized RANSAC (UPRANSAC) with Hough Transform to detect vanishing points (VPs) from un-calibrated monocular images. 
In our algorithm, the UPRANSAC chooses one hypothetical inlier in a sample set to find a portion of the VP's degrees of freedom, which is followed by a highly reliable brute-force voting scheme (1-D Hough Transform) to find the VP's remaining degrees of freedom along the extension line of the hypothetical inlier. Our approach is able to sequentially find a series of VPs by repeatedly removing inliers of any detected VPs from minimal sample sets until the stop criterion is reached. Compared to traditional RANSAC that selects 2 edges as a hypothetical inlier pair to fit a model of VP hypothesis and requires hitting a pair of inliners, the UPRANSAC has a higher likelihood to hit one inliner and is more reliable in VP detection. Meanwhile, the tremendously scaled-down voting space with the requirement of only 1 parameter for processing significantly increased the performance efficiency of Hough Transform in our scheme. Testing results with well-known benchmark datasets show that the detection accuracies of our approach were higher or on par with the SOTA while running in deeply real-time zone.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Real-Time_Vanishing_Point_Detector_Integrating_Under-Parameterized_RANSAC_and_Hough_Transform_ICCV_2021_paper.pdf", @@ -33520,7 +35788,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Jianping and Zhang,\n Liang and Liu,\n Ye and Chen,\n Ke\n},\n title = {\n Real-Time Vanishing Point Detector Integrating Under-Parameterized RANSAC and Hough Transform\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3732-3741\n} \n}" }, { "title": "Real-Time Video Inference on Edge Devices via Adaptive Model Streaming", @@ -33528,6 
+35797,7 @@ "status": "Poster", "track": "main", "pid": 9020, + "author_site": "Mehrdad Khani; Pouya Hamadanian; Arash Nasr-Esfahany; Mohammad Alizadeh", "author": "Mehrdad Khani; Pouya Hamadanian; Arash Nasr-Esfahany; Mohammad Alizadeh", "abstract": "Real-time video inference on edge devices like mobile phones and drones is challenging due to the high computation cost of Deep Neural Networks. We present Adaptive Model Streaming (AMS), a new approach to improving the performance of efficient lightweight models for video inference on edge devices. AMS uses a remote server to continually train and adapt a small model running on the edge device, boosting its performance on the live video using online knowledge distillation from a large, state-of-the-art model. We discuss the challenges of over-the-network model adaptation for video inference and present several techniques to reduce communication the cost of this approach: avoiding excessive overfitting, updating a small fraction of important model parameters, and adaptive sampling of training frames at edge devices. On the task of video semantic segmentation, our experimental results show 0.4--17.8 percent mean Intersection-over-Union improvement compared to a pre-trained model across several video datasets. 
Our prototype can perform video segmentation at 30 frames-per-second with 40 milliseconds camera-to-label latency on a Samsung Galaxy S10+ mobile phone, using less than 300 Kbps uplink and downlink bandwidth on the device.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Khani_Real-Time_Video_Inference_on_Edge_Devices_via_Adaptive_Model_Streaming_ICCV_2021_paper.pdf", @@ -33551,7 +35821,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khani_2021_ICCV,\n \n author = {\n Khani,\n Mehrdad and Hamadanian,\n Pouya and Nasr-Esfahany,\n Arash and Alizadeh,\n Mohammad\n},\n title = {\n Real-Time Video Inference on Edge Devices via Adaptive Model Streaming\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4572-4582\n} \n}" }, { "title": "Real-World Video Super-Resolution: A Benchmark Dataset and a Decomposition Based Learning Scheme", @@ -33559,6 +35830,7 @@ "status": "Poster", "track": "main", "pid": 3239, + "author_site": "Xi Yang; Wangmeng Xiang; Hui Zeng; Lei Zhang", "author": "Xi Yang; Wangmeng Xiang; Hui Zeng; Lei Zhang", "abstract": "Video super-resolution (VSR) aims to improve the spatial resolution of low-resolution (LR) videos. Existing VSR methods are mostly trained and evaluated on synthetic datasets, where the LR videos are uniformly downsampled from their high-resolution (HR) counterparts by some simple operators (e.g., bicubic downsampling). Such simple synthetic degradation models, however, cannot well describe the complex degradation processes in real-world videos, and thus the trained VSR models become ineffective in real-world applications. 
As an attempt to bridge the gap, we build a real-world video super-resolution (RealVSR) dataset by capturing paired LR-HR video sequences using the multi-camera system of iPhone 11 Pro Max. Since the LR-HR video pairs are captured by two separate cameras, there are inevitably certain misalignment and luminance/color differences between them. To more robustly train the VSR model and recover more details from the LR inputs, we convert the LR-HR videos into YCbCr space and decompose the luminance channel into a Laplacian pyramid, and then apply different loss functions to different components. Experiments validate that VSR models trained on our RealVSR dataset demonstrate better visual quality than those trained on synthetic datasets under real-world settings. They also exhibit good generalization capability in cross-camera tests. The dataset and code can be found at https://github.com/IanYeung/RealVSR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Real-World_Video_Super-Resolution_A_Benchmark_Dataset_and_a_Decomposition_Based_ICCV_2021_paper.pdf", @@ -33575,14 +35847,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Real-World_Video_Super-Resolution_A_Benchmark_Dataset_and_a_Decomposition_Based_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1", - "aff_unique_norm": "Hong Kong Polytechnic University;Alibaba Group", + "aff_unique_norm": "The Hong Kong Polytechnic University;Alibaba Group", "aff_unique_dep": ";DAMO Academy", "aff_unique_url": "https://www.polyu.edu.hk;https://www.alibaba-group.com", "aff_unique_abbr": "PolyU;Alibaba", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Xi and Xiang,\n Wangmeng and Zeng,\n Hui and Zhang,\n Lei\n},\n title = {\n Real-World Video Super-Resolution: A 
Benchmark Dataset and a Decomposition Based Learning Scheme\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4781-4790\n} \n}" }, { "title": "Reality Transform Adversarial Generators for Image Splicing Forgery Detection and Localization", @@ -33590,6 +35863,7 @@ "status": "Poster", "track": "main", "pid": 11082, + "author_site": "Xiuli Bi; Zhipeng Zhang; Bin Xiao", "author": "Xiuli Bi; Zhipeng Zhang; Bin Xiao", "abstract": "When many forged images become more and more realistic with the help of image editing tools and deep learning techniques, authenticators need to improve their ability to verify these forged images. The process of generating and detecting forged images is thus similar to the principle of Generative Adversarial Networks (GANs). Creating realistic forged images requires a retouching process to suppress tampering artifacts and keep structural information. We view this retouching process as image style transfer and then proposed the fake-to-realistic transformation generator GT. For detecting the tampered regions, a forgery localization generator GM is proposed based on a multi-decoder-single-task strategy. By adversarial training two generators, the proposed alpha-learnable whitening and coloring transformation (alpha-learnable WCT) block in GT automatically suppresses the tampering artifacts in the forged images. Meanwhile, the detection and localization abilities of GM will be improved by learning the forged images retouched by GT. The experimental results demonstrate that the proposed two generators in GAN can simulate confrontation between fakers and authenticators well. 
The localization generator GM outperforms the state-of-the-art methods in splicing forgery detection and localization on four public datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bi_Reality_Transform_Adversarial_Generators_for_Image_Splicing_Forgery_Detection_and_ICCV_2021_paper.pdf", @@ -33613,7 +35887,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Bi_2021_ICCV,\n \n author = {\n Bi,\n Xiuli and Zhang,\n Zhipeng and Xiao,\n Bin\n},\n title = {\n Reality Transform Adversarial Generators for Image Splicing Forgery Detection and Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14294-14303\n} \n}" }, { "title": "Reconcile Prediction Consistency for Balanced Object Detection", @@ -33621,6 +35896,7 @@ "status": "Poster", "track": "main", "pid": 6352, + "author_site": "Keyang Wang; Lei Zhang", "author": "Keyang Wang; Lei Zhang", "abstract": "Classification and regression are two pillars of object detectors. In most CNN-based detectors, these two pillars are optimized independently. Without direct interactions between them, the classification loss and the regression loss can not be optimized synchronously toward the optimal direction in the training phase. This clearly leads to lots of inconsistent predictions with high classification score but low localization accuracy or low classification score but high localization accuracy in the inference phase, especially for the objects of irregular shape and occlusion, which severely hurts the detection performance of existing detectors after NMS. To reconcile prediction consistency for balanced object detection, we propose a Harmonic loss to harmonize the optimization of classification branch and localization branch. 
The Harmonic loss enables these two branches to supervise and promote each other during training, thereby producing consistent predictions with high co-occurrence of top classification and localization in the inference phase. Furthermore, in order to prevent the localization loss from being dominated by outliers during training phase, a Harmonic IoU loss is proposed to harmonize the weight of the localization loss of different IoU-level samples. Comprehensive experiments on benchmarks PASCAL VOC and MS COCO demonstrate the generality and effectiveness of our model for facilitating existing object detectors to state-of-the-art accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Reconcile_Prediction_Consistency_for_Balanced_Object_Detection_ICCV_2021_paper.pdf", @@ -33644,7 +35920,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Keyang and Zhang,\n Lei\n},\n title = {\n Reconcile Prediction Consistency for Balanced Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3631-3640\n} \n}" }, { "title": "ReconfigISP: Reconfigurable Camera Image Processing Pipeline", @@ -33652,6 +35929,7 @@ "status": "Poster", "track": "main", "pid": 4117, + "author_site": "Ke Yu; Zexian Li; Yue Peng; Chen Change Loy; Jinwei Gu", "author": "Ke Yu; Zexian Li; Yue Peng; Chen Change Loy; Jinwei Gu", "abstract": "Image Signal Processor (ISP) is a crucial component in digital cameras that transforms sensor signals into images for us to perceive and understand. Existing ISP designs always adopt a fixed architecture, e.g., several sequential modules connected in a rigid order. 
Such a fixed ISP architecture may be suboptimal for real-world applications, where camera sensors, scenes and tasks are diverse. In this study, we propose a novel Reconfigurable ISP (ReconfigISP) whose architecture and parameters can be automatically tailored to specific data and tasks. In particular, we implement several ISP modules, and enable backpropagation for each module by training a differentiable proxy, hence allowing us to leverage the popular differentiable neural architecture search and effectively search for the optimal ISP architecture. A proxy tuning mechanism is adopted to maintain the accuracy of proxy networks in all cases. Extensive experiments conducted on image restoration and object detection, with different sensors, light conditions and efficiency constraints, validate the effectiveness of ReconfigISP. Only hundreds of parameters need tuning for every task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_ReconfigISP_Reconfigurable_Camera_Image_Processing_Pipeline_ICCV_2021_paper.pdf", @@ -33668,14 +35946,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yu_ReconfigISP_Reconfigurable_Camera_Image_Processing_Pipeline_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+2;0;3;0+4", - "aff_unique_norm": "SenseTime Research;Chinese University of Hong Kong;Beihang University;Nanyang Technological University;Shanghai AI Laboratory", + "aff_unique_norm": "SenseTime Research;The Chinese University of Hong Kong;Beihang University;Nanyang Technological University;Shanghai AI Laboratory", "aff_unique_dep": "Research;CUHK-SenseTime Joint Lab;;S-Lab;", "aff_unique_url": "https://www.sensetime.com;https://www.cuhk.edu.hk;http://www.buaa.edu.cn/;https://www.ntu.edu.sg;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "SenseTime;CUHK;BUAA;NTU;SAIL", "aff_campus_unique_index": "1;;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0;1;0+0", - "aff_country_unique": "China;Singapore" 
+ "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Ke and Li,\n Zexian and Peng,\n Yue and Loy,\n Chen Change and Gu,\n Jinwei\n},\n title = {\n ReconfigISP: Reconfigurable Camera Image Processing Pipeline\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4248-4257\n} \n}" }, { "title": "Reconstructing Hand-Object Interactions in the Wild", @@ -33683,6 +35962,7 @@ "status": "Poster", "track": "main", "pid": 1036, + "author_site": "Zhe Cao; Ilija Radosavovic; Angjoo Kanazawa; Jitendra Malik", "author": "Zhe Cao; Ilija Radosavovic; Angjoo Kanazawa; Jitendra Malik", "abstract": "We study the problem of understanding hand-object interactions from 2D images in the wild. This requires reconstructing both the hand and the object in 3D, which is challenging because of the mutual occlusion between the hand and the object. In this paper we make two main contributions: (1) a novel reconstruction technique, RHO (Reconstructing Hands and Objects), which reconstructs 3D models of both the hand and the object leveraging the 2D image cues and 3D contact priors; (2) a dataset MOW (Manipulating Objects in the Wild) of 500 examples of hand-object interaction images that have been \"3Dfied\" with the help of the RHO technique. 
Overall our dataset contains 121 distinct object categories, with a much greater diversity of manipulation actions, than in previous datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_Reconstructing_Hand-Object_Interactions_in_the_Wild_ICCV_2021_paper.pdf", @@ -33697,7 +35977,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cao_Reconstructing_Hand-Object_Interactions_in_the_Wild_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cao_Reconstructing_Hand-Object_Interactions_in_the_Wild_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Zhe and Radosavovic,\n Ilija and Kanazawa,\n Angjoo and Malik,\n Jitendra\n},\n title = {\n Reconstructing Hand-Object Interactions in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12417-12426\n} \n}" }, { "title": "Recurrent Mask Refinement for Few-Shot Medical Image Segmentation", @@ -33705,6 +35986,7 @@ "status": "Poster", "track": "main", "pid": 2067, + "author_site": "Hao Tang; Xingwei Liu; Shanlin Sun; Xiangyi Yan; Xiaohui Xie", "author": "Hao Tang; Xingwei Liu; Shanlin Sun; Xiangyi Yan; Xiaohui Xie", "abstract": "Although having achieved great success in medical image segmentation, deep convolutional neural networks usually require a large dataset with manual annotations for training and are difficult to generalize to unseen classes. Few-shot learning has the potential to address these challenges by learning new classes from only a few labeled examples. In this work, we propose a new framework for few-shot medical image segmentation based on prototypical networks. 
Our innovation lies in the design of two key modules: 1) a context relation encoder (CRE) that uses correlation to capture local relation features between foreground and background regions; and 2) a recurrent mask refinement module that repeatedly uses the CRE and a prototypical network to recapture the change of context relationship and refine the segmentation mask iteratively. Experiments on two abdomen CT datasets and an abdomen MRI dataset show the proposed method obtains substantial improvement over the state-of-the-art methods by an average of 16.32%, 8.45% and 6.24% in terms of DSC, respectively. Code is publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tang_Recurrent_Mask_Refinement_for_Few-Shot_Medical_Image_Segmentation_ICCV_2021_paper.pdf", @@ -33728,7 +36010,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tang_2021_ICCV,\n \n author = {\n Tang,\n Hao and Liu,\n Xingwei and Sun,\n Shanlin and Yan,\n Xiangyi and Xie,\n Xiaohui\n},\n title = {\n Recurrent Mask Refinement for Few-Shot Medical Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3918-3928\n} \n}" }, { "title": "Recursively Conditional Gaussian for Ordinal Unsupervised Domain Adaptation", @@ -33736,6 +36019,7 @@ "status": "Poster", "track": "main", "pid": 2957, + "author_site": "Xiaofeng Liu; Site Li; Yubin Ge; Pengyi Ye; Jane You; Jun Lu", "author": "Xiaofeng Liu; Site Li; Yubin Ge; Pengyi Ye; Jane You; Jun Lu", "abstract": "The unsupervised domain adaptation (UDA) has been widely adopted to alleviate the data scalability issue, while the existing works usually focus on classifying independently discrete labels. 
However, in many tasks (e.g., medical diagnosis), the labels are discrete and successively distributed. The UDA for ordinal classification requires inducing non-trivial ordinal distribution prior to the latent space. Target for this, the partially ordered set (poset) is defined for constraining the latent vector. Instead of the typically i.i.d. Gaussian latent prior, in this work, a recursively conditional Gaussian (RCG) set is adapted for ordered constraint modeling, which admits a tractable joint distribution prior. Furthermore, we are able to control the density of content vector that violates the poset constraints by a simple \"three-sigma rule\". We explicitly disentangle the cross-domain images into a shared ordinal prior induced ordinal content space and two separate source/target ordinal-unrelated spaces, and the self-training is worked on the shared space exclusively for ordinal-aware domain alignment. Extensive experiments on UDA medical diagnoses and facial age estimation demonstrate its effectiveness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Recursively_Conditional_Gaussian_for_Ordinal_Unsupervised_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -33750,7 +36034,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Recursively_Conditional_Gaussian_for_Ordinal_Unsupervised_Domain_Adaptation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Recursively_Conditional_Gaussian_for_Ordinal_Unsupervised_Domain_Adaptation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Xiaofeng and Li,\n Site and Ge,\n Yubin and Ye,\n Pengyi and You,\n Jane and Lu,\n Jun\n},\n title = {\n Recursively Conditional Gaussian for Ordinal Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2021\n},\n pages = {\n 764-773\n} \n}" }, { "title": "Refining Action Segmentation With Hierarchical Video Representations", @@ -33758,6 +36043,7 @@ "status": "Poster", "track": "main", "pid": 10192, + "author_site": "Hyemin Ahn; Dongheui Lee", "author": "Hyemin Ahn; Dongheui Lee", "abstract": "In this paper, we propose Hierarchical Action Segmentation Refiner (HASR), which can refine temporal action segmentation results from various models by understanding the overall context of a given video in a hierarchical way. When a backbone model for action segmentation estimates how the given video can be segmented, our model extracts segment-level representations based on frame-level features, and extracts a video-level representation based on the segment-level representations. Based on these hierarchical representations, our model can refer to the overall context of the entire video, and predict how the segment labels that are out of context should be corrected. Our HASR can be plugged into various action segmentation models (MS-TCN, SSTDA, ASRF), and improve the performance of state-of-the-art models based on three challenging datasets (GTEA, 50Salads, and Breakfast). For example, in 50Salads dataset, the segmental edit score improves from 67.9% to 77.4% (MS-TCN), from 75.8% to 77.3% (SSTDA), from 79.3% to 81.0% (ASRF). In addition, our model can refine the segmentation result from the unseen backbone model, which was not referred to when training HASR. This generalization performance would make HASR be an effective tool for boosting up the existing approaches for temporal action segmentation. 
Our code is available at https://github.com/cotton-ahn/HASR_iccv2021.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ahn_Refining_Action_Segmentation_With_Hierarchical_Video_Representations_ICCV_2021_paper.pdf", @@ -33781,7 +36067,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Ahn_2021_ICCV,\n \n author = {\n Ahn,\n Hyemin and Lee,\n Dongheui\n},\n title = {\n Refining Action Segmentation With Hierarchical Video Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16302-16310\n} \n}" }, { "title": "Refining Activation Downsampling With SoftPool", @@ -33789,6 +36076,7 @@ "status": "Poster", "track": "main", "pid": 3375, + "author_site": "Alexandros Stergiou; Ronald Poppe; Grigorios Kalliatakis", "author": "Alexandros Stergiou; Ronald Poppe; Grigorios Kalliatakis", "abstract": "Convolutional Neural Networks (CNNs) use pooling to decrease the size of activation maps. This process is crucial to increase the receptive fields and to reduce computational requirements of subsequent convolutions. An important feature of the pooling operation is the minimization of information loss, with respect to the initial activation maps, without a significant impact on the computation and memory overhead. To meet these requirements, we propose SoftPool: a fast and efficient method for exponentially weighted activation downsampling. Through experiments across a range of architectures and pooling methods, we demonstrate that SoftPool can retain more information in the reduced activation maps. This refined downsampling leads to improvements in a CNN's classification accuracy. 
Experiments with pooling layer substitutions on ImageNet1K show an increase in accuracy over both original architectures and other pooling methods. We also test SoftPool on video datasets for action recognition. Again, through the direct replacement of pooling layers, we observe consistent performance improvements while computational loads and memory requirements remain limited.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Stergiou_Refining_Activation_Downsampling_With_SoftPool_ICCV_2021_paper.pdf", @@ -33812,7 +36100,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Netherlands;United Kingdom" + "aff_country_unique": "Netherlands;United Kingdom", + "bibtex": "@InProceedings{Stergiou_2021_ICCV,\n \n author = {\n Stergiou,\n Alexandros and Poppe,\n Ronald and Kalliatakis,\n Grigorios\n},\n title = {\n Refining Activation Downsampling With SoftPool\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10357-10366\n} \n}" }, { "title": "Region Similarity Representation Learning", @@ -33820,6 +36109,7 @@ "status": "Poster", "track": "main", "pid": 9531, + "author_site": "Tete Xiao; Colorado J Reed; Xiaolong Wang; Kurt Keutzer; Trevor Darrell", "author": "Tete Xiao; Colorado J Reed; Xiaolong Wang; Kurt Keutzer; Trevor Darrell", "abstract": "We present Region Similarity Representation Learning (ReSim), a new approach to self-supervised representation learning for localization-based tasks such as object detection and segmentation. While existing work has largely focused on learning global representations for an entire image, ReSim learns both regional representations for localization as well as semantic image-level representations. 
ReSim operates by sliding a fixed-sized window across the overlapping area between two views (e.g., image crops), aligning these areas with their corresponding convolutional feature map regions, and then maximizing the feature similarity across views. As a result, ReSim learns spatially and semantically consistent feature representation throughout the convolutional feature maps of a neural network. A shift or scale of an image region, e.g., a shift or scale of an object, has a corresponding change in the feature maps; this allows downstream tasks to leverage these representations for localization. Through object detection, instance segmentation, and dense pose estimation experiments, we illustrate how ReSim learns representations which significantly improve the localization and classification performance compared to a competitive MoCo-v2 baseline: +2:7 APbb75 VOC, +1:1 AP75 COCO, and +1:9 APmk Cityscapes. We will release our code and pre-trained models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiao_Region_Similarity_Representation_Learning_ICCV_2021_paper.pdf", @@ -33843,7 +36133,8 @@ "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Berkeley;San Diego", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiao_2021_ICCV,\n \n author = {\n Xiao,\n Tete and Reed,\n Colorado J and Wang,\n Xiaolong and Keutzer,\n Kurt and Darrell,\n Trevor\n},\n title = {\n Region Similarity Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10539-10548\n} \n}" }, { "title": "Region-Aware Contrastive Learning for Semantic Segmentation", @@ -33851,6 +36142,7 @@ "status": "Poster", "track": "main", "pid": 8356, + "author_site": "Hanzhe Hu; Jinshi Cui; Liwei Wang", "author": "Hanzhe Hu; Jinshi Cui; Liwei Wang", 
"abstract": "Recent works have made great success in semantic segmentation by exploiting contextual information in a local or global manner within individual image and supervising the model with pixel-wise cross entropy loss. However, from the holistic view of the whole dataset, semantic relations not only exist inside one single image, but also prevail in the whole training data, which makes solely considering intra-image correlations insufficient. Inspired by recent progress in unsupervised contrastive learning, we propose the region-aware contrastive learning (RegionContrast) for semantic segmentation in the supervised manner. In order to enhance the similarity of semantically similar pixels while keeping the discrimination from others, we employ contrastive learning to realize this objective. With the help of memory bank, we explore to store all the representative features into the memory. Without loss of generality, to efficiently incorporate all training data into the memory bank while avoiding taking too much computation resource, we propose to construct region centers to represent features from different categories for every image. Hence, the proposed region-aware contrastive learning is performed in a region level for all the training data, which saves much more memory than methods exploring the pixel-level relations. The proposed RegionContrast brings little computation cost during training and requires no extra overhead for testing. 
Extensive experiments demonstrate that our method achieves state-of-the-art performance on three benchmark datasets including Cityscapes, ADE20K and COCO Stuff.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_Region-Aware_Contrastive_Learning_for_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -33874,7 +36166,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Hanzhe and Cui,\n Jinshi and Wang,\n Liwei\n},\n title = {\n Region-Aware Contrastive Learning for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16291-16301\n} \n}" }, { "title": "Regularizing Nighttime Weirdness: Efficient Self-Supervised Monocular Depth Estimation in the Dark", @@ -33882,10 +36175,11 @@ "status": "Poster", "track": "main", "pid": 1731, + "author_site": "Kun Wang; Zhenyu Zhang; Zhiqiang Yan; Xiang Li; Baobei Xu; Jun Li; Jian Yang", "author": "Kun Wang; Zhenyu Zhang; Zhiqiang Yan; Xiang Li; Baobei Xu; Jun Li; Jian Yang", "abstract": "Monocular depth estimation aims at predicting depth from a single image or video. Recently, self-supervised methods draw much attention since they are free of depth annotations and achieve impressive performance on several daytime benchmarks. However, they produce weird outputs in more challenging nighttime scenarios because of low visibility and varying illuminations, which bring weak textures and break brightness-consistency assumption, respectively. 
To address these problems, in this paper we propose a novel framework with several improvements: (1) we introduce Priors-Based Regularization to learn distribution knowledge from unpaired depth maps and prevent model from being incorrectly trained; (2) we leverage Mapping-Consistent Image Enhancement module to enhance image visibility and contrast while maintaining brightness consistency; and (3) we present Statistics-Based Mask strategy to tune the number of removed pixels within textureless regions, using dynamic statistics. Experimental results demonstrate the effectiveness of each component. Meanwhile, our framework achieves remarkable improvements and state-of-the-art results on two nighttime datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Regularizing_Nighttime_Weirdness_Efficient_Self-Supervised_Monocular_Depth_Estimation_in_the_ICCV_2021_paper.pdf", - "aff": "PCA Lab\u2021, Nanjing University of Science and Technology, China; Tencent YouTu Lab+PCA Lab\u2021, Nanjing University of Science and Technology, China; PCA Lab\u2021, Nanjing University of Science and Technology, China; PCA Lab\u2021, Nanjing University of Science and Technology, China; Hikvision Research Institute; PCA Lab\u2021, Nanjing University of Science and Technology, China; PCA Lab\u2021, Nanjing University of Science and Technology, China", + "aff": "PCA Lab‡, Nanjing University of Science and Technology, China; Tencent YouTu Lab+PCA Lab‡, Nanjing University of Science and Technology, China; PCA Lab‡, Nanjing University of Science and Technology, China; PCA Lab‡, Nanjing University of Science and Technology, China; Hikvision Research Institute; PCA Lab‡, Nanjing University of Science and Technology, China; PCA Lab‡, Nanjing University of Science and Technology, China", "project": "", "github": "https://github.com/w2kun/RNW", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Wang_Regularizing_Nighttime_Weirdness_ICCV_2021_supplemental.pdf", 
@@ -33905,7 +36199,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Kun and Zhang,\n Zhenyu and Yan,\n Zhiqiang and Li,\n Xiang and Xu,\n Baobei and Li,\n Jun and Yang,\n Jian\n},\n title = {\n Regularizing Nighttime Weirdness: Efficient Self-Supervised Monocular Depth Estimation in the Dark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16055-16064\n} \n}" }, { "title": "Rehearsal Revealed: The Limits and Merits of Revisiting Samples in Continual Learning", @@ -33913,6 +36208,7 @@ "status": "Poster", "track": "main", "pid": 5748, + "author_site": "Eli Verwimp; Matthias De Lange; Tinne Tuytelaars", "author": "Eli Verwimp; Matthias De Lange; Tinne Tuytelaars", "abstract": "Learning from non-stationary data streams and overcoming catastrophic forgetting still poses a serious challenge for machine learning research. Rather than aiming to improve state-of-the-art, in this work we provide insight into the limits and merits of rehearsal, one of continual learning's most established methods. We hypothesize that models trained sequentially with rehearsal tend to stay in the same low-loss region after a task has finished, but are at risk of overfitting on its sample memory, hence harming generalization. We provide both conceptual and strong empirical evidence on three benchmarks for both behaviors, bringing novel insights into the dynamics of rehearsal and continual learning in general. 
Finally, we interpret important continual learning works in the light of our findings, allowing for a deeper understanding of their successes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Verwimp_Rehearsal_Revealed_The_Limits_and_Merits_of_Revisiting_Samples_in_ICCV_2021_paper.pdf", @@ -33936,7 +36232,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Verwimp_2021_ICCV,\n \n author = {\n Verwimp,\n Eli and De Lange,\n Matthias and Tuytelaars,\n Tinne\n},\n title = {\n Rehearsal Revealed: The Limits and Merits of Revisiting Samples in Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9385-9394\n} \n}" }, { "title": "Relating Adversarially Robust Generalization to Flat Minima", @@ -33944,10 +36241,11 @@ "status": "Poster", "track": "main", "pid": 8654, + "author_site": "David Stutz; Matthias Hein; Bernt Schiele", "author": "David Stutz; Matthias Hein; Bernt Schiele", "abstract": "Adversarial training (AT) has become the de-facto standard to obtain models robust against adversarial examples. However, AT exhibits severe robust overfitting: cross-entropy loss on adversarial examples, so-called robust loss, decreases continuously on training examples, while eventually increasing on test examples. In practice, this leads to poor robust generalization, i.e., adversarial robustness does not generalize well to new examples. In this paper, we study the relationship between robust generalization and flatness of the robust loss landscape in weight space, i.e., whether robust loss changes significantly when perturbing weights. 
To this end, we propose average- and worst-case metrics to measure flatness in the robust loss landscape and show a correlation between good robust generalization and flatness. For example, throughout training, flatness reduces significantly during overfitting such that early stopping effectively finds flatter minima in the robust loss landscape. Similarly, AT variants achieving higher adversarial robustness also correspond to flatter minima. This holds for many popular choices, e.g., AT-AWP, TRADES, MART, AT with self-supervision or additional unlabeled examples, as well as simple regularization techniques, e.g., AutoAugment, weight decay or label noise. For fair comparison across these approaches, our flatness measures are specifically designed to be scale-invariant and we conduct extensive experiments to validate our findings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Stutz_Relating_Adversarially_Robust_Generalization_to_Flat_Minima_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrucken; University of T\u00fcbingen, T\u00fcbingen; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrucken", + "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrucken; University of Tübingen, Tübingen; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrucken", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Stutz_Relating_Adversarially_Robust_ICCV_2021_supplemental.pdf", @@ -33960,14 +36258,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Stutz_Relating_Adversarially_Robust_Generalization_to_Flat_Minima_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Max Planck Institute for Informatics;University of T\u00fcbingen", + "aff_unique_norm": "Max Planck Institute for Informatics;University of Tübingen", "aff_unique_dep": ";", 
"aff_unique_url": "https://mpi-inf.mpg.de;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "MPII;Uni T\u00fcbingen", + "aff_unique_abbr": "MPII;Uni Tübingen", "aff_campus_unique_index": "0;1;0", - "aff_campus_unique": "Saarbrucken;T\u00fcbingen", + "aff_campus_unique": "Saarbrucken;Tübingen", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Stutz_2021_ICCV,\n \n author = {\n Stutz,\n David and Hein,\n Matthias and Schiele,\n Bernt\n},\n title = {\n Relating Adversarially Robust Generalization to Flat Minima\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7807-7817\n} \n}" }, { "title": "Relational Embedding for Few-Shot Classification", @@ -33975,6 +36274,7 @@ "status": "Poster", "track": "main", "pid": 4102, + "author_site": "Dahyun Kang; Heeseung Kwon; Juhong Min; Minsu Cho", "author": "Dahyun Kang; Heeseung Kwon; Juhong Min; Minsu Cho", "abstract": "We propose to address the problem of few-shot classification by meta-learning \"what to observe\" and \"where to attend\" in a relational perspective. Our method leverages relational patterns within and between images via self-correlational representation (SCR) and cross-correlational attention (CCA). Within each image, the SCR module transforms a base feature map into a self-correlation tensor and learns to extract structural patterns from the tensor. Between the images, the CCA module computes cross-correlation between two image representations and learns to produce co-attention between them. Our Relational Embedding Network (RENet) combines the two relational modules to learn relational embedding in an end-to-end manner. 
In experimental evaluation, it achieves consistent improvements over state-of-the-art methods on four widely used few-shot classification benchmarks of miniImageNet, tieredImageNet, CUB-200-2011, and CIFAR-FS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kang_Relational_Embedding_for_Few-Shot_Classification_ICCV_2021_paper.pdf", @@ -33989,7 +36289,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kang_Relational_Embedding_for_Few-Shot_Classification_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kang_Relational_Embedding_for_Few-Shot_Classification_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Kang_2021_ICCV,\n \n author = {\n Kang,\n Dahyun and Kwon,\n Heeseung and Min,\n Juhong and Cho,\n Minsu\n},\n title = {\n Relational Embedding for Few-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8822-8833\n} \n}" }, { "title": "Relaxed Transformer Decoders for Direct Action Proposal Generation", @@ -33997,6 +36298,7 @@ "status": "Poster", "track": "main", "pid": 6334, + "author_site": "Jing Tan; Jiaqi Tang; Limin Wang; Gangshan Wu", "author": "Jing Tan; Jiaqi Tang; Limin Wang; Gangshan Wu", "abstract": "Temporal action proposal generation is an important and challenging task in video understanding, which aims at detecting all temporal segments containing action instances of interest. The existing proposal generation approaches are generally based on pre-defined anchor windows or heuristic bottom-up boundary matching strategies. This paper presents a simple and efficient framework (RTD-Net) for direct action proposal generation, by re-purposing a Transformer-alike architecture. 
To tackle the essential visual difference between time and space, we make three important improvements over the original transformer detection framework (DETR). First, to deal with slowness prior in videos, we replace the original Transformer encoder with a boundary attentive module to better capture long-range temporal information. Second, due to the ambiguous temporal boundary and relatively sparse annotations, we present a relaxed matching scheme to relieve the strict criteria of single assignment to each groundtruth. Finally, we devise a three-branch head to further improve the proposal confidence estimation by explicitly predicting its completeness. Extensive experiments on THUMOS14 and ActivityNet-1.3 benchmarks demonstrate the effectiveness of RTD-Net, on both tasks of temporal action proposal generation and temporal action detection. Moreover, due to its simplicity in design, our framework is more efficient than previous proposal generation methods, without non-maximum suppression post-processing. 
The code and models are made available at https://github.com/MCG-NJU/RTD-Action.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tan_Relaxed_Transformer_Decoders_for_Direct_Action_Proposal_Generation_ICCV_2021_paper.pdf", @@ -34020,7 +36322,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tan_2021_ICCV,\n \n author = {\n Tan,\n Jing and Tang,\n Jiaqi and Wang,\n Limin and Wu,\n Gangshan\n},\n title = {\n Relaxed Transformer Decoders for Direct Action Proposal Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13526-13535\n} \n}" }, { "title": "Reliably Fast Adversarial Training via Latent Adversarial Perturbation", @@ -34028,6 +36331,7 @@ "status": "Poster", "track": "main", "pid": 9002, + "author_site": "Geon Yeong Park; Sang Wan Lee", "author": "Geon Yeong Park; Sang Wan Lee", "abstract": "While multi-step adversarial training is widely popular as an effective defense method against strong adversarial attacks, its computational cost is notoriously expensive, compared to standard training. Several single-step adversarial training methods have been proposed to mitigate the above-mentioned overhead cost; however, their performance is not sufficiently reliable depending on the optimization setting. To overcome such limitations, we deviate from the existing input-space-based adversarial training regime and propose a single-step latent adversarial training method (SLAT), which leverages the gradients of latent representation as the latent adversarial perturbation. 
We demonstrate that the L1 norm of feature gradients is implicitly regularized through the adopted latent perturbation, thereby recovering local linearity and ensuring reliable performance, compared to the existing single-step adversarial training methods. Because latent perturbation is based on the gradients of the latent representations which can be obtained for free in the process of input gradients computation, the proposed method costs roughly the same time as the fast gradient sign method. Experiment results demonstrate that the proposed method, despite its structural simplicity, outperforms state-of-the-art accelerated adversarial training methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Park_Reliably_Fast_Adversarial_Training_via_Latent_Adversarial_Perturbation_ICCV_2021_paper.pdf", @@ -34051,7 +36355,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2021_ICCV,\n \n author = {\n Park,\n Geon Yeong and Lee,\n Sang Wan\n},\n title = {\n Reliably Fast Adversarial Training via Latent Adversarial Perturbation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7758-7767\n} \n}" }, { "title": "Removing Adversarial Noise in Class Activation Feature Space", @@ -34059,6 +36364,7 @@ "status": "Poster", "track": "main", "pid": 11044, + "author_site": "Dawei Zhou; Nannan Wang; Chunlei Peng; Xinbo Gao; Xiaoyu Wang; Jun Yu; Tongliang Liu", "author": "Dawei Zhou; Nannan Wang; Chunlei Peng; Xinbo Gao; Xiaoyu Wang; Jun Yu; Tongliang Liu", "abstract": "Deep neural networks (DNNs) are vulnerable to adversarial noise. Pre-processing based defenses could largely remove adversarial noise by processing inputs. 
However, they are typically affected by the error amplification effect, especially in the front of continuously evolving attacks. To solve this problem, in this paper, we propose to remove adversarial noise by implementing a self-supervised adversarial training mechanism in a class activation feature space. To be specific, we first maximize the disruptions to class activation features of natural examples to craft adversarial examples. Then, we train a denoising model to minimize the distances between the adversarial examples and the natural examples in the class activation feature space. Empirical evaluations demonstrate that our method could significantly enhance adversarial robustness in comparison to previous state-of-the-art approaches, especially against unseen adversarial attacks and adaptive attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Removing_Adversarial_Noise_in_Class_Activation_Feature_Space_ICCV_2021_paper.pdf", @@ -34075,14 +36381,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Removing_Adversarial_Noise_in_Class_Activation_Feature_Space_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;2;3;4", - "aff_unique_norm": "Xidian University;Chongqing University of Posts and Telecommunications;Chinese University of Hong Kong;University of Science and Technology of China;University of Sydney", + "aff_unique_norm": "Xidian University;Chongqing University of Posts and Telecommunications;The Chinese University of Hong Kong;University of Science and Technology of China;University of Sydney", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.xidian.edu.cn/;http://www.cqupt.edu.cn;https://www.cuhk.edu.cn;http://www.ustc.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "Xidian;CQUPT;CUHK;USTC;USYD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": 
"China;Australia", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Dawei and Wang,\n Nannan and Peng,\n Chunlei and Gao,\n Xinbo and Wang,\n Xiaoyu and Yu,\n Jun and Liu,\n Tongliang\n},\n title = {\n Removing Adversarial Noise in Class Activation Feature Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7878-7887\n} \n}" }, { "title": "Removing the Bias of Integral Pose Regression", @@ -34090,6 +36397,7 @@ "status": "Poster", "track": "main", "pid": 11152, + "author_site": "Kerui Gu; Linlin Yang; Angela Yao", "author": "Kerui Gu; Linlin Yang; Angela Yao", "abstract": "Heatmap-based detection methods are dominant for 2D human pose estimation even though regression is more intuitive. The introduction of the integral regression method, which, architecture-wise uses an implicit heatmap, brings the two approaches even closer together. This begs the question -- does detection really outperform regression? In this paper, we investigate the difference in supervision between the heatmap-based detection and integral regression, as this is the key remaining difference between the two approaches. In the process, we discover an underlying bias behind integral pose regression that arises from taking the expectation after the softmax function. To counter the bias, we present a compensation method which we find to improve integral regression accuracy on all 2D pose estimation benchmarks. 
We further propose a simple joint detection and bias-compensated regression method that considerably outperforms state-of-the-art baselines with few added components.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Removing_the_Bias_of_Integral_Pose_Regression_ICCV_2021_paper.pdf", @@ -34113,7 +36421,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "Singapore;Germany" + "aff_country_unique": "Singapore;Germany", + "bibtex": "@InProceedings{Gu_2021_ICCV,\n \n author = {\n Gu,\n Kerui and Yang,\n Linlin and Yao,\n Angela\n},\n title = {\n Removing the Bias of Integral Pose Regression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11067-11076\n} \n}" }, { "title": "Representative Color Transform for Image Enhancement", @@ -34121,6 +36430,7 @@ "status": "Poster", "track": "main", "pid": 9697, + "author_site": "Hanul Kim; Su-Min Choi; Chang-Su Kim; Yeong Jun Koh", "author": "Hanul Kim; Su-Min Choi; Chang-Su Kim; Yeong Jun Koh", "abstract": "Recently, the encoder-decoder and intensity transformation approaches lead to impressive progress in image enhancement. However, the encoder-decoder often loses details in input images during down-sampling and up-sampling processes. Also, the intensity transformation has a limited capacity to cover color transformation between low-quality and high-quality images. In this paper, we propose a novel approach, called representative color transform (RCT), to tackle these issues in existing methods. RCT determines different representative colors specialized in input images and estimates transformed colors for the representative colors. It then determines enhanced colors using these transformed colors based on the similarity between input and representative colors. 
Extensive experiments demonstrate that the proposed algorithm outperforms recent state-of-the-art algorithms on various image enhancement problems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Representative_Color_Transform_for_Image_Enhancement_ICCV_2021_paper.pdf", @@ -34144,7 +36454,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Hanul and Choi,\n Su-Min and Kim,\n Chang-Su and Koh,\n Yeong Jun\n},\n title = {\n Representative Color Transform for Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4459-4468\n} \n}" }, { "title": "ResRep: Lossless CNN Pruning via Decoupling Remembering and Forgetting", @@ -34152,6 +36463,7 @@ "status": "Poster", "track": "main", "pid": 6957, + "author_site": "Xiaohan Ding; Tianxiang Hao; Jianchao Tan; Ji Liu; Jungong Han; Yuchen Guo; Guiguang Ding", "author": "Xiaohan Ding; Tianxiang Hao; Jianchao Tan; Ji Liu; Jungong Han; Yuchen Guo; Guiguang Ding", "abstract": "We propose ResRep, a novel method for lossless channel pruning (a.k.a. filter pruning), which slims down a CNN by reducing the width (number of output channels) of convolutional layers. Inspired by the neurobiology research about the independence of remembering and forgetting, we propose to re-parameterize a CNN into the remembering parts and forgetting parts, where the former learn to maintain the performance and the latter learn to prune. Via training with regular SGD on the former but a novel update rule with penalty gradients on the latter, we realize structured sparsity. Then we equivalently merge the remembering and forgetting parts into the original architecture with narrower layers. 
In this sense, ResRep can be viewed as a successful application of Structural Re-parameterization. Such a methodology distinguishes ResRep from the traditional learning-based pruning paradigm that applies a penalty on parameters to produce sparsity, which may suppress the parameters essential for the remembering. ResRep slims down a standard ResNet-50 with 76.15% accuracy on ImageNet to a narrower one with only 45% FLOPs and no accuracy drop, which is the first to achieve lossless pruning with such a high compression ratio. The code and models are at https://github.com/DingXiaoH/ResRep.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_ResRep_Lossless_CNN_Pruning_via_Decoupling_Remembering_and_Forgetting_ICCV_2021_paper.pdf", @@ -34175,7 +36487,8 @@ "aff_campus_unique_index": "1;1;2;1;1", "aff_campus_unique": ";Beijing;Aberystwyth", "aff_country_unique_index": "0+0;0+0;1;1;2;0+0;0+0", - "aff_country_unique": "China;United States;United Kingdom" + "aff_country_unique": "China;United States;United Kingdom", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Xiaohan and Hao,\n Tianxiang and Tan,\n Jianchao and Liu,\n Ji and Han,\n Jungong and Guo,\n Yuchen and Ding,\n Guiguang\n},\n title = {\n ResRep: Lossless CNN Pruning via Decoupling Remembering and Forgetting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4510-4520\n} \n}" }, { "title": "Residual Attention: A Simple but Effective Method for Multi-Label Recognition", @@ -34183,6 +36496,7 @@ "status": "Poster", "track": "main", "pid": 3690, + "author_site": "Ke Zhu; Jianxin Wu", "author": "Ke Zhu; Jianxin Wu", "abstract": "Multi-label image recognition is a challenging computer vision task of practical use. Progresses in this area, however, are often characterized by complicated methods, heavy computations, and lack of intuitive explanations. 
To effectively capture different spatial regions occupied by objects from different categories, we propose an embarrassingly simple module, named class-specific residual attention (CSRA). CSRA generates class-specific features for every category by proposing a simple spatial attention score, and then combines it with the class-agnostic average pooling feature. CSRA achieves state-of-the-art results on multilabel recognition, and at the same time is much simpler than them. Furthermore, with only 4 lines of code, CSRA also leads to consistent improvement across many diverse pretrained models and datasets without any extra training. CSRA is both easy to implement and light in computations, which also enjoys intuitive explanations and visualizations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Residual_Attention_A_Simple_but_Effective_Method_for_Multi-Label_Recognition_ICCV_2021_paper.pdf", @@ -34206,7 +36520,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Ke and Wu,\n Jianxin\n},\n title = {\n Residual Attention: A Simple but Effective Method for Multi-Label Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 184-193\n} \n}" }, { "title": "Rethinking 360deg Image Visual Attention Modelling With Unsupervised Learning.", @@ -34214,7 +36529,8 @@ "status": "Poster", "track": "main", "pid": 9149, - "author": "Yasser Abdelaziz Dahou Djilali; Tarun Krishna; Kevin McGuinness; Noel E. O\u2019Connor", + "author_site": "Yasser Abdelaziz Dahou Djilali; Tarun Krishna; Kevin McGuinness; Noel E. O’Connor", + "author": "Yasser Abdelaziz Dahou Djilali; Tarun Krishna; Kevin McGuinness; Noel E. 
O’Connor", "abstract": "Despite the success of self-supervised representation learning on planar data, to date it has not been studied on 360deg images. In this paper, we extend recent advances in contrastive learning to learn latent representations that are sufficiently invariant to be highly effective for spherical saliency prediction as a downstream task. We argue that omni-directional images are particularly suited to such an approach due to the geometry of the data domain. To verify this hypothesis, we design an unsupervised framework that effectively maximizes the mutual information between the different views from both the equator and the poles. We show that the decoder is able to learn good quality saliency distributions from the encoder embeddings. Our model compares favorably with fully-supervised learning methods on the Salient360!, VR-EyeTracking and Sitzman datasets. This performance is achieved using an encoder that is trained in a completely unsupervised way and a relatively lightweight supervised decoder (3.8 X fewer parameters in the case of the ResNet50 encoder). 
We believe that this combination of supervised and unsupervised learning is an important step toward flexible formulations of human visual attention.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Djilali_Rethinking_360deg_Image_Visual_Attention_Modelling_With_Unsupervised_Learning._ICCV_2021_paper.pdf", "aff": "Insight Centre for Data Analytics, Dublin City University (DCU); Insight Centre for Data Analytics, Dublin City University (DCU); Insight Centre for Data Analytics, Dublin City University (DCU); Insight Centre for Data Analytics, Dublin City University (DCU)", @@ -34237,7 +36553,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Dublin", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Ireland" + "aff_country_unique": "Ireland", + "bibtex": "@InProceedings{Djilali_2021_ICCV,\n \n author = {\n Djilali,\n Yasser Abdelaziz Dahou and Krishna,\n Tarun and McGuinness,\n Kevin and O{\\textquoteright\n}Connor,\n Noel E.\n},\n title = {\n Rethinking 360deg Image Visual Attention Modelling With Unsupervised Learning.\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15414-15424\n} \n}" }, { "title": "Rethinking Coarse-To-Fine Approach in Single Image Deblurring", @@ -34245,6 +36562,7 @@ "status": "Poster", "track": "main", "pid": 4055, + "author_site": "Sung-Jin Cho; Seo-Won Ji; Jun-Pyo Hong; Seung-Won Jung; Sung-Jea Ko", "author": "Sung-Jin Cho; Seo-Won Ji; Jun-Pyo Hong; Seung-Won Jung; Sung-Jea Ko", "abstract": "Coarse-to-fine strategies have been extensively used for the architecture design of single image deblurring networks. Conventional methods typically stack sub-networks with multi-scale input images and gradually improve sharpness of images from the bottom sub-network to the top sub-network, yielding inevitably high computational costs. 
Toward a fast and accurate deblurring network design, we revisit the coarse-to-fine strategy and present a multi-input multi-output U-net (MIMO-UNet). The MIMO-UNet has three distinct features. First, the single encoder of the MIMO-UNet takes multi-scale input images to ease the difficulty of training. Second, the single decoder of the MIMO-UNet outputs multiple deblurred images with different scales to mimic multi-cascaded U-nets using a single U-shaped network. Last, asymmetric feature fusion is introduced to merge multi-scale features in an efficient manner. Extensive experiments on the GoPro and RealBlur datasets demonstrate that the proposed network outperforms the state-of-the-art methods in terms of both accuracy and computational complexity. Source code is available for research purposes at https://github.com/chosj95/MIMO-UNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cho_Rethinking_Coarse-To-Fine_Approach_in_Single_Image_Deblurring_ICCV_2021_paper.pdf", @@ -34268,7 +36586,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cho_2021_ICCV,\n \n author = {\n Cho,\n Sung-Jin and Ji,\n Seo-Won and Hong,\n Jun-Pyo and Jung,\n Seung-Won and Ko,\n Sung-Jea\n},\n title = {\n Rethinking Coarse-To-Fine Approach in Single Image Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4641-4650\n} \n}" }, { "title": "Rethinking Counting and Localization in Crowds: A Purely Point-Based Framework", @@ -34276,6 +36595,7 @@ "status": "Poster", "track": "main", "pid": 6144, + "author_site": "Qingyu Song; Changan Wang; Zhengkai Jiang; Yabiao Wang; Ying Tai; Chengjie Wang; Jilin Li; Feiyue Huang; Yang Wu", "author": "Qingyu Song; Changan Wang; Zhengkai Jiang; Yabiao Wang; Ying Tai; 
Chengjie Wang; Jilin Li; Feiyue Huang; Yang Wu", "abstract": "Localizing individuals in crowds is more in accordance with the practical demands of subsequent high-level crowd analysis tasks than simply counting. However, existing localization based methods relying on intermediate representations (i.e., density maps or pseudo boxes) serving as learning targets are counter-intuitive and error-prone. In this paper, we propose a purely point-based framework for joint crowd counting and individual localization. For this framework, instead of merely reporting the absolute counting error at image level, we propose a new metric, called density Normalized Average Precision (nAP), to provide more comprehensive and more precise performance evaluation. Moreover, we design an intuitive solution under this framework, which is called Point to Point Network (P2PNet). P2PNet discards superfluous steps and directly predicts a set of point proposals to represent heads in an image, being consistent with the human annotation results. By thorough analysis, we reveal the key step towards implementing such a novel idea is to assign optimal learning targets for these proposals. Therefore, we propose to conduct this crucial association in an one-to-one matching manner using the Hungarian algorithm. The P2PNet not only significantly surpasses state-of-the-art methods on popular counting benchmarks, but also achieves promising localization accuracy. 
The codes will be available at: https://github.com/TencentYoutuResearch/CrowdCounting-P2PNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_Rethinking_Counting_and_Localization_in_Crowds_A_Purely_Point-Based_Framework_ICCV_2021_paper.pdf", @@ -34299,7 +36619,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Qingyu and Wang,\n Changan and Jiang,\n Zhengkai and Wang,\n Yabiao and Tai,\n Ying and Wang,\n Chengjie and Li,\n Jilin and Huang,\n Feiyue and Wu,\n Yang\n},\n title = {\n Rethinking Counting and Localization in Crowds: A Purely Point-Based Framework\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3365-3374\n} \n}" }, { "title": "Rethinking Deep Image Prior for Denoising", @@ -34307,6 +36628,7 @@ "status": "Poster", "track": "main", "pid": 6967, + "author_site": "Yeonsik Jo; Se Young Chun; Jonghyun Choi", "author": "Yeonsik Jo; Se Young Chun; Jonghyun Choi", "abstract": "Deep image prior (DIP) serves as a good inductive bias for diverse inverse problems. Among them, denoising is known to be particularly challenging for the DIP due to noise fitting with the requirement of an early stopping. To address the issue, we first analyze the DIP by the notion of effective degrees of freedom (DF) to monitor the optimization progress and propose a principled stopping criterion before fitting to noise without access of a paired ground truth image for Gaussian noise. We also propose the 'stochastic temporal ensemble (STE)' method for incorporating techniques to further improve DIP's performance for denoising. We additionally extend our method to Poisson noise. 
Our empirical validations show that given a single noisy image, our method denoises the image while pre- serving rich textual details. Further, our approach outperforms prior arts in LPIPS by large margins with comparable PSNR and SSIM on seven different datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jo_Rethinking_Deep_Image_Prior_for_Denoising_ICCV_2021_paper.pdf", @@ -34323,14 +36645,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jo_Rethinking_Deep_Image_Prior_for_Denoising_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;1", - "aff_unique_norm": "LG;Gwangju Institute of Science and Technology;Seoul National University", - "aff_unique_dep": "LG AI Research;;Electrical and Computer Engineering", + "aff_unique_norm": "LG AI Research;Gwangju Institute of Science and Technology;Seoul National University", + "aff_unique_dep": ";;Electrical and Computer Engineering", "aff_unique_url": "https://www.lgaires.com;https://www.gist.ac.kr;https://www.snu.ac.kr", "aff_unique_abbr": "LG AI;GIST;SNU", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Gwangju;Seoul", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jo_2021_ICCV,\n \n author = {\n Jo,\n Yeonsik and Chun,\n Se Young and Choi,\n Jonghyun\n},\n title = {\n Rethinking Deep Image Prior for Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5087-5096\n} \n}" }, { "title": "Rethinking Noise Synthesis and Modeling in Raw Denoising", @@ -34338,6 +36661,7 @@ "status": "Poster", "track": "main", "pid": 4157, + "author_site": "Yi Zhang; Hongwei Qin; Xiaogang Wang; Hongsheng Li", "author": "Yi Zhang; Hongwei Qin; Xiaogang Wang; Hongsheng Li", "abstract": "The lack of large-scale real raw image denoising dataset gives the rise to challenges 
on synthesizing realistic raw image noise for training denoising models. However, the real raw image noise is contributed by many noise sources and varies greatly among different sensors. Existing methods are unable to model all noise sources accurately, and building a noise model for each sensor is also laborious. In this paper, we introduce a new perspective to synthesize noise by directly sampling from the sensor's real noise. It inherently generates accurate raw image noise for different camera sensors. Two efficient and generic techniques: pattern-aligned patch sampling and high-bit reconstruction help accurate synthesis of spatial-correlated noise and high-bit noise respectively. We conduct systematic experiments on SIDD and ELD datasets. The results show that (1) our method outperforms existing methods and demonstrates wide generalization on different sensors and lighting conditions. (2) Recent conclusions derived from DNN-based noise modeling methods are actually based on inaccurate noise parameters. 
The DNN-based methods still cannot outperform physics-based statistical methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Rethinking_Noise_Synthesis_and_Modeling_in_Raw_Denoising_ICCV_2021_paper.pdf", @@ -34361,7 +36685,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yi and Qin,\n Hongwei and Wang,\n Xiaogang and Li,\n Hongsheng\n},\n title = {\n Rethinking Noise Synthesis and Modeling in Raw Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4593-4601\n} \n}" }, { "title": "Rethinking Preventing Class-Collapsing in Metric Learning With Margin-Based Losses", @@ -34369,6 +36694,7 @@ "status": "Poster", "track": "main", "pid": 7568, + "author_site": "Elad Levi; Tete Xiao; Xiaolong Wang; Trevor Darrell", "author": "Elad Levi; Tete Xiao; Xiaolong Wang; Trevor Darrell", "abstract": "Metric learning seeks perceptual embeddings where visually similar instances are close and dissimilar instances are apart, but learned representations can be sub-optimal when the distribution of intra-class samples is diverse and distinct sub-clusters are present. Although theoretically with optimal assumptions, margin-based losses such as the triplet loss and margin loss have a diverse family of solutions. We theoretically prove and empirically show that under reasonable noise assumptions, margin-based losses tend to project all samples of a class with various modes onto a single point in the embedding space, resulting in a class collapse that usually renders the space ill-sorted for classification or retrieval. 
To address this problem, we propose a simple modification to the embedding losses such that each sample selects its nearest same-class counterpart in a batch as the positive element in the tuple. This allows for the presence of multiple sub-clusters within each class. The adaptation can be integrated into a wide range of metric learning losses. The proposed sampling method demonstrates clear benefits on various fine-grained image retrieval datasets over a variety of existing losses; qualitative retrieval results show that samples with similar visual patterns are indeed closer in the embedding space.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Levi_Rethinking_Preventing_Class-Collapsing_in_Metric_Learning_With_Margin-Based_Losses_ICCV_2021_paper.pdf", @@ -34383,7 +36709,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Levi_Rethinking_Preventing_Class-Collapsing_in_Metric_Learning_With_Margin-Based_Losses_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Levi_Rethinking_Preventing_Class-Collapsing_in_Metric_Learning_With_Margin-Based_Losses_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Levi_2021_ICCV,\n \n author = {\n Levi,\n Elad and Xiao,\n Tete and Wang,\n Xiaolong and Darrell,\n Trevor\n},\n title = {\n Rethinking Preventing Class-Collapsing in Metric Learning With Margin-Based Losses\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10316-10325\n} \n}" }, { "title": "Rethinking Self-Supervised Correspondence Learning: A Video Frame-Level Similarity Perspective", @@ -34391,6 +36718,7 @@ "status": "Poster", "track": "main", "pid": 8399, + "author_site": "Jiarui Xu; Xiaolong Wang", "author": "Jiarui Xu; Xiaolong Wang", "abstract": "Learning a good representation for space-time correspondence is the key for various computer 
vision tasks, including tracking object bounding boxes and performing video object pixel segmentation. To learn generalizable representation for correspondence in large-scale, a variety of self-supervised pretext tasks are proposed to explicitly perform object-level or patch-level similarity learning. Instead of following the previous literature, we propose to learn correspondence using Video Frame-level Similarity (VFS) learning, i.e, simply learning from comparing video frames. Our work is inspired by the recent success in image-level contrastive learning and similarity learning for visual recognition. Our hypothesis is that if the representation is good for recognition, it requires the convolutional features to find correspondence between similar objects or parts. Our experiments show surprising results that VFS surpasses state-of-the-art self-supervised approaches for both OTB visual object tracking and DAVIS video object segmentation. We perform detailed analysis on what matters in VFS and reveals new properties on image and frame level similarity learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Rethinking_Self-Supervised_Correspondence_Learning_A_Video_Frame-Level_Similarity_Perspective_ICCV_2021_paper.pdf", @@ -34414,7 +36742,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Jiarui and Wang,\n Xiaolong\n},\n title = {\n Rethinking Self-Supervised Correspondence Learning: A Video Frame-Level Similarity Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10075-10085\n} \n}" }, { "title": "Rethinking Spatial Dimensions of Vision Transformers", @@ -34422,6 +36751,7 @@ "status": "Poster", "track": "main", 
"pid": 1311, + "author_site": "Byeongho Heo; Sangdoo Yun; Dongyoon Han; Sanghyuk Chun; Junsuk Choe; Seong Joon Oh", "author": "Byeongho Heo; Sangdoo Yun; Dongyoon Han; Sanghyuk Chun; Junsuk Choe; Seong Joon Oh", "abstract": "Vision Transformer (ViT) extends the application range of transformers from language processing to computer vision tasks as being an alternative architecture against the existing convolutional neural networks (CNN). Since the transformer-based architecture has been innovative for computer vision modeling, the design convention towards an effective architecture has been less studied yet. From the successful design principles of CNN, we investigate the role of spatial dimension conversion and its effectiveness on transformer-based architecture. We particularly attend to the dimension reduction principle of CNNs; as the depth increases, a conventional CNN increases channel dimension and decreases spatial dimensions. We empirically show that such a spatial dimension reduction is beneficial to a transformer architecture as well, and propose a novel Pooling-based Vision Transformer (PiT) upon the original ViT model. We show that PiT achieves the improved model capability and generalization performance against ViT. Throughout the extensive experiments, we further show PiT outperforms the baseline on several tasks such as image classification, object detection, and robustness evaluation. 
Source codes and ImageNet models are available at https://github.com/naver-ai/pit.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Heo_Rethinking_Spatial_Dimensions_of_Vision_Transformers_ICCV_2021_paper.pdf", @@ -34445,7 +36775,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Heo_2021_ICCV,\n \n author = {\n Heo,\n Byeongho and Yun,\n Sangdoo and Han,\n Dongyoon and Chun,\n Sanghyuk and Choe,\n Junsuk and Oh,\n Seong Joon\n},\n title = {\n Rethinking Spatial Dimensions of Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11936-11945\n} \n}" }, { "title": "Rethinking Transformer-Based Set Prediction for Object Detection", @@ -34453,6 +36784,7 @@ "status": "Poster", "track": "main", "pid": 5484, + "author_site": "Zhiqing Sun; Shengcao Cao; Yiming Yang; Kris M. Kitani", "author": "Zhiqing Sun; Shengcao Cao; Yiming Yang; Kris M. Kitani", "abstract": "DETR is a recently proposed Transformer-based method which views object detection as a set prediction problem and achieves state-of-the-art performance but demands extra-long training time to converge. In this paper, we investigate the causes of the optimization difficulty in the training of DETR. Our examinations reveal several factors contributing to the slow convergence of DETR, primarily the issues with the Hungarian loss and the Transformer cross attention mechanism. To overcome these issues we propose two solutions, namely, TSP-FCOS (Transformer-based Set Prediction with FCOS) and TSP-RCNN (Transformer-based Set Prediction with RCNN). 
Experimental results show that the proposed methods not only converge much faster than the original DETR, but also significantly outperform DETR and other baselines in terms of detection accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Rethinking_Transformer-Based_Set_Prediction_for_Object_Detection_ICCV_2021_paper.pdf", @@ -34476,7 +36808,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Zhiqing and Cao,\n Shengcao and Yang,\n Yiming and Kitani,\n Kris M.\n},\n title = {\n Rethinking Transformer-Based Set Prediction for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3611-3620\n} \n}" }, { "title": "Rethinking and Improving Relative Position Encoding for Vision Transformer", @@ -34484,6 +36817,7 @@ "status": "Poster", "track": "main", "pid": 5789, + "author_site": "Kan Wu; Houwen Peng; Minghao Chen; Jianlong Fu; Hongyang Chao", "author": "Kan Wu; Houwen Peng; Minghao Chen; Jianlong Fu; Hongyang Chao", "abstract": "Relative position encoding (RPE) is important for transformer to capture sequence ordering of input tokens. General efficacy has been proven in natural language processing. However, in computer vision, its efficacy is not well studied and even remains controversial, e.g., whether relative position encoding can work equally well as absolute position? In order to clarify this, we first review existing relative position encoding methods and analyze their pros and cons when applied in vision transformers. We then propose new relative position encoding methods dedicated to 2D images, called image RPE (iRPE). 
Our methods consider directional relative distance modeling as well as the interactions between queries and relative position embeddings in self-attention mechanism. The proposed iRPE methods are simple and lightweight. They can be easily plugged into transformer blocks. Experiments demonstrate that solely due to the proposed encoding methods, DeiT and DETR obtain up to 1.5% (top-1 Acc) and 1.3% (mAP) stable improvements over their original versions on ImageNet and COCO respectively, without tuning any extra hyperparameters such as learning rate and weight decay. Our ablation and analysis also yield interesting findings, some of which run counter to previous understanding. Code and models are open-sourced at https://github.com/microsoft/Cream/tree/main/iRPE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Rethinking_and_Improving_Relative_Position_Encoding_for_Vision_Transformer_ICCV_2021_paper.pdf", @@ -34500,14 +36834,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Rethinking_and_Improving_Relative_Position_Encoding_for_Vision_Transformer_ICCV_2021_paper.html", "aff_unique_index": "0+0;1;1;1;0+0", - "aff_unique_norm": "Sun Yat-sen University;Microsoft", + "aff_unique_norm": "Sun Yat-sen University;Microsoft Research", "aff_unique_dep": "School of Computer Science and Engineering;Research", "aff_unique_url": "http://www.sysu.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "SYSU;MSR Asia", "aff_campus_unique_index": ";1;1;1;", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Kan and Peng,\n Houwen and Chen,\n Minghao and Fu,\n Jianlong and Chao,\n Hongyang\n},\n title = {\n Rethinking and Improving Relative Position Encoding for Vision Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10033-10041\n} \n}" }, { "title": "Rethinking the Backdoor Attacks' Triggers: A Frequency Perspective", @@ -34515,6 +36850,7 @@ "status": "Poster", "track": "main", "pid": 5581, + "author_site": "Yi Zeng; Won Park; Z. Morley Mao; Ruoxi Jia", "author": "Yi Zeng; Won Park; Z. Morley Mao; Ruoxi Jia", "abstract": "Backdoor attacks have been considered a severe security threat to deep learning. Such attacks can make models perform abnormally on inputs with predefined triggers and still retain state-of-the-art performance on clean data. While backdoor attacks have been thoroughly investigated in the image domain from both attackers' and defenders' sides, an analysis in the frequency domain has been missing thus far. This paper first revisits existing backdoor triggers from a frequency perspective and performs a comprehensive analysis. Our results show that many current backdoor attacks exhibit severe high-frequency artifacts, which persist across different datasets and resolutions. We further demonstrate these high-frequency artifacts enable a simple way to detect existing backdoor triggers at a detection rate of 98.50% without prior knowledge of the attack details and the target model. Acknowledging previous attacks' weaknesses, we propose a practical way to create smooth backdoor triggers without high-frequency artifacts and study their detectability. We show that existing defense works can benefit by incorporating these smooth triggers into their design consideration. Moreover, we show that the detector tuned over stronger smooth triggers can generalize well to unseen weak smooth triggers. 
In short, our work emphasizes the importance of considering frequency analysis when designing both backdoor attacks and defenses in deep learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zeng_Rethinking_the_Backdoor_Attacks_Triggers_A_Frequency_Perspective_ICCV_2021_paper.pdf", @@ -34538,7 +36874,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zeng_2021_ICCV,\n \n author = {\n Zeng,\n Yi and Park,\n Won and Mao,\n Z. Morley and Jia,\n Ruoxi\n},\n title = {\n Rethinking the Backdoor Attacks' Triggers: A Frequency Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16473-16481\n} \n}" }, { "title": "Rethinking the Truly Unsupervised Image-to-Image Translation", @@ -34546,10 +36883,11 @@ "status": "Poster", "track": "main", "pid": 1744, + "author_site": "Kyungjune Baek; Yunjey Choi; Youngjung Uh; Jaejun Yoo; Hyunjung Shim", "author": "Kyungjune Baek; Yunjey Choi; Youngjung Uh; Jaejun Yoo; Hyunjung Shim", "abstract": "Every recent image-to-image translation model inherently requires either image-level (i.e. input-output pairs) or set-level (i.e. domain labels) supervision. However, even set-level supervision can be a severe bottleneck for data collection in practice. In this paper, we tackle image-to-image translation in a fully unsupervised setting, i.e., neither paired images nor domain labels. To this end, we propose a truly unsupervised image-to-image translation model (TUNIT) that simultaneously learns to separate image domains and translates input images into the estimated domains. 
Experimental results show that our model achieves comparable or even better performance than the set-level supervised model trained with full labels, generalizes well on various datasets, and is robust against the choice of hyperparameters (e.g. the preset number of pseudo domains). Furthermore, TUNIT can be easily extended to semi-supervised learning with a few labeled data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Baek_Rethinking_the_Truly_Unsupervised_Image-to-Image_Translation_ICCV_2021_paper.pdf", - "aff": "Yonsei University*; NA VER AI Lab; Yonsei University; UNIST; Yonsei University\u2020", + "aff": "Yonsei University*; NA VER AI Lab; Yonsei University; UNIST; Yonsei University†", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Baek_Rethinking_the_Truly_ICCV_2021_supplemental.pdf", @@ -34569,7 +36907,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Baek_2021_ICCV,\n \n author = {\n Baek,\n Kyungjune and Choi,\n Yunjey and Uh,\n Youngjung and Yoo,\n Jaejun and Shim,\n Hyunjung\n},\n title = {\n Rethinking the Truly Unsupervised Image-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14154-14163\n} \n}" }, { "title": "RetrievalFuse: Neural 3D Scene Reconstruction With a Database", @@ -34577,10 +36916,11 @@ "status": "Poster", "track": "main", "pid": 1750, - "author": "Yawar Siddiqui; Justus Thies; Fangchang Ma; Qi Shan; Matthias Nie\u00dfner; Angela Dai", + "author_site": "Yawar Siddiqui; Justus Thies; Fangchang Ma; Qi Shan; Matthias Nießner; Angela Dai", + "author": "Yawar Siddiqui; Justus Thies; Fangchang Ma; Qi Shan; Matthias Nießner; Angela Dai", "abstract": "3D reconstruction of large scenes 
is a challenging problem due to the high-complexity nature of the solution space, in particular for generative neural networks. In contrast to traditional generative learned models which encode the full generative process into a neural network and can struggle with maintaining local details at the scene level, we introduce a new method that directly leverages scene geometry from the training database. First, we learn to synthesize an initial estimate for a 3D scene, constructed by retrieving a top-k set of volumetric chunks from the scene database. These candidates are then refined to a final scene generation with an attention-based refinement that can effectively select the most consistent set of geometry from the candidates and combine them together to create an output scene, facilitating transfer of coherent structures and local detail from train scene geometry. We demonstrate our neural scene reconstruction with a database for the tasks of 3D super-resolution and surface reconstruction from sparse point clouds, showing that our approach enables generation of more coherent, accurate 3D scenes, improving on average by over 8% in IoU over state-of-the-art scene reconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Siddiqui_RetrievalFuse_Neural_3D_Scene_Reconstruction_With_a_Database_ICCV_2021_paper.pdf", - "aff": "Technical University of Munich; Technical University of Munich + Max Planck Institute for Intelligent Systems, T\u00fcbingen; Apple; Apple; Technical University of Munich; Technical University of Munich", + "aff": "Technical University of Munich; Technical University of Munich + Max Planck Institute for Intelligent Systems, Tübingen; Apple; Apple; Technical University of Munich; Technical University of Munich", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Siddiqui_RetrievalFuse_Neural_3D_ICCV_2021_supplemental.zip", @@ -34593,14 +36933,15 @@ "author_num": 6, "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Siddiqui_RetrievalFuse_Neural_3D_Scene_Reconstruction_With_a_Database_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;2;2;0;0", - "aff_unique_norm": "Technical University of Munich;Max Planck Institute for Intelligent Systems;Apple", - "aff_unique_dep": ";;Apple Inc.", + "aff_unique_norm": "Technical University of Munich;Max Planck Institute for Intelligent Systems;Apple Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;https://www.mpi-is.mpg.de;https://www.apple.com", "aff_unique_abbr": "TUM;MPI-IS;Apple", "aff_campus_unique_index": "1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;0+0;1;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Siddiqui_2021_ICCV,\n \n author = {\n Siddiqui,\n Yawar and Thies,\n Justus and Ma,\n Fangchang and Shan,\n Qi and Nie{\\ss\n}ner,\n Matthias and Dai,\n Angela\n},\n title = {\n RetrievalFuse: Neural 3D Scene Reconstruction With a Database\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12568-12577\n} \n}" }, { "title": "Retrieve in Style: Unsupervised Facial Feature Transfer and Retrieval", @@ -34608,6 +36949,7 @@ "status": "Poster", "track": "main", "pid": 4320, + "author_site": "Min Jin Chong; Wen-Sheng Chu; Abhishek Kumar; David Forsyth", "author": "Min Jin Chong; Wen-Sheng Chu; Abhishek Kumar; David Forsyth", "abstract": "We present Retrieve in Style (RIS), an unsupervised framework for facial feature transfer and retrieval on real images. Recent work shows capabilities of transferring local facial features by capitalizing on the disentanglement property of the StyleGAN latent space. 
RIS improves existing art on the following: 1) Introducing more effective feature disentanglement to allow for challenging transfers (i.e., hair, pose) that were not shown possible in SoTA methods. 2) Eliminating the need for per-image hyperparameter tuning, and for computing a catalog over a large batch of images. 3) Enabling fine-grained face retrieval using disentangled facial features (e.g., eyes). To our best knowledge, this is the first work to retrieve face images at this fine level. 4) Demonstrating robust, natural editing on real images. Our qualitative and quantitative analyses show RIS achieves both high-fidelity feature transfers and accurate fine-grained retrievals on real images. We also discuss the responsible applications of RIS. Our code is available at https://github.com/mchong6/RetrieveInStyle.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chong_Retrieve_in_Style_Unsupervised_Facial_Feature_Transfer_and_Retrieval_ICCV_2021_paper.pdf", @@ -34624,14 +36966,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chong_Retrieve_in_Style_Unsupervised_Facial_Feature_Transfer_and_Retrieval_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Google", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://illinois.edu;https://research.google", "aff_unique_abbr": "UIUC;Google Research", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Urbana-Champaign;Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chong_2021_ICCV,\n \n author = {\n Chong,\n Min Jin and Chu,\n Wen-Sheng and Kumar,\n Abhishek and Forsyth,\n David\n},\n title = {\n Retrieve in Style: Unsupervised Facial Feature Transfer and Retrieval\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3887-3896\n} \n}" }, { "title": "Revealing the Reciprocal Relations Between Self-Supervised Stereo and Monocular Depth Estimation", @@ -34639,6 +36982,7 @@ "status": "Poster", "track": "main", "pid": 2418, + "author_site": "Zhi Chen; Xiaoqing Ye; Wei Yang; Zhenbo Xu; Xiao Tan; Zhikang Zou; Errui Ding; Xinming Zhang; Liusheng Huang", "author": "Zhi Chen; Xiaoqing Ye; Wei Yang; Zhenbo Xu; Xiao Tan; Zhikang Zou; Errui Ding; Xinming Zhang; Liusheng Huang", "abstract": "Current self-supervised depth estimation algorithms mainly focus on either stereo or monocular only, neglecting the reciprocal relations between them. In this paper, we propose a simple yet effective framework to improve both stereo and monocular depth estimation by leveraging the underlying complementary knowledge of the two tasks. Our approach consists of three stages. In the first stage, the proposed stereo matching network termed StereoNet is trained on image pairs in a self-supervised manner. Second, we introduce an occlusion-aware distillation (OA Distillation) module, which leverages the predicted depths from StereoNet in non-occluded regions to train our monocular depth estimation network named SingleNet. At last, we design an occlusion-aware fusion module (OA Fusion), which generates more reliable depths by fusing estimated depths from StereoNet and SingleNet given the occlusion map. Furthermore, we also take the fused depths as pseudo labels to supervise StereoNet in turn, which brings StereoNet's performance to a new height. Extensive experiments on KITTI dataset demonstrate the effectiveness of our proposed framework. 
We achieve new SOTA performance on both stereo and monocular depth estimation tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Revealing_the_Reciprocal_Relations_Between_Self-Supervised_Stereo_and_Monocular_Depth_ICCV_2021_paper.pdf", @@ -34655,14 +36999,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Revealing_the_Reciprocal_Relations_Between_Self-Supervised_Stereo_and_Monocular_Depth_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;1;1;1;0;0", - "aff_unique_norm": "University of Science and Technology of China;Baidu", + "aff_unique_norm": "University of Science and Technology of China;Baidu Inc.", "aff_unique_dep": ";Department of Computer Vision Technology (VIS)", "aff_unique_url": "http://www.ustc.edu.cn;https://www.baidu.com", "aff_unique_abbr": "USTC;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Zhi and Ye,\n Xiaoqing and Yang,\n Wei and Xu,\n Zhenbo and Tan,\n Xiao and Zou,\n Zhikang and Ding,\n Errui and Zhang,\n Xinming and Huang,\n Liusheng\n},\n title = {\n Revealing the Reciprocal Relations Between Self-Supervised Stereo and Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15529-15538\n} \n}" }, { "title": "Revisiting Adversarial Robustness Distillation: Robust Soft Labels Make Student Better", @@ -34670,6 +37015,7 @@ "status": "Poster", "track": "main", "pid": 5508, + "author_site": "Bojia Zi; Shihao Zhao; Xingjun Ma; Yu-Gang Jiang", "author": "Bojia Zi; Shihao Zhao; Xingjun Ma; Yu-Gang Jiang", "abstract": "Adversarial training is one effective approach for training robust deep neural networks against adversarial attacks. 
While being able to bring reliable robustness, adversarial training (AT) methods in general favor high capacity models, i.e., the larger the model the better the robustness. This tends to limit their effectiveness on small models, which are more preferable in scenarios where storage or computing resources are very limited (e.g., mobile devices). In this paper, we leverage the concept of knowledge distillation to improve the robustness of small models by distilling from adversarially trained large models. We first revisit several state-of-the-art AT methods from a distillation perspective and identify one common technique that can lead to improved robustness: the use of robust soft labels -- predictions of a robust model. Following this observation, we propose a novel adversarial robustness distillation method called Robust Soft Label Adversarial Distillation (RSLAD) to train robust small student models. RSLAD fully exploits the robust soft labels produced by a robust (adversarially-trained) large teacher model to guide the student's learning on both natural and adversarial examples in all loss terms. We empirically demonstrate the effectiveness of our RSLAD approach over existing adversarial training and distillation methods in improving the robustness of small models against state-of-the-art attacks including the AutoAttack. We also provide a set of understandings on our RSLAD and the importance of robust soft labels for adversarial robustness distillation. 
Code: https://github.com/zibojia/RSLAD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zi_Revisiting_Adversarial_Robustness_Distillation_Robust_Soft_Labels_Make_Student_Better_ICCV_2021_paper.pdf", @@ -34693,7 +37039,8 @@ "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Shanghai;;Geelong", "aff_country_unique_index": "0+0;0+0;1;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zi_2021_ICCV,\n \n author = {\n Zi,\n Bojia and Zhao,\n Shihao and Ma,\n Xingjun and Jiang,\n Yu-Gang\n},\n title = {\n Revisiting Adversarial Robustness Distillation: Robust Soft Labels Make Student Better\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16443-16452\n} \n}" }, { "title": "Revisiting Stereo Depth Estimation From a Sequence-to-Sequence Perspective With Transformers", @@ -34701,6 +37048,7 @@ "status": "Poster", "track": "main", "pid": 2986, + "author_site": "Zhaoshuo Li; Xingtong Liu; Nathan Drenkow; Andy Ding; Francis X. Creighton; Russell H. Taylor; Mathias Unberath", "author": "Zhaoshuo Li; Xingtong Liu; Nathan Drenkow; Andy Ding; Francis X. Creighton; Russell H. Taylor; Mathias Unberath", "abstract": "Stereo depth estimation relies on optimal correspondence matching between pixels on epipolar lines in the left and right images to infer depth. In this work, we revisit the problem from a sequence-to-sequence correspondence perspective to replace cost volume construction with dense pixel matching using position information and attention. This approach, named STereo TRansformer (STTR), has several advantages: It 1) relaxes the limitation of a fixed disparity range, 2) identifies occluded regions and provides confidence estimates, and 3) imposes uniqueness constraints during the matching process. 
We report promising results on both synthetic and real-world datasets and demonstrate that STTR generalizes across different domains, even without fine-tuning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Revisiting_Stereo_Depth_Estimation_From_a_Sequence-to-Sequence_Perspective_With_Transformers_ICCV_2021_paper.pdf", @@ -34724,7 +37072,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Zhaoshuo and Liu,\n Xingtong and Drenkow,\n Nathan and Ding,\n Andy and Creighton,\n Francis X. and Taylor,\n Russell H. and Unberath,\n Mathias\n},\n title = {\n Revisiting Stereo Depth Estimation From a Sequence-to-Sequence Perspective With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6197-6206\n} \n}" }, { "title": "Revitalizing Optimization for 3D Human Pose and Shape Estimation: A Sparse Constrained Formulation", @@ -34732,6 +37081,7 @@ "status": "Poster", "track": "main", "pid": 3916, + "author_site": "Taosha Fan; Kalyan Vasudev Alwala; Donglai Xiang; Weipeng Xu; Todd Murphey; Mustafa Mukadam", "author": "Taosha Fan; Kalyan Vasudev Alwala; Donglai Xiang; Weipeng Xu; Todd Murphey; Mustafa Mukadam", "abstract": "We propose a novel sparse constrained formulation and from it derive a real-time optimization method for 3D human pose and shape estimation. Our optimization method, SCOPE (Sparse Constrained Optimization for 3D human Pose and shapE estimation), is orders of magnitude faster (avg. 4 ms convergence) than existing optimization methods, while being mathematically equivalent to their dense unconstrained formulation under mild assumptions. 
We achieve this by exploiting the underlying sparsity and constraints of our formulation to efficiently compute the Gauss-Newton direction. We show that this computation scales linearly with the number of joints and measurements of a complex 3D human model, in contrast to prior work where it scales cubically due to their dense unconstrained formulation. Based on our optimization method, we present a real-time motion capture framework that estimates 3D human poses and shapes from a single image at over 30 FPS. In benchmarks against state-of-the-art methods on multiple public datasets, our framework outperforms other optimization methods and achieves competitive accuracy against regression methods. Project page with code and videos: https://sites.google.com/view/scope-human/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_Revitalizing_Optimization_for_3D_Human_Pose_and_Shape_Estimation_A_ICCV_2021_paper.pdf", @@ -34746,7 +37096,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Revitalizing_Optimization_for_3D_Human_Pose_and_Shape_Estimation_A_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Revitalizing_Optimization_for_3D_Human_Pose_and_Shape_Estimation_A_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Taosha and Alwala,\n Kalyan Vasudev and Xiang,\n Donglai and Xu,\n Weipeng and Murphey,\n Todd and Mukadam,\n Mustafa\n},\n title = {\n Revitalizing Optimization for 3D Human Pose and Shape Estimation: A Sparse Constrained Formulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11457-11466\n} \n}" }, { "title": "Road Anomaly Detection by Partial Image Reconstruction With Segmentation Coupling", @@ -34754,7 +37105,8 @@ "status": "Poster", "track": "main", "pid": 10232, - 
"author": "Tomas Vojir; Tom\u00e1\u0161 \u0160ipka; Rahaf Aljundi; Nikolay Chumerin; Daniel Olmeda Reino; Jiri Matas", + "author_site": "Tomas Vojir; Tomáš Šipka; Rahaf Aljundi; Nikolay Chumerin; Daniel Olmeda Reino; Jiri Matas", + "author": "Tomas Vojir; Tomáš Šipka; Rahaf Aljundi; Nikolay Chumerin; Daniel Olmeda Reino; Jiri Matas", "abstract": "We present a novel approach to the detection of unknown objects in the context of autonomous driving. The problem is formulated as anomaly detection, since we assume that the unknown stuff or object appearance cannot be learned. To that end, we propose a reconstruction module that can be used with many existing semantic segmentation networks, and that is trained to recognize and reconstruct road (drivable) surface from a small bottleneck. We postulate that poor reconstruction of the road surface is due to areas that are outside of the training distribution, which is a strong indicator of an anomaly. The road structural similarity error is coupled with the semantic segmentation to incorporate information from known classes and produce final per-pixel anomaly scores. 
The proposed JSR-Net was evaluated on four datasets, Lost-and-found, Road Anomaly, Road Obstacles, and FishyScapes, achieving state-of-art performance on all, reducing the false positives significantly, while typically having the highest average precision for wide range of operation points.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vojir_Road_Anomaly_Detection_by_Partial_Image_Reconstruction_With_Segmentation_Coupling_ICCV_2021_paper.pdf", "aff": "Czech Technical University in Prague, Faculty of Electrical Engineering, Czech Republic+Toyota Motor Europe, Brussels, Belgium; Czech Technical University in Prague, Faculty of Electrical Engineering, Czech Republic; Toyota Motor Europe, Brussels, Belgium; Toyota Motor Europe, Brussels, Belgium; Toyota Motor Europe, Brussels, Belgium; Czech Technical University in Prague, Faculty of Electrical Engineering, Czech Republic", @@ -34777,7 +37129,8 @@ "aff_campus_unique_index": "0+1;0;1;1;1;0", "aff_campus_unique": "Prague;Brussels", "aff_country_unique_index": "0+1;0;1;1;1;0", - "aff_country_unique": "Czech Republic;Belgium" + "aff_country_unique": "Czech Republic;Belgium", + "bibtex": "@InProceedings{Vojir_2021_ICCV,\n \n author = {\n Vojir,\n Tomas and \\v{S\n}ipka,\n Tom\\'a\\v{s\n} and Aljundi,\n Rahaf and Chumerin,\n Nikolay and Reino,\n Daniel Olmeda and Matas,\n Jiri\n},\n title = {\n Road Anomaly Detection by Partial Image Reconstruction With Segmentation Coupling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15651-15660\n} \n}" }, { "title": "Robust 2D/3D Vehicle Parsing in Arbitrary Camera Views for CVIS", @@ -34785,6 +37138,7 @@ "status": "Poster", "track": "main", "pid": 2674, + "author_site": "Hui Miao; Feixiang Lu; Zongdai Liu; Liangjun Zhang; Dinesh Manocha; Bin Zhou", "author": "Hui Miao; Feixiang Lu; Zongdai Liu; Liangjun Zhang; Dinesh Manocha; Bin Zhou", "abstract": 
"We present a novel approach to robustly detect and perceive vehicles in different camera views as part of a cooperative vehicle-infrastructure system (CVIS). Our formulation is designed for arbitrary camera views and makes no assumptions about intrinsic or extrinsic parameters. First, to deal with multi-view data scarcity, we propose a part-assisted novel view synthesis algorithm for data augmentation. We train a part-based texture inpainting network in a self-supervised manner. Then we render the textured model into the background image with the target 6-DoF pose. Second, to handle various camera parameters, we present a new method that produces dense mappings between image pixels and 3D points to perform robust 2D/3D vehicle parsing. Third, we build the first CVIS dataset for benchmarking, which annotates more than 1540 images (14017 instances) from real-world traffic scenarios. We combine these novel algorithms and datasets to develop a robust approach for 2D/3D vehicle parsing for CVIS. 
In practice, our approach outperforms SOTA methods on 2D detection, instance segmentation, and 6-DoF pose estimation by 3.8%, 4.3%, and 2.9%, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Miao_Robust_2D3D_Vehicle_Parsing_in_Arbitrary_Camera_Views_for_CVIS_ICCV_2021_paper.pdf", @@ -34801,14 +37155,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Miao_Robust_2D3D_Vehicle_Parsing_in_Arbitrary_Camera_Views_for_CVIS_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;2;0+3", - "aff_unique_norm": "Beihang University;Baidu;University of Maryland;Pengcheng Laboratory", - "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems;Robotics and Autonomous Driving Laboratory;;Peng Cheng Laboratory", + "aff_unique_norm": "Beihang University;Baidu Research;University of Maryland;Peng Cheng Laboratory", + "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems;Robotics and Autonomous Driving Laboratory;;", "aff_unique_url": "http://www.buaa.edu.cn;https://baidu.com;https://www/umd.edu;", "aff_unique_abbr": "Beihang;Baidu;UMD;", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";College Park;Shenzhen", "aff_country_unique_index": "0;0;0;0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Miao_2021_ICCV,\n \n author = {\n Miao,\n Hui and Lu,\n Feixiang and Liu,\n Zongdai and Zhang,\n Liangjun and Manocha,\n Dinesh and Zhou,\n Bin\n},\n title = {\n Robust 2D/3D Vehicle Parsing in Arbitrary Camera Views for CVIS\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15631-15640\n} \n}" }, { "title": "Robust Automatic Monocular Vehicle Speed Estimation for Traffic Surveillance", @@ -34816,6 +37171,7 @@ "status": "Poster", "track": "main", "pid": 3900, + "author_site": "Jerome 
Revaud; Martin Humenberger", "author": "Jerome Revaud; Martin Humenberger", "abstract": "Even though CCTV cameras are widely deployed for traffic surveillance and have therefore the potential of becoming cheap automated sensors for traffic speed analysis, their large-scale usage toward this goal has not been reported yet. A key difficulty lies in fact in the camera calibration phase. Existing state-of-the-art methods perform the calibration using image processing or keypoint detection techniques that require high-quality video streams, yet typical CCTV footage is low-resolution and noisy. As a result, these methods largely fail in real-world conditions. In contrast, we propose two novel calibration techniques whose only inputs come from an off-the-shelf object detector. Both methods consider multiple detections jointly, leveraging the fact that cars have similar and well-known 3D shapes with normalized dimensions. The first one is based on minimizing an energy function corresponding to a 3D reprojection error, the second one instead learns from synthetic training data to predict the scene geometry directly. Noticing the lack of speed estimation benchmarks faithfully reflecting the actual quality of surveillance cameras, we introduce a novel dataset collected from public CCTV streams. Experimental results conducted on three diverse benchmarks demonstrate excellent speed estimation accuracy that could enable the wide use of CCTV cameras for traffic analysis, even in challenging conditions where state-of-the-art methods completely fail. 
Additional information can be found on our project web page: https://rebrand.ly/nle-cctv", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Revaud_Robust_Automatic_Monocular_Vehicle_Speed_Estimation_for_Traffic_Surveillance_ICCV_2021_paper.pdf", @@ -34839,7 +37195,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Revaud_2021_ICCV,\n \n author = {\n Revaud,\n Jerome and Humenberger,\n Martin\n},\n title = {\n Robust Automatic Monocular Vehicle Speed Estimation for Traffic Surveillance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4551-4561\n} \n}" }, { "title": "Robust Object Detection via Instance-Level Temporal Cycle Confusion", @@ -34847,10 +37204,11 @@ "status": "Poster", "track": "main", "pid": 5408, + "author_site": "Xin Wang; Thomas E. Huang; Benlin Liu; Fisher Yu; Xiaolong Wang; Joseph E. Gonzalez; Trevor Darrell", "author": "Xin Wang; Thomas E. Huang; Benlin Liu; Fisher Yu; Xiaolong Wang; Joseph E. Gonzalez; Trevor Darrell", "abstract": "Building reliable object detectors that are robust to domain shifts, such as various changes in context, viewpoint, and object appearances, is critical for real-world applications. In this work, we study the effectiveness of auxiliary self-supervised tasks to improve the out-of-distribution generalization of object detectors. Inspired by the principle of maximum entropy, we introduce a novel self-supervised task, instance-level temporal cycle confusion (CycConf), which operates on the region features of the object detectors. For each object, the task is to find the most different object proposals in the adjacent frame in a video and then cycle back to itself for self-supervision. 
CycConf encourages the object detector to explore invariant structures across instances under various motions, which leads to improved model robustness in unseen domains at test time. We observe consistent out-of-domain performance improvements when training object detectors in tandem with self-supervised tasks on various domain adaptation benchmarks with static images (Cityscapes, Foggy Cityscapes, Sim10K) and large-scale video datasets (BDD100K and Waymo open data). The code and models are released at https://xinw.ai/cyc-conf.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Robust_Object_Detection_via_Instance-Level_Temporal_Cycle_Confusion_ICCV_2021_paper.pdf", - "aff": "Microsoft Research; ETH Z\u00fcrich; University of Washington; UC San Diego; UC Berkeley; UC Berkeley; UC Berkeley", + "aff": "Microsoft Research; ETH Zürich; University of Washington; UC San Diego; UC Berkeley; UC Berkeley; UC Berkeley", "project": "https://xinw.ai/cyc-conf", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Wang_Robust_Object_Detection_ICCV_2021_supplemental.pdf", @@ -34863,14 +37221,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Robust_Object_Detection_via_Instance-Level_Temporal_Cycle_Confusion_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4;4;4", - "aff_unique_norm": "Microsoft;ETH Zurich;University of Washington;University of California, San Diego;University of California, Berkeley", + "aff_unique_norm": "Microsoft Corporation;ETH Zürich;University of Washington;University of California, San Diego;University of California, Berkeley", "aff_unique_dep": "Microsoft Research;;;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ethz.ch;https://www.washington.edu;https://www.ucsd.edu;https://www.berkeley.edu", "aff_unique_abbr": "MSR;ETHZ;UW;UCSD;UC Berkeley", "aff_campus_unique_index": "1;2;2;2", "aff_campus_unique": ";San Diego;Berkeley", 
"aff_country_unique_index": "0;1;0;0;0;0;0", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Xin and Huang,\n Thomas E. and Liu,\n Benlin and Yu,\n Fisher and Wang,\n Xiaolong and Gonzalez,\n Joseph E. and Darrell,\n Trevor\n},\n title = {\n Robust Object Detection via Instance-Level Temporal Cycle Confusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9143-9152\n} \n}" }, { "title": "Robust Small Object Detection on the Water Surface Through Fusion of Camera and Millimeter Wave Radar", @@ -34878,6 +37237,7 @@ "status": "Poster", "track": "main", "pid": 7924, + "author_site": "Yuwei Cheng; Hu Xu; Yimin Liu", "author": "Yuwei Cheng; Hu Xu; Yimin Liu", "abstract": "In recent years, unmanned surface vehicles (USVs) have been experiencing growth in various applications. With the expansion of USVs' application scenes from the typical marine areas to inland waters, new challenges arise for the object detection task, which is an essential part of the perception system of USVs. In our work, we focus on a relatively unexplored task for USVs in inland waters: small object detection on water surfaces, which is of vital importance for safe autonomous navigation and USVs' certain missions such as floating waste cleaning. Considering the limitations of vision-based object detection, we propose a novel vision-radar fusion based method for robust small object detection on water surfaces. By using a novel representation format of millimeter wave radar point clouds and applying a deep-level multi-scale fusion of RGB images and radar data, the proposed method can efficiently utilize the characteristics of radar data and improve the accuracy and robustness for small object detection on water surfaces. 
We test the method on the real-world floating bottle dataset that we collected and released. The result shows that, our method improves the average detection accuracy significantly compared to the vision-based methods and achieves state-of-the-art performance. Besides, the proposed method performs robustly when single sensor degrades.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_Robust_Small_Object_Detection_on_the_Water_Surface_Through_Fusion_ICCV_2021_paper.pdf", @@ -34901,7 +37261,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Yuwei and Xu,\n Hu and Liu,\n Yimin\n},\n title = {\n Robust Small Object Detection on the Water Surface Through Fusion of Camera and Millimeter Wave Radar\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15263-15272\n} \n}" }, { "title": "Robust Small-Scale Pedestrian Detection With Cued Recall via Memory Learning", @@ -34909,6 +37270,7 @@ "status": "Poster", "track": "main", "pid": 2339, + "author_site": "Jung Uk Kim; Sungjune Park; Yong Man Ro", "author": "Jung Uk Kim; Sungjune Park; Yong Man Ro", "abstract": "Although the visual appearances of small-scale objects are not well observed, humans can recognize them by associating the visual cues of small objects from their memorized appearance. It is called cued recall. In this paper, motivated by the memory process of humans, we introduce a novel pedestrian detection framework that imitates cued recall in detecting small-scale pedestrians. We propose a large-scale embedding learning with the large-scale pedestrian recalling memory (LPR Memory). 
The purpose of the proposed large-scale embedding learning is to memorize and recall the large-scale pedestrian appearance via the LPR Memory. To this end, we employ the large-scale pedestrian exemplar set, so that, the LPR Memory can recall the information of the large-scale pedestrians from the small-scale pedestrians. Comprehensive quantitative and qualitative experimental results validate the effectiveness of the proposed framework with the LPR Memory.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Robust_Small-Scale_Pedestrian_Detection_With_Cued_Recall_via_Memory_Learning_ICCV_2021_paper.pdf", @@ -34932,7 +37294,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Jung Uk and Park,\n Sungjune and Ro,\n Yong Man\n},\n title = {\n Robust Small-Scale Pedestrian Detection With Cued Recall via Memory Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3050-3059\n} \n}" }, { "title": "Robust Trust Region for Weakly Supervised Segmentation", @@ -34940,6 +37303,7 @@ "status": "Poster", "track": "main", "pid": 5813, + "author_site": "Dmitrii Marin; Yuri Boykov", "author": "Dmitrii Marin; Yuri Boykov", "abstract": "Acquisition of training data for the standard semantic segmentation is expensive if requiring that each pixel is labeled. Yet, current methods significantly deteriorate in weakly supervised settings, e.g. where a fraction of pixels is labeled or when only image-level tags are available. It has been shown that regularized losses---originally developed for unsupervised low-level segmentation and representing geometric priors on pixel labels---can considerably improve the quality of weakly supervised training. 
However, many common priors require optimization stronger than gradient descent. Thus, such regularizers have limited applicability in deep learning. We propose a new robust trust region approach for regularized losses improving the state-of-the-art results. Our approach can be seen as a higher-order generalization of the classic chain rule. It allows neural network optimization to use strong low-level solvers for the corresponding regularizers, including discrete ones.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Marin_Robust_Trust_Region_for_Weakly_Supervised_Segmentation_ICCV_2021_paper.pdf", @@ -34963,7 +37327,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Marin_2021_ICCV,\n \n author = {\n Marin,\n Dmitrii and Boykov,\n Yuri\n},\n title = {\n Robust Trust Region for Weakly Supervised Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6608-6618\n} \n}" }, { "title": "Robust Watermarking for Deep Neural Networks via Bi-Level Optimization", @@ -34971,6 +37336,7 @@ "status": "Poster", "track": "main", "pid": 8381, + "author_site": "Peng Yang; Yingjie Lao; Ping Li", "author": "Peng Yang; Yingjie Lao; Ping Li", "abstract": "Deep neural networks (DNNs) have become state-of-the-art in many application domains. The increasing complexity and cost for building these models demand means for protecting their intellectual property (IP). This paper presents a novel DNN framework that optimizes the robustness of the embedded watermarks. Our method is originated from DNN fault attacks. 
Different from prior end-to-end DNN watermarking approaches, we only modify a tiny subset of weights to embed the watermark, which also facilities better control of the model behaviors and enables larger rooms for optimizing the robustness of the watermarks. In this paper, built upon the above concept, we propose a bi-level optimization framework where the inner loop phase optimizes the example-level problem to generate robust exemplars, while the outer loop phase proposes a masked adaptive optimization to achieve the robustness of the projected DNN models. Our method alternates the learning of the protected models and watermark exemplars across all phases, where watermark exemplars are not just data samples that could be optimized and adjusted instead. We verify the performance of the proposed methods over a wide range of datasets and DNN architectures. Various transformation attacks including fine-tuning, pruning and overwriting are used to evaluate the robustness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Robust_Watermarking_for_Deep_Neural_Networks_via_Bi-Level_Optimization_ICCV_2021_paper.pdf", @@ -34985,7 +37351,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Robust_Watermarking_for_Deep_Neural_Networks_via_Bi-Level_Optimization_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Robust_Watermarking_for_Deep_Neural_Networks_via_Bi-Level_Optimization_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Peng and Lao,\n Yingjie and Li,\n Ping\n},\n title = {\n Robust Watermarking for Deep Neural Networks via Bi-Level Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14841-14850\n} \n}" }, { "title": "RobustNav: Towards Benchmarking Robustness in Embodied 
Navigation", @@ -34993,6 +37360,7 @@ "status": "Poster", "track": "main", "pid": 3176, + "author_site": "Prithvijit Chattopadhyay; Judy Hoffman; Roozbeh Mottaghi; Aniruddha Kembhavi", "author": "Prithvijit Chattopadhyay; Judy Hoffman; Roozbeh Mottaghi; Aniruddha Kembhavi", "abstract": "As an attempt towards assessing the robustness of embodied navigation agents, we propose RobustNav, a framework to quantify the performance of embodied navigation agents when exposed to a wide variety of visual-- affecting RGB inputs -- and dynamics -- affecting transition dynamics -- corruptions. Most recent efforts in visual navigation have typically focused on generalizing to novel target environments with similar appearance and dynamics characteristics. With RobustNav, we find that some standard embodied navigation agents significantly underperform (or fail) in the presence of visual or dynamics corruptions. We systematically analyze the kind of idiosyncrasies that emerge in the behavior of such agents when operating under corruptions. Finally, for visual corruptions in RobustNav, we show that while standard techniques to improve robustness such as data-augmentation and self-supervised adaptation offer some zero-shot resistance and improvements in navigation performance, there is still a long way to go in terms of recovering lost performance relative to clean \"non-corrupt\" settings, warranting more research in this direction. 
Our code is available at https://github.com/allenai/robustnav.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chattopadhyay_RobustNav_Towards_Benchmarking_Robustness_in_Embodied_Navigation_ICCV_2021_paper.pdf", @@ -35016,7 +37384,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chattopadhyay_2021_ICCV,\n \n author = {\n Chattopadhyay,\n Prithvijit and Hoffman,\n Judy and Mottaghi,\n Roozbeh and Kembhavi,\n Aniruddha\n},\n title = {\n RobustNav: Towards Benchmarking Robustness in Embodied Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15691-15700\n} \n}" }, { "title": "Robustness Certification for Point Cloud Models", @@ -35024,7 +37393,8 @@ "status": "Poster", "track": "main", "pid": 8042, - "author": "Tobias Lorenz; Anian Ruoss; Mislav Balunovi\u0107; Gagandeep Singh; Martin Vechev", + "author_site": "Tobias Lorenz; Anian Ruoss; Mislav Balunović; Gagandeep Singh; Martin Vechev", + "author": "Tobias Lorenz; Anian Ruoss; Mislav Balunović; Gagandeep Singh; Martin Vechev", "abstract": "The use of deep 3D point cloud models in safety-critical applications, such as autonomous driving, dictates the need to certify the robustness of these models to real-world transformations. This is technically challenging, as it requires a scalable verifier tailored to point cloud models that handles a wide range of semantic 3D transformations. In this work, we address this challenge and introduce 3DCertify, the first verifier able to certify the robustness of point cloud models. 
3DCertify is based on two key insights: (i) a generic relaxation based on first-order Taylor approximations, applicable to any differentiable transformation, and (ii) a precise relaxation for global feature pooling, which is more complex than pointwise activations (e.g., ReLU or sigmoid) but commonly employed in point cloud models. We demonstrate the effectiveness of 3DCertify by performing an extensive evaluation on a wide range of 3D transformations (e.g., rotation, twisting) for both classification and part segmentation tasks. For example, we can certify robustness against rotations by +-60deg for 95.7% of point clouds, and our max pool relaxation increases certification by up to 15.6%.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lorenz_Robustness_Certification_for_Point_Cloud_Models_ICCV_2021_paper.pdf", "aff": "CISPA Helmholtz Center for Information Security; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; University of Illinois at Urbana-Champaign and VMware Research; Department of Computer Science, ETH Zurich", @@ -35040,14 +37410,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lorenz_Robustness_Certification_for_Point_Cloud_Models_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;2;1", - "aff_unique_norm": "CISPA Helmholtz Center for Information Security;ETH Zurich;University of Illinois Urbana-Champaign", + "aff_unique_norm": "CISPA Helmholtz Center for Information Security;ETH Zurich;University of Illinois at Urbana-Champaign", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www.cispa.de/;https://www.ethz.ch;https://illinois.edu", "aff_unique_abbr": "CISPA;ETHZ;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;1;2;1", - "aff_country_unique": "Germany;Switzerland;United States" + "aff_country_unique": "Germany;Switzerland;United States", + "bibtex": 
"@InProceedings{Lorenz_2021_ICCV,\n \n author = {\n Lorenz,\n Tobias and Ruoss,\n Anian and Balunovi\\'c,\n Mislav and Singh,\n Gagandeep and Vechev,\n Martin\n},\n title = {\n Robustness Certification for Point Cloud Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7608-7618\n} \n}" }, { "title": "Robustness and Generalization via Generative Adversarial Training", @@ -35055,6 +37426,7 @@ "status": "Poster", "track": "main", "pid": 2380, + "author_site": "Omid Poursaeed; Tianxing Jiang; Harry Yang; Serge Belongie; Ser-Nam Lim", "author": "Omid Poursaeed; Tianxing Jiang; Harry Yang; Serge Belongie; Ser-Nam Lim", "abstract": "While deep neural networks have achieved remarkable success in various computer vision tasks, they often fail to generalize to subtle variations of input images. Several defenses have been proposed to improve the robustness against these variations. However, current defenses can only withstand the specific attack used in training, and the models often remain vulnerable to other input variations. Moreover, these methods often degrade performance of the model on clean images. In this paper, we present Generative Adversarial Training, an approach to simultaneously improve the model's generalization and robustness to unseen adversarial attacks. Instead of altering a single pre-defined aspect of images, we generate a spectrum of low-level, mid-level and high-level changes using generative models with a disentangled latent space. Adversarial training with these examples enable the model to withstand a wide range of attacks by observing a variety of input alterations during training. We show that our approach not only improves performance of the model on clean images but also makes it robust against unforeseen attacks and outperforms prior work. 
We validate effectiveness of our method by demonstrating results on various tasks such as classification, semantic segmentation and object detection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Poursaeed_Robustness_and_Generalization_via_Generative_Adversarial_Training_ICCV_2021_paper.pdf", @@ -35071,14 +37443,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Poursaeed_Robustness_and_Generalization_via_Generative_Adversarial_Training_ICCV_2021_paper.html", "aff_unique_index": "0+0;0+0;1;0+0;1", - "aff_unique_norm": "Cornell University;Meta", + "aff_unique_norm": "Cornell University;Facebook", "aff_unique_dep": ";Facebook AI", "aff_unique_url": "https://www.cornell.edu;https://www.facebook.com", "aff_unique_abbr": "Cornell;Facebook AI", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";New York City", "aff_country_unique_index": "0+0;0+0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Poursaeed_2021_ICCV,\n \n author = {\n Poursaeed,\n Omid and Jiang,\n Tianxing and Yang,\n Harry and Belongie,\n Serge and Lim,\n Ser-Nam\n},\n title = {\n Robustness and Generalization via Generative Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15711-15720\n} \n}" }, { "title": "Robustness via Cross-Domain Ensembles", @@ -35086,7 +37459,8 @@ "status": "Poster", "track": "main", "pid": 3503, - "author": "Teresa Yeo; O\u011fuzhan Fatih Kar; Amir Zamir", + "author_site": "Teresa Yeo; Oğuzhan Fatih Kar; Amir Zamir", + "author": "Teresa Yeo; Oğuzhan Fatih Kar; Amir Zamir", "abstract": "We present a method for making neural network predictions robust to shifts from the training data distribution. 
The proposed method is based on making predictions via a diverse set of cues (called `middle domains') and ensembling them into one strong prediction. The premise of the idea is that predictions made via different cues respond differently to a distribution shift, hence one should be able to merge them into one robust final prediction. We perform the merging in a straightforward but principled manner based on the uncertainty associated with each prediction. The evaluations are performed using multiple tasks and datasets (Taskonomy, Replica, ImageNet, CIFAR) under a wide range of adversarial and non-adversarial distribution shifts which demonstrate the proposed method is considerably more robust than its standard learning counterpart, conventional deep ensembles, and several other baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yeo_Robustness_via_Cross-Domain_Ensembles_ICCV_2021_paper.pdf", "aff": ";;", @@ -35100,7 +37474,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yeo_Robustness_via_Cross-Domain_Ensembles_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yeo_Robustness_via_Cross-Domain_Ensembles_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yeo_2021_ICCV,\n \n author = {\n Yeo,\n Teresa and Kar,\n O\\u{g\n}uzhan Fatih and Zamir,\n Amir\n},\n title = {\n Robustness via Cross-Domain Ensembles\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12189-12199\n} \n}" }, { "title": "Rotation Averaging in a Split Second: A Primal-Dual Method and a Closed-Form for Cycle Graphs", @@ -35108,7 +37483,8 @@ "status": "Poster", "track": "main", "pid": 9108, - "author": "Gabriel Moreira; Manuel Marques; Jo\u00e3o Paulo Costeira", + "author_site": "Gabriel Moreira; Manuel Marques; João Paulo Costeira", + "author": "Gabriel Moreira; 
Manuel Marques; João Paulo Costeira", "abstract": "A cornerstone of geometric reconstruction, rotation averaging seeks the set of absolute rotations that optimally explains a set of measured relative orientations between them. In spite of being an integral part of bundle adjustment and structure-from-motion, averaging rotations is both a nonconvex and high-dimensional optimization problem. In this paper, we address it from a maximum likelihood estimation standpoint and make a twofold contribution. Firstly, we set forth a novel initialization-free primal-dual method which we show empirically to converge to the global optimum. Further, we derive what is to our knowledge, the first optimal closed-form solution for rotation averaging in cycle graphs and contextualize this result within spectral graph theory. Our proposed methods achieve a significant gain both in precision and performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Moreira_Rotation_Averaging_in_a_Split_Second_A_Primal-Dual_Method_and_ICCV_2021_paper.pdf", "aff": ";;", @@ -35122,7 +37498,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Moreira_Rotation_Averaging_in_a_Split_Second_A_Primal-Dual_Method_and_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Moreira_Rotation_Averaging_in_a_Split_Second_A_Primal-Dual_Method_and_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Moreira_2021_ICCV,\n \n author = {\n Moreira,\n Gabriel and Marques,\n Manuel and Costeira,\n Jo\\~ao Paulo\n},\n title = {\n Rotation Averaging in a Split Second: A Primal-Dual Method and a Closed-Form for Cycle Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5452-5460\n} \n}" }, { "title": "S3VAADA: Submodular Subset Selection for Virtual Adversarial Active Domain Adaptation", @@ -35130,6 
+37507,7 @@ "status": "Poster", "track": "main", "pid": 10567, + "author_site": "Harsh Rangwani; Arihant Jain; Sumukh K Aithal; R. Venkatesh Babu", "author": "Harsh Rangwani; Arihant Jain; Sumukh K Aithal; R. Venkatesh Babu", "abstract": "Unsupervised domain adaptation (DA) methods have focused on achieving maximal performance through aligning features from source and target domains without using labeled data in the target domain. Whereas, in the real-world scenario's it might be feasible to get labels for a small proportion of target data. In these scenarios, it is important to select maximally-informative samples to label and find an effective way to combine them with the existing knowledge from source data. Towards achieving this, we propose S^3VAADA which i) introduces a novel submodular criterion to select a maximally informative subset to label and ii) enhances a cluster-based DA procedure through novel improvements to effectively utilize all the available data for improving generalization on target. Our approach consistently outperforms the competing state-of-the-art approaches on datasets with varying degrees of domain shifts. 
The project page with additional details is available here: https://sites.google.com/iisc.ac.in/s3vaada-iccv2021.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rangwani_S3VAADA_Submodular_Subset_Selection_for_Virtual_Adversarial_Active_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -35144,7 +37522,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rangwani_S3VAADA_Submodular_Subset_Selection_for_Virtual_Adversarial_Active_Domain_Adaptation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rangwani_S3VAADA_Submodular_Subset_Selection_for_Virtual_Adversarial_Active_Domain_Adaptation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rangwani_2021_ICCV,\n \n author = {\n Rangwani,\n Harsh and Jain,\n Arihant and Aithal,\n Sumukh K and Babu,\n R. Venkatesh\n},\n title = {\n S3VAADA: Submodular Subset Selection for Virtual Adversarial Active Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7516-7525\n} \n}" }, { "title": "SA-ConvONet: Sign-Agnostic Optimization of Convolutional Occupancy Networks", @@ -35152,6 +37531,7 @@ "status": "Poster", "track": "main", "pid": 5406, + "author_site": "Jiapeng Tang; Jiabao Lei; Dan Xu; Feiying Ma; Kui Jia; Lei Zhang", "author": "Jiapeng Tang; Jiabao Lei; Dan Xu; Feiying Ma; Kui Jia; Lei Zhang", "abstract": "Surface reconstruction from point clouds is a fundamental problem in the computer vision and graphics community. Recent state-of-the-arts solve this problem by individually optimizing each local implicit field during inference. 
Without considering the geometric relationships between local fields, they typically require accurate normals to avoid the sign conflict problem in overlapped regions of local fields, which severely limits their applicability to raw scans where surface normals could be unavailable. Although SAL breaks this limitation via sign-agnostic learning, further works still need to explore how to extend this technique for local shape modeling. To this end, we propose to learn implicit surface reconstruction by sign-agnostic optimization of convolutional occupancy networks, to simultaneously achieve advanced scalability to large-scale scenes, generality to novel shapes, and applicability to raw scans in a unified framework. Concretely, we achieve this goal by a simple yet effective design, which further optimizes the pre-trained occupancy prediction networks with an unsigned cross-entropy loss during inference. The learning of occupancy fields is conditioned on convolutional features from an hourglass network architecture. Extensive experimental comparisons with previous state-of-the-arts on both object-level and scene-level datasets demonstrate the superior accuracy of our approach for surface reconstruction from un-orientated point clouds. 
The code is available at https://github.com/tangjiapeng/SA-ConvONet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tang_SA-ConvONet_Sign-Agnostic_Optimization_of_Convolutional_Occupancy_Networks_ICCV_2021_paper.pdf", @@ -35168,14 +37548,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tang_SA-ConvONet_Sign-Agnostic_Optimization_of_Convolutional_Occupancy_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0+3+4;5+2", - "aff_unique_norm": "South China University of Technology;Hong Kong University of Science and Technology;Alibaba Group;Pazhou Lab;Pengcheng Laboratory;Hong Kong Polytechnic University", - "aff_unique_dep": "School of Electronic and Information Engineering;Department of Computer Science and Engineering;DAMO Academy;;Peng Cheng Laboratory;Department of Computing", + "aff_unique_norm": "South China University of Technology;Hong Kong University of Science and Technology;Alibaba Group;Pazhou Lab;Peng Cheng Laboratory;The Hong Kong Polytechnic University", + "aff_unique_dep": "School of Electronic and Information Engineering;Department of Computer Science and Engineering;DAMO Academy;;;Department of Computing", "aff_unique_url": "https://www.scut.edu.cn;https://www.hkust.edu.hk;https://www.alibaba-group.com;;;https://www.polyu.edu.hk", "aff_unique_abbr": "SCUT;HKUST;Alibaba;;;PolyU", "aff_campus_unique_index": "1;2+3;1", "aff_campus_unique": ";Hong Kong SAR;Guangzhou;Shenzhen", "aff_country_unique_index": "0;0;0;0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2021_ICCV,\n \n author = {\n Tang,\n Jiapeng and Lei,\n Jiabao and Xu,\n Dan and Ma,\n Feiying and Jia,\n Kui and Zhang,\n Lei\n},\n title = {\n SA-ConvONet: Sign-Agnostic Optimization of Convolutional Occupancy Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = 
{\n 6504-6513\n} \n}" }, { "title": "SACoD: Sensor Algorithm Co-Design Towards Efficient CNN-Powered Intelligent PhlatCam", @@ -35183,6 +37564,7 @@ "status": "Poster", "track": "main", "pid": 2881, + "author_site": "Yonggan Fu; Yang Zhang; Yue Wang; Zhihan Lu; Vivek Boominathan; Ashok Veeraraghavan; Yingyan Lin", "author": "Yonggan Fu; Yang Zhang; Yue Wang; Zhihan Lu; Vivek Boominathan; Ashok Veeraraghavan; Yingyan Lin", "abstract": "There has been a booming demand for integrating Convolutional Neural Networks (CNNs) powered functionalities into Internet-of-Thing (IoT) devices to enable ubiquitous intelligent \"IoT cameras\". However, more extensive applications of such IoT systems are still limited by two challenges. First, some applications, especially medicine- and wearable-related ones, impose stringent requirements on the camera form factor. Second, powerful CNNs often require considerable storage and energy cost, whereas IoT devices often suffer from limited resources. PhlatCam, with its form factor potentially reduced by orders of magnitude, has emerged as a promising solution to the first aforementioned challenge, while the second one remains a bottleneck. Existing compression techniques, which can potentially tackle the second challenge, are far from realizing the full potential in storage and energy reduction, because they mostly focus on the CNN algorithm itself. To this end, this work proposes SACoD, a Sensor Algorithm Co-Design framework to develop more efficient CNN-powered PhlatCam. In particular, the mask coded in the PhlatCam sensor and the backend CNN model are jointly optimized in terms of both model parameters and architectures via differential neural architecture search. 
Extensive experiments including both simulation and physical measurement on manufactured masks show that the proposed SACoD framework achieves aggressive model compression and energy savings while maintaining or even boosting the task accuracy, when benchmarking over two state-of-the-art (SOTA) designs with six datasets across four different vision tasks including classification, segmentation, image translation, and face recognition. Our codes are available at: https://github.com/RICE-EIC/SACoD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fu_SACoD_Sensor_Algorithm_Co-Design_Towards_Efficient_CNN-Powered_Intelligent_PhlatCam_ICCV_2021_paper.pdf", @@ -35206,7 +37588,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fu_2021_ICCV,\n \n author = {\n Fu,\n Yonggan and Zhang,\n Yang and Wang,\n Yue and Lu,\n Zhihan and Boominathan,\n Vivek and Veeraraghavan,\n Ashok and Lin,\n Yingyan\n},\n title = {\n SACoD: Sensor Algorithm Co-Design Towards Efficient CNN-Powered Intelligent PhlatCam\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5168-5177\n} \n}" }, { "title": "SAT: 2D Semantics Assisted Training for 3D Visual Grounding", @@ -35214,6 +37597,7 @@ "status": "Poster", "track": "main", "pid": 7105, + "author_site": "Zhengyuan Yang; Songyang Zhang; Liwei Wang; Jiebo Luo", "author": "Zhengyuan Yang; Songyang Zhang; Liwei Wang; Jiebo Luo", "abstract": "3D visual grounding aims at grounding a natural language description about a 3D scene, usually represented in the form of 3D point clouds, to the targeted object region. Point clouds are sparse, noisy, and contain limited semantic information compared with 2D images. 
These inherent limitations make the 3D visual grounding problem more challenging. In this study, we propose 2D Semantics Assisted Training (SAT) that utilizes 2D image semantics in the training stage to ease point-cloud-language joint representation learning and assist 3D visual grounding. The main idea is to learn auxiliary alignments between rich, clean 2D object representations and the corresponding objects or mentioned entities in 3D scenes. SAT takes 2D object semantics, i.e., object label, image feature, and 2D geometric feature, as the extra input in training but does not require such inputs during inference. By effectively utilizing 2D semantics in training, our approach boosts the accuracy on the Nr3D dataset from 37.7% to 49.2%, which significantly surpasses the non-SAT baseline with the identical network architecture and inference input. Our approach outperforms the state of the art by large margins on multiple 3D visual grounding datasets, i.e., +10.4% absolute accuracy on Nr3D, +9.9% on Sr3D, and +5.6% on ScanRef.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_SAT_2D_Semantics_Assisted_Training_for_3D_Visual_Grounding_ICCV_2021_paper.pdf", @@ -35230,14 +37614,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_SAT_2D_Semantics_Assisted_Training_for_3D_Visual_Grounding_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "University of Rochester;Chinese University of Hong Kong", + "aff_unique_norm": "University of Rochester;The Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.rochester.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "U of R;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Zhengyuan and 
Zhang,\n Songyang and Wang,\n Liwei and Luo,\n Jiebo\n},\n title = {\n SAT: 2D Semantics Assisted Training for 3D Visual Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1856-1866\n} \n}" }, { "title": "SCOUTER: Slot Attention-Based Classifier for Explainable Image Recognition", @@ -35245,6 +37630,7 @@ "status": "Poster", "track": "main", "pid": 5459, + "author_site": "Liangzhi Li; Bowen Wang; Manisha Verma; Yuta Nakashima; Ryo Kawasaki; Hajime Nagahara", "author": "Liangzhi Li; Bowen Wang; Manisha Verma; Yuta Nakashima; Ryo Kawasaki; Hajime Nagahara", "abstract": "Explainable artificial intelligence has been gaining attention in the past few years. However, most existing methods are based on gradients or intermediate features, which are not directly involved in the decision-making process of the classifier. In this paper, we propose a slot attention-based classifier called SCOUTER for transparent yet accurate classification. Two major differences from other attention-based methods include: (a) SCOUTER's explanation is involved in the final confidence for each category, offering more intuitive interpretation, and (b) all the categories have their corresponding positive or negative explanation, which tells \"why the image is of a certain category\" or \"why the image is not of a certain category.\" We design a new loss tailored for SCOUTER that controls the model's behavior to switch between positive and negative explanations, as well as the size of explanatory regions. 
Experimental results show that SCOUTER can give better visual explanations in terms of various metrics while keeping good accuracy on small and medium-sized datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_SCOUTER_Slot_Attention-Based_Classifier_for_Explainable_Image_Recognition_ICCV_2021_paper.pdf", @@ -35268,7 +37654,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Liangzhi and Wang,\n Bowen and Verma,\n Manisha and Nakashima,\n Yuta and Kawasaki,\n Ryo and Nagahara,\n Hajime\n},\n title = {\n SCOUTER: Slot Attention-Based Classifier for Explainable Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1046-1055\n} \n}" }, { "title": "SENTRY: Selective Entropy Optimization via Committee Consistency for Unsupervised Domain Adaptation", @@ -35276,6 +37663,7 @@ "status": "Poster", "track": "main", "pid": 1713, + "author_site": "Viraj Prabhu; Shivam Khare; Deeksha Kartik; Judy Hoffman", "author": "Viraj Prabhu; Shivam Khare; Deeksha Kartik; Judy Hoffman", "abstract": "Many existing approaches for unsupervised domain adaptation (UDA) focus on adapting under only data distribution shift and offer limited success under additional cross-domain label distribution shift. Recent work based on self-training using target pseudolabels has shown promise, but on challenging shifts pseudolabels may be highly unreliable and using them for self-training may lead to error accumulation and domain misalignment. We propose Selective Entropy Optimization via Committee Consistency (SENTRY), a UDA algorithm that judges the reliability of a target instance based on its predictive consistency under a committee of random image transformations. 
Our algorithm then selectively minimizes predictive entropy to increase confidence on highly consistent target instances, while maximizing predictive entropy to reduce confidence on highly inconsistent ones. In combination with pseudolabel-based approximate target class balancing, our approach leads to significant improvements over the state-of-the-art on 27/31 domain shifts from standard UDA benchmarks as well as benchmarks designed to stress-test adaptation under label distribution shift.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Prabhu_SENTRY_Selective_Entropy_Optimization_via_Committee_Consistency_for_Unsupervised_Domain_ICCV_2021_paper.pdf", @@ -35299,7 +37687,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Prabhu_2021_ICCV,\n \n author = {\n Prabhu,\n Viraj and Khare,\n Shivam and Kartik,\n Deeksha and Hoffman,\n Judy\n},\n title = {\n SENTRY: Selective Entropy Optimization via Committee Consistency for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8558-8567\n} \n}" }, { "title": "SGMNet: Learning Rotation-Invariant Point Cloud Representations via Sorted Gram Matrix", @@ -35307,6 +37696,7 @@ "status": "Poster", "track": "main", "pid": 7415, + "author_site": "Jianyun Xu; Xin Tang; Yushi Zhu; Jie Sun; Shiliang Pu", "author": "Jianyun Xu; Xin Tang; Yushi Zhu; Jie Sun; Shiliang Pu", "abstract": "Recently, various works that attempted to introduce rotation invariance to point cloud analysis have devised point-pair features, such as angles and distances. 
In these methods, however, the point-pair is only comprised of the center point and its adjacent points in a vicinity, which may bring information loss to the local feature representation. In this paper, we instead connect each point densely with all other points in a local neighborhood to compose the point-pairs. Specifically, we present a simple but effective local feature representation, called sorted Gram matrix(SGM), which is not only invariant to arbitrary rotations, but also models the pair-wise relationship of all the points in a neighborhood. In more detail, we utilize vector inner product to model distance- and angle-information between two points, and in a local patch it naturally forms a Gram matrix. In order to guarantee permutation invariance, we sort the correlation value in Gram matrix for each point, therefore this geometric feature names sorted Gram matrix. Furthermore, we mathematically prove that the Gram matrix is rotation-invariant and sufficient to model the inherent structure of a point cloud patch. We then use SGM as features in convolution, which can be readily integrated as a drop-in module into any point-based networks. 
Finally, we evaluated the proposed method on two widely used datasets, and it outperforms previous state-of-the-arts on both shape classification and part segmentation tasks by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_SGMNet_Learning_Rotation-Invariant_Point_Cloud_Representations_via_Sorted_Gram_Matrix_ICCV_2021_paper.pdf", @@ -35330,7 +37720,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Jianyun and Tang,\n Xin and Zhu,\n Yushi and Sun,\n Jie and Pu,\n Shiliang\n},\n title = {\n SGMNet: Learning Rotation-Invariant Point Cloud Representations via Sorted Gram Matrix\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10468-10477\n} \n}" }, { "title": "SGPA: Structure-Guided Prior Adaptation for Category-Level 6D Object Pose Estimation", @@ -35338,6 +37729,7 @@ "status": "Poster", "track": "main", "pid": 4065, + "author_site": "Kai Chen; Qi Dou", "author": "Kai Chen; Qi Dou", "abstract": "Category-level 6D object pose estimation aims to predict the position and orientation for unseen objects, which plays a pillar role in many scenarios such as robotics and augmented reality. The significant intra-class variation is the bottleneck challenge in this task yet remains unsolved so far. In this paper, we take advantage of category prior to overcome this problem by innovating a structure-guided prior adaptation scheme to accurately estimate 6D pose for individual objects. Different from existing prior-based methods, given one object and its corresponding category prior, we propose to leverage their structure similarity to dynamically adapt the prior to the observed object. 
The prior adaptation intrinsically associates the adopted prior with different objects, from which we can accurately reconstruct the 3D canonical model of the specific object for pose estimation. To further enhance the structure characteristic of objects, we extract low-rank structure points from the dense object point cloud, therefore more efficiently incorporating sparse structural information during prior adaptation. Extensive experiments on CAMERA25 and REAL275 benchmarks demonstrate significant performance improvement. Project homepage: https://www.cse.cuhk.edu.hk/ kaichen/projects/sgpa/sgpa.html.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_SGPA_Structure-Guided_Prior_Adaptation_for_Category-Level_6D_Object_Pose_Estimation_ICCV_2021_paper.pdf", @@ -35354,14 +37746,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_SGPA_Structure-Guided_Prior_Adaptation_for_Category-Level_6D_Object_Pose_Estimation_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Kai and Dou,\n Qi\n},\n title = {\n SGPA: Structure-Guided Prior Adaptation for Category-Level 6D Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2773-2782\n} \n}" }, { "title": "SIGN: Spatial-Information Incorporated Generative Network for Generalized Zero-Shot Semantic Segmentation", @@ -35369,6 +37762,7 @@ "status": "Poster", 
"track": "main", "pid": 9899, + "author_site": "Jiaxin Cheng; Soumyaroop Nandi; Prem Natarajan; Wael Abd-Almageed", "author": "Jiaxin Cheng; Soumyaroop Nandi; Prem Natarajan; Wael Abd-Almageed", "abstract": "Unlike conventional zero-shot classification, zero-shot semantic segmentation predicts a class label at the pixel level instead of the image level. When solving zero-shot semantic segmentation problems, the need for pixel-level prediction with surrounding context motivates us to incorporate spatial information using positional encoding. We improve standard positional encoding by introducing the concept of Relative Positional Encoding, which integrates spatial information at the feature level and can handle arbitrary image sizes. Furthermore, while self-training is widely used in zero-shot semantic segmentation to generate pseudo-labels, we propose a new knowledge-distillation-inspired self-training strategy, namely Annealed Self-Training, which can automatically assign different importance to pseudo-labels to improve performance. 
We systematically study the proposed Relative Positional Encoding and Annealed Self-Training in a comprehensive experimental evaluation, and our empirical results confirm the effectiveness of our method on three benchmark datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheng_SIGN_Spatial-Information_Incorporated_Generative_Network_for_Generalized_Zero-Shot_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -35392,7 +37786,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Marina del Rey", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cheng_2021_ICCV,\n \n author = {\n Cheng,\n Jiaxin and Nandi,\n Soumyaroop and Natarajan,\n Prem and Abd-Almageed,\n Wael\n},\n title = {\n SIGN: Spatial-Information Incorporated Generative Network for Generalized Zero-Shot Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9556-9566\n} \n}" }, { "title": "SIGNET: Efficient Neural Representation for Light Fields", @@ -35400,6 +37795,7 @@ "status": "Poster", "track": "main", "pid": 9008, + "author_site": "Brandon Yushan Feng; Amitabh Varshney", "author": "Brandon Yushan Feng; Amitabh Varshney", "abstract": "We present a novel neural representation for light field content that enables compact storage and easy local reconstruction with high fidelity. We use a fully-connected neural network to learn the mapping function between each light field pixel's coordinates and its corresponding color values. However, neural networks that simply take in raw coordinates are unable to accurately learn data containing fine details. We present an input transformation strategy based on the Gegenbauer polynomials which previously showed theoretical advantages over the Fourier basis. 
We conduct experiments that show our Gegenbauer-based design combined with sinusoidal activation functions leads to a better light field reconstruction quality than a variety of network designs, including those with Fourier-inspired techniques introduced by prior works. Moreover, our SInusoidal Gegenbauer NETwork, or SIGNET, can represent light field scenes more compactly than the state-of-the-art compression methods while maintaining a comparable reconstruction quality. SIGNET also innately allows random access to encoded light field pixels due to its functional design. Furthermore, we demonstrate that SIGNET facilitates super-resolution along the spatial, angular, and temporal dimensions of a light field without any additional training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_SIGNET_Efficient_Neural_Representation_for_Light_Fields_ICCV_2021_paper.pdf", @@ -35423,7 +37819,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Brandon Yushan and Varshney,\n Amitabh\n},\n title = {\n SIGNET: Efficient Neural Representation for Light Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14224-14233\n} \n}" }, { "title": "SIMstack: A Generative Shape and Instance Model for Unordered Object Stacks", @@ -35431,6 +37828,7 @@ "status": "Poster", "track": "main", "pid": 9331, + "author_site": "Zoe Landgraf; Raluca Scona; Tristan Laidlow; Stephen James; Stefan Leutenegger; Andrew J. Davison", "author": "Zoe Landgraf; Raluca Scona; Tristan Laidlow; Stephen James; Stefan Leutenegger; Andrew J. 
Davison", "abstract": "By estimating 3D shape and instances from a single view, we can capture information about the environment quickly, without the need for comprehensive scanning and multi-view fusion. Solving this task for composite scenes (such as object stacks) is challenging: occluded areas are not only ambiguous in shape but also in instance segmentation; multiple decompositions could be valid. We observe that physics constrains decomposition as well as shape in occluded regions and hypothesise that a latent space learned from scenes built under physics simulation can serve as a prior to better predict shape and instances in occluded regions. To this end we propose SIMstack, a depth-conditioned Variational Auto-Encoder (VAE), trained on a dataset of objects stacked under physics simulation. We formulate instance segmentation as a center voting task which allows for class-agnostic detection and doesn't require setting the maximum number of objects in the scene. At test time, our model can generate 3D shape and instance segmentation from a single depth view, probabilistically sampling proposals for the occluded region from the learned latent space. We argue that this method has practical applications in providing robots some of the ability humans have to make rapid intuitive inferences of partially observed scenes. 
We demonstrate an application for precise (non-disruptive) object grasping of unknown objects from a single depth view.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Landgraf_SIMstack_A_Generative_Shape_and_Instance_Model_for_Unordered_Object_ICCV_2021_paper.pdf", @@ -35454,7 +37852,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Landgraf_2021_ICCV,\n \n author = {\n Landgraf,\n Zoe and Scona,\n Raluca and Laidlow,\n Tristan and James,\n Stephen and Leutenegger,\n Stefan and Davison,\n Andrew J.\n},\n title = {\n SIMstack: A Generative Shape and Instance Model for Unordered Object Stacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13012-13022\n} \n}" }, { "title": "SLAMP: Stochastic Latent Appearance and Motion Prediction", @@ -35462,10 +37861,11 @@ "status": "Poster", "track": "main", "pid": 9184, - "author": "Adil Kaan Akan; Erkut Erdem; Aykut Erdem; Fatma G\u00fcney", + "author_site": "Adil Kaan Akan; Erkut Erdem; Aykut Erdem; Fatma Güney", + "author": "Adil Kaan Akan; Erkut Erdem; Aykut Erdem; Fatma Güney", "abstract": "Motion is an important cue for video prediction and often utilized by separating video content into static and dynamic components. Most of the previous work utilizing motion is deterministic but there are stochastic methods that can model the inherent uncertainty of the future. Existing stochastic models either do not reason about motion explicitly or make limiting assumptions about the static part. In this paper, we reason about appearance and motion in the video stochastically by predicting the future based on the motion history. 
Explicit reasoning about motion without history already reaches the performance of current stochastic models. The motion history further improves the results by allowing to predict consistent dynamics several frames into the future. Our model performs comparably to the state-of-the-art models on the generic video prediction datasets, however, significantly outperforms them on two challenging real-world autonomous driving datasets with complex motion and dynamic background.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Akan_SLAMP_Stochastic_Latent_Appearance_and_Motion_Prediction_ICCV_2021_paper.pdf", - "aff": "Koc\u00b8 University Is Bank AI Center, Istanbul, Turkey; Hacettepe University Computer Vision Lab, Ankara, Turkey; Koc\u00b8 University Is Bank AI Center, Istanbul, Turkey; Koc\u00b8 University Is Bank AI Center, Istanbul, Turkey", + "aff": "Koç University Is Bank AI Center, Istanbul, Turkey; Hacettepe University Computer Vision Lab, Ankara, Turkey; Koç University Is Bank AI Center, Istanbul, Turkey; Koç University Is Bank AI Center, Istanbul, Turkey", "project": "", "github": "https://kuis-ai.github.io/slamp", "supp": "", @@ -35485,7 +37885,8 @@ "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Istanbul;Ankara", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "T\u00fcrkiye" + "aff_country_unique": "Turkey", + "bibtex": "@InProceedings{Akan_2021_ICCV,\n \n author = {\n Akan,\n Adil Kaan and Erdem,\n Erkut and Erdem,\n Aykut and G\\\"uney,\n Fatma\n},\n title = {\n SLAMP: Stochastic Latent Appearance and Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14728-14737\n} \n}" }, { "title": "SLIDE: Single Image 3D Photography With Soft Layering and Depth-Aware Inpainting", @@ -35493,6 +37894,7 @@ "status": "Poster", "track": "main", "pid": 1636, + "author_site": "Varun Jampani; Huiwen
Chang; Kyle Sargent; Abhishek Kar; Richard Tucker; Michael Krainin; Dominik Kaeser; William T. Freeman; David Salesin; Brian Curless; Ce Liu", "author": "Varun Jampani; Huiwen Chang; Kyle Sargent; Abhishek Kar; Richard Tucker; Michael Krainin; Dominik Kaeser; William T. Freeman; David Salesin; Brian Curless; Ce Liu", "abstract": "Single image 3D photography enables viewers to view a still image from novel viewpoints. Recent approaches combine monocular depth networks with inpainting networks to achieve compelling results. A drawback of these techniques is the use of hard depth layering, making them unable to model intricate appearance details such as thin hair-like structures. We present SLIDE, a modular and unified system for single image 3D photography that uses a simple yet effective soft layering strategy to better preserve appearance details in novel views. In addition, we propose a novel depth-aware training strategy for our inpainting module, better suited for the 3D photography task. The resulting SLIDE approach is modular, enabling the use of other components such as segmentation and matting for improved layering. At the same time, SLIDE uses an efficient layered depth formulation that only requires a single forward pass through the component networks to produce high quality 3D photos. Extensive experimental analysis on three view-synthesis datasets, in combination with user studies on in-the-wild image collections, demonstrate superior performance of our technique in comparison to existing strong baselines while being conceptually much simpler. 
Project page: https://varunjampani.github.io/slide", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jampani_SLIDE_Single_Image_3D_Photography_With_Soft_Layering_and_Depth-Aware_ICCV_2021_paper.pdf", @@ -35507,7 +37909,8 @@ "aff_domain": ";;;;;;;;;;", "email": ";;;;;;;;;;", "author_num": 11, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jampani_SLIDE_Single_Image_3D_Photography_With_Soft_Layering_and_Depth-Aware_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jampani_SLIDE_Single_Image_3D_Photography_With_Soft_Layering_and_Depth-Aware_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Jampani_2021_ICCV,\n \n author = {\n Jampani,\n Varun and Chang,\n Huiwen and Sargent,\n Kyle and Kar,\n Abhishek and Tucker,\n Richard and Krainin,\n Michael and Kaeser,\n Dominik and Freeman,\n William T. and Salesin,\n David and Curless,\n Brian and Liu,\n Ce\n},\n title = {\n SLIDE: Single Image 3D Photography With Soft Layering and Depth-Aware Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12518-12527\n} \n}" }, { "title": "SLIM: Self-Supervised LiDAR Scene Flow and Motion Segmentation", @@ -35515,10 +37918,11 @@ "status": "Poster", "track": "main", "pid": 10591, - "author": "Stefan Andreas Baur; David Josef Emmerichs; Frank Moosmann; Peter Pinggera; Bj\u00f6rn Ommer; Andreas Geiger", + "author_site": "Stefan Andreas Baur; David Josef Emmerichs; Frank Moosmann; Peter Pinggera; Björn Ommer; Andreas Geiger", + "author": "Stefan Andreas Baur; David Josef Emmerichs; Frank Moosmann; Peter Pinggera; Björn Ommer; Andreas Geiger", "abstract": "Recently, several frameworks for self-supervised learning of 3D scene flow on point clouds have emerged. Scene flow inherently separates every scene into multiple moving agents and a large class of points following a single rigid sensor motion. 
However, existing methods do not leverage this property of the data in their self-supervised training routines which could improve and stabilize flow predictions. Based on the discrepancy between a robust rigid ego-motion estimate and a raw flow prediction, we generate a self-supervised motion segmentation signal. The predicted motion segmentation, in turn, is used by our algorithm to attend to stationary points for aggregation of motion information in static parts of the scene. We learn our model end-to-end by backpropagating gradients through Kabsch's algorithm and demonstrate that this leads to accurate ego-motion which in turn improves the scene flow estimate. Using our method, we show state-of-the-art results across multiple scene flow metrics for different real-world datasets, showcasing the robustness and generalizability of this approach. We further analyze the performance gain when performing joint motion segmentation and scene flow in an ablation study. We also present a novel network architecture for 3D LiDAR scene flow which is capable of handling an order of magnitude more points during training than previously possible.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Baur_SLIM_Self-Supervised_LiDAR_Scene_Flow_and_Motion_Segmentation_ICCV_2021_paper.pdf", - "aff": "Mercedes-Benz AG, Stuttgart+MPI-IS, T\u00fcbingen+University of T\u00fcbingen; Mercedes-Benz AG, Stuttgart+Heidelberg University; Mercedes-Benz AG, Stuttgart; Mercedes-Benz AG, Stuttgart; Ludwig Maximilian University of Munich+Heidelberg University; MPI-IS, T\u00fcbingen+University of T\u00fcbingen", + "aff": "Mercedes-Benz AG, Stuttgart+MPI-IS, Tübingen+University of Tübingen; Mercedes-Benz AG, Stuttgart+Heidelberg University; Mercedes-Benz AG, Stuttgart; Mercedes-Benz AG, Stuttgart; Ludwig Maximilian University of Munich+Heidelberg University; MPI-IS, Tübingen+University of Tübingen", "project": "", "github": "", "supp": 
"https://openaccess.thecvf.com/content/ICCV2021/supplemental/Baur_SLIM_Self-Supervised_LiDAR_ICCV_2021_supplemental.zip", @@ -35531,14 +37935,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Baur_SLIM_Self-Supervised_LiDAR_Scene_Flow_and_Motion_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;0+3;0;0;4+3;1+2", - "aff_unique_norm": "Mercedes-Benz AG;Max Planck Institute for Intelligent Systems;University of T\u00fcbingen;Heidelberg University;Ludwig Maximilian University of Munich", + "aff_unique_norm": "Mercedes-Benz AG;Max Planck Institute for Intelligent Systems;University of Tübingen;Heidelberg University;Ludwig Maximilian University of Munich", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.mercedes-benz.com;https://www.mpituebingen.mpg.de;https://www.uni-tuebingen.de/;https://www.uni-heidelberg.de;https://www.lmu.de", - "aff_unique_abbr": "MB AG;MPI-IS;Uni T\u00fcbingen;Uni Heidelberg;LMU", + "aff_unique_abbr": "MB AG;MPI-IS;Uni Tübingen;Uni Heidelberg;LMU", "aff_campus_unique_index": "1;;;1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0+0+0;0+0;0;0;0+0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Baur_2021_ICCV,\n \n author = {\n Baur,\n Stefan Andreas and Emmerichs,\n David Josef and Moosmann,\n Frank and Pinggera,\n Peter and Ommer,\n Bj\\"orn and Geiger,\n Andreas\n},\n title = {\n SLIM: Self-Supervised LiDAR Scene Flow and Motion Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13126-13136\n} \n}" }, { "title": "SNARF: Differentiable Forward Skinning for Animating Non-Rigid Neural Implicit Shapes", @@ -35546,10 +37951,11 @@ "status": "Poster", "track": "main", "pid": 3371, + "author_site": "Xu Chen; Yufeng Zheng; Michael J. 
Black; Otmar Hilliges; Andreas Geiger", "author": "Xu Chen; Yufeng Zheng; Michael J. Black; Otmar Hilliges; Andreas Geiger", "abstract": "Neural implicit surface representations have emerged as a promising paradigm to capture 3D shapes in a continuous and resolution-independent manner. However, adapting them to articulated shapes is non-trivial. Existing approaches learn a backward warp field that maps deformed to canonical points. However, this is problematic since the backward warp field is pose dependent and thus requires large amounts of data to learn. To address this, we introduce SNARF, which combines the advantages of linear blend skinning (LBS) for polygonal meshes with those of neural implicit surfaces by learning a forward deformation field without direct supervision. This deformation field is defined in canonical, pose-independent, space, enabling generalization to unseen poses. Learning the deformation field from posed meshes alone is challenging since the correspondences of deformed points are defined implicitly and may not be unique under changes of topology. We propose a forward skinning model that finds all canonical correspondences of any deformed point using iterative root finding. We derive analytical gradients via implicit differentiation, enabling end-to-end training from 3D meshes with bone transformations. Compared to state-of-the-art neural implicit representations, our approach generalizes better to unseen poses while preserving accuracy. 
We demonstrate our method in challenging scenarios on (clothed) 3D humans in diverse and unseen poses.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_SNARF_Differentiable_Forward_Skinning_for_Animating_Non-Rigid_Neural_Implicit_Shapes_ICCV_2021_paper.pdf", - "aff": "ETH Z\u00fcrich, Department of Computer Science+Max Planck Institute for Intelligent Systems, T\u00fcbingen; ETH Z\u00fcrich, Department of Computer Science+Max Planck Institute for Intelligent Systems, T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen; ETH Z\u00fcrich, Department of Computer Science; University of T\u00fcbingen+Max Planck Institute for Intelligent Systems, T\u00fcbingen", + "aff": "ETH Zürich, Department of Computer Science+Max Planck Institute for Intelligent Systems, Tübingen; ETH Zürich, Department of Computer Science+Max Planck Institute for Intelligent Systems, Tübingen; Max Planck Institute for Intelligent Systems, Tübingen; ETH Zürich, Department of Computer Science; University of Tübingen+Max Planck Institute for Intelligent Systems, Tübingen", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Chen_SNARF_Differentiable_Forward_ICCV_2021_supplemental.pdf", @@ -35562,14 +37968,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_SNARF_Differentiable_Forward_Skinning_for_Animating_Non-Rigid_Neural_Implicit_Shapes_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;1;0;2+1", - "aff_unique_norm": "ETH Zurich;Max Planck Institute for Intelligent Systems;University of T\u00fcbingen", + "aff_unique_norm": "ETH Zürich;Max Planck Institute for Intelligent Systems;University of Tübingen", "aff_unique_dep": "Department of Computer Science;;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "ETHZ;MPI-IS;Uni T\u00fcbingen", + "aff_unique_abbr": "ETHZ;MPI-IS;Uni Tübingen", 
"aff_campus_unique_index": "1;1;1;1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0+1;0+1;1;0;1+1", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Xu and Zheng,\n Yufeng and Black,\n Michael J. and Hilliges,\n Otmar and Geiger,\n Andreas\n},\n title = {\n SNARF: Differentiable Forward Skinning for Animating Non-Rigid Neural Implicit Shapes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11594-11604\n} \n}" }, { "title": "SO-Pose: Exploiting Self-Occlusion for Direct 6D Pose Estimation", @@ -35577,6 +37984,7 @@ "status": "Poster", "track": "main", "pid": 5695, + "author_site": "Yan Di; Fabian Manhardt; Gu Wang; Xiangyang Ji; Nassir Navab; Federico Tombari", "author": "Yan Di; Fabian Manhardt; Gu Wang; Xiangyang Ji; Nassir Navab; Federico Tombari", "abstract": "Directly regressing all 6 degrees-of-freedom (6DoF) for the object pose (i.e. the 3D rotation and translation) in a cluttered environment from a single RGB image is a challenging problem. While end-to-end methods have recently demonstrated promising results at high efficiency, they are still inferior when compared with elaborate PnP/RANSAC-based approaches in terms of pose accuracy. In this work, we address this shortcoming by means of a novel reason-ing about self-occlusion, in order to establish a two-layer representation for 3D objects which considerably enhances the accuracy of end-to-end 6D pose estimation. Our frame-work, named SO-Pose, takes a single RGB image as input and respectively generates 2D-3D correspondences as well as self-occlusion information harnessing a shared encoder and two separate decoders. Both outputs are then fused to directly regress the 6DoF pose parameters. 
Incorporating cross-layer consistencies that align correspondences, self-occlusion, and 6D pose, we can further improve accuracy and robustness, surpassing or rivaling all other state-of-the-art approaches on various challenging datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Di_SO-Pose_Exploiting_Self-Occlusion_for_Direct_6D_Pose_Estimation_ICCV_2021_paper.pdf", @@ -35594,13 +38002,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Di_SO-Pose_Exploiting_Self-Occlusion_for_Direct_6D_Pose_Estimation_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2;0;0+1", "aff_unique_norm": "Technical University of Munich;Google;Tsinghua University", - "aff_unique_dep": ";Google;", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;https://www.google.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "TUM;Google;THU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;2;2;0;0+1", - "aff_country_unique": "Germany;United States;China" + "aff_country_unique": "Germany;United States;China", + "bibtex": "@InProceedings{Di_2021_ICCV,\n \n author = {\n Di,\n Yan and Manhardt,\n Fabian and Wang,\n Gu and Ji,\n Xiangyang and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n SO-Pose: Exploiting Self-Occlusion for Direct 6D Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12396-12405\n} \n}" }, { "title": "SOMA: Solving Optical Marker-Based MoCap Automatically", @@ -35608,10 +38017,11 @@ "status": "Poster", "track": "main", "pid": 8024, + "author_site": "Nima Ghorbani; Michael J. Black", "author": "Nima Ghorbani; Michael J. Black", "abstract": "Marker-based optical motion capture (mocap) is the \"gold standard\" method for acquiring accurate 3D human motion in computer vision, medicine, and graphics. 
The raw output of these systems are noisy and incomplete 3D points or short tracklets of points. To be useful, one must associate these points with corresponding markers on the captured subject; i.e. \"labelling\". Given these labels, one can then \"solve\" for the 3D skeleton or body surface mesh. Commercial auto-labeling tools require a specific calibration procedure at capture time, which is not possible for archival data. Here we train a novel neural network called SOMA, which takes raw mocap point clouds with varying numbers of points, labels them at scale without any calibration data, independent of the capture technology, and requiring only minimal human intervention. Our key insight is that, while labeling point clouds is highly ambiguous, the 3D body provides strong constraints on the solution that can be exploited by a learning-based method. To enable learning, we generate massive training sets of simulated noisy and ground truth mocap markers animated by 3D bodies from AMASS. SOMA exploits an architecture with stacked self-attention elements to learn the spatial structure of the 3D body and an optimal transport layer to constrain the assignment (labeling) problem while rejecting outliers. We extensively evaluate SOMA both quantitatively and qualitatively. SOMA is more accurate and robust than existing state of the art research methods and can be applied where commercial systems cannot. We automatically label over 8 hours of archival mocap data across 4 different datasets captured using various technologies and output SMPL-X body models. 
The model and data is released for research purposes at https://soma.is.tue.mpg.de/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ghorbani_SOMA_Solving_Optical_Marker-Based_MoCap_Automatically_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "https://soma.is.tue.mpg.de/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ghorbani_SOMA_Solving_Optical_ICCV_2021_supplemental.pdf", @@ -35629,9 +38039,10 @@ "aff_unique_url": "https://www.mpi-is.mpg.de", "aff_unique_abbr": "MPI-IS", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "T\u00fcbingen", + "aff_campus_unique": "Tübingen", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Ghorbani_2021_ICCV,\n \n author = {\n Ghorbani,\n Nima and Black,\n Michael J.\n},\n title = {\n SOMA: Solving Optical Marker-Based MoCap Automatically\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11117-11126\n} \n}" }, { "title": "SOTR: Segmenting Objects With Transformers", @@ -35639,6 +38050,7 @@ "status": "Poster", "track": "main", "pid": 7410, + "author_site": "Ruohao Guo; Dantong Niu; Liao Qu; Zhenbo Li", "author": "Ruohao Guo; Dantong Niu; Liao Qu; Zhenbo Li", "abstract": "Most recent transformer-based models show impressive performance on vision tasks, even better than Convolution Neural Networks (CNN). In this work, we present a novel, flexible, and effective transformer-based model for high-quality instance segmentation. 
The proposed method, Segmenting Objects with TRansformers (SOTR), simplifies the segmentation pipeline, building on an alternative CNN backbone appended with two parallel subtasks: (1) predicting per-instance category via transformer and (2) dynamically generating segmentation mask with the multi-level upsampling module. SOTR can effectively extract lower-level feature representations and capture long-range context dependencies by Feature Pyramid Network (FPN) and twin transformer, respectively. Meanwhile, compared with the original transformer, the proposed twin transformer is timeand resource-efficient since only a row and a column attention are involved to encode pixels. Moreover, SOTR is easy to be incorporated with various CNN backbones and transformer model variants to make considerable improvements for the segmentation accuracy and training convergence. Extensive experiments show that our SOTR performs well on the MS COCO dataset and surpasses state-of-the-art instance segmentation approaches. We hope our simple but strong framework could serve as a preferment baseline for instance-level recognition. 
Our code is available at https://github.com/easton-cau/SOTR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_SOTR_Segmenting_Objects_With_Transformers_ICCV_2021_paper.pdf", @@ -35662,7 +38074,8 @@ "aff_campus_unique_index": ";1;;", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+0+0;1;0+0+0;0+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Ruohao and Niu,\n Dantong and Qu,\n Liao and Li,\n Zhenbo\n},\n title = {\n SOTR: Segmenting Objects With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7157-7166\n} \n}" }, { "title": "SPEC: Seeing People in the Wild With an Estimated Camera", @@ -35670,10 +38083,11 @@ "status": "Poster", "track": "main", "pid": 1111, - "author": "Muhammed Kocabas; Chun-Hao P. Huang; Joachim Tesch; Lea M\u00fcller; Otmar Hilliges; Michael J. Black", + "author_site": "Muhammed Kocabas; Chun-Hao P. Huang; Joachim Tesch; Lea Müller; Otmar Hilliges; Michael J. Black", + "author": "Muhammed Kocabas; Chun-Hao P. Huang; Joachim Tesch; Lea Müller; Otmar Hilliges; Michael J. Black", "abstract": "Due to the lack of camera parameter information for in-the-wild images, existing 3D human pose and shape (HPS) estimation methods make several simplifying assumptions: weak-perspective projection, large constant focal length, and zero camera rotation. These assumptions often do not hold and we show, quantitatively and qualitatively, that they cause errors in the reconstructed 3D shape and pose. To address this, we introduce SPEC, the first in-the-wild 3D HPS method that estimates the perspective camera from a single image and employs this to reconstruct 3D human bodies more accurately. 
First, we train a neural network to estimate the field of view, camera pitch, and roll given an input image. We employ novel losses that improve the calibration accuracy over previous work. We then train a novel network that concatenates the camera calibration to the image features and uses these together to regress 3D body shape and pose. SPEC is more accurate than the prior art on the standard benchmark (3DPW) as well as two new datasets with more challenging camera views and varying focal lengths. Specifically, we create a new photorealistic synthetic dataset (SPEC-SYN) with ground truth 3D bodies and a novel in-the-wild dataset (SPEC-MTP) with calibration and high-quality reference bodies. Code and datasets are available for research purposes at https://spec.is.tue.mpg.de/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kocabas_SPEC_Seeing_People_in_the_Wild_With_an_Estimated_Camera_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany+ETH Zurich; Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany; Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany; Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany; ETH Zurich; Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany+ETH Zurich; Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany; Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany; Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany; ETH Zurich; Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany", "project": "https://spec.is.tue.mpg.de/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Kocabas_SPEC_Seeing_People_ICCV_2021_supplemental.pdf", @@ -35691,9 +38105,10 @@ "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ethz.ch", "aff_unique_abbr": 
"MPI-IS;ETHZ", "aff_campus_unique_index": "0;0;0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+1;0;0;0;1;0", - "aff_country_unique": "Germany;Switzerland" + "aff_country_unique": "Germany;Switzerland", + "bibtex": "@InProceedings{Kocabas_2021_ICCV,\n \n author = {\n Kocabas,\n Muhammed and Huang,\n Chun-Hao P. and Tesch,\n Joachim and M\\"uller,\n Lea and Hilliges,\n Otmar and Black,\n Michael J.\n},\n title = {\n SPEC: Seeing People in the Wild With an Estimated Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11035-11045\n} \n}" }, { "title": "SPG: Unsupervised Domain Adaptation for 3D Object Detection via Semantic Point Generation", @@ -35701,6 +38116,7 @@ "status": "Poster", "track": "main", "pid": 6081, + "author_site": "Qiangeng Xu; Yin Zhou; Weiyue Wang; Charles R. Qi; Dragomir Anguelov", "author": "Qiangeng Xu; Yin Zhou; Weiyue Wang; Charles R. Qi; Dragomir Anguelov", "abstract": "In autonomous driving, a LiDAR-based object detector should perform reliably at different geographic locations and under various weather conditions. While recent 3D detection research focuses on improving performance within a single domain, our study reveals that the performance of modern detectors can drop drastically cross-domain. In this paper, we investigate unsupervised domain adaptation (UDA) for LiDAR-based 3D object detection. On the Waymo Domain Adaptation dataset, we identify the deteriorating point cloud quality as the root cause of the performance drop. To address this issue, we present Semantic Point Generation (SPG), a general approach to enhance the reliability of LiDAR detectors against domain shifts. 
Specifically, SPG generates semantic points at the predicted foreground regions and faithfully recovers missing parts of the foreground objects, which are caused by phenomena such as occlusions, low reflectance, or weather interference. By merging the semantic points with the original points, we obtain an augmented point cloud, which can be directly consumed by modern LiDAR-based detectors. To validate the wide applicability of SPG, we experiment with two representative detectors, PointPillars and PV-RCNN. On the UDA task, SPG significantly improves both detectors across all object categories of interest and at all difficulty levels. SPG can also benefit object detection in the original domain. On the Waymo Open Dataset and KITTI, SPG improves 3D detection results of these two methods across all categories. Combined with PV-RCNN, SPG achieves state-of-the-art 3D detection results on KITTI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_SPG_Unsupervised_Domain_Adaptation_for_3D_Object_Detection_via_Semantic_ICCV_2021_paper.pdf", @@ -35724,7 +38140,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Qiangeng and Zhou,\n Yin and Wang,\n Weiyue and Qi,\n Charles R. 
and Anguelov,\n Dragomir\n},\n title = {\n SPG: Unsupervised Domain Adaptation for 3D Object Detection via Semantic Point Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15446-15456\n} \n}" }, { "title": "SPatchGAN: A Statistical Feature Based Discriminator for Unsupervised Image-to-Image Translation", @@ -35732,6 +38149,7 @@ "status": "Poster", "track": "main", "pid": 3731, + "author_site": "Xuning Shao; Weidong Zhang", "author": "Xuning Shao; Weidong Zhang", "abstract": "For unsupervised image-to-image translation, we propose a discriminator architecture which focuses on the statistical features instead of individual patches. The network is stabilized by distribution matching of key statistical features at multiple scales. Unlike the existing methods which impose more and more constraints on the generator, our method facilitates the shape deformation and enhances the fine details with a greatly simplified framework. 
We show that the proposed method outperforms the existing state-of-the-art models in various challenging applications including selfie-to-anime, male-to-female and glasses removal.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shao_SPatchGAN_A_Statistical_Feature_Based_Discriminator_for_Unsupervised_Image-to-Image_Translation_ICCV_2021_paper.pdf", @@ -35746,7 +38164,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shao_SPatchGAN_A_Statistical_Feature_Based_Discriminator_for_Unsupervised_Image-to-Image_Translation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shao_SPatchGAN_A_Statistical_Feature_Based_Discriminator_for_Unsupervised_Image-to-Image_Translation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shao_2021_ICCV,\n \n author = {\n Shao,\n Xuning and Zhang,\n Weidong\n},\n title = {\n SPatchGAN: A Statistical Feature Based Discriminator for Unsupervised Image-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6546-6555\n} \n}" }, { "title": "SS-IL: Separated Softmax for Incremental Learning", @@ -35754,6 +38173,7 @@ "status": "Poster", "track": "main", "pid": 7767, + "author_site": "Hongjoon Ahn; Jihwan Kwak; Subin Lim; Hyeonsu Bang; Hyojun Kim; Taesup Moon", "author": "Hongjoon Ahn; Jihwan Kwak; Subin Lim; Hyeonsu Bang; Hyojun Kim; Taesup Moon", "abstract": "We consider class incremental learning (CIL) problem, in which a learning agent continuously learns new classes from incrementally arriving training data batches and aims to predict well on all the classes learned so far. 
The main challenge of the problem is the catastrophic forgetting, and for the exemplar-memory based CIL methods, it is generally known that the forgetting is commonly caused by the classification score bias that is injected due to the data imbalance between the new classes and the old classes (in the exemplar-memory). While several methods have been proposed to correct such score bias by some additional post-processing, e.g., score re-scaling or balanced fine-tuning, no systematic analysis on the root cause of such bias has been done. To that end, we analyze that computing the softmax probabilities by combining the output scores for all old and new classes could be the main cause of the bias. Then, we propose a new CIL method, dubbed as Separated Softmax for Incremental Learning (SS-IL), that consists of separated softmax (SS) output layer combined with task-wise knowledge distillation (TKD) to resolve such bias. Throughout our extensive experimental results on several large-scale CIL benchmark datasets, we show our SS-IL achieves strong state-of-the-art accuracy through attaining much more balanced prediction scores across old and new classes, without any additional post-processing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ahn_SS-IL_Separated_Softmax_for_Incremental_Learning_ICCV_2021_paper.pdf", @@ -35777,7 +38197,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Seoul", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ahn_2021_ICCV,\n \n author = {\n Ahn,\n Hongjoon and Kwak,\n Jihwan and Lim,\n Subin and Bang,\n Hyeonsu and Kim,\n Hyojun and Moon,\n Taesup\n},\n title = {\n SS-IL: Separated Softmax for Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 844-853\n} \n}" }, { "title": "SSH: A 
Self-Supervised Framework for Image Harmonization", @@ -35785,6 +38206,7 @@ "status": "Poster", "track": "main", "pid": 3500, + "author_site": "Yifan Jiang; He Zhang; Jianming Zhang; Yilin Wang; Zhe Lin; Kalyan Sunkavalli; Simon Chen; Sohrab Amirghodsi; Sarah Kong; Zhangyang Wang", "author": "Yifan Jiang; He Zhang; Jianming Zhang; Yilin Wang; Zhe Lin; Kalyan Sunkavalli; Simon Chen; Sohrab Amirghodsi; Sarah Kong; Zhangyang Wang", "abstract": "Image harmonization aims to improve the quality of image compositing by matching the \"appearance\"\" (e.g., color tone, brightness and contrast) between foreground and background images. However, collecting large-scale annotated datasets for this task requires complex professional retouching. Instead, we propose a novel Self-Supervised Harmonization framework (SSH) that can be trained using just \"free\"\" natural images without being edited. We reformulate the image harmonization problem from a representation fusion perspective, which separately processes the foreground and background examples, to address the background occlusion issue. This framework design allows for a dual data augmentation method, where diverse [foreground, background, pseudo GT] triplets can be generated by cropping an image with perturbations using 3D color lookup tables (LUTs). In addition, we build a real-world harmonization dataset as carefully created by expert users, for evaluation and benchmarking purposes. Our results show that the proposed self-supervised method outperforms previous state-of-the-art methods in terms of reference metrics, visual quality, and subject user study. 
Code and dataset will be publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_SSH_A_Self-Supervised_Framework_for_Image_Harmonization_ICCV_2021_paper.pdf", @@ -35801,14 +38223,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jiang_SSH_A_Self-Supervised_Framework_for_Image_Harmonization_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;1;1;1;1;1;0", - "aff_unique_norm": "University of Texas at Austin;Adobe", - "aff_unique_dep": ";Adobe Inc.", + "aff_unique_norm": "University of Texas at Austin;Adobe Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.adobe.com", "aff_unique_abbr": "UT Austin;Adobe", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Yifan and Zhang,\n He and Zhang,\n Jianming and Wang,\n Yilin and Lin,\n Zhe and Sunkavalli,\n Kalyan and Chen,\n Simon and Amirghodsi,\n Sohrab and Kong,\n Sarah and Wang,\n Zhangyang\n},\n title = {\n SSH: A Self-Supervised Framework for Image Harmonization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4832-4841\n} \n}" }, { "title": "STAR: A Structure-Aware Lightweight Transformer for Real-Time Image Enhancement", @@ -35816,6 +38239,7 @@ "status": "Poster", "track": "main", "pid": 3778, + "author_site": "Zhaoyang Zhang; Yitong Jiang; Jun Jiang; Xiaogang Wang; Ping Luo; Jinwei Gu", "author": "Zhaoyang Zhang; Yitong Jiang; Jun Jiang; Xiaogang Wang; Ping Luo; Jinwei Gu", "abstract": "Image and video enhancement such as color constancy, low light enhancement, and tone mapping on smartphones is challenging because high-quality images should be achieved efficiently with a limited 
resource budget. Unlike prior works that either used very deep CNNs or large Transformer models, we propose a \\underline s eman\\underline t ic-\\underline a wa\\underline r e lightweight Transformer, termed STAR, for real-time image enhancement. STAR is formulated to capture long-range dependencies between image patches, which naturally and implicitly captures the semantic relationships of different regions in an image. STAR is a general architecture that can be easily adapted to different image enhancement tasks. Extensive experiments show that STAR can effectively boost the quality and efficiency of many tasks such as illumination enhancement, auto white balance, and photo retouching, which are indispensable components for image processing on smartphones. For example, STAR reduces model complexity and improves image quality compared to the recent state-of-the-art [??] on the MIT-Adobe FiveK dataset [??] (i.e., 1.8dB PSNR improvements with 25% parameters and 13% float operations.)", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_STAR_A_Structure-Aware_Lightweight_Transformer_for_Real-Time_Image_Enhancement_ICCV_2021_paper.pdf", @@ -35832,14 +38256,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_STAR_A_Structure-Aware_Lightweight_Transformer_for_Real-Time_Image_Enhancement_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;1;0;2;1+3", - "aff_unique_norm": "Chinese University of Hong Kong;SenseBrain Research;University of Hong Kong;Shanghai AI Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;SenseBrain Research;The University of Hong Kong;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cuhk.edu.hk;;https://www.hku.hk;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "CUHK;;HKU;SAIL", "aff_campus_unique_index": "0;0;0;", "aff_campus_unique": "Hong Kong SAR;", - "aff_country_unique_index": "0+1;1;1;0;0;1+0", - "aff_country_unique": "China;United 
States" + "aff_country_unique_index": "0;0;0;0", + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Zhaoyang and Jiang,\n Yitong and Jiang,\n Jun and Wang,\n Xiaogang and Luo,\n Ping and Gu,\n Jinwei\n},\n title = {\n STAR: A Structure-Aware Lightweight Transformer for Real-Time Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4106-4115\n} \n}" }, { "title": "STEM: An Approach to Multi-Source Domain Adaptation With Guarantees", @@ -35847,6 +38272,7 @@ "status": "Poster", "track": "main", "pid": 8566, + "author_site": "Van-Anh Nguyen; Tuan Nguyen; Trung Le; Quan Hung Tran; Dinh Phung", "author": "Van-Anh Nguyen; Tuan Nguyen; Trung Le; Quan Hung Tran; Dinh Phung", "abstract": "Multi-source Domain Adaptation (MSDA) is more practical but challenging than the conventional unsupervised domain adaptation due to the involvement of diverse multiple data sources. Two fundamental challenges of MSDA are: (i) how to deal with the diversity in the multiple source domains and (ii) how to cope with the data shift between the target domain and the source domains. In this paper, to address the first challenge, we propose a theoretical-guaranteed approach to combine domain experts locally trained on its own source domain to achieve a combined multi-source teacher that globally predicts well on the mixture of source domains. To address the second challenge, we propose to bridge the gap between the target domain and the mixture of source domains in the latent space via a generator or feature extractor. Together with bridging the gap in the latent space, we train a student to mimic the predictions of the teacher expert on both source and target examples. 
In addition, our approach is guaranteed with rigorous theory offered insightful justifications of how each component influences the transferring performance. Extensive experiments conducted on three benchmark datasets show that our proposed method achieves state-of-the-art performances to the best of our knowledge.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nguyen_STEM_An_Approach_to_Multi-Source_Domain_Adaptation_With_Guarantees_ICCV_2021_paper.pdf", @@ -35863,14 +38289,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Nguyen_STEM_An_Approach_to_Multi-Source_Domain_Adaptation_With_Guarantees_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;2;1+3", - "aff_unique_norm": "VNU - University of Science;Monash University;Adobe;VinAI Research", - "aff_unique_dep": ";Department of Data Science and AI;Adobe Research;", + "aff_unique_norm": "VNU - University of Science;Monash University;Adobe Research;VinAI Research", + "aff_unique_dep": ";Department of Data Science and AI;;", "aff_unique_url": "https://www.unis.vnu.edu.vn;https://www.monash.edu;https://research.adobe.com;https://www.vin.ai", "aff_unique_abbr": "VNU;Monash;Adobe;VinAI", "aff_campus_unique_index": "1;", "aff_campus_unique": ";San Jose", "aff_country_unique_index": "0;1;1;2;1+0", - "aff_country_unique": "Vietnam;Australia;United States" + "aff_country_unique": "Vietnam;Australia;United States", + "bibtex": "@InProceedings{Nguyen_2021_ICCV,\n \n author = {\n Nguyen,\n Van-Anh and Nguyen,\n Tuan and Le,\n Trung and Tran,\n Quan Hung and Phung,\n Dinh\n},\n title = {\n STEM: An Approach to Multi-Source Domain Adaptation With Guarantees\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9352-9363\n} \n}" }, { "title": "STR-GQN: Scene Representation and Rendering for Unknown Cameras Based on Spatial Transformation Routing", @@ -35878,6 +38305,7 @@ 
"status": "Poster", "track": "main", "pid": 7548, + "author_site": "Wen-Cheng Chen; Min-Chun Hu; Chu-Song Chen", "author": "Wen-Cheng Chen; Min-Chun Hu; Chu-Song Chen", "abstract": "Geometry-aware modules are widely applied in recent deep learning architectures for scene representation and rendering. However, these modules require intrinsic camera information that might not be obtained accurately. In this paper, we propose a Spatial Transformation Routing (STR) mechanism to model the spatial properties without applying any geometric prior. The STR mechanism treats the spatial transformation as the message passing process, and the relation between the view poses and the routing weights is modeled by an end-to-end trainable neural network. Besides, an Occupancy Concept Mapping (OCM) framework is proposed to provide explainable rationals for scene-fusion processes. We conducted experiments on several datasets and show that the proposed STR mechanism improves the performance of the Generative Query Network (GQN). 
The visualization results reveal that the routing process can pass the observed information from one location of some view to the associated location in the other view, which demonstrates the advantage of the proposed model in terms of spatial cognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_STR-GQN_Scene_Representation_and_Rendering_for_Unknown_Cameras_Based_on_ICCV_2021_paper.pdf", @@ -35901,7 +38329,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Wen-Cheng and Hu,\n Min-Chun and Chen,\n Chu-Song\n},\n title = {\n STR-GQN: Scene Representation and Rendering for Unknown Cameras Based on Spatial Transformation Routing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5966-5975\n} \n}" }, { "title": "STRIVE: Scene Text Replacement in Videos", @@ -35909,6 +38338,7 @@ "status": "Poster", "track": "main", "pid": 7102, + "author_site": "Vijay Kumar B G; Jeyasri Subramanian; Varnith Chordia; Eugene Bart; Shaobo Fang; Kelly Guan; Raja Bala", "author": "Vijay Kumar B G; Jeyasri Subramanian; Varnith Chordia; Eugene Bart; Shaobo Fang; Kelly Guan; Raja Bala", "abstract": "We propose replacing scene text in videos using deep style transfer and learned photometric transformations. Building on recent progress on still image text replacement, we present extensions that alter text while preserving the appearance and motion characteristics of the original video. Compared to the problem of still image text replacement, our method addresses additional challenges introduced by video, namely effects induced by changing lighting, motion blur, diverse variations in camera-object pose over time, and preservation of temporal consistency. 
We parse the problem into three steps. First, the text in all frames is normalized to a frontal pose using a spatio-temporal transformer network. Second, the text is replaced in a single reference frame using a state-of-art still-image text replacement method. Finally, the new text is transferred from the reference to remaining frames using a novel learned image transformation network that captures lighting and blur effects in a temporally consistent manner. Results on synthetic and challenging real videos show realistic text transfer, competitive quantitative and qualitative performance, and superior inference speed relative to alternatives. We introduce new synthetic and real-world datasets with paired text objects. To the best of our knowledge this is the first attempt at deep video text replacement.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/G_STRIVE_Scene_Text_Replacement_in_Videos_ICCV_2021_paper.pdf", @@ -35925,14 +38355,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/G_STRIVE_Scene_Text_Replacement_in_Videos_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;1;3;3;2", - "aff_unique_norm": "NEC Laboratories America;Palo Alto Research Center;Amazon;Stanford University", - "aff_unique_dep": ";;Amazon.com, Inc.;", + "aff_unique_norm": "NEC Laboratories America;Palo Alto Research Center;Amazon.com, Inc.;Stanford University", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nec-labs.com;https://www.parc.com;https://www.amazon.com;https://www.stanford.edu", "aff_unique_abbr": "NEC Labs America;PARC;Amazon;Stanford", "aff_campus_unique_index": "1;1;2;2", "aff_campus_unique": ";Palo Alto;Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{G_2021_ICCV,\n \n author = {\n G,\n Vijay Kumar B and Subramanian,\n Jeyasri and Chordia,\n Varnith and Bart,\n Eugene and Fang,\n Shaobo and Guan,\n Kelly 
and Bala,\n Raja\n},\n title = {\n STRIVE: Scene Text Replacement in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14549-14558\n} \n}" }, { "title": "STVGBert: A Visual-Linguistic Transformer Based Framework for Spatio-Temporal Video Grounding", @@ -35940,6 +38371,7 @@ "status": "Poster", "track": "main", "pid": 3669, + "author_site": "Rui Su; Qian Yu; Dong Xu", "author": "Rui Su; Qian Yu; Dong Xu", "abstract": "Spatio-temporal video grounding (STVG) aims to localize a spatio-temporal tube of a target object in an untrimmed video based on a query sentence. In this work, we propose a one-stage visual-linguistic transformer based framework called STVGBert for the STVG task, which can simultaneously localize the target object in both spatial and temporal domains. Specifically, without resorting to pre-generated object proposals, our STVGBert directly takes a video and a query sentence as the input, and then produces the cross-modal features by using the newly introduced cross-modal feature learning module ST-ViLBert. Based on the cross-modal features, our method then generates bounding boxes and predicts the starting and ending frames to produce the predicted object tube. To the best of our knowledge, our STVGBert is the first one-stage method, which can handle the STVG task without relying on any pre-trained object detectors. 
Comprehensive experiments demonstrate our newly proposed framework outperforms the state-of-the-art multi-stage methods on two benchmark datasets Vid-STG and HC-STVG.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Su_STVGBert_A_Visual-Linguistic_Transformer_Based_Framework_for_Spatio-Temporal_Video_Grounding_ICCV_2021_paper.pdf", @@ -35956,14 +38388,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Su_STVGBert_A_Visual-Linguistic_Transformer_Based_Framework_for_Spatio-Temporal_Video_Grounding_ICCV_2021_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Tencent;Beihang University;University of Sydney", + "aff_unique_norm": "Tencent;Beihang University;The University of Sydney", "aff_unique_dep": "Platform & Content Group;College of Software;School of Electrical and Information Engineering", "aff_unique_url": "https://www.tencent.com;http://www.buaa.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "Tencent;Beihang;USYD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Su_2021_ICCV,\n \n author = {\n Su,\n Rui and Yu,\n Qian and Xu,\n Dong\n},\n title = {\n STVGBert: A Visual-Linguistic Transformer Based Framework for Spatio-Temporal Video Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1533-1542\n} \n}" }, { "title": "SUNet: Symmetric Undistortion Network for Rolling Shutter Correction", @@ -35971,10 +38404,11 @@ "status": "Poster", "track": "main", "pid": 6199, + "author_site": "Bin Fan; Yuchao Dai; Mingyi He", "author": "Bin Fan; Yuchao Dai; Mingyi He", "abstract": "The vast majority of modern consumer-grade cameras employ a rolling shutter mechanism, leading to image distortions if the camera moves during 
image acquisition. In this paper, we present a novel deep network to solve the generic rolling shutter correction problem with two consecutive frames. Our pipeline is symmetrically designed to predict the global shutter image corresponding to the intermediate time of these two frames, which is difficult for existing methods because it corresponds to a camera pose that differs most from the two frames. First, two time-symmetric dense undistortion flows are estimated by using well-established principles: pyramidal construction, warping, and cost volume processing. Then, both rolling shutter images are warped into a common global shutter one in the feature space, respectively. Finally, a symmetric consistency constraint is constructed in the image decoder to effectively aggregate the contextual cues of two rolling shutter images, thereby recovering the high-quality global shutter image. Extensive experiments with both synthetic and real data from public benchmarks demonstrate the superiority of our proposed approach over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_SUNet_Symmetric_Undistortion_Network_for_Rolling_Shutter_Correction_ICCV_2021_paper.pdf", - "aff": "School of Electronics and Information, Northwestern Polytechnical University, Xi\u2019an, China; School of Electronics and Information, Northwestern Polytechnical University, Xi\u2019an, China; School of Electronics and Information, Northwestern Polytechnical University, Xi\u2019an, China", + "aff": "School of Electronics and Information, Northwestern Polytechnical University, Xi’an, China; School of Electronics and Information, Northwestern Polytechnical University, Xi’an, China; School of Electronics and Information, Northwestern Polytechnical University, Xi’an, China", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Fan_SUNet_Symmetric_Undistortion_ICCV_2021_supplemental.zip", @@ -35994,7 +38428,8 @@ 
"aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Bin and Dai,\n Yuchao and He,\n Mingyi\n},\n title = {\n SUNet: Symmetric Undistortion Network for Rolling Shutter Correction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4541-4550\n} \n}" }, { "title": "SaccadeCam: Adaptive Visual Attention for Monocular Depth Sensing", @@ -36002,6 +38437,7 @@ "status": "Poster", "track": "main", "pid": 3560, + "author_site": "Brevin Tilmon; Sanjeev J. Koppal", "author": "Brevin Tilmon; Sanjeev J. Koppal", "abstract": "Most monocular depth sensing methods use conventionally captured images that are created without considering scene content. In contrast, animal eyes have fast mechanical motions, called saccades, that control how the scene is imaged by the fovea, where resolution is highest. In this paper, we present the SaccadeCam framework for adaptively distributing resolution onto regions of interest in the scene. Our algorithm for adaptive resolution is a self-supervised network and we demonstrate results for end-to-end learning for monocular depth estimation. 
We also show preliminary results with a real SaccadeCam hardware prototype.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tilmon_SaccadeCam_Adaptive_Visual_Attention_for_Monocular_Depth_Sensing_ICCV_2021_paper.pdf", @@ -36025,7 +38461,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tilmon_2021_ICCV,\n \n author = {\n Tilmon,\n Brevin and Koppal,\n Sanjeev J.\n},\n title = {\n SaccadeCam: Adaptive Visual Attention for Monocular Depth Sensing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6009-6018\n} \n}" }, { "title": "Safety-Aware Motion Prediction With Unseen Vehicles for Autonomous Driving", @@ -36033,10 +38470,11 @@ "status": "Poster", "track": "main", "pid": 2837, + "author_site": "Xuanchi Ren; Tao Yang; Li Erran Li; Alexandre Alahi; Qifeng Chen", "author": "Xuanchi Ren; Tao Yang; Li Erran Li; Alexandre Alahi; Qifeng Chen", "abstract": "Motion prediction of vehicles is critical but challenging due to the uncertainties in complex environments and the limited visibility caused by occlusions and limited sensor ranges. In this paper, we study a new task, safety-aware motion prediction with unseen vehicles for autonomous driving. Unlike the existing trajectory prediction task for seen vehicles, we aim at predicting an occupancy map that indicates the earliest time when each location can be occupied by either seen and unseen vehicles. The ability to predict unseen vehicles is critical for safety in autonomous driving. To tackle this challenging task, we propose a safety-aware deep learning model with three new loss functions to predict the earliest occupancy map. 
Experiments on the large-scale autonomous driving nuScenes dataset show that our proposed model significantly outperforms the state-of-the-art baselines on the safety-aware motion prediction task. To the best of our knowledge, our approach is the first one that can predict the existence of unseen vehicles in most cases.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ren_Safety-Aware_Motion_Prediction_With_Unseen_Vehicles_for_Autonomous_Driving_ICCV_2021_paper.pdf", - "aff": "HKUST; Xi\u2019an Jiaotong University; Alexa AI, Amazon; EPFL; HKUST", + "aff": "HKUST; Xi’an Jiaotong University; Alexa AI, Amazon; EPFL; HKUST", "project": "", "github": "https://github.com/xrenaa/Safety-Aware-Motion-Prediction", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ren_Safety-Aware_Motion_Prediction_ICCV_2021_supplemental.pdf", @@ -36049,14 +38487,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ren_Safety-Aware_Motion_Prediction_With_Unseen_Vehicles_for_Autonomous_Driving_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Xi'an Jiao Tong University;Amazon;EPFL", + "aff_unique_norm": "Hong Kong University of Science and Technology;Xi'an Jiaotong University;Amazon;Ecole Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";;Alexa AI;", "aff_unique_url": "https://www.ust.hk;https://www.xjtu.edu.cn;https://www.amazon.com;https://www.epfl.ch", "aff_unique_abbr": "HKUST;XJTU;Amazon;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "China;United States;Switzerland" + "aff_country_unique": "China;United States;Switzerland", + "bibtex": "@InProceedings{Ren_2021_ICCV,\n \n author = {\n Ren,\n Xuanchi and Yang,\n Tao and Li,\n Li Erran and Alahi,\n Alexandre and Chen,\n Qifeng\n},\n title = {\n Safety-Aware Motion Prediction With Unseen Vehicles for 
Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15731-15740\n} \n}" }, { "title": "Saliency-Associated Object Tracking", @@ -36064,6 +38503,7 @@ "status": "Poster", "track": "main", "pid": 2763, + "author_site": "Zikun Zhou; Wenjie Pei; Xin Li; Hongpeng Wang; Feng Zheng; Zhenyu He", "author": "Zikun Zhou; Wenjie Pei; Xin Li; Hongpeng Wang; Feng Zheng; Zhenyu He", "abstract": "Most existing trackers based on deep learning perform tracking in a holistic strategy, which aims to learn deep representations of the whole target for localizing the target. It is arduous for such methods to track targets with various appearance variations. To address this limitation, another type of methods adopts a part-based tracking strategy which divides the target into equal patches and tracks all these patches in parallel. The target state is inferred by summarizing the tracking results of these patches. A potential limitation of such trackers is that not all patches are equally informative for tracking. Some patches that are not discriminative may have adverse effects. In this paper, we propose to track the salient local parts of the target that are discriminative for tracking. In particular, we propose a fine-grained saliency mining module to capture the local saliencies. Further, we design a saliency-association modeling module to associate the captured saliencies together to learn effective correlation representations between the exemplar and the search image for state estimation. 
Extensive experiments on five diverse datasets demonstrate that the proposed method performs favorably against state-of-the-art trackers.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Saliency-Associated_Object_Tracking_ICCV_2021_paper.pdf", @@ -36080,14 +38520,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Saliency-Associated_Object_Tracking_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0+1;2;0", - "aff_unique_norm": "Harbin Institute of Technology;Pengcheng Laboratory;Southern University of Science and Technology", - "aff_unique_dep": ";Peng Cheng Laboratory;", + "aff_unique_norm": "Harbin Institute of Technology;Peng Cheng Laboratory;Southern University of Science and Technology", + "aff_unique_dep": ";;", "aff_unique_url": "http://en.hhit.edu.cn/;http://www.pcl.ac.cn;https://www.sustech.edu.cn", "aff_unique_abbr": "HIT;PCL;SUSTech", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Zikun and Pei,\n Wenjie and Li,\n Xin and Wang,\n Hongpeng and Zheng,\n Feng and He,\n Zhenyu\n},\n title = {\n Saliency-Associated Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9866-9875\n} \n}" }, { "title": "Salient Object Ranking With Position-Preserved Attention", @@ -36095,6 +38536,7 @@ "status": "Poster", "track": "main", "pid": 7836, + "author_site": "Hao Fang; Daoxin Zhang; Yi Zhang; Minghao Chen; Jiawei Li; Yao Hu; Deng Cai; Xiaofei He", "author": "Hao Fang; Daoxin Zhang; Yi Zhang; Minghao Chen; Jiawei Li; Yao Hu; Deng Cai; Xiaofei He", "abstract": "Instance segmentation can detect where the objects are in an image, but hard to understand the relationship between them. 
We pay attention to a typical relationship, relative saliency. A closely related task, salient object detection, predicts a binary map highlighting a visually salient region while hard to distinguish multiple objects. Directly combining two tasks by post-processing also leads to poor performance. There is a lack of research on relative saliency at present, limiting the practical applications such as content-aware image cropping, video summary, and image labeling. In this paper, we study the Salient Object Ranking (SOR) task, which manages to assign a ranking order of each detected object according to its visual saliency. We propose the first end-to-end framework of the SOR task and solve it in a multi-task learning fashion. The framework handles instance segmentation and salient object ranking simultaneously. In this framework, the SOR branch is independent and flexible to cooperate with different detection methods, so that easy to use as a plugin. We also introduce a Position-Preserved Attention (PPA) module tailored for the SOR branch. It consists of the position embedding stage and feature interaction stage. Considering the importance of position in saliency comparison, we preserve absolute coordinates of objects in ROI pooling operation and then fuse positional information with semantic features in the first stage. In the feature interaction stage, we apply the attention mechanism to obtain proposals' contextualized representations to predict their relative ranking orders. Extensive experiments have been conducted on the ASR dataset. Without bells and whistles, our proposed method outperforms the former state-of-the-art method significantly. 
The code will be released publicly available on https://github.com/EricFH/SOR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Salient_Object_Ranking_With_Position-Preserved_Attention_ICCV_2021_paper.pdf", @@ -36118,7 +38560,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;0;0;0;0;0;0+0+1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fang_2021_ICCV,\n \n author = {\n Fang,\n Hao and Zhang,\n Daoxin and Zhang,\n Yi and Chen,\n Minghao and Li,\n Jiawei and Hu,\n Yao and Cai,\n Deng and He,\n Xiaofei\n},\n title = {\n Salient Object Ranking With Position-Preserved Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16331-16341\n} \n}" }, { "title": "Sample Efficient Detection and Classification of Adversarial Attacks via Self-Supervised Embeddings", @@ -36126,6 +38569,7 @@ "status": "Poster", "track": "main", "pid": 11013, + "author_site": "Mazda Moayeri; Soheil Feizi", "author": "Mazda Moayeri; Soheil Feizi", "abstract": "Adversarial robustness of deep models is pivotal in ensuring safe deployment in real world settings, but most modern defenses have narrow scope and expensive costs. In this paper, we propose a self-supervised method to detect adversarial attacks and classify them to their respective threat models, based on a linear model operating on the embeddings from a pre-trained self-supervised encoder. We use a SimCLR encoder in our experiments, since we show the SimCLR embedding distance is a good proxy for human perceptibility, enabling it to encapsulate many threat models at once. We call our method SimCat since it uses SimCLR encoder to catch and categorize various types of adversarial attacks, including L_p and non-L_p evasion attacks, as well as data poisonings. 
The simple nature of a linear classifier makes our method efficient in both time and sample complexity. For example, on SVHN, using only five pairs of clean and adversarial examples computed with a PGD-L_inf attack, SimCat's detection accuracy is over 85%. Moreover, on ImageNet, using only 25 examples from each threat model, SimCat can classify eight different attack types such as PGD-L_2, PGD-L_inf, CW-L_2, PPGD, LPA, StAdv, ReColor, and JPEG-L_inf, with over 40% accuracy. On STL10 data, we apply SimCat as a defense against poisoning attacks, such as BP, CP, FC, CLBD, HTBD, halving the success rate while using only twenty total poisons for training. We find that the detectors generalize well to unseen threat models. Lastly, we investigate the performance of our detection method under adaptive attacks and further boost its robustness against such attacks via adversarial training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Moayeri_Sample_Efficient_Detection_and_Classification_of_Adversarial_Attacks_via_Self-Supervised_ICCV_2021_paper.pdf", @@ -36149,7 +38593,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Moayeri_2021_ICCV,\n \n author = {\n Moayeri,\n Mazda and Feizi,\n Soheil\n},\n title = {\n Sample Efficient Detection and Classification of Adversarial Attacks via Self-Supervised Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7677-7686\n} \n}" }, { "title": "Sampling Network Guided Cross-Entropy Method for Unsupervised Point Cloud Registration", @@ -36157,6 +38602,7 @@ "status": "Poster", "track": "main", "pid": 7046, + "author_site": "Haobo Jiang; Yaqi Shen; Jin Xie; Jun Li; Jianjun Qian; Jian Yang", "author": "Haobo Jiang; Yaqi Shen; Jin Xie; Jun Li; Jianjun 
Qian; Jian Yang", "abstract": "In this paper, by modeling the point cloud registration task as a Markov decision process, we propose an end-to-end deep model embedded with the cross-entropy method (CEM) for unsupervised 3D registration. Our model consists of a sampling network module and a differentiable CEM module. In our sampling network module, given a pair of point clouds, the sampling network learns a prior sampling distribution over the transformation space. The learned sampling distribution can be used as a \"good\"\" initialization of the differentiable CEM module. In our differentiable CEM module, we first propose a maximum consensus criterion based alignment metric as the reward function for the point cloud registration task. Based on the reward function, for each state, we then construct a fused score function to evaluate the sampled transformations, where we weight the current and future rewards of the transformations. Particularly, the future rewards of the sampled transforms are obtained by performing the iterative closest point (ICP) algorithm on the transformed state. By selecting the top-k transformations with the highest scores, we iteratively update the sampling distribution. Furthermore, in order to make the CEM differentiable, we use the sparsemax function to replace the hard top-k selection. Finally, we formulate a Geman-McClure estimator based loss to train our end-to-end registration model. 
Extensive experimental results demonstrate the good registration performance of our method on benchmark datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Sampling_Network_Guided_Cross-Entropy_Method_for_Unsupervised_Point_Cloud_Registration_ICCV_2021_paper.pdf", @@ -36180,7 +38626,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Haobo and Shen,\n Yaqi and Xie,\n Jin and Li,\n Jun and Qian,\n Jianjun and Yang,\n Jian\n},\n title = {\n Sampling Network Guided Cross-Entropy Method for Unsupervised Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6128-6137\n} \n}" }, { "title": "Sat2Vid: Street-View Panoramic Video Synthesis From a Single Satellite Image", @@ -36188,6 +38635,7 @@ "status": "Poster", "track": "main", "pid": 2469, + "author_site": "Zuoyue Li; Zhenqiang Li; Zhaopeng Cui; Rongjun Qin; Marc Pollefeys; Martin R. Oswald", "author": "Zuoyue Li; Zhenqiang Li; Zhaopeng Cui; Rongjun Qin; Marc Pollefeys; Martin R. Oswald", "abstract": "We present a novel method for synthesizing both temporally and geometrically consistent street-view panoramic video from a single satellite image and camera trajectory. Existing cross-view synthesis approaches focus on images, while video synthesis in such a case has not yet received enough attention. For geometrical and temporal consistency, our approach explicitly creates a 3D point cloud representation of the scene and maintains dense 3D-2D correspondences across frames that reflect the geometric scene configuration inferred from the satellite view. 
As for synthesis in the 3D space, we implement a cascaded network architecture with two hourglass modules to generate point-wise coarse and fine features from semantics and per-class latent vectors, followed by projection to frames and an upsampling module to obtain the final realistic video. By leveraging computed correspondences, the produced street-view video frames adhere to the 3D geometric scene structure and maintain temporal consistency. Qualitative and quantitative experiments demonstrate superior results compared to other state-of-the-art synthesis approaches that either lack temporal consistency or realistic appearance. To the best of our knowledge, our work is the first one to synthesize cross-view images to videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Sat2Vid_Street-View_Panoramic_Video_Synthesis_From_a_Single_Satellite_Image_ICCV_2021_paper.pdf", @@ -36202,7 +38650,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Sat2Vid_Street-View_Panoramic_Video_Synthesis_From_a_Single_Satellite_Image_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Sat2Vid_Street-View_Panoramic_Video_Synthesis_From_a_Single_Satellite_Image_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Zuoyue and Li,\n Zhenqiang and Cui,\n Zhaopeng and Qin,\n Rongjun and Pollefeys,\n Marc and Oswald,\n Martin R.\n},\n title = {\n Sat2Vid: Street-View Panoramic Video Synthesis From a Single Satellite Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12436-12445\n} \n}" }, { "title": "Scalable Vision Transformers With Hierarchical Pooling", @@ -36210,6 +38659,7 @@ "status": "Poster", "track": "main", "pid": 2470, + "author_site": "Zizheng Pan; Bohan Zhuang; Jing Liu; Haoyu He; Jianfei Cai", 
"author": "Zizheng Pan; Bohan Zhuang; Jing Liu; Haoyu He; Jianfei Cai", "abstract": "The recently proposed Visual image Transformers (ViT) with pure attention have achieved promising performance on image recognition tasks, such as image classification. However, the routine of the current ViT model is to maintain a full-length patch sequence during inference, which is redundant and lacks hierarchical representation. To this end, we propose a Hierarchical Visual Transformer (HVT) which progressively pools visual tokens to shrink the sequence length and hence reduces the computational cost, analogous to the feature maps downsampling in Convolutional Neural Networks (CNNs). It brings a great benefit that we can increase the model capacity by scaling dimensions of depth/width/resolution/patch size without introducing extra computational complexity due to the reduced sequence length. Moreover, we empirically find that the average pooled visual tokens contain more discriminative information than the single class token. To demonstrate the improved scalability of our HVT, we conduct extensive experiments on the image classification task. With comparable FLOPs, our HVT outperforms the competitive baselines on ImageNet and CIFAR-100 datasets. 
Code is available at https://github.com/MonashAI/HVT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pan_Scalable_Vision_Transformers_With_Hierarchical_Pooling_ICCV_2021_paper.pdf", @@ -36233,7 +38683,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Pan_2021_ICCV,\n \n author = {\n Pan,\n Zizheng and Zhuang,\n Bohan and Liu,\n Jing and He,\n Haoyu and Cai,\n Jianfei\n},\n title = {\n Scalable Vision Transformers With Hierarchical Pooling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 377-386\n} \n}" }, { "title": "Scaling Semantic Segmentation Beyond 1K Classes on a Single GPU", @@ -36241,6 +38692,7 @@ "status": "Poster", "track": "main", "pid": 10632, + "author_site": "Shipra Jain; Danda Pani Paudel; Martin Danelljan; Luc Van Gool", "author": "Shipra Jain; Danda Pani Paudel; Martin Danelljan; Luc Van Gool", "abstract": "The state-of-the-art object detection and image classification methods can perform impressively on more than 9k and 10k classes respectively. In contrast, the number of classes in semantic segmentation datasets is relatively limited. This is not surprising when the restrictions caused by the lack of labelled data and high computation demand for segmentation are considered. In this paper, we propose a novel training methodology to train and scale the existing semantic segmentation models for a large number of semantic classes without increasing the memory overhead. In our approach, we reduce the space complexity of the segmentation model's output from O(C) to O(1), propose an approximation method for ground-truth class probability, and use it to compute cross-entropy loss. 
The proposed approach is general and can be adopted by any state-of-the-art segmentation model to gracefully scale it for any number of semantic classes with only one GPU. Our approach achieves similar, and in some cases even better mIoU for Cityscapes, Pascal VOC and ADE20k dataset when adopted to DeeplabV3+ model with different backbones. We demonstrate a clear benefit of our approach on a dataset with 1284 classes, bootstrapped from LVIS and COCO annotations, with almost three times better mIoU when compared to DeeplabV3+. Code is available at: https://github.com/shipra25jain/ESSNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jain_Scaling_Semantic_Segmentation_Beyond_1K_Classes_on_a_Single_GPU_ICCV_2021_paper.pdf", @@ -36264,7 +38716,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Stockholm", "aff_country_unique_index": "0+1;0;0;0+2", - "aff_country_unique": "Switzerland;Sweden;Belgium" + "aff_country_unique": "Switzerland;Sweden;Belgium", + "bibtex": "@InProceedings{Jain_2021_ICCV,\n \n author = {\n Jain,\n Shipra and Paudel,\n Danda Pani and Danelljan,\n Martin and Van Gool,\n Luc\n},\n title = {\n Scaling Semantic Segmentation Beyond 1K Classes on a Single GPU\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7426-7436\n} \n}" }, { "title": "Scaling Up Instance Annotation via Label Propagation", @@ -36272,6 +38725,7 @@ "status": "Poster", "track": "main", "pid": 10821, + "author_site": "Dim P. Papadopoulos; Ethan Weber; Antonio Torralba", "author": "Dim P. Papadopoulos; Ethan Weber; Antonio Torralba", "abstract": "Manually annotating object segmentation masks is very time-consuming. While interactive segmentation methods offer a more efficient alternative, they become unaffordable at a large scale because the cost grows linearly with the number of annotated masks. 
In this paper, we propose a highly efficient annotation scheme for building large datasets with object segmentation masks. At a large scale, images contain many object instances with similar appearance. We exploit these similarities by using hierarchical clustering on mask predictions made by a segmentation model. We propose a scheme that efficiently searches through the hierarchy of clusters and selects which clusters to annotate. Humans manually verify only a few masks per cluster, and the labels are propagated to the whole cluster. Through a large-scale experiment to populate 1M unlabeled images with object segmentation masks for 80 object classes, we show that (1) we obtain 1M object segmentation masks with an total annotation time of only 290 hours; (2) we reduce annotation time by 76x compared to manual annotation; (3) the segmentation quality of our masks is on par with those from manually annotated datasets. Code, data, and models are available online.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Papadopoulos_Scaling_Up_Instance_Annotation_via_Label_Propagation_ICCV_2021_paper.pdf", @@ -36295,7 +38749,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Papadopoulos_2021_ICCV,\n \n author = {\n Papadopoulos,\n Dim P. 
and Weber,\n Ethan and Torralba,\n Antonio\n},\n title = {\n Scaling Up Instance Annotation via Label Propagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15364-15373\n} \n}" }, { "title": "Scaling-Up Disentanglement for Image Translation", @@ -36303,6 +38758,7 @@ "status": "Poster", "track": "main", "pid": 7699, + "author_site": "Aviv Gabbay; Yedid Hoshen", "author": "Aviv Gabbay; Yedid Hoshen", "abstract": "Image translation methods typically aim to manipulate a set of labeled attributes (given as supervision at training time e.g. domain label) while leaving the unlabeled attributes intact. Current methods achieve either: (i) disentanglement, which exhibits low visual fidelity and can only be satisfied where the attributes are perfectly uncorrelated. (ii) visually-plausible translations, which are clearly not disentangled. In this work, we propose OverLORD, a single framework for disentangling labeled and unlabeled attributes as well as synthesizing high-fidelity images, which is composed of two stages; (i) Disentanglement: Learning disentangled representations with latent optimization. Differently from previous approaches, we do not rely on adversarial training or any architectural biases. (ii) Synthesis: Training feed-forward encoders for inferring the learned attributes and tuning the generator in an adversarial manner to increase the perceptual quality. When the labeled and unlabeled attributes are correlated, we model an additional representation that accounts for the correlated attributes and improves disentanglement. We highlight that our flexible framework covers multiple settings as disentangling labeled attributes, pose and appearance, localized concepts, and shape and texture. 
We present significantly better disentanglement with higher translation quality and greater output diversity than state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gabbay_Scaling-Up_Disentanglement_for_Image_Translation_ICCV_2021_paper.pdf", @@ -36319,14 +38775,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gabbay_Scaling-Up_Disentanglement_for_Image_Translation_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Hebrew University of Jerusalem", + "aff_unique_norm": "The Hebrew University of Jerusalem", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "http://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Gabbay_2021_ICCV,\n \n author = {\n Gabbay,\n Aviv and Hoshen,\n Yedid\n},\n title = {\n Scaling-Up Disentanglement for Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6783-6792\n} \n}" }, { "title": "Scene Context-Aware Salient Object Detection", @@ -36334,6 +38791,7 @@ "status": "Poster", "track": "main", "pid": 7063, + "author_site": "Avishek Siris; Jianbo Jiao; Gary K.L. Tam; Xianghua Xie; Rynson W.H. Lau", "author": "Avishek Siris; Jianbo Jiao; Gary K.L. Tam; Xianghua Xie; Rynson W.H. Lau", "abstract": "Salient object detection identifies objects in an image that grab visual attention. Although contextual features are considered in recent literature, they often fail in real-world complex scenarios. We observe that this is mainly due to two issues: First, most existing datasets consist of simple foregrounds and backgrounds that hardly represent real-life scenarios. 
Second, current methods only learn contextual features of salient objects, which are insufficient to model high-level semantics for saliency reasoning in complex scenes. To address these problems, we first construct a new large-scale dataset with complex scenes in this paper. We then propose a context-aware learning approach to explicitly exploit the semantic scene contexts. Specifically, two modules are proposed to achieve the goal: 1) a Semantic Scene Context Refinement module to enhance contextual features learned from salient objects with scene context, and 2) a Contextual Instance Transformer to learn contextual relations between objects and scene context. To our knowledge, such high-level semantic contextual information of image scenes is under-explored for saliency detection in the literature. Extensive experiments demonstrate that the proposed approach outperforms state-of-the-art techniques in complex scenarios for saliency detection, and transfers well to other existing datasets. The code and dataset are available at https://github.com/SirisAvishek/Scene_Context_Aware_Saliency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Siris_Scene_Context-Aware_Salient_Object_Detection_ICCV_2021_paper.pdf", @@ -36357,7 +38815,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Siris_2021_ICCV,\n \n author = {\n Siris,\n Avishek and Jiao,\n Jianbo and Tam,\n Gary K.L. 
and Xie,\n Xianghua and Lau,\n Rynson W.H.\n},\n title = {\n Scene Context-Aware Salient Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4156-4166\n} \n}" }, { "title": "Scene Synthesis via Uncertainty-Driven Attribute Synchronization", @@ -36365,6 +38824,7 @@ "status": "Poster", "track": "main", "pid": 2877, + "author_site": "Haitao Yang; Zaiwei Zhang; Siming Yan; Haibin Huang; Chongyang Ma; Yi Zheng; Chandrajit Bajaj; Qixing Huang", "author": "Haitao Yang; Zaiwei Zhang; Siming Yan; Haibin Huang; Chongyang Ma; Yi Zheng; Chandrajit Bajaj; Qixing Huang", "abstract": "Developing deep neural networks to generate 3D scenes is a fundamental problem in neural synthesis with immediate applications in architectural CAD, computer graphics, as well as in generating virtual robot training environments. This task is challenging because 3D scenes exhibit diverse patterns, ranging from continuous ones, such as object sizes and the relative poses between pairs of shapes, to discrete patterns, such as occurrence and co-occurrence of objects with symmetrical relationships. This paper introduces a novel neural scene synthesis approach that can capture diverse feature patterns of 3D scenes. Our method combines the strength of both neural network-based and conventional scene synthesis approaches. We use the parametric prior distributions learned from training data, which provide uncertainties of object attributes and relative attributes, to regularize the outputs of feed-forward neural models. Moreover, instead of merely predicting a scene layout, our approach predicts an over-complete set of attributes. This methodology allows us to utilize the underlying consistency constraints among the predicted attributes to prune infeasible predictions. Experimental results show that our approach outperforms existing methods considerably. 
The generated 3D scenes interpolate the training data faithfully while preserving both continuous and discrete feature patterns.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Scene_Synthesis_via_Uncertainty-Driven_Attribute_Synchronization_ICCV_2021_paper.pdf", @@ -36379,7 +38839,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Scene_Synthesis_via_Uncertainty-Driven_Attribute_Synchronization_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Scene_Synthesis_via_Uncertainty-Driven_Attribute_Synchronization_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Haitao and Zhang,\n Zaiwei and Yan,\n Siming and Huang,\n Haibin and Ma,\n Chongyang and Zheng,\n Yi and Bajaj,\n Chandrajit and Huang,\n Qixing\n},\n title = {\n Scene Synthesis via Uncertainty-Driven Attribute Synchronization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5630-5640\n} \n}" }, { "title": "Score-Based Point Cloud Denoising", @@ -36387,6 +38848,7 @@ "status": "Poster", "track": "main", "pid": 2930, + "author_site": "Shitong Luo; Wei Hu", "author": "Shitong Luo; Wei Hu", "abstract": "Point clouds acquired from scanning devices are often perturbed by noise, which affects downstream tasks such as surface reconstruction and analysis. The distribution of a noisy point cloud can be viewed as the distribution of a set of noise-free samples p(x) convolved with some noise model n, leading to (p * n)(x) whose mode is the underlying clean surface. To denoise a noisy point cloud, we propose to increase the log-likelihood of each point from p * n via gradient ascent---iteratively updating each point's position. 
Since p * n is unknown at test-time, and we only need the score (i.e., the gradient of the log-probability function) to perform gradient ascent, we propose a neural network architecture to estimate the score of p * n given only noisy point clouds as input. We derive objective functions for training the network and develop a denoising algorithm leveraging on the estimated scores. Experiments demonstrate that the proposed model outperforms state-of-the-art methods under a variety of noise models, and shows the potential to be applied in other tasks such as point cloud upsampling.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Luo_Score-Based_Point_Cloud_Denoising_ICCV_2021_paper.pdf", @@ -36410,7 +38872,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2021_ICCV,\n \n author = {\n Luo,\n Shitong and Hu,\n Wei\n},\n title = {\n Score-Based Point Cloud Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4583-4592\n} \n}" }, { "title": "Scribble-Supervised Semantic Segmentation Inference", @@ -36418,6 +38881,7 @@ "status": "Poster", "track": "main", "pid": 9519, + "author_site": "Jingshan Xu; Chuanwei Zhou; Zhen Cui; Chunyan Xu; Yuge Huang; Pengcheng Shen; Shaoxin Li; Jian Yang", "author": "Jingshan Xu; Chuanwei Zhou; Zhen Cui; Chunyan Xu; Yuge Huang; Pengcheng Shen; Shaoxin Li; Jian Yang", "abstract": "In this paper, we propose a progressive segmentation inference (PSI) framework to tackle with scribble-supervised semantic segmentation. In virtue of latent contextual dependency, we encapsulate two crucial cues, contextual pattern propagation and semantic label diffusion, to enhance and refine pixel-level segmentation results from partially known seeds. 
In contextual pattern propagation, different-granular contextual patterns are correlated and leveraged to properly diffuse pattern information based on graphical model, so as to increase the inference confidence of pixel label prediction. Further, depending on high confidence scores of estimated pixels, the initial annotated seeds are progressively spread over the image through dynamically learning an adaptive decision strategy. The two cues are finally modularized to form a close-looping update process during pixel-wise label inference. Extensive experiments demonstrate that our proposed progressive segmentation inference can benefit from the combination of spatial and semantic context cues, and meantime achieve the state-of-the-art performance on two public scribble segmentation datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Scribble-Supervised_Semantic_Segmentation_Inference_ICCV_2021_paper.pdf", @@ -36434,14 +38898,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Scribble-Supervised_Semantic_Segmentation_Inference_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;1;1;0", - "aff_unique_norm": "Nanjing University of Science and Technology;Tencent", - "aff_unique_dep": "School of Computer Science and Engineering;Tencent Holdings Limited", + "aff_unique_norm": "Nanjing University of Science and Technology;Tencent Holdings Limited", + "aff_unique_dep": "School of Computer Science and Engineering;", "aff_unique_url": "http://www.nust.edu.cn;https://www.tencent.com", "aff_unique_abbr": "NUST;Tencent", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Jingshan and Zhou,\n Chuanwei and Cui,\n Zhen and Xu,\n Chunyan and Huang,\n Yuge and Shen,\n Pengcheng and Li,\n Shaoxin and Yang,\n Jian\n},\n title = 
{\n Scribble-Supervised Semantic Segmentation Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15354-15363\n} \n}" }, { "title": "Scribble-Supervised Semantic Segmentation by Uncertainty Reduction on Neural Representation and Self-Supervision on Neural Eigenspace", @@ -36449,6 +38914,7 @@ "status": "Poster", "track": "main", "pid": 6198, + "author_site": "Zhiyi Pan; Peng Jiang; Yunhai Wang; Changhe Tu; Anthony G. Cohn", "author": "Zhiyi Pan; Peng Jiang; Yunhai Wang; Changhe Tu; Anthony G. Cohn", "abstract": "Scribble-supervised semantic segmentation has gained much attention recently for its promising performance without high-quality annotations. Due to the lack of supervision, confident and consistent predictions are usually hard to obtain. Typically, people handle these problems by either adopting an auxiliary task with the well-labeled dataset or incorporating a graphical model with additional requirements on scribble annotations. Instead, this work aims to achieve semantic segmentation by scribble annotations directly without extra information and other limitations. Specifically, we propose holistic operations, including minimizing entropy and a network embedded random walk on the neural representation to reduce uncertainty. Given the probabilistic transition matrix of a random walk, we further train the network with self-supervision on its neural eigenspace to impose consistency on predictions between related images. 
Comprehensive experiments and ablation studies verify the proposed approach, which demonstrates superiority over others; it is even comparable to some full-label supervised ones and works well when scribbles are randomly shrunk or dropped.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pan_Scribble-Supervised_Semantic_Segmentation_by_Uncertainty_Reduction_on_Neural_Representation_and_ICCV_2021_paper.pdf", @@ -36472,7 +38938,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Pan_2021_ICCV,\n \n author = {\n Pan,\n Zhiyi and Jiang,\n Peng and Wang,\n Yunhai and Tu,\n Changhe and Cohn,\n Anthony G.\n},\n title = {\n Scribble-Supervised Semantic Segmentation by Uncertainty Reduction on Neural Representation and Self-Supervision on Neural Eigenspace\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7416-7425\n} \n}" }, { "title": "SeLFVi: Self-Supervised Light-Field Video Reconstruction From Stereo Video", @@ -36480,6 +38947,7 @@ "status": "Poster", "track": "main", "pid": 9922, + "author_site": "Prasan Shedligeri; Florian Schiffers; Sushobhan Ghosh; Oliver Cossairt; Kaushik Mitra", "author": "Prasan Shedligeri; Florian Schiffers; Sushobhan Ghosh; Oliver Cossairt; Kaushik Mitra", "abstract": "Light-field (LF) imaging is appealing to the mobile devices market because of its capability for intuitive post-capture processing. Acquiring LF data with high angular, spatial and temporal resolution poses significant challenges, especially with space constraints preventing bulky optics. At the same time, stereo video capture, now available on many consumer devices, can be interpreted as a sparse LF-capture. 
We explore the application of small baseline stereo videos for reconstructing high fidelity LF videos. We propose a self-supervised learning-based algorithm for LF video reconstruction from stereo video. The self-supervised LF video reconstruction is guided via the geometric information from the individual stereo pairs and the temporal information from the video sequence. LF estimation is further regularized by a low-rank constraint based on layered LF displays. The proposed self-supervised algorithm facilitates advantages such as post-training fine-tuning on test sequences and variable angular view interpolation and extrapolation. Quantitatively the LF videos show higher fidelity than previously proposed unsupervised approaches for LF reconstruction. We demonstrate our results via LF videos generated from stereo videos acquired from commercially available stereoscopic cameras. Finally, we demonstrate that our reconstructed LF videos allow applications such as post-capture focus control and RoI-based focus tracking for videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shedligeri_SeLFVi_Self-Supervised_Light-Field_Video_Reconstruction_From_Stereo_Video_ICCV_2021_paper.pdf", @@ -36503,7 +38971,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madras;", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Shedligeri_2021_ICCV,\n \n author = {\n Shedligeri,\n Prasan and Schiffers,\n Florian and Ghosh,\n Sushobhan and Cossairt,\n Oliver and Mitra,\n Kaushik\n},\n title = {\n SeLFVi: Self-Supervised Light-Field Video Reconstruction From Stereo Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2491-2501\n} \n}" }, { "title": "Searching for Controllable Image Restoration Networks", @@ -36511,6 +38980,7 @@ 
"status": "Poster", "track": "main", "pid": 11219, + "author_site": "Heewon Kim; Sungyong Baik; Myungsub Choi; Janghoon Choi; Kyoung Mu Lee", "author": "Heewon Kim; Sungyong Baik; Myungsub Choi; Janghoon Choi; Kyoung Mu Lee", "abstract": "We present a novel framework for controllable image restoration that can effectively restore multiple types and levels of degradation of a corrupted image. The proposed model, named TASNet, is automatically determined by our neural architecture search algorithm, which optimizes the efficiency-accuracy trade-off of the candidate model architectures. Specifically, we allow TASNet to share the early layers across different restoration tasks and adaptively adjust the remaining layers with respect to each task. The shared task-agnostic layers greatly improve the efficiency while the task-specific layers are optimized for restoration quality, and our search algorithm seeks for the best balance between the two. We also propose a new data sampling strategy to further improve the overall restoration performance. As a result, TASNet achieves significantly faster GPU latency and lower FLOPs compared to the existing state-of-the-art models, while also showing visually more pleasing outputs. 
The source code and pre-trained models are available at https://github.com/ghimhw/TASNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Searching_for_Controllable_Image_Restoration_Networks_ICCV_2021_paper.pdf", @@ -36534,7 +39004,8 @@ "aff_campus_unique_index": "0;0;0+1;0;0", "aff_campus_unique": "Seoul;Mountain View;", "aff_country_unique_index": "0;0;0+1;0+0;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Heewon and Baik,\n Sungyong and Choi,\n Myungsub and Choi,\n Janghoon and Lee,\n Kyoung Mu\n},\n title = {\n Searching for Controllable Image Restoration Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14234-14243\n} \n}" }, { "title": "Searching for Robustness: Loss Learning for Noisy Classification Tasks", @@ -36542,6 +39013,7 @@ "status": "Poster", "track": "main", "pid": 8996, + "author_site": "Boyan Gao; Henry Gouk; Timothy M. Hospedales", "author": "Boyan Gao; Henry Gouk; Timothy M. Hospedales", "abstract": "We present a \"learning to learn\" approach for discovering white-box classification loss functions that are robust to label noise in the training data. We parameterise a flexible family of loss functions using Taylor polynomials, and apply evolutionary strategies to search for noise-robust losses in this space. To learn re-usable loss functions that can apply to new tasks, our fitness function scores their performance in aggregate across a range of training datasets and architectures. The resulting white-box loss provides a simple and fast \"plug-and-play\" module that enables effective label-noise-robust learning in diverse downstream tasks, without requiring a special training procedure or network architecture. 
The efficacy of our loss is demonstrated on a variety of datasets with both synthetic and real label noise, where we compare favourably to prior work.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_Searching_for_Robustness_Loss_Learning_for_Noisy_Classification_Tasks_ICCV_2021_paper.pdf", @@ -36558,14 +39030,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gao_Searching_for_Robustness_Loss_Learning_for_Noisy_Classification_Tasks_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1", - "aff_unique_norm": "University of Edinburgh;Samsung", + "aff_unique_norm": "University of Edinburgh;Samsung AI Centre", "aff_unique_dep": "School of Informatics;AI Centre", "aff_unique_url": "https://www.ed.ac.uk;https://www.samsung.com/global/campaign/ai-research-centre/", "aff_unique_abbr": "Edinburgh;Samsung AI", "aff_campus_unique_index": "0+1;0+1;0+1", "aff_campus_unique": "Edinburgh;Cambridge", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Boyan and Gouk,\n Henry and Hospedales,\n Timothy M.\n},\n title = {\n Searching for Robustness: Loss Learning for Noisy Classification Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6670-6679\n} \n}" }, { "title": "Searching for Two-Stream Models in Multivariate Space for Video Recognition", @@ -36573,6 +39046,7 @@ "status": "Poster", "track": "main", "pid": 6123, + "author_site": "Xinyu Gong; Heng Wang; Mike Zheng Shou; Matt Feiszli; Zhangyang Wang; Zhicheng Yan", "author": "Xinyu Gong; Heng Wang; Mike Zheng Shou; Matt Feiszli; Zhangyang Wang; Zhicheng Yan", "abstract": "Conventional video models rely on a single stream to capture the complex spatial-temporal features. 
Recent work on two-stream video models, such as SlowFast network and AssembleNet, prescribe separate streams to learn complementary features, and achieve stronger performance. However, manually designing both streams as well as the in-between fusion blocks is a daunting task, requiring to explore a tremendously large design space. Such manual exploration is time-consuming and often ends up with sub-optimal architectures when computational resources are limited and the exploration is insufficient. In this work, we present a pragmatic neural architecture search approach, which is able to search for two-stream video models in giant spaces efficiently. We design a multivariate search space, including 6 search variables to capture a wide variety of choices in designing two-stream models. Furthermore, we propose a progressive search procedure, by searching for the architecture of individual streams, fusion blocks and attention blocks one after the other. We demonstrate two-stream models with significantly better performance can be automatically discovered in our design space. Our searched two-stream models, namely Auto-TSNet, consistently outperform other models on standard benchmarks. On Kinetics, compared with the SlowFast model, our Auto-TSNet-L model reduces FLOPS by nearly 11 times while achieving the same accuracy 78.9%. 
On Something-Something-V2, Auto-TSNet-M improves the accuracy by at least 2% over other methods which use less than 50 GFLOPS per video.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gong_Searching_for_Two-Stream_Models_in_Multivariate_Space_for_Video_Recognition_ICCV_2021_paper.pdf", @@ -36589,14 +39063,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gong_Searching_for_Two-Stream_Models_in_Multivariate_Space_for_Video_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;0", - "aff_unique_norm": "Meta;University of Texas at Austin", + "aff_unique_norm": "Facebook;University of Texas at Austin", "aff_unique_dep": "Facebook AI;", "aff_unique_url": "https://www.facebook.com;https://www.utexas.edu", "aff_unique_abbr": "Facebook AI;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gong_2021_ICCV,\n \n author = {\n Gong,\n Xinyu and Wang,\n Heng and Shou,\n Mike Zheng and Feiszli,\n Matt and Wang,\n Zhangyang and Yan,\n Zhicheng\n},\n title = {\n Searching for Two-Stream Models in Multivariate Space for Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8033-8042\n} \n}" }, { "title": "Seasonal Contrast: Unsupervised Pre-Training From Uncurated Remote Sensing Data", @@ -36604,10 +39079,11 @@ "status": "Poster", "track": "main", "pid": 9057, - "author": "Oscar Ma\u00f1as; Alexandre Lacoste; Xavier Gir\u00f3-i-Nieto; David Vazquez; Pau Rodr\u00edguez", + "author_site": "Oscar Mañas; Alexandre Lacoste; Xavier Giró-i-Nieto; David Vazquez; Pau Rodríguez", + "author": "Oscar Mañas; Alexandre Lacoste; Xavier Giró-i-Nieto; David Vazquez; Pau Rodríguez", "abstract": "Remote sensing and automatic earth 
monitoring are key to solve global-scale challenges such as disaster prevention, land use monitoring, or tackling climate change. Although there exist vast amounts of remote sensing data, most of it remains unlabeled and thus inaccessible for supervised learning algorithms. Transfer learning approaches can reduce the data requirements of deep learning algorithms. However, most of these methods are pre-trained on ImageNet and their generalization to remote sensing imagery is not guaranteed due to the domain gap. In this work, we propose Seasonal Contrast (SeCo), an effective pipeline to leverage unlabeled data for in-domain pre-training of remote sensing representations. The SeCo pipeline is composed of two parts. First, a principled procedure to gather large-scale, unlabeled and uncurated remote sensing datasets containing images from multiple Earth locations at different timestamps. Second, a self-supervised algorithm that takes advantage of time and position invariance to learn transferable representations for remote sensing applications. We empirically show that models trained with SeCo achieve better performance than their ImageNet pre-trained counterparts and state-of-the-art self-supervised learning methods on multiple downstream tasks. 
The datasets and models in SeCo will be made public to facilitate transfer learning and enable rapid progress in remote sensing applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Manas_Seasonal_Contrast_Unsupervised_Pre-Training_From_Uncurated_Remote_Sensing_Data_ICCV_2021_paper.pdf", - "aff": "Element AI+Universitat Polit\u00e8cnica de Catalunya; Element AI; Universitat Polit\u00e8cnica de Catalunya; Element AI; Element AI", + "aff": "Element AI+Universitat Politècnica de Catalunya; Element AI; Universitat Politècnica de Catalunya; Element AI; Element AI", "project": "", "github": "https://github.com/ElementAI/seasonal-contrast", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Manas_Seasonal_Contrast_Unsupervised_ICCV_2021_supplemental.pdf", @@ -36620,14 +39096,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Manas_Seasonal_Contrast_Unsupervised_Pre-Training_From_Uncurated_Remote_Sensing_Data_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;1;0;0", - "aff_unique_norm": "Element AI;Universitat Polit\u00e8cnica de Catalunya", + "aff_unique_norm": "Element AI;Universitat Politècnica de Catalunya", "aff_unique_dep": ";", "aff_unique_url": "https://www.elementai.com;https://www.upc.edu", "aff_unique_abbr": "Element AI;UPC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1;0;0", - "aff_country_unique": "Canada;Spain" + "aff_country_unique": "Canada;Spain", + "bibtex": "@InProceedings{Manas_2021_ICCV,\n \n author = {\n Ma\\~nas,\n Oscar and Lacoste,\n Alexandre and Gir\\'o-i-Nieto,\n Xavier and Vazquez,\n David and Rodr{\\'\\i\n}guez,\n Pau\n},\n title = {\n Seasonal Contrast: Unsupervised Pre-Training From Uncurated Remote Sensing Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9414-9423\n} \n}" }, { "title": "Seeing Dynamic 
Scene in the Dark: A High-Quality Video Dataset With Mechatronic Alignment", @@ -36635,6 +39112,7 @@ "status": "Poster", "track": "main", "pid": 3035, + "author_site": "Ruixing Wang; Xiaogang Xu; Chi-Wing Fu; Jiangbo Lu; Bei Yu; Jiaya Jia", "author": "Ruixing Wang; Xiaogang Xu; Chi-Wing Fu; Jiangbo Lu; Bei Yu; Jiaya Jia", "abstract": "Low-light video enhancement is an important task. Previous work is mostly trained on paired static images or videos. We compile a new dataset formed by our new strategy that contains high-quality spatially-aligned video pairs from dynamic scenes in low- and normal-light conditions. We built it using a mechatronic system to precisely control the dynamics during the video capture process, and further align the video pairs, both spatially and temporally, by identifying the system's uniform motion stage. Besides the dataset, we propose an end-to-end framework, in which we design a self-supervised strategy to reduce noise, while enhancing the illumination based on the Retinex theory. Extensive experiments based on various metrics and large-scale user study demonstrate the value of our dataset and effectiveness of our method. 
The dataset and code are available at https://github.com/dvlab-research/SDSD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Seeing_Dynamic_Scene_in_the_Dark_A_High-Quality_Video_Dataset_ICCV_2021_paper.pdf", @@ -36651,14 +39129,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Seeing_Dynamic_Scene_in_the_Dark_A_High-Quality_Video_Dataset_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0;1;0+1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Ruixing and Xu,\n Xiaogang and Fu,\n Chi-Wing and Lu,\n Jiangbo and Yu,\n Bei and Jia,\n Jiaya\n},\n title = {\n Seeing Dynamic Scene in the Dark: A High-Quality Video Dataset With Mechatronic Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9700-9709\n} \n}" }, { "title": "Seeking Similarities Over Differences: Similarity-Based Domain Alignment for Adaptive Object Detection", @@ -36666,6 +39145,7 @@ "status": "Poster", "track": "main", "pid": 8560, + "author_site": "Farzaneh Rezaeianaran; Rakshith Shetty; Rahaf Aljundi; Daniel Olmeda Reino; Shanshan Zhang; Bernt Schiele", "author": "Farzaneh Rezaeianaran; Rakshith Shetty; Rahaf Aljundi; Daniel Olmeda Reino; Shanshan Zhang; Bernt Schiele", "abstract": "In order to robustly deploy object detectors across a wide range of scenarios, they should be adaptable to shifts in the input distribution without the need to constantly annotate new data. 
This has motivated research in Unsupervised Domain Adaptation (UDA) algorithms for detection. UDA methods learn to adapt from labeled source domains to unlabeled target domains, by inducing alignment between detector features from source and target domains. Yet, there is no consensus on what features to align and how to do the alignment. In our work, we propose a framework that generalizes the different components commonly used by UDA methods laying the ground for an in-depth analysis of the UDA design space. Specifically, we propose a novel UDA algorithm, ViSGA, a direct implementation of our framework, that leverages the best design choices and introduces a simple but effective method to aggregate features at the instance-level based on the visual similarity before inducing group alignment via adversarial training. We show that both similarity-based grouping and adversarial training allows our model to focus on coarsely aligning feature groups, without being forced to match all instances across loosely aligned domains. Finally, we examine the applicability of ViSGA to the setting where labeled data are gathered from different sources. 
Experiments show that not only our method outperforms previous single-source approaches on Sim2Real and Adverse Weather, but also generalizes well to the multi-source setting.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rezaeianaran_Seeking_Similarities_Over_Differences_Similarity-Based_Domain_Alignment_for_Adaptive_Object_ICCV_2021_paper.pdf", @@ -36680,7 +39160,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rezaeianaran_Seeking_Similarities_Over_Differences_Similarity-Based_Domain_Alignment_for_Adaptive_Object_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rezaeianaran_Seeking_Similarities_Over_Differences_Similarity-Based_Domain_Alignment_for_Adaptive_Object_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rezaeianaran_2021_ICCV,\n \n author = {\n Rezaeianaran,\n Farzaneh and Shetty,\n Rakshith and Aljundi,\n Rahaf and Reino,\n Daniel Olmeda and Zhang,\n Shanshan and Schiele,\n Bernt\n},\n title = {\n Seeking Similarities Over Differences: Similarity-Based Domain Alignment for Adaptive Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9204-9213\n} \n}" }, { "title": "Segmentation-Grounded Scene Graph Generation", @@ -36688,6 +39169,7 @@ "status": "Poster", "track": "main", "pid": 6324, + "author_site": "Siddhesh Khandelwal; Mohammed Suhail; Leonid Sigal", "author": "Siddhesh Khandelwal; Mohammed Suhail; Leonid Sigal", "abstract": "Scene graph generation has emerged as an important problem in computer vision. While scene graphs provide a grounded representation of objects, their locations and relations in an image, they do so only at the granularity of proposal bounding boxes. 
In this work, we propose the first, to our knowledge, framework for pixel-level segmentation-grounded scene graph generation. Our framework is agnostic to the underlying scene graph generation method and address the lack of segmentation annotations in target scene graph datasets (e.g., Visual Genome) through transfer and multi-task learning from, and with, an auxiliary dataset (e.g., MS COCO). Specifically, each target object being detected is endowed with a segmentation mask, which is expressed as a lingual-similarity weighted linear combination over categories that have annotations present in an auxiliary dataset. These inferred masks, along with a Gaussian masking mechanism which grounds the relations at a pixel-level within the image, allow for improved relation prediction. The entire framework is end-to-end trainable and is learned in a multi-task manner.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Khandelwal_Segmentation-Grounded_Scene_Graph_Generation_ICCV_2021_paper.pdf", @@ -36711,7 +39193,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Vancouver;", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Khandelwal_2021_ICCV,\n \n author = {\n Khandelwal,\n Siddhesh and Suhail,\n Mohammed and Sigal,\n Leonid\n},\n title = {\n Segmentation-Grounded Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15879-15889\n} \n}" }, { "title": "Segmenter: Transformer for Semantic Segmentation", @@ -36719,10 +39202,11 @@ "status": "Poster", "track": "main", "pid": 6249, + "author_site": "Robin Strudel; Ricardo Garcia; Ivan Laptev; Cordelia Schmid", "author": "Robin Strudel; Ricardo Garcia; Ivan Laptev; Cordelia Schmid", "abstract": "Image segmentation is often ambiguous at the level of individual image patches 
and requires contextual information to reach label consensus. In this paper we introduce Segmenter, a transformer model for semantic segmentation. In contrast to convolution-based methods, our approach allows to model global context already at the first layer and throughout the network. We build on the recent Vision Transformer (ViT) and extend it to semantic segmentation. To do so, we rely on the output embeddings corresponding to image patches and obtain class labels from these embeddings with a point-wise linear decoder or a mask transformer decoder. We leverage models pre-trained for image classification and show that we can fine-tune them on moderate sized datasets available for semantic segmentation. The linear decoder allows to obtain excellent results already, but the performance can be further improved by a mask transformer generating class masks. We conduct an extensive ablation study to show the impact of the different parameters, in particular the performance is better for large models and small patch sizes. Segmenter attains excellent results for semantic segmentation. 
It outperforms the state of the art on both ADE20K and Pascal Context datasets and is competitive on Cityscapes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Strudel_Segmenter_Transformer_for_Semantic_Segmentation_ICCV_2021_paper.pdf", - "aff": "Inria\u2020; Inria\u2020; Inria\u2020; Inria\u2020", + "aff": "Inria†; Inria†; Inria†; Inria†", "project": "", "github": "https://github.com/rstrudel/segmenter", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Strudel_Segmenter_Transformer_for_ICCV_2021_supplemental.pdf", @@ -36735,14 +39219,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Strudel_Segmenter_Transformer_for_Semantic_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "INRIA", + "aff_unique_norm": "Inria", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "Inria", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Strudel_2021_ICCV,\n \n author = {\n Strudel,\n Robin and Garcia,\n Ricardo and Laptev,\n Ivan and Schmid,\n Cordelia\n},\n title = {\n Segmenter: Transformer for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7262-7272\n} \n}" }, { "title": "Selective Feature Compression for Efficient Activity Recognition Inference", @@ -36750,6 +39235,7 @@ "status": "Poster", "track": "main", "pid": 7356, + "author_site": "Chunhui Liu; Xinyu Li; Hao Chen; Davide Modolo; Joseph Tighe", "author": "Chunhui Liu; Xinyu Li; Hao Chen; Davide Modolo; Joseph Tighe", "abstract": "Most action recognition solutions rely on dense sampling to precisely cover the informative temporal clip. 
Extensively searching temporal region is expensive for a real-world application. In this work, we focus on improving the inference efficiency of current action recognition backbones on trimmed videos, and illustrate that one action model can also cover then informative region by dropping non-informative features. We present Selective Feature Compression (SFC), an action recognition inference strategy that greatly increase model inference efficiency without any accuracy compromise. Differently from previous works that compress kernel sizes and decrease the channel dimension, we propose to compress feature flow at spatio-temporal dimension without changing any backbone parameters. Our experiments on Kinetics-400, UCF101 and ActivityNet show that SFC is able to reduce inference speed by 6-7x and memory usage by 5-6x compared with the commonly used 30 crops dense sampling procedure, while also slightly improving Top1 Accuracy. We thoroughly quantitatively and qualitatively evaluate SFC and all its components and show how does SFC learn to attend to important video regions and to drop temporal features that are uninformative for the task of action recognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Selective_Feature_Compression_for_Efficient_Activity_Recognition_Inference_ICCV_2021_paper.pdf", @@ -36766,14 +39252,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Selective_Feature_Compression_for_Efficient_Activity_Recognition_Inference_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Amazon Web Services", + "aff_unique_norm": "Amazon Web Services", + "aff_unique_dep": "", "aff_unique_url": "https://aws.amazon.com", "aff_unique_abbr": "AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Chunhui and Li,\n Xinyu and Chen,\n Hao and Modolo,\n Davide and Tighe,\n Joseph\n},\n title = {\n Selective Feature Compression for Efficient Activity Recognition Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13628-13637\n} \n}" }, { "title": "Self Supervision to Distillation for Long-Tailed Visual Recognition", @@ -36781,6 +39268,7 @@ "status": "Poster", "track": "main", "pid": 8355, + "author_site": "Tianhao Li; Limin Wang; Gangshan Wu", "author": "Tianhao Li; Limin Wang; Gangshan Wu", "abstract": "Deep learning has achieved remarkable progress for visual recognition on large-scale balanced datasets but still performs poorly on real-world long-tailed data. Previous methods often adopt class re-balanced training strategies to effectively alleviate the imbalance issue, but might be a risk of over-fitting tail classes. The recent decoupling method overcomes over-fitting issues by using a multi-stage training scheme, yet, it is still incapable of capturing tail class information in the feature learning stage. In this paper, we show that soft label can serve as a powerful solution to incorporate label correlation into a multi-stage training scheme for long-tailed recognition. The intrinsic relation between classes embodied by soft labels turns out to be helpful for long-tailed recognition by transferring knowledge from head to tail classes. Specifically, we propose a conceptually simple yet particularly effective multi-stage training scheme, termed as Self Supervised to Distillation (SSD). This scheme is composed of two parts. First, we introduce a self-distillation framework for long-tailed recognition, which can mine the label relation automatically. Second, we present a new distillation label generation module guided by self-supervision. 
The distilled labels integrate information from both label and data domains that can model long-tailed distribution effectively. We conduct extensive experiments and our method achieves the state-of-the-art results on three long-tailed recognition benchmarks: ImageNet-LT, CIFAR100-LT and iNaturalist 2018. Our SSD outperforms the strong LWS baseline by from 2.7% to 4.5% on various datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Self_Supervision_to_Distillation_for_Long-Tailed_Visual_Recognition_ICCV_2021_paper.pdf", @@ -36804,7 +39292,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Tianhao and Wang,\n Limin and Wu,\n Gangshan\n},\n title = {\n Self Supervision to Distillation for Long-Tailed Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 630-639\n} \n}" }, { "title": "Self-Born Wiring for Neural Trees", @@ -36812,6 +39301,7 @@ "status": "Poster", "track": "main", "pid": 2972, + "author_site": "Ying Chen; Feng Mao; Jie Song; Xinchao Wang; Huiqiong Wang; Mingli Song", "author": "Ying Chen; Feng Mao; Jie Song; Xinchao Wang; Huiqiong Wang; Mingli Song", "abstract": "Neural trees aim at integrating deep neural networks and decision trees so as to bring the best of the two worlds, including representation learning from the former and faster inference from the latter. In this paper, we introduce a novel approach, termed as Self-born Wiring (SeBoW), to learn neural trees from a mother deep neural network. 
In contrast to prior neural-tree approaches that either adopt a pre-defined structure or grow hierarchical layers in a progressive manner, task-adaptive neural trees in SeBoW evolve from a deep neural network through a construction-by-destruction process, enabling a global-level parameter optimization that further yields favorable results. Specifically, given a designated network configuration like VGG, SeBoW disconnects all the layers and derives isolated filter groups, based on which a global-level wiring process is conducted to attach a subset of filter groups, eventually bearing a lightweight neural tree. Extensive experiments demonstrate that, with a lower computational cost, SeBoW outperforms all prior neural trees by a significant margin and even achieves results on par with predominant non-tree networks like ResNets. Moreover, SeBoW proves its scalability to large-scale datasets like ImageNet, which has been barely explored by prior tree networks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Self-Born_Wiring_for_Neural_Trees_ICCV_2021_paper.pdf", @@ -36835,7 +39325,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Ningbo", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Ying and Mao,\n Feng and Song,\n Jie and Wang,\n Xinchao and Wang,\n Huiqiong and Song,\n Mingli\n},\n title = {\n Self-Born Wiring for Neural Trees\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5047-5056\n} \n}" }, { "title": "Self-Calibrating Neural Radiance Fields", @@ -36843,6 +39334,7 @@ "status": "Poster", "track": "main", "pid": 2605, + "author_site": "Yoonwoo Jeong; Seokjun Ahn; Christopher Choy; Anima Anandkumar; Minsu Cho; Jaesik Park", "author": "Yoonwoo Jeong; Seokjun Ahn; 
Christopher Choy; Anima Anandkumar; Minsu Cho; Jaesik Park", "abstract": "In this work, we propose a camera self-calibration algorithm for generic cameras with arbitrary non-linear distortions. We jointly learn the geometry of the scene and the accurate camera parameters without any calibration objects. Our camera model consists of a pinhole model, a fourth order radial distortion, and a generic noise model that can learn arbitrary non-linear camera distortions. While traditional self-calibration algorithms mostly rely on geometric constraints, we additionally incorporate photometric consistency. This requires learning the geometry of the scene, and we use Neural Radiance Fields (NeRF). We also propose a new geometric loss function, viz., projected ray distance loss, to incorporate geometric consistency for complex non-linear camera models. We validate our approach on standard real image datasets and demonstrate that our model can learn the camera intrinsics and extrinsics (pose) from scratch without COLMAP initialization. 
Also, we show that learning accurate camera models in a differentiable manner allows us to improve PSNR over baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jeong_Self-Calibrating_Neural_Radiance_Fields_ICCV_2021_paper.pdf", @@ -36857,7 +39349,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jeong_Self-Calibrating_Neural_Radiance_Fields_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jeong_Self-Calibrating_Neural_Radiance_Fields_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Jeong_2021_ICCV,\n \n author = {\n Jeong,\n Yoonwoo and Ahn,\n Seokjun and Choy,\n Christopher and Anandkumar,\n Anima and Cho,\n Minsu and Park,\n Jaesik\n},\n title = {\n Self-Calibrating Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5846-5854\n} \n}" }, { "title": "Self-Conditioned Probabilistic Learning of Video Rescaling", @@ -36865,6 +39358,7 @@ "status": "Poster", "track": "main", "pid": 4155, + "author_site": "Yuan Tian; Guo Lu; Xiongkuo Min; Zhaohui Che; Guangtao Zhai; Guodong Guo; Zhiyong Gao", "author": "Yuan Tian; Guo Lu; Xiongkuo Min; Zhaohui Che; Guangtao Zhai; Guodong Guo; Zhiyong Gao", "abstract": "Bicubic downscaling is a prevalent technique used to reduce the video storage burden or to accelerate the downstream processing speed. However, the inverse upscaling step is non-trivial, and the downscaled video may also deteriorate the performance of downstream tasks. In this paper, we propose a self-conditioned probabilistic framework for video rescaling to learn the paired downscaling and upscaling procedures simultaneously. 
During the training, we decrease the entropy of the information lost in the downscaling by maximizing its probability conditioned on the strong spatial-temporal prior information within the downscaled video. After optimization, the downscaled video by our framework preserves more meaningful information, which is beneficial for both the upscaling step and the downstream tasks, e.g., video action recognition task. We further extend the framework to a lossy video compression system, in which a gradient estimator for non-differential industrial lossy codecs is proposed for the end-to-end training of the whole system. Extensive experimental results demonstrate the superiority and effectiveness of our approach on video rescaling, video compression, and efficient action recognition tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tian_Self-Conditioned_Probabilistic_Learning_of_Video_Rescaling_ICCV_2021_paper.pdf", @@ -36881,14 +39375,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tian_Self-Conditioned_Probabilistic_Learning_of_Video_Rescaling_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;0;2;0", - "aff_unique_norm": "Shanghai Jiao Tong University;Beijing Institute of Technology;Baidu", - "aff_unique_dep": ";;Baidu, Inc.", + "aff_unique_norm": "Shanghai Jiao Tong University;Beijing Institute of Technology;Baidu, Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.bit.edu.cn/;https://www.baidu.com", "aff_unique_abbr": "SJTU;BIT;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tian_2021_ICCV,\n \n author = {\n Tian,\n Yuan and Lu,\n Guo and Min,\n Xiongkuo and Che,\n Zhaohui and Zhai,\n Guangtao and Guo,\n Guodong and Gao,\n Zhiyong\n},\n title = {\n Self-Conditioned Probabilistic Learning of Video Rescaling\n},\n booktitle = 
{\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4490-4499\n} \n}" }, { "title": "Self-Knowledge Distillation With Progressive Refinement of Targets", @@ -36896,6 +39391,7 @@ "status": "Poster", "track": "main", "pid": 8123, + "author_site": "Kyungyul Kim; ByeongMoon Ji; Doyoung Yoon; Sangheum Hwang", "author": "Kyungyul Kim; ByeongMoon Ji; Doyoung Yoon; Sangheum Hwang", "abstract": "The generalization capability of deep neural networks has been substantially improved by applying a wide spectrum of regularization methods, e.g., restricting function space, injecting randomness during training, augmenting data, etc. In this work, we propose a simple yet effective regularization method named progressive self-knowledge distillation (PS-KD), which progressively distills a model's own knowledge to soften hard targets (i.e., one-hot vectors) during training. Hence, it can be interpreted within a framework of knowledge distillation as a student becomes a teacher itself. Specifically, targets are adjusted adaptively by combining the ground-truth and past predictions from the model itself. We show that PS-KD provides an effect of hard example mining by rescaling gradients according to difficulty in classifying examples. The proposed method is applicable to any supervised learning tasks with hard targets and can be easily combined with existing regularization methods to further enhance the generalization performance. Furthermore, it is confirmed that PS-KD achieves not only better accuracy, but also provides high quality of confidence estimates in terms of calibration as well as ordinal ranking. 
Extensive experimental results on three different tasks, image classification, object detection, and machine translation, demonstrate that our method consistently improves the performance of the state-of-the-art baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Self-Knowledge_Distillation_With_Progressive_Refinement_of_Targets_ICCV_2021_paper.pdf", @@ -36912,14 +39408,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Kim_Self-Knowledge_Distillation_With_Progressive_Refinement_of_Targets_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "LG;Seoul National University of Science and Technology", + "aff_unique_norm": "LG CNS;Seoul National University of Science and Technology", "aff_unique_dep": "AI Research;", "aff_unique_url": "https://www.lgcns.com;https://www.snust.ac.kr", "aff_unique_abbr": "LG CNS;SNUST", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Kyungyul and Ji,\n ByeongMoon and Yoon,\n Doyoung and Hwang,\n Sangheum\n},\n title = {\n Self-Knowledge Distillation With Progressive Refinement of Targets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6567-6576\n} \n}" }, { "title": "Self-Motivated Communication Agent for Real-World Vision-Dialog Navigation", @@ -36927,10 +39424,11 @@ "status": "Poster", "track": "main", "pid": 3762, + "author_site": "Yi Zhu; Yue Weng; Fengda Zhu; Xiaodan Liang; Qixiang Ye; Yutong Lu; Jianbin Jiao", "author": "Yi Zhu; Yue Weng; Fengda Zhu; Xiaodan Liang; Qixiang Ye; Yutong Lu; Jianbin Jiao", "abstract": "Vision-Dialog Navigation (VDN) requires an agent to ask questions and navigate following the human responses to find target 
objects. Conventional approaches are only allowed to ask questions at predefined locations, which are built upon expensive dialogue annotations, and inconvenience the real-word human-robot communication and cooperation. In this paper, we propose a Self-Motivated Communication Agent (SCoA) that learns whether and what to communicate with human adaptively to acquire instructive information for realizing dialogue annotation-free navigation and enhancing the transferability in real-world unseen environment. Specifically, we introduce a whether-to-ask (WeTA) policy, together with uncertainty of which action to choose, to indicate whether the agent should ask a question. Then, a what-to-ask (WaTA) policy is proposed, in which, along with the oracle's answers, the agent learns to score question candidates so as to pick up the most informative one for navigation, and meanwhile mimic oracle's answering. Thus, the agent can navigate in a self-Q&A manner even in real-world environment where the human assistance is often unavailable. Through joint optimization of communication and navigation in a unified imitation learning and reinforcement learning framework, SCoA asks a question if necessary and obtains a hint for guiding the agent to move towards the target with less communication cost. 
Experiments on seen and unseen environments demonstrate that SCoA shows not only superior performance over existing baselines without dialog annotations, but also competing results compared with rich dialog annotations based counterparts.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Self-Motivated_Communication_Agent_for_Real-World_Vision-Dialog_Navigation_ICCV_2021_paper.pdf", - "aff": "Noah\u2019s Ark Lab, Huawei Technologies; Sun Yat-sen University; Monash University; Sun Yat-sen University; University of Chinese Academy of Sciences; Sun Yat-sen University; University of Chinese Academy of Sciences", + "aff": "Noah’s Ark Lab, Huawei Technologies; Sun Yat-sen University; Monash University; Sun Yat-sen University; University of Chinese Academy of Sciences; Sun Yat-sen University; University of Chinese Academy of Sciences", "project": "", "github": "", "supp": "", @@ -36943,14 +39441,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_Self-Motivated_Communication_Agent_for_Real-World_Vision-Dialog_Navigation_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;1;3;1;3", - "aff_unique_norm": "Huawei;Sun Yat-sen University;Monash University;University of Chinese Academy of Sciences", - "aff_unique_dep": "Noah\u2019s Ark Lab;;;", + "aff_unique_norm": "Huawei Technologies;Sun Yat-sen University;Monash University;University of Chinese Academy of Sciences", + "aff_unique_dep": "Noah’s Ark Lab;;;", "aff_unique_url": "https://www.huawei.com;http://www.sysu.edu.cn/;https://www.monash.edu;http://www.ucas.ac.cn", "aff_unique_abbr": "Huawei;SYSU;Monash;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Yi and Weng,\n Yue and Zhu,\n Fengda and Liang,\n Xiaodan and Ye,\n Qixiang and Lu,\n Yutong and Jiao,\n 
Jianbin\n},\n title = {\n Self-Motivated Communication Agent for Real-World Vision-Dialog Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1594-1603\n} \n}" }, { "title": "Self-Mutating Network for Domain Adaptive Segmentation in Aerial Images", @@ -36958,6 +39457,7 @@ "status": "Poster", "track": "main", "pid": 8537, + "author_site": "Kyungsu Lee; Haeyun Lee; Jae Youn Hwang", "author": "Kyungsu Lee; Haeyun Lee; Jae Youn Hwang", "abstract": "The domain-adaptive semantic segmentation in aerial images by a deep-learning technique remains a challenge owing to the domain gaps caused by a resolution, image sensors, time-zone, the density of buildings, and even building styles of each city. Currently, convolutional neural network (CNN)-based domain adaptation methodologies have been developed to decrease the domain gaps, but, they have shown still poor performance to utilize multiple aerial images in different domains. In this paper, therefore, the CNN-based network denoted as Self-Mutating Network, which changes the values of parameters of convolutional filters itself according to the domain of input image, is proposed. By adopting Parameter Mutation to change the values of parameters and Parameter Fluctuation to randomly convulse the parameters, the network self-changes and fine-tunes the parameters, then achieves better predictions of a domain-adaptive segmentation. 
Through the ablation study of the Self-Mutating Network, we concluded that the Self-Mutating Network can be utilized in the domain-adaptive semantic segmentation of aerial images in different domains.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Self-Mutating_Network_for_Domain_Adaptive_Segmentation_in_Aerial_Images_ICCV_2021_paper.pdf", @@ -36981,7 +39481,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Kyungsu and Lee,\n Haeyun and Hwang,\n Jae Youn\n},\n title = {\n Self-Mutating Network for Domain Adaptive Segmentation in Aerial Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7068-7077\n} \n}" }, { "title": "Self-Mutual Distillation Learning for Continuous Sign Language Recognition", @@ -36989,6 +39490,7 @@ "status": "Poster", "track": "main", "pid": 6107, + "author_site": "Aiming Hao; Yuecong Min; Xilin Chen", "author": "Aiming Hao; Yuecong Min; Xilin Chen", "abstract": "In recent years, deep learning moves video-based Continuous Sign Language Recognition (CSLR) significantly forward. Currently, a typical network combination for CSLR includes a visual module, which focuses on spatial and short-temporal information, followed by a contextual module, which focuses on long-temporal information, and the Connectionist Temporal Classification (CTC) loss is adopted to train the network. However, due to the limitation of chain rules in back-propagation, the visual module is hard to adjust for seeking optimized visual features. As a result, it enforces that the contextual module focuses on contextual information optimization only rather than balancing efficient visual and contextual information. 
In this paper, we propose a Self-Mutual Knowledge Distillation (SMKD) method, which enforces the visual and contextual modules to focus on short-term and long-term information and enhances the discriminative power of both modules simultaneously. Specifically, the visual and contextual modules share the weights of their corresponding classifiers, and train with CTC loss simultaneously. Moreover, the spike phenomenon widely exists with CTC loss. Although it can help us choose a few of the key frames of a gloss, it does drop other frames in a gloss and makes the visual feature saturation in the early stage. A gloss segmentation is developed to relieve the spike phenomenon and decrease saturation in the visual module. We conduct experiments on two CSLR benchmarks: PHOENIX14 and PHOENIX14-T. Experimental results demonstrate the effectiveness of the SMKD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hao_Self-Mutual_Distillation_Learning_for_Continuous_Sign_Language_Recognition_ICCV_2021_paper.pdf", @@ -37012,7 +39514,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hao_2021_ICCV,\n \n author = {\n Hao,\n Aiming and Min,\n Yuecong and Chen,\n Xilin\n},\n title = {\n Self-Mutual Distillation Learning for Continuous Sign Language Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11303-11312\n} \n}" }, { "title": "Self-Regulation for Semantic Segmentation", @@ -37020,6 +39523,7 @@ "status": "Poster", "track": "main", "pid": 6670, + "author_site": "Dong Zhang; Hanwang Zhang; Jinhui Tang; Xian-Sheng Hua; Qianru Sun", "author": "Dong Zhang; Hanwang Zhang; Jinhui Tang; Xian-Sheng Hua; Qianru Sun", "abstract": "In this paper, we seek reasons for the two major failure 
cases in Semantic Segmentation (SS): 1) missing small objects or minor object parts, and 2) mislabeling minor parts of large objects as wrong classes. We have an interesting finding that Failure-1 is due to the underuse of detailed features and Failure-2 is due to the underuse of visual contexts. To help the model learn a better trade-off, we introduce several Self-Regulation (SR) losses for training SS neural networks. By \"self\", we mean that the losses are from the model per se without using any additional data or supervision. By applying the SR losses, the deep layer features are regulated by the shallow ones to preserve more details; meanwhile, shallow layer classification logits are regulated by the deep ones to capture more semantics. We conduct extensive experiments on both weakly and fully supervised SS tasks, and the results show that our approach consistently surpasses the baselines. We also validate that SR losses are easy to implement in various state-of-the-art SS models, e.g., SPGNet and OCRNet, incurring little computational overhead during training and none for testing", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Self-Regulation_for_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -37034,7 +39538,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Self-Regulation_for_Semantic_Segmentation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Self-Regulation_for_Semantic_Segmentation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Dong and Zhang,\n Hanwang and Tang,\n Jinhui and Hua,\n Xian-Sheng and Sun,\n Qianru\n},\n title = {\n Self-Regulation for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6953-6963\n} \n}" }, { "title": 
"Self-Supervised 3D Face Reconstruction via Conditional Estimation", @@ -37042,6 +39547,7 @@ "status": "Poster", "track": "main", "pid": 9507, + "author_site": "Yandong Wen; Weiyang Liu; Bhiksha Raj; Rita Singh", "author": "Yandong Wen; Weiyang Liu; Bhiksha Raj; Rita Singh", "abstract": "We present a conditional estimation (CEST) framework to learn 3D facial parameters from 2D single-view images by self-supervised training from videos. CEST is based on the process of analysis by synthesis, where the 3D facial parameters (shape, reflectance, viewpoint, and illumination) are estimated from the face image, and then recombined to reconstruct the 2D face image. In order to learn semantically meaningful 3D facial parameters without explicit access to their labels, CEST couples the estimation of different 3D facial parameters by taking their statistical dependency into account. Specifically, the estimation of any 3D facial parameter is not only conditioned on the given image, but also on the facial parameters that have already been derived. Moreover, the reflectance symmetry and consistency among the video frames are adopted to improve the disentanglement of facial parameters. Together with a novel strategy for incorporating the reflectance symmetry and consistency, CEST can be efficiently trained with in-the-wild video clips. 
Both qualitative and quantitative experiments demonstrate the effectiveness of CEST.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wen_Self-Supervised_3D_Face_Reconstruction_via_Conditional_Estimation_ICCV_2021_paper.pdf", @@ -37056,7 +39562,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wen_Self-Supervised_3D_Face_Reconstruction_via_Conditional_Estimation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wen_Self-Supervised_3D_Face_Reconstruction_via_Conditional_Estimation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wen_2021_ICCV,\n \n author = {\n Wen,\n Yandong and Liu,\n Weiyang and Raj,\n Bhiksha and Singh,\n Rita\n},\n title = {\n Self-Supervised 3D Face Reconstruction via Conditional Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13289-13298\n} \n}" }, { "title": "Self-Supervised 3D Hand Pose Estimation From Monocular RGB via Contrastive Learning", @@ -37064,6 +39571,7 @@ "status": "Poster", "track": "main", "pid": 1363, + "author_site": "Adrian Spurr; Aneesh Dahiya; Xi Wang; Xucong Zhang; Otmar Hilliges", "author": "Adrian Spurr; Aneesh Dahiya; Xi Wang; Xucong Zhang; Otmar Hilliges", "abstract": "Encouraged by the success of contrastive learning on image classification tasks, we propose a new self-supervised method for the structured regression task of 3D hand pose estimation. Contrastive learning makes use of unlabeled data for the purpose of representation learning via a loss formulation that encourages the learned feature representations to be invariant under any image transformation. For 3D hand pose estimation, it too is desirable to have invariance to appearance transformation such as color jitter. 
However, the task requires equivariance under affine transformations, such as rotation and translation. To address this issue, we propose an equivariant contrastive objective and demonstrate its effectiveness in the context of 3D hand pose estimation. We experimentally investigate the impact of invariant and equivariant contrastive objectives and show that learning equivariant features leads to better representations for the task of 3D hand pose estimation. Furthermore, we show that standard ResNets with sufficient depth, trained on additional unlabeled data, attain improvements of up to 14.5% in PA-EPE on FreiHAND and thus achieves state-of-the-art performance without any task specific, specialized architectures. Code and models are available at https://ait.ethz.ch/projects/2021/PeCLR", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Spurr_Self-Supervised_3D_Hand_Pose_Estimation_From_Monocular_RGB_via_Contrastive_ICCV_2021_paper.pdf", @@ -37087,7 +39595,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Spurr_2021_ICCV,\n \n author = {\n Spurr,\n Adrian and Dahiya,\n Aneesh and Wang,\n Xi and Zhang,\n Xucong and Hilliges,\n Otmar\n},\n title = {\n Self-Supervised 3D Hand Pose Estimation From Monocular RGB via Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11230-11239\n} \n}" }, { "title": "Self-Supervised 3D Skeleton Action Representation Learning With Motion Consistency and Continuity", @@ -37095,6 +39604,7 @@ "status": "Poster", "track": "main", "pid": 8299, + "author_site": "Yukun Su; Guosheng Lin; Qingyao Wu", "author": "Yukun Su; Guosheng Lin; Qingyao Wu", "abstract": "Recently, self-supervised learning (SSL) has been proved very effective and it can 
help boost the performance in learning representations from unlabeled data in the image domain. Yet, very little is explored about its usefulness in 3D skeleton-based action recognition understanding. Directly applying existing SSL techniques for 3D skeleton learning, however, suffers from trivial solutions and imprecise representations. To tackle these drawbacks, we consider perceiving the consistency and continuity of motion at different playback speeds are two critical issues. To this end, we propose a novel SSL method to learn the 3D skeleton representation in an efficacious way. Specifically, by constructing a positive clip (speed-changed) and a negative clip (motion-broken) of the sampled action sequence, we encourage the positive pairs closer while pushing the negative pairs to force the network to learn the intrinsic dynamic motion consistency information. Moreover, to enhance the learning features, skeleton interpolation is further exploited to model the continuity of human skeleton data. To validate the effectiveness of the proposed method, extensive experiments are conducted on Kinetics, NTU60, NTU120, and PKUMMD datasets with several alternative network architectures. 
Experimental evaluations demonstrate the superiority of our approach and through which, we can gain significant performance improvement without using extra labeled data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Su_Self-Supervised_3D_Skeleton_Action_Representation_Learning_With_Motion_Consistency_and_ICCV_2021_paper.pdf", @@ -37118,7 +39628,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Su_2021_ICCV,\n \n author = {\n Su,\n Yukun and Lin,\n Guosheng and Wu,\n Qingyao\n},\n title = {\n Self-Supervised 3D Skeleton Action Representation Learning With Motion Consistency and Continuity\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13328-13338\n} \n}" }, { "title": "Self-Supervised Cryo-Electron Tomography Volumetric Image Restoration From Single Noisy Volume With Sparsity Constraint", @@ -37126,6 +39637,7 @@ "status": "Poster", "track": "main", "pid": 10045, + "author_site": "Zhidong Yang; Fa Zhang; Renmin Han", "author": "Zhidong Yang; Fa Zhang; Renmin Han", "abstract": "Cryo-Electron Tomography (cryo-ET) is a powerful tool for 3D cellular visualization. Due to instrumental limitations, cryo-ET images and their volumetric reconstruction suffer from extremely low signal-to-noise ratio. In this paper, we propose a novel end-to-end self-supervised learning model, the Sparsity Constrained Network (SC-Net), to restore volumetric image from single noisy data in cryo-ET. The proposed method only requires a single noisy data as training input and no ground-truth is needed in the whole training procedure. A new target function is proposed to preserve both local smoothness and detailed structure. 
Additionally, a novel procedure for the simulation of electron tomographic photographing is designed to help the evaluation of methods. Experiments are done on three simulated data and four real-world data. The results show that our method could produce a strong enhancement for a single very noisy cryo-ET volumetric data, which is much better than the state-of-the-art Noise2Void, and with a competitive performance comparing with Noise2Noise. Code is available at https://github.com/icthrm/SC-Net.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Self-Supervised_Cryo-Electron_Tomography_Volumetric_Image_Restoration_From_Single_Noisy_Volume_ICCV_2021_paper.pdf", @@ -37149,7 +39661,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Zhidong and Zhang,\n Fa and Han,\n Renmin\n},\n title = {\n Self-Supervised Cryo-Electron Tomography Volumetric Image Restoration From Single Noisy Volume With Sparsity Constraint\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4056-4065\n} \n}" }, { "title": "Self-Supervised Domain Adaptation for Forgery Localization of JPEG Compressed Images", @@ -37157,6 +39670,7 @@ "status": "Poster", "track": "main", "pid": 7306, + "author_site": "Yuan Rao; Jiangqun Ni", "author": "Yuan Rao; Jiangqun Ni", "abstract": "With wide applications of image editing tools, forged images (splicing, copy-move, removal and etc.) have been becoming great public concerns. Although existing image forgery localization methods could achieve fairly good results on several public datasets, most of them perform poorly when the forged images are JPEG compressed as they are usually done in social networks. 
To tackle this issue, in this paper, a self-supervised domain adaptation network, which is composed of a backbone network with Siamese architecture and a compression approximation network (ComNet), is proposed for JPEG-resistant image forgery localization. To improve the performance against JPEG compression, ComNet is customized to approximate the JPEG compression operation through self-supervised learning, generating JPEG-agent images with general JPEG compression characteristics. The backbone network is then trained with domain adaptation strategy to localize the tampering boundary and region, and alleviate the domain shift between uncompressed and JPEG-agent images. Extensive experimental results on several public datasets show that the proposed method outperforms or rivals to other state-of-the-art methods in image forgery localization, especially for JPEG compression with unknown QFs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rao_Self-Supervised_Domain_Adaptation_for_Forgery_Localization_of_JPEG_Compressed_Images_ICCV_2021_paper.pdf", @@ -37172,15 +39686,16 @@ "email": "mail2.sysu.edu.cn;mail.sysu.edu.cn", "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rao_Self-Supervised_Domain_Adaptation_for_Forgery_Localization_of_JPEG_Compressed_Images_ICCV_2021_paper.html", - "aff_unique_index": "0;0+0", - "aff_unique_norm": "Sun Yat-sen University", - "aff_unique_dep": "School of Electronics and Information Technology", - "aff_unique_url": "http://www.sysu.edu.cn", - "aff_unique_abbr": "SYSU", + "aff_unique_index": "0;0+1", + "aff_unique_norm": "Sun Yat-Sen University;Sun Yat-sen University", + "aff_unique_dep": "School of Electronics and Information Technology;Guangdong Provincial Key Laboratory of Information Security", + "aff_unique_url": "http://www.sysu.edu.cn;http://www.sysu.edu.cn", + "aff_unique_abbr": "SYSU;SYSU", "aff_campus_unique_index": "0;0+0", "aff_campus_unique": "Guangzhou", "aff_country_unique_index": 
"0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Rao_2021_ICCV,\n \n author = {\n Rao,\n Yuan and Ni,\n Jiangqun\n},\n title = {\n Self-Supervised Domain Adaptation for Forgery Localization of JPEG Compressed Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15034-15043\n} \n}" }, { "title": "Self-Supervised Geometric Features Discovery via Interpretable Attention for Vehicle Re-Identification and Beyond", @@ -37188,6 +39703,7 @@ "status": "Poster", "track": "main", "pid": 5524, + "author_site": "Ming Li; Xinming Huang; Ziming Zhang", "author": "Ming Li; Xinming Huang; Ziming Zhang", "abstract": "To learn distinguishable patterns, most of recent works in vehicle re-identification (ReID) struggled to redevelop official benchmarks to provide various supervisions, which requires prohibitive human labors. In this paper, we seek to achieve the similar goal but do not involve more human efforts. To this end, we introduce a novel framework, which successfully encodes both geometric local features and global representations to distinguish vehicle instances, optimized only by the supervision from official ID labels. Specifically, given our insight that objects in ReID share similar geometric characteristics, we propose to borrow self-supervised representation learning to facilitate geometric features discovery. To condense these features, we introduce an interpretable attention module, with the core of local maxima aggregation instead of fully automatic learning, whose mechanism is completely understandable and whose response map is physically reasonable. To the best of our knowledge, we are the first that perform self-supervised learning to discover geometric features. We conduct comprehensive experiments on three most popular datasets for vehicle ReID, i.e., VeRi-776, CityFlow-ReID, and VehicleID. 
We report our state-of-the-art (SOTA) performances and promising visualization results. We also show the excellent scalability of our approach on other ReID related tasks, i.e., person ReID and multi-target multi-camera (MTMC) vehicle tracking.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Self-Supervised_Geometric_Features_Discovery_via_Interpretable_Attention_for_Vehicle_Re-Identification_ICCV_2021_paper.pdf", @@ -37211,7 +39727,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Ming and Huang,\n Xinming and Zhang,\n Ziming\n},\n title = {\n Self-Supervised Geometric Features Discovery via Interpretable Attention for Vehicle Re-Identification and Beyond\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 194-204\n} \n}" }, { "title": "Self-Supervised Image Prior Learning With GMM From a Single Noisy Image", @@ -37219,6 +39736,7 @@ "status": "Poster", "track": "main", "pid": 2309, + "author_site": "Haosen Liu; Xuan Liu; Jiangbo Lu; Shan Tan", "author": "Haosen Liu; Xuan Liu; Jiangbo Lu; Shan Tan", "abstract": "The lack of clean images undermines the practicability of supervised image prior learning methods, of which the training schemes require a large number of clean images. To free image prior learning from the image collection burden, a novel Self-Supervised learning method for Gaussian Mixture Model (SS-GMM) is proposed in this paper. It can simultaneously achieve the noise level estimation and the image prior learning directly from only a single noisy image. This work is derived from our study on eigenvalues of the GMM's covariance matrix. 
Through statistical experiments and theoretical analysis, we conclude that (1) covariance eigenvalues for clean images hold the sparsity; and that (2) those for noisy images contain sufficient information for noise estimation. The first conclusion inspires us to impose a sparsity constraint on covariance eigenvalues during the learning process to suppress the influence of noise. The second conclusion leads to a self-contained noise estimation module of high accuracy in our proposed method. This module serves to estimate the noise level and automatically determine the specific level of the sparsity constraint. Our final derived method requires only minor modifications to the standard expectation-maximization algorithm. This makes it easy to implement. Very interestingly, the GMM learned via our proposed self-supervised learning method can even achieve better image denoising performance than its supervised counterpart, i.e., the EPLL. Also, it is on par with the state-of-the-art self-supervised deep learning method, i.e., the Self2Self. 
Code is available at https://github.com/HUST-Tan/SS-GMM.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Self-Supervised_Image_Prior_Learning_With_GMM_From_a_Single_Noisy_ICCV_2021_paper.pdf", @@ -37242,7 +39760,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Haosen and Liu,\n Xuan and Lu,\n Jiangbo and Tan,\n Shan\n},\n title = {\n Self-Supervised Image Prior Learning With GMM From a Single Noisy Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2845-2854\n} \n}" }, { "title": "Self-Supervised Monocular Depth Estimation for All Day Images Using Domain Separation", @@ -37250,6 +39769,7 @@ "status": "Poster", "track": "main", "pid": 4195, + "author_site": "Lina Liu; Xibin Song; Mengmeng Wang; Yong Liu; Liangjun Zhang", "author": "Lina Liu; Xibin Song; Mengmeng Wang; Yong Liu; Liangjun Zhang", "abstract": "Remarkable results have been achieved by DCNN based self-supervised depth estimation approaches. However, most of these approaches can only handle either day-time or night-time images, while their performance degrades for all-day images due to large domain shift and the variation of illumination between day and night images. To relieve these limitations, we propose a domain-separated network for self-supervised depth estimation of all-day images. Specifically, to relieve the negative influence of disturbing terms (illumination, etc.), we partition the information of day and night image pairs into two complementary sub-spaces: private and invariant domains, where the former contains the unique information (illumination, etc.) of day and night images and the latter contains essential shared information (texture, etc.). 
Meanwhile, to guarantee that the day and night images contain the same information, the domain-separated network takes the day-time images and corresponding night-time images (generated by GAN) as input, and the private and invariant feature extractors are learned by orthogonality and similarity loss, where the domain gap can be alleviated, thus better depth maps can be expected. Meanwhile, the reconstruction and photometric losses are utilized to estimate complementary information and depth maps effectively. Experimental results demonstrate that our approach achieves state-of-the-art depth estimation results for all-day images on the challenging Oxford RobotCar dataset, proving the superiority of our proposed approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Self-Supervised_Monocular_Depth_Estimation_for_All_Day_Images_Using_Domain_ICCV_2021_paper.pdf", @@ -37266,14 +39786,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Self-Supervised_Monocular_Depth_Estimation_for_All_Day_Images_Using_Domain_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;1+2;0;0+0;1+2", - "aff_unique_norm": "Zhejiang University;Baidu;National Engineering Laboratory of Deep Learning Technology and Application", - "aff_unique_dep": "Institute of Cyber-Systems and Control;Baidu Research;", + "aff_unique_norm": "Zhejiang University;Baidu Research;National Engineering Laboratory of Deep Learning Technology and Application", + "aff_unique_dep": "Institute of Cyber-Systems and Control;;", "aff_unique_url": "http://www.zju.edu.cn;https://research.baidu.com;", "aff_unique_abbr": "ZJU;Baidu;", "aff_campus_unique_index": ";;1;", "aff_campus_unique": ";Huzhou", "aff_country_unique_index": "0+0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Lina and Song,\n Xibin and Wang,\n Mengmeng and Liu,\n Yong and Zhang,\n Liangjun\n},\n title = {\n 
Self-Supervised Monocular Depth Estimation for All Day Images Using Domain Separation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12737-12746\n} \n}" }, { "title": "Self-Supervised Neural Networks for Spectral Snapshot Compressive Imaging", @@ -37281,6 +39802,7 @@ "status": "Poster", "track": "main", "pid": 8206, + "author_site": "Ziyi Meng; Zhenming Yu; Kun Xu; Xin Yuan", "author": "Ziyi Meng; Zhenming Yu; Kun Xu; Xin Yuan", "abstract": "We consider using untrained neural networks to solve the reconstruction problem of snapshot compressive imaging (SCI), which uses a two-dimensional (2D) detector to capture a high-dimensional (usually 3D) data-cube in a compressed manner. Various SCI systems have been built in recent years to capture data such as high-speed videos, hyperspectral images, and the state-of-the-art reconstruction is obtained by the deep neural networks. However, most of these networks are trained in an end-to-end manner by a large amount of corpus with sometimes simulated ground truth, measurement pairs. In this paper, inspired by the untrained neural networks such as deep image priors (DIP) and deep decoders, we develop a framework by integrating DIP into the plug-and-play regime, leading to a self-supervised network for spectral SCI reconstruction. Extensive synthetic and real data results show that the proposed algorithm without training is capable of achieving competitive results to the training based networks. 
Furthermore, by integrating the proposed method with a pre-trained deep denoising prior, we have achieved higher performance than existing state-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meng_Self-Supervised_Neural_Networks_for_Spectral_Snapshot_Compressive_Imaging_ICCV_2021_paper.pdf", @@ -37304,7 +39826,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Meng_2021_ICCV,\n \n author = {\n Meng,\n Ziyi and Yu,\n Zhenming and Xu,\n Kun and Yuan,\n Xin\n},\n title = {\n Self-Supervised Neural Networks for Spectral Snapshot Compressive Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2622-2631\n} \n}" }, { "title": "Self-Supervised Object Detection via Generative Image Synthesis", @@ -37312,6 +39835,7 @@ "status": "Poster", "track": "main", "pid": 1764, + "author_site": "Siva Karthik Mustikovela; Shalini De Mello; Aayush Prakash; Umar Iqbal; Sifei Liu; Thu Nguyen-Phuoc; Carsten Rother; Jan Kautz", "author": "Siva Karthik Mustikovela; Shalini De Mello; Aayush Prakash; Umar Iqbal; Sifei Liu; Thu Nguyen-Phuoc; Carsten Rother; Jan Kautz", "abstract": "We present SSOD -- the first end-to-end analysis-by-synthesis framework with controllable GANs for the task of self-supervised object detection. We use collections of real-world images without bounding box annotations to learn to synthesize and detect objects. We leverage controllable GANs to synthesize images with pre-defined object properties and use them to train object detectors. We propose a tight end-to-end coupling of the synthesis and detection networks to optimally train our system. Finally, we also propose a method to optimally adapt SSOD to an intended target data without requiring labels for it. 
For the task of car detection, on the challenging KITTI and Cityscapes datasets, we show that SSOD outperforms the prior state-of-the-art purely image-based self-supervised object detection method Wetectron. Even without requiring any 3DCAD assets, it also surpasses the state-of-the-art rendering-based method Meta-Sim2. Our work advances the field of self-supervised object detection by introducing a successful new paradigm of using controllable GAN-based image synthesis for it and by significantly improving the base-line accuracy of the task. We open-source our code athttps://github.com/NVlabs/SSOD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mustikovela_Self-Supervised_Object_Detection_via_Generative_Image_Synthesis_ICCV_2021_paper.pdf", @@ -37328,14 +39852,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mustikovela_Self-Supervised_Object_Detection_via_Generative_Image_Synthesis_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1;2;0", - "aff_unique_norm": "NVIDIA;University of Bath;Heidelberg University", - "aff_unique_dep": "NVIDIA Corporation;;", + "aff_unique_norm": "NVIDIA Corporation;University of Bath;Heidelberg University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.nvidia.com;https://www.bath.ac.uk;https://www.uni-heidelberg.de", "aff_unique_abbr": "NVIDIA;Bath;Uni Heidelberg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;2;0", - "aff_country_unique": "United States;United Kingdom;Germany" + "aff_country_unique": "United States;United Kingdom;Germany", + "bibtex": "@InProceedings{Mustikovela_2021_ICCV,\n \n author = {\n Mustikovela,\n Siva Karthik and De Mello,\n Shalini and Prakash,\n Aayush and Iqbal,\n Umar and Liu,\n Sifei and Nguyen-Phuoc,\n Thu and Rother,\n Carsten and Kautz,\n Jan\n},\n title = {\n Self-Supervised Object Detection via Generative Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8609-8618\n} \n}" }, { "title": "Self-Supervised Pretraining of 3D Features on Any Point-Cloud", @@ -37343,6 +39868,7 @@ "status": "Poster", "track": "main", "pid": 3077, + "author_site": "Zaiwei Zhang; Rohit Girdhar; Armand Joulin; Ishan Misra", "author": "Zaiwei Zhang; Rohit Girdhar; Armand Joulin; Ishan Misra", "abstract": "Pretraining on large labeled datasets is a prerequisite to achieve good performance in many computer vision tasks like image recognition, video understanding etc. However, pretraining is not widely used for 3D recognition tasks where state-of-the-art methods train models from scratch. A primary reason is the lack of large annotated datasets because 3D data labelling is time-consuming. Recent work shows that self-supervised learning is useful to pretrain models in 3D but requires multi-view data and point correspondences. We present a simple self-supervised pretraining method that can work with single-view depth scans acquired by varied sensors, without 3D registration and point correspondences. We pretrain standard point cloud and voxel based model architectures, and show that joint pretraining further improves performance. We evaluate our models on 9 benchmarks for object detection, semantic segmentation, and object classification, where they achieve state-of-the-art results. Most notably, we set a new state-of-the-art for object detection on ScanNet (69.0% mAP) and SUNRGBD (63.5% mAP). 
Our pretrained models are label efficient and improve performance for classes with few examples.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Self-Supervised_Pretraining_of_3D_Features_on_Any_Point-Cloud_ICCV_2021_paper.pdf", @@ -37359,14 +39885,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Self-Supervised_Pretraining_of_3D_Features_on_Any_Point-Cloud_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;0", - "aff_unique_norm": "Meta;University of Texas at Austin", + "aff_unique_norm": "Facebook;University of Texas at Austin", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.utexas.edu", "aff_unique_abbr": "FAIR;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Zaiwei and Girdhar,\n Rohit and Joulin,\n Armand and Misra,\n Ishan\n},\n title = {\n Self-Supervised Pretraining of 3D Features on Any Point-Cloud\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10252-10263\n} \n}" }, { "title": "Self-Supervised Product Quantization for Deep Unsupervised Image Retrieval", @@ -37374,6 +39901,7 @@ "status": "Poster", "track": "main", "pid": 8581, + "author_site": "Young Kyun Jang; Nam Ik Cho", "author": "Young Kyun Jang; Nam Ik Cho", "abstract": "Supervised deep learning-based hash and vector quantization are enabling fast and large-scale image retrieval systems. By fully exploiting label annotations, they are achieving outstanding retrieval performances compared to the conventional methods. 
However, it is painstaking to assign labels precisely for a vast amount of training data, and also, the annotation process is error-prone. To tackle these issues, we propose the first deep unsupervised image retrieval method dubbed Self-supervised Product Quantization (SPQ) network, which is label-free and trained in a self-supervised manner. We design a Cross Quantized Contrastive learning strategy that jointly learns codewords and deep visual descriptors by comparing individually transformed images (views). Our method analyzes the image contents to extract descriptive features, allowing us to understand image representations for accurate retrieval. By conducting extensive experiments on benchmarks, we demonstrate that the proposed method yields state-of-the-art results even without supervised pretraining.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jang_Self-Supervised_Product_Quantization_for_Deep_Unsupervised_Image_Retrieval_ICCV_2021_paper.pdf", @@ -37397,7 +39925,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jang_2021_ICCV,\n \n author = {\n Jang,\n Young Kyun and Cho,\n Nam Ik\n},\n title = {\n Self-Supervised Product Quantization for Deep Unsupervised Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12085-12094\n} \n}" }, { "title": "Self-Supervised Real-to-Sim Scene Generation", @@ -37405,6 +39934,7 @@ "status": "Poster", "track": "main", "pid": 6596, + "author_site": "Aayush Prakash; Shoubhik Debnath; Jean-Francois Lafleche; Eric Cameracci; Gavriel State; Stan Birchfield; Marc T. Law", "author": "Aayush Prakash; Shoubhik Debnath; Jean-Francois Lafleche; Eric Cameracci; Gavriel State; Stan Birchfield; Marc T. 
Law", "abstract": "Synthetic data is emerging as a promising solution to the scalability issue of supervised deep learning, especially when real data are difficult to acquire or hard to annotate. Synthetic data generation, however, can itself be prohibitively expensive when domain experts have to manually and painstakingly oversee the process. Moreover, neural networks trained on synthetic data often do not perform well on real data because of the domain gap. To solve these challenges, we propose Sim2SG, a self-supervised automatic scene generation technique for matching the distribution of real data. Importantly, Sim2SG does not require supervision from the real-world dataset, thus making it applicable in situations for which such annotations are difficult to obtain. Sim2SG is designed to bridge both the content and appearance gaps, by matching the content of real data, and by matching the features in the source and target domains. We select scene graph (SG) generation as the downstream task, due to the limited availability of labeled datasets. 
Experiments demonstrate significant improvements over leading baselines in reducing the domain gap both qualitatively and quantitatively, on several synthetic datasets as well as the real-world KITTI dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Prakash_Self-Supervised_Real-to-Sim_Scene_Generation_ICCV_2021_paper.pdf", @@ -37421,14 +39951,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Prakash_Self-Supervised_Real-to-Sim_Scene_Generation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", - "aff_unique_norm": "NVIDIA", - "aff_unique_dep": "NVIDIA Corporation", + "aff_unique_norm": "NVIDIA Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Prakash_2021_ICCV,\n \n author = {\n Prakash,\n Aayush and Debnath,\n Shoubhik and Lafleche,\n Jean-Francois and Cameracci,\n Eric and State,\n Gavriel and Birchfield,\n Stan and Law,\n Marc T.\n},\n title = {\n Self-Supervised Real-to-Sim Scene Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16044-16054\n} \n}" }, { "title": "Self-Supervised Representation Learning From Flow Equivariance", @@ -37436,6 +39967,7 @@ "status": "Poster", "track": "main", "pid": 8051, + "author_site": "Yuwen Xiong; Mengye Ren; Wenyuan Zeng; Raquel Urtasun", "author": "Yuwen Xiong; Mengye Ren; Wenyuan Zeng; Raquel Urtasun", "abstract": "Self-supervised representation learning is able to learn semantically meaningful features; however, much of its recent success relies on multiple crops of an image with very few objects. 
Instead of learning view-invariant representation from simple images, humans learn representations in a complex world with changing scenes by observing object movement, deformation, pose variation, and ego motion. Motivated by this ability, we present a new self-supervised learning representation framework that can be directly deployed on a video stream of complex scenes with many moving objects. Our framework features a simple flow equivariance objective that encourages the network to predict the features of another frame by applying a flow transformation to the features of the current frame. Our representations, learned from high-resolution raw video, can be readily used for downstream tasks on static images. Readout experiments on challenging semantic segmentation, instance segmentation, and object detection benchmarks show that we are able to outperform representations obtained from previous state-of-the-art methods including SimCLR and BYOL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiong_Self-Supervised_Representation_Learning_From_Flow_Equivariance_ICCV_2021_paper.pdf", @@ -37450,7 +39982,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xiong_Self-Supervised_Representation_Learning_From_Flow_Equivariance_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xiong_Self-Supervised_Representation_Learning_From_Flow_Equivariance_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xiong_2021_ICCV,\n \n author = {\n Xiong,\n Yuwen and Ren,\n Mengye and Zeng,\n Wenyuan and Urtasun,\n Raquel\n},\n title = {\n Self-Supervised Representation Learning From Flow Equivariance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10191-10200\n} \n}" }, { "title": "Self-Supervised Transfer Learning for Hand Mesh Recovery From Binocular 
Images", @@ -37458,6 +39991,7 @@ "status": "Poster", "track": "main", "pid": 7210, + "author_site": "Zheng Chen; Sihan Wang; Yi Sun; Xiaohong Ma", "author": "Zheng Chen; Sihan Wang; Yi Sun; Xiaohong Ma", "abstract": "Traditional methods for RGB hand mesh recovery usually need to train a separate model for each dataset with the corresponding ground truth and are hardly adapted to new scenarios without the ground truth for supervision. To address the problem, we propose a self-supervised framework for hand mesh estimation, where we pre-learn hand priors from existing hand datasets and transfer the priors to new scenarios without any landmark annotations. The proposed approach takes binocular images as input and mainly relies on left-right consistency constraints including appearance consensus and shape consistency to train the model to estimate the hand mesh in new scenarios. We conduct experiments on the widely used stereo hand dataset, and the experimental results verify that our model can get comparable performance compared with state-of-the-art methods even without the corresponding landmark annotations. To further evaluate our model, we collect a large real binocular dataset. 
The experimental results on the collected real dataset also verify the effectiveness of our model qualitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Self-Supervised_Transfer_Learning_for_Hand_Mesh_Recovery_From_Binocular_Images_ICCV_2021_paper.pdf", @@ -37481,7 +40015,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Zheng and Wang,\n Sihan and Sun,\n Yi and Ma,\n Xiaohong\n},\n title = {\n Self-Supervised Transfer Learning for Hand Mesh Recovery From Binocular Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11626-11634\n} \n}" }, { "title": "Self-Supervised Vessel Segmentation via Adversarial Learning", @@ -37489,10 +40024,11 @@ "status": "Poster", "track": "main", "pid": 6057, + "author_site": "Yuxin Ma; Yang Hua; Hanming Deng; Tao Song; Hao Wang; Zhengui Xue; Heng Cao; Ruhui Ma; Haibing Guan", "author": "Yuxin Ma; Yang Hua; Hanming Deng; Tao Song; Hao Wang; Zhengui Xue; Heng Cao; Ruhui Ma; Haibing Guan", "abstract": "Vessel segmentation is critically essential for diagnosinga series of diseases, e.g., coronary artery disease and retinal disease. However, annotating vessel segmentation maps of medical images is notoriously challenging due to the tiny and complex vessel structures, leading to insufficient available annotated datasets for existing supervised methods and domain adaptation methods. The subtle structures and confusing background of medical images further suppress the efficacy of unsupervised methods. In this paper, we propose a self-supervised vessel segmentation method via adversarial learning. 
Our method learns vessel representations by training an attention-guided generator and a segmentation generator to simultaneously synthesize fake vessels and segment vessels out of coronary angiograms. To support the research, we also build the first X-ray angiography coronary vessel segmentation dataset, named XCAD. We evaluate our method extensively on multiple vessel segmentation datasets, including the XCAD dataset, the DRIVE dataset,and the STARE dataset. The experimental results show our method suppresses unsupervised methods significantly and achieves competitive performance compared with supervised methods and traditional methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ma_Self-Supervised_Vessel_Segmentation_via_Adversarial_Learning_ICCV_2021_paper.pdf", - "aff": "Shanghai Jiao Tong University; Queen\u2019s University Belfast; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Louisiana State University; Shanghai Jiao Tong University; Shanghai General Hospital; Shanghai Jiao Tong University; Shanghai Jiao Tong University", + "aff": "Shanghai Jiao Tong University; Queen’s University Belfast; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Louisiana State University; Shanghai Jiao Tong University; Shanghai General Hospital; Shanghai Jiao Tong University; Shanghai Jiao Tong University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ma_Self-Supervised_Vessel_Segmentation_ICCV_2021_supplemental.pdf", @@ -37512,7 +40048,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;2;0;0;0;0", - "aff_country_unique": "China;United Kingdom;United States" + "aff_country_unique": "China;United Kingdom;United States", + "bibtex": "@InProceedings{Ma_2021_ICCV,\n \n author = {\n Ma,\n Yuxin and Hua,\n Yang and Deng,\n Hanming and Song,\n Tao and Wang,\n Hao and Xue,\n Zhengui and Cao,\n Heng and Ma,\n Ruhui and Guan,\n Haibing\n},\n 
title = {\n Self-Supervised Vessel Segmentation via Adversarial Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7536-7545\n} \n}" }, { "title": "Self-Supervised Video Object Segmentation by Motion Grouping", @@ -37520,6 +40057,7 @@ "status": "Poster", "track": "main", "pid": 2132, + "author_site": "Charig Yang; Hala Lamdouar; Erika Lu; Andrew Zisserman; Weidi Xie", "author": "Charig Yang; Hala Lamdouar; Erika Lu; Andrew Zisserman; Weidi Xie", "abstract": "Animals have evolved highly functional visual systems to understand motion, assisting perception even under complex environments. In this paper, we work towards developing a computer vision system able to segment objects by exploiting motion cues, i.e. motion segmentation. To achieve this, we introduce a simple variant of the Transformer to segment optical flow frames into primary objects and the background, which can be trained in a self-supervised manner, i.e. without using any manual annotations. Despite using only optical flow, and no appearance information, as input, our approach achieves superior results compared to previous state-of-the-art self-supervised methods on public benchmarks (DAVIS2016, SegTrackv2, FBMS59), while being an order of magnitude faster. 
On a challenging camouflage dataset (MoCA), we significantly outperform other self-supervised approaches, and are competitive with the top supervised approach, highlighting the importance of motion cues and the potential bias towards appearance in existing video segmentation models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Self-Supervised_Video_Object_Segmentation_by_Motion_Grouping_ICCV_2021_paper.pdf", @@ -37543,7 +40081,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Charig and Lamdouar,\n Hala and Lu,\n Erika and Zisserman,\n Andrew and Xie,\n Weidi\n},\n title = {\n Self-Supervised Video Object Segmentation by Motion Grouping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7177-7188\n} \n}" }, { "title": "Self-Supervised Video Representation Learning With Meta-Contrastive Network", @@ -37551,6 +40090,7 @@ "status": "Poster", "track": "main", "pid": 8834, + "author_site": "Yuanze Lin; Xun Guo; Yan Lu", "author": "Yuanze Lin; Xun Guo; Yan Lu", "abstract": "Self-supervised learning has been successfully applied to pre-train video representations, which aims at efficient adaptation from pre-training domain to downstream tasks. Existing approaches merely leverage contrastive loss to learn instance-level discrimination. However, lack of category information will lead to hard-positive problem that constrains the generalization ability of this kind of methods. We find that the multi-task process of meta learning can provide a solution to this problem. 
In this paper, we propose a Meta-Contrastive Network (MCN), which combines the contrastive learning and meta learning, to enhance the learning ability of existing self-supervised approaches. Our method contains two training stages based on model-agnostic meta learning (MAML), each of which consists of a contrastive branch and a meta branch. Extensive evaluations demonstrate the effectiveness of our method. For two downstream tasks, i.e., video action recognition and video retrieval, MCN outperforms state-of-the-art approaches on UCF101 and HMDB51 datasets. To be more specific, with R(2+1)D backbone, MCN achieves Top-1 accuracies of 84.8% and 54.5% for video action recognition, as well as 52.5% and 23.7% for video retrieval.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Self-Supervised_Video_Representation_Learning_With_Meta-Contrastive_Network_ICCV_2021_paper.pdf", @@ -37567,14 +40107,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_Self-Supervised_Video_Representation_Learning_With_Meta-Contrastive_Network_ICCV_2021_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "University of Washington;Microsoft", + "aff_unique_norm": "University of Washington;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.washington.edu;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "UW;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Yuanze and Guo,\n Xun and Lu,\n Yan\n},\n title = {\n Self-Supervised Video Representation Learning With Meta-Contrastive Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8239-8249\n} 
\n}" }, { "title": "Self-Supervised Visual Representations Learning by Contrastive Mask Prediction", @@ -37582,6 +40123,7 @@ "status": "Poster", "track": "main", "pid": 4290, + "author_site": "Yucheng Zhao; Guangting Wang; Chong Luo; Wenjun Zeng; Zheng-Jun Zha", "author": "Yucheng Zhao; Guangting Wang; Chong Luo; Wenjun Zeng; Zheng-Jun Zha", "abstract": "Advanced self-supervised visual representation learning methods rely on the instance discrimination (ID) pretext task. We point out that the ID task has an implicit semantic consistency (SC) assumption, which may not hold in unconstrained datasets. In this paper, we propose a novel contrastive mask prediction (CMP) task for visual representation learning and design a mask contrast (MaskCo) framework to implement the idea. MaskCo contrasts region-level features instead of view-level features, which makes it possible to identify the positive sample without any assumptions. To solve the domain gap between masked and unmasked features, we design a dedicated mask prediction head in MaskCo. This module is shown to be the key to the success of the CMP. We evaluated MaskCo on training datasets beyond ImageNet and compare its performance with MoCo V2. Results show that MaskCo achieves comparable performance with MoCo V2 using ImageNet training dataset, but demonstrates a stronger performance across a range of downstream tasks when COCO or Conceptual Captions are used for training. 
MaskCo provides a promising alternative to the ID-based methods for self-supervised learning in the wild.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Self-Supervised_Visual_Representations_Learning_by_Contrastive_Mask_Prediction_ICCV_2021_paper.pdf", @@ -37597,15 +40139,16 @@ "email": "mail.ustc.edu.cn;mail.ustc.edu.cn;microsoft.com;microsoft.com;ustc.edu.cn", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Self-Supervised_Visual_Representations_Learning_by_Contrastive_Mask_Prediction_ICCV_2021_paper.html", - "aff_unique_index": "0+1;0+1;1;1;0", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", - "aff_unique_dep": ";Research", - "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia", - "aff_unique_abbr": "USTC;MSR Asia", + "aff_unique_index": "0+1;0+1;2;2;0", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research;Microsoft Research Asia", + "aff_unique_dep": ";Research;", + "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.microsoft.com/en-us/research/group/asia", + "aff_unique_abbr": "USTC;MSR Asia;MSRA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Yucheng and Wang,\n Guangting and Luo,\n Chong and Zeng,\n Wenjun and Zha,\n Zheng-Jun\n},\n title = {\n Self-Supervised Visual Representations Learning by Contrastive Mask Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10160-10169\n} \n}" }, { "title": "SelfReg: Self-Supervised Contrastive Regularization for Domain Generalization", @@ -37613,6 +40156,7 @@ "status": "Poster", 
"track": "main", "pid": 9830, + "author_site": "Daehee Kim; Youngjun Yoo; Seunghyun Park; Jinkyu Kim; Jaekoo Lee", "author": "Daehee Kim; Youngjun Yoo; Seunghyun Park; Jinkyu Kim; Jaekoo Lee", "abstract": "In general, an experimental environment for deep learning assumes that the training and the test dataset are sampled from the same distribution. However, in real-world situations, a difference in the distribution between two datasets, i.e. domain shift, may occur, which becomes a major factor impeding the generalization performance of the model. The research field to solve this problem is called domain generalization, and it alleviates the domain shift problem by extracting domain-invariant features explicitly or implicitly. In recent studies, contrastive learning-based domain generalization approaches have been proposed and achieved high performance. These approaches require sampling of the negative data pair. However, the performance of contrastive learning fundamentally depends on quality and quantity of negative data pairs. To address this issue, we propose a new regularization method for domain generalization based on contrastive learning, called self-supervised contrastive regularization (SelfReg). The proposed approach uses only positive data pairs, thus it resolves various problems caused by negative pair sampling. Moreover, we propose a class-specific domain perturbation layer (CDPL), which makes it possible to effectively apply mixup augmentation even when only positive data pairs are used. The experimental results show that the techniques incorporated by SelfReg contributed to the performance in a compatible manner. 
In the recent benchmark, DomainBed, the proposed method shows comparable performance to the conventional state-of-the-art alternatives.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_SelfReg_Self-Supervised_Contrastive_Regularization_for_Domain_Generalization_ICCV_2021_paper.pdf", @@ -37636,7 +40180,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Daehee and Yoo,\n Youngjun and Park,\n Seunghyun and Kim,\n Jinkyu and Lee,\n Jaekoo\n},\n title = {\n SelfReg: Self-Supervised Contrastive Regularization for Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9619-9628\n} \n}" }, { "title": "SemIE: Semantically-Aware Image Extrapolation", @@ -37644,6 +40189,7 @@ "status": "Poster", "track": "main", "pid": 10412, + "author_site": "Bholeshwar Khurana; Soumya Ranjan Dash; Abhishek Bhatia; Aniruddha Mahapatra; Hrituraj Singh; Kuldeep Kulkarni", "author": "Bholeshwar Khurana; Soumya Ranjan Dash; Abhishek Bhatia; Aniruddha Mahapatra; Hrituraj Singh; Kuldeep Kulkarni", "abstract": "We propose a semantically-aware novel paradigm to perform image extrapolation that enables the addition of new object instances. All previous methods are limited in their capability of extrapolation to merely extending the already existing objects in the image. However, our proposed approach focuses not only on (i) extending the already present objects but also on (ii) adding new objects in the extended region based on the context. To this end, for a given image, we first obtain an object segmentation map using a state-of-the-art semantic segmentation method. 
The, thus, obtained segmentation map is fed into a network to compute the extrapolated semantic segmentation and the corresponding panoptic segmentation maps. The input image and the obtained segmentation maps are further utilized to generate the final extrapolated image. We conduct experiments on Cityscapes and ADE20K bedroom datasets and show that our method outperforms all baselines in terms of FID, and similarity object co-occurrence statistics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Khurana_SemIE_Semantically-Aware_Image_Extrapolation_ICCV_2021_paper.pdf", @@ -37660,14 +40206,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Khurana_SemIE_Semantically-Aware_Image_Extrapolation_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0+1;1;1+2;1", - "aff_unique_norm": "Indian Institute of Technology Kanpur;Adobe;Triomics", + "aff_unique_norm": "Indian Institute of Technology Kanpur;Adobe Research;Triomics", "aff_unique_dep": ";Adobe Research;", "aff_unique_url": "https://www.iitk.ac.in;https://research.adobe.com;", "aff_unique_abbr": "IITK;Adobe;", "aff_campus_unique_index": "0;0;0;", "aff_campus_unique": "Kanpur;", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0", - "aff_country_unique": "India;" + "aff_country_unique": "India;", + "bibtex": "@InProceedings{Khurana_2021_ICCV,\n \n author = {\n Khurana,\n Bholeshwar and Dash,\n Soumya Ranjan and Bhatia,\n Abhishek and Mahapatra,\n Aniruddha and Singh,\n Hrituraj and Kulkarni,\n Kuldeep\n},\n title = {\n SemIE: Semantically-Aware Image Extrapolation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14900-14909\n} \n}" }, { "title": "Semantic Aware Data Augmentation for Cell Nuclei Microscopical Images With Artificial Neural Networks", @@ -37675,6 +40222,7 @@ "status": "Poster", "track": "main", "pid": 10845, + "author_site": "Alireza Naghizadeh; 
Hongye Xu; Mohab Mohamed; Dimitris N. Metaxas; Dongfang Liu", "author": "Alireza Naghizadeh; Hongye Xu; Mohab Mohamed; Dimitris N. Metaxas; Dongfang Liu", "abstract": "There exists many powerful architectures for object detection and semantic segmentation of both biomedical and natural images. However, a difficulty arises in the ability to create training datasets that are large and well-varied. The importance of this subject is nested in the amount of training data that artificial neural networks need to accurately identify and segment objects in images and the infeasibility of acquiring a sufficient dataset within the biomedical field. This paper introduces a new data augmentation method that generates artificial cell nuclei microscopical images along with their correct semantic segmentation labels. Data augmentation provides a step toward accessing higher generalization capabilities of artificial neural networks. An initial set of segmentation objects is used with Greedy AutoAugment to find the strongest performing augmentation policies. The found policies and the initial set of segmentation objects are then used in the creation of the final artificial images. When comparing the state-of-the-art data augmentation methods with the proposed method, the proposed method is shown to consistently outperform current solutions in the generation of nuclei microscopical images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Naghizadeh_Semantic_Aware_Data_Augmentation_for_Cell_Nuclei_Microscopical_Images_With_ICCV_2021_paper.pdf", @@ -37698,7 +40246,8 @@ "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Newark;New Brunswick", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Naghizadeh_2021_ICCV,\n \n author = {\n Naghizadeh,\n Alireza and Xu,\n Hongye and Mohamed,\n Mohab and Metaxas,\n Dimitris N. 
and Liu,\n Dongfang\n},\n title = {\n Semantic Aware Data Augmentation for Cell Nuclei Microscopical Images With Artificial Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3952-3961\n} \n}" }, { "title": "Semantic Concentration for Domain Adaptation", @@ -37706,6 +40255,7 @@ "status": "Poster", "track": "main", "pid": 7791, + "author_site": "Shuang Li; Mixue Xie; Fangrui Lv; Chi Harold Liu; Jian Liang; Chen Qin; Wei Li", "author": "Shuang Li; Mixue Xie; Fangrui Lv; Chi Harold Liu; Jian Liang; Chen Qin; Wei Li", "abstract": "Domain adaptation (DA) paves the way for label annotation and dataset bias issues by the knowledge transfer from a label-rich source domain to a related but unlabeled target domain. A mainstream of DA methods is to align the feature distributions of the two domains. However, the majority of them focus on the entire image features where irrelevant semantic information, e.g., the messy background, is inevitably embedded. Enforcing feature alignments in such case will negatively influence the correct matching of objects and consequently lead to the semantically negative transfer due to the confusion of irrelevant semantics. To tackle this issue, we propose Semantic Concentration for Domain Adaptation (SCDA), which encourages the model to concentrate on the most principal features via the pair-wise adversarial alignment of prediction distributions. Specifically, we train the classifier to class-wisely maximize the prediction distribution divergence of each sample pair, which enables the model to find the region with large differences among the same class of samples. Meanwhile, the feature extractor attempts to minimize that discrepancy, which suppresses the features of dissimilar regions among the same class of samples and accentuates the features of principal parts. 
As a general method, SCDA can be easily integrated into various DA methods as a regularizer to further boost their performance. Extensive experiments on the cross-domain benchmarks show the efficacy of SCDA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Semantic_Concentration_for_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -37729,7 +40279,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;United Kingdom;" + "aff_country_unique": "China;United Kingdom;", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Shuang and Xie,\n Mixue and Lv,\n Fangrui and Liu,\n Chi Harold and Liang,\n Jian and Qin,\n Chen and Li,\n Wei\n},\n title = {\n Semantic Concentration for Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9102-9111\n} \n}" }, { "title": "Semantic Diversity Learning for Zero-Shot Multi-Label Classification", @@ -37737,6 +40288,7 @@ "status": "Poster", "track": "main", "pid": 8595, + "author_site": "Avi Ben-Cohen; Nadav Zamir; Emanuel Ben-Baruch; Itamar Friedman; Lihi Zelnik-Manor", "author": "Avi Ben-Cohen; Nadav Zamir; Emanuel Ben-Baruch; Itamar Friedman; Lihi Zelnik-Manor", "abstract": "Training a neural network model for recognizing multiple labels associated with an image, including identifying unseen labels, is challenging, especially for images that portray numerous semantically diverse labels. As challenging as this task is, it is an essential task to tackle since it represents many real-world cases, such as image retrieval of natural images. We argue that using a single embedding vector to represent an image, as commonly practiced, is not sufficient to rank both relevant seen and unseen labels accurately. 
This study introduces an end-to-end model training for multi-label zero-shot learning that supports the semantic diversity of the images and labels. We propose to use an embedding matrix having principal embedding vectors trained using a tailored loss function. In addition, during training, we suggest up-weighting in the loss function image samples presenting higher semantic diversity to encourage the diversity of the embedding matrix. Extensive experiments show that our proposed method improves the zero-shot model's quality in tag-based image retrieval achieving SoTA results on several common datasets (NUS-Wide, COCO, Open Images).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ben-Cohen_Semantic_Diversity_Learning_for_Zero-Shot_Multi-Label_Classification_ICCV_2021_paper.pdf", @@ -37760,7 +40312,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ben-Cohen_2021_ICCV,\n \n author = {\n Ben-Cohen,\n Avi and Zamir,\n Nadav and Ben-Baruch,\n Emanuel and Friedman,\n Itamar and Zelnik-Manor,\n Lihi\n},\n title = {\n Semantic Diversity Learning for Zero-Shot Multi-Label Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 640-650\n} \n}" }, { "title": "Semantic Perturbations With Normalizing Flows for Improved Generalization", @@ -37768,7 +40321,8 @@ "status": "Poster", "track": "main", "pid": 10428, - "author": "Oguz Kaan Y\u00fcksel; Sebastian U. Stich; Martin Jaggi; Tatjana Chavdarova", + "author_site": "Oguz Kaan Yüksel; Sebastian U. Stich; Martin Jaggi; Tatjana Chavdarova", + "author": "Oguz Kaan Yüksel; Sebastian U. Stich; Martin Jaggi; Tatjana Chavdarova", "abstract": "Data augmentation is a widely adopted technique for avoiding overfitting when training deep neural networks. 
However, this approach requires domain-specific knowledge and is often limited to a fixed set of hard-coded transformations. Recently, several works proposed to use generative models for generating semantically meaningful perturbations to train a classifier. However, because accurate encoding and decoding is critical, these methods, which use architectures that approximate the latent-variable inference, remained limited to pilot studies on small datasets. Exploiting the exactly reversible encoder-decoder structure of normalizing flows, we perform on-manifold perturbations in the latent space to define fully unsupervised data augmentations. We demonstrate that such perturbations match the performance of advanced data augmentation techniques---reaching 96.6% test accuracy for CIFAR-10 using ResNet-18 and outperform existing methods, particularly in low data regimes---yielding 10--25% relative improvement of test accuracy from classical training. We find that our latent adversarial perturbations adaptive to the classifier throughout its training are most effective, yielding the first test accuracy improvement results on real-world datasets---CIFAR-10/100---via latent-space perturbations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuksel_Semantic_Perturbations_With_Normalizing_Flows_for_Improved_Generalization_ICCV_2021_paper.pdf", "aff": "Machine Learning and Optimization Lab, EPFL; Machine Learning and Optimization Lab, EPFL; Machine Learning and Optimization Lab, EPFL; Department of Electrical Engineering and Computer Sciences, UC Berkeley + Machine Learning and Optimization Lab, EPFL", @@ -37784,14 +40338,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yuksel_Semantic_Perturbations_With_Normalizing_Flows_for_Improved_Generalization_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1+0", - "aff_unique_norm": "EPFL;University of California, Berkeley", + "aff_unique_norm": "École Polytechnique Fédérale de 
Lausanne;University of California, Berkeley", "aff_unique_dep": "Machine Learning and Optimization Lab;Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.epfl.ch;https://www.berkeley.edu", "aff_unique_abbr": "EPFL;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;1+0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Yuksel_2021_ICCV,\n \n author = {\n Y\\"uksel,\n Oguz Kaan and Stich,\n Sebastian U. and Jaggi,\n Martin and Chavdarova,\n Tatjana\n},\n title = {\n Semantic Perturbations With Normalizing Flows for Improved Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6619-6629\n} \n}" }, { "title": "Semantic-Embedded Unsupervised Spectral Reconstruction From Single RGB Images in the Wild", @@ -37799,6 +40354,7 @@ "status": "Poster", "track": "main", "pid": 2456, + "author_site": "Zhiyu Zhu; Hui Liu; Junhui Hou; Huanqiang Zeng; Qingfu Zhang", "author": "Zhiyu Zhu; Hui Liu; Junhui Hou; Huanqiang Zeng; Qingfu Zhang", "abstract": "This paper investigates the problem of reconstructing hyperspectral (HS) images from single RGB images captured by commercial cameras, without using paired HS and RGB images during training. To tackle this challenge, we propose a new lightweight and end-to-end learning-based framework. Specifically, on the basis of the intrinsic imaging degradation model of RGB images from HS images, we progressively spread the differences between input RGB images and re-projected RGB images from recovered HS images via effective unsupervised camera spectral response function estimation. 
To enable the learning without paired ground-truth HS images as supervision, we adopt the adversarial learning manner and boost it with a simple yet effective L1 gradient clipping scheme. Besides, we embed the semantic information of input RGB images to locally regularize the unsupervised learning, which is expected to promote pixels with identical semantics to have consistent spectral signatures. In addition to conducting quantitative experiments over two widely-used datasets for HS image reconstruction from synthetic RGB images, we also evaluate our method by applying recovered HS images from real RGB images to HS-based visual tracking. Extensive results show that our method significantly outperforms state-of-the-art unsupervised methods and even exceeds the latest supervised method under some settings. The source code is public available at https://github.com/zbzhzhy/Unsupervised-Spectral-Reconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Semantic-Embedded_Unsupervised_Spectral_Reconstruction_From_Single_RGB_Images_in_the_ICCV_2021_paper.pdf", @@ -37822,7 +40378,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Zhiyu and Liu,\n Hui and Hou,\n Junhui and Zeng,\n Huanqiang and Zhang,\n Qingfu\n},\n title = {\n Semantic-Embedded Unsupervised Spectral Reconstruction From Single RGB Images in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2279-2288\n} \n}" }, { "title": "Semantically Coherent Out-of-Distribution Detection", @@ -37830,6 +40387,7 @@ "status": "Poster", "track": "main", "pid": 1154, + "author_site": "Jingkang Yang; Haoqi Wang; Litong Feng; Xiaopeng Yan; Huabin Zheng; Wayne Zhang; Ziwei Liu", 
"author": "Jingkang Yang; Haoqi Wang; Litong Feng; Xiaopeng Yan; Huabin Zheng; Wayne Zhang; Ziwei Liu", "abstract": "Current out-of-distribution (OOD) detection benchmarks are commonly built by defining one dataset as in-distribution (ID) and all others as OOD. However, these benchmarks unfortunately introduce some unwanted and impractical goals, e.g., to perfectly distinguish CIFAR dogs from ImageNet dogs, even though they have the same semantics and negligible covariate shifts. These unrealistic goals will result in an extremely narrow range of model capabilities, greatly limiting their use in real applications. To overcome these drawbacks, we re-design the benchmarks and propose the semantically coherent out-of-distribution detection (SC-OOD). On the SC-OOD benchmarks, existing methods suffer from large performance degradation, suggesting that they are extremely sensitive to low-level discrepancy between data sources while ignoring their inherent semantics. To develop an effective SC-OOD detection approach, we leverage an external un- labeled set and design a concise framework featured by unsupervised dual grouping (UDG) for the joint modeling of ID and OOD data. The proposed UDG can not only enrich the semantic knowledge of the model by exploiting unlabeled data in an unsupervised manner but also distinguish ID/OOD samples to enhance ID classification and OOD detection tasks simultaneously. Extensive experiments demonstrate that our approach achieves state-of-the-art performance on SC-OOD benchmarks. 
Code and benchmarks are provided on our project page: https://jingkang50.github.io/projects/scood.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Semantically_Coherent_Out-of-Distribution_Detection_ICCV_2021_paper.pdf", @@ -37848,12 +40406,13 @@ "aff_unique_index": "0;1;1;1;1;1+2+3;0", "aff_unique_norm": "Nanyang Technological University;SenseTime;Shanghai Jiao Tong University;Shanghai AI Laboratory", "aff_unique_dep": "S-Lab;SenseTime Research;Qing Yuan Research Institute;", - "aff_unique_url": "https://www.ntu.edu.sg;https://www.sensetime.com;https://www.sjtu.edu.cn;https://www.shanghaiailab.com", - "aff_unique_abbr": "NTU;SenseTime;SJTU;SAIL", + "aff_unique_url": "https://www.ntu.edu.sg;https://www.sensetime.com;https://www.sjtu.edu.cn;", + "aff_unique_abbr": "NTU;SenseTime;SJTU;", "aff_campus_unique_index": "1+1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;1;1;1;1;1+1+1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Jingkang and Wang,\n Haoqi and Feng,\n Litong and Yan,\n Xiaopeng and Zheng,\n Huabin and Zhang,\n Wayne and Liu,\n Ziwei\n},\n title = {\n Semantically Coherent Out-of-Distribution Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8301-8309\n} \n}" }, { "title": "Semantically Robust Unpaired Image Translation for Data With Unmatched Semantics Statistics", @@ -37861,6 +40420,7 @@ "status": "Poster", "track": "main", "pid": 3957, + "author_site": "Zhiwei Jia; Bodi Yuan; Kangkang Wang; Hong Wu; David Clifford; Zhiqiang Yuan; Hao Su", "author": "Zhiwei Jia; Bodi Yuan; Kangkang Wang; Hong Wu; David Clifford; Zhiqiang Yuan; Hao Su", "abstract": "Many applications of unpaired image-to-image translation require the input contents to be preserved semantically during translations. 
Unaware of the inherently unmatched semantics distributions between source and target domains, existing distribution matching methods (i.e., GAN-based) can give undesired solutions. In specific, although producing visually reasonable outputs, the learned models usually flip the semantics of the inputs. To tackle this without using extra supervisions, we propose to enforce the translated outputs to be semantically invariant w.r.t. small perceptual variations of the inputs, a property we call \"\"semantic robustness\"\". By optimizing a robustness loss w.r.t. multi-scale feature space perturbations of the inputs, our method effectively reduces semantics flipping and produces translations that outperform existing methods both quantitatively and qualitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jia_Semantically_Robust_Unpaired_Image_Translation_for_Data_With_Unmatched_Semantics_ICCV_2021_paper.pdf", @@ -37884,7 +40444,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Jia_2021_ICCV,\n \n author = {\n Jia,\n Zhiwei and Yuan,\n Bodi and Wang,\n Kangkang and Wu,\n Hong and Clifford,\n David and Yuan,\n Zhiqiang and Su,\n Hao\n},\n title = {\n Semantically Robust Unpaired Image Translation for Data With Unmatched Semantics Statistics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14273-14283\n} \n}" }, { "title": "Semantics Disentangling for Generalized Zero-Shot Learning", @@ -37892,6 +40453,7 @@ "status": "Poster", "track": "main", "pid": 1903, + "author_site": "Zhi Chen; Yadan Luo; Ruihong Qiu; Sen Wang; Zi Huang; Jingjing Li; Zheng Zhang", "author": "Zhi Chen; Yadan Luo; Ruihong Qiu; Sen Wang; Zi Huang; Jingjing Li; Zheng Zhang", "abstract": "Generalized 
zero-shot learning (GZSL) aims to classify samples under the assumption that some classes are not observable during training. To bridge the gap between the seen and unseen classes, most GZSL methods attempt to associate the visual features of seen classes with attributes or to generate unseen samples directly. Nevertheless, the visual features used in prior approaches do not necessarily encode semantically related information that the shared attributes refer to, which greatly degrades the model generalization to unseen classes. To address this issue, in this paper, we propose a novel semantics disentangling framework for the generalized zero-shot learning task (SDGZSL), where the visual features depicted unseen classes are firstly estimated by a conditional VAE and then factorized into semantic-consistent and semantic-unrelated latent vectors. In particular, a total correlation penalty is applied to guarantee the independence between the two factorized representations, and the semantic consistency of which is measured by the derived relation network. 
Extensive experiments conducted on four GZSL benchmark datasets have evidenced that the semantic-consistent features disentangled by the proposed SDGZSL are more generalizable in tasks of canonical and generalized zero-shot learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Semantics_Disentangling_for_Generalized_Zero-Shot_Learning_ICCV_2021_paper.pdf", @@ -37908,14 +40470,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Semantics_Disentangling_for_Generalized_Zero-Shot_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1;2", - "aff_unique_norm": "University of Queensland;University of Electronic Science and Technology of China;Harbin Institute of Technology", + "aff_unique_norm": "The University of Queensland;University of Electronic Science and Technology of China;Harbin Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uq.edu.au;https://www.uestc.edu.cn;http://en.hhit.edu.cn/", "aff_unique_abbr": "UQ;UESTC;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0;1;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Zhi and Luo,\n Yadan and Qiu,\n Ruihong and Wang,\n Sen and Huang,\n Zi and Li,\n Jingjing and Zhang,\n Zheng\n},\n title = {\n Semantics Disentangling for Generalized Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8712-8720\n} \n}" }, { "title": "Semi-Supervised Active Learning With Temporal Output Discrepancy", @@ -37923,6 +40486,7 @@ "status": "Poster", "track": "main", "pid": 1177, + "author_site": "Siyu Huang; Tianyang Wang; Haoyi Xiong; Jun Huan; Dejing Dou", "author": "Siyu Huang; Tianyang Wang; Haoyi Xiong; Jun Huan; Dejing Dou", "abstract": 
"While deep learning succeeds in a wide range of tasks, it highly depends on the massive collection of annotated data which is expensive and time-consuming. To lower the cost of data annotation, active learning has been proposed to interactively query an oracle to annotate a small proportion of informative samples in an unlabeled dataset. Inspired by the fact that the samples with higher loss are usually more informative to the model than the samples with lower loss, in this paper we present a novel deep active learning approach that queries the oracle for data annotation when the unlabeled sample is believed to incorporate high loss. The core of our approach is a measurement Temporal Output Discrepancy (TOD) that estimates the sample loss by evaluating the discrepancy of outputs given by models at different optimization steps. Our theoretical investigation shows that TOD lower-bounds the accumulated sample loss thus it can be used to select informative unlabeled samples. On basis of TOD, we further develop an effective unlabeled data sampling strategy as well as an unsupervised learning criterion that enhances model performance by incorporating the unlabeled data. Due to the simplicity of TOD, our active learning approach is efficient, flexible, and task-agnostic. 
Extensive experimental results demonstrate that our approach achieves superior performances than the state-of-the-art active learning methods on image classification and semantic segmentation tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Semi-Supervised_Active_Learning_With_Temporal_Output_Discrepancy_ICCV_2021_paper.pdf", @@ -37946,7 +40510,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Siyu and Wang,\n Tianyang and Xiong,\n Haoyi and Huan,\n Jun and Dou,\n Dejing\n},\n title = {\n Semi-Supervised Active Learning With Temporal Output Discrepancy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3447-3456\n} \n}" }, { "title": "Semi-Supervised Active Learning for Semi-Supervised Models: Exploit Adversarial Examples With Graph-Based Virtual Labels", @@ -37954,10 +40519,11 @@ "status": "Poster", "track": "main", "pid": 2233, + "author_site": "Jiannan Guo; Haochen Shi; Yangyang Kang; Kun Kuang; Siliang Tang; Zhuoren Jiang; Changlong Sun; Fei Wu; Yueting Zhuang", "author": "Jiannan Guo; Haochen Shi; Yangyang Kang; Kun Kuang; Siliang Tang; Zhuoren Jiang; Changlong Sun; Fei Wu; Yueting Zhuang", "abstract": "The performance of computer vision models significantly improves with more labeled data. However, the acquisition of labeled data is limited by the high cost. To mitigate the reliance on large labeled datasets, active learning (AL) and semi-supervised learning (SSL) are frequently adopted. Although current mainstream methods begin to combine SSL and AL (SSL-AL) to excavate the diverse expressions of unlabeled samples, these methods' fully supervised task models are still trained only with labeled data. 
Besides, these method's SSL-AL frameworks suffer from mismatch problems. Here, we propose a graph-based SSL-AL framework to unleash the SSL task models' power and make an effective SSL-AL interaction. In the framework, SSL leverages graph-based label propagation to deliver virtual labels to unlabeled samples, rendering AL samples' structural distribution and boosting AL. AL finds samples near the clusters' boundary to help SSL perform better label propagation by exploiting adversarial examples. The information exchange in the closed-loop realizes mutual enhancement of SSL and AL. Experimental results show that our method outperforms the state-of-the-art methods against classification and segmentation benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guo_Semi-Supervised_Active_Learning_for_Semi-Supervised_Models_Exploit_Adversarial_Examples_With_ICCV_2021_paper.pdf", - "aff": "Zhejiang University+Alibaba Group; Universit \u00b4e de Montr \u00b4eal; Alibaba Group; Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University+Alibaba Group; Zhejiang University; Zhejiang University", + "aff": "Zhejiang University+Alibaba Group; Universit ´e de Montr ´eal; Alibaba Group; Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University+Alibaba Group; Zhejiang University; Zhejiang University", "project": "", "github": "", "supp": "", @@ -37970,14 +40536,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guo_Semi-Supervised_Active_Learning_for_Semi-Supervised_Models_Exploit_Adversarial_Examples_With_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;1;0;0;0;0+1;0;0", - "aff_unique_norm": "Zhejiang University;Alibaba Group;Universit\u00e9 de Montr\u00e9al", + "aff_unique_norm": "Zhejiang University;Alibaba Group;Université de Montréal", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.alibaba.com;https://www.umontreal.ca", "aff_unique_abbr": 
"ZJU;Alibaba;UdeM", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0;0;0;0;0+0;0;0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Guo_2021_ICCV,\n \n author = {\n Guo,\n Jiannan and Shi,\n Haochen and Kang,\n Yangyang and Kuang,\n Kun and Tang,\n Siliang and Jiang,\n Zhuoren and Sun,\n Changlong and Wu,\n Fei and Zhuang,\n Yueting\n},\n title = {\n Semi-Supervised Active Learning for Semi-Supervised Models: Exploit Adversarial Examples With Graph-Based Virtual Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2896-2905\n} \n}" }, { "title": "Semi-Supervised Learning of Visual Features by Non-Parametrically Predicting View Assignments With Support Samples", @@ -37985,6 +40552,7 @@ "status": "Poster", "track": "main", "pid": 8327, + "author_site": "Mahmoud Assran; Mathilde Caron; Ishan Misra; Piotr Bojanowski; Armand Joulin; Nicolas Ballas; Michael Rabbat", "author": "Mahmoud Assran; Mathilde Caron; Ishan Misra; Piotr Bojanowski; Armand Joulin; Nicolas Ballas; Michael Rabbat", "abstract": "This paper proposes a novel method of learning by predicting view assignments with support samples (PAWS). The method trains a model to minimize a consistency loss, which ensures that different views of the same unlabeled instance are assigned similar pseudo-labels. The pseudo-labels are generated non-parametrically, by comparing the representations of the image views to those of a set of randomly sampled labeled images. The distance between the view representations and labeled representations is used to provide a weighting over class labels, which we interpret as a soft pseudo-label. 
By non-parametrically incorporating labeled samples in this way, PAWS extends the distance-metric loss used in self-supervised methods such as BYOL and SwAV to the semi-supervised setting. Despite the simplicity of the approach, PAWS outperforms other semi-supervised methods across architectures, setting a new state-of-the-art for a ResNet-50 on ImageNet trained with either 10% or 1% of the labels, reaching 75% and 66% top-1 respectively. This is achieved with only 200 epochs of training, which is 4x less than the previous best method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Assran_Semi-Supervised_Learning_of_Visual_Features_by_Non-Parametrically_Predicting_View_Assignments_ICCV_2021_paper.pdf", @@ -38001,14 +40569,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Assran_Semi-Supervised_Learning_of_Visual_Features_by_Non-Parametrically_Predicting_View_Assignments_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Assran_2021_ICCV,\n \n author = {\n Assran,\n Mahmoud and Caron,\n Mathilde and Misra,\n Ishan and Bojanowski,\n Piotr and Joulin,\n Armand and Ballas,\n Nicolas and Rabbat,\n Michael\n},\n title = {\n Semi-Supervised Learning of Visual Features by Non-Parametrically Predicting View Assignments With Support Samples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8443-8452\n} \n}" }, { "title": "Semi-Supervised Semantic Segmentation With Pixel-Level Contrastive Learning From a Class-Wise 
Memory Bank", @@ -38016,10 +40585,11 @@ "status": "Poster", "track": "main", "pid": 4318, - "author": "I\u00f1igo Alonso; Alberto Sabater; David Ferstl; Luis Montesano; Ana C. Murillo", + "author_site": "Iñigo Alonso; Alberto Sabater; David Ferstl; Luis Montesano; Ana C. Murillo", + "author": "Iñigo Alonso; Alberto Sabater; David Ferstl; Luis Montesano; Ana C. Murillo", "abstract": "This work presents a novel approach for semi-supervised semantic segmentation. The key element of this approach is our contrastive learning module that enforces the segmentation network to yield similar pixel-level feature representations for same-class samples across the whole dataset. To achieve this, we maintain a memory bank which is continuously updated with relevant and high-quality feature vectors from labeled data. In an end-to-end training, the features from both labeled and unlabeled data are optimized to be similar to same-class samples from the memory bank. Our approach not only outperforms the current state-of-the-art for semi-supervised semantic segmentation but also for semi-supervised domain adaptation on well-known public benchmarks, with larger improvements on the most challenging scenarios, i.e., less available labeled data. 
Code is available at https://github.com/Shathe/SemiSeg-Contrastive", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Alonso_Semi-Supervised_Semantic_Segmentation_With_Pixel-Level_Contrastive_Learning_From_a_Class-Wise_ICCV_2021_paper.pdf", - "aff": "RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain; RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain; Magic Leap, Z\u00fcrich, Switzerland; RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain + Bitbrain, Zaragoza, Spain; RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain", + "aff": "RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain; RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain; Magic Leap, Zürich, Switzerland; RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain + Bitbrain, Zaragoza, Spain; RoPeRT group, at DIIS - I3A, Universidad de Zaragoza, Spain", "project": "", "github": "https://github.com/Shathe/SemiSeg-Contrastive", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Alonso_Semi-Supervised_Semantic_Segmentation_ICCV_2021_supplemental.pdf", @@ -38037,9 +40607,10 @@ "aff_unique_url": "https://www.unizar.es;https://www.magicleap.com;", "aff_unique_abbr": ";;", "aff_campus_unique_index": "1;2", - "aff_campus_unique": ";Z\u00fcrich;Zaragoza", + "aff_campus_unique": ";Zürich;Zaragoza", "aff_country_unique_index": "0;0;1;0+0;0", - "aff_country_unique": "Spain;Switzerland" + "aff_country_unique": "Spain;Switzerland", + "bibtex": "@InProceedings{Alonso_2021_ICCV,\n \n author = {\n Alonso,\n I\\~nigo and Sabater,\n Alberto and Ferstl,\n David and Montesano,\n Luis and Murillo,\n Ana C.\n},\n title = {\n Semi-Supervised Semantic Segmentation With Pixel-Level Contrastive Learning From a Class-Wise Memory Bank\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8219-8228\n} \n}" }, { "title": "Semi-Supervised 
Single-Stage Controllable GANs for Conditional Fine-Grained Image Generation", @@ -38047,6 +40618,7 @@ "status": "Poster", "track": "main", "pid": 9424, + "author_site": "Tianyi Chen; Yi Liu; Yunfei Zhang; Si Wu; Yong Xu; Feng Liangbing; Hau San Wong", "author": "Tianyi Chen; Yi Liu; Yunfei Zhang; Si Wu; Yong Xu; Feng Liangbing; Hau San Wong", "abstract": "Previous state-of-the-art deep generative models improve fine-grained image generation quality by designing hierarchical model structures and synthesizing images across multiple stages. The learning process is typically performed without any supervision in object categories. To address this issue, while at the same time to alleviate the level of complexity of both model design and training, we propose a Single-Stage Controllable GAN (SSC-GAN) for conditional fine-grained image synthesis in a semi-supervised setting. Considering the fact that fine-grained object categories may have subtle distinctions and shared attributes, we take into account three factors of variation for generative modeling: class-independent content, cross-class attributes and class semantics, and associate them with different variables. To ensure disentanglement among the variables, we maximize mutual information between the class-independent variable and synthesized images, map real images to the latent space of a generator to perform consistency regularization of cross-class attributes, and incorporate class semantic-based regularization into a discriminator's feature space. We show that the proposed approach delivers a single-stage controllable generator and high-fidelity synthesized images of fine-grained categories. 
The proposed approach establishes state-of-the-art semi-supervised image synthesis results across multiple fine-grained datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Semi-Supervised_Single-Stage_Controllable_GANs_for_Conditional_Fine-Grained_Image_Generation_ICCV_2021_paper.pdf", @@ -38061,7 +40633,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Semi-Supervised_Single-Stage_Controllable_GANs_for_Conditional_Fine-Grained_Image_Generation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Semi-Supervised_Single-Stage_Controllable_GANs_for_Conditional_Fine-Grained_Image_Generation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Tianyi and Liu,\n Yi and Zhang,\n Yunfei and Wu,\n Si and Xu,\n Yong and Liangbing,\n Feng and Wong,\n Hau San\n},\n title = {\n Semi-Supervised Single-Stage Controllable GANs for Conditional Fine-Grained Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9264-9273\n} \n}" }, { "title": "SemiHand: Semi-Supervised Hand Pose Estimation With Consistency", @@ -38069,6 +40642,7 @@ "status": "Poster", "track": "main", "pid": 8208, + "author_site": "Linlin Yang; Shicheng Chen; Angela Yao", "author": "Linlin Yang; Shicheng Chen; Angela Yao", "abstract": "We present SemiHand, a semi-supervised framework for 3D hand pose estimation from monocular images. We pre-train the model on labelled synthetic data and fine-tune it on unlabelled real-world data by pseudo-labeling with consistency training. By design, we introduce data augmentation of differing difficulties, consistency regularizer, label correction and sample selection for RGB-based 3D hand pose estimation. 
In particular, by approximating the hand masks from hand poses, we propose a cross-modal consistency and leverage semantic predictions to guide the predicted poses. Meanwhile, we introduce pose registration as label correction to guarantee the biomechanical feasibility of hand bone lengths. Experiments show that our method achieves a favorable improvement on real-world datasets after fine-tuning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_SemiHand_Semi-Supervised_Hand_Pose_Estimation_With_Consistency_ICCV_2021_paper.pdf", @@ -38092,7 +40666,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "Singapore;Germany" + "aff_country_unique": "Singapore;Germany", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Linlin and Chen,\n Shicheng and Yao,\n Angela\n},\n title = {\n SemiHand: Semi-Supervised Hand Pose Estimation With Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11364-11373\n} \n}" }, { "title": "Seminar Learning for Click-Level Weakly Supervised Semantic Segmentation", @@ -38100,6 +40675,7 @@ "status": "Poster", "track": "main", "pid": 1181, + "author_site": "Hongjun Chen; Jinbao Wang; Hong Cai Chen; Xiantong Zhen; Feng Zheng; Rongrong Ji; Ling Shao", "author": "Hongjun Chen; Jinbao Wang; Hong Cai Chen; Xiantong Zhen; Feng Zheng; Rongrong Ji; Ling Shao", "abstract": "Annotation burden has become one of the biggest barriers to semantic segmentation. Approaches based on click-level annotations have therefore attracted increasing attention due to their superior trade-off between supervision and annotation cost. In this paper, we propose seminar learning, a new learning paradigm for semantic segmentation with click-level supervision. 
The fundamental rationale of seminar learning is to leverage the knowledge from different networks to compensate for insufficient information provided in click-level annotations. Mimicking a seminar, our seminar learning involves a teacher-student and a student-student module, where a student can learn from both skillful teachers and other students. The teacher-student module uses a teacher network based on the exponential moving average to guide the training of the student network. In the student-student module, heterogeneous pseudo-labels are proposed to bridge the transfer of knowledge among students to enhance each other's performance. Experimental results demonstrate the effectiveness of seminar learning, which achieves the new state-of-the-art performance of 72.51% (mIOU), surpassing previous methods by a large margin of up to 16.88% on the Pascal VOC 2012 dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Seminar_Learning_for_Click-Level_Weakly_Supervised_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -38123,7 +40699,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0+2;0;2", - "aff_country_unique": "China;Netherlands;United Arab Emirates" + "aff_country_unique": "China;Netherlands;United Arab Emirates", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Hongjun and Wang,\n Jinbao and Chen,\n Hong Cai and Zhen,\n Xiantong and Zheng,\n Feng and Ji,\n Rongrong and Shao,\n Ling\n},\n title = {\n Seminar Learning for Click-Level Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6920-6929\n} \n}" }, { "title": "Sensor-Guided Optical Flow", @@ -38131,6 +40708,7 @@ "status": "Poster", "track": "main", "pid": 1206, + "author_site": "Matteo Poggi; Filippo Aleotti; Stefano Mattoccia", "author": "Matteo Poggi; Filippo 
Aleotti; Stefano Mattoccia", "abstract": "This paper proposes a framework to guide an optical flow network with external cues to achieve superior accuracy either on known or unseen domains. Given the availability of sparse yet accurate optical flow hints from an external source, these are injected to modulate the correlation scores computed by a state-of-the-art optical flow network and guide it towards more accurate predictions. Although no real sensor can provide sparse flow hints, we show how these can be obtained by combining depth measurements from active sensors with geometry and hand-crafted optical flow algorithms, leading to accurate enough hints for our purpose. Experimental results with a state-of-the-art flow network on standard benchmarks support the effectiveness of our framework, both in simulated and real conditions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Poggi_Sensor-Guided_Optical_Flow_ICCV_2021_paper.pdf", @@ -38154,7 +40732,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Poggi_2021_ICCV,\n \n author = {\n Poggi,\n Matteo and Aleotti,\n Filippo and Mattoccia,\n Stefano\n},\n title = {\n Sensor-Guided Optical Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7908-7918\n} \n}" }, { "title": "Separable Flow: Learning Motion Cost Volumes for Optical Flow Estimation", @@ -38162,10 +40741,11 @@ "status": "Poster", "track": "main", "pid": 2645, + "author_site": "Feihu Zhang; Oliver J. Woodford; Victor Adrian Prisacariu; Philip H.S. Torr", "author": "Feihu Zhang; Oliver J. Woodford; Victor Adrian Prisacariu; Philip H.S. Torr", "abstract": "Full-motion cost volumes play a central role in current state-of-the-art optical flow methods. 
However, constructed using simple feature correlations, they lack the ability to encapsulate prior, or even non-local, knowledge. This creates artifacts in poorly constrained, ambiguous regions, such as occluded and textureless areas. We propose a separable cost volume module, a drop-in replacement to correlation cost volumes, that uses non-local aggregation layers to exploit global context cues and prior knowledge, in order to disambiguate motions in these regions. Our method leads both the now standard Sintel and KITTI optical flow benchmarks in terms of accuracy, and is also shown to generalize better from synthetic to real data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Separable_Flow_Learning_Motion_Cost_Volumes_for_Optical_Flow_Estimation_ICCV_2021_paper.pdf", - "aff": "University of Oxford\u2217; ; University of Oxford\u2217; University of Oxford\u2217", + "aff": "University of Oxford∗; ; University of Oxford∗; University of Oxford∗", "project": "", "github": "https://github.com/feihuzhang/SeparableFlow", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhang_Separable_Flow_Learning_ICCV_2021_supplemental.pdf", @@ -38185,7 +40765,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Feihu and Woodford,\n Oliver J. 
and Prisacariu,\n Victor Adrian and Torr,\n Philip H.S.\n},\n title = {\n Separable Flow: Learning Motion Cost Volumes for Optical Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10807-10817\n} \n}" }, { "title": "Shallow Bayesian Meta Learning for Real-World Few-Shot Recognition", @@ -38193,6 +40774,7 @@ "status": "Poster", "track": "main", "pid": 11109, + "author_site": "Xueting Zhang; Debin Meng; Henry Gouk; Timothy M. Hospedales", "author": "Xueting Zhang; Debin Meng; Henry Gouk; Timothy M. Hospedales", "abstract": "Many state-of-the-art few-shot learners focus on developing effective training procedures for feature representations, before using simple (e.g., nearest centroid) classifiers. We take an approach that is agnostic to the features used, and focus exclusively on meta-learning the final classifier layer. Specifically, we introduce MetaQDA, a Bayesian meta-learning generalisation of the classic quadratic discriminant analysis. This approach has several benefits of interest to practitioners: meta-learning is fast and memory efficient, without the need to fine-tune features. It is agnostic to the off-the-shelf features chosen, and thus will continue to benefit from future advances in feature representations. 
Empirically, it leads to excellent performance in cross-domain few-shot learning, class-incremental few-shot learning, and crucially for real-world applications, the Bayesian formulation leads to state-of-the-art uncertainty calibration in predictions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Shallow_Bayesian_Meta_Learning_for_Real-World_Few-Shot_Recognition_ICCV_2021_paper.pdf", @@ -38209,14 +40791,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_Shallow_Bayesian_Meta_Learning_for_Real-World_Few-Shot_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;0;0+2", - "aff_unique_norm": "University of Edinburgh;University of Chinese Academy of Sciences;Samsung", + "aff_unique_norm": "University of Edinburgh;University of Chinese Academy of Sciences;Samsung AI Centre", "aff_unique_dep": ";;AI Centre", "aff_unique_url": "https://www.ed.ac.uk;http://www.ucas.ac.cn;https://www.samsung.com", "aff_unique_abbr": "Edinburgh;UCAS;Samsung AI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2;0;0+2", - "aff_country_unique": "United Kingdom;China;South Korea" + "aff_country_unique": "United Kingdom;China;South Korea", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Xueting and Meng,\n Debin and Gouk,\n Henry and Hospedales,\n Timothy M.\n},\n title = {\n Shallow Bayesian Meta Learning for Real-World Few-Shot Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 651-660\n} \n}" }, { "title": "Shape Self-Correction for Unsupervised Point Cloud Understanding", @@ -38224,6 +40807,7 @@ "status": "Poster", "track": "main", "pid": 2962, + "author_site": "Ye Chen; Jinxian Liu; Bingbing Ni; Hang Wang; Jiancheng Yang; Ning Liu; Teng Li; Qi Tian", "author": "Ye Chen; Jinxian Liu; Bingbing Ni; Hang Wang; Jiancheng Yang; Ning Liu; 
Teng Li; Qi Tian", "abstract": "We develop a novel self-supervised learning method named Shape Self-Correction for point cloud analysis. Our method is motivated by the principle that a good shape representation should be able to find distorted parts of a shape and correct them. To learn strong shape representations in an unsupervised manner, we first design a shape-disorganizing module to destroy certain local shape parts of an object. Then the destroyed shape and the normal shape are sent into a point cloud network to get representations, which are employed to segment points that belong to distorted parts and further reconstruct them to restore the shape to normal. To perform better in these two associated pretext tasks, the network is constrained to capture useful shape features from the object, which indicates that the point cloud network encodes rich geometric and contextual information. The learned feature extractor transfers well to downstream classification and segmentation tasks. Experimental results on ModelNet, ScanNet and ShapeNetPart demonstrate that our method achieves state-of-the-art performance among unsupervised methods. 
Our framework can be applied to a wide range of deep learning networks for point cloud analysis and we show experimentally that pre-training with our framework significantly boosts the performance of supervised models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Shape_Self-Correction_for_Unsupervised_Point_Cloud_Understanding_ICCV_2021_paper.pdf", @@ -38239,15 +40823,16 @@ "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;ahu.edu.cn;huawei.com", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Shape_Self-Correction_for_Unsupervised_Point_Cloud_Understanding_ICCV_2021_paper.html", - "aff_unique_index": "0;0+1;0+1;0+1;0;0;1;1", - "aff_unique_norm": "Shanghai Jiao Tong University;Huawei", - "aff_unique_dep": ";Hisilicon", - "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com/en/", - "aff_unique_abbr": "SJTU;Huawei", + "aff_unique_index": "0;0+1;0+1;0+1;0;0;2;2", + "aff_unique_norm": "Shanghai Jiao Tong University;Huawei Technologies Co., Ltd.;Huawei", + "aff_unique_dep": ";Hisilicon;Car BU", + "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com/en/;https://www.huawei.com", + "aff_unique_abbr": "SJTU;Huawei;Huawei Car BU", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Ye and Liu,\n Jinxian and Ni,\n Bingbing and Wang,\n Hang and Yang,\n Jiancheng and Liu,\n Ning and Li,\n Teng and Tian,\n Qi\n},\n title = {\n Shape Self-Correction for Unsupervised Point Cloud Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8382-8391\n} \n}" }, { "title": "Shape-Aware Multi-Person Pose Estimation From Multi-View Images", @@ -38255,10 +40840,11 
@@ "status": "Poster", "track": "main", "pid": 6245, + "author_site": "Zijian Dong; Jie Song; Xu Chen; Chen Guo; Otmar Hilliges", "author": "Zijian Dong; Jie Song; Xu Chen; Chen Guo; Otmar Hilliges", "abstract": "In this paper we contribute a simple yet effective approach for estimating 3D poses of multiple people from multi-view images. Our proposed coarse-to-fine pipeline first aggregates noisy 2D observations from multiple camera views into 3D space and then associates them into individual instances based on a confidence-aware majority voting technique. The final pose estimates are attained from a novel optimization scheme which links high-confidence multi-view 2D observations and 3D joint candidates. Moreover, a statistical parametric body model such as SMPL is leveraged as a regularizing prior for these 3D joint candidates. Specifically, both 3D poses and SMPL parameters are optimized jointly in an alternating fashion. Here the parametric models help in correcting implausible 3D pose estimates and filling in missing joint detections while updated 3D poses in turn guide obtaining better SMPL estimations. By linking 2D and 3D observations, our method is both accurate and generalizes to different data sources because it better decouples the final 3D pose from the inter-person constellation and is more robust to noisy 2D detections. We systematically evaluate our method on public datasets and achieve state-of-the-art performance. 
The code and video will be available on the project page: https://ait.ethz.ch/projects/2021/multi-human-pose/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dong_Shape-Aware_Multi-Person_Pose_Estimation_From_Multi-View_Images_ICCV_2021_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich+Max Planck Institute for Intelligent Systems, T\u00fcbingen; ETH Z\u00fcrich; ETH Z\u00fcrich", + "aff": "ETH Zürich; ETH Zürich; ETH Zürich+Max Planck Institute for Intelligent Systems, Tübingen; ETH Zürich; ETH Zürich", "project": "https://ait.ethz.ch/projects/2021/multi-human-pose/", "github": "", "supp": "", @@ -38271,14 +40857,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dong_Shape-Aware_Multi-Person_Pose_Estimation_From_Multi-View_Images_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;0;0", - "aff_unique_norm": "ETH Zurich;Max Planck Institute for Intelligent Systems", + "aff_unique_norm": "ETH Zürich;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de", "aff_unique_abbr": "ETHZ;MPI-IS", "aff_campus_unique_index": "1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;0;0+1;0;0", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Dong_2021_ICCV,\n \n author = {\n Dong,\n Zijian and Song,\n Jie and Chen,\n Xu and Guo,\n Chen and Hilliges,\n Otmar\n},\n title = {\n Shape-Aware Multi-Person Pose Estimation From Multi-View Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11158-11168\n} \n}" }, { "title": "Shape-Biased Domain Generalization via Shock Graph Embeddings", @@ -38286,6 +40873,7 @@ "status": "Poster", "track": "main", "pid": 9683, + "author_site": "Maruthi 
Narayanan; Vickram Rajendran; Benjamin Kimia", "author": "Maruthi Narayanan; Vickram Rajendran; Benjamin Kimia", "abstract": "There is an emerging sense that the vulnerability of Image Convolutional Neural Networks (CNN), i.e., sensitivity to image corruptions, perturbations, and adversarial attacks, is connected with Texture Bias. This relative lack of Shape Bias is also responsible for poor performance in Domain Generalization (DG). The inclusion of a role of shape alleviates these vulnerabilities and some approaches have achieved this by training on negative images, images endowed with edge maps, or images with conflicting shape and texture information. This paper advocates an explicit and complete representation of shape using a classical computer vision approach, namely, representing the shape content of an image with the shock graph of its contour map. The resulting graph and its descriptor is a complete representation of contour content and is classified using recent Graph Neural Network (GNN) methods. 
The experimental results on three domain shift datasets, Colored MNIST, PACS, and VLCS demonstrate that even without using appearance the shape-based approach exceeds classical Image CNN based methods in domain generalization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Narayanan_Shape-Biased_Domain_Generalization_via_Shock_Graph_Embeddings_ICCV_2021_paper.pdf", @@ -38309,7 +40897,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Narayanan_2021_ICCV,\n \n author = {\n Narayanan,\n Maruthi and Rajendran,\n Vickram and Kimia,\n Benjamin\n},\n title = {\n Shape-Biased Domain Generalization via Shock Graph Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1315-1325\n} \n}" }, { "title": "ShapeConv: Shape-Aware Convolutional Layer for Indoor RGB-D Semantic Segmentation", @@ -38317,6 +40906,7 @@ "status": "Poster", "track": "main", "pid": 6025, + "author_site": "Jinming Cao; Hanchao Leng; Dani Lischinski; Daniel Cohen-Or; Changhe Tu; Yangyan Li", "author": "Jinming Cao; Hanchao Leng; Dani Lischinski; Daniel Cohen-Or; Changhe Tu; Yangyan Li", "abstract": "RGB-D semantic segmentation has attracted increasing attention over the past few years. Existing methods mostly employ homogeneous convolution operators to consume the RGB and depth features, ignoring their intrinsic differences. In fact, the RGB values capture the photometric appearance properties in the projected image space, while the depth feature encodes both the shape of a local geometry as well as the base (whereabout) of it in a larger context. Compared with the base, the shape probably is more inherent and has a stronger connection to the semantics, and thus is more critical for segmentation accuracy. 
Inspired by this observation, we introduce Shape-aware Convolutional layer (ShapeConv) for processing the depth feature, where the depth feature is firstly decomposed into a shape-component and a base-component, next two learnable weights are introduced to cooperate with them independently, and finally a convolution is applied on the re-weighted combination of these two components. ShapeConv is model-agnostic and can be easily integrated into most CNNs to replace vanilla convolutional layers for semantic segmentation. Extensive experiments on three challenging indoor RGB-D semantic segmentation benchmarks, i.e., NYU-Dv2(-13,-40), SUN RGB-D, and SID, demonstrate the effectiveness of our ShapeConv when employing it over five popular architectures. Moreover, the performance of CNNs with ShapeConv is boosted without introducing any computation and memory increase in the inference phase. The reason is that the learnt weights for balancing the importance between the shape and base components in ShapeConv become constants in the inference phase, and thus can be fused into the following convolution, resulting in a network that is identical to one with vanilla convolutional layers.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cao_ShapeConv_Shape-Aware_Convolutional_Layer_for_Indoor_RGB-D_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -38333,14 +40923,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Cao_ShapeConv_Shape-Aware_Convolutional_Layer_for_Indoor_RGB-D_Semantic_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;0;3", - "aff_unique_norm": "Shandong University;Hebrew University of Jerusalem;Tel Aviv University;Alibaba Group", + "aff_unique_norm": "Shandong University;The Hebrew University of Jerusalem;Tel Aviv University;Alibaba Group", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.sdu.edu.cn;https://www.huji.ac.il;https://www.tau.ac.il;https://www.alibaba.com", "aff_unique_abbr": 
"SDU;HUJI;TAU;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0", - "aff_country_unique": "China;Israel" + "aff_country_unique": "China;Israel", + "bibtex": "@InProceedings{Cao_2021_ICCV,\n \n author = {\n Cao,\n Jinming and Leng,\n Hanchao and Lischinski,\n Dani and Cohen-Or,\n Daniel and Tu,\n Changhe and Li,\n Yangyan\n},\n title = {\n ShapeConv: Shape-Aware Convolutional Layer for Indoor RGB-D Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7088-7097\n} \n}" }, { "title": "SignBERT: Pre-Training of Hand-Model-Aware Representation for Sign Language Recognition", @@ -38348,6 +40939,7 @@ "status": "Poster", "track": "main", "pid": 2656, + "author_site": "Hezhen Hu; Weichao Zhao; Wengang Zhou; Yuechen Wang; Houqiang Li", "author": "Hezhen Hu; Weichao Zhao; Wengang Zhou; Yuechen Wang; Houqiang Li", "abstract": "Hand gesture serves as a critical role in sign language. Current deep-learning-based sign language recognition (SLR) methods may suffer insufficient interpretability and overfitting due to limited sign data sources. In this paper, we introduce the first self-supervised pre-trainable SignBERT with incorporated hand prior for SLR. SignBERT views the hand pose as a visual token, which is derived from an off-the-shelf pose extractor. The visual tokens are then embedded with gesture state, temporal and hand chirality information. To take full advantage of available sign data sources, SignBERT first performs self-supervised pre-training by masking and reconstructing visual tokens. Jointly with several mask modeling strategies, we attempt to incorporate hand prior in a model-aware method to better model hierarchical context over the hand sequence. Then with the prediction head added, SignBERT is fine-tuned to perform the downstream SLR task. 
To validate the effectiveness of our method on SLR, we perform extensive experiments on four public benchmark datasets, i.e., NMFs-CSL, SLR500, MSASL and WLASL. Experiment results demonstrate the effectiveness of both self-supervised learning and imported hand prior. Furthermore, we achieve state-of-the-art performance on all benchmarks with a notable gain.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_SignBERT_Pre-Training_of_Hand-Model-Aware_Representation_for_Sign_Language_Recognition_ICCV_2021_paper.pdf", @@ -38371,7 +40963,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Hezhen and Zhao,\n Weichao and Zhou,\n Wengang and Wang,\n Yuechen and Li,\n Houqiang\n},\n title = {\n SignBERT: Pre-Training of Hand-Model-Aware Representation for Sign Language Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11087-11096\n} \n}" }, { "title": "SimROD: A Simple Adaptation Method for Robust Object Detection", @@ -38379,6 +40972,7 @@ "status": "Poster", "track": "main", "pid": 8083, + "author_site": "Rindra Ramamonjison; Amin Banitalebi-Dehkordi; Xinyu Kang; Xiaolong Bai; Yong Zhang", "author": "Rindra Ramamonjison; Amin Banitalebi-Dehkordi; Xinyu Kang; Xiaolong Bai; Yong Zhang", "abstract": "This paper presents a Simple and effective unsupervised adaptation method for Robust Object Detection (SimROD). To overcome the challenging issues of domain shift and pseudo-label noise, our method integrates a novel domain-centric data augmentation, a gradual self-labeling adaptation procedure, and a teacher-guided fine-tuning mechanism. 
Using our method, target domain samples can be leveraged to adapt object detection models without changing the model architecture or generating synthetic data. When applied to image corruptions and high-level cross-domain adaptation benchmarks, our method outperforms prior baselines on multiple domain adaptation benchmarks. SimROD achieves new state-of-the-art on standard real-to-synthetic and cross-camera setup benchmarks. On the image corruption benchmark, models adapted with our method achieved a relative robustness improvement of 15-25% AP50 on Pascal-C and 5-6% AP on COCO-C and Cityscapes-C. On the cross-domain benchmark, our method outperformed the best baseline performance by up to 8% and 4% AP50 on Comic and Watercolor respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ramamonjison_SimROD_A_Simple_Adaptation_Method_for_Robust_Object_Detection_ICCV_2021_paper.pdf", @@ -38394,15 +40988,16 @@ "email": "huawei.com;huawei.com;alumni.ubc.ca;huawei.com;huawei.com", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ramamonjison_SimROD_A_Simple_Adaptation_Method_for_Robust_Object_Detection_ICCV_2021_paper.html", - "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Huawei;University of British Columbia", - "aff_unique_dep": "Huawei Technologies;", - "aff_unique_url": "https://www.huawei.com/ca-en/;https://www.ubc.ca", - "aff_unique_abbr": "Huawei;UBC", + "aff_unique_index": "0;0;1;2;0", + "aff_unique_norm": "Huawei Technologies;University of British Columbia;Huawei", + "aff_unique_dep": ";;Huawei Cloud", + "aff_unique_url": "https://www.huawei.com/ca-en/;https://www.ubc.ca;https://www.huaweicloud.com", + "aff_unique_abbr": "Huawei;UBC;Huawei Cloud", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "Canada;China" + "aff_country_unique": "Canada;China", + "bibtex": "@InProceedings{Ramamonjison_2021_ICCV,\n \n author = {\n Ramamonjison,\n 
Rindra and Banitalebi-Dehkordi,\n Amin and Kang,\n Xinyu and Bai,\n Xiaolong and Zhang,\n Yong\n},\n title = {\n SimROD: A Simple Adaptation Method for Robust Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3570-3579\n} \n}" }, { "title": "Simpler Is Better: Few-Shot Semantic Segmentation With Classifier Weight Transformer", @@ -38410,6 +41005,7 @@ "status": "Poster", "track": "main", "pid": 2873, + "author_site": "Zhihe Lu; Sen He; Xiatian Zhu; Li Zhang; Yi-Zhe Song; Tao Xiang", "author": "Zhihe Lu; Sen He; Xiatian Zhu; Li Zhang; Yi-Zhe Song; Tao Xiang", "abstract": "A few-shot semantic segmentation model is typically composed of a CNN encoder, a CNN decoder and a simple classifier (separating foreground and background pixels). Most existing methods meta-learn all three model components for fast adaptation to a new class. However, given that as few as a single support set image is available, effective model adaption of all three components to the new class is extremely challenging. In this work we propose to simplify the meta-learning task by focusing solely on the simplest component -- the classifier, whilst leaving the encoder and decoder to pre-training. We hypothesize that if we pre-train an off-the-shelf segmentation model over a set of diverse training classes with sufficient annotations, the encoder and decoder can capture rich discriminative features applicable for any unseen classes, rendering the subsequent meta-learning stage unnecessary. For the classifier meta-learning, we introduce a Classifier Weight Transformer (CWT) designed to dynamically adapt the support-set trained classifier's weights to each query image in an inductive way. Extensive experiments on two standard benchmarks show that despite its simplicity, our method outperforms the state-of-the-art alternatives, often by a large margin. 
Code is available on https://github.com/zhiheLu/CWT-for-FSS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lu_Simpler_Is_Better_Few-Shot_Semantic_Segmentation_With_Classifier_Weight_Transformer_ICCV_2021_paper.pdf", @@ -38433,7 +41029,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;1;0+0;0+0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Lu_2021_ICCV,\n \n author = {\n Lu,\n Zhihe and He,\n Sen and Zhu,\n Xiatian and Zhang,\n Li and Song,\n Yi-Zhe and Xiang,\n Tao\n},\n title = {\n Simpler Is Better: Few-Shot Semantic Segmentation With Classifier Weight Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8741-8750\n} \n}" }, { "title": "Single Image 3D Shape Retrieval via Cross-Modal Instance and Category Contrastive Learning", @@ -38441,6 +41038,7 @@ "status": "Poster", "track": "main", "pid": 3725, + "author_site": "Ming-Xian Lin; Jie Yang; He Wang; Yu-Kun Lai; Rongfei Jia; Binqiang Zhao; Lin Gao", "author": "Ming-Xian Lin; Jie Yang; He Wang; Yu-Kun Lai; Rongfei Jia; Binqiang Zhao; Lin Gao", "abstract": "In this work, we tackle the problem of single image-based 3D shape retrieval (IBSR), where we seek to find the most matched shape of a given single 2D image from a shape repository. Most of the existing works learn to embed 2D images and 3D shapes into a common feature space and perform metric learning using a triplet loss. Inspired by the great success in recent contrastive learning works on self-supervised representation learning, we propose a novel IBSR pipeline leveraging contrastive learning. 
We note that adopting such cross-modal contrastive learning between 2D images and 3D shapes into IBSR tasks is non-trivial and challenging: contrastive learning requires very strong data augmentation in constructed positive pairs to learn the feature invariance, whereas traditional metric learning works do not have this requirement. Moreover, object shape and appearance are entangled in 2D query images, thus making the learning task more difficult than contrasting single-modal data. To mitigate the challenges, we propose to use multi-view grayscale rendered images from the 3D shapes as a shape representation. We then introduce a strong data augmentation technique based on color transfer, which can significantly but naturally change the appearance of the query image, effectively satisfying the need for contrastive learning. Finally, we propose to incorporate a novel category-level contrastive loss that helps distinguish similar objects from different categories, in addition to classic instance-level contrastive loss. 
Our experiments demonstrate that our approach achieves the best performance on all the three popular IBSR benchmarks, including Pix3D, Stanford Cars, and Comp Cars, outperforming the previous state-of-the-art from 4% - 15% on retrieval accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Single_Image_3D_Shape_Retrieval_via_Cross-Modal_Instance_and_Category_ICCV_2021_paper.pdf", @@ -38464,7 +41062,8 @@ "aff_campus_unique_index": ";;1;", "aff_campus_unique": ";Cardiff", "aff_country_unique_index": "0+0+0;0+0+0;0;1;0;0;0+0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Ming-Xian and Yang,\n Jie and Wang,\n He and Lai,\n Yu-Kun and Jia,\n Rongfei and Zhao,\n Binqiang and Gao,\n Lin\n},\n title = {\n Single Image 3D Shape Retrieval via Cross-Modal Instance and Category Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11405-11415\n} \n}" }, { "title": "Single Image Defocus Deblurring Using Kernel-Sharing Parallel Atrous Convolutions", @@ -38472,6 +41071,7 @@ "status": "Poster", "track": "main", "pid": 9450, + "author_site": "Hyeongseok Son; Junyong Lee; Sunghyun Cho; Seungyong Lee", "author": "Hyeongseok Son; Junyong Lee; Sunghyun Cho; Seungyong Lee", "abstract": "This paper proposes a novel deep learning approach for single image defocus deblurring based on inverse kernels. In a defocused image, the blur shapes are similar among pixels although the blur sizes can spatially vary. To utilize the property with inverse kernels, we exploit the observation that when only the size of a defocus blur changes while keeping the shape, the shape of the corresponding inverse kernel remains the same and only the scale changes. 
Based on the observation, we propose a kernel-sharing parallel atrous convolutional (KPAC) block specifically designed by incorporating the property of inverse kernels for single image defocus deblurring. To effectively simulate the invariant shapes of inverse kernels with different scales, KPAC shares the same convolutional weights among multiple atrous convolution layers. To efficiently simulate the varying scales of inverse kernels, KPAC consists of only a few atrous convolution layers with different dilations and learns per-pixel scale attentions to aggregate the outputs of the layers. KPAC also utilizes the shape attention to combine the outputs of multiple convolution filters in each atrous convolution layer, to deal with defocus blur with a slightly varying shape. We demonstrate that our approach achieves state-of-the-art performance with a much smaller number of parameters than previous methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Son_Single_Image_Defocus_Deblurring_Using_Kernel-Sharing_Parallel_Atrous_Convolutions_ICCV_2021_paper.pdf", @@ -38495,7 +41095,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Pohang", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Son_2021_ICCV,\n \n author = {\n Son,\n Hyeongseok and Lee,\n Junyong and Cho,\n Sunghyun and Lee,\n Seungyong\n},\n title = {\n Single Image Defocus Deblurring Using Kernel-Sharing Parallel Atrous Convolutions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2642-2650\n} \n}" }, { "title": "Single View Physical Distance Estimation Using Human Pose", @@ -38503,10 +41104,11 @@ "status": "Poster", "track": "main", "pid": 8374, + "author_site": "Xiaohan Fei; Henry Wang; Lin Lee Cheong; Xiangyu Zeng; Meng Wang; Joseph Tighe", "author": "Xiaohan Fei; 
Henry Wang; Lin Lee Cheong; Xiangyu Zeng; Meng Wang; Joseph Tighe", "abstract": "We propose a fully automated system that simultaneously estimates the camera intrinsics, the ground plane, and physical distances between people from a single RGB image or video captured by a camera viewing a 3-D scene from a fixed vantage point. To automate camera calibration and distance estimation, we leverage priors about human pose and develop a novel direct formulation for pose-based auto-calibration and distance estimation, which shows state-of-the-art performance on publicly available datasets. The proposed approach enables existing camera systems to measure physical distances without needing a dedicated calibration process or range sensors, and is applicable to a broad range of use cases such as social distancing and workplace safety. Furthermore, to enable evaluation and drive research in this area, we contribute to the publicly available MEVA dataset with additional distance annotations, resulting in \"MEVADA\" -- an evaluation benchmark for the pose-based auto-calibration and distance estimation problem.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fei_Single_View_Physical_Distance_Estimation_Using_Human_Pose_ICCV_2021_paper.pdf", - "aff": "Amazon Web Services; Amazon Web Services; Amazon Web Services; Amazon Web Services+\u2217; Amazon Web Services; Amazon Web Services", + "aff": "Amazon Web Services; Amazon Web Services; Amazon Web Services; Amazon Web Services+∗; Amazon Web Services; Amazon Web Services", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Fei_Single_View_Physical_ICCV_2021_supplemental.pdf", @@ -38519,14 +41121,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fei_Single_View_Physical_Distance_Estimation_Using_Human_Pose_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Amazon;", - "aff_unique_dep": "Amazon Web Services;", + 
"aff_unique_norm": "Amazon Web Services;", + "aff_unique_dep": ";", "aff_unique_url": "https://aws.amazon.com;", "aff_unique_abbr": "AWS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Fei_2021_ICCV,\n \n author = {\n Fei,\n Xiaohan and Wang,\n Henry and Cheong,\n Lin Lee and Zeng,\n Xiangyu and Wang,\n Meng and Tighe,\n Joseph\n},\n title = {\n Single View Physical Distance Estimation Using Human Pose\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12406-12416\n} \n}" }, { "title": "Single-Shot Hyperspectral-Depth Imaging With Learned Diffractive Optics", @@ -38534,6 +41137,7 @@ "status": "Poster", "track": "main", "pid": 7653, + "author_site": "Seung-Hwan Baek; Hayato Ikoma; Daniel S. Jeon; Yuqi Li; Wolfgang Heidrich; Gordon Wetzstein; Min H. Kim", "author": "Seung-Hwan Baek; Hayato Ikoma; Daniel S. Jeon; Yuqi Li; Wolfgang Heidrich; Gordon Wetzstein; Min H. Kim", "abstract": "Imaging depth and spectrum have been extensively studied in isolation from each other for decades. Recently, hyperspectral-depth (HS-D) imaging emerges to capture both information simultaneously by combining two different imaging systems; one for depth, the other for spectrum. While being accurate, this combinational approach induces increased form factor, cost, capture time, and alignment/registration problems. In this work, departing from the combinational principle, we propose a compact single-shot monocular HS-D imaging method. Our method uses a diffractive optical element (DOE), the point spread function of which changes with respect to both depth and spectrum. This enables us to reconstruct spectrum and depth from a single captured image. 
To this end, we develop a differentiable simulator and a neural-network-based reconstruction method that are jointly optimized via automatic differentiation. To facilitate learning the DOE, we present a first HS-D dataset by building a benchtop HS-D imager that acquires high-quality ground truth. We evaluate our method with synthetic and real experiments by building an experimental prototype and achieve state-of-the-art HS-D imaging results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Baek_Single-Shot_Hyperspectral-Depth_Imaging_With_Learned_Diffractive_Optics_ICCV_2021_paper.pdf", @@ -38557,7 +41161,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;2;0;1;0", - "aff_country_unique": "South Korea;United States;Saudi Arabia" + "aff_country_unique": "South Korea;United States;Saudi Arabia", + "bibtex": "@InProceedings{Baek_2021_ICCV,\n \n author = {\n Baek,\n Seung-Hwan and Ikoma,\n Hayato and Jeon,\n Daniel S. and Li,\n Yuqi and Heidrich,\n Wolfgang and Wetzstein,\n Gordon and Kim,\n Min H.\n},\n title = {\n Single-Shot Hyperspectral-Depth Imaging With Learned Diffractive Optics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2651-2660\n} \n}" }, { "title": "Skeleton Cloud Colorization for Unsupervised 3D Action Representation Learning", @@ -38565,6 +41170,7 @@ "status": "Poster", "track": "main", "pid": 3352, + "author_site": "Siyuan Yang; Jun Liu; Shijian Lu; Meng Hwa Er; Alex C. Kot", "author": "Siyuan Yang; Jun Liu; Shijian Lu; Meng Hwa Er; Alex C. Kot", "abstract": "Skeleton-based human action recognition has attracted increasing attention in recent years. However, most of the existing works focus on supervised learning which requiring a large number of annotated action sequences that are often expensive to collect. 
We investigate unsupervised representation learning for skeleton action recognition, and design a novel skeleton cloud colorization technique that is capable of learning skeleton representations from unlabeled skeleton sequence data. Specifically, we represent a skeleton action sequence as a 3D skeleton cloud and colorize each point in the cloud according to its temporal and spatial orders in the original (unannotated) skeleton sequence. Leveraging the colorized skeleton point cloud, we design an auto-encoder framework that can learn spatial-temporal features from the artificial color labels of skeleton joints effectively. We evaluate our skeleton cloud colorization approach with action classifiers trained under different configurations, including unsupervised, semi-supervised and fully-supervised settings. Extensive experiments on NTU RGB+D and NW-UCLA datasets show that the proposed method outperforms existing unsupervised and semi-supervised 3D action recognition methods by large margins, and it achieves competitive performance in supervised 3D action recognition as well.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Skeleton_Cloud_Colorization_for_Unsupervised_3D_Action_Representation_Learning_ICCV_2021_paper.pdf", @@ -38588,7 +41194,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Siyuan and Liu,\n Jun and Lu,\n Shijian and Er,\n Meng Hwa and Kot,\n Alex C.\n},\n title = {\n Skeleton Cloud Colorization for Unsupervised 3D Action Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13423-13433\n} \n}" }, { "title": "Skeleton2Mesh: Kinematics Prior Injected Unsupervised Human Mesh Recovery", @@ -38596,6 +41203,7 
@@ "status": "Poster", "track": "main", "pid": 3645, + "author_site": "Zhenbo Yu; Junjie Wang; Jingwei Xu; Bingbing Ni; Chenglong Zhao; Minsi Wang; Wenjun Zhang", "author": "Zhenbo Yu; Junjie Wang; Jingwei Xu; Bingbing Ni; Chenglong Zhao; Minsi Wang; Wenjun Zhang", "abstract": "In this paper, we decouple unsupervised human mesh recovery into the well-studied problems of unsupervised 3D pose estimation, and human mesh recovery from estimated 3D skeletons, focusing on the latter task. The challenges of the latter task are two folds: (1) pose failure (i.e., pose mismatching -- different skeleton definitions in dataset and SMPL , and pose ambiguity -- endpoints have arbitrary joint angle configurations for the same 3D joint coordinates). (2) shape ambiguity (i.e., the lack of shape constraints on body configuration). To address these issues, we propose Skeleton2Mesh, a novel lightweight framework that recovers human mesh from a single image. Our Skeleton2Mesh contains three modules, i.e., Differentiable Inverse Kinematics (DIK), Pose Refinement (PR) and Shape Refinement (SR) modules. DIK is designed to transfer 3D rotation from estimated 3D skeletons, which relies on a minimal set of kinematics prior knowledge. Then PR and SR modules are utilized to tackle the pose ambiguity and shape ambiguity respectively. All three modules can be incorporated into Skeleton2Mesh seamlessly via an end-to-end manner. Furthermore, we utilize an adaptive joint regressor to alleviate the effects of skeletal topology from different datasets. Results on the Human3.6M dataset for human mesh recovery demonstrate that our method improves upon the previous unsupervised methods by 32.6% under the same setting. Qualitative results on in-the-wild datasets exhibit that the recovered 3D meshes are natural, realistic. 
Our project is available at https://sites.google.com/view/skeleton2mesh.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Skeleton2Mesh_Kinematics_Prior_Injected_Unsupervised_Human_Mesh_Recovery_ICCV_2021_paper.pdf", @@ -38619,7 +41227,8 @@ "aff_campus_unique_index": ";;;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Zhenbo and Wang,\n Junjie and Xu,\n Jingwei and Ni,\n Bingbing and Zhao,\n Chenglong and Wang,\n Minsi and Zhang,\n Wenjun\n},\n title = {\n Skeleton2Mesh: Kinematics Prior Injected Unsupervised Human Mesh Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8619-8629\n} \n}" }, { "title": "Sketch Your Own GAN", @@ -38627,6 +41236,7 @@ "status": "Poster", "track": "main", "pid": 1622, + "author_site": "Sheng-Yu Wang; David Bau; Jun-Yan Zhu", "author": "Sheng-Yu Wang; David Bau; Jun-Yan Zhu", "abstract": "Can a user create a deep generative model by sketching a single example? Traditionally, creating a GAN model has required the collection of a large-scale dataset of exemplars and specialized knowledge in deep learning. In contrast, sketching is possibly the most universally accessible way to convey a visual concept. In this work, we present a method, GAN Sketching, for rewriting GANs with one or more sketches, to make GANs training easier for novice users. In particular, we change the weights of an original GAN model according to user sketches. We encourage the model's output to match the user sketches through a cross-domain adversarial loss. Furthermore, we explore different regularization methods to preserve the original model's diversity and image quality. 
Experiments have shown that our method can mold GANs to match shapes and poses specified by sketches while maintaining realism and diversity. Finally, we demonstrate a few applications of the resulting GAN, including latent space interpolation and image editing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Sketch_Your_Own_GAN_ICCV_2021_paper.pdf", @@ -38641,7 +41251,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Sketch_Your_Own_GAN_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Sketch_Your_Own_GAN_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Sheng-Yu and Bau,\n David and Zhu,\n Jun-Yan\n},\n title = {\n Sketch Your Own GAN\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14050-14060\n} \n}" }, { "title": "Sketch2Mesh: Reconstructing and Editing 3D Shapes From Sketches", @@ -38649,6 +41260,7 @@ "status": "Poster", "track": "main", "pid": 10402, + "author_site": "Benoit Guillard; Edoardo Remelli; Pierre Yvernay; Pascal Fua", "author": "Benoit Guillard; Edoardo Remelli; Pierre Yvernay; Pascal Fua", "abstract": "Reconstructing 3D shape from 2D sketches has long been an open problem because the sketches only provide very sparse and ambiguous information. In this paper, we use an encoder/decoder architecture for the sketch to mesh translation. When integrated into a user interface that provides camera parameters for the sketches, this enables us to leverage its latent parametrization to represent and refine a 3D mesh so that its projections match the external contours outlined in the sketch. We will show that this approach is easy to deploy, robust to style changes, and effective. Furthermore, it can be used for shape refinement given only single pen strokes. 
We compare our approach to state-of-the-art methods on sketches - both hand-drawn and synthesized - and demonstrate that we outperform them.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Guillard_Sketch2Mesh_Reconstructing_and_Editing_3D_Shapes_From_Sketches_ICCV_2021_paper.pdf", @@ -38665,14 +41277,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Guillard_Sketch2Mesh_Reconstructing_and_Editing_3D_Shapes_From_Sketches_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne", "aff_unique_dep": "CVLab", "aff_unique_url": "https://cvlab.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Guillard_2021_ICCV,\n \n author = {\n Guillard,\n Benoit and Remelli,\n Edoardo and Yvernay,\n Pierre and Fua,\n Pascal\n},\n title = {\n Sketch2Mesh: Reconstructing and Editing 3D Shapes From Sketches\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13023-13032\n} \n}" }, { "title": "SketchAA: Abstract Representation for Abstract Sketches", @@ -38680,10 +41293,11 @@ "status": "Poster", "track": "main", "pid": 2719, + "author_site": "Lan Yang; Kaiyue Pang; Honggang Zhang; Yi-Zhe Song", "author": "Lan Yang; Kaiyue Pang; Honggang Zhang; Yi-Zhe Song", "abstract": "What makes free-hand sketches appealing for humans lies with its capability as a universal tool to depict the visual world. Such flexibility at human ease, however, introduces abstract renderings that pose unique challenges to computer vision models. In this paper, we propose a purpose-made sketch representation for human sketches. 
The key intuition is that such representation should be abstract at design, so to accommodate the abstract nature of sketches. This is achieved by interpreting sketch abstraction on two levels: appearance and structure. We abstract sketch structure as a pre-defined coarse-to-fine visual block hierarchy, and average visual features within each block to model appearance abstraction. We then discuss three general strategies on how to exploit feature synergy across different levels of this abstraction hierarchy. The superiority of explicitly abstracting sketch representation is empirically validated on a number of sketch analysis tasks, including sketch recognition, fine-grained sketch-based image retrieval, and generative sketch healing. Our simple design not only yields strong results on all said tasks, but also offers intuitive feature granularity control to tailor for various downstream tasks. Code will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_SketchAA_Abstract_Representation_for_Abstract_Sketches_ICCV_2021_paper.pdf", - "aff": "PRIS, School of Arti\ufb01cial Intelligence, Beijing University of Posts and Telecommunications, China + SketchX, CVSSP, University of Surrey, United Kingdom; SketchX, CVSSP, University of Surrey, United Kingdom; PRIS, School of Arti\ufb01cial Intelligence, Beijing University of Posts and Telecommunications, China; SketchX, CVSSP, University of Surrey, United Kingdom", + "aff": "PRIS, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, China + SketchX, CVSSP, University of Surrey, United Kingdom; SketchX, CVSSP, University of Surrey, United Kingdom; PRIS, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, China; SketchX, CVSSP, University of Surrey, United Kingdom", "project": "", "github": "", "supp": "", @@ -38697,13 +41311,14 @@ "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Yang_SketchAA_Abstract_Representation_for_Abstract_Sketches_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;0;1", "aff_unique_norm": "Beijing University of Posts and Telecommunications;University of Surrey", - "aff_unique_dep": "School of Arti\ufb01cial Intelligence;CVSSP", - "aff_unique_url": ";https://www.surrey.ac.uk", + "aff_unique_dep": "School of Artificial Intelligence;CVSSP", + "aff_unique_url": "http://www.bupt.edu.cn/;https://www.surrey.ac.uk", "aff_unique_abbr": "BUPT;Surrey", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Lan and Pang,\n Kaiyue and Zhang,\n Honggang and Song,\n Yi-Zhe\n},\n title = {\n SketchAA: Abstract Representation for Abstract Sketches\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10097-10106\n} \n}" }, { "title": "SketchLattice: Latticed Representation for Sketch Manipulation", @@ -38711,6 +41326,7 @@ "status": "Poster", "track": "main", "pid": 5434, + "author_site": "Yonggang Qi; Guoyao Su; Pinaki Nath Chowdhury; Mingkang Li; Yi-Zhe Song", "author": "Yonggang Qi; Guoyao Su; Pinaki Nath Chowdhury; Mingkang Li; Yi-Zhe Song", "abstract": "The key challenge in designing a sketch representation lies with handling the abstract and iconic nature of sketches. Existing work predominantly utilizes either, (i) a pixelative format that treats sketches as natural images employing off-the-shelf CNN-based networks, or (ii) an elaborately designed vector format that leverages the structural information of drawing orders using sequential RNN-based methods. 
While the pixelative format lacks intuitive exploitation of structural cues, sketches in vector format are absent in most cases limiting their practical usage. Hence, in this paper, we propose a lattice structured sketch representation that not only removes the bottleneck of requiring vector data but also preserves the structural cues that vector data provides. Essentially, sketch lattice is a set of points sampled from the pixelative format of the sketch using a lattice graph. We show that our lattice structure is particularly amenable to structural changes that largely benefits sketch abstraction modeling for generation tasks. Our lattice representation could be effectively encoded using a graph model, that uses significantly fewer model parameters (13.5 times lesser) than existing state-of-the-art. Extensive experiments demonstrate the effectiveness of sketch lattice for sketch manipulation, including sketch healing and image-to-sketch synthesis.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qi_SketchLattice_Latticed_Representation_for_Sketch_Manipulation_ICCV_2021_paper.pdf", @@ -38734,7 +41350,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Qi_2021_ICCV,\n \n author = {\n Qi,\n Yonggang and Su,\n Guoyao and Chowdhury,\n Pinaki Nath and Li,\n Mingkang and Song,\n Yi-Zhe\n},\n title = {\n SketchLattice: Latticed Representation for Sketch Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 953-961\n} \n}" }, { "title": "SmartShadow: Artistic Shadow Drawing Tool for Line Drawings", @@ -38742,6 +41359,7 @@ "status": "Poster", "track": "main", "pid": 1791, + "author_site": "Lvmin Zhang; Jinyue Jiang; Yi Ji; Chunping Liu", "author": "Lvmin Zhang; 
Jinyue Jiang; Yi Ji; Chunping Liu", "abstract": "SmartShadow is a deep learning application for digital painting artists to draw shadows on line drawings, with three proposed tools. (1) Shadow brush: artists can draw scribbles to coarsely indicate the areas inside or outside their wanted shadows, and the application will generate the shadows in real-time. (2) Shadow boundary brush: this brush can precisely control the boundary of any specific shadow. (3) Global shadow generator: this tool can estimate the global shadow direction from input brush scribbles, and then consistently propagate local shadows to the entire image. These three tools can not only speed up the shadow drawing process (by 3.1 times as experiments validate), but also allow for the flexibility to achieve various shadow effects and facilitate richer artistic creations. To this end, we train Convolutional Neural Networks (CNNs) with a collected large-scale dataset of both real and synthesized data, and especially, we collect 1670 shadow samples drawn by real artists. Both qualitative analysis and user study show that our approach can generate high-quality shadows that are practically usable in the daily works of digital painting artists. 
We present 30 additional results and 15 visual comparisons in the supplementary materiel.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_SmartShadow_Artistic_Shadow_Drawing_Tool_for_Line_Drawings_ICCV_2021_paper.pdf", @@ -38765,7 +41383,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Lvmin and Jiang,\n Jinyue and Ji,\n Yi and Liu,\n Chunping\n},\n title = {\n SmartShadow: Artistic Shadow Drawing Tool for Line Drawings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5391-5400\n} \n}" }, { "title": "SnowflakeNet: Point Cloud Completion by Snowflake Point Deconvolution With Skip-Transformer", @@ -38773,6 +41392,7 @@ "status": "Poster", "track": "main", "pid": 2548, + "author_site": "Peng Xiang; Xin Wen; Yu-Shen Liu; Yan-Pei Cao; Pengfei Wan; Wen Zheng; Zhizhong Han", "author": "Peng Xiang; Xin Wen; Yu-Shen Liu; Yan-Pei Cao; Pengfei Wan; Wen Zheng; Zhizhong Han", "abstract": "Point cloud completion aims to predict a complete shape in high accuracy from its partial observation. However, previous methods usually suffered from discrete nature of point cloud and unstructured prediction of points in local regions, which makes it hard to reveal fine local geometric details on the complete shape. To resolve this issue, we propose SnowflakeNet with Snowflake Point Deconvolution (SPD) to generate the complete point clouds. The SnowflakeNet models the generation of complete point clouds as the snowflake-like growth of points in 3D space, where the child points are progressively generated by splitting their parent points after each SPD. 
Our insight of revealing detailed geometry is to introduce skip-transformer in SPD to learn point splitting patterns which can fit local regions the best. Skip-transformer leverages attention mechanism to summarize the splitting patterns used in the previous SPD layer to produce the splitting in the current SPD layer. The locally compact and structured point cloud generated by SPD is able to precisely capture the structure characteristic of 3D shape in local patches, which enables the network to predict highly detailed geometries, such as smooth regions, sharp edges and corners. Our experimental results outperform the state-of-the-art point cloud completion methods under widely used benchmarks. Code will be available at https://github.com/AllenXiangX/SnowflakeNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiang_SnowflakeNet_Point_Cloud_Completion_by_Snowflake_Point_Deconvolution_With_Skip-Transformer_ICCV_2021_paper.pdf", @@ -38796,7 +41416,8 @@ "aff_campus_unique_index": "0;0+0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0+0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xiang_2021_ICCV,\n \n author = {\n Xiang,\n Peng and Wen,\n Xin and Liu,\n Yu-Shen and Cao,\n Yan-Pei and Wan,\n Pengfei and Zheng,\n Wen and Han,\n Zhizhong\n},\n title = {\n SnowflakeNet: Point Cloud Completion by Snowflake Point Deconvolution With Skip-Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5499-5509\n} \n}" }, { "title": "Social Fabric: Tubelet Compositions for Video Relation Detection", @@ -38804,6 +41425,7 @@ "status": "Poster", "track": "main", "pid": 3868, + "author_site": "Shuo Chen; Zenglin Shi; Pascal Mettes; Cees G. M. Snoek", "author": "Shuo Chen; Zenglin Shi; Pascal Mettes; Cees G. M. 
Snoek", "abstract": "This paper strives to classify and detect the relationship between object tubelets appearing within a video as a triplet. Where existing works treat object proposals or tubelets as single entities and model their relations a posteriori, we propose to classify and detect predicates for pairs of object tubelets a priori. We also propose Social Fabric: an encoding that represents a pair of object tubelets as a composition of interaction primitives. These primitives are learned over all relations, resulting in a compact representation able to localize and classify relations from the pool of co-occurring object tubelets across all timespans in a video. The encoding enables our two-stage network. In the first stage, we train Social Fabric to suggest proposals that are likely interacting. We use the Social Fabric in the second stage to simultaneously fine-tune and predict predicate labels for the tubelets. Experiments demonstrate the benefit of early video relation modeling, our encoding and the two-stage architecture, leading to a new state-of-the-art on two benchmarks. We also show how the encoding enables query-by-primitive-example to search for spatio-temporal video relations. Code: https://github.com/shanshuo/Social-Fabric.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Social_Fabric_Tubelet_Compositions_for_Video_Relation_Detection_ICCV_2021_paper.pdf", @@ -38818,7 +41440,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Social_Fabric_Tubelet_Compositions_for_Video_Relation_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Social_Fabric_Tubelet_Compositions_for_Video_Relation_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Shuo and Shi,\n Zenglin and Mettes,\n Pascal and Snoek,\n Cees G. 
M.\n},\n title = {\n Social Fabric: Tubelet Compositions for Video Relation Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13485-13494\n} \n}" }, { "title": "Social NCE: Contrastive Learning of Socially-Aware Motion Representations", @@ -38826,10 +41449,11 @@ "status": "Poster", "track": "main", "pid": 1039, + "author_site": "Yuejiang Liu; Qi Yan; Alexandre Alahi", "author": "Yuejiang Liu; Qi Yan; Alexandre Alahi", "abstract": "Learning socially-aware motion representations is at the core of recent advances in multi-agent problems, such as human motion forecasting and robot navigation in crowds. Despite promising progress, existing representations learned with neural networks still struggle to generalize in closed-loop predictions (e.g., output colliding trajectories). This issue largely arises from the non-i.i.d. nature of sequential prediction in conjunction with ill-distributed training data. Intuitively, if the training data only comes from human behaviors in safe spaces, i.e., from \"positive\" examples, it is difficult for learning algorithms to capture the notion of \"negative\" examples like collisions. In this work, we aim to address this issue by explicitly modeling negative examples through self-supervision: (i) we introduce a social contrastive loss that regularizes the extracted motion representation by discerning the ground-truth positive events from synthetic negative ones; (ii) we construct informative negative samples based on our prior knowledge of rare but dangerous circumstances. Our method substantially reduces the collision rates of recent trajectory forecasting, behavioral cloning and reinforcement learning algorithms, outperforming state-of-the-art methods on several benchmarks. 
Our code is available at https://github.com/vita-epfl/social-nce.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Social_NCE_Contrastive_Learning_of_Socially-Aware_Motion_Representations_ICCV_2021_paper.pdf", - "aff": "\u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL); \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL); \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne (EPFL)", + "aff": "´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL); ´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL); ´Ecole Polytechnique F ´ed´erale de Lausanne (EPFL)", "project": "", "github": "https://github.com/vita-epfl/social-nce", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Liu_Social_NCE_Contrastive_ICCV_2021_supplemental.pdf", @@ -38842,14 +41466,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Social_NCE_Contrastive_Learning_of_Socially-Aware_Motion_Representations_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "Ecole Polytechnique Fédérale de Lausanne", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yuejiang and Yan,\n Qi and Alahi,\n Alexandre\n},\n title = {\n Social NCE: Contrastive Learning of Socially-Aware Motion Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15118-15129\n} \n}" }, { "title": "Solving Inefficiency of Self-Supervised Representation Learning", @@ -38857,6 +41482,7 @@ "status": "Poster", "track": "main", "pid": 9739, + "author_site": "Guangrun Wang; Keze Wang; 
Guangcong Wang; Philip H.S. Torr; Liang Lin", "author": "Guangrun Wang; Keze Wang; Guangcong Wang; Philip H.S. Torr; Liang Lin", "abstract": "Self-supervised learning (especially contrastive learning) has attracted great interest due to its huge potential in learning discriminative representations in an unsupervised manner. Despite the acknowledged successes, existing contrastive learning methods suffer from very low learning efficiency, e.g., taking about ten times more training epochs than supervised learning for comparable recognition accuracy. In this paper, we reveal two contradictory phenomena in contrastive learning that we call under-clustering and over-clustering problems, which are major obstacles to learning efficiency. Under-clustering means that the model cannot efficiently learn to discover the dissimilarity between inter-class samples when the negative sample pairs for contrastive learning are insufficient to differentiate all the actual object classes. Over-clustering implies that the model cannot efficiently learn features from excessive negative sample pairs, forcing the model to over-cluster samples of the same actual classes into different clusters. To simultaneously overcome these two problems, we propose a novel self-supervised learning framework using a truncated triplet loss. Precisely, we employ a triplet loss tending to maximize the relative distance between the positive pair and negative pairs to address the under-clustering problem; and we construct the negative pair by selecting a negative sample deputy from all negative samples to avoid the over-clustering problem, guaranteed by the Bernoulli Distribution model. We extensively evaluate our framework in several large-scale benchmarks (e.g., ImageNet, SYSU-30k, and COCO). The results demonstrate our model's superiority (e.g., the learning efficiency) over the latest state-of-the-art methods by a clear margin. 
See Codes at: https://github.com/wanggrun/triplet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Solving_Inefficiency_of_Self-Supervised_Representation_Learning_ICCV_2021_paper.pdf", @@ -38880,7 +41506,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2;3;1;0", - "aff_country_unique": "China;United Kingdom;United States;Singapore" + "aff_country_unique": "China;United Kingdom;United States;Singapore", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Guangrun and Wang,\n Keze and Wang,\n Guangcong and Torr,\n Philip H.S. and Lin,\n Liang\n},\n title = {\n Solving Inefficiency of Self-Supervised Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9505-9515\n} \n}" }, { "title": "Space-Time Crop & Attend: Improving Cross-Modal Video Representation Learning", @@ -38888,7 +41515,8 @@ "status": "Poster", "track": "main", "pid": 7813, - "author": "Mandela Patrick; Po-Yao Huang; Ishan Misra; Florian Metze; Andrea Vedaldi; Yuki M. Asano; Jo\u00e3o F. Henriques", + "author_site": "Mandela Patrick; Po-Yao Huang; Ishan Misra; Florian Metze; Andrea Vedaldi; Yuki M. Asano; João F. Henriques", + "author": "Mandela Patrick; Po-Yao Huang; Ishan Misra; Florian Metze; Andrea Vedaldi; Yuki M. Asano; João F. Henriques", "abstract": "The quality of the image representations obtained from self-supervised learning depends strongly on the type of data augmentations used in the learning formulation. Recent papers have ported these methods from still images to videos and found that leveraging both audio and video signals yields strong gains; however, they did not find that spatial augmentations such as cropping, which are very important for still images, work as well for videos. 
In this paper, we improve these formulations in two ways unique to the spatio-temporal aspect of videos. First, for space, we show that spatial augmentations such as cropping do work well for videos too, but that previous implementations, due to the high processing and memory cost, could not do this at a scale sufficient for it to work well. To address this issue, we first introduce Feature Crop, a method to simulate such augmentations much more efficiently directly in feature space. Second, we show that as opposed to naive average pooling, the use of transformer-based attention improves performance significantly, and is well suited for processing feature crops. Combining both of our discoveries into a new method, Space-time Crop & Attend (STiCA) we achieve state-of-the-art performance across multiple video-representation learning benchmarks. In particular, we achieve new state-of-the-art accuracies of 67.0% on HMDB-51 and 93.1% on UCF-101 when pre-training on Kinetics-400.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Patrick_Space-Time_Crop__Attend_Improving_Cross-Modal_Video_Representation_Learning_ICCV_2021_paper.pdf", "aff": "Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research; Oxford University; Oxford University", @@ -38904,14 +41532,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Patrick_Space-Time_Crop__Attend_Improving_Cross-Modal_Video_Representation_Learning_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1;1", - "aff_unique_norm": "Meta;University of Oxford", + "aff_unique_norm": "Facebook;University of Oxford", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.ox.ac.uk", "aff_unique_abbr": "FAIR;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": 
"United States;United Kingdom", + "bibtex": "@InProceedings{Patrick_2021_ICCV,\n \n author = {\n Patrick,\n Mandela and Huang,\n Po-Yao and Misra,\n Ishan and Metze,\n Florian and Vedaldi,\n Andrea and Asano,\n Yuki M. and Henriques,\n Jo\\~ao F.\n},\n title = {\n Space-Time Crop \\& Attend: Improving Cross-Modal Video Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10560-10572\n} \n}" }, { "title": "Space-Time-Separable Graph Convolutional Network for Pose Forecasting", @@ -38919,6 +41548,7 @@ "status": "Poster", "track": "main", "pid": 7726, + "author_site": "Theodoros Sofianos; Alessio Sampieri; Luca Franco; Fabio Galasso", "author": "Theodoros Sofianos; Alessio Sampieri; Luca Franco; Fabio Galasso", "abstract": "Human pose forecasting is a complex structured-data sequence-modelling task, which has received increasing attention, also due to numerous potential applications. Research has mainly addressed the temporal dimension as time series and the interaction of human body joints with a kinematic tree or by a graph. This has decoupled the two aspects and leveraged progress from the relevant fields, but it has also limited the understanding of the complex structural joint spatio-temporal dynamics of the human pose. Here we propose a novel Space-Time-Separable Graph Convolutional Network (STS-GCN) for pose forecasting. For the first time, STS-GCN models the human pose dynamics only with a graph convolutional network (GCN), including the temporal evolution and the spatial joint interaction within a single-graph framework, which allows the cross-talk of motion and spatial correlations. 
Concurrently, STS-GCN is the first space-time-separable GCN: the space-time graph connectivity is factored into space and time affinity matrices, which bottlenecks the space-time cross-talk, while enabling full joint-joint and time-time correlations. Both affinity matrices are learnt end-to-end, which results in connections substantially deviating from the standard kinematic tree and the linear-time time series. In experimental evaluation on three complex, recent and large-scale benchmarks, Human3.6M [Ionescu et al. TPAMI'14], AMASS [Mahmood et al. ICCV'19] and 3DPW [Von Marcard et al. ECCV'18], STS-GCN outperforms the state-of-the-art, surpassing the current best technique [Mao et al. ECCV'20] by over 32% in average at the most difficult long-term predictions, while only requiring 1.7% of its parameters. We explain the results qualitatively and illustrate the graph interactions by the factored joint-joint and time-time learnt graph connections. Our source code is available at https://github.com/FraLuca/STSGCN", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sofianos_Space-Time-Separable_Graph_Convolutional_Network_for_Pose_Forecasting_ICCV_2021_paper.pdf", @@ -38942,7 +41572,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Rome", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Sofianos_2021_ICCV,\n \n author = {\n Sofianos,\n Theodoros and Sampieri,\n Alessio and Franco,\n Luca and Galasso,\n Fabio\n},\n title = {\n Space-Time-Separable Graph Convolutional Network for Pose Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11209-11218\n} \n}" }, { "title": "Sparse Needlets for Lighting Estimation With Spherical Transport Loss", @@ -38950,6 +41581,7 @@ "status": "Poster", "track": "main", "pid": 2697, + "author_site": 
"Fangneng Zhan; Changgong Zhang; Wenbo Hu; Shijian Lu; Feiying Ma; Xuansong Xie; Ling Shao", "author": "Fangneng Zhan; Changgong Zhang; Wenbo Hu; Shijian Lu; Feiying Ma; Xuansong Xie; Ling Shao", "abstract": "Accurate lighting estimation is challenging yet critical to many computer vision and computer graphics tasks such as high-dynamic-range (HDR) relighting. Existing approaches model lighting in either frequency domain or spatial domain which is insufficient to represent the complex lighting conditions in scenes and tends to produce inaccurate estimation. This paper presents NeedleLight, a new lighting estimation model that represents illumination with needlets and allows lighting estimation in both frequency domain and spatial domain jointly. An optimal thresholding function is designed to achieve sparse needlets which trims redundant lighting parameters and demonstrates superior localization properties for illumination representation. In addition, a novel spherical transport loss is designed based on optimal transport theory which guides to regress lighting representation parameters with consideration of the spatial information. Furthermore, we propose a new metric that is concise yet effective by directly evaluating the estimated illumination maps rather than rendered images. 
Extensive experiments show that NeedleLight achieves superior lighting estimation consistently across multiple evaluation metrics as compared with state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhan_Sparse_Needlets_for_Lighting_Estimation_With_Spherical_Transport_Loss_ICCV_2021_paper.pdf", @@ -38964,7 +41596,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhan_Sparse_Needlets_for_Lighting_Estimation_With_Spherical_Transport_Loss_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhan_Sparse_Needlets_for_Lighting_Estimation_With_Spherical_Transport_Loss_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhan_2021_ICCV,\n \n author = {\n Zhan,\n Fangneng and Zhang,\n Changgong and Hu,\n Wenbo and Lu,\n Shijian and Ma,\n Feiying and Xie,\n Xuansong and Shao,\n Ling\n},\n title = {\n Sparse Needlets for Lighting Estimation With Spherical Transport Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12830-12839\n} \n}" }, { "title": "Sparse-Shot Learning With Exclusive Cross-Entropy for Extremely Many Localisations", @@ -38972,6 +41605,7 @@ "status": "Poster", "track": "main", "pid": 2176, + "author_site": "Andreas Panteli; Jonas Teuwen; Hugo Horlings; Efstratios Gavves", "author": "Andreas Panteli; Jonas Teuwen; Hugo Horlings; Efstratios Gavves", "abstract": "Object localisation, in the context of regular images, often depicts objects like people or cars. In these images, there is typically a relatively small number of objects per class, which usually is manageable to annotate. However, outside the setting of regular images, we are often confronted with a different situation. 
In computational pathology, digitised tissue sections are extremely large images, whose dimensions quickly exceed 250'000x250'000 pixels, where relevant objects, such as tumour cells or lymphocytes can quickly number in the millions. Annotating them all is practically impossible and annotating sparsely a few, out of many more, is the only possibility. Unfortunately, learning from sparse annotations, or sparse-shot learning, clashes with standard supervised learning because what is not annotated is treated as a negative. However, assigning negative labels to what are true positives leads to confusion in the gradients and biased learning. To this end, we present exclusive cross-entropy, which slows down the biased learning by examining the second-order loss derivatives in order to drop the loss terms corresponding to likely biased terms. Experiments on nine datasets and two different localisation tasks, detection with YOLLO and segmentation with Unet, show that we obtain considerable improvements compared to cross-entropy or focal loss, while often reaching the best possible performance for the model with only 10-40% of annotations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Panteli_Sparse-Shot_Learning_With_Exclusive_Cross-Entropy_for_Extremely_Many_Localisations_ICCV_2021_paper.pdf", @@ -38995,7 +41629,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0+0;0;0+1", - "aff_country_unique": "Netherlands;Unknown" + "aff_country_unique": "Netherlands;Unknown", + "bibtex": "@InProceedings{Panteli_2021_ICCV,\n \n author = {\n Panteli,\n Andreas and Teuwen,\n Jonas and Horlings,\n Hugo and Gavves,\n Efstratios\n},\n title = {\n Sparse-Shot Learning With Exclusive Cross-Entropy for Extremely Many Localisations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2813-2823\n} \n}" }, { "title": 
"Sparse-to-Dense Feature Matching: Intra and Inter Domain Cross-Modal Learning in Domain Adaptation for 3D Semantic Segmentation", @@ -39003,6 +41638,7 @@ "status": "Poster", "track": "main", "pid": 7397, + "author_site": "Duo Peng; Yinjie Lei; Wen Li; Pingping Zhang; Yulan Guo", "author": "Duo Peng; Yinjie Lei; Wen Li; Pingping Zhang; Yulan Guo", "abstract": "Domain adaptation is critical for success when confronting with the lack of annotations in a new domain. As the huge time consumption of labeling process on 3D point cloud, domain adaptation for 3D semantic segmentation is of great expectation. With the rise of multi-modal datasets, large amount of 2D images are accessible besides 3D point clouds. In light of this, we propose to further leverage 2D data for 3D domain adaptation by intra and inter domain cross modal learning. As for intra-domain cross modal learning, most existing works sample the dense 2D pixel-wise features into the same size with sparse 3D point-wise features, resulting in the abandon of numerous useful 2D features. To address this problem, we propose Dynamic sparse-to-dense Cross Modal Learning (DsCML) to increase the sufficiency of multi-modality information interaction for domain adaptation. For inter-domain cross modal learning, we further advance Cross Modal Adversarial Learning (CMAL) on 2D and 3D data which contains different semantic content aiming to promote high-level modal complementarity. 
We evaluate our model under various multi-modality domain adaptation settings including day-to-night, country-to-country and dataset-to-dataset, brings large improvements over both uni-modal and multi-modal domain adaptation methods on all settings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Peng_Sparse-to-Dense_Feature_Matching_Intra_and_Inter_Domain_Cross-Modal_Learning_in_ICCV_2021_paper.pdf", @@ -39026,7 +41662,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2021_ICCV,\n \n author = {\n Peng,\n Duo and Lei,\n Yinjie and Li,\n Wen and Zhang,\n Pingping and Guo,\n Yulan\n},\n title = {\n Sparse-to-Dense Feature Matching: Intra and Inter Domain Cross-Modal Learning in Domain Adaptation for 3D Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7108-7117\n} \n}" }, { "title": "Spatial Uncertainty-Aware Semi-Supervised Crowd Counting", @@ -39034,6 +41671,7 @@ "status": "Poster", "track": "main", "pid": 1834, + "author_site": "Yanda Meng; Hongrun Zhang; Yitian Zhao; Xiaoyun Yang; Xuesheng Qian; Xiaowei Huang; Yalin Zheng", "author": "Yanda Meng; Hongrun Zhang; Yitian Zhao; Xiaoyun Yang; Xuesheng Qian; Xiaowei Huang; Yalin Zheng", "abstract": "Semi-supervised approaches for crowd counting attract attention, as the fully supervised paradigm is expensive and laborious due to its request for a large number of images of dense crowd scenarios and their annotations. This paper proposes a spatial uncertainty-aware semi-supervised approach via regularized surrogate task (binary segmentation) for crowd counting problems. 
Different from existing semi-supervised learning-based crowd counting methods, to exploit the unlabeled data, our proposed spatial uncertainty-aware teacher-student framework focuses on high confident regions' information while addressing the noisy supervision from the unlabeled data in an end-to-end manner. Specifically, we estimate the spatial uncertainty maps from the teacher model's surrogate task to guide the feature learning of the main task (density regression) and the surrogate task of the student model at the same time. Besides, we introduce a simple yet effective differential transformation layer to enforce the inherent spatial consistency regularization between the main task and the surrogate task in the student model, which helps the surrogate task to yield more reliable predictions and generates high-quality uncertainty maps. Thus, our model can also address the task-level perturbation problems that occur spatial inconsistency between the primary and surrogate tasks in the student model. Experimental results on four challenging crowd counting datasets demonstrate that our method achieves superior performance to the state-of-the-art semi-supervised methods. 
Code is available at : https://github.com/smallmax00/SUA_crowd_counting", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meng_Spatial_Uncertainty-Aware_Semi-Supervised_Crowd_Counting_ICCV_2021_paper.pdf", @@ -39054,10 +41692,11 @@ "aff_unique_dep": "Department of Eye and Vision Science;Biomedical Engineering;;", "aff_unique_url": "https://www.liverpool.ac.uk;;;", "aff_unique_abbr": "Liv Uni;;;", - "aff_campus_unique_index": "0;0;1;3;0;0", - "aff_campus_unique": "Liverpool;Ningbo;;Shanghai", + "aff_campus_unique_index": "0;0;1;0;0", + "aff_campus_unique": "Liverpool;Ningbo;", "aff_country_unique_index": "0;0;1;0;1;0;0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Meng_2021_ICCV,\n \n author = {\n Meng,\n Yanda and Zhang,\n Hongrun and Zhao,\n Yitian and Yang,\n Xiaoyun and Qian,\n Xuesheng and Huang,\n Xiaowei and Zheng,\n Yalin\n},\n title = {\n Spatial Uncertainty-Aware Semi-Supervised Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15549-15559\n} \n}" }, { "title": "Spatial and Semantic Consistency Regularizations for Pedestrian Attribute Recognition", @@ -39065,6 +41704,7 @@ "status": "Poster", "track": "main", "pid": 3989, + "author_site": "Jian Jia; Xiaotang Chen; Kaiqi Huang", "author": "Jian Jia; Xiaotang Chen; Kaiqi Huang", "abstract": "While recent studies on pedestrian attribute recognition have shown remarkable progress in leveraging complicated networks and attention mechanisms, most of them neglect the inter-image relations and an important prior: spatial consistency and semantic consistency of attributes under surveillance scenarios. 
The spatial locations of the same attribute should be consistent between different pedestrian images, e.g., the \"hat\" attribute and the \"boots\" attribute are always located at the top and bottom of the picture respectively. In addition, the inherent semantic feature of the \"hat\" attribute should be consistent, whether it is a baseball cap, beret, or helmet. To fully exploit inter-image relations and aggregate human prior in the model learning process, we construct a Spatial and Semantic Consistency (SSC) framework that consists of two complementary regularizations to achieve spatial and semantic consistency for each attribute. Specifically, we first propose a spatial consistency regularization to focus on reliable and stable attribute-related regions. Based on the precise attribute locations, we further propose a semantic consistency regularization to extract intrinsic and discriminative semantic features. We conduct extensive experiments on popular benchmarks including PA100K, RAP, and PETA. 
Results show that the proposed method performs favorably against state-of-the-art methods without increasing parameters.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jia_Spatial_and_Semantic_Consistency_Regularizations_for_Pedestrian_Attribute_Recognition_ICCV_2021_paper.pdf", @@ -39088,7 +41728,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jia_2021_ICCV,\n \n author = {\n Jia,\n Jian and Chen,\n Xiaotang and Huang,\n Kaiqi\n},\n title = {\n Spatial and Semantic Consistency Regularizations for Pedestrian Attribute Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 962-971\n} \n}" }, { "title": "Spatial-Temporal Consistency Network for Low-Latency Trajectory Forecasting", @@ -39096,6 +41737,7 @@ "status": "Poster", "track": "main", "pid": 6643, + "author_site": "Shijie Li; Yanying Zhou; Jinhui Yi; Juergen Gall", "author": "Shijie Li; Yanying Zhou; Jinhui Yi; Juergen Gall", "abstract": "Trajectory forecasting is a crucial step for autonomous vehicles and mobile robots in order to navigate and interact safely. In order to handle the spatial interactions between objects, graph-based approaches have been proposed. These methods, however, model motion on a frame-to-frame basis and do not provide a strong temporal model. To overcome this limitation, we propose a compact model called Spatial-Temporal Consistency Network (STC-Net). In STC-Net, dilated temporal convolutions are introduced to model long-range dependencies along each trajectory for better temporal modeling while graph convolutions are employed to model the spatial interaction among different trajectories. 
Furthermore, we propose a feature-wise convolution to generate the predicted trajectories in one pass and refine the forecast trajectories together with the reconstructed observed trajectories. We demonstrate that STC-Net generates spatially and temporally consistent trajectories and outperforms other graph-based methods. Since STC-Net requires only 0.7k parameters and forecasts the future with a latency of only 1.3ms, it advances the state-of-the-art and satisfies the requirements for realistic applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Spatial-Temporal_Consistency_Network_for_Low-Latency_Trajectory_Forecasting_ICCV_2021_paper.pdf", @@ -39110,7 +41752,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Spatial-Temporal_Consistency_Network_for_Low-Latency_Trajectory_Forecasting_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Spatial-Temporal_Consistency_Network_for_Low-Latency_Trajectory_Forecasting_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Shijie and Zhou,\n Yanying and Yi,\n Jinhui and Gall,\n Juergen\n},\n title = {\n Spatial-Temporal Consistency Network for Low-Latency Trajectory Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1940-1949\n} \n}" }, { "title": "Spatial-Temporal Transformer for Dynamic Scene Graph Generation", @@ -39118,6 +41761,7 @@ "status": "Poster", "track": "main", "pid": 1882, + "author_site": "Yuren Cong; Wentong Liao; Hanno Ackermann; Bodo Rosenhahn; Michael Ying Yang", "author": "Yuren Cong; Wentong Liao; Hanno Ackermann; Bodo Rosenhahn; Michael Ying Yang", "abstract": "Dynamic scene graph generation aims at generating a scene graph of the given video. 
Compared to the task of scene graph generation from images, it is more challenging because of the dynamic relationships between objects and the temporal dependencies between frames allowing for a richer semantic interpretation. In this paper, we propose Spatial-temporal Transformer (STTran), a neural network that consists of two core modules: (1) a spatial encoder that takes an input frame to extract spatial context and reason about the visual relationships within a frame, and (2) a temporal decoder which takes the output of the spatial encoder as input in order to capture the temporal dependencies between frames and infer the dynamic relationships. Furthermore, STTran is flexible to take varying lengths of videos as input without clipping, which is especially important for long videos. Our method is validated on the benchmark dataset Action Genome (AG). The experimental results demonstrate the superior performance of our method in terms of dynamic scene graphs. Moreover, a set of ablative studies is conducted and the effect of each proposed module is justified.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cong_Spatial-Temporal_Transformer_for_Dynamic_Scene_Graph_Generation_ICCV_2021_paper.pdf", @@ -39141,7 +41785,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "Germany;Netherlands" + "aff_country_unique": "Germany;Netherlands", + "bibtex": "@InProceedings{Cong_2021_ICCV,\n \n author = {\n Cong,\n Yuren and Liao,\n Wentong and Ackermann,\n Hanno and Rosenhahn,\n Bodo and Yang,\n Michael Ying\n},\n title = {\n Spatial-Temporal Transformer for Dynamic Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16372-16382\n} \n}" }, { "title": "Spatially Conditioned Graphs for Detecting Human-Object Interactions", @@ -39149,6 +41794,7 @@ 
"status": "Poster", "track": "main", "pid": 3553, + "author_site": "Frederic Z. Zhang; Dylan Campbell; Stephen Gould", "author": "Frederic Z. Zhang; Dylan Campbell; Stephen Gould", "abstract": "We address the problem of detecting human-object interactions in images using graphical neural networks. Unlike conventional methods, where nodes send scaled but otherwise identical messages to each of their neighbours, we propose to condition messages between pairs of nodes on their spatial relationships, resulting in different messages going to neighbours of the same node. To this end, we explore various ways of applying spatial conditioning under a multi-branch structure. Through extensive experimentation we demonstrate the advantages of spatial conditioning for the computation of the adjacency structure, messages and the refined graph features. In particular, we empirically show that as the quality of the bounding boxes increases, their coarse appearance features contribute relatively less to the disambiguation of interactions compared to the spatial information. Our method achieves an mAP of 31.33% on HICO-DET and 54.2% on V-COCO, significantly outperforming state-of-the-art on fine-tuned detections.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Spatially_Conditioned_Graphs_for_Detecting_Human-Object_Interactions_ICCV_2021_paper.pdf", @@ -39172,7 +41818,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1+0;0+0", - "aff_country_unique": "Australia;United Kingdom" + "aff_country_unique": "Australia;United Kingdom", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Frederic Z. 
and Campbell,\n Dylan and Gould,\n Stephen\n},\n title = {\n Spatially Conditioned Graphs for Detecting Human-Object Interactions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13319-13327\n} \n}" }, { "title": "Spatially-Adaptive Image Restoration Using Distortion-Guided Networks", @@ -39180,6 +41827,7 @@ "status": "Poster", "track": "main", "pid": 9231, + "author_site": "Kuldeep Purohit; Maitreya Suin; A. N. Rajagopalan; Vishnu Naresh Boddeti", "author": "Kuldeep Purohit; Maitreya Suin; A. N. Rajagopalan; Vishnu Naresh Boddeti", "abstract": "We present a general learning-based solution for restoring images suffering from spatially-varying degradations. Prior approaches are typically degradation-specific and employ the same processing across different images and different pixels within. However, we hypothesize that such spatially rigid processing is suboptimal for simultaneously restoring the degraded pixels as well as reconstructing the clean regions of the image. To overcome this limitation, we propose SPAIR, a network design that harnesses distortion-localization information and dynamically adjusts computation to difficult regions in the image. SPAIR comprises of two components, (1) a localization network that identifies degraded pixels, and (2) a restoration network that exploits knowledge from the localization network in filter and feature domain to selectively and adaptively restore degraded pixels. Our key idea is to exploit the non-uniformity of heavy degradations in spatial-domain and suitably embed this knowledge within distortion-guided modules performing sparse normalization, feature extraction and attention. Our architecture is agnostic to physical formation model and generalizes across several types of spatially-varying degradations. 
We demonstrate the efficacy of SPAIR individually on four restoration tasks- removal of rain-streaks, raindrops, shadows and motion blur. Extensive qualitative and quantitative comparisons with prior art on 11 benchmark datasets demonstrate that our degradation-agnostic network design offers significant performance gains over state-of-the-art degradation-specific architectures. Code available at https://github.com/human-analysis/spatially-adaptive-image-restoration.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Purohit_Spatially-Adaptive_Image_Restoration_Using_Distortion-Guided_Networks_ICCV_2021_paper.pdf", @@ -39203,7 +41851,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Madras", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Purohit_2021_ICCV,\n \n author = {\n Purohit,\n Kuldeep and Suin,\n Maitreya and Rajagopalan,\n A. N. and Boddeti,\n Vishnu Naresh\n},\n title = {\n Spatially-Adaptive Image Restoration Using Distortion-Guided Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2309-2319\n} \n}" }, { "title": "Spatio-Temporal Dynamic Inference Network for Group Activity Recognition", @@ -39211,6 +41860,7 @@ "status": "Poster", "track": "main", "pid": 4145, + "author_site": "Hangjie Yuan; Dong Ni; Mang Wang", "author": "Hangjie Yuan; Dong Ni; Mang Wang", "abstract": "Group activity recognition aims to understand the activity performed by a group of people. In order to solve it, modeling complex spatio-temporal interactions is the key. Previous methods are limited in reasoning on a predefined graph, which ignores the inherent person-specific interaction context. Moreover, they adopt inference schemes that are computationally expensive and easily result in the over-smoothing problem. 
In this paper, we manage to achieve spatio-temporal person-specific inferences by proposing Dynamic Inference Network (DIN), which composes of Dynamic Relation (DR) module and Dynamic Walk (DW) module. We firstly propose to initialize interaction fields on a primary spatio-temporal graph. Within each interaction field, we apply DR to predict the relation matrix and DW to predict the dynamic walk offsets in a joint-processing manner, thus forming a person-specific interaction graph. By updating features on the specific graph, a person can possess a global-level interaction field with a local initialization. Experiments indicate both modules' effectiveness. Moreover, DIN achieves significant improvement compared to previous state-of-the-art methods on two popular datasets under the same setting, while costing much less computation overhead of the reasoning module.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_Spatio-Temporal_Dynamic_Inference_Network_for_Group_Activity_Recognition_ICCV_2021_paper.pdf", @@ -39234,7 +41884,8 @@ "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Hangzhou;", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Hangjie and Ni,\n Dong and Wang,\n Mang\n},\n title = {\n Spatio-Temporal Dynamic Inference Network for Group Activity Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7476-7485\n} \n}" }, { "title": "Spatio-Temporal Representation Factorization for Video-Based Person Re-Identification", @@ -39242,6 +41893,7 @@ "status": "Poster", "track": "main", "pid": 1629, + "author_site": "Abhishek Aich; Meng Zheng; Srikrishna Karanam; Terrence Chen; Amit K. Roy-Chowdhury; Ziyan Wu", "author": "Abhishek Aich; Meng Zheng; Srikrishna Karanam; Terrence Chen; Amit K. 
Roy-Chowdhury; Ziyan Wu", "abstract": "Despite much recent progress in video-based person re-identification (re-ID), the current state-of-the-art still suffers from common real-world challenges such as appearance similarity among various people, occlusions, and frame misalignment. To alleviate these problems, we propose Spatio-Temporal Representation Factorization (STRF), a flexible new computational unit that can be used in conjunction with most existing 3D convolutional neural network architectures for re-ID. The key innovations of STRF over prior work include explicit pathways for learning discriminative temporal and spatial features, with each component further factorized to capture complementary person-specific appearance and motion information. Specifically, temporal factorization comprises two branches, one each for static features (e.g., the color of clothes) that do not change much over time, and dynamic features (e.g., walking patterns) that change over time. Further, spatial factorization also comprises two branches to learn both global (coarse segments) as well as local (finer segments) appearance features, with the local features particularly useful in cases of occlusion or spatial misalignment. These two factorization operations taken together result in a modular architecture for our parameter-wise light STRF unit that can be plugged in between any two 3D convolutional layers, resulting in an end-to-end learning framework. 
We empirically show that STRF improves performance of various existing baseline architectures while demonstrating new state-of-the-art results using standard person re-ID evaluation protocols on three benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Aich_Spatio-Temporal_Representation_Factorization_for_Video-Based_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -39265,7 +41917,8 @@ "aff_campus_unique_index": "0+1;0;0;0;1;0", "aff_campus_unique": "Cambridge;Riverside", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aich_2021_ICCV,\n \n author = {\n Aich,\n Abhishek and Zheng,\n Meng and Karanam,\n Srikrishna and Chen,\n Terrence and Roy-Chowdhury,\n Amit K. and Wu,\n Ziyan\n},\n title = {\n Spatio-Temporal Representation Factorization for Video-Based Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 152-162\n} \n}" }, { "title": "Spatio-Temporal Self-Supervised Representation Learning for 3D Point Clouds", @@ -39273,6 +41926,7 @@ "status": "Poster", "track": "main", "pid": 8366, + "author_site": "Siyuan Huang; Yichen Xie; Song-Chun Zhu; Yixin Zhu", "author": "Siyuan Huang; Yichen Xie; Song-Chun Zhu; Yixin Zhu", "abstract": "To date, various 3D scene understanding tasks still lack practical and generalizable pre-trained models, primarily due to the intricate nature of 3D scene understanding tasks and their immerse variations due to camera views, lighting, occlusions, etc. In this paper, we tackle this immanent challenge by introducing a spatio-temporal representation learning (STRL) framework, capable of learning from unlabeled 3D point clouds in a self-supervised fashion. 
Inspired by how infants learn from visual data in-the-wild, we explore the rich spatio-temporal cues derived from the 3D data. Specifically, STRL takes two temporal-correlated frames from a 3D point cloud sequence as the input, transforms it with spatial data augmentation, and learns the invariant representation self-supervisedly. To corroborate the efficacy of STRL, we conduct extensive experiments on synthetic, indoor, and outdoor datasets. Experimental results demonstrate that, compared with supervised learning methods, the learned self-supervised representation facilitates various models to attain comparable or even better performances while capable of generalizing pre-trained models to downstream tasks, including 3D shape classification, 3D object detection, and 3D semantic segmentation. Moreover, spatio-temporal contextual cues embedded in 3D point clouds significantly improve the learned representations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Spatio-Temporal_Self-Supervised_Representation_Learning_for_3D_Point_Clouds_ICCV_2021_paper.pdf", @@ -39287,7 +41941,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Spatio-Temporal_Self-Supervised_Representation_Learning_for_3D_Point_Clouds_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Spatio-Temporal_Self-Supervised_Representation_Learning_for_3D_Point_Clouds_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Siyuan and Xie,\n Yichen and Zhu,\n Song-Chun and Zhu,\n Yixin\n},\n title = {\n Spatio-Temporal Self-Supervised Representation Learning for 3D Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6535-6545\n} \n}" }, { "title": "Specialize and Fuse: Pyramidal Output Representation for Semantic 
Segmentation", @@ -39295,6 +41950,7 @@ "status": "Poster", "track": "main", "pid": 3602, + "author_site": "Chi-Wei Hsiao; Cheng Sun; Hwann-Tzong Chen; Min Sun", "author": "Chi-Wei Hsiao; Cheng Sun; Hwann-Tzong Chen; Min Sun", "abstract": "We present a novel pyramidal output representation to ensure parsimony with our \"specialize and fuse\" process for semantic segmentation. A pyramidal \"output\" representation consists of coarse-to-fine levels, where each level is \"specialize\" in a different class distribution (e.g., more stuff than things classes at coarser levels). Two types of pyramidal outputs (i.e., unity and semantic pyramid) are \"fused\" into the final semantic output, where the unity pyramid indicates unity-cells (i.e., all pixels in such cell share the same semantic label). The process ensures parsimony by predicting a relatively small number of labels for unity-cells (e.g., a large cell of grass) to build the final semantic output. In addition to the \"output\" representation, we design a coarse-to-fine contextual module to aggregate the \"features\" representation from different levels. We validate the effectiveness of each key module in our method through comprehensive ablation studies. 
Finally, our approach achieves state-of-the-art performance on three widely-used semantic segmentation datasets---ADE20K, COCO-Stuff, and Pascal-Context.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hsiao_Specialize_and_Fuse_Pyramidal_Output_Representation_for_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -39318,7 +41974,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0+0+2;0+0+2;0;0+2", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Hsiao_2021_ICCV,\n \n author = {\n Hsiao,\n Chi-Wei and Sun,\n Cheng and Chen,\n Hwann-Tzong and Sun,\n Min\n},\n title = {\n Specialize and Fuse: Pyramidal Output Representation for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7137-7146\n} \n}" }, { "title": "Specificity-Preserving RGB-D Saliency Detection", @@ -39326,6 +41983,7 @@ "status": "Poster", "track": "main", "pid": 3208, + "author_site": "Tao Zhou; Huazhu Fu; Geng Chen; Yi Zhou; Deng-Ping Fan; Ling Shao", "author": "Tao Zhou; Huazhu Fu; Geng Chen; Yi Zhou; Deng-Ping Fan; Ling Shao", "abstract": "RGB-D saliency detection has attracted increasing attention, due to its effectiveness and the fact that depth cues can now be conveniently captured. Existing works often focus on learning a shared representation through various fusion strategies, with few methods explicitly considering how to preserve modality-specific characteristics. In this paper, taking a new perspective, we propose a specificity-preserving network for RGB-D saliency detection, which benefits saliency detection performance by exploring both the shared information and modality-specific properties (e.g., specificity). 
Specifically, two modality-specific networks and a shared learning network are adopted to generate individual and shared saliency maps. A cross-enhanced integration module (CIM) is proposed to fuse cross-modal features in the shared learning network, which are then propagated to the next layer for integrating cross-level information. Besides, we propose a multi-modal feature aggregation (MFA) module to integrate the modality-specific features from each individual decoder into the shared decoder, which can provide rich complementary multi-modal information to boost the saliency detection performance. Further, a skip connection is used to combine hierarchical features between the encoder and decoder layers. Experiments on six benchmark datasets demonstrate that our SP-Net outperforms other state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Specificity-Preserving_RGB-D_Saliency_Detection_ICCV_2021_paper.pdf", @@ -39340,7 +41998,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Specificity-Preserving_RGB-D_Saliency_Detection_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Specificity-Preserving_RGB-D_Saliency_Detection_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Tao and Fu,\n Huazhu and Chen,\n Geng and Zhou,\n Yi and Fan,\n Deng-Ping and Shao,\n Ling\n},\n title = {\n Specificity-Preserving RGB-D Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4681-4691\n} \n}" }, { "title": "Spectral Leakage and Rethinking the Kernel Size in CNNs", @@ -39348,6 +42007,7 @@ "status": "Poster", "track": "main", "pid": 7925, + "author_site": "Nergis Tomen; Jan C. van Gemert", "author": "Nergis Tomen; Jan C. 
van Gemert", "abstract": "Convolutional layers in CNNs implement linear filters which decompose the input into different frequency bands. However, most modern architectures neglect standard principles of filter design when optimizing their model choices regarding the size and shape of the convolutional kernel. In this work, we consider the well-known problem of spectral leakage caused by windowing artifacts in filtering operations in the context of CNNs. We show that the small size of CNN kernels make them susceptible to spectral leakage, which may induce performance-degrading artifacts. To address this issue, we propose the use of larger kernel sizes along with the Hamming window function to alleviate leakage in CNN architectures. We demonstrate improved classification accuracy on multiple benchmark datasets including Fashion-MNIST, CIFAR-10, CIFAR-100 and ImageNet with the simple use of a standard window function in convolutional layers. Finally, we show that CNNs employing the Hamming window display increased robustness against various adversarial attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tomen_Spectral_Leakage_and_Rethinking_the_Kernel_Size_in_CNNs_ICCV_2021_paper.pdf", @@ -39371,7 +42031,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Delft", "aff_country_unique_index": "0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Tomen_2021_ICCV,\n \n author = {\n Tomen,\n Nergis and van Gemert,\n Jan C.\n},\n title = {\n Spectral Leakage and Rethinking the Kernel Size in CNNs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5138-5147\n} \n}" }, { "title": "Speech Drives Templates: Co-Speech Gesture Synthesis With Learned Templates", @@ -39379,6 +42040,7 @@ "status": "Poster", "track": "main", "pid": 1080, + "author_site": "Shenhan Qian; Zhi Tu; Yihao 
Zhi; Wen Liu; Shenghua Gao", "author": "Shenhan Qian; Zhi Tu; Yihao Zhi; Wen Liu; Shenghua Gao", "abstract": "Co-speech gesture generation is to synthesize a gesture sequence that not only looks real but also matches with the input speech audio. Our method generates the movements of a complete upper body, including arms, hands, and the head. Although recent data-driven methods achieve great success, challenges still exist, such as limited variety, poor fidelity, and lack of objective metrics. Motivated by the fact that the speech cannot fully determine the gesture, we design a method that learns a set of gesture template vectors to model the latent conditions, which relieve the ambiguity. For our method, the template vector determines the general appearance of a generated gesture sequence, while the speech audio drives subtle movements of the body, both indispensable for synthesizing a realistic gesture sequence. Due to the intractability of an objective metric for gesture-speech synchronization, we adopt the lip-sync error as a proxy metric to tune and evaluate the synchronization ability of our model. 
Extensive experiments show the superiority of our method in both objective and subjective evaluations on fidelity and synchronization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qian_Speech_Drives_Templates_Co-Speech_Gesture_Synthesis_With_Learned_Templates_ICCV_2021_paper.pdf", @@ -39402,7 +42064,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qian_2021_ICCV,\n \n author = {\n Qian,\n Shenhan and Tu,\n Zhi and Zhi,\n Yihao and Liu,\n Wen and Gao,\n Shenghua\n},\n title = {\n Speech Drives Templates: Co-Speech Gesture Synthesis With Learned Templates\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11077-11086\n} \n}" }, { "title": "Square Root Marginalization for Sliding-Window Bundle Adjustment", @@ -39410,6 +42073,7 @@ "status": "Poster", "track": "main", "pid": 7763, + "author_site": "Nikolaus Demmel; David Schubert; Christiane Sommer; Daniel Cremers; Vladyslav Usenko", "author": "Nikolaus Demmel; David Schubert; Christiane Sommer; Daniel Cremers; Vladyslav Usenko", "abstract": "In this paper we propose a novel square root sliding-window bundle adjustment suitable for real-time odometry applications. The square root formulation pervades three major aspects of our optimization-based sliding-window estimator: for bundle adjustment we eliminate landmark variables with nullspace projection; to store the marginalization prior we employ a matrix square root of the Hessian; and when marginalizing old poses we avoid forming normal equations and update the square root prior directly with a specialized QR decomposition. We show that the proposed square root marginalization is algebraically equivalent to the conventional use of Schur complement (SC) on the Hessian. 
Moreover, it elegantly deals with rank-deficient Jacobians producing a prior equivalent to SC with Moore--Penrose inverse. Our evaluation of visual and visual-inertial odometry on real-world datasets demonstrates that the proposed estimator is 36% faster than the baseline. It furthermore shows that in single precision, conventional Hessian-based marginalization leads to numeric failures and reduced accuracy. We analyse numeric properties of the marginalization prior to explain why our square root form does not suffer from the same effect and therefore entails superior performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Demmel_Square_Root_Marginalization_for_Sliding-Window_Bundle_Adjustment_ICCV_2021_paper.pdf", @@ -39433,7 +42097,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Demmel_2021_ICCV,\n \n author = {\n Demmel,\n Nikolaus and Schubert,\n David and Sommer,\n Christiane and Cremers,\n Daniel and Usenko,\n Vladyslav\n},\n title = {\n Square Root Marginalization for Sliding-Window Bundle Adjustment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13260-13268\n} \n}" }, { "title": "Stacked Homography Transformations for Multi-View Pedestrian Detection", @@ -39441,6 +42106,7 @@ "status": "Poster", "track": "main", "pid": 2917, + "author_site": "Liangchen Song; Jialian Wu; Ming Yang; Qian Zhang; Yuan Li; Junsong Yuan", "author": "Liangchen Song; Jialian Wu; Ming Yang; Qian Zhang; Yuan Li; Junsong Yuan", "abstract": "Multi-view pedestrian detection aims to predict a bird's eye view (BEV) occupancy map from multiple camera views. 
This task is confronted with two challenges: how to establish the 3D correspondences from views to the BEV map and how to assemble occupancy information across views. In this paper, we propose a novel Stacked HOmography Transformations (SHOT) approach, which is motivated by approximating projections in 3D world coordinates via a stack of homographies. We first construct a stack of transformations for projecting views to the ground plane at different height levels. Then we design a soft selection module so that the network learns to predict the likelihood of the stack of transformations. Moreover, we provide an in-depth theoretical analysis on constructing SHOT and how well SHOT approximates projections in 3D world coordinates. SHOT is empirically verified to be capable of estimating accurate correspondences from individual views to the BEV map, leading to new state-of-the-art performance on standard evaluation benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_Stacked_Homography_Transformations_for_Multi-View_Pedestrian_Detection_ICCV_2021_paper.pdf", @@ -39458,13 +42124,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Song_Stacked_Homography_Transformations_for_Multi-View_Pedestrian_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;2;0", "aff_unique_norm": "University at Buffalo;Horizon Robotics;Google", - "aff_unique_dep": ";;Google", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.buffalo.edu;https://www.horizon-robotics.com;https://www.google.com", "aff_unique_abbr": "UB;Horizon Robotics;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Liangchen and Wu,\n Jialian and Yang,\n Ming and Zhang,\n Qian and Li,\n Yuan and Yuan,\n Junsong\n},\n title = {\n Stacked Homography 
Transformations for Multi-View Pedestrian Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6049-6057\n} \n}" }, { "title": "Standardized Max Logits: A Simple yet Effective Approach for Identifying Unexpected Road Obstacles in Urban-Scene Segmentation", @@ -39472,6 +42139,7 @@ "status": "Poster", "track": "main", "pid": 2171, + "author_site": "Sanghun Jung; Jungsoo Lee; Daehoon Gwak; Sungha Choi; Jaegul Choo", "author": "Sanghun Jung; Jungsoo Lee; Daehoon Gwak; Sungha Choi; Jaegul Choo", "abstract": "Identifying unexpected objects on roads in semantic segmentation (e.g., identifying dogs on roads) is crucial in safety-critical applications. Existing approaches use images of unexpected objects from external datasets or require additional training (e.g., retraining segmentation networks or training an extra network), which necessitate a non-trivial amount of labor intensity or lengthy inference time. One possible alternative is to use prediction scores of a pre-trained network such as the max logits (i.e., maximum values among classes before the final softmax layer) for detecting such objects. However, the distribution of max logits of each predicted class is significantly different from each other, which degrades the performance of identifying unexpected objects in urban-scene segmentation. To address this issue, we propose a simple yet effective approach that standardizes the max logits in order to align the different distributions and reflect the relative meanings of max logits within each predicted class. Moreover, we consider the local regions from two different perspectives based on the intuition that neighboring pixels share similar semantic information. 
In contrast to previous approaches, our method does not utilize any external datasets or require additional training, which makes our method widely applicable to existing pre-trained segmentation models. Such a straightforward approach achieves a new state-of-the-art performance on the publicly available Fishyscapes Lost & Found leaderboard with a large margin. Our code is publicly available at this link.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jung_Standardized_Max_Logits_A_Simple_yet_Effective_Approach_for_Identifying_ICCV_2021_paper.pdf", @@ -39488,14 +42156,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jung_Standardized_Max_Logits_A_Simple_yet_Effective_Approach_for_Identifying_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Korea Advanced Institute of Science and Technology;LG", - "aff_unique_dep": "KAIST AI;LG AI Research", + "aff_unique_norm": "Korea Advanced Institute of Science and Technology;LG AI Research", + "aff_unique_dep": "KAIST AI;", "aff_unique_url": "https://www.kaist.edu;https://www.lgaires.com", "aff_unique_abbr": "KAIST;LG AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jung_2021_ICCV,\n \n author = {\n Jung,\n Sanghun and Lee,\n Jungsoo and Gwak,\n Daehoon and Choi,\n Sungha and Choo,\n Jaegul\n},\n title = {\n Standardized Max Logits: A Simple yet Effective Approach for Identifying Unexpected Road Obstacles in Urban-Scene Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15425-15434\n} \n}" }, { "title": "StarEnhancer: Learning Real-Time and Style-Aware Image Enhancement", @@ -39503,6 +42172,7 @@ "status": "Poster", "track": "main", "pid": 1075, + "author_site": "Yuda 
Song; Hui Qian; Xin Du", "author": "Yuda Song; Hui Qian; Xin Du", "abstract": "Image enhancement is a subjective process whose targets vary with user preferences. In this paper, we propose a deep learning-based image enhancement method covering multiple tonal styles using only a single model dubbed StarEnhancer. It can transform an image from one tonal style to another, even if that style is unseen. With a simple one-time setting, users can customize the model to make the enhanced images more in line with their aesthetics. To make the method more practical, we propose a well-designed enhancer that can process a 4K-resolution image over 200 FPS but surpasses the contemporaneous single style image enhancement methods in terms of PSNR, SSIM, and LPIPS. Finally, our proposed enhancement method has good interactability, which allows the user to fine-tune the enhanced image using intuitive options.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_StarEnhancer_Learning_Real-Time_and_Style-Aware_Image_Enhancement_ICCV_2021_paper.pdf", @@ -39526,7 +42196,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hangzhou", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Yuda and Qian,\n Hui and Du,\n Xin\n},\n title = {\n StarEnhancer: Learning Real-Time and Style-Aware Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4126-4135\n} \n}" }, { "title": "Statistically Consistent Saliency Estimation", @@ -39534,6 +42205,7 @@ "status": "Poster", "track": "main", "pid": 9347, + "author_site": "Shunyan Luo; Emre Barut; Fang Jin", "author": "Shunyan Luo; Emre Barut; Fang Jin", "abstract": "The growing use of deep learning for a wide range of data problems has highlighted the need to understand and 
diagnose these models appropriately, making deep learning interpretation techniques an essential tool for data analysts. The numerous model interpretation methods proposed in recent years are generally based on heuristics, with little or no theoretical guarantees. Here we present a statistical framework for saliency estimation for black-box computer vision models. Our proposed model-agnostic estimation procedure, which is statistically consistent and capable of passing saliency checks, has polynomial-time computational efficiency since it only requires solving a linear program. An upper bound is established on the number of model evaluations needed to recover regions of importance with high probability through our theoretical analysis. Furthermore, a new perturbation scheme is presented for the estimation of local gradients that is more efficient than commonly used random perturbation schemes. The validity and excellence of our new method are demonstrated experimentally using sensitivity analysis on multiple datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Luo_Statistically_Consistent_Saliency_Estimation_ICCV_2021_paper.pdf", @@ -39557,7 +42229,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Luo_2021_ICCV,\n \n author = {\n Luo,\n Shunyan and Barut,\n Emre and Jin,\n Fang\n},\n title = {\n Statistically Consistent Saliency Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 745-753\n} \n}" }, { "title": "StereOBJ-1M: Large-Scale Stereo Image Dataset for 6D Object Pose Estimation", @@ -39565,6 +42238,7 @@ "status": "Poster", "track": "main", "pid": 2252, + "author_site": "Xingyu Liu; Shun Iwase; Kris M. Kitani", "author": "Xingyu Liu; Shun Iwase; Kris M. 
Kitani", "abstract": "We present a large-scale stereo RGB image object pose estimation dataset named the StereOBJ-1M dataset. The dataset is designed to address challenging cases such as object transparency, translucency, and specular reflection, in addition to the common challenges of occlusion, symmetry, and variations in illumination and environments. In order to collect data of sufficient scale for modern deep learning models, we propose a novel method for efficiently annotating pose data in a multi-view fashion that allows data capturing in complex and flexible environments. Fully annotated with 6D object poses, our dataset contains over 396K frames and over 1.5M annotations of 18 objects recorded in 183 scenes constructed in 11 different environments. The 18 objects include 8 symmetric objects, 7 transparent objects, and 8 reflective objects. We benchmark two state-of-the-art pose estimation frameworks on StereOBJ-1M as baselines for future work. We also propose a novel object-level pose optimization method for computing 6D pose from keypoint predictions in multiple images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_StereOBJ-1M_Large-Scale_Stereo_Image_Dataset_for_6D_Object_Pose_Estimation_ICCV_2021_paper.pdf", @@ -39588,7 +42262,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Xingyu and Iwase,\n Shun and Kitani,\n Kris M.\n},\n title = {\n StereOBJ-1M: Large-Scale Stereo Image Dataset for 6D Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10870-10879\n} \n}" }, { "title": "Stochastic Partial Swap: Enhanced Model Generalization and Interpretability for Fine-Grained Recognition", @@ -39596,6 +42271,7 
@@ "status": "Poster", "track": "main", "pid": 6655, + "author_site": "Shaoli Huang; Xinchao Wang; Dacheng Tao", "author": "Shaoli Huang; Xinchao Wang; Dacheng Tao", "abstract": "Learning mid-level representation for fine-grained recognition is easily dominated by a limited number of highly discriminative patterns, degrading its robustness and generalization capability. To this end, we propose a novel Stochastic Partial Swap (SPS) scheme to address this issue. Our method performs element-wise swapping for partial features between samples to inject noise during training. It equips a regularization effect similar to Dropout, which promotes more neurons to represent the concepts. Furthermore, it also exhibits other advantages: 1) suppressing over-activation to some part patterns to improve feature representativeness, and 2) enriching pattern combination and simulating noisy cases to enhance classifier generalization. We verify the effectiveness of our approach through comprehensive experiments across four network backbones and three fine-grained datasets. 
Moreover, we demonstrate its ability to complement high-level representations, allowing a simple model to achieve performance comparable to the top-performing technologies in fine-grained recognition, indoor scene recognition, and material recognition while improving model interpretability.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Stochastic_Partial_Swap_Enhanced_Model_Generalization_and_Interpretability_for_Fine-Grained_ICCV_2021_paper.pdf", @@ -39612,14 +42288,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Stochastic_Partial_Swap_Enhanced_Model_Generalization_and_Interpretability_for_Fine-Grained_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;1", - "aff_unique_norm": "University of Sydney;JD;National University of Singapore", - "aff_unique_dep": ";JD Explore Academy;", + "aff_unique_norm": "The University of Sydney;JD Explore Academy;National University of Singapore", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.sydney.edu.au;;https://www.nus.edu.sg", "aff_unique_abbr": "USYD;;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2;1", - "aff_country_unique": "Australia;China;Singapore" + "aff_country_unique": "Australia;China;Singapore", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Shaoli and Wang,\n Xinchao and Tao,\n Dacheng\n},\n title = {\n Stochastic Partial Swap: Enhanced Model Generalization and Interpretability for Fine-Grained Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 620-629\n} \n}" }, { "title": "Stochastic Scene-Aware Motion Prediction", @@ -39627,10 +42304,11 @@ "status": "Poster", "track": "main", "pid": 9252, + "author_site": "Mohamed Hassan; Duygu Ceylan; Ruben Villegas; Jun Saito; Jimei Yang; Yi Zhou; Michael J. 
Black", "author": "Mohamed Hassan; Duygu Ceylan; Ruben Villegas; Jun Saito; Jimei Yang; Yi Zhou; Michael J. Black", "abstract": "A long-standing goal in computer vision is to capture, model, and realistically synthesize human behavior. Specifically, by learning from data, our goal is to enable virtual humans to navigate within cluttered indoor scenes and naturally interact with objects. Such embodied behavior has applications in virtual reality, computer games, and robotics, while synthesized behavior can be used as a source of training data. This is challenging because real human motion is diverse and adapts to the scene. For example, a person can sit or lie on a sofa in many places and with varying styles. It is necessary to model this diversity when synthesizing virtual humans that realistically perform human-scene interactions. We present a novel data-driven, stochastic motion synthesis method that models different styles of performing a given action with a target object. Our method, called SAMP, for Scene-Aware Motion Prediction, generalizes to target objects of various geometries while enabling the character to navigate in cluttered scenes. To train our method, we collected MoCap data covering various sitting, lying down, walking, and running styles. We demonstrate our method on complex indoor scenes and achieve superior performance compared to existing solutions. 
Our code and data are available for research at https://samp.is.tue.mpg.de.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hassan_Stochastic_Scene-Aware_Motion_Prediction_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen, Germany; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "https://samp.is.tue.mpg.de", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Hassan_Stochastic_Scene-Aware_Motion_ICCV_2021_supplemental.pdf", @@ -39648,9 +42326,10 @@ "aff_unique_url": "https://www.mpi-is.mpg.de;https://research.adobe.com", "aff_unique_abbr": "MPI-IS;Adobe", "aff_campus_unique_index": "0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0;1;1;1;1;1;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Hassan_2021_ICCV,\n \n author = {\n Hassan,\n Mohamed and Ceylan,\n Duygu and Villegas,\n Ruben and Saito,\n Jun and Yang,\n Jimei and Zhou,\n Yi and Black,\n Michael J.\n},\n title = {\n Stochastic Scene-Aware Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11374-11384\n} \n}" }, { "title": "Stochastic Transformer Networks With Linear Competing Units: Application To End-to-End SL Translation", @@ -39658,6 +42337,7 @@ "status": "Poster", "track": "main", "pid": 3881, + "author_site": "Andreas Voskou; Konstantinos P. Panousis; Dimitrios Kosmopoulos; Dimitris N. 
Metaxas; Sotirios Chatzis", "author": "Andreas Voskou; Konstantinos P. Panousis; Dimitrios Kosmopoulos; Dimitris N. Metaxas; Sotirios Chatzis", "abstract": "Automating sign language translation (SLT) is a challenging real-world application. Despite its societal importance, though, research progress in the field remains rather poor. Crucially, existing methods that yield viable performance necessitate the availability of laborious to obtain gloss sequence groundtruth. In this paper, we attenuate this need, by introducing an end-to-end SLT model that does not entail explicit use of glosses; the model only needs text groundtruth. This is in stark contrast to existing end-to-end models that use gloss sequence groundtruth, either in the form of a modality that is recognized at an intermediate model stage, or in the form of a parallel output process, jointly trained with the SLT model. Our approach constitutes a Transformer network with a novel type of layers that combines: (i) local winner-takes-all (LWTA) layers with stochastic winner sampling, instead of conventional ReLU layers, (ii) stochastic weights with posterior distributions estimated via variational inference, and (iii) a weight compression technique at inference time that exploits estimated posterior variance to perform massive, almost lossless compression. 
We demonstrate that our approach can reach the currently best reported BLEU-4 score on the PHOENIX 2014T benchmark, but without making use of glosses for model training, and with a memory footprint reduced by more than 70%.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Voskou_Stochastic_Transformer_Networks_With_Linear_Competing_Units_Application_To_End-to-End_ICCV_2021_paper.pdf", @@ -39681,7 +42361,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";New Jersey", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "Cyprus;Greece;United States" + "aff_country_unique": "Cyprus;Greece;United States", + "bibtex": "@InProceedings{Voskou_2021_ICCV,\n \n author = {\n Voskou,\n Andreas and Panousis,\n Konstantinos P. and Kosmopoulos,\n Dimitrios and Metaxas,\n Dimitris N. and Chatzis,\n Sotirios\n},\n title = {\n Stochastic Transformer Networks With Linear Competing Units: Application To End-to-End SL Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11946-11955\n} \n}" }, { "title": "Striking a Balance Between Stability and Plasticity for Class-Incremental Learning", @@ -39689,6 +42370,7 @@ "status": "Poster", "track": "main", "pid": 7855, + "author_site": "Guile Wu; Shaogang Gong; Pan Li", "author": "Guile Wu; Shaogang Gong; Pan Li", "abstract": "Class-incremental learning (CIL) aims at continuously updating a trained model with new classes (plasticity) without forgetting previously learned old ones (stability). Contemporary studies resort to storing representative exemplars for rehearsal or preventing consolidated model parameters from drifting, but the former requires an additional space for storing exemplars at every incremental phase while the latter usually shows poor model generalization. 
In this paper, we focus on resolving the stability-plasticity dilemma in class-incremental learning where no exemplars from old classes are stored. To make a trade-off between learning new information and maintaining old knowledge, we reformulate a simple yet effective baseline method based on a cosine classifier framework and reciprocal adaptive weights. With the reformulated baseline, we present two new approaches to CIL by learning class-independent knowledge and multi-perspective knowledge, respectively. The former exploits class-independent knowledge to bridge learning new and old classes, while the latter learns knowledge from different perspectives to facilitate CIL. Extensive experiments on several widely used CIL benchmark datasets show the superiority of our approaches over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Striking_a_Balance_Between_Stability_and_Plasticity_for_Class-Incremental_Learning_ICCV_2021_paper.pdf", @@ -39712,7 +42394,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Guile and Gong,\n Shaogang and Li,\n Pan\n},\n title = {\n Striking a Balance Between Stability and Plasticity for Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1124-1133\n} \n}" }, { "title": "StructDepth: Leveraging the Structural Regularities for Self-Supervised Indoor Depth Estimation", @@ -39720,6 +42403,7 @@ "status": "Poster", "track": "main", "pid": 10268, + "author_site": "Boying Li; Yuan Huang; Zeyu Liu; Danping Zou; Wenxian Yu", "author": "Boying Li; Yuan Huang; Zeyu Liu; Danping Zou; Wenxian Yu", "abstract": "Self-supervised monocular depth estimation has 
achieved impressive performance on outdoor datasets. Its performance however degrades notably in indoor environments because of the lack of textures. Without rich textures, the photometric consistency is too weak to train a good depth network. Inspired by the early works on indoor modeling, we leverage the structural regularities exhibited in indoor scenes, to train a better depth network. Specifically, we adopt two extra supervisory signals for self-supervised training: 1) the Manhattan normal constraint and 2) the co-planar constraint. The Manhattan normal constraint enforces the major surfaces (the floor, ceiling, and walls) to be aligned with dominant directions. The co-planar constraint states that the 3D points be well fitted by a plane if they are located within the same planar region. To generate the supervisory signals, we adopt two components to classify the major surface normal into dominant directions and detect the planar regions on the fly during training. As the predicted depth becomes more accurate after more training epochs, the supervisory signals also improve and in turn feedback to obtain a better depth model. Through extensive experiments on indoor benchmark datasets, the results show that our network outperforms the state-of-the-art methods. 
The source code is available at https://github.com/SJTU-ViSYS/StructDepth.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_StructDepth_Leveraging_the_Structural_Regularities_for_Self-Supervised_Indoor_Depth_Estimation_ICCV_2021_paper.pdf", @@ -39743,7 +42427,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Boying and Huang,\n Yuan and Liu,\n Zeyu and Zou,\n Danping and Yu,\n Wenxian\n},\n title = {\n StructDepth: Leveraging the Structural Regularities for Self-Supervised Indoor Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12663-12673\n} \n}" }, { "title": "Structure-From-Sherds: Incremental 3D Reassembly of Axially Symmetric Pots From Unordered and Mixed Fragment Collections", @@ -39751,6 +42436,7 @@ "status": "Poster", "track": "main", "pid": 1113, + "author_site": "Je Hyeong Hong; Seong Jong Yoo; Muhammad Arshad Zeeshan; Young Min Kim; Jinwook Kim", "author": "Je Hyeong Hong; Seong Jong Yoo; Muhammad Arshad Zeeshan; Young Min Kim; Jinwook Kim", "abstract": "Re-assembling multiple pots accurately from numerous 3D scanned fragments remains a challenging task to this date. Previous methods extract all potential matching pairs of pot sherds and considers them simultaneously to search for an optimal global pot configuration. In this work, we empirically show such global approach greatly suffers from false positive matches between sherds inflicted by indistinctive sharp fracture surfaces in pot fragments. To mitigate this problem, we take inspirations from the field of structure-from-motion (SfM), where many pipelines have matured in reconstructing a 3D scene from multiple images. 
Motivated by the success of the incremental approach in robust SfM, we present an efficient reassembly method for axially symmetric pots based on iterative registration of one sherd at a time. Our method goes beyond replicating incremental SfM and addresses indistinguishable false matches by embracing beam search to explore multitudes of registration possibilities. Additionally, we utilize multiple roots in each step to allow simultaneous reassembly of multiple pots. The proposed approach shows above 80% reassembly accuracy on a dataset of real 80 fragments mixed from 5 pots, pushing the state-of-the-art and paving the way towards the goal of large-scale pot reassembly. Our code and preprocessed data will be made available for research.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hong_Structure-From-Sherds_Incremental_3D_Reassembly_of_Axially_Symmetric_Pots_From_Unordered_ICCV_2021_paper.pdf", @@ -39774,7 +42460,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Hong_2021_ICCV,\n \n author = {\n Hong,\n Je Hyeong and Yoo,\n Seong Jong and Zeeshan,\n Muhammad Arshad and Kim,\n Young Min and Kim,\n Jinwook\n},\n title = {\n Structure-From-Sherds: Incremental 3D Reassembly of Axially Symmetric Pots From Unordered and Mixed Fragment Collections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5443-5451\n} \n}" }, { "title": "Structure-Preserving Deraining With Residue Channel Prior Guidance", @@ -39782,6 +42469,7 @@ "status": "Poster", "track": "main", "pid": 2472, + "author_site": "Qiaosi Yi; Juncheng Li; Qinyan Dai; Faming Fang; Guixu Zhang; Tieyong Zeng", "author": "Qiaosi Yi; Juncheng Li; Qinyan Dai; Faming Fang; Guixu Zhang; Tieyong Zeng", "abstract": "Single image deraining 
is important for many high-level computer vision tasks since the rain streaks can severely degrade the visibility of images, thereby affecting the recognition and analysis of the image. Recently, many CNN-based methods have been proposed for rain removal. Although these methods can remove part of the rain streaks, it is difficult for them to adapt to real-world scenarios and restore high-quality rain-free images with clear and accurate structures. To solve this problem, we propose a Structure-Preserving Deraining Network (SPDNet) with RCP guidance. SPDNet directly generates high-quality rain-free images with clear and accurate structures under the guidance of RCP but does not rely on any rain-generating assumptions. Specifically, we found that the RCP of images contains more accurate structural information than rainy images. Therefore, we introduced it to our deraining network to protect structure information of the rain-free image. Meanwhile, a Wavelet-based Multi-Level Module (WMLM) is proposed as the backbone for learning the background information of rainy images and an Interactive Fusion Module (IFM) is designed to make full use of RCP information. In addition, an iterative guidance strategy is proposed to gradually improve the accuracy of RCP, refining the result in a progressive path. Extensive experimental results on both synthetic and real-world datasets demonstrate that the proposed model achieves new state-of-the-art results. 
Code: https://github.com/Joyies/SPDNet", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yi_Structure-Preserving_Deraining_With_Residue_Channel_Prior_Guidance_ICCV_2021_paper.pdf", @@ -39798,14 +42486,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yi_Structure-Preserving_Deraining_With_Residue_Channel_Prior_Guidance_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;1", - "aff_unique_norm": "East China Normal University;Chinese University of Hong Kong", + "aff_unique_norm": "East China Normal University;The Chinese University of Hong Kong", "aff_unique_dep": "School of Computer Science and Technology;Department of Mathematics", "aff_unique_url": "http://www.ecnu.edu.cn;https://www.cuhk.edu.hk", "aff_unique_abbr": "ECNU;CUHK", "aff_campus_unique_index": "0;0;0;0;0;1", "aff_campus_unique": "Shanghai;Hong Kong", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yi_2021_ICCV,\n \n author = {\n Yi,\n Qiaosi and Li,\n Juncheng and Dai,\n Qinyan and Fang,\n Faming and Zhang,\n Guixu and Zeng,\n Tieyong\n},\n title = {\n Structure-Preserving Deraining With Residue Channel Prior Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4238-4247\n} \n}" }, { "title": "Structure-Transformed Texture-Enhanced Network for Person Image Synthesis", @@ -39813,6 +42502,7 @@ "status": "Poster", "track": "main", "pid": 1737, + "author_site": "Munan Xu; Yuanqi Chen; Shan Liu; Thomas H. Li; Ge Li", "author": "Munan Xu; Yuanqi Chen; Shan Liu; Thomas H. Li; Ge Li", "abstract": "Pose-guided virtual try-on task aims to modify the fashion item based on pose transfer task. These two tasks that belong to person image synthesis have strong correlations and similarities. 
However, existing methods treat them as two individual tasks and do not explore correlations between them. Moreover, these two tasks are challenging due to large misalignment and occlusions, thus most of these methods are prone to generate unclear human body structure and blurry fine-grained textures. In this paper, we devise a structure-transformed texture-enhanced network to generate high-quality person images and construct the relationships between two tasks. It consists of two modules: structure-transformed renderer and texture-enhanced stylizer. The structure-transformed renderer is introduced to transform the source person structure to the target one, while the texture-enhanced stylizer is served to enhance detailed textures and controllably inject the fashion style founded on the structural transformation. With the two modules, our model can generate photorealistic person images in diverse poses and even with various fashion styles. Extensive experiments demonstrate that our approach achieves state-of-the-art results on two tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Structure-Transformed_Texture-Enhanced_Network_for_Person_Image_Synthesis_ICCV_2021_paper.pdf", @@ -39829,14 +42519,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Structure-Transformed_Texture-Enhanced_Network_for_Person_Image_Synthesis_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Peking University;Tencent", - "aff_unique_dep": "School of Electronic and Computer Engineering;Tencent America", + "aff_unique_norm": "Peking University;Tencent America", + "aff_unique_dep": "School of Electronic and Computer Engineering;", "aff_unique_url": "http://www.pku.edu.cn;https://www.tencent.com/en-us", "aff_unique_abbr": "PKU;Tencent America", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Shenzhen Graduate School;;Beijing", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": 
"China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Munan and Chen,\n Yuanqi and Liu,\n Shan and Li,\n Thomas H. and Li,\n Ge\n},\n title = {\n Structure-Transformed Texture-Enhanced Network for Person Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13859-13868\n} \n}" }, { "title": "Structured Bird's-Eye-View Traffic Scene Understanding From Onboard Images", @@ -39844,6 +42535,7 @@ "status": "Poster", "track": "main", "pid": 10564, + "author_site": "Yigit Baran Can; Alexander Liniger; Danda Pani Paudel; Luc Van Gool", "author": "Yigit Baran Can; Alexander Liniger; Danda Pani Paudel; Luc Van Gool", "abstract": "Autonomous navigation requires structured representation of the road network and instance-wise identification of the other traffic agents. Since the traffic scene is defined on the ground plane, this corresponds to scene understanding in the bird's-eye-view (BEV). However, the onboard cameras of autonomous cars are customarily mounted horizontally for a better view of the surrounding, making this task very challenging. In this work, we study the problem of extracting a directed graph representing the local road network in BEV coordinates, from a single onboard camera image. Moreover, we show that the method can be extended to detect dynamic objects on the BEV plane. The semantics, locations, and orientations of the detected objects together with the road graph facilitates a comprehensive understanding of the scene. Such understanding becomes fundamental for the downstream tasks, such as path planning and navigation. We validate our approach against powerful baselines and show that our network achieves superior performance. 
We also demonstrate the effects of various design choices through ablation studies.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Can_Structured_Birds-Eye-View_Traffic_Scene_Understanding_From_Onboard_Images_ICCV_2021_paper.pdf", @@ -39867,7 +42559,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Can_2021_ICCV,\n \n author = {\n Can,\n Yigit Baran and Liniger,\n Alexander and Paudel,\n Danda Pani and Van Gool,\n Luc\n},\n title = {\n Structured Bird's-Eye-View Traffic Scene Understanding From Onboard Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15661-15670\n} \n}" }, { "title": "Structured Outdoor Architecture Reconstruction by Exploration and Classification", @@ -39875,6 +42568,7 @@ "status": "Poster", "track": "main", "pid": 2062, + "author_site": "Fuyang Zhang; Xiang Xu; Nelson Nauata; Yasutaka Furukawa", "author": "Fuyang Zhang; Xiang Xu; Nelson Nauata; Yasutaka Furukawa", "abstract": "This paper presents an explore-and-classify framework for structured architectural reconstruction from aerial image. Starting from a potentially imperfect building reconstruction by an existing algorithm, our approach 1) explores the space of building models by modifying the reconstruction via heuristic actions; 2) learns to classify the correctness of building models while generating classification labels based on the ground-truth; and 3) repeat. At test time, we iterate exploration and classification, seeking for a result with the best classification score. We evaluate the approach using initial reconstructions by two baselines and two state-of-the-art reconstruction algorithms. 
Qualitative and quantitative evaluations demonstrate that our approach consistently improves the reconstruction quality from every initial reconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Structured_Outdoor_Architecture_Reconstruction_by_Exploration_and_Classification_ICCV_2021_paper.pdf", @@ -39898,7 +42592,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Burnaby", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Fuyang and Xu,\n Xiang and Nauata,\n Nelson and Furukawa,\n Yasutaka\n},\n title = {\n Structured Outdoor Architecture Reconstruction by Exploration and Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12427-12435\n} \n}" }, { "title": "Student Customized Knowledge Distillation: Bridging the Gap Between Student and Teacher", @@ -39906,6 +42601,7 @@ "status": "Poster", "track": "main", "pid": 3693, + "author_site": "Yichen Zhu; Yi Wang", "author": "Yichen Zhu; Yi Wang", "abstract": "Knowledge distillation (KD) transfers the dark knowledge from cumbersome networks (teacher) to lightweight (student) networks and expects the student to achieve more promising performance than training without the teacher's knowledge. However, a counter-intuitive argument is that better teachers do not make better students due to the capacity mismatch. To this end, we present a novel adaptive knowledge distillation method to complement traditional approaches. The proposed method, named as Student Customized Knowledge Distillation (SCKD), examines the capacity mismatch between teacher and student from the perspective of gradient similarity. 
We formulate the knowledge distillation as a multi-task learning problem so that the teacher transfers knowledge to the student only if the student can benefit from learning such knowledge. We validate our methods on multiple datasets with various teacher-student configurations on image classification, object detection, and semantic segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Student_Customized_Knowledge_Distillation_Bridging_the_Gap_Between_Student_and_ICCV_2021_paper.pdf", @@ -39929,7 +42625,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Yichen and Wang,\n Yi\n},\n title = {\n Student Customized Knowledge Distillation: Bridging the Gap Between Student and Teacher\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5057-5066\n} \n}" }, { "title": "StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery", @@ -39937,6 +42634,7 @@ "status": "Poster", "track": "main", "pid": 8652, + "author_site": "Or Patashnik; Zongze Wu; Eli Shechtman; Daniel Cohen-Or; Dani Lischinski", "author": "Or Patashnik; Zongze Wu; Eli Shechtman; Daniel Cohen-Or; Dani Lischinski", "abstract": "Inspired by the ability of StyleGAN to generate highly re-alistic images in a variety of domains, much recent work hasfocused on understanding how to use the latent spaces ofStyleGAN to manipulate generated and real images. How-ever, discovering semantically meaningful latent manipula-tions typically involves painstaking human examination ofthe many degrees of freedom, or an annotated collectionof images for each desired manipulation. 
In this work, weexplore leveraging the power of recently introduced Con-trastive Language-Image Pre-training (CLIP) models in or-der to develop a text-based interface for StyleGAN imagemanipulation that does not require such manual effort. Wefirst introduce an optimization scheme that utilizes a CLIP-based loss to modify an input latent vector in response to auser-provided text prompt. Next, we describe a latent map-per that infers a text-guided latent manipulation step fora given input image, allowing faster and more stable text-based manipulation. Finally, we present a method for map-ping a text prompts to input-agnostic directions in Style-GAN's style space, enabling interactive text-driven imagemanipulation. Extensive results and comparisons demon-strate the effectiveness of our approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Patashnik_StyleCLIP_Text-Driven_Manipulation_of_StyleGAN_Imagery_ICCV_2021_paper.pdf", @@ -39960,7 +42658,8 @@ "aff_campus_unique_index": ";1;1", "aff_campus_unique": ";Jerusalem", "aff_country_unique_index": "0+1;0;1;0;0", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Patashnik_2021_ICCV,\n \n author = {\n Patashnik,\n Or and Wu,\n Zongze and Shechtman,\n Eli and Cohen-Or,\n Daniel and Lischinski,\n Dani\n},\n title = {\n StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2085-2094\n} \n}" }, { "title": "StyleFormer: Real-Time Arbitrary Style Transfer via Parametric Style Composition", @@ -39968,6 +42667,7 @@ "status": "Poster", "track": "main", "pid": 5609, + "author_site": "Xiaolei Wu; Zhihao Hu; Lu Sheng; Dong Xu", "author": "Xiaolei Wu; Zhihao Hu; Lu Sheng; Dong Xu", "abstract": "In this work, we propose a new feed-forward arbitrary style transfer method, 
referred to as StyleFormer, which can simultaneously fulfill fine-grained style diversity and semantic content coherency. Specifically, our transformer-inspired feature-level stylization method consists of three modules: (a) the style bank generation module for sparse but compact parametric style pattern extraction, (b) the transformer-driven style composition module for content-guided global style composition, and (c) the parametric content modulation module for flexible but faithful stylization. The output stylized images are impressively coherent with the content structure, sensitive to the detailed style variations, but still holistically adhere to the style distributions from the style images. Qualitative and quantitative comparisons as well as comprehensive user studies demonstrate that our StyleFormer outperforms the existing SOTA methods in generating visually plausible stylization results with real-time efficiency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_StyleFormer_Real-Time_Arbitrary_Style_Transfer_via_Parametric_Style_Composition_ICCV_2021_paper.pdf", @@ -39984,14 +42684,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_StyleFormer_Real-Time_Arbitrary_Style_Transfer_via_Parametric_Style_Composition_ICCV_2021_paper.html", "aff_unique_index": "0;0;0+1;1", - "aff_unique_norm": "Beihang University;University of Sydney", + "aff_unique_norm": "Beihang University;The University of Sydney", "aff_unique_dep": "College of Software;", "aff_unique_url": "http://www.buaa.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "Beihang;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Xiaolei and Hu,\n Zhihao and Sheng,\n Lu and Xu,\n Dong\n},\n title = {\n StyleFormer: Real-Time Arbitrary Style Transfer 
via Parametric Style Composition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14618-14627\n} \n}" }, { "title": "Sub-Bit Neural Networks: Learning To Compress and Accelerate Binary Neural Networks", @@ -39999,6 +42700,7 @@ "status": "Poster", "track": "main", "pid": 6722, + "author_site": "Yikai Wang; Yi Yang; Fuchun Sun; Anbang Yao", "author": "Yikai Wang; Yi Yang; Fuchun Sun; Anbang Yao", "abstract": "In the low-bit quantization field, training Binarized Neural Networks (BNNs) is the extreme solution to ease the deployment of deep models on resource-constrained devices, having the lowest storage cost and significantly cheaper bit-wise operations compared to 32-bit floating-point counterparts. In this paper, we introduce Sub-bit Neural Networks (SNNs), a new type of binary quantization design tailored to compress and accelerate BNNs. SNNs are inspired by an empirical observation, showing that binary kernels learnt at convolutional layers of a BNN model are likely to be distributed over kernel subsets. As a result, unlike existing methods that binarize weights one by one, SNNs are trained with a kernel-aware optimization framework, which exploits binary quantization in the fine-grained convolutional kernel space. Specifically, our method includes a random sampling step generating layer-specific subsets of the kernel space, and a refinement step learning to adjust these subsets of binary kernels via optimization. Experiments on visual recognition benchmarks and the hardware deployment on FPGA validate the great potentials of SNNs. For instance, on ImageNet, SNNs of ResNet-18/ResNet-34 with 0.56-bit weights achieve 3.13/3.33 times runtime speed-up and 1.8 times compression over conventional BNNs with moderate drops in recognition accuracy. Promising results are also obtained when applying SNNs to binarize both weights and activations. 
Our code is available at https://github.com/yikaiw/SNN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Sub-Bit_Neural_Networks_Learning_To_Compress_and_Accelerate_Binary_Neural_ICCV_2021_paper.pdf", @@ -40015,14 +42717,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Sub-Bit_Neural_Networks_Learning_To_Compress_and_Accelerate_Binary_Neural_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;0;1", - "aff_unique_norm": "Tsinghua University;Intel", - "aff_unique_dep": "Department of Computer Science and Technology;Intel Corporation", + "aff_unique_norm": "Tsinghua University;Intel Corporation", + "aff_unique_dep": "Department of Computer Science and Technology;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.intel.com", "aff_unique_abbr": "Tsinghua;Intel", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yikai and Yang,\n Yi and Sun,\n Fuchun and Yao,\n Anbang\n},\n title = {\n Sub-Bit Neural Networks: Learning To Compress and Accelerate Binary Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5360-5369\n} \n}" }, { "title": "Summarize and Search: Learning Consensus-Aware Dynamic Convolution for Co-Saliency Detection", @@ -40030,6 +42733,7 @@ "status": "Poster", "track": "main", "pid": 8492, + "author_site": "Ni Zhang; Junwei Han; Nian Liu; Ling Shao", "author": "Ni Zhang; Junwei Han; Nian Liu; Ling Shao", "abstract": "Humans perform co-saliency detection by first summarizing the consensus knowledge in the whole group and then searching corresponding objects in each image. 
Previous methods usually lack robustness, scalability, or stability for the first process and simply fuse consensus features with image features for the second process. In this paper, we propose a novel consensus-aware dynamic convolution model to explicitly and effectively perform the \"summarize and search\" process. To summarize consensus image features, we first summarize robust features for every single image using an effective pooling method and then aggregate cross-image consensus cues via the self-attention mechanism. By doing this, our model meets the scalability and stability requirements. Next, we generate dynamic kernels from consensus features to encode the summarized consensus knowledge. Two kinds of kernels are generated in a supplementary way to summarize fine-grained image-specific consensus object cues and the coarse group-wise common knowledge, respectively. Then, we can effectively perform object searching by employing dynamic convolution at multiple scales. Besides, a novel and effective data synthesis method is also proposed to train our network. Experimental results on four benchmark datasets verify the effectiveness of our proposed method. 
Our code and saliency maps are available at https://github.com/nnizhang/CADC.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_Summarize_and_Search_Learning_Consensus-Aware_Dynamic_Convolution_for_Co-Saliency_Detection_ICCV_2021_paper.pdf", @@ -40053,7 +42757,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Ni and Han,\n Junwei and Liu,\n Nian and Shao,\n Ling\n},\n title = {\n Summarize and Search: Learning Consensus-Aware Dynamic Convolution for Co-Saliency Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4167-4176\n} \n}" }, { "title": "Super Resolve Dynamic Scene From Continuous Spike Streams", @@ -40061,6 +42766,7 @@ "status": "Poster", "track": "main", "pid": 6925, + "author_site": "Jing Zhao; Jiyu Xie; Ruiqin Xiong; Jian Zhang; Zhaofei Yu; Tiejun Huang", "author": "Jing Zhao; Jiyu Xie; Ruiqin Xiong; Jian Zhang; Zhaofei Yu; Tiejun Huang", "abstract": "Recently, a novel retina-inspired camera, namely spike camera, has shown great potential for recording high-speed dynamic scenes. Unlike the conventional digital cameras that compact the visual information within the exposure interval into a single snapshot, the spike camera continuously outputs binary spike streams to record the dynamic scenes, yielding a very high temporal resolution. Most of the existing reconstruction methods for spike camera focus on reconstructing images with the same resolution as spike camera. However, as a trade-off of high temporal resolution, the spatial resolution of spike camera is limited, resulting in inferior details of the reconstruction. 
To address this issue, we develop a spike camera super-resolution framework, aiming to super resolve high-resolution intensity images from the low-resolution binary spike streams. Due to the relative motion between the camera and the objects to capture, the spikes fired by the same sensor pixel no longer describes the same points in the external scene. In this paper, we properly exploit the relative motion and derive the relationship between light intensity and each spike, so as to recover the external scene with both high temporal and high spatial resolution. Experimental results demonstrate that the proposed method can reconstruct pleasant high-resolution images from low-resolution spike streams.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Super_Resolve_Dynamic_Scene_From_Continuous_Spike_Streams_ICCV_2021_paper.pdf", @@ -40084,7 +42790,8 @@ "aff_campus_unique_index": "0;1;0+2;2;0;0", "aff_campus_unique": "Beijing;Hefei;Shenzhen", "aff_country_unique_index": "0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Jing and Xie,\n Jiyu and Xiong,\n Ruiqin and Zhang,\n Jian and Yu,\n Zhaofei and Huang,\n Tiejun\n},\n title = {\n Super Resolve Dynamic Scene From Continuous Spike Streams\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2533-2542\n} \n}" }, { "title": "Super-Resolving Cross-Domain Face Miniatures by Peeking at One-Shot Exemplar", @@ -40092,6 +42799,7 @@ "status": "Poster", "track": "main", "pid": 3115, + "author_site": "Peike Li; Xin Yu; Yi Yang", "author": "Peike Li; Xin Yu; Yi Yang", "abstract": "Conventional face super-resolution methods usually assume testing low-resolution (LR) images lie in the same domain as the training ones. 
Due to different lighting conditions and imaging hardware, domain gaps between training and testing images inevitably occur in many real-world scenarios. Neglecting those domain gaps would lead to inferior face super-resolution (FSR) performance. However, how to transfer a trained FSR model to a target domain efficiently and effectively has not been investigated. To tackle this problem, we develop a Domain-Aware Pyramid-based Face Super-Resolution network, named DAP-FSR network. Our DAP-FSR is the first attempt to super-resolve LR faces from a target domain by exploiting only a pair of high-resolution (HR) and LR exemplar in the target domain. To be specific, our DAP-FSR firstly employs its encoder to extract the multi-scale latent representations of the input LR face. Considering only one target domain example is available, we propose to augment the target domain data by mixing the latent representations of the target domain face and source domain ones and then feed the mixed representations to the decoder of our DAP-FSR. The decoder will generate new face images resembling the target domain image style. The generated HR faces in turn are used to optimize our decoder to reduce the domain gap. By iteratively updating the latent representations and our decoder, our DAP-FSR will be adapted to the target domain, thus achieving authentic and high-quality upsampled HR faces. 
Extensive experiments on three benchmarks validate the effectiveness and superior performance of our DAP-FSR compared to the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Super-Resolving_Cross-Domain_Face_Miniatures_by_Peeking_at_One-Shot_Exemplar_ICCV_2021_paper.pdf", @@ -40115,7 +42823,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Peike and Yu,\n Xin and Yang,\n Yi\n},\n title = {\n Super-Resolving Cross-Domain Face Miniatures by Peeking at One-Shot Exemplar\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4469-4479\n} \n}" }, { "title": "Superpoint Network for Point Cloud Oversegmentation", @@ -40123,6 +42832,7 @@ "status": "Poster", "track": "main", "pid": 2942, + "author_site": "Le Hui; Jia Yuan; Mingmei Cheng; Jin Xie; Xiaoya Zhang; Jian Yang", "author": "Le Hui; Jia Yuan; Mingmei Cheng; Jin Xie; Xiaoya Zhang; Jian Yang", "abstract": "Superpoints are formed by grouping similar points with local geometric structures, which can effectively reduce the number of primitives of point clouds for subsequent point cloud processing. Existing superpoint methods mainly focus on employing clustering or graph partition to generate superpoints with handcrafted or learned features. Nonetheless, these methods cannot learn superpoints of point clouds with an end-to-end network. In this paper, we develop a new deep iterative clustering network to directly generate superpoints from irregular 3D point clouds in an end-to-end manner. 
Specifically, in our clustering network, we first jointly learn a soft point-superpoint association map from the coordinate and feature spaces of point clouds, where each point is assigned to the superpoint with a learned weight. Furthermore, we then iteratively update the association map and superpoint centers so that we can more accurately group the points into the corresponding superpoints with locally similar geometric structures. Finally, by predicting the pseudo labels of the superpoint centers, we formulate a label consistency loss on the points and superpoint centers to train the network. Extensive experiments on various datasets indicate that our method not only achieves the state-of-the-art on superpoint generation but also improves the performance of point cloud semantic segmentation. Code is available at https://github.com/fpthink/SPNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hui_Superpoint_Network_for_Point_Cloud_Oversegmentation_ICCV_2021_paper.pdf", @@ -40146,7 +42856,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hui_2021_ICCV,\n \n author = {\n Hui,\n Le and Yuan,\n Jia and Cheng,\n Mingmei and Xie,\n Jin and Zhang,\n Xiaoya and Yang,\n Jian\n},\n title = {\n Superpoint Network for Point Cloud Oversegmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5510-5519\n} \n}" }, { "title": "Support-Set Based Cross-Supervision for Video Grounding", @@ -40154,6 +42865,7 @@ "status": "Poster", "track": "main", "pid": 3412, + "author_site": "Xinpeng Ding; Nannan Wang; Shiwei Zhang; De Cheng; Xiaomeng Li; Ziyuan Huang; Mingqian Tang; Xinbo Gao", "author": "Xinpeng Ding; Nannan Wang; Shiwei Zhang; De Cheng; Xiaomeng Li; Ziyuan Huang; Mingqian Tang; Xinbo Gao", 
"abstract": "Current approaches for video grounding propose kinds of complex architectures to capture the video-text relations, and have achieved impressive improvements. However, it is hard to learn the complicated multi-modal relations by only architecture designing in fact. In this paper, we introduce a novel Support-set Based Cross-Supervision (Sscs) module which can improve existing methods during training phase without extra inference cost. The contrastive objective aims to learn effective representations by contrastive learning, while the caption objective can train a powerful video encoder supervised by texts. Due to the co-existence of some visual entities in both ground-truth and background intervals, i.e., mutual exclusion, naively contrastive learning is unsuitable to video grounding. We address the problem by boosting the cross-supervision with the support-set concept, which collects visual information from the whole video and eliminates the mutual exclusion of entities. Combined with the original objective, Sscs can enhance the abilities of multi-modal relation modeling for existing approaches. 
We extensively evaluate Sscs on three challenging datasets, and show that our method can improves current state-of-the-art methods by large margins, especially 6.35% in terms of R1@0.5 on Charades-STA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_Support-Set_Based_Cross-Supervision_for_Video_Grounding_ICCV_2021_paper.pdf", @@ -40177,7 +42889,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Xinpeng and Wang,\n Nannan and Zhang,\n Shiwei and Cheng,\n De and Li,\n Xiaomeng and Huang,\n Ziyuan and Tang,\n Mingqian and Gao,\n Xinbo\n},\n title = {\n Support-Set Based Cross-Supervision for Video Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11573-11582\n} \n}" }, { "title": "SurfGen: Adversarial 3D Shape Synthesis With Explicit Surface Discriminators", @@ -40185,6 +42898,7 @@ "status": "Poster", "track": "main", "pid": 3505, + "author_site": "Andrew Luo; Tianqin Li; Wen-Hao Zhang; Tai Sing Lee", "author": "Andrew Luo; Tianqin Li; Wen-Hao Zhang; Tai Sing Lee", "abstract": "Recent advances in deep generative models have led to immense progress in 3D shape synthesis. While existing models are able to synthesize shapes represented as voxels, point-clouds, or implicit functions, these methods only indirectly enforce the plausibility of the final 3D shape surface. Here we present a 3D shape synthesis framework (SurfGen) that directly applies adversarial training to the object surface. Our approach uses a differentiable spherical projection layer to capture and represent the explicit zero isosurface of an implicit 3D generator as functions defined on the unit sphere. 
By processing the spherical representation of 3D object surfaces with a spherical CNN in an adversarial setting, our generator can better learn the statistics of natural shape surfaces. We evaluate our model on large-scale shape datasets, and demonstrate that the end-to-end trained model is capable of generating high fidelity 3D shapes with diverse topology", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Luo_SurfGen_Adversarial_3D_Shape_Synthesis_With_Explicit_Surface_Discriminators_ICCV_2021_paper.pdf", @@ -40199,7 +42913,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Luo_SurfGen_Adversarial_3D_Shape_Synthesis_With_Explicit_Surface_Discriminators_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Luo_SurfGen_Adversarial_3D_Shape_Synthesis_With_Explicit_Surface_Discriminators_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Luo_2021_ICCV,\n \n author = {\n Luo,\n Andrew and Li,\n Tianqin and Zhang,\n Wen-Hao and Lee,\n Tai Sing\n},\n title = {\n SurfGen: Adversarial 3D Shape Synthesis With Explicit Surface Discriminators\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16238-16248\n} \n}" }, { "title": "SurfaceNet: Adversarial SVBRDF Estimation From a Single Image", @@ -40207,10 +42922,11 @@ "status": "Poster", "track": "main", "pid": 3887, + "author_site": "Giuseppe Vecchio; Simone Palazzo; Concetto Spampinato", "author": "Giuseppe Vecchio; Simone Palazzo; Concetto Spampinato", "abstract": "In this paper we present SurfaceNet, an approach for estimating spatially-varying bidirectional reflectance distribution function (SVBRDF) material properties from a single image. 
We pose the problem as an image translation task and propose a novel patch-based generative adversarial network (GAN) that is able to produce high-quality, high-resolution surface reflectance maps. The employment of the GAN paradigm has a twofold objective: 1) allowing the model to recover finer details than standard translation models; 2) reducing the domain shift between synthetic and real data distributions in an unsupervised way. An extensive evaluation, carried out on a public benchmark of synthetic and real images under different illumination conditions, shows that SurfaceNet largely outperforms existing SVBRDF reconstruction methods, both quantitatively and qualitatively. Furthermore, SurfaceNet exhibits a re-markable ability in generating high-quality maps from real samples without any supervision at training time.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vecchio_SurfaceNet_Adversarial_SVBRDF_Estimation_From_a_Single_Image_ICCV_2021_paper.pdf", - "aff": "PeRCeiVe Lab; Department of Electrical, Electronics and Computer Engineering \u2013 University of Catania, Italy; Department of Electrical, Electronics and Computer Engineering \u2013 University of Catania, Italy", + "aff": "PeRCeiVe Lab; Department of Electrical, Electronics and Computer Engineering – University of Catania, Italy; Department of Electrical, Electronics and Computer Engineering – University of Catania, Italy", "project": "", "github": "https://github.com/perceivelab/surfacenet", "supp": "", @@ -40230,7 +42946,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", - "aff_country_unique": ";Italy" + "aff_country_unique": ";Italy", + "bibtex": "@InProceedings{Vecchio_2021_ICCV,\n \n author = {\n Vecchio,\n Giuseppe and Palazzo,\n Simone and Spampinato,\n Concetto\n},\n title = {\n SurfaceNet: Adversarial SVBRDF Estimation From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12840-12848\n} \n}" }, { "title": "Swin Transformer: Hierarchical Vision Transformer Using Shifted Windows", @@ -40238,6 +42955,7 @@ "status": "Poster", "track": "main", "pid": 7354, + "author_site": "Ze Liu; Yutong Lin; Yue Cao; Han Hu; Yixuan Wei; Zheng Zhang; Stephen Lin; Baining Guo", "author": "Ze Liu; Yutong Lin; Yue Cao; Han Hu; Yixuan Wei; Zheng Zhang; Stephen Lin; Baining Guo", "abstract": "This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures. 
The code and models are publicly available at https://github.com/microsoft/Swin-Transformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Swin_Transformer_Hierarchical_Vision_Transformer_Using_Shifted_Windows_ICCV_2021_paper.pdf", @@ -40254,14 +42972,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Swin_Transformer_Hierarchical_Vision_Transformer_Using_Shifted_Windows_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+2;0;0+3;0;0;0;0", - "aff_unique_norm": "Microsoft;University of Science and Technology of China;Xi'an Jiao Tong University;Tsinghua University", + "aff_unique_norm": "Microsoft Research;University of Science and Technology of China;Xi'an Jiaotong University;Tsinghua University", "aff_unique_dep": "Research;;;", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia;http://www.ustc.edu.cn;https://www.xjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MSR Asia;USTC;XJTU;THU", "aff_campus_unique_index": "0;0+2;0;0;0;0;0;0", "aff_campus_unique": "Asia;;Xi'an", "aff_country_unique_index": "0+0;0+0;0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Ze and Lin,\n Yutong and Cao,\n Yue and Hu,\n Han and Wei,\n Yixuan and Zhang,\n Zheng and Lin,\n Stephen and Guo,\n Baining\n},\n title = {\n Swin Transformer: Hierarchical Vision Transformer Using Shifted Windows\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10012-10022\n} \n}" }, { "title": "Switchable K-Class Hyperplanes for Noise-Robust Representation Learning", @@ -40269,6 +42988,7 @@ "status": "Poster", "track": "main", "pid": 6789, + "author_site": "Boxiao Liu; Guanglu Song; Manyuan Zhang; Haihang You; Yu Liu", "author": "Boxiao Liu; Guanglu Song; Manyuan Zhang; Haihang You; Yu Liu", "abstract": 
"Optimizing the K-class hyperplanes in the latent space has become the standard paradigm for efficient representation learning. However, it's almost impossible to find an optimal K-class hyperplane to accurately describe the latent space of massive noisy data. For this potential problem, we constructively propose a new method, named Switchable K-class Hyperplanes (SKH), to sufficiently describe the latent space by the mixture of K-class hyperplanes. It can directly replace the conventional single K-class hyperplane optimization as the new paradigm for noise-robust representation learning. When collaborated with the popular ArcFace on million-level data representation learning, we found that the switchable manner in SKH can effectively eliminate the gradient conflict generated by real-world label noise on a single K-class hyperplane. Moreover, combined with the margin-based loss functions (e.g. ArcFace), we propose a simple Posterior Data Clean strategy to reduce the model optimization deviation on clean dataset caused by the reduction of valid categories in each K-class hyperplane. 
Extensive experiments demonstrate that the proposed SKH easily achieves new state-of-the-art on IJB-B and IJB-C by encouraging noise-robust representation learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Switchable_K-Class_Hyperplanes_for_Noise-Robust_Representation_Learning_ICCV_2021_paper.pdf", @@ -40285,14 +43005,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Switchable_K-Class_Hyperplanes_for_Noise-Robust_Representation_Learning_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;2+3;0+1;2", - "aff_unique_norm": "Institute of Computing Technology;University of Chinese Academy of Sciences;SenseTime;Chinese University of Hong Kong", + "aff_unique_norm": "Institute of Computing Technology;University of Chinese Academy of Sciences;SenseTime;The Chinese University of Hong Kong", "aff_unique_dep": "State Key Laboratory of Computer Architecture;;SenseTime Research;CUHK - SenseTime Joint Lab", "aff_unique_url": "http://www.ict.ac.cn;http://www.ucas.ac.cn;https://www.sensetime.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "ICT;UCAS;SenseTime;CUHK", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Boxiao and Song,\n Guanglu and Zhang,\n Manyuan and You,\n Haihang and Liu,\n Yu\n},\n title = {\n Switchable K-Class Hyperplanes for Noise-Robust Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3019-3028\n} \n}" }, { "title": "SynFace: Face Recognition With Synthetic Data", @@ -40300,6 +43021,7 @@ "status": "Poster", "track": "main", "pid": 5961, + "author_site": "Haibo Qiu; Baosheng Yu; Dihong Gong; Zhifeng Li; Wei Liu; Dacheng Tao", "author": "Haibo Qiu; Baosheng Yu; 
Dihong Gong; Zhifeng Li; Wei Liu; Dacheng Tao", "abstract": "With the recent success of deep neural networks, remarkable progress has been achieved on face recognition. However, collecting large-scale real-world training data for face recognition has turned out to be challenging, especially due to the label noise and privacy issues. Meanwhile, existing face recognition datasets are usually collected from web images, lacking detailed annotations on attributes (e.g., pose and expression), so the influences of different attributes on face recognition have been poorly investigated. In this paper, we address the above-mentioned issues in face recognition using synthetic face images, i.e., SynFace. Specifically, we first explore the performance gap between recent state-of-the-art face recognition models trained with synthetic and real face images. We then analyze the underlying causes behind the performance gap, e.g., the poor intra-class variations and the domain gap between synthetic and real face images. Inspired by this, we devise the SynFace with identity mixup (IM) and domain mixup (DM) to mitigate the above performance gap, demonstrating the great potentials of synthetic data for face recognition. Furthermore, with the controllable face synthesis model, we can easily manage different factors of synthetic face generation, including pose, expression, illumination, the number of identities, and samples per identity. 
Therefore, we also perform a systematically empirical analysis on synthetic face images to provide some insights on how to effectively utilize synthetic data for face recognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qiu_SynFace_Face_Recognition_With_Synthetic_Data_ICCV_2021_paper.pdf", @@ -40316,14 +43038,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Qiu_SynFace_Face_Recognition_With_Synthetic_Data_ICCV_2021_paper.html", "aff_unique_index": "0+1;1;2;2;2;0+1", - "aff_unique_norm": "JD;University of Sydney;Tencent", - "aff_unique_dep": "JD Explore Academy;;Tencent Data Platform", + "aff_unique_norm": "JD Explore Academy;The University of Sydney;Tencent", + "aff_unique_dep": ";;Tencent Data Platform", "aff_unique_url": ";https://www.sydney.edu.au;https://www.tencent.com", "aff_unique_abbr": ";USYD;Tencent", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;0;0;0+1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Qiu_2021_ICCV,\n \n author = {\n Qiu,\n Haibo and Yu,\n Baosheng and Gong,\n Dihong and Li,\n Zhifeng and Liu,\n Wei and Tao,\n Dacheng\n},\n title = {\n SynFace: Face Recognition With Synthetic Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10880-10890\n} \n}" }, { "title": "Synchronization of Group-Labelled Multi-Graphs", @@ -40331,6 +43054,7 @@ "status": "Poster", "track": "main", "pid": 4176, + "author_site": "Andrea Porfiri Dal Cin; Luca Magri; Federica Arrigoni; Andrea Fusiello; Giacomo Boracchi", "author": "Andrea Porfiri Dal Cin; Luca Magri; Federica Arrigoni; Andrea Fusiello; Giacomo Boracchi", "abstract": "Synchronization refers to the problem of inferring the unknown values attached to vertices of a graph where edges are labelled with the ratio of the 
incident vertices, and labels belong to a group. This paper addresses the synchronization problem on multi-graphs, that are graphs with more than one edge connecting the same pair of nodes. The problem naturally arises when multiple measures are available to model the relationship between two vertices. This happens when different sensors measure the same quantity, or when the original graph is partitioned into sub-graphs that are solved independently. In this case, the relationships among sub-graphs give rise to multi-edges and the problem can be traced back to a multi-graph synchronization. The baseline solution reduces multi-graphs to simple ones by averaging their multi-edges, however this approach falls short because: i) averaging is well defined only for some groups and ii) the resulting estimator is less precise and accurate, as we prove empirically. Specifically, we present MultiSynch, a synchronization algorithm for multi-graphs that is based on a principled constrained eigenvalue optimization. 
MultiSynch is a general solution that can cope with any linear group and we show to be profitably usable both on synthetic and real problems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dal_Cin_Synchronization_of_Group-Labelled_Multi-Graphs_ICCV_2021_paper.pdf", @@ -40354,7 +43078,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Cin_2021_ICCV,\n \n author = {\n Cin,\n Andrea Porfiri Dal and Magri,\n Luca and Arrigoni,\n Federica and Fusiello,\n Andrea and Boracchi,\n Giacomo\n},\n title = {\n Synchronization of Group-Labelled Multi-Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6453-6463\n} \n}" }, { "title": "Syncretic Modality Collaborative Learning for Visible Infrared Person Re-Identification", @@ -40362,6 +43087,7 @@ "status": "Poster", "track": "main", "pid": 7258, + "author_site": "Ziyu Wei; Xi Yang; Nannan Wang; Xinbo Gao", "author": "Ziyu Wei; Xi Yang; Nannan Wang; Xinbo Gao", "abstract": "Visible infrared person re-identification (VI-REID) aims to match pedestrian images between the daytime visible and nighttime infrared camera views. The large cross-modality discrepancies have become the bottleneck which limits the performance of VI-REID. Existing methods mainly focus on capturing cross-modality sharable representations by learning an identity classifier. However, the heterogeneous pedestrian images taken by different spectrum cameras differ significantly in image styles, resulting in inferior discriminability of feature representations. To alleviate the above problem, this paper explores the correlation between two modalities and proposes a novel syncretic modality collaborative learning (SMCL) model to bridge the cross-modality gap. 
A new modality that incorporates features of heterogeneous images is constructed automatically to steer the generation of modality-invariant representations. Challenge enhanced homogeneity learning (CEHL) and auxiliary distributional similarity learning (ADSL) are integrated to project heterogeneous features on a unified space and enlarge the inter-class disparity, thus strengthening the discriminative power. Extensive experiments on two cross-modality benchmarks demonstrate the effectiveness and superiority of the proposed method. Especially, on SYSU-MM01 dataset, our SMCL model achieves 67.39% rank-1 accuracy and 61.78% mAP, surpassing the cutting-edge works by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wei_Syncretic_Modality_Collaborative_Learning_for_Visible_Infrared_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -40385,7 +43111,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2021_ICCV,\n \n author = {\n Wei,\n Ziyu and Yang,\n Xi and Wang,\n Nannan and Gao,\n Xinbo\n},\n title = {\n Syncretic Modality Collaborative Learning for Visible Infrared Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 225-234\n} \n}" }, { "title": "Synthesis of Compositional Animations From Textual Descriptions", @@ -40393,6 +43120,7 @@ "status": "Poster", "track": "main", "pid": 7714, + "author_site": "Anindita Ghosh; Noshaba Cheema; Cennet Oguz; Christian Theobalt; Philipp Slusallek", "author": "Anindita Ghosh; Noshaba Cheema; Cennet Oguz; Christian Theobalt; Philipp Slusallek", "abstract": "How can we animate 3D-characters from a movie script or move robots by simply telling them what we would like them to do?\" How unstructured and complex can we make a 
sentence and still generate plausible movements from it?\" These are questions that need to be answered in the long-run, as the field is still in its infancy. Inspired by these problems, we present a new technique for generating compositional actions, which handles complex input sentences. Our output is a 3D pose sequence depicting the actions in the input sentence. We propose a hierarchical two-stream sequential model to explore a finer joint-level mapping between natural language sentences and 3D pose sequences corresponding to the given motion. We learn two manifold representations of the motion, one each for the upper body and the lower body movements. Our model can generate plausible pose sequences for short sentences describing single actions as well as long complex sentences describing multiple sequential and compositional actions. We evaluate our proposed model on the publicly available KIT Motion-Language Dataset containing 3D pose data with human-annotated sentences. Experimental results show that our model advances the state-of-the-art on text-based motion synthesis in objective evaluations by a margin of 50%. 
Qualitative evaluations based on a user study indicate that our synthesized motions are perceived to be the closest to the ground-truth motion captures for both short and compositional sentences.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ghosh_Synthesis_of_Compositional_Animations_From_Textual_Descriptions_ICCV_2021_paper.pdf", @@ -40414,9 +43142,10 @@ "aff_unique_url": "https://www.dFKI.de;https://www.uni-saarland.de;https://mpi-inf.mpg.de", "aff_unique_abbr": "DFKI;Uni Saar;MPII", "aff_campus_unique_index": "1;1;1;1;1", - "aff_campus_unique": ";Saarbr\u00fccken", + "aff_campus_unique": ";Saarbrücken", "aff_country_unique_index": "0+0;0+0+0;0+0;0+0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Ghosh_2021_ICCV,\n \n author = {\n Ghosh,\n Anindita and Cheema,\n Noshaba and Oguz,\n Cennet and Theobalt,\n Christian and Slusallek,\n Philipp\n},\n title = {\n Synthesis of Compositional Animations From Textual Descriptions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1396-1406\n} \n}" }, { "title": "Synthesized Feature Based Few-Shot Class-Incremental Learning on a Mixture of Subspaces", @@ -40424,6 +43153,7 @@ "status": "Poster", "track": "main", "pid": 1771, + "author_site": "Ali Cheraghian; Shafin Rahman; Sameera Ramasinghe; Pengfei Fang; Christian Simon; Lars Petersson; Mehrtash Harandi", "author": "Ali Cheraghian; Shafin Rahman; Sameera Ramasinghe; Pengfei Fang; Christian Simon; Lars Petersson; Mehrtash Harandi", "abstract": "Few-shot class incremental learning (FSCIL) aims to incrementally add sets of novel classes to a well-trained base model in multiple training sessions with the restriction that only a few novel instances are available per class. 
While learning novel classes, FSCIL methods gradually forget base (old) class training and overfit to a few novel class samples. Existing approaches have addressed this problem by computing the class prototypes from the visual or semantic word vector domain. In this paper, we propose addressing this problem using a mixture of subspaces. Subspaces define the cluster structure of the visual domain and help to describe the visual and semantic domain considering the overall distribution of the data. Additionally, we propose to employ a variational autoencoder (VAE) to generate synthesized visual samples for augmenting pseudo-feature while learning novel classes incrementally. The combined effect of the mixture of subspaces and synthesized features reduces the forgetting and overfitting problem of FSCIL. Extensive experiments on three image classification datasets show that our proposed method achieves competitive results compared to state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cheraghian_Synthesized_Feature_Based_Few-Shot_Class-Incremental_Learning_on_a_Mixture_of_ICCV_2021_paper.pdf", @@ -40447,7 +43177,8 @@ "aff_campus_unique_index": ";1;;;;", "aff_campus_unique": ";Dhaka", "aff_country_unique_index": "0+0;1;0+0;0+0;0+0;0+0;0", - "aff_country_unique": "Australia;Bangladesh" + "aff_country_unique": "Australia;Bangladesh", + "bibtex": "@InProceedings{Cheraghian_2021_ICCV,\n \n author = {\n Cheraghian,\n Ali and Rahman,\n Shafin and Ramasinghe,\n Sameera and Fang,\n Pengfei and Simon,\n Christian and Petersson,\n Lars and Harandi,\n Mehrtash\n},\n title = {\n Synthesized Feature Based Few-Shot Class-Incremental Learning on a Mixture of Subspaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8661-8670\n} \n}" }, { "title": "T-AutoML: Automated Machine Learning for Lesion Segmentation Using Transformers in 3D 
Medical Imaging", @@ -40455,6 +43186,7 @@ "status": "Poster", "track": "main", "pid": 10902, + "author_site": "Dong Yang; Andriy Myronenko; Xiaosong Wang; Ziyue Xu; Holger R. Roth; Daguang Xu", "author": "Dong Yang; Andriy Myronenko; Xiaosong Wang; Ziyue Xu; Holger R. Roth; Daguang Xu", "abstract": "Lesion segmentation in medical imaging has been an important topic in clinical research. Researchers have proposed various detection and segmentation algorithms to address this task. Recently, deep learning-based approaches have significantly improved the performance over conventional methods. However, most state-of-the-art deep learning methods require the manual design of multiple network components and training strategies. In this paper, we propose a new automated machine learning algorithm, T-AutoML, which not only searches for the best neural architecture, but also finds the best combination of hyper-parameters and data augmentation strategies simultaneously. The proposed method utilizes the modern transformer model, which is introduced to adapt to the dynamic length of the search space embedding and can significantly improve the ability of the search. 
We validate T-AutoML on several large-scale public lesion segmentation data-sets and achieve state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_T-AutoML_Automated_Machine_Learning_for_Lesion_Segmentation_Using_Transformers_in_ICCV_2021_paper.pdf", @@ -40471,14 +43203,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_T-AutoML_Automated_Machine_Learning_for_Lesion_Segmentation_Using_Transformers_in_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "NVIDIA", - "aff_unique_dep": "NVIDIA Corporation", + "aff_unique_norm": "NVIDIA Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Dong and Myronenko,\n Andriy and Wang,\n Xiaosong and Xu,\n Ziyue and Roth,\n Holger R. and Xu,\n Daguang\n},\n title = {\n T-AutoML: Automated Machine Learning for Lesion Segmentation Using Transformers in 3D Medical Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3962-3974\n} \n}" }, { "title": "T-Net: Effective Permutation-Equivariant Network for Two-View Correspondence Learning", @@ -40486,6 +43219,7 @@ "status": "Poster", "track": "main", "pid": 4234, + "author_site": "Zhen Zhong; Guobao Xiao; Linxin Zheng; Yan Lu; Jiayi Ma", "author": "Zhen Zhong; Guobao Xiao; Linxin Zheng; Yan Lu; Jiayi Ma", "abstract": "We develop a conceptually simple, flexible, and effective framework (named T-Net) for two-view correspondence learning. 
Given a set of putative correspondences, we reject outliers and regress the relative pose encoded by the essential matrix, by an end-to-end framework, which is consisted of two novel structures: \"-\" structure and \"|\" structure. \"-\" structure adopts an iterative strategy to learn correspondence features. \"|\" structure integrates all the features of the iterations and outputs the correspondence weight. In addition, we introduce Permutation-Equivariant Context Squeeze-and-Excitation module, an adapted version of SE module, to process sparse correspondences in a permutation-equivariant way and capture both global and channel-wise contextual information. Extensive experiments on outdoor and indoor scenes show that the proposed T-Net achieves state-of-the-art performance. On outdoor scenes (YFCC100M dataset), T-Net achieves an mAP of 52.28%, a 34.22% precision increase from the best-published result (38.95%). On indoor scenes (SUN3D dataset), T-Net (19.71%) obtains a 21.82% precision increase from the best-published result (16.18%).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhong_T-Net_Effective_Permutation-Equivariant_Network_for_Two-View_Correspondence_Learning_ICCV_2021_paper.pdf", @@ -40509,7 +43243,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhong_2021_ICCV,\n \n author = {\n Zhong,\n Zhen and Xiao,\n Guobao and Zheng,\n Linxin and Lu,\n Yan and Ma,\n Jiayi\n},\n title = {\n T-Net: Effective Permutation-Equivariant Network for Two-View Correspondence Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1950-1959\n} \n}" }, { "title": "T-SVDNet: Exploring High-Order Prototypical Correlations for Multi-Source Domain Adaptation", @@ -40517,10 +43252,11 @@ "status": 
"Poster", "track": "main", "pid": 2326, + "author_site": "Ruihuang Li; Xu Jia; Jianzhong He; Shuaijun Chen; Qinghua Hu", "author": "Ruihuang Li; Xu Jia; Jianzhong He; Shuaijun Chen; Qinghua Hu", "abstract": "Most existing domain adaptation methods focus on adaptation from only one source domain, however, in practice there are a number of relevant sources that could be leveraged to help improve performance on target domain. We propose a novel approach named T-SVDNet to address the task of Multi-source Domain Adaptation (MDA), which is featured by incorporating Tensor Singular Value Decomposition (T-SVD) into a neural network's training pipeline. Overall, high-order correlations among multiple domains are fully explored so as to better bridge the domain gap in this work. Specifically, we impose Tensor-Low-Rank (TLR) constraint on the tensor obtained by stacking up a group of prototypical similarity matrices, aiming at capturing consistent data structure across different domains. Furthermore, to avoid negative transfer brought by noisy source data, we propose a novel uncertainty-aware weighting strategy to adaptively assign weights to different source domains and samples based on the result of uncertainty estimation. 
Extensive experiments conducted on public benchmarks demonstrate the superiority of our model in addressing the task of MDA compared to state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_T-SVDNet_Exploring_High-Order_Prototypical_Correlations_for_Multi-Source_Domain_Adaptation_ICCV_2021_paper.pdf", - "aff": "College of Intelligence and Computing, Tianjin University; Dalian University of Technology; Huawei Technologies; Noah\u2019s Ark Lab, Huawei Technologies; College of Intelligence and Computing, Tianjin University + Noah\u2019s Ark Lab, Huawei Technologies", + "aff": "College of Intelligence and Computing, Tianjin University; Dalian University of Technology; Huawei Technologies; Noah’s Ark Lab, Huawei Technologies; College of Intelligence and Computing, Tianjin University + Noah’s Ark Lab, Huawei Technologies", "project": "", "github": "https://github.com/lslrh/T-SVDNet", "supp": "", @@ -40533,14 +43269,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_T-SVDNet_Exploring_High-Order_Prototypical_Correlations_for_Multi-Source_Domain_Adaptation_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;2;0+2", - "aff_unique_norm": "Tianjin University;Dalian University of Technology;Huawei", - "aff_unique_dep": "College of Intelligence and Computing;;Huawei Technologies", + "aff_unique_norm": "Tianjin University;Dalian University of Technology;Huawei Technologies", + "aff_unique_dep": "College of Intelligence and Computing;;", "aff_unique_url": "http://www.tju.edu.cn;http://www.dlut.edu.cn/;https://www.huawei.com", "aff_unique_abbr": ";DUT;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Ruihuang and Jia,\n Xu and He,\n Jianzhong and Chen,\n Shuaijun and Hu,\n Qinghua\n},\n title = {\n T-SVDNet: Exploring 
High-Order Prototypical Correlations for Multi-Source Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9991-10000\n} \n}" }, { "title": "TACo: Token-Aware Cascade Contrastive Learning for Video-Text Alignment", @@ -40548,6 +43285,7 @@ "status": "Poster", "track": "main", "pid": 3215, + "author_site": "Jianwei Yang; Yonatan Bisk; Jianfeng Gao", "author": "Jianwei Yang; Yonatan Bisk; Jianfeng Gao", "abstract": "Contrastive learning has been widely used to train transformer-based vision-language models for video-text alignment and multi-modal representation learning. This paper presents a new algorithm called Token-Aware Cascade contrastive learning (TACo) that improves contrastive learning using two novel techniques. The first is the token-aware contrastive loss which is computed by taking into account the syntactic classes of words. This is motivated by the observation that for a video-text pair, the content words in the text, such as nouns and verbs, are more likely to be aligned with the visual contents in the video than the function words. Second, a cascade sampling method is applied to generate a small set of hard negative examples for efficient loss estimation for multi-modal fusion layers. To validate the effectiveness of TACo, in our experiments we finetune pretrained models for a set of downstream tasks including text-video retrieval (YouCook2, MSR-VTT and ActivityNet), video action step localization (CrossTask), video action segmentation (COIN). 
Our results show that our models attain consistent improvements across different experimental settings over previous methods, setting new state-of-the-art on three public text-video retrieval benchmarks of YouCook2, MSR-VTT and ActivityNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_TACo_Token-Aware_Cascade_Contrastive_Learning_for_Video-Text_Alignment_ICCV_2021_paper.pdf", @@ -40564,14 +43302,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yang_TACo_Token-Aware_Cascade_Contrastive_Learning_for_Video-Text_Alignment_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Microsoft;Carnegie Mellon University", + "aff_unique_norm": "Microsoft Corporation;Carnegie Mellon University", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.cmu.edu", "aff_unique_abbr": "MSR;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Jianwei and Bisk,\n Yonatan and Gao,\n Jianfeng\n},\n title = {\n TACo: Token-Aware Cascade Contrastive Learning for Video-Text Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11562-11572\n} \n}" }, { "title": "TAM: Temporal Adaptive Module for Video Recognition", @@ -40579,6 +43318,7 @@ "status": "Poster", "track": "main", "pid": 6950, + "author_site": "Zhaoyang Liu; Limin Wang; Wayne Wu; Chen Qian; Tong Lu", "author": "Zhaoyang Liu; Limin Wang; Wayne Wu; Chen Qian; Tong Lu", "abstract": "Video data is with complex temporal dynamics due to various factors such as camera motion, speed variation, and different activities. 
To effectively capture this diverse motion pattern, this paper presents a new temporal adaptive module (TAM) to generate video-specific temporal kernels based on its own feature map. TAM proposes a unique two-level adaptive modeling scheme by decoupling the dynamic kernel into a location sensitive importance map and a location invariant aggregation weight. The importance map is learned in a local temporal window to capture short-term information, while the aggregation weight is generated from a global view with a focus on long-term structure. TAM is a modular block and could be integrated into 2D CNNs to yield a powerful video architecture (TANet) with a very small extra computational cost. The extensive experiments on Kinetics-400 and Something-Something datasets demonstrate that our TAM outperforms other temporal modeling methods consistently, and achieves the state-of-the-art performance under the similar complexity. The code is available at https://github.com/liu-zhy/temporal-adaptive-module.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_TAM_Temporal_Adaptive_Module_for_Video_Recognition_ICCV_2021_paper.pdf", @@ -40602,7 +43342,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Zhaoyang and Wang,\n Limin and Wu,\n Wayne and Qian,\n Chen and Lu,\n Tong\n},\n title = {\n TAM: Temporal Adaptive Module for Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13708-13718\n} \n}" }, { "title": "TF-Blender: Temporal Feature Blender for Video Object Detection", @@ -40610,6 +43351,7 @@ "status": "Poster", "track": "main", "pid": 6618, + "author_site": "Yiming Cui; Liqi Yan; Zhiwen Cao; Dongfang Liu", "author": "Yiming Cui; Liqi Yan; 
Zhiwen Cao; Dongfang Liu", "abstract": "Video objection detection is a challenging task because isolated video frames may encounter appearance deterioration, which introduces great confusion for detection. One of the popular solutions is to exploit the temporal information and enhance per-frame representation through aggregating features from neighboring frames. Despite achieving improvements in detection, existing methods focus on the selection of higher-level video frames for aggregation rather than modeling lower-level temporal relations to increase the feature representation. To address this limitation, we propose a novel solution named TF-Blender, which includes three modules: 1) Temporal relation models the relations between the current frame and its neighboring frames to preserve spatial information. 2). Feature adjustment enriches the representation of every neighboring feature map; 3) Feature blender combines outputs from the first two modules and produces stronger features for the later detection tasks. For its simplicity, TF-Blender can be effortlessly plugged into any detection network to improve detection behavior. 
Extensive evaluations on ImageNet VID and YouTube-VIS benchmarks indicate the performance guarantees of using TF-Blender on recent state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cui_TF-Blender_Temporal_Feature_Blender_for_Video_Object_Detection_ICCV_2021_paper.pdf", @@ -40633,7 +43375,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Cui_2021_ICCV,\n \n author = {\n Cui,\n Yiming and Yan,\n Liqi and Cao,\n Zhiwen and Liu,\n Dongfang\n},\n title = {\n TF-Blender: Temporal Feature Blender for Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8138-8147\n} \n}" }, { "title": "TGRNet: A Table Graph Reconstruction Network for Table Structure Recognition", @@ -40641,6 +43384,7 @@ "status": "Poster", "track": "main", "pid": 6351, + "author_site": "Wenyuan Xue; Baosheng Yu; Wen Wang; Dacheng Tao; Qingyong Li", "author": "Wenyuan Xue; Baosheng Yu; Wen Wang; Dacheng Tao; Qingyong Li", "abstract": "A table arranging data in rows and columns is a very effective data structure, which has been widely used in business and scientific research. Considering large-scale tabular data in online and offline documents, automatic table recognition has attracted increasing attention from the document analysis community. Though human can easily understand the structure of tables, it remains a challenge for machines to understand that, especially due to a variety of different table layouts and styles. 
Existing methods usually model a table as either the markup sequence or the adjacency matrix between different table cells, failing to address the importance of the logical location of table cells, e.g., a cell is located in the first row and the second column of the table. In this paper, we reformulate the problem of table structure recognition as the table graph reconstruction, and propose an end-to-end trainable table graph reconstruction network (TGRNet) for table structure recognition. Specifically, the proposed method has two main branches, a cell detection branch and a cell logical location branch, to jointly predict the spatial location and the logical location of different cells. Experimental results on three popular table recognition datasets and a new dataset with table graph annotations (TableGraph-350K) demonstrate the effectiveness of the proposed TGRNet for table structure recognition. Code and annotations will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xue_TGRNet_A_Table_Graph_Reconstruction_Network_for_Table_Structure_Recognition_ICCV_2021_paper.pdf", @@ -40655,7 +43399,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xue_TGRNet_A_Table_Graph_Reconstruction_Network_for_Table_Structure_Recognition_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xue_TGRNet_A_Table_Graph_Reconstruction_Network_for_Table_Structure_Recognition_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xue_2021_ICCV,\n \n author = {\n Xue,\n Wenyuan and Yu,\n Baosheng and Wang,\n Wen and Tao,\n Dacheng and Li,\n Qingyong\n},\n title = {\n TGRNet: A Table Graph Reconstruction Network for Table Structure Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1295-1304\n} \n}" }, { "title": "THDA: Treasure 
Hunt Data Augmentation for Semantic Navigation", @@ -40663,6 +43408,7 @@ "status": "Poster", "track": "main", "pid": 2015, + "author_site": "Oleksandr Maksymets; Vincent Cartillier; Aaron Gokaslan; Erik Wijmans; Wojciech Galuba; Stefan Lee; Dhruv Batra", "author": "Oleksandr Maksymets; Vincent Cartillier; Aaron Gokaslan; Erik Wijmans; Wojciech Galuba; Stefan Lee; Dhruv Batra", "abstract": "Can general-purpose neural models learn to navigate? For PointGoal navigation (\"\"go to x, y\"\"), the answer is a clear `yes' -- mapless neural models composed of task-agnostic components (CNNs and RNNs) trained with large-scale model-free reinforcement learning achieve near-perfect performance. However, for ObjectGoal navigation (\"\"find a TV\"\"), this is an open question; one we tackle in this paper. The current best-known result on ObjectNav with general-purpose models is 6% success rate. First, we show that the key problem is overfitting. Large-scale training results in 94% success rate on training environments and only 8% in validation. We observe that this stems from agents memorizing environment layouts during training -- sidestepping the need for exploration and directly learning shortest paths to nearby goal objects. We show that this is a natural consequence of optimizing for the task metric (which in fact penalizes exploration), is enabled by powerful observation encoders, and is possible due to the finite set of training environment configurations. Informed by our findings, we introduce Treasure Hunt Data Augmentation (THDA) to address overfitting in ObjectNav. THDA inserts 3D scans of household objects at arbitrary scene locations and uses them as ObjectNav goals -- augmenting and greatly expanding the set of training layouts. 
Taken together with our other proposed changes, we improve the state of art on the Habitat ObjectGoal Navigation benchmark by 90% (from 14% success rate to 27%) and path efficiency by 48% (from 7.5 SPL to 11.1 SPL).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Maksymets_THDA_Treasure_Hunt_Data_Augmentation_for_Semantic_Navigation_ICCV_2021_paper.pdf", @@ -40679,14 +43425,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Maksymets_THDA_Treasure_Hunt_Data_Augmentation_for_Semantic_Navigation_ICCV_2021_paper.html", "aff_unique_index": "0;1;0+2;1;0;3;0+1", - "aff_unique_norm": "Meta;Georgia Institute of Technology;Cornell University;Oregon State University", + "aff_unique_norm": "Facebook;Georgia Institute of Technology;Cornell University;Oregon State University", "aff_unique_dep": "Facebook AI Research;;;", "aff_unique_url": "https://research.facebook.com;https://www.gatech.edu;https://www.cornell.edu;https://oregonstate.edu", "aff_unique_abbr": "FAIR;Georgia Tech;Cornell;OSU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Maksymets_2021_ICCV,\n \n author = {\n Maksymets,\n Oleksandr and Cartillier,\n Vincent and Gokaslan,\n Aaron and Wijmans,\n Erik and Galuba,\n Wojciech and Lee,\n Stefan and Batra,\n Dhruv\n},\n title = {\n THDA: Treasure Hunt Data Augmentation for Semantic Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15374-15383\n} \n}" }, { "title": "THUNDR: Transformer-Based 3D Human Reconstruction With Markers", @@ -40694,6 +43441,7 @@ "status": "Poster", "track": "main", "pid": 10392, + "author_site": "Mihai Zanfir; Andrei Zanfir; Eduard Gabriel Bazavan; William T. 
Freeman; Rahul Sukthankar; Cristian Sminchisescu", "author": "Mihai Zanfir; Andrei Zanfir; Eduard Gabriel Bazavan; William T. Freeman; Rahul Sukthankar; Cristian Sminchisescu", "abstract": "We present THUNDR, a transformer-based deep neural network methodology to reconstruct the 3d pose and shape of people, given monocular RGB images. Key to our methodology is an intermediate 3d marker representation, where we aim to combine the predictive power of model-free-output architectures and the regularizing, anthropometrically-preserving properties of a statistical human surface model like GHUM---a recently introduced, expressive full body statistical 3d human model, trained end-to-end. Our novel transformer-based prediction pipeline can focus on image regions relevant to the task, supports self-supervised regimes, and ensures that solutions are consistent with human anthropometry. We show state-of-the-art results on Human3.6M and 3DPW, for both the fully-supervised and the self-supervised models, for the task of inferring 3d human shape, joint positions, and global translation. 
Moreover, we observe very solid 3d reconstruction performance for difficult human poses collected in the wild.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zanfir_THUNDR_Transformer-Based_3D_Human_Reconstruction_With_Markers_ICCV_2021_paper.pdf", @@ -40711,13 +43459,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zanfir_THUNDR_Transformer-Based_3D_Human_Reconstruction_With_Markers_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Google", - "aff_unique_dep": "Google", + "aff_unique_dep": "", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zanfir_2021_ICCV,\n \n author = {\n Zanfir,\n Mihai and Zanfir,\n Andrei and Bazavan,\n Eduard Gabriel and Freeman,\n William T. and Sukthankar,\n Rahul and Sminchisescu,\n Cristian\n},\n title = {\n THUNDR: Transformer-Based 3D Human Reconstruction With Markers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12971-12980\n} \n}" }, { "title": "TMCOSS: Thresholded Multi-Criteria Online Subset Selection for Data-Efficient Autonomous Driving", @@ -40725,6 +43474,7 @@ "status": "Poster", "track": "main", "pid": 8648, + "author_site": "Soumi Das; Harikrishna Patibandla; Suparna Bhattacharya; Kshounis Bera; Niloy Ganguly; Sourangshu Bhattacharya", "author": "Soumi Das; Harikrishna Patibandla; Suparna Bhattacharya; Kshounis Bera; Niloy Ganguly; Sourangshu Bhattacharya", "abstract": "Training vision-based Autonomous driving models is a challenging problem with enormous practical implications. 
One of the main challenges is the requirement of storage and processing of vast volumes of (possibly redundant) driving video data. In this paper, we study the problem of data-efficient training of autonomous driving systems. We argue that in the context of an edge-device deployment, multi-criteria online video frame subset selection is an appropriate technique for developing such frameworks. We study existing convex optimization based solutions and show that they are unable to provide solution with high weightage to loss of selected video frames. We design a novel multi-criteria online subset selection algorithm, TMCOSS, which uses a thresholded concave function of selection variables. Extensive experiments using driving simulator CARLA show that we are able to drop 80% of the frames, while succeeding to complete 100% of the episodes. We also show that TMCOSS improves performance on the crucial affordance 'Relative Angle' during turns, on inclusion of bucket-specific relative angle loss (BL), leading to selection of more frames in those parts. 
TMCOSS also achieves an 80% reduction in number of training video frames, on real-world videos from the standard BDD and Cityscapes datasets, for the tasks of drivable area segmentation, and semantic segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Das_TMCOSS_Thresholded_Multi-Criteria_Online_Subset_Selection_for_Data-Efficient_Autonomous_Driving_ICCV_2021_paper.pdf", @@ -40739,7 +43489,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Das_TMCOSS_Thresholded_Multi-Criteria_Online_Subset_Selection_for_Data-Efficient_Autonomous_Driving_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Das_TMCOSS_Thresholded_Multi-Criteria_Online_Subset_Selection_for_Data-Efficient_Autonomous_Driving_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Das_2021_ICCV,\n \n author = {\n Das,\n Soumi and Patibandla,\n Harikrishna and Bhattacharya,\n Suparna and Bera,\n Kshounis and Ganguly,\n Niloy and Bhattacharya,\n Sourangshu\n},\n title = {\n TMCOSS: Thresholded Multi-Criteria Online Subset Selection for Data-Efficient Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6341-6350\n} \n}" }, { "title": "TOOD: Task-Aligned One-Stage Object Detection", @@ -40747,6 +43498,7 @@ "status": "Poster", "track": "main", "pid": 3835, + "author_site": "Chengjian Feng; Yujie Zhong; Yu Gao; Matthew R. Scott; Weilin Huang", "author": "Chengjian Feng; Yujie Zhong; Yu Gao; Matthew R. Scott; Weilin Huang", "abstract": "One-stage object detection is commonly implemented by optimizing two sub-tasks: object classification and localization, using heads with two parallel branches, which might lead to a certain level of spatial misalignment in predictions between the two tasks. 
In this work, we propose a Task-aligned One-stage Object Detection (TOOD) that explicitly aligns the two tasks in a learning-based manner. First, we design a novel Task-aligned Head (T-Head) which offers a better balance between learning task-interactive and task-specific features, as well as a greater flexibility to learn the alignment via a task-aligned predictor. Second, we propose Task Alignment Learning (TAL) to explicitly pull closer (or even unify) the optimal anchors for the two tasks during training via a designed sample assignment scheme and a task-aligned loss. Extensive experiments are conducted on MS-COCO, where TOOD achieves a 51.1 AP at single-model single-scale testing. This surpasses the recent one-stage detectors by a large margin, such as ATSS (47.7 AP), GFL (48.2 AP), and PAA (49.0 AP), with fewer parameters and FLOPs. Qualitative results also demonstrate the effectiveness of TOOD for better aligning the tasks of object classification and localization. Code is available at https://github.com/fcjian/TOOD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_TOOD_Task-Aligned_One-Stage_Object_Detection_ICCV_2021_paper.pdf", @@ -40770,7 +43522,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Chengjian and Zhong,\n Yujie and Gao,\n Yu and Scott,\n Matthew R. 
and Huang,\n Weilin\n},\n title = {\n TOOD: Task-Aligned One-Stage Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3510-3519\n} \n}" }, { "title": "TRAR: Routing the Attention Spans in Transformer for Visual Question Answering", @@ -40778,6 +43531,7 @@ "status": "Poster", "track": "main", "pid": 6079, + "author_site": "Yiyi Zhou; Tianhe Ren; Chaoyang Zhu; Xiaoshuai Sun; Jianzhuang Liu; Xinghao Ding; Mingliang Xu; Rongrong Ji", "author": "Yiyi Zhou; Tianhe Ren; Chaoyang Zhu; Xiaoshuai Sun; Jianzhuang Liu; Xinghao Ding; Mingliang Xu; Rongrong Ji", "abstract": "Due to the superior ability of global dependency modeling, Transformer and its variants have become the primary choice of many vision-and-language tasks. However, in tasks like Visual Question Answering (VQA) and Referring Expression Comprehension (REC), the multimodal prediction often requires visual information from macro- to micro-views. Therefore, how to dynamically schedule the global and local dependency modeling in Transformer has become an emerging issue. In this paper, we propose an example-dependent routing scheme called TRAnsformer Routing (TRAR) to address this issue. Specifically, in TRAR, each visual Transformer layer is equipped with a routing module with different attention spans. The model can dynamically select the corresponding attentions based on the output of the previous inference step, so as to formulate the optimal routing path for each example. Notably, with careful designs, TRAR can reduce the additional computation and memory overhead to almost negligible. 
To validate TRAR, we conduct extensive experiments on five benchmark datasets of VQA and REC, and achieve superior performance gains than the standard Transformers and a bunch of state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_TRAR_Routing_the_Attention_Spans_in_Transformer_for_Visual_Question_ICCV_2021_paper.pdf", @@ -40792,7 +43546,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_TRAR_Routing_the_Attention_Spans_in_Transformer_for_Visual_Question_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_TRAR_Routing_the_Attention_Spans_in_Transformer_for_Visual_Question_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Yiyi and Ren,\n Tianhe and Zhu,\n Chaoyang and Sun,\n Xiaoshuai and Liu,\n Jianzhuang and Ding,\n Xinghao and Xu,\n Mingliang and Ji,\n Rongrong\n},\n title = {\n TRAR: Routing the Attention Spans in Transformer for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2074-2084\n} \n}" }, { "title": "TRiPOD: Human Trajectory and Pose Dynamics Forecasting in the Wild", @@ -40800,6 +43555,7 @@ "status": "Poster", "track": "main", "pid": 6442, + "author_site": "Vida Adeli; Mahsa Ehsanpour; Ian Reid; Juan Carlos Niebles; Silvio Savarese; Ehsan Adeli; Hamid Rezatofighi", "author": "Vida Adeli; Mahsa Ehsanpour; Ian Reid; Juan Carlos Niebles; Silvio Savarese; Ehsan Adeli; Hamid Rezatofighi", "abstract": "Joint forecasting of human trajectory and pose dynamics is a fundamental building block of various applications ranging from robotics and autonomous driving to surveillance systems. 
Predicting body dynamics requires capturing subtle information embedded in the humans' interactions with each other and with the objects present in the scene. In this paper, we propose a novel TRajectory and POse Dynamics (nicknamed TRiPOD) method based on graph attentional networks to model the human-human and human-object interactions both in the input space and the output space (decoded future output). The model is supplemented by a message passing interface over the graphs to fuse these different levels of interactions efficiently. Furthermore, to incorporate a real-world challenge, we propound to learn an indicator representing whether an estimated body joint is visible/invisible at each frame, e.g. due to occlusion or being outside the sensor field of view. Finally, we introduce a new benchmark for this joint task based on two challenging datasets (PoseTrack and 3DPW) and propose evaluation metrics to measure the effectiveness of predictions in the global space, even when there are invisible cases of joints. 
Our evaluation shows that TRiPOD outperforms all prior work and state-of-the-art specifically designed for each of the trajectory and pose forecasting tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Adeli_TRiPOD_Human_Trajectory_and_Pose_Dynamics_Forecasting_in_the_Wild_ICCV_2021_paper.pdf", @@ -40814,7 +43570,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Adeli_TRiPOD_Human_Trajectory_and_Pose_Dynamics_Forecasting_in_the_Wild_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Adeli_TRiPOD_Human_Trajectory_and_Pose_Dynamics_Forecasting_in_the_Wild_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Adeli_2021_ICCV,\n \n author = {\n Adeli,\n Vida and Ehsanpour,\n Mahsa and Reid,\n Ian and Niebles,\n Juan Carlos and Savarese,\n Silvio and Adeli,\n Ehsan and Rezatofighi,\n Hamid\n},\n title = {\n TRiPOD: Human Trajectory and Pose Dynamics Forecasting in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13390-13400\n} \n}" }, { "title": "TS-CAM: Token Semantic Coupled Attention Map for Weakly Supervised Object Localization", @@ -40822,6 +43579,7 @@ "status": "Poster", "track": "main", "pid": 8416, + "author_site": "Wei Gao; Fang Wan; Xingjia Pan; Zhiliang Peng; Qi Tian; Zhenjun Han; Bolei Zhou; Qixiang Ye", "author": "Wei Gao; Fang Wan; Xingjia Pan; Zhiliang Peng; Qi Tian; Zhenjun Han; Bolei Zhou; Qixiang Ye", "abstract": "Weakly supervised object localization (WSOL) is a challenging problem when given image category labels but requires to learn object localization models. Optimizing a convolutional neural network (CNN) for classification tends to activate local discriminative regions while ignoring complete object extent, causing the partial activation issue. 
In this paper, we argue that partial activation is caused by the intrinsic characteristics of CNN, where the convolution operations produce local receptive fields and experience difficulty to capture long-range feature dependency among pixels. We introduce the token semantic coupled attention map (TS-CAM) to take full advantage of the self-attention mechanism in visual transformer for long-range dependency extraction. TS-CAM first splits an image into a sequence of patch tokens for spatial embedding, which produce attention maps of long-range visual dependency to avoid partial activation. TS-CAM then re-allocates category-related semantics for patch tokens, enabling each of them to be aware of object categories. TS-CAM finally couples the patch tokens with the semantic-agnostic attention map to achieve semantic-aware localization. Experiments on the ILSVRC/CUB-200-2011 datasets show that TS-CAM outperforms its CNN-CAM counterparts by7.1%/27.1%for WSOL, achieving state-of-the-art performance. 
Code is available at https://github.com/vasgaowei/TS-CAM", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gao_TS-CAM_Token_Semantic_Coupled_Attention_Map_for_Weakly_Supervised_Object_ICCV_2021_paper.pdf", @@ -40836,7 +43594,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gao_TS-CAM_Token_Semantic_Coupled_Attention_Map_for_Weakly_Supervised_Object_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gao_TS-CAM_Token_Semantic_Coupled_Attention_Map_for_Weakly_Supervised_Object_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Gao_2021_ICCV,\n \n author = {\n Gao,\n Wei and Wan,\n Fang and Pan,\n Xingjia and Peng,\n Zhiliang and Tian,\n Qi and Han,\n Zhenjun and Zhou,\n Bolei and Ye,\n Qixiang\n},\n title = {\n TS-CAM: Token Semantic Coupled Attention Map for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2886-2895\n} \n}" }, { "title": "Talk-To-Edit: Fine-Grained Facial Editing via Dialog", @@ -40844,6 +43603,7 @@ "status": "Poster", "track": "main", "pid": 1152, + "author_site": "Yuming Jiang; Ziqi Huang; Xingang Pan; Chen Change Loy; Ziwei Liu", "author": "Yuming Jiang; Ziqi Huang; Xingang Pan; Chen Change Loy; Ziwei Liu", "abstract": "Facial editing is an important task in vision and graphics with numerous applications. However, existing works are incapable to deliver a continuous and fine-grained editing mode (e.g., editing a slightly smiling face to a big laughing one) with natural interactions with users. In this work, we propose Talk-to-Edit, an interactive facial editing framework that performs fine-grained attribute manipulation through dialog between the user and the system. Our key insight is to model a continual \"\"semantic field\"\" in the GAN latent space. 
1) Unlike previous works that regard the editing as traversing straight lines in the latent space, here the fine-grained editing is formulated as finding a curving trajectory that respects fine-grained attribute landscape on the semantic field. 2) The curvature at each step is location-specific and determined by the input image as well as the users' language requests. 3) To engage the users in a meaningful dialog, our system generates language feedback by considering both the user request and the current state of the semantic field. We also contribute CelebA-Dialog, a visual-language facial editing dataset to facilitate large-scale study. Specifically, each image has manually annotated fine-grained attribute annotations as well as template-based textual descriptions in natural language. Extensive quantitative and qualitative experiments demonstrate the superiority of our framework in terms of 1) the smoothness of fine-grained editing, 2) the identity/attribute preservation, and 3) the visual photorealism and dialog fluency. 
Notably, user study validates that our overall system is consistently favored by around 80% of the participants.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Talk-To-Edit_Fine-Grained_Facial_Editing_via_Dialog_ICCV_2021_paper.pdf", @@ -40860,14 +43620,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jiang_Talk-To-Edit_Fine-Grained_Facial_Editing_via_Dialog_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Nanyang Technological University;Chinese University of Hong Kong", + "aff_unique_norm": "Nanyang Technological University;The Chinese University of Hong Kong", "aff_unique_dep": "S-Lab;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.cuhk.edu.hk", "aff_unique_abbr": "NTU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Yuming and Huang,\n Ziqi and Pan,\n Xingang and Loy,\n Chen Change and Liu,\n Ziwei\n},\n title = {\n Talk-To-Edit: Fine-Grained Facial Editing via Dialog\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13799-13808\n} \n}" }, { "title": "Target Adaptive Context Aggregation for Video Scene Graph Generation", @@ -40875,6 +43636,7 @@ "status": "Poster", "track": "main", "pid": 8674, + "author_site": "Yao Teng; Limin Wang; Zhifeng Li; Gangshan Wu", "author": "Yao Teng; Limin Wang; Zhifeng Li; Gangshan Wu", "abstract": "This paper deals with a challenging task of video scene graph generation (VidSGG), which could serve as a structured video representation for high-level understanding tasks. 
We present a new detect-to-track paradigm for this task by decoupling the context modeling for relation prediction from the complicated low-level entity tracking. Specifically, we design an efficient method for frame-level VidSGG, termed as Target Adaptive Context Aggregation Network (TRACE), with a focus on capturing spatio-temporal context information for relation recognition. Our TRACE framework streamlines the VidSGG pipeline with a modular design, and presents two unique blocks of Hierarchical Relation Tree (HRTree) construction and Target-adaptive Context Aggregation. More specific, our HRTree first provides an adpative structure for organizing possible relation candidates efficiently, and guides context aggregation module to effectively capture spatio-temporal structure information. Then, we obtain a contextualized feature representation for each relation candidate and build a classification head to recognize its relation category. Finally, we provide a simple temporal association strategy to track TRACE detected results to yield the video-level VidSGG. We perform experiments on two VidSGG benchmarks: ImageNet-VidVRD and Action Genome, and the results demonstrate that our TRACE achieves the state-of-the-art performance. 
The code and models are made available at https://github.com/MCG-NJU/TRACE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Teng_Target_Adaptive_Context_Aggregation_for_Video_Scene_Graph_Generation_ICCV_2021_paper.pdf", @@ -40891,14 +43653,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Teng_Target_Adaptive_Context_Aggregation_for_Video_Scene_Graph_Generation_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Nanjing University;Tencent", + "aff_unique_norm": "Nanjing University;Tencent AI Lab", "aff_unique_dep": "State Key Laboratory for Novel Software Technology;AI Lab", "aff_unique_url": "http://www.nju.edu.cn;https://ai.tencent.com", "aff_unique_abbr": "Nanjing U;Tencent AI Lab", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Teng_2021_ICCV,\n \n author = {\n Teng,\n Yao and Wang,\n Limin and Li,\n Zhifeng and Wu,\n Gangshan\n},\n title = {\n Target Adaptive Context Aggregation for Video Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13688-13697\n} \n}" }, { "title": "Task Switching Network for Multi-Task Learning", @@ -40906,7 +43669,8 @@ "status": "Poster", "track": "main", "pid": 8942, - "author": "Guolei Sun; Thomas Probst; Danda Pani Paudel; Nikola Popovi\u0107; Menelaos Kanakis; Jagruti Patel; Dengxin Dai; Luc Van Gool", + "author_site": "Guolei Sun; Thomas Probst; Danda Pani Paudel; Nikola Popović; Menelaos Kanakis; Jagruti Patel; Dengxin Dai; Luc Van Gool", + "author": "Guolei Sun; Thomas Probst; Danda Pani Paudel; Nikola Popović; Menelaos Kanakis; Jagruti Patel; Dengxin Dai; Luc Van Gool", "abstract": "We introduce Task Switching Networks (TSNs), a task-conditioned architecture with a single 
unified encoder/decoder for efficient multi-task learning. Multiple tasks are performed by switching between them, performing one task at a time. TSNs have a constant number of parameters irrespective of the number of tasks. This scalable yet conceptually simple approach circumvents the overhead and intricacy of task-specific network components in existing works. In fact, we demonstrate for the first time that multi-tasking can be performed with a single task-conditioned decoder. We achieve this by learning task-specific conditioning parameters through a jointly trained task embedding network, encouraging constructive interaction between tasks. Experiments validate the effectiveness of our approach, achieving state-of-the-art results on two challenging multi-task benchmarks, PASCAL-Context and NYUD. Our analysis of the learned task embeddings further indicates a connection to task relationships studied in the recent literature.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Task_Switching_Network_for_Multi-Task_Learning_ICCV_2021_paper.pdf", "aff": "Computer Vision Laboratory, ETH Zurich, Switzerland; Computer Vision Laboratory, ETH Zurich, Switzerland; Computer Vision Laboratory, ETH Zurich, Switzerland; Computer Vision Laboratory, ETH Zurich, Switzerland; Computer Vision Laboratory, ETH Zurich, Switzerland; Computer Vision Laboratory, ETH Zurich, Switzerland; Computer Vision Laboratory, ETH Zurich, Switzerland + MPI for Informatics, Germany; Computer Vision Laboratory, ETH Zurich, Switzerland", @@ -40926,10 +43690,11 @@ "aff_unique_dep": "Computer Vision Laboratory;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-inf.mpg.de", "aff_unique_abbr": "ETHZ;MPII", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0;0;0;0;0;0;0", + "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0;0;0;0;0;0+1;0", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": 
"Switzerland;Germany", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Guolei and Probst,\n Thomas and Paudel,\n Danda Pani and Popovi\\'c,\n Nikola and Kanakis,\n Menelaos and Patel,\n Jagruti and Dai,\n Dengxin and Van Gool,\n Luc\n},\n title = {\n Task Switching Network for Multi-Task Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8291-8300\n} \n}" }, { "title": "Task-Aware Part Mining Network for Few-Shot Learning", @@ -40937,6 +43702,7 @@ "status": "Poster", "track": "main", "pid": 8164, + "author_site": "Jiamin Wu; Tianzhu Zhang; Yongdong Zhang; Feng Wu", "author": "Jiamin Wu; Tianzhu Zhang; Yongdong Zhang; Feng Wu", "abstract": "Few-Shot Learning (FSL) aims at classifying samples into new unseen classes with only a handful of labeled samples available. However, most of the existing methods are based on the image-level pooled representation, yet ignore considerable local clues that are transferable across tasks. To address this issue, we propose an end-to-end Task-aware Part Mining Network (TPMN) by integrating an automatic part mining process into the metric-based model for FSL. The proposed TPMN model enjoys several merits. First, we design a meta filter learner to generate task-aware part filters based on the task embedding in a meta-learning way. The task-aware part filters can adapt to any individual task and automatically mine task-related local parts even for an unseen task. Second, an adaptive importance generator is proposed to identify key local parts and assign adaptive importance weights to different parts. To the best of our knowledge, this is the first work to automatically exploit the task-aware local parts in a meta-learning way for FSL. 
Extensive experimental results on four standard benchmarks demonstrate that the proposed model performs favorably against state-of-the-art FSL methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Task-Aware_Part_Mining_Network_for_Few-Shot_Learning_ICCV_2021_paper.pdf", @@ -40960,7 +43726,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Jiamin and Zhang,\n Tianzhu and Zhang,\n Yongdong and Wu,\n Feng\n},\n title = {\n Task-Aware Part Mining Network for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8433-8442\n} \n}" }, { "title": "TeachText: CrossModal Generalized Distillation for Text-Video Retrieval", @@ -40968,6 +43735,7 @@ "status": "Poster", "track": "main", "pid": 7093, + "author_site": "Ioana Croitoru; Simion-Vlad Bogolin; Marius Leordeanu; Hailin Jin; Andrew Zisserman; Samuel Albanie; Yang Liu", "author": "Ioana Croitoru; Simion-Vlad Bogolin; Marius Leordeanu; Hailin Jin; Andrew Zisserman; Samuel Albanie; Yang Liu", "abstract": "In recent years, considerable progress on the task of text-video retrieval has been achieved by leveraging large-scale pretraining on visual and audio datasets to construct powerful video encoders. By contrast, despite the natural symmetry, the design of effective algorithms for exploiting large-scale language pretraining remains under-explored. In this work, we are the first to investigate the design of such algorithms and propose a novel generalized distillation method,TeachText, which leverages complementary cues from multiple text encoders to provide an enhanced supervisory signal to the retrieval model. 
Moreover, we extend our method to video side modalities and show that we can effectively reduce the number of used modalities at test time without compromising performance. Our approach advances the state of the art on several video retrieval benchmarks by a significant margin and adds no computational overhead at test time. Last but not least, we show an effective application of our method for eliminating noise from retrieval datasets. Code and data can be found at https://www.robots.ox.ac.uk/ vgg/research/teachtext/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Croitoru_TeachText_CrossModal_Generalized_Distillation_for_Text-Video_Retrieval_ICCV_2021_paper.pdf", @@ -40991,7 +43759,8 @@ "aff_campus_unique_index": "0;0;0;0;2+0;0", "aff_campus_unique": "Oxford;;Cambridge", "aff_country_unique_index": "0+1+1;0+1+1;0+1;2;0;0+0;3+0", - "aff_country_unique": "United Kingdom;Romania;United States;China" + "aff_country_unique": "United Kingdom;Romania;United States;China", + "bibtex": "@InProceedings{Croitoru_2021_ICCV,\n \n author = {\n Croitoru,\n Ioana and Bogolin,\n Simion-Vlad and Leordeanu,\n Marius and Jin,\n Hailin and Zisserman,\n Andrew and Albanie,\n Samuel and Liu,\n Yang\n},\n title = {\n TeachText: CrossModal Generalized Distillation for Text-Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11583-11593\n} \n}" }, { "title": "Teacher-Student Adversarial Depth Hallucination To Improve Face Recognition", @@ -40999,10 +43768,11 @@ "status": "Poster", "track": "main", "pid": 4358, + "author_site": "Hardik Uppal; Alireza Sepas-Moghaddam; Michael Greenspan; Ali Etemad", "author": "Hardik Uppal; Alireza Sepas-Moghaddam; Michael Greenspan; Ali Etemad", "abstract": "We present the Teacher-Student Generative Adversarial Network (TS-GAN) to generate depth images from single RGB images in order to boost the performance of 
face recognition systems. For our method to generalize well across unseen datasets, we design two components in the architecture, a teacher and a student. The teacher, which itself consists of a generator and a discriminator, learns a latent mapping between input RGB and paired depth images in a supervised fashion. The student, which consists of two generators (one shared with the teacher) and a discriminator, learns from new RGB data with no available paired depth information, for improved generalization. The fully trained shared generator can then be used in runtime to hallucinate depth from RGB for downstream applications such as face recognition. We perform rigorous experiments to show the superiority of TS-GAN over other methods in generating synthetic depth images. Moreover, face recognition experiments demonstrate that our hallucinated depth along with the input RGB images boost performance across various architectures when compared to a single RGB modality by average values of +1.2%, +2.6%, and +2.6% for IIIT-D, EURECOM, and LFW datasets respectively. 
We make our implementation public at: https://github.com/hardik-uppal/teacher-student-gan.git.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Uppal_Teacher-Student_Adversarial_Depth_Hallucination_To_Improve_Face_Recognition_ICCV_2021_paper.pdf", - "aff": "Queen\u2019s University, Canada; Queen\u2019s University, Canada; Queen\u2019s University, Canada; Queen\u2019s University, Canada", + "aff": "Queen’s University, Canada; Queen’s University, Canada; Queen’s University, Canada; Queen’s University, Canada", "project": "", "github": "https://github.com/hardik-uppal/teacher-student-gan.git", "supp": "", @@ -41022,7 +43792,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Uppal_2021_ICCV,\n \n author = {\n Uppal,\n Hardik and Sepas-Moghaddam,\n Alireza and Greenspan,\n Michael and Etemad,\n Ali\n},\n title = {\n Teacher-Student Adversarial Depth Hallucination To Improve Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3671-3680\n} \n}" }, { "title": "Telling the What While Pointing to the Where: Multimodal Queries for Image Retrieval", @@ -41030,6 +43801,7 @@ "status": "Poster", "track": "main", "pid": 6697, + "author_site": "Soravit Changpinyo; Jordi Pont-Tuset; Vittorio Ferrari; Radu Soricut", "author": "Soravit Changpinyo; Jordi Pont-Tuset; Vittorio Ferrari; Radu Soricut", "abstract": "Most existing image retrieval systems use text queries as a way for the user to express what they are looking for. However, fine-grained image retrieval often requires the ability to also express where in the image the content they are looking for is. The text modality can only cumbersomely express such localization preferences, whereas pointing is a more natural fit. 
In this paper, we propose an image retrieval setup with a new form of multimodal queries, where the user simultaneously uses both spoken natural language (the what) and mouse traces over an empty canvas (the where) to express the characteristics of the desired target image. We then describe simple modifications to an existing image retrieval model, enabling it to operate in this setup. Qualitative and quantitative experiments show that our model effectively takes this spatial guidance into account, and provides significantly more accurate retrieval results compared to text-only equivalent systems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Changpinyo_Telling_the_What_While_Pointing_to_the_Where_Multimodal_Queries_ICCV_2021_paper.pdf", @@ -41053,7 +43825,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Changpinyo_2021_ICCV,\n \n author = {\n Changpinyo,\n Soravit and Pont-Tuset,\n Jordi and Ferrari,\n Vittorio and Soricut,\n Radu\n},\n title = {\n Telling the What While Pointing to the Where: Multimodal Queries for Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12136-12146\n} \n}" }, { "title": "TempNet: Online Semantic Segmentation on Large-Scale Point Cloud Series", @@ -41061,6 +43834,7 @@ "status": "Poster", "track": "main", "pid": 9870, + "author_site": "Yunsong Zhou; Hongzi Zhu; Chunqin Li; Tiankai Cui; Shan Chang; Minyi Guo", "author": "Yunsong Zhou; Hongzi Zhu; Chunqin Li; Tiankai Cui; Shan Chang; Minyi Guo", "abstract": "Online semantic segmentation on a time series of point cloud frames is an essential task in autonomous driving. 
Existing models focus on single-frame segmentation, which cannot achieve satisfactory segmentation accuracy and offer unstably flicker among frames. In this paper, we propose a light-weight semantic segmentation framework for large-scale point cloud series, called TempNet, which can improve both the accuracy and the stability of existing semantic segmentation models by combining a novel frame aggregation scheme. To be computational cost efficient, feature extraction and aggregation are only conducted on a small portion of key frames via a temporal feature aggregation (TFA) network using an attentional pooling mechanism, and such enhanced features are propagated to the intermediate non-key frames. To avoid information loss from non-key frames, a partial feature update (PFU) network is designed to partially update the propagated features with the local features extracted on a non-key frame if a large disparity between the two is quickly assessed. As a result, consistent and information-rich features can be obtained for each frame. We implement TempNet on five state-of-the-art (SOTA) point cloud segmentation models and conduct extensive experiments on the SemanticKITTI dataset. 
Results demonstrate that TempNet outperforms SOTA competitors by wide margins with little extra computational cost.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_TempNet_Online_Semantic_Segmentation_on_Large-Scale_Point_Cloud_Series_ICCV_2021_paper.pdf", @@ -41084,7 +43858,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Yunsong and Zhu,\n Hongzi and Li,\n Chunqin and Cui,\n Tiankai and Chang,\n Shan and Guo,\n Minyi\n},\n title = {\n TempNet: Online Semantic Segmentation on Large-Scale Point Cloud Series\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7118-7127\n} \n}" }, { "title": "Temporal Action Detection With Multi-Level Supervision", @@ -41092,6 +43867,7 @@ "status": "Poster", "track": "main", "pid": 4314, + "author_site": "Baifeng Shi; Qi Dai; Judy Hoffman; Kate Saenko; Trevor Darrell; Huijuan Xu", "author": "Baifeng Shi; Qi Dai; Judy Hoffman; Kate Saenko; Trevor Darrell; Huijuan Xu", "abstract": "Training temporal action detection in videos requires large amounts of labeled data, yet such annotation is expensive to collect. Incorporating unlabeled or weakly-labeled data to train action detection model could help reduce annotation cost. In this work, we first introduce the Semi-supervised Action Detection (SSAD) task with a mixture of labeled and unlabeled data and analyze different types of errors in the proposed SSAD baselines which are directly adapted from the semi-supervised classification literature. 
Identifying that the main source of error is action incompleteness (i.e., missing parts of actions), we alleviate it by designing an unsupervised foreground attention (UFA) module utilizing the conditional independence between foreground and background motion. Then we incorporate weakly-labeled data into SSAD and propose Omni-supervised Action Detection (OSAD) with three levels of supervision. To overcome the accompanying action-context confusion problem in OSAD baselines, an information bottleneck (IB) is designed to suppress the scene information in non-action frames while preserving the action information. We extensively benchmark against the baselines for SSAD and OSAD on our created data splits in THUMOS14 and ActivityNet1.2, and demonstrate the effectiveness of the proposed UFA and IB methods. Lastly, the benefit of our full OSAD-IB model under limited annotation budgets is shown by exploring the optimal annotation strategy for labeled, unlabeled and weakly-labeled data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Shi_Temporal_Action_Detection_With_Multi-Level_Supervision_ICCV_2021_paper.pdf", @@ -41106,7 +43882,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shi_Temporal_Action_Detection_With_Multi-Level_Supervision_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Shi_Temporal_Action_Detection_With_Multi-Level_Supervision_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Shi_2021_ICCV,\n \n author = {\n Shi,\n Baifeng and Dai,\n Qi and Hoffman,\n Judy and Saenko,\n Kate and Darrell,\n Trevor and Xu,\n Huijuan\n},\n title = {\n Temporal Action Detection With Multi-Level Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8022-8032\n} \n}" }, { "title": "Temporal Cue Guided Video Highlight Detection With 
Low-Rank Audio-Visual Fusion", @@ -41114,6 +43891,7 @@ "status": "Poster", "track": "main", "pid": 1215, + "author_site": "Qinghao Ye; Xiyue Shen; Yuan Gao; Zirui Wang; Qi Bi; Ping Li; Guang Yang", "author": "Qinghao Ye; Xiyue Shen; Yuan Gao; Zirui Wang; Qi Bi; Ping Li; Guang Yang", "abstract": "Video highlight detection plays an increasingly important role in social media content filtering, however, it remains highly challenging to develop automated video highlight detection methods because of the lack of temporal annotations (i.e., where the highlight moments are in long videos) for supervised learning. In this paper, we propose a novel weakly supervised method that can learn to detect highlights by mining video characteristics with video level annotations (topic tags) only. Particularly, we exploit audio-visual features to enhance video representation and take temporal cues into account for improving detection performance. Our contributions are threefold: 1) we propose an audio-visual tensor fusion mechanism that efficiently models the complex association between two modalities while reducing the gap of the heterogeneity between the two modalities; 2) we introduce a novel hierarchical temporal context encoder to embed local temporal clues in between neighboring segments; 3) finally, we alleviate the gradient vanishing problem theoretically during model optimization with attention-gated instance aggregation. 
Extensive experiments on two benchmark datasets (YouTube Highlights and TVSum) have demonstrated our method outperforms other state-of-the-art methods with remarkable improvements.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ye_Temporal_Cue_Guided_Video_Highlight_Detection_With_Low-Rank_Audio-Visual_Fusion_ICCV_2021_paper.pdf", @@ -41128,7 +43906,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_Temporal_Cue_Guided_Video_Highlight_Detection_With_Low-Rank_Audio-Visual_Fusion_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ye_Temporal_Cue_Guided_Video_Highlight_Detection_With_Low-Rank_Audio-Visual_Fusion_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Ye_2021_ICCV,\n \n author = {\n Ye,\n Qinghao and Shen,\n Xiyue and Gao,\n Yuan and Wang,\n Zirui and Bi,\n Qi and Li,\n Ping and Yang,\n Guang\n},\n title = {\n Temporal Cue Guided Video Highlight Detection With Low-Rank Audio-Visual Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7950-7959\n} \n}" }, { "title": "Temporal Knowledge Consistency for Unsupervised Visual Representation Learning", @@ -41136,6 +43915,7 @@ "status": "Poster", "track": "main", "pid": 5515, + "author_site": "Weixin Feng; Yuanjiang Wang; Lihua Ma; Ye Yuan; Chi Zhang", "author": "Weixin Feng; Yuanjiang Wang; Lihua Ma; Ye Yuan; Chi Zhang", "abstract": "The instance discrimination paradigm has become dominant in unsupervised learning. It always adopts a teacher-student framework, in which the teacher provides embedded knowledge as a supervision signal for the student. The student learns meaningful representations by enforcing instance spatial consistency with the views from the teacher. 
However, the outputs of the teacher can vary dramatically on the same instance during different training stages, introducing unexpected noise and leading to catastrophic forgetting caused by inconsistent objectives. In this paper, we first integrate instance temporal consistency into current instance discrimination paradigms, and propose a novel and strong algorithm named Temporal Knowledge Consistency (TKC). Specifically, our TKC dynamically ensembles the knowledge of temporal teachers and adaptively selects useful information according to its importance to learning instance temporal consistency. Experimental result shows that TKC can learn better visual representations on both ResNet and AlexNet on linear evaluation protocol while transfer well to downstream tasks. All experiments suggest the good effectiveness and generalization of our method. Code will be made available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_Temporal_Knowledge_Consistency_for_Unsupervised_Visual_Representation_Learning_ICCV_2021_paper.pdf", @@ -41159,7 +43939,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Weixin and Wang,\n Yuanjiang and Ma,\n Lihua and Yuan,\n Ye and Zhang,\n Chi\n},\n title = {\n Temporal Knowledge Consistency for Unsupervised Visual Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10170-10180\n} \n}" }, { "title": "Temporal-Wise Attention Spiking Neural Networks for Event Streams Classification", @@ -41167,6 +43948,7 @@ "status": "Poster", "track": "main", "pid": 8275, + "author_site": "Man Yao; Huanhuan Gao; Guangshe Zhao; Dingheng Wang; Yihan Lin; Zhaoxu Yang; Guoqi Li", "author": "Man Yao; Huanhuan Gao; 
Guangshe Zhao; Dingheng Wang; Yihan Lin; Zhaoxu Yang; Guoqi Li", "abstract": "How to effectively and efficiently deal with spatio-temporal event streams, where the events are generally sparse and non-uniform and have the us temporal resolution, is of great value and has various real-life applications. Spiking neural network (SNN), as one of the brain-inspired event-triggered computing models, has the potential to extract effective spatio-temporal features from the event streams. However, when aggregating individual events into frames with a new higher temporal resolution, existing SNN models do not attach importance to that the serial frames have different signal-to-noise ratios since event streams are sparse and non-uniform. This situation interferes with the performance of existing SNNs. In this work, we propose a temporal-wise attention SNN (TA-SNN) model to learn frame-based representation for processing event streams. Concretely, we extend the attention concept to temporal-wise input to judge the significance of frames for the final decision at the training stage, and discard the irrelevant frames at the inference stage. We demonstrate that TA-SNN models improve the accuracy of event streams classification tasks. We also study the impact of multiple-scale temporal resolutions for frame-based representation. Our approach is tested on three different classification tasks: gesture recognition, image classification, and spoken digit recognition. 
We report the state-of-the-art results on these tasks, and get the essential improvement of accuracy (almost 19%) for gesture recognition with only 60 ms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yao_Temporal-Wise_Attention_Spiking_Neural_Networks_for_Event_Streams_Classification_ICCV_2021_paper.pdf", @@ -41183,14 +43965,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yao_Temporal-Wise_Attention_Spiking_Neural_Networks_for_Event_Streams_Classification_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;1;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Tsinghua University", + "aff_unique_norm": "Xi'an Jiaotong University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "XJTU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2021_ICCV,\n \n author = {\n Yao,\n Man and Gao,\n Huanhuan and Zhao,\n Guangshe and Wang,\n Dingheng and Lin,\n Yihan and Yang,\n Zhaoxu and Li,\n Guoqi\n},\n title = {\n Temporal-Wise Attention Spiking Neural Networks for Event Streams Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10221-10230\n} \n}" }, { "title": "Temporally-Coherent Surface Reconstruction via Metric-Consistent Atlases", @@ -41198,6 +43981,7 @@ "status": "Poster", "track": "main", "pid": 5754, + "author_site": "Jan Bednarik; Vladimir G. Kim; Siddhartha Chaudhuri; Shaifali Parashar; Mathieu Salzmann; Pascal Fua; Noam Aigerman", "author": "Jan Bednarik; Vladimir G. 
Kim; Siddhartha Chaudhuri; Shaifali Parashar; Mathieu Salzmann; Pascal Fua; Noam Aigerman", "abstract": "We propose a method for the unsupervised reconstruction of a temporally-coherent sequence of surfaces from a sequence of time-evolving point clouds, yielding dense, semantically meaningful correspondences between all keyframes. We represent the reconstructed surface as an atlas, using a neural network. Using canonical correspondences defined via the atlas, we encourage the reconstruction to be as isometric as possible across frames, leading to semantically-meaningful reconstruction. Through experiments and comparisons, we empirically show that our method achieves results that exceed that state of the art in the accuracy of correspondences and accuracy of surface reconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bednarik_Temporally-Coherent_Surface_Reconstruction_via_Metric-Consistent_Atlases_ICCV_2021_paper.pdf", @@ -41212,7 +43996,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bednarik_Temporally-Coherent_Surface_Reconstruction_via_Metric-Consistent_Atlases_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bednarik_Temporally-Coherent_Surface_Reconstruction_via_Metric-Consistent_Atlases_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Bednarik_2021_ICCV,\n \n author = {\n Bednarik,\n Jan and Kim,\n Vladimir G. 
and Chaudhuri,\n Siddhartha and Parashar,\n Shaifali and Salzmann,\n Mathieu and Fua,\n Pascal and Aigerman,\n Noam\n},\n title = {\n Temporally-Coherent Surface Reconstruction via Metric-Consistent Atlases\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10458-10467\n} \n}" }, { "title": "Testing Using Privileged Information by Adapting Features With Statistical Dependence", @@ -41220,6 +44005,7 @@ "status": "Poster", "track": "main", "pid": 8308, + "author_site": "Kwang In Kim; James Tompkin", "author": "Kwang In Kim; James Tompkin", "abstract": "Given an imperfect predictor, we exploit additional features at test time to improve the predictions made, without retraining and without knowledge of the prediction function. This scenario arises if training labels or data are proprietary, restricted, or no longer available, or if training itself is prohibitively expensive. We assume that the additional features are useful if they exhibit strong statistical dependence to the underlying perfect predictor. Then, we empirically estimate and strengthen the statistical dependence between the initial noisy predictor and the additional features via manifold denoising. 
As an example, we show that this approach leads to improvement in real-world visual attribute ranking.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Testing_Using_Privileged_Information_by_Adapting_Features_With_Statistical_Dependence_ICCV_2021_paper.pdf", @@ -41243,7 +44029,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Kwang In and Tompkin,\n James\n},\n title = {\n Testing Using Privileged Information by Adapting Features With Statistical Dependence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9405-9413\n} \n}" }, { "title": "Text Is Text, No Matter What: Unifying Text Recognition Using Knowledge Distillation", @@ -41251,6 +44038,7 @@ "status": "Poster", "track": "main", "pid": 4391, + "author_site": "Ayan Kumar Bhunia; Aneeshan Sain; Pinaki Nath Chowdhury; Yi-Zhe Song", "author": "Ayan Kumar Bhunia; Aneeshan Sain; Pinaki Nath Chowdhury; Yi-Zhe Song", "abstract": "Text recognition remains a fundamental and extensively researched topic in computer vision, largely owing to its wide array of commercial applications. The challenging nature of the very problem however dictated a fragmentation of research efforts: Scene Text Recognition (STR) that deals with text in everyday scenes, and Handwriting Text Recognition (HTR) that tackles hand-written text. In this paper, for the first time, we argue for their unification -- we aim for a single model that can compete favourably with two separate state-of-the-art STR and HTR models. We first show that cross-utilisation of STR and HTR models trigger significant performance drops due to differences in their inherent challenges. 
We then tackle their union by introducing a knowledge distillation (KD) based framework. This however is non-trivial, largely due to the variable-length and sequential nature of text sequences, which renders off-the-shelf KD techniques that mostly work with global fixed length data, inadequate. For that, we propose four distillation losses, all of which are specifically designed to cope with the aforementioned unique characteristics of text recognition. Empirical evidence suggests that our proposed unified model performs at par with individual models, even surpassing them in certain cases. Ablative studies demonstrate that naive baselines such as a two-stage framework, multi-task and domain adaption/generalisation alternatives do not work that well, further authenticating our design.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhunia_Text_Is_Text_No_Matter_What_Unifying_Text_Recognition_Using_ICCV_2021_paper.pdf", @@ -41274,7 +44062,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bhunia_2021_ICCV,\n \n author = {\n Bhunia,\n Ayan Kumar and Sain,\n Aneeshan and Chowdhury,\n Pinaki Nath and Song,\n Yi-Zhe\n},\n title = {\n Text Is Text,\n No Matter What: Unifying Text Recognition Using Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 983-992\n} \n}" }, { "title": "The Animation Transformer: Visual Correspondence via Segment Matching", @@ -41282,7 +44071,8 @@ "status": "Poster", "track": "main", "pid": 11032, - "author": "Evan Casey; V\u00edctor P\u00e9rez; Zhuoru Li", + "author_site": "Evan Casey; Víctor Pérez; Zhuoru Li", + "author": "Evan Casey; Víctor Pérez; Zhuoru Li", "abstract": "Visual correspondence is a fundamental building block 
on the way to building assistive tools for hand-drawn animation. However, while a large body of work has focused on learning visual correspondences at the pixel-level, few approaches have emerged to learn correspondence at the level of line enclosures (segments) that naturally occur in hand-drawn animation. Exploiting this structure in animation has numerous benefits: it avoids the memory complexity of pixel attention over high resolution images and enables the use of real-world animation datasets that contain correspondence information at the level of per-segment colors. To that end, we propose the Animation Transformer (AnT) which uses a Transformer-based architecture to learn the spatial and visual relationships between segments across a sequence of images. By leveraging a forward match loss and a cycle consistency loss our approach attains excellent results compared to state-of-the-art pixel approaches on challenging datasets from real animation productions that lack ground-truth correspondence labels.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Casey_The_Animation_Transformer_Visual_Correspondence_via_Segment_Matching_ICCV_2021_paper.pdf", "aff": ";;", @@ -41296,7 +44086,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Casey_The_Animation_Transformer_Visual_Correspondence_via_Segment_Matching_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Casey_The_Animation_Transformer_Visual_Correspondence_via_Segment_Matching_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Casey_2021_ICCV,\n \n author = {\n Casey,\n Evan and P\\'erez,\n V{\\'\\i\n}ctor and Li,\n Zhuoru\n},\n title = {\n The Animation Transformer: Visual Correspondence via Segment Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11323-11332\n} \n}" }, { 
"title": "The Benefit of Distraction: Denoising Camera-Based Physiological Measurements Using Inverse Attention", @@ -41304,6 +44095,7 @@ "status": "Poster", "track": "main", "pid": 3921, + "author_site": "Ewa M. Nowara; Daniel McDuff; Ashok Veeraraghavan", "author": "Ewa M. Nowara; Daniel McDuff; Ashok Veeraraghavan", "abstract": "Attention networks perform well on diverse computer vision tasks. The core idea is that the signal of interest is stronger in some pixels (\"foreground\"), and by selectively focusing computation on these pixels, networks can extract subtle information buried in noise and other sources of corruption. Our paper is based on one key observation: in many real-world applications, many sources of corruption, such as illumination and motion, are often shared between the \"foreground\" and the \"background\" pixels. Can we utilize this to our advantage? We propose the utility of inverse attention networks, which focus on extracting information about these shared sources of corruption. We show that this helps to effectively suppress shared covariates and amplify signal information, resulting in improved performance. We illustrate this on the task of camera-based physiological measurement where the signal of interest is weak and global illumination variations and motion act as significant shared sources of corruption. 
We perform experiments on three datasets and show that our approach of inverse attention produces state-of-the-art results, increasing the signal-to-noise ratio by up to 5.8 dB, reducing heart rate and breathing rate estimation errors by as much as 30 %, recovering subtle waveform dynamics, and generalizing from RGB to NIR videos without retraining.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nowara_The_Benefit_of_Distraction_Denoising_Camera-Based_Physiological_Measurements_Using_Inverse_ICCV_2021_paper.pdf", @@ -41320,14 +44112,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Nowara_The_Benefit_of_Distraction_Denoising_Camera-Based_Physiological_Measurements_Using_Inverse_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Rice University;Microsoft", + "aff_unique_norm": "Rice University;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.rice.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Rice;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nowara_2021_ICCV,\n \n author = {\n Nowara,\n Ewa M. 
and McDuff,\n Daniel and Veeraraghavan,\n Ashok\n},\n title = {\n The Benefit of Distraction: Denoising Camera-Based Physiological Measurements Using Inverse Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4955-4964\n} \n}" }, { "title": "The Center of Attention: Center-Keypoint Grouping via Attention for Multi-Person Pose Estimation", @@ -41335,7 +44128,8 @@ "status": "Poster", "track": "main", "pid": 7923, - "author": "Guillem Bras\u00f3; Nikita Kister; Laura Leal-Taix\u00e9", + "author_site": "Guillem Brasó; Nikita Kister; Laura Leal-Taixé", + "author": "Guillem Brasó; Nikita Kister; Laura Leal-Taixé", "abstract": "We introduce CenterGroup, an attention-based framework to estimate human poses from a set of identity-agnostic keypoints and person center predictions in an image. Our approach uses a transformer to obtain context-aware embeddings for all detected keypoints and centers and then applies multi-head attention to directly group joints into their corresponding person centers. While most bottom-up methods rely on non-learnable clustering at inference, CenterGroup uses a fully differentiable attention mechanism that we train end-to-end together with our keypoint detector. 
As a result, our method obtains state-of-the-art performance with up to 2.5x faster inference time than competing bottom-up methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Braso_The_Center_of_Attention_Center-Keypoint_Grouping_via_Attention_for_Multi-Person_ICCV_2021_paper.pdf", "aff": "Technical University of Munich; Technical University of Munich; Technical University of Munich", @@ -41358,7 +44152,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Braso_2021_ICCV,\n \n author = {\n Bras\\'o,\n Guillem and Kister,\n Nikita and Leal-Taix\\'e,\n Laura\n},\n title = {\n The Center of Attention: Center-Keypoint Grouping via Attention for Multi-Person Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11853-11863\n} \n}" }, { "title": "The Devil Is in the Task: Exploiting Reciprocal Appearance-Localization Features for Monocular 3D Object Detection", @@ -41366,6 +44161,7 @@ "status": "Poster", "track": "main", "pid": 2397, + "author_site": "Zhikang Zou; Xiaoqing Ye; Liang Du; Xianhui Cheng; Xiao Tan; Li Zhang; Jianfeng Feng; Xiangyang Xue; Errui Ding", "author": "Zhikang Zou; Xiaoqing Ye; Liang Du; Xianhui Cheng; Xiao Tan; Li Zhang; Jianfeng Feng; Xiangyang Xue; Errui Ding", "abstract": "Low-cost monocular 3D object detection plays a fundamental role in autonomous driving, whereas its accuracy is still far from satisfactory. Our objective is to dig into the 3D object detection task and reformulate it as the sub-tasks of object localization and appearance perception, which benefits to a deep excavation of reciprocal information underlying the entire task. 
We introduce a Dynamic Feature Reflecting Network, named DFR-Net, which contains two novel standalone modules: (i) the Appearance-Localization Feature Reflecting module (ALFR) that first separates task-specific features and then self-mutually reflects the reciprocal features; (ii) the Dynamic Intra-Trading module (DIT) that adaptively realigns the training processes of various sub-tasks via a self-learning manner. Extensive experiments on the challenging KITTI dataset demonstrate the effectiveness and generalization of DFR-Net. We rank 1st among all the monocular 3D object detectors in the KITTI test set (till March 16th, 2021). The proposed method is also easy to be plug-and-play in many cutting-edge 3D detection frameworks at negligible cost to boost performance. The code will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zou_The_Devil_Is_in_the_Task_Exploiting_Reciprocal_Appearance-Localization_Features_ICCV_2021_paper.pdf", @@ -41382,14 +44178,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zou_The_Devil_Is_in_the_Task_Exploiting_Reciprocal_Appearance-Localization_Features_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;0;1;1;1;0", - "aff_unique_norm": "Baidu;Fudan University", - "aff_unique_dep": "Baidu Inc.;Institute of Science and Technology for Brain-Inspired Intelligence", + "aff_unique_norm": "Baidu Inc.;Fudan University", + "aff_unique_dep": ";Institute of Science and Technology for Brain-Inspired Intelligence", "aff_unique_url": "https://www.baidu.com;https://www.fudan.edu.cn", "aff_unique_abbr": "Baidu;Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zou_2021_ICCV,\n \n author = {\n Zou,\n Zhikang and Ye,\n Xiaoqing and Du,\n Liang and Cheng,\n Xianhui and Tan,\n Xiao and Zhang,\n Li and Feng,\n Jianfeng and Xue,\n 
Xiangyang and Ding,\n Errui\n},\n title = {\n The Devil Is in the Task: Exploiting Reciprocal Appearance-Localization Features for Monocular 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2713-2722\n} \n}" }, { "title": "The Functional Correspondence Problem", @@ -41397,6 +44194,7 @@ "status": "Poster", "track": "main", "pid": 8241, + "author_site": "Zihang Lai; Senthil Purushwalkam; Abhinav Gupta", "author": "Zihang Lai; Senthil Purushwalkam; Abhinav Gupta", "abstract": "The ability to find correspondences in visual data is the essence of most computer vision tasks. But what are the right correspondences? The task of visual correspondence is well defined for two different images of same object instance. In case of two images of objects belonging to same category, visual correspondence is reasonably well-defined in most cases. But what about correspondence between two objects of completely different category -- e.g., a shoe and a bottle? Does there exist any correspondence? Inspired by humans' ability to: (a) generalize beyond semantic categories and; (b) infer functional affordances, we introduce the problem of functional correspondences in this paper. Given images of two objects, we ask a simple question: what is the set of correspondences between these two images for a given task? For example, what are the correspondences between a bottle and shoe for the task of pounding or the task of pouring. We introduce a new dataset: FunKPoint that has ground truth correspondences for 10 tasks and 20 object categories. We also introduce a modular task-driven representation for attacking this problem and demonstrate that our learned representation is effective for this task. 
But most importantly, because our supervision signal is not bound by semantics, we show that our learned representation can generalize better on few-shot classification problem. We hope this paper will inspire our community to think beyond semantics and focus more on cross-category generalization and learning representations for robotics tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lai_The_Functional_Correspondence_Problem_ICCV_2021_paper.pdf", @@ -41411,7 +44209,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lai_The_Functional_Correspondence_Problem_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lai_The_Functional_Correspondence_Problem_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lai_2021_ICCV,\n \n author = {\n Lai,\n Zihang and Purushwalkam,\n Senthil and Gupta,\n Abhinav\n},\n title = {\n The Functional Correspondence Problem\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15772-15781\n} \n}" }, { "title": "The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization", @@ -41419,6 +44218,7 @@ "status": "Poster", "track": "main", "pid": 10942, + "author_site": "Dan Hendrycks; Steven Basart; Norman Mu; Saurav Kadavath; Frank Wang; Evan Dorundo; Rahul Desai; Tyler Zhu; Samyak Parajuli; Mike Guo; Dawn Song; Jacob Steinhardt; Justin Gilmer", "author": "Dan Hendrycks; Steven Basart; Norman Mu; Saurav Kadavath; Frank Wang; Evan Dorundo; Rahul Desai; Tyler Zhu; Samyak Parajuli; Mike Guo; Dawn Song; Jacob Steinhardt; Justin Gilmer", "abstract": "We introduce four new real-world distribution shift datasets consisting of changes in image style, image blurriness, geographic location, camera operation, and more. 
With our new datasets, we take stock of previously proposed methods for improving out-of-distribution robustness and put them to the test. We find that using larger models and artificial data augmentations can improve robustness on real-world distribution shifts, contrary to claims in prior work. We find improvements in artificial robustness benchmarks can transfer to real-world distribution shifts, contrary to claims in prior work. Motivated by our observation that data augmentations can help with real-world distribution shifts, we also introduce a new data augmentation method which advances the state-of-the-art and outperforms models pretrained with 1000x more labeled data. Overall we find that some methods consistently help with distribution shifts in texture and local image statistics, but these methods do not help with some other distribution shifts like geographic changes. Our results show that future research must study multiple distribution shifts simultaneously, as we demonstrate that no evaluated method consistently improves robustness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hendrycks_The_Many_Faces_of_Robustness_A_Critical_Analysis_of_Out-of-Distribution_ICCV_2021_paper.pdf", @@ -41436,13 +44236,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hendrycks_The_Many_Faces_of_Robustness_A_Critical_Analysis_of_Out-of-Distribution_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;2;2;0;0;0;0;0;0;2", "aff_unique_norm": "University of California, Berkeley;University of Chicago;Google", - "aff_unique_dep": ";;Google", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.uchicago.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;UChicago;Google", "aff_campus_unique_index": "0;0;0;2;2;0;0;0;0;0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + 
"bibtex": "@InProceedings{Hendrycks_2021_ICCV,\n \n author = {\n Hendrycks,\n Dan and Basart,\n Steven and Mu,\n Norman and Kadavath,\n Saurav and Wang,\n Frank and Dorundo,\n Evan and Desai,\n Rahul and Zhu,\n Tyler and Parajuli,\n Samyak and Guo,\n Mike and Song,\n Dawn and Steinhardt,\n Jacob and Gilmer,\n Justin\n},\n title = {\n The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8340-8349\n} \n}" }, { "title": "The Power of Points for Modeling Humans in Clothing", @@ -41450,10 +44251,11 @@ "status": "Poster", "track": "main", "pid": 1017, + "author_site": "Qianli Ma; Jinlong Yang; Siyu Tang; Michael J. Black", "author": "Qianli Ma; Jinlong Yang; Siyu Tang; Michael J. Black", "abstract": "Currently it requires an artist to create 3D human avatars with realistic clothing that can move naturally. Despite progress on 3D scanning and modeling of human bodies, there is still no technology that can easily turn a static scan into an animatable avatar. Automating the creation of such avatars would enable many applications in games, social networking, animation, and AR/VR to name a few. The key problem is one of representation. Standard 3D meshes are widely used in modeling the minimally-clothed body but do not readily capture the complex topology of clothing. Recent interest has shifted to implicit surface models for this task but they are computationally heavy and lack compatibility with existing 3D tools. What is needed is a 3D representation that can capture varied topology at high resolution and that can be learned from data. We argue that this representation has been with us all along --- the point cloud. Point clouds have properties of both implicit and explicit representations that we exploit to model 3D garment geometry on a human body. 
We train a neural network with a novel local clothing geometric feature to represent the shape of different outfits. The network is trained from 3D point clouds of many types of clothing, on many bodies, in many poses, and learns to model pose-dependent clothing deformations. The geometry feature can be optimized to fit a previously unseen scan of a person in clothing, enabling the scan to be reposed realistically. Our model demonstrates superior quantitative and qualitative results in both multi-outfit modeling and unseen outfit animation. The code is available for research purposes at https://qianlim.github.io/POP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ma_The_Power_of_Points_for_Modeling_Humans_in_Clothing_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+ETH Z\u00fcrich; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; ETH Z\u00fcrich; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen, Germany+ETH Zürich; Max Planck Institute for Intelligent Systems, Tübingen, Germany; ETH Zürich; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "", "github": "https://qianlim.github.io/POP", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ma_The_Power_of_ICCV_2021_supplemental.pdf", @@ -41466,14 +44268,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ma_The_Power_of_Points_for_Modeling_Humans_in_Clothing_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;1;0", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;ETH Zurich", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;ETH Zürich", "aff_unique_dep": ";", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ethz.ch", "aff_unique_abbr": "MPI-IS;ETHZ", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + 
"aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+1;0;1;0", - "aff_country_unique": "Germany;Switzerland" + "aff_country_unique": "Germany;Switzerland", + "bibtex": "@InProceedings{Ma_2021_ICCV,\n \n author = {\n Ma,\n Qianli and Yang,\n Jinlong and Tang,\n Siyu and Black,\n Michael J.\n},\n title = {\n The Power of Points for Modeling Humans in Clothing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10974-10984\n} \n}" }, { "title": "The Pursuit of Knowledge: Discovering and Localizing Novel Categories Using Dual Memory", @@ -41481,6 +44284,7 @@ "status": "Poster", "track": "main", "pid": 9111, + "author_site": "Sai Saketh Rambhatla; Rama Chellappa; Abhinav Shrivastava", "author": "Sai Saketh Rambhatla; Rama Chellappa; Abhinav Shrivastava", "abstract": "We tackle object category discovery, which is the problem of discovering and localizing novel objects in a large unlabeled dataset. While existing methods show results on datasets with less cluttered scenes and fewer object instances per image, we present our results on the challenging COCO dataset. Moreover, we argue that, rather than discovering new categories from scratch, discovery algorithms can benefit from identifying what is already known and focusing their attention on the unknown. We propose a method that exploits prior knowledge about certain object types to discover new categories by leveraging two memory modules, namely Working and Semantic memory. 
We show the performance of our detector on the COCO minival dataset to demonstrate its in-the-wild capabilities.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Rambhatla_The_Pursuit_of_Knowledge_Discovering_and_Localizing_Novel_Categories_Using_ICCV_2021_paper.pdf", @@ -41495,7 +44299,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rambhatla_The_Pursuit_of_Knowledge_Discovering_and_Localizing_Novel_Categories_Using_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Rambhatla_The_Pursuit_of_Knowledge_Discovering_and_Localizing_Novel_Categories_Using_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Rambhatla_2021_ICCV,\n \n author = {\n Rambhatla,\n Sai Saketh and Chellappa,\n Rama and Shrivastava,\n Abhinav\n},\n title = {\n The Pursuit of Knowledge: Discovering and Localizing Novel Categories Using Dual Memory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9153-9163\n} \n}" }, { "title": "The Right To Talk: An Audio-Visual Transformer Approach", @@ -41503,6 +44308,7 @@ "status": "Poster", "track": "main", "pid": 10976, + "author_site": "Thanh-Dat Truong; Chi Nhan Duong; The De Vu; Hoang Anh Pham; Bhiksha Raj; Ngan Le; Khoa Luu", "author": "Thanh-Dat Truong; Chi Nhan Duong; The De Vu; Hoang Anh Pham; Bhiksha Raj; Ngan Le; Khoa Luu", "abstract": "Turn-taking has played an essential role in structuring the regulation of a conversation. The task of identifying the main speaker (who is properly taking his/her turn of speaking) and the interrupters (who are interrupting or reacting to the main speaker's utterances) remains a challenging task. Although some prior methods have partially addressed this task, there still remain some limitations. 
Firstly, a direct association of Audio and Visual features may limit the correlations to be extracted due to different modalities. Secondly, the relationship across temporal segments helping to maintain the consistency of localization, separation and conversation contexts is not effectively exploited. Finally, the interactions between speakers that usually contain the tracking and anticipatory decisions about transition to a new speaker is usually ignored. Therefore, this work introduces a new Audio-Visual Transformer approach to the problem of localization and highlighting the main speaker in both audio and visual channels of a multi-speaker conversation video in the wild. The proposed method exploits different types of correlations presented in both visual and audio signals. The temporal audio-visual relationships across spatial-temporal space are anticipated and optimized via the self-attention mechanism in a Transformer structure. Moreover, a newly collected dataset is introduced for the main speaker detection. 
To the best of our knowledge, it is one of the first studies that is able to automatically localize and highlight the main speaker in both visual and audio channels in multi-speaker conversation videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Truong_The_Right_To_Talk_An_Audio-Visual_Transformer_Approach_ICCV_2021_paper.pdf", @@ -41521,12 +44327,13 @@ "aff_unique_index": "0;1;2;2;3;0;0", "aff_unique_norm": "University of Arkansas;Concordia University;VinAI Research;Carnegie Mellon University", "aff_unique_dep": "CVIU Lab;;;", - "aff_unique_url": "https://www.uark.edu;https://www.concordia.ca;https://www.vinai.io/;https://www.cmu.edu", + "aff_unique_url": "https://www.uark.edu;https://www.concordia.ca;https://www.vinai.io;https://www.cmu.edu", "aff_unique_abbr": ";Concordia;VinAI;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;0;0;0", - "aff_country_unique": "United States;Canada;Vietnam" + "aff_country_unique": "United States;Canada;Vietnam", + "bibtex": "@InProceedings{Truong_2021_ICCV,\n \n author = {\n Truong,\n Thanh-Dat and Duong,\n Chi Nhan and De Vu,\n The and Pham,\n Hoang Anh and Raj,\n Bhiksha and Le,\n Ngan and Luu,\n Khoa\n},\n title = {\n The Right To Talk: An Audio-Visual Transformer Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1105-1114\n} \n}" }, { "title": "The Road To Know-Where: An Object-and-Room Informed Sequential BERT for Indoor Vision-Language Navigation", @@ -41534,6 +44341,7 @@ "status": "Poster", "track": "main", "pid": 9923, + "author_site": "Yuankai Qi; Zizheng Pan; Yicong Hong; Ming-Hsuan Yang; Anton van den Hengel; Qi Wu", "author": "Yuankai Qi; Zizheng Pan; Yicong Hong; Ming-Hsuan Yang; Anton van den Hengel; Qi Wu", "abstract": "Vision-and-Language Navigation (VLN) requires an agent to find a path to a remote location on the basis 
of natural-language instructions and a set of photo-realistic panoramas. Most existing methods take the words in the instructions and the discrete views of each panorama as the minimal unit of encoding. However, this requires a model to match different nouns (e.g., TV, table) against the same input view feature. In this work, we propose an object-informed sequential BERT to encode visual perceptions and linguistic instructions at the same fine-grained level, namely objects and words. Our sequential BERT also enables the visual-textual clues to be interpreted in light of the temporal context, which is crucial to multi-round VLN tasks. Additionally, we enable the model to identify the relative direction (e.g., left/right/front/back) of each navigable location and the room type (e.g., bedroom, kitchen) of its current and final navigation goal, as such information is widely mentioned in instructions implying the desired next and final locations. We thus enable the model to know-where the objects lie in the images, and to know-where they stand in the scene. Extensive experiments demonstrate the effectiveness compared against several state-of-the-art methods on three indoor VLN tasks: REVERIE, NDH, and R2R. 
Project repository: https://github.com/YuankaiQi/ORIST", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qi_The_Road_To_Know-Where_An_Object-and-Room_Informed_Sequential_BERT_for_ICCV_2021_paper.pdf", @@ -41548,7 +44356,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Qi_The_Road_To_Know-Where_An_Object-and-Room_Informed_Sequential_BERT_for_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Qi_The_Road_To_Know-Where_An_Object-and-Room_Informed_Sequential_BERT_for_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Qi_2021_ICCV,\n \n author = {\n Qi,\n Yuankai and Pan,\n Zizheng and Hong,\n Yicong and Yang,\n Ming-Hsuan and van den Hengel,\n Anton and Wu,\n Qi\n},\n title = {\n The Road To Know-Where: An Object-and-Room Informed Sequential BERT for Indoor Vision-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1655-1664\n} \n}" }, { "title": "The Spatio-Temporal Poisson Point Process: A Simple Model for the Alignment of Event Camera Data", @@ -41556,6 +44365,7 @@ "status": "Poster", "track": "main", "pid": 6649, + "author_site": "Cheng Gu; Erik Learned-Miller; Daniel Sheldon; Guillermo Gallego; Pia Bideau", "author": "Cheng Gu; Erik Learned-Miller; Daniel Sheldon; Guillermo Gallego; Pia Bideau", "abstract": "Event cameras, inspired by biological vision systems, provide a natural and data efficient representation of visual information. Visual information is acquired in the form of events that are triggered by local brightness changes. However, because most brightness changes are triggered by relative motion of the camera and the scene, the events recorded at a single sensor location seldom correspond to the same world point. 
To extract meaningful information from event cameras, it is helpful to register events that were triggered by the same underlying world point. In this work we propose a new model of event data that captures its natural spatio-temporal structure. We start by developing a model for aligned event data. That is, we develop a model for the data as though it has been perfectly registered already. In particular, we model the aligned data as a spatio-temporal Poisson point process. Based on this model, we develop a maximum likelihood approach to registering events that are not yet aligned. That is, we find transformations of the observed events that make them as likely as possible under our model. In particular we extract the camera rotation that leads to the best event alignment. We show new state of the art accuracy for rotational velocity estimation on the DAVIS 240C dataset [??]. In addition, our method is also faster and has lower computational complexity than several competing methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_The_Spatio-Temporal_Poisson_Point_Process_A_Simple_Model_for_the_ICCV_2021_paper.pdf", @@ -41572,14 +44382,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gu_The_Spatio-Temporal_Poisson_Point_Process_A_Simple_Model_for_the_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "Technische Universit\u00e4t Berlin;University of Massachusetts Amherst", + "aff_unique_norm": "Technische Universität Berlin;University of Massachusetts Amherst", "aff_unique_dep": ";", "aff_unique_url": "https://www.tu-berlin.de;https://www.umass.edu", "aff_unique_abbr": "TU Berlin;UMass Amherst", "aff_campus_unique_index": "0;1;1;0;0", "aff_campus_unique": "Berlin;Amherst", "aff_country_unique_index": "0;1;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Gu_2021_ICCV,\n \n author = {\n Gu,\n Cheng and 
Learned-Miller,\n Erik and Sheldon,\n Daniel and Gallego,\n Guillermo and Bideau,\n Pia\n},\n title = {\n The Spatio-Temporal Poisson Point Process: A Simple Model for the Alignment of Event Camera Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13495-13504\n} \n}" }, { "title": "The Surprising Effectiveness of Visual Odometry Techniques for Embodied PointGoal Navigation", @@ -41587,6 +44398,7 @@ "status": "Poster", "track": "main", "pid": 7366, + "author_site": "Xiaoming Zhao; Harsh Agrawal; Dhruv Batra; Alexander G. Schwing", "author": "Xiaoming Zhao; Harsh Agrawal; Dhruv Batra; Alexander G. Schwing", "abstract": "It is fundamental for personal robots to reliably navigate to a specified goal. To study this task, PointGoal navigation has been introduced in simulated Embodied AI environments. Recent advances solve this PointGoal navigation task with near-perfect accuracy (99.6% success) in photo-realistically simulated environments, assuming noiseless egocentric vision, noiseless actuation, and most importantly, perfect localization. However, under realistic noise models for visual sensors and actuation, and without access to a \"GPS and Compass sensor,\" the 99.6%-success agents for PointGoal navigation only succeed with 0.3%. In this work, we demonstrate the surprising effectiveness of visual odometry for the task of PointGoal navigation in this realistic setting, i.e., with realistic noise models for perception and actuation and without access to GPS and Compass sensors. 
We show that integrating visual odometry techniques into navigation policies improves the state-of-the-art on the popular Habitat PointNav benchmark by a large margin, improving success from 64.5% to 71.7% while executing 6.4 times faster.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_The_Surprising_Effectiveness_of_Visual_Odometry_Techniques_for_Embodied_PointGoal_ICCV_2021_paper.pdf", @@ -41603,14 +44415,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_The_Surprising_Effectiveness_of_Visual_Odometry_Techniques_for_Embodied_PointGoal_ICCV_2021_paper.html", "aff_unique_index": "0;1;1+2;0", - "aff_unique_norm": "University of Illinois;Georgia Institute of Technology;Meta", + "aff_unique_norm": "University of Illinois;Georgia Institute of Technology;Facebook", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://illinois.edu;https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "UIUC;Georgia Tech;FAIR", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Xiaoming and Agrawal,\n Harsh and Batra,\n Dhruv and Schwing,\n Alexander G.\n},\n title = {\n The Surprising Effectiveness of Visual Odometry Techniques for Embodied PointGoal Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16127-16136\n} \n}" }, { "title": "The Surprising Impact of Mask-Head Architecture on Novel Class Segmentation", @@ -41618,6 +44431,7 @@ "status": "Poster", "track": "main", "pid": 9454, + "author_site": "Vighnesh Birodkar; Zhichao Lu; Siyang Li; Vivek Rathod; Jonathan Huang", "author": "Vighnesh Birodkar; Zhichao Lu; Siyang Li; Vivek Rathod; Jonathan Huang", 
"abstract": "Instance segmentation models today are very accurate when trained on large annotated datasets, but collecting mask annotations at scale is prohibitively expensive. We address the partially supervised instance segmentation problem in which one can train on (significantly cheaper) bounding boxes for all categories but use masks only for a subset of categories. In this work, we focus on a popular family of models which apply differentiable cropping to a feature map and predict a mask based on the resulting crop. Under this family, we study Mask R-CNN and discover that instead of its default strategy of training the mask-head with a combination of proposals and groundtruth boxes, training the mask-head with only groundtruth boxes dramatically improves its performance on novel classes. This training strategy also allows us to take advantage of alternative mask-head architectures, which we exploit by replacing the typical mask-head of 2-4 layers with significantly deeper off-the-shelf architectures (e.g. ResNet, Hourglass models). While many of these architectures perform similarly when trained in fully supervised mode, our main finding is that they can generalize to novel classes in dramatically different ways. We call this ability of mask-heads to generalize to unseen classes the strong mask generalization effect and show that without any specialty modules or losses, we can achieve state-of-the-art results in the partially supervised COCO instance segmentation benchmark. Finally, we demonstrate that our effect is general, holding across underlying detection methodologies (including anchor-based, anchor-free or no detector at all) and across different backbone networks. 
Code and pre-trained models are available at https://git.io/deepmac.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Birodkar_The_Surprising_Impact_of_Mask-Head_Architecture_on_Novel_Class_Segmentation_ICCV_2021_paper.pdf", @@ -41635,13 +44449,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Birodkar_The_Surprising_Impact_of_Mask-Head_Architecture_on_Novel_Class_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", - "aff_unique_dep": "Google", + "aff_unique_dep": "", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Birodkar_2021_ICCV,\n \n author = {\n Birodkar,\n Vighnesh and Lu,\n Zhichao and Li,\n Siyang and Rathod,\n Vivek and Huang,\n Jonathan\n},\n title = {\n The Surprising Impact of Mask-Head Architecture on Novel Class Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7015-7025\n} \n}" }, { "title": "The Way to My Heart Is Through Contrastive Learning: Remote Photoplethysmography From Unlabelled Video", @@ -41649,6 +44464,7 @@ "status": "Poster", "track": "main", "pid": 7964, + "author_site": "John Gideon; Simon Stent", "author": "John Gideon; Simon Stent", "abstract": "The ability to reliably estimate physiological signals from video is a powerful tool in low-cost, pre-clinical health monitoring. In this work we propose a new approach to remote photoplethysmography (rPPG) -- the measurement of blood volume changes from observations of a person's face or skin. 
Similar to current state-of-the-art methods for rPPG, we apply neural networks to learn deep representations with invariance to nuisance image variation. In contrast to such methods, we employ a fully self-supervised training approach, which has no reliance on expensive ground truth physiological training data. Our proposed method uses contrastive learning with a weak prior over the frequency and temporal smoothness of the target signal of interest. We evaluate our approach on four rPPG datasets, showing that comparable or better results can be achieved compared to recent supervised deep learning methods but without using any annotation. In addition, we incorporate a learned saliency resampling module into both our unsupervised approach and supervised baseline. We show that by allowing the model to learn where to sample the input image, we can reduce the need for hand-engineered features while providing some interpretability into the model's behavior and possible failure modes. We release code for our complete training and evaluation pipeline to encourage reproducible progress in this exciting new direction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gideon_The_Way_to_My_Heart_Is_Through_Contrastive_Learning_Remote_ICCV_2021_paper.pdf", @@ -41663,7 +44479,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gideon_The_Way_to_My_Heart_Is_Through_Contrastive_Learning_Remote_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gideon_The_Way_to_My_Heart_Is_Through_Contrastive_Learning_Remote_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Gideon_2021_ICCV,\n \n author = {\n Gideon,\n John and Stent,\n Simon\n},\n title = {\n The Way to My Heart Is Through Contrastive Learning: Remote Photoplethysmography From Unlabelled Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2021\n},\n pages = {\n 3995-4004\n} \n}" }, { "title": "Three Steps to Multimodal Trajectory Prediction: Modality Clustering, Classification and Synthesis", @@ -41671,6 +44488,7 @@ "status": "Poster", "track": "main", "pid": 7695, + "author_site": "Jianhua Sun; Yuxuan Li; Hao-Shu Fang; Cewu Lu", "author": "Jianhua Sun; Yuxuan Li; Hao-Shu Fang; Cewu Lu", "abstract": "Multimodal prediction results are essential for trajectory prediction task as there is no single correct answer for the future. Previous frameworks can be divided into three categories: regression, generation and classification frameworks. However, these frameworks have weaknesses in different aspects so that they cannot model the multimodal prediction task comprehensively. In this paper, we present a novel insight along with a brand-new prediction framework by formulating multimodal prediction into three steps: modality clustering, classification and synthesis, and address the shortcomings of earlier frameworks. Exhaustive experiments on popular benchmarks have demonstrated that our proposed method surpasses state-of-the-art works even without introducing social and map information. 
Specifically, we achieve 19.2% and 20.8% improvement on ADE and FDE respectively on ETH/UCY dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Three_Steps_to_Multimodal_Trajectory_Prediction_Modality_Clustering_Classification_and_ICCV_2021_paper.pdf", @@ -41694,7 +44512,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Jianhua and Li,\n Yuxuan and Fang,\n Hao-Shu and Lu,\n Cewu\n},\n title = {\n Three Steps to Multimodal Trajectory Prediction: Modality Clustering,\n Classification and Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13250-13259\n} \n}" }, { "title": "Time-Equivariant Contrastive Video Representation Learning", @@ -41702,6 +44521,7 @@ "status": "Poster", "track": "main", "pid": 8897, + "author_site": "Simon Jenni; Hailin Jin", "author": "Simon Jenni; Hailin Jin", "abstract": "We introduce a novel self-supervised contrastive learning method to learn representations from unlabelled videos. Existing approaches ignore the specifics of input distortions, e.g., by learning invariance to temporal transformations. Instead, we argue that video representation should preserve video dynamics and reflect temporal manipulations of the input. Therefore, we exploit novel constraints to build representations that are equivariant to temporal transformations and better capture video dynamics. In our method, relative temporal transformations between augmented clips of a video are encoded in a vector and contrasted with other transformation vectors. To support temporal equivariance learning, we additionally propose the self-supervised classification of two clips of a video into 1. overlapping 2. ordered, or 3. unordered. 
Our experiments show that time-equivariant representations achieve state-of-the-art results in video retrieval and action recognition benchmarks on UCF101, HMDB51, and Diving48.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jenni_Time-Equivariant_Contrastive_Video_Representation_Learning_ICCV_2021_paper.pdf", @@ -41725,7 +44545,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jenni_2021_ICCV,\n \n author = {\n Jenni,\n Simon and Jin,\n Hailin\n},\n title = {\n Time-Equivariant Contrastive Video Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9970-9980\n} \n}" }, { "title": "Time-Multiplexed Coded Aperture Imaging: Learned Coded Aperture and Pixel Exposures for Compressive Imaging Systems", @@ -41733,6 +44554,7 @@ "status": "Poster", "track": "main", "pid": 10452, + "author_site": "Edwin Vargas; Julien N. P. Martel; Gordon Wetzstein; Henry Arguello", "author": "Edwin Vargas; Julien N. P. Martel; Gordon Wetzstein; Henry Arguello", "abstract": "Compressive imaging using coded apertures (CA) is a powerful technique that can be used to recover depth, light fields, hyperspectral images and other quantities from a single snapshot. The performance of compressive imaging systems based on CAs mostly depends on two factors: the properties of the mask's attenuation pattern, that we refer to as \"codification\", and the computational techniques used to recover the quantity of interest from the coded snapshot. In this work, we introduce the idea of using time-varying CAs synchronized with spatially varying pixel shutters. 
We divide the exposure of a sensor into sub-exposures at the beginning of which the CA mask changes and at which the sensor's pixels are simultaneously and individually switched \"on\" or \"off\". This is a practically appealing codification as it does not introduce additional optical components other than the already present CA but uses a change in the pixel shutter that can be easily realized electronically. We show that our proposed time-multiplexed coded aperture (TMCA) can be optimized end to end and induces better coded snapshots enabling superior reconstructions in two different applications: compressive light field imaging and hyperspectral imaging. We demonstrate both in simulation and with real captures (taken with prototypes we built) that this codification outperforms the state-of-the-art compressive imaging systems by a large margin in those applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vargas_Time-Multiplexed_Coded_Aperture_Imaging_Learned_Coded_Aperture_and_Pixel_Exposures_ICCV_2021_paper.pdf", @@ -41756,7 +44578,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "Colombia;United States" + "aff_country_unique": "Colombia;United States", + "bibtex": "@InProceedings{Vargas_2021_ICCV,\n \n author = {\n Vargas,\n Edwin and Martel,\n Julien N. P. 
and Wetzstein,\n Gordon and Arguello,\n Henry\n},\n title = {\n Time-Multiplexed Coded Aperture Imaging: Learned Coded Aperture and Pixel Exposures for Compressive Imaging Systems\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2692-2702\n} \n}" }, { "title": "TkML-AP: Adversarial Attacks to Top-k Multi-Label Learning", @@ -41764,6 +44587,7 @@ "status": "Poster", "track": "main", "pid": 6337, + "author_site": "Shu Hu; Lipeng Ke; Xin Wang; Siwei Lyu", "author": "Shu Hu; Lipeng Ke; Xin Wang; Siwei Lyu", "abstract": "Top-k multi-label learning, which returns the top-k predicted labels from an input, has many practical applications such as image annotation, document analysis, and web search engine. However, the vulnerabilities of such algorithms with regards to dedicated adversarial perturbation attacks have not been extensively studied previously. In this work, we develop methods to create adversarial perturbations that can be used to attack top-k multi-label learning-based image annotation systems (T_kML-AP). Our methods explicitly consider the top-k ranking relation and are based on novel loss functions. 
Experimental evaluations on large-scale benchmark datasets including PASCAL VOC and MS COCO demonstrate the effectiveness of our methods in reducing the performance of state-of-the-art top-k multi-label learning methods, under both untargeted and targeted attacks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_TkML-AP_Adversarial_Attacks_to_Top-k_Multi-Label_Learning_ICCV_2021_paper.pdf", @@ -41787,7 +44611,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Buffalo;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Shu and Ke,\n Lipeng and Wang,\n Xin and Lyu,\n Siwei\n},\n title = {\n TkML-AP: Adversarial Attacks to Top-k Multi-Label Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7649-7657\n} \n}" }, { "title": "TokenPose: Learning Keypoint Tokens for Human Pose Estimation", @@ -41795,6 +44620,7 @@ "status": "Poster", "track": "main", "pid": 9045, + "author_site": "Yanjie Li; Shoukui Zhang; Zhicheng Wang; Sen Yang; Wankou Yang; Shu-Tao Xia; Erjin Zhou", "author": "Yanjie Li; Shoukui Zhang; Zhicheng Wang; Sen Yang; Wankou Yang; Shu-Tao Xia; Erjin Zhou", "abstract": "Human pose estimation deeply relies on visual clues and anatomical constraints between parts to locate keypoints. Most existing CNN-based methods do well in visual representation, however, lacking in the ability to explicitly learn the constraint relationships between keypoints. In this paper, we propose a novel approach based on Token representation for human Pose estimation (TokenPose). In detail, each keypoint is explicitly embedded as a token to simultaneously learn constraint relationships and appearance cues from images. 
Extensive experiments show that the small and large TokenPose models are on par with state-of-the-art CNN-based counterparts while being more lightweight. Specifically, our TokenPose-S and TokenPose-L achieve 72.5 AP and 75.8 AP on COCO validation dataset respectively, with significant reduction in parameters and GFLOPs. Code is publicly available at https://github.com/leeyegy/TokenPose.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_TokenPose_Learning_Keypoint_Tokens_for_Human_Pose_Estimation_ICCV_2021_paper.pdf", @@ -41811,14 +44637,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_TokenPose_Learning_Keypoint_Tokens_for_Human_Pose_Estimation_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1+2;2;0+3;1", - "aff_unique_norm": "Tsinghua University;Megvii Technology;Southeast University;Pengcheng Laboratory", + "aff_unique_norm": "Tsinghua University;MEGVII Technology;Southeast University;Peng Cheng Laboratory", "aff_unique_dep": "International Graduate School;;;Research Center of Networks and Communications", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.megvii.com;https://www.seu.edu.cn/;http://www.pcl.ac.cn", "aff_unique_abbr": "THU;;SEU;PCL", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yanjie and Zhang,\n Shoukui and Wang,\n Zhicheng and Yang,\n Sen and Yang,\n Wankou and Xia,\n Shu-Tao and Zhou,\n Erjin\n},\n title = {\n TokenPose: Learning Keypoint Tokens for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11313-11322\n} \n}" }, { "title": "Tokens-to-Token ViT: Training Vision Transformers From Scratch on ImageNet", @@ -41826,6 +44653,7 @@ "status": 
"Poster", "track": "main", "pid": 2561, + "author_site": "Li Yuan; Yunpeng Chen; Tao Wang; Weihao Yu; Yujun Shi; Zi-Hang Jiang; Francis E.H. Tay; Jiashi Feng; Shuicheng Yan", "author": "Li Yuan; Yunpeng Chen; Tao Wang; Weihao Yu; Yujun Shi; Zi-Hang Jiang; Francis E.H. Tay; Jiashi Feng; Shuicheng Yan", "abstract": "Transformers, which are popular for language modeling, have been explored for solving vision tasks recently, e.g., the Vision Transformer (ViT) for image classification. The ViT model splits each image into a sequence of tokens with fixed length and then applies multiple Transformer layers to model their global relation for classification. However, ViT achieves inferior performance to CNNs when trained from scratch on a midsize dataset like ImageNet. We find it is because: 1) the simple tokenization of input images fails to model the important local structure such as edges and lines among neighboring pixels, leading to low training sample efficiency; 2) the redundant attention backbone design of ViT leads to limited feature richness for fixed computation budgets and limited training samples. To overcome such limitations, we propose a new Tokens-To-Token Vision Transformer (T2T-ViT), which incorporates 1) a layer-wise Tokens-to-Token (T2T) transformation to progressively structurize the image to tokens by recursively aggregating neighboring Tokens into one Token (Tokens-to-Token), such that local structure represented by surrounding tokens can be modeled and tokens length can be reduced; 2) an efficient backbone with a deep-narrow structure for vision transformer motivated by CNN architecture design after empirical study. Notably, T2T-ViT reduces the parameter count and MACs of vanilla ViT by half, while achieving more than 3.0% improvement when trained from scratch on ImageNet. It also outperforms ResNets and achieves comparable performance with MobileNets by directly training on ImageNet. 
For example, T2T-ViT with comparable size to ResNet50 (21.5M parameters) can achieve 83.3% top1 accuracy in image resolution 384x384 on ImageNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yuan_Tokens-to-Token_ViT_Training_Vision_Transformers_From_Scratch_on_ImageNet_ICCV_2021_paper.pdf", @@ -41849,7 +44677,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0+0;0;0;0;0;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Yuan_2021_ICCV,\n \n author = {\n Yuan,\n Li and Chen,\n Yunpeng and Wang,\n Tao and Yu,\n Weihao and Shi,\n Yujun and Jiang,\n Zi-Hang and Tay,\n Francis E.H. and Feng,\n Jiashi and Yan,\n Shuicheng\n},\n title = {\n Tokens-to-Token ViT: Training Vision Transformers From Scratch on ImageNet\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 558-567\n} \n}" }, { "title": "Topic Scene Graph Generation by Attention Distillation From Caption", @@ -41857,6 +44686,7 @@ "status": "Poster", "track": "main", "pid": 3565, + "author_site": "Wenbin Wang; Ruiping Wang; Xilin Chen", "author": "Wenbin Wang; Ruiping Wang; Xilin Chen", "abstract": "If an image tells a story, the image caption is the briefest narrator. Generally, a scene graph prefers to be an omniscient \"generalist\", while the image caption is more willing to be a \"specialist\", which outlines the gist. Lots of previous studies have found that a scene graph is not as practical as expected unless it can reduce the trivial contents and noises. In this respect, the image caption is a good tutor. To this end, we let the scene graph borrow the ability from the image caption so that it can be a specialist on the basis of remaining all-around, resulting in the so-called Topic Scene Graph. 
What an image caption pays attention to is distilled and passed to the scene graph for estimating the importance of partial objects, relationships, and events. Specifically, during the caption generation, the attention about individual objects in each time step is collected, pooled, and assembled to obtain the attention about relationships, which serves as weak supervision for regularizing the estimated importance scores of relationships. In addition, as this attention distillation process provides an opportunity for combining the generation of image caption and scene graph together, we further transform the scene graph into linguistic form with rich and free-form expressions by sharing a single generation model with image caption. Experiments show that attention distillation brings significant improvements in mining important relationships without strong supervision, and the topic scene graph shows great potential in subsequent applications.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Topic_Scene_Graph_Generation_by_Attention_Distillation_From_Caption_ICCV_2021_paper.pdf", @@ -41880,7 +44710,8 @@ "aff_campus_unique_index": "0+0;0+0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Wenbin and Wang,\n Ruiping and Chen,\n Xilin\n},\n title = {\n Topic Scene Graph Generation by Attention Distillation From Caption\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15900-15910\n} \n}" }, { "title": "Topologically Consistent Multi-View Face Inference Using Volumetric Sampling", @@ -41888,10 +44719,11 @@ "status": "Poster", "track": "main", "pid": 3471, + "author_site": "Tianye Li; Shichen Liu; Timo Bolkart; Jiayi Liu; Hao Li; Yajie Zhao", "author": "Tianye Li; 
Shichen Liu; Timo Bolkart; Jiayi Liu; Hao Li; Yajie Zhao", "abstract": "High-fidelity face digitization solutions often combine multi-view stereo (MVS) techniques for 3D reconstruction and a non-rigid registration step to establish dense correspondence across identities and expressions. A common problem is the need for manual clean-up after the MVS step, as 3D scans are typically affected by noise and outliers and contain hairy surface regions that need to be cleaned up by artists. Furthermore, mesh registration tends to fail for extreme facial expressions. Most learning-based methods use an underlying 3D morphable model (3DMM) to ensure robustness, but this limits the output accuracy for extreme facial expressions. In addition, the global bottleneck of regression architectures cannot produce meshes that tightly fit the ground truth surfaces. We propose ToFu, Topological consistent Face from multi-view, a geometry inference framework that can produce topologically consistent meshes across facial identities and expressions using a volumetric representation instead of an explicit underlying 3DMM. Our novel progressive mesh generation network embeds the topological structure of the face in a feature volume, sampled from geometry-aware local features. A coarse-to-fine architecture facilitates dense and accurate facial mesh predictions in a consistent mesh topology. ToFu further captures displacement maps for pore-level geometric details and facilitates high-quality rendering in the form of albedo and specular reflectance maps. These high-quality assets are readily usable by production studios for avatar creation, animation and physically-based skin rendering. We demonstrate state-of-the-art geometric and correspondence accuracy, while only taking 0.385 seconds to compute a mesh with 10K vertices, which is three orders of magnitude faster than traditional techniques. 
The code and the model are available for research purposes at https://tianyeli.github.io/tofu.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Topologically_Consistent_Multi-View_Face_Inference_Using_Volumetric_Sampling_ICCV_2021_paper.pdf", - "aff": "USC Institute for Creative Technologies+USC; USC Institute for Creative Technologies+USC; MPI for Intelligent Systems, T\u00fcbingen; USC Institute for Creative Technologies+USC; USC Institute for Creative Technologies+USC; USC Institute for Creative Technologies", + "aff": "USC Institute for Creative Technologies+USC; USC Institute for Creative Technologies+USC; MPI for Intelligent Systems, Tübingen; USC Institute for Creative Technologies+USC; USC Institute for Creative Technologies+USC; USC Institute for Creative Technologies", "project": "", "github": "https://tianyeli.github.io/tofu", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Li_Topologically_Consistent_Multi-View_ICCV_2021_supplemental.pdf", @@ -41909,9 +44741,10 @@ "aff_unique_url": "https://ict.usc.edu;https://www.mpi-is.mpg.de", "aff_unique_abbr": "USC ICT;MPI-IS", "aff_campus_unique_index": "1;1;2;1;1", - "aff_campus_unique": ";Los Angeles;T\u00fcbingen", + "aff_campus_unique": ";Los Angeles;Tübingen", "aff_country_unique_index": "0+0;0+0;1;0+0;0+0;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Tianye and Liu,\n Shichen and Bolkart,\n Timo and Liu,\n Jiayi and Li,\n Hao and Zhao,\n Yajie\n},\n title = {\n Topologically Consistent Multi-View Face Inference Using Volumetric Sampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3824-3834\n} \n}" }, { "title": "Toward Human-Like Grasp: Dexterous Grasping via Semantic Representation of Object-Hand", @@ -41919,6 +44752,7 @@ 
"status": "Poster", "track": "main", "pid": 2918, + "author_site": "Tianqiang Zhu; Rina Wu; Xiangbo Lin; Yi Sun", "author": "Tianqiang Zhu; Rina Wu; Xiangbo Lin; Yi Sun", "abstract": "In recent years, many dexterous robotic hands have been designed to assist or replace human hands in executing various tasks. But how to teach them to perform dexterous operations like human hands is still a challenging task. In this paper, we propose a grasp synthesis framework to make robots grasp and manipulate objects like human beings. We first build a dataset by accurately segmenting the functional areas of the object and annotating semantic touch code for each functional area to guide the dexterous hand to complete the functional grasp and post-grasp manipulation. This dataset contains 18 categories of 129 objects selected from four datasets, and 15 people participated in data annotation. Then we carefully design four loss functions to constrain the network, which successfully generates the functional grasp of dexterous hand under the guidance of semantic touch code. 
The thorough experiments in synthetic data show our model can robustly generate functional grasp, even for objects that the model has not see before.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Toward_Human-Like_Grasp_Dexterous_Grasping_via_Semantic_Representation_of_Object-Hand_ICCV_2021_paper.pdf", @@ -41942,7 +44776,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Tianqiang and Wu,\n Rina and Lin,\n Xiangbo and Sun,\n Yi\n},\n title = {\n Toward Human-Like Grasp: Dexterous Grasping via Semantic Representation of Object-Hand\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15741-15751\n} \n}" }, { "title": "Toward Realistic Single-View 3D Object Reconstruction With Unsupervised Learning From Multiple Images", @@ -41950,6 +44785,7 @@ "status": "Poster", "track": "main", "pid": 9848, + "author_site": "Long-Nhat Ho; Anh Tuan Tran; Quynh Phung; Minh Hoai", "author": "Long-Nhat Ho; Anh Tuan Tran; Quynh Phung; Minh Hoai", "abstract": "Recovering the 3D structure of an object from a single image is a challenging task due to its ill-posed nature. One approach is to utilize the plentiful photos of the same object category to learn a strong 3D shape prior for the object. This approach has successfully been demonstrated by a recent work of Wu et al. (2020), which obtained impressive 3D reconstruction networks with unsupervised learning. However, their algorithm is only applicable to symmetric objects. In this paper, we eliminate the symmetry requirement with a novel unsupervised algorithm that can learn a 3D reconstruction network from a multi-image dataset. Our algorithm is more general and covers the symmetry-required scenario as a special case. 
Besides, we employ a novel albedo loss that improves the reconstructed details and realisticity. Our method surpasses the previous work in both quality and robustness, as shown in experiments on datasets of various structures, including single-view, multi-view, image-collection, and video sets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ho_Toward_Realistic_Single-View_3D_Object_Reconstruction_With_Unsupervised_Learning_From_ICCV_2021_paper.pdf", @@ -41973,7 +44809,8 @@ "aff_campus_unique_index": "0+0;0+0+1;0;0+1", "aff_campus_unique": "Hanoi;Stony Brook", "aff_country_unique_index": "0+0;0+0+1;0;0+1", - "aff_country_unique": "Vietnam;United States" + "aff_country_unique": "Vietnam;United States", + "bibtex": "@InProceedings{Ho_2021_ICCV,\n \n author = {\n Ho,\n Long-Nhat and Tran,\n Anh Tuan and Phung,\n Quynh and Hoai,\n Minh\n},\n title = {\n Toward Realistic Single-View 3D Object Reconstruction With Unsupervised Learning From Multiple Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12600-12610\n} \n}" }, { "title": "Toward Spatially Unbiased Generative Models", @@ -41981,6 +44818,7 @@ "status": "Poster", "track": "main", "pid": 1935, + "author_site": "Jooyoung Choi; Jungbeom Lee; Yonghyun Jeong; Sungroh Yoon", "author": "Jooyoung Choi; Jungbeom Lee; Yonghyun Jeong; Sungroh Yoon", "abstract": "Recent image generation models show remarkable generation performance. However, they mirror strong location preference in datasets, which we call spatial bias. Therefore, generators render poor samples at unseen locations and scales. We argue that the generators rely on their implicit positional encoding to render spatial content. From our observations, the generator's implicit positional encoding is translation-variant, making the generator spatially biased. 
To address this issue, we propose injecting explicit positional encoding at each scale of the generator. By learning the spatially unbiased generator, we facilitate the robust use of generators in multiple tasks, such as GAN inversion, multi-scale generation, generation of arbitrary sizes and aspect ratios. Furthermore, we show that our method can also be applied to denoising diffusion probabilistic models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Choi_Toward_Spatially_Unbiased_Generative_Models_ICCV_2021_paper.pdf", @@ -41995,7 +44833,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choi_Toward_Spatially_Unbiased_Generative_Models_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Choi_Toward_Spatially_Unbiased_Generative_Models_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Choi_2021_ICCV,\n \n author = {\n Choi,\n Jooyoung and Lee,\n Jungbeom and Jeong,\n Yonghyun and Yoon,\n Sungroh\n},\n title = {\n Toward Spatially Unbiased Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14253-14262\n} \n}" }, { "title": "Toward a Visual Concept Vocabulary for GAN Latent Space", @@ -42003,6 +44842,7 @@ "status": "Poster", "track": "main", "pid": 9545, + "author_site": "Sarah Schwettmann; Evan Hernandez; David Bau; Samuel Klein; Jacob Andreas; Antonio Torralba", "author": "Sarah Schwettmann; Evan Hernandez; David Bau; Samuel Klein; Jacob Andreas; Antonio Torralba", "abstract": "A large body of recent work has identified transformations in the latent spaces of generative adversarial networks (GANs) that consistently and interpretably transform generated images. 
But existing techniques for identifying these transformations rely on either a fixed vocabulary of pre-specified visual concepts, or on unsupervised disentanglement techniques whose alignment with human judgments about perceptual salience is unknown. This paper introduces a new method for building open-ended vocabularies of primitive visual concepts represented in a GAN's latent space. Our approach is built from three components: (1) automatic identification of perceptually salient directions based on their layer selectivity; (2) human annotation of these directions with free-form, compositional natural language descriptions; and (3) decomposition of these annotations into a visual concept vocabulary, consisting of distilled directions labeled with single words. Experiments show that concepts learned with our approach are reliable and composable--generalizing across classes, contexts, and observers, and enabling fine-grained manipulation of image style and content.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Schwettmann_Toward_a_Visual_Concept_Vocabulary_for_GAN_Latent_Space_ICCV_2021_paper.pdf", @@ -42026,7 +44866,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Schwettmann_2021_ICCV,\n \n author = {\n Schwettmann,\n Sarah and Hernandez,\n Evan and Bau,\n David and Klein,\n Samuel and Andreas,\n Jacob and Torralba,\n Antonio\n},\n title = {\n Toward a Visual Concept Vocabulary for GAN Latent Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6804-6812\n} \n}" }, { "title": "Towards Accurate Alignment in Real-Time 3D Hand-Mesh Reconstruction", @@ -42034,6 +44875,7 @@ "status": "Poster", "track": "main", "pid": 3646, + "author_site": "Xiao Tang; 
Tianyu Wang; Chi-Wing Fu", "author": "Xiao Tang; Tianyu Wang; Chi-Wing Fu", "abstract": "3D hand-mesh reconstruction from RGB images facilitates many applications, including augmented reality (AR). However, this requires not only real-time speed and accurate hand pose and shape but also plausible mesh-image alignment. While existing works already achieve promising results, meeting all three requirements is very challenging. This paper presents a novel pipeline by decoupling the hand-mesh reconstruction task into three stages: a joint stage to predict hand joints and segmentation; a mesh stage to predict a rough hand mesh; and a refine stage to fine-tune it with an offset mesh for mesh-image alignment. With careful design in the network structure and in the loss functions, we can promote high-quality finger-level mesh-image alignment and drive the models together to deliver real-time predictions. Extensive quantitative and qualitative results on benchmark datasets demonstrate that the quality of our results outperforms the state-of-the-art methods on hand-mesh/pose precision and hand-image alignment. 
In the end, we also showcase several real-time AR scenarios.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tang_Towards_Accurate_Alignment_in_Real-Time_3D_Hand-Mesh_Reconstruction_ICCV_2021_paper.pdf", @@ -42050,14 +44892,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tang_Towards_Accurate_Alignment_in_Real-Time_3D_Hand-Mesh_Reconstruction_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2021_ICCV,\n \n author = {\n Tang,\n Xiao and Wang,\n Tianyu and Fu,\n Chi-Wing\n},\n title = {\n Towards Accurate Alignment in Real-Time 3D Hand-Mesh Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11698-11707\n} \n}" }, { "title": "Towards Alleviating the Modeling Ambiguity of Unsupervised Monocular 3D Human Pose Estimation", @@ -42065,6 +44908,7 @@ "status": "Poster", "track": "main", "pid": 9643, + "author_site": "Zhenbo Yu; Bingbing Ni; Jingwei Xu; Junjie Wang; Chenglong Zhao; Wenjun Zhang", "author": "Zhenbo Yu; Bingbing Ni; Jingwei Xu; Junjie Wang; Chenglong Zhao; Wenjun Zhang", "abstract": "In this work, we study the ambiguity problem in the task of unsupervised 3D human pose estimation from 2D counterpart. On one hand, without explicit annotation, the scale of 3D pose is difficult to be accurately captured (scale ambiguity). On the other hand, one 2D pose might correspond to multiple 3D gestures, where the lifting procedure is inherently ambiguous (pose ambiguity). 
Previous methods generally use temporal constraints (e.g., constant bone length and motion smoothness) to alleviate the above issues. However, these methods commonly enforce the outputs to fulfill multiple training objectives simultaneously, which often lead to sub-optimal results. In contrast to the majority of previous works, we propose to split the whole problem into two sub-tasks, i.e., optimizing 2D input poses via a scale estimation module and then mapping optimized 2D pose to 3D counterpart via a pose lifting module. Furthermore, two temporal constraints are proposed to alleviate the scale and pose ambiguity respectively. These two modules are optimized via an iterative training scheme with corresponding temporal constraints, which effectively reduce the learning difficulty and lead to better performance. Results on the Human3.6M dataset demonstrate that our approach improves upon the prior art by 23.1% and also outperforms several weakly supervised approaches that rely on 3D annotations. 
Our project is available at https://sites.google.com/view/ambiguity-aware-hpe.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Towards_Alleviating_the_Modeling_Ambiguity_of_Unsupervised_Monocular_3D_Human_ICCV_2021_paper.pdf", @@ -42088,7 +44932,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Zhenbo and Ni,\n Bingbing and Xu,\n Jingwei and Wang,\n Junjie and Zhao,\n Chenglong and Zhang,\n Wenjun\n},\n title = {\n Towards Alleviating the Modeling Ambiguity of Unsupervised Monocular 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8651-8660\n} \n}" }, { "title": "Towards Better Explanations of Class Activation Mapping", @@ -42096,6 +44941,7 @@ "status": "Poster", "track": "main", "pid": 6000, + "author_site": "Hyungsik Jung; Youngrock Oh", "author": "Hyungsik Jung; Youngrock Oh", "abstract": "Increasing demands for understanding the internal behavior of convolutional neural networks (CNNs) have led to remarkable improvements in explanation methods. Particularly, several class activation mapping (CAM) based methods, which generate visual explanation maps by a linear combination of activation maps from CNNs, have been proposed. However, the majority of the methods lack a clear theoretical basis on how they assign the coefficients of the linear combination. In this paper, we revisit the intrinsic linearity of CAM with respect to the activation maps; we construct an explanation model of CNN as a linear function of binary variables that denote the existence of the corresponding activation maps. 
With this approach, the explanation model can be determined by additive feature attribution methods in an analytic manner. We then demonstrate the adequacy of SHAP values, which is a unique solution for the explanation model with a set of desirable properties, as the coefficients of CAM. Since the exact SHAP values are unattainable, we introduce an efficient approximation method, LIFT-CAM, based on DeepLIFT. Our proposed LIFT-CAM can estimate the SHAP values of the activation maps with high speed and accuracy. Furthermore, it greatly outperforms other previous CAM-based methods in both qualitative and quantitative aspects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jung_Towards_Better_Explanations_of_Class_Activation_Mapping_ICCV_2021_paper.pdf", @@ -42112,14 +44958,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Jung_Towards_Better_Explanations_of_Class_Activation_Mapping_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Samsung", - "aff_unique_dep": "Samsung SDS", + "aff_unique_norm": "Samsung SDS", + "aff_unique_dep": "", "aff_unique_url": "https://www.samsungsds.com", "aff_unique_abbr": "Samsung SDS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jung_2021_ICCV,\n \n author = {\n Jung,\n Hyungsik and Oh,\n Youngrock\n},\n title = {\n Towards Better Explanations of Class Activation Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1336-1344\n} \n}" }, { "title": "Towards Complete Scene and Regular Shape for Distortion Rectification by Curve-Aware Extrapolation", @@ -42127,6 +44974,7 @@ "status": "Poster", "track": "main", "pid": 3574, + "author_site": "Kang Liao; Chunyu Lin; Yunchao Wei; Feng Li; Shangrong Yang; Yao 
Zhao", "author": "Kang Liao; Chunyu Lin; Yunchao Wei; Feng Li; Shangrong Yang; Yao Zhao", "abstract": "The wide-angle lens gains increasing attention since it can capture a wide field-of-view scene (FoV). However, the obtained image is contaminated with radial distortion, making the scene not realistic. Previous distortion rectification methods rectify the image in a rectangle or invagination, failing to display the complete content and regular shape simultaneously. In this paper, we rethink the representation of rectification results and present a Rectification OutPainting (ROP) method, aiming to extrapolate the coherent semantics to the blank area and create a wider FoV beyond the original wide-angle lens. To address the specific challenges such as the variable painting region and curve boundary, a rectification module is designed to rectify the image with geometry supervision, and the extrapolated results are generated using a dual conditional expansion strategy. In terms of the spatially discounted correlation, a curve-aware correlation measurement is proposed to focus on the generated region to enforce the local consistency. To our knowledge, we are the first to tackle the challenging rectification via outpainting, and our curve-aware strategy can reach a rectification construction with complete content and regular shape. 
Extensive experiments well demonstrate the superiority of our ROP over other state-of-the-art solutions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liao_Towards_Complete_Scene_and_Regular_Shape_for_Distortion_Rectification_by_ICCV_2021_paper.pdf", @@ -42143,14 +44991,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liao_Towards_Complete_Scene_and_Regular_Shape_for_Distortion_Rectification_by_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Beijing Jiao Tong University", + "aff_unique_norm": "Beijing Jiaotong University", "aff_unique_dep": "Institute of Information Science", "aff_unique_url": "http://www.bjtu.edu.cn", "aff_unique_abbr": "BJTU", - "aff_campus_unique_index": "0;0;0;0;0;0", - "aff_campus_unique": "Beijing", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liao_2021_ICCV,\n \n author = {\n Liao,\n Kang and Lin,\n Chunyu and Wei,\n Yunchao and Li,\n Feng and Yang,\n Shangrong and Zhao,\n Yao\n},\n title = {\n Towards Complete Scene and Regular Shape for Distortion Rectification by Curve-Aware Extrapolation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14569-14578\n} \n}" }, { "title": "Towards Discovery and Attribution of Open-World GAN Generated Images", @@ -42158,6 +45007,7 @@ "status": "Poster", "track": "main", "pid": 10235, + "author_site": "Sharath Girish; Saksham Suri; Sai Saketh Rambhatla; Abhinav Shrivastava", "author": "Sharath Girish; Saksham Suri; Sai Saketh Rambhatla; Abhinav Shrivastava", "abstract": "With the recent progress in Generative Adversarial Networks (GANs), it is imperative for media and visual forensics to develop detectors which can identify and attribute images to the model 
generating them. Existing works have shown to attribute images to their corresponding GAN sources with high accuracy. However, these works are limited to a closed set scenario, failing to generalize to GANs unseen during train time and are therefore, not scalable with a steady influx of new GANs. We present an iterative algorithm for discovering images generated from previously unseen GANs by exploiting the fact that all GANs leave distinct fingerprints on their generated images. Our algorithm consists of multiple components including network training, out-of-distribution detection, clustering, merge and refine steps. Through extensive experiments, we show that our algorithm discovers unseen GANs with high accuracy and also generalizes to GANs trained on unseen real datasets. We additionally apply our algorithm to attribution and discovery of GANs in an online fashion as well as to the more standard task of real/fake detection. Our experiments demonstrate the effectiveness of our approach to discover new GANs and can be used in an open-world setup.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Girish_Towards_Discovery_and_Attribution_of_Open-World_GAN_Generated_Images_ICCV_2021_paper.pdf", @@ -42181,7 +45031,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Girish_2021_ICCV,\n \n author = {\n Girish,\n Sharath and Suri,\n Saksham and Rambhatla,\n Sai Saketh and Shrivastava,\n Abhinav\n},\n title = {\n Towards Discovery and Attribution of Open-World GAN Generated Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14094-14103\n} \n}" }, { "title": "Towards Discriminative Representation Learning for Unsupervised Person Re-Identification", @@ -42189,6 
+45040,7 @@ "status": "Poster", "track": "main", "pid": 3033, + "author_site": "Takashi Isobe; Dong Li; Lu Tian; Weihua Chen; Yi Shan; Shengjin Wang", "author": "Takashi Isobe; Dong Li; Lu Tian; Weihua Chen; Yi Shan; Shengjin Wang", "abstract": "In this work, we address the problem of unsupervised domain adaptation for person re-ID where annotations are available for the source domain but not for target. Previous methods typically follow a two-stage optimization pipeline, where the network is first pre-trained on source and then fine-tuned on target with pseudo labels created by feature clustering. Such methods sustain two main limitations. (1) The label noise may hinder the learning of discriminative features for recognizing target classes. (2) The domain gap may hinder knowledge transferring from source to target. We propose three types of technical schemes to alleviate these issues. First, we propose a cluster-wise contrastive learning algorithm (CCL) by iterative optimization of feature learning and cluster refinery to learn noise-tolerant representations in the unsupervised manner. Second, we adopt a progressive domain adaptation (PDA) strategy to gradually mitigate the domain gap between source and target data. Third, we propose Fourier augmentation (FA) for further maximizing the class separability of re-ID models by imposing extra constraints in the Fourier space. We observe that these proposed schemes are capable of facilitating the learning of discriminative feature representations. 
Experiments demonstrate that our method consistently achieves notable improvements over the state-of-the-art unsupervised re-ID methods on multiple benchmarks, e.g., surpassing MMT largely by 8.1%, 9.9%, 11.4% and 11.1% mAP on the Market-to-Duke, Duke-to-Market, Market-to-MSMT and Duke-to-MSMT tasks, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Isobe_Towards_Discriminative_Representation_Learning_for_Unsupervised_Person_Re-Identification_ICCV_2021_paper.pdf", @@ -42212,7 +45064,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Isobe_2021_ICCV,\n \n author = {\n Isobe,\n Takashi and Li,\n Dong and Tian,\n Lu and Chen,\n Weihua and Shan,\n Yi and Wang,\n Shengjin\n},\n title = {\n Towards Discriminative Representation Learning for Unsupervised Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8526-8536\n} \n}" }, { "title": "Towards Efficient Graph Convolutional Networks for Point Cloud Handling", @@ -42220,6 +45073,7 @@ "status": "Poster", "track": "main", "pid": 3314, + "author_site": "Yawei Li; He Chen; Zhaopeng Cui; Radu Timofte; Marc Pollefeys; Gregory S. Chirikjian; Luc Van Gool", "author": "Yawei Li; He Chen; Zhaopeng Cui; Radu Timofte; Marc Pollefeys; Gregory S. Chirikjian; Luc Van Gool", "abstract": "We aim at improving the computational efficiency of graph convolutional networks (GCNs) for learning on point clouds. The basic graph convolution that is composed of a K-nearest neighbor (KNN) search and a multilayer perceptron (MLP) is examined. By mathematically analyzing the operations there, two findings to improve the efficiency of GCNs are obtained. 
(1) The local geometric structure information of 3D representations propagates smoothly across the GCN that relies on KNN search to gather neighborhood features. This motivates the simplification of multiple KNN searches in GCNs. (2) Shuffling the order of graph feature gathering and an MLP leads to equivalent or similar composite operations. Based on those findings, we optimize the computational procedure in GCNs. A series of experiments show that the optimized networks have reduced computational complexity, decreased memory consumption, and accelerated inference speed while maintaining comparable accuracy for learning on point clouds.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Towards_Efficient_Graph_Convolutional_Networks_for_Point_Cloud_Handling_ICCV_2021_paper.pdf", @@ -42234,7 +45088,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Towards_Efficient_Graph_Convolutional_Networks_for_Point_Cloud_Handling_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Towards_Efficient_Graph_Convolutional_Networks_for_Point_Cloud_Handling_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yawei and Chen,\n He and Cui,\n Zhaopeng and Timofte,\n Radu and Pollefeys,\n Marc and Chirikjian,\n Gregory S. 
and Van Gool,\n Luc\n},\n title = {\n Towards Efficient Graph Convolutional Networks for Point Cloud Handling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3752-3762\n} \n}" }, { "title": "Towards Face Encryption by Generating Adversarial Identity Masks", @@ -42242,6 +45097,7 @@ "status": "Poster", "track": "main", "pid": 6099, + "author_site": "Xiao Yang; Yinpeng Dong; Tianyu Pang; Hang Su; Jun Zhu; Yuefeng Chen; Hui Xue", "author": "Xiao Yang; Yinpeng Dong; Tianyu Pang; Hang Su; Jun Zhu; Yuefeng Chen; Hui Xue", "abstract": "As billions of personal data being shared through social media and network, the data privacy and security have drawn an increasing attention. Several attempts have been made to alleviate the leakage of identity information from face photos, with the aid of, e.g., image obfuscation techniques. However, most of the present results are either perceptually unsatisfactory or ineffective against face recognition systems. Our goal in this paper is to develop a technique that can encrypt the personal photos such that they can protect users from unauthorized face recognition systems but remain visually identical to the original version for human beings. To achieve this, we propose a targeted identity-protection iterative method (TIP-IM) to generate adversarial identity masks which can be overlaid on facial images, such that the original identities can be concealed without sacrificing the visual quality. Extensive experiments demonstrate that TIP-IM provides 95%+ protection success rate against various state-of-the-art face recognition models under practical open-set test scenarios. 
Besides, we also show the practical and effective applicability of our method on a commercial API service.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Towards_Face_Encryption_by_Generating_Adversarial_Identity_Masks_ICCV_2021_paper.pdf", @@ -42265,7 +45121,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Xiao and Dong,\n Yinpeng and Pang,\n Tianyu and Su,\n Hang and Zhu,\n Jun and Chen,\n Yuefeng and Xue,\n Hui\n},\n title = {\n Towards Face Encryption by Generating Adversarial Identity Masks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3897-3907\n} \n}" }, { "title": "Towards Flexible Blind JPEG Artifacts Removal", @@ -42273,6 +45130,7 @@ "status": "Poster", "track": "main", "pid": 5739, + "author_site": "Jiaxi Jiang; Kai Zhang; Radu Timofte", "author": "Jiaxi Jiang; Kai Zhang; Radu Timofte", "abstract": "Training a single deep blind model to handle different quality factors for JPEG image artifacts removal has been attracting considerable attention due to its convenience for practical usage. However, existing deep blind methods usually directly reconstruct the image without predicting the quality factor, thus lacking the flexibility to control the output as the non-blind methods. To remedy this problem, in this paper, we propose a flexible blind convolutional neural network, namely FBCNN, that can predict the adjustable quality factor to control the trade-off between artifacts removal and details preservation. 
Specifically, FBCNN decouples the quality factor from the JPEG image via a decoupler module and then embeds the predicted quality factor into the subsequent reconstructor module through a quality factor attention block for flexible control. Besides, we find existing methods are prone to fail on non-aligned double JPEG images even with only a one-pixel shift, and we thus propose a double JPEG degradation model to augment the training data. Extensive experiments on single JPEG images, more general double JPEG images, and real-world JPEG images demonstrate that our proposed FBCNN achieves favorable performance against state-of-the-art methods in terms of both quantitative metrics and visual quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jiang_Towards_Flexible_Blind_JPEG_Artifacts_Removal_ICCV_2021_paper.pdf", @@ -42296,7 +45154,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Jiang_2021_ICCV,\n \n author = {\n Jiang,\n Jiaxi and Zhang,\n Kai and Timofte,\n Radu\n},\n title = {\n Towards Flexible Blind JPEG Artifacts Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4997-5006\n} \n}" }, { "title": "Towards High Fidelity Monocular Face Reconstruction With Rich Reflectance Using Self-Supervised Learning and Ray Tracing", @@ -42304,7 +45163,8 @@ "status": "Poster", "track": "main", "pid": 9084, - "author": "Abdallah Dib; C\u00e9dric Th\u00e9bault; Junghyun Ahn; Philippe-Henri Gosselin; Christian Theobalt; Louis Chevallier", + "author_site": "Abdallah Dib; Cédric Thébault; Junghyun Ahn; Philippe-Henri Gosselin; Christian Theobalt; Louis Chevallier", + "author": "Abdallah Dib; Cédric Thébault; Junghyun Ahn; Philippe-Henri Gosselin; Christian Theobalt; Louis Chevallier", 
"abstract": "Robust face reconstruction from monocular image in general lighting conditions is challenging. Methods combining deep neural network encoders with differentiable rendering have opened up the path for very fast monocular reconstruction of geometry, lighting and reflectance. They can also be trained in self-supervised manner for increased robustness and better generalization. However, their differentiable rasterization based image formation models, as well as underlying scene parameterization, limit them to Lambertian face reflectance and to poor shape details. More recently, ray tracing was introduced for monocular face reconstruction within a classic optimization-based framework and enables state-of-the art results. However optimization-based approaches are inherently slow and lack robustness. In this paper, we build our work on the aforementioned approaches and propose a new method that greatly improves reconstruction quality and robustness in general scenes. We achieve this by combining a CNN encoder with a differentiable ray tracer, which enables us to base the reconstruction on much more advanced personalized diffuse and specular albedos, a more sophisticated illumination model and a plausible representation of self-shadows. This enables to take a big leap forward in reconstruction quality of shape, appearance and lighting even in scenes with difficult illumination. With consistent face attributes reconstruction, our method leads to practical applications such as relighting and self-shadows removal. 
Compared to state-of-the-art methods, our results show improved accuracy and validity of the approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dib_Towards_High_Fidelity_Monocular_Face_Reconstruction_With_Rich_Reflectance_Using_ICCV_2021_paper.pdf", "aff": "InterDigital R&I; InterDigital R&I; InterDigital R&I; InterDigital R&I; Max-Planck-Institute for Informatics; InterDigital R&I", @@ -42327,7 +45187,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Dib_2021_ICCV,\n \n author = {\n Dib,\n Abdallah and Th\\'ebault,\n C\\'edric and Ahn,\n Junghyun and Gosselin,\n Philippe-Henri and Theobalt,\n Christian and Chevallier,\n Louis\n},\n title = {\n Towards High Fidelity Monocular Face Reconstruction With Rich Reflectance Using Self-Supervised Learning and Ray Tracing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12819-12829\n} \n}" }, { "title": "Towards Interpretable Deep Metric Learning With Structural Matching", @@ -42335,6 +45196,7 @@ "status": "Poster", "track": "main", "pid": 2020, + "author_site": "Wenliang Zhao; Yongming Rao; Ziyi Wang; Jiwen Lu; Jie Zhou", "author": "Wenliang Zhao; Yongming Rao; Ziyi Wang; Jiwen Lu; Jie Zhou", "abstract": "How do the neural networks distinguish two images? It is of critical importance to understand the matching mechanism of deep models for developing reliable intelligent systems for many risky visual applications such as surveillance and access control. However, most existing deep metric learning methods match the images by comparing feature vectors, which ignores the spatial structure of images and thus lacks interpretability. 
In this paper, we present a deep interpretable metric learning (DIML) method for more transparent embedding learning. Unlike conventional metric learning methods based on feature vector comparison, we propose a structural matching strategy that explicitly aligns the spatial embeddings by computing an optimal matching flow between feature maps of the two images. Our method enables deep models to learn metrics in a more human-friendly way, where the similarity of two images can be decomposed to several part-wise similarities and their contributions to the overall similarity. Our method is model-agnostic, which can be applied to off-the-shelf backbone networks and metric learning methods. We evaluate our method on three major benchmarks of deep metric learning including CUB200- 2011, Cars196, and Stanford Online Products, and achieve substantial improvements over popular metric learning methods with better interpretability.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Towards_Interpretable_Deep_Metric_Learning_With_Structural_Matching_ICCV_2021_paper.pdf", @@ -42358,7 +45220,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Wenliang and Rao,\n Yongming and Wang,\n Ziyi and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Towards Interpretable Deep Metric Learning With Structural Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9887-9896\n} \n}" }, { "title": "Towards Interpretable Deep Networks for Monocular Depth Estimation", @@ -42366,6 +45229,7 @@ "status": "Poster", "track": "main", "pid": 3164, + "author_site": "Zunzhi You; Yi-Hsuan Tsai; Wei-Chen Chiu; Guanbin Li", "author": "Zunzhi You; Yi-Hsuan Tsai; Wei-Chen Chiu; Guanbin 
Li", "abstract": "Deep networks for Monocular Depth Estimation (MDE) have achieved promising performance recently and it is of great importance to further understand the interpretability of these networks. Existing methods attempt to provide post-hoc explanations by investigating visual cues, which may not explore the internal representations learned by deep networks. In this paper, we find that some hidden units of the network are selective to certain ranges of depth, and thus such behavior can be served as a way to interpret the internal representations. Based on our observations, we quantify the interpretability of a deep MDE network by the depth selectivity of its hidden units. Moreover, we then propose a method to train interpretable MDE deep networks without changing their original architectures, by assigning a depth range for each unit to select. Experimental results demonstrate that our method is able to enhance the interpretability of deep MDE networks by largely improving the depth selectivity of their units, while not harming or even improving the depth estimation accuracy. We further provide comprehensive analysis to show the reliability of selective units, the applicability of our method on different models and layers, and a demonstration on monocular depth completion. 
We further provide comprehensive analysis to show the reliability of selective units, the applicability of our method on different layers, models, and datasets, and a demonstration on analysis of model error.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/You_Towards_Interpretable_Deep_Networks_for_Monocular_Depth_Estimation_ICCV_2021_paper.pdf", @@ -42389,7 +45253,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{You_2021_ICCV,\n \n author = {\n You,\n Zunzhi and Tsai,\n Yi-Hsuan and Chiu,\n Wei-Chen and Li,\n Guanbin\n},\n title = {\n Towards Interpretable Deep Networks for Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12879-12888\n} \n}" }, { "title": "Towards Learning Spatially Discriminative Feature Representations", @@ -42397,6 +45262,7 @@ "status": "Poster", "track": "main", "pid": 10990, + "author_site": "Chaofei Wang; Jiayu Xiao; Yizeng Han; Qisen Yang; Shiji Song; Gao Huang", "author": "Chaofei Wang; Jiayu Xiao; Yizeng Han; Qisen Yang; Shiji Song; Gao Huang", "abstract": "The backbone of traditional CNN classifier is generally considered as a feature extractor, followed by a linear layer which performs the classification. We propose a novel loss function, termed as CAM-loss, to constrain the embedded feature maps with the class activation maps (CAMs) which indicate the spatially discriminative regions of an image for particular categories. CAM-loss drives the backbone to express the features of target category and suppress the features of non-target categories or background, so as to obtain more discriminative feature representations. 
It can be simply applied in any CNN architecture with neglectable additional parameters and calculations. Experimental results show that CAM-loss is applicable to a variety of network structures and can be combined with mainstream regularization methods to improve the performance of image classification. The strong generalization ability of CAM-loss is validated in the transfer learning and few shot learning tasks. Based on CAM-loss, we also propose a novel CAAM-CAM matching knowledge distillation method. This method directly uses the CAM generated by the teacher network to supervise the CAAM generated by the student network, which effectively improves the accuracy and convergence rate of the student network.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Towards_Learning_Spatially_Discriminative_Feature_Representations_ICCV_2021_paper.pdf", @@ -42420,7 +45286,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Chaofei and Xiao,\n Jiayu and Han,\n Yizeng and Yang,\n Qisen and Song,\n Shiji and Huang,\n Gao\n},\n title = {\n Towards Learning Spatially Discriminative Feature Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1326-1335\n} \n}" }, { "title": "Towards Memory-Efficient Neural Networks via Multi-Level In Situ Generation", @@ -42428,6 +45295,7 @@ "status": "Poster", "track": "main", "pid": 5536, + "author_site": "Jiaqi Gu; Hanqing Zhu; Chenghao Feng; Mingjie Liu; Zixuan Jiang; Ray T. Chen; David Z. Pan", "author": "Jiaqi Gu; Hanqing Zhu; Chenghao Feng; Mingjie Liu; Zixuan Jiang; Ray T. Chen; David Z. Pan", "abstract": "Deep neural networks (DNN) have shown superior performance in a variety of tasks. 
As they rapidly evolve, their escalating computation and memory demands make it challenging to deploy them on resource-constrained edge devices. Though extensive efficient accelerator designs, from traditional electronics to emerging photonics, have been successfully demonstrated, they are still bottlenecked by expensive memory accesses due to tremendous gaps between the bandwidth/power/latency of electrical memory and computing cores. Previous solutions fail to fully-leverage the ultra-fast computational speed of emerging DNN accelerators to break through the critical memory bound. In this work, we propose a general and unified framework to trade expensive memory transactions with ultra-fast on-chip computations, directly translating to performance improvement. We are the first to jointly explore the intrinsic correlations and bit-level redundancy within DNN kernels and propose a multi-level in situ generation mechanism with mixed-precision bases to achieve on-the-fly recovery of high-resolution parameters with minimum hardware overhead. Extensive experiments demonstrate that our proposed joint method can boost the memory efficiency by 10-20x with comparable accuracy over four state-of-the-art designs when benchmarked on ResNet-18/DenseNet-121/MobileNetV2/V3 with various tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Towards_Memory-Efficient_Neural_Networks_via_Multi-Level_In_Situ_Generation_ICCV_2021_paper.pdf", @@ -42451,7 +45319,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gu_2021_ICCV,\n \n author = {\n Gu,\n Jiaqi and Zhu,\n Hanqing and Feng,\n Chenghao and Liu,\n Mingjie and Jiang,\n Zixuan and Chen,\n Ray T. 
and Pan,\n David Z.\n},\n title = {\n Towards Memory-Efficient Neural Networks via Multi-Level In Situ Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5229-5238\n} \n}" }, { "title": "Towards Mixed-Precision Quantization of Neural Networks via Constrained Optimization", @@ -42459,6 +45328,7 @@ "status": "Poster", "track": "main", "pid": 4223, + "author_site": "Weihan Chen; Peisong Wang; Jian Cheng", "author": "Weihan Chen; Peisong Wang; Jian Cheng", "abstract": "Quantization is a widely used technique to compress and accelerate deep neural networks. However, conventional quantization methods use the same bit-width for all (or most of) the layers, which often suffer significant accuracy degradation in the ultra-low precision regime and ignore the fact that emergent hardware accelerators begin to support mixed-precision computation. Consequently, we present a novel and principled framework to solve the mixed-precision quantization problem in this paper. Briefly speaking, we first formulate the mixed-precision quantization as a discrete constrained optimization problem. Then, to make the optimization tractable, we approximate the objective function with second-order Taylor expansion and propose an efficient approach to compute its Hessian matrix. Finally, based on the above simplification, we show that the original problem can be reformulated as a Multiple Choice Knapsack Problem (MCKP) and propose a greedy search algorithm to solve it efficiently. Compared with existing mixed-precision quantization works, our method is derived in a principled way and much more computationally efficient. 
Moreover, extensive experiments conducted on the ImageNet dataset and various kinds of network architectures also demonstrate its superiority over existing uniform and mixed-precision quantization approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Towards_Mixed-Precision_Quantization_of_Neural_Networks_via_Constrained_Optimization_ICCV_2021_paper.pdf", @@ -42482,7 +45352,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Weihan and Wang,\n Peisong and Cheng,\n Jian\n},\n title = {\n Towards Mixed-Precision Quantization of Neural Networks via Constrained Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5350-5359\n} \n}" }, { "title": "Towards Novel Target Discovery Through Open-Set Domain Adaptation", @@ -42490,6 +45361,7 @@ "status": "Poster", "track": "main", "pid": 4015, + "author_site": "Taotao Jing; Hongfu Liu; Zhengming Ding", "author": "Taotao Jing; Hongfu Liu; Zhengming Ding", "abstract": "Open-set domain adaptation (OSDA) considers that the target domain contains samples from novel categories unobserved in external source domain. Unfortunately, existing OSDA methods always ignore the demand for the information of unseen categories and simply recognize them as \"unknown\" set without further explanation. This motivates us to understand the unknown categories more specifically by exploring the underlying structures and recovering their interpretable semantic attributes. In this paper, we propose a novel framework to accurately identify the seen categories in target domain, and effectively recover the semantic attributes for unseen categories. 
Specifically, structure preserving partial alignment is developed to recognize the seen categories through domain-invariant feature learning. Attribute propagation over visual graph is designed to smoothly transit attributes from seen to unseen categories via visual-semantic mapping. Moreover, two new cross-main benchmarks are constructed to evaluate the proposed framework in the novel and practical challenge. Experimental results on open-set recognition and semantic recovery demonstrate the superiority of the proposed method over other compared baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Jing_Towards_Novel_Target_Discovery_Through_Open-Set_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -42509,11 +45381,12 @@ "aff_unique_norm": "Tulane University;Brandeis University", "aff_unique_dep": "Department of Computer Science;Michtom School of Computer Science", "aff_unique_url": "https://www.tulane.edu;https://www.brandeis.edu", - "aff_unique_abbr": "Tulane;Brandeis", + "aff_unique_abbr": "Tulane;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jing_2021_ICCV,\n \n author = {\n Jing,\n Taotao and Liu,\n Hongfu and Ding,\n Zhengming\n},\n title = {\n Towards Novel Target Discovery Through Open-Set Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9322-9331\n} \n}" }, { "title": "Towards Real-World Prohibited Item Detection: A Large-Scale X-Ray Benchmark", @@ -42521,6 +45394,7 @@ "status": "Poster", "track": "main", "pid": 4121, + "author_site": "Boying Wang; Libo Zhang; Longyin Wen; Xianglong Liu; Yanjun Wu", "author": "Boying Wang; Libo Zhang; Longyin Wen; Xianglong Liu; Yanjun Wu", "abstract": "Automatic security inspection using computer vision technology is 
a challenging task in real-world scenarios due to various factors, including intra-class variance, class imbalance, and occlusion. Most of the previous methods rarely solve the cases that the prohibited items are deliberately hidden in messy objects due to the lack of large-scale datasets, restricted their applications in real-world scenarios. Towards real-world prohibited item detection, we collect a large-scale dataset, named as PIDray, which covers various cases in real-world scenarios for prohibited item detection, especially for deliberately hidden items. With an intensive amount of effort, our dataset contains 12 categories of prohibited items in 47,677 X-ray images with high-quality annotated segmentation masks and bounding boxes. To the best of our knowledge, it is the largest prohibited items detection dataset to date. Meanwhile, we design the selective dense attention network (SDANet) to construct a strong baseline, which consists of the dense attention module and the dependency refinement module. The dense attention module formed by the spatial and channel-wise dense attentions, is designed to learn the discriminative features to boost the performance. The dependency refinement module is used to exploit the dependencies of multi-scale features. 
Extensive experiments conducted on the collected PIDray dataset demonstrate that the proposed method performs favorably against the state-of-the-art methods, especially for detecting the deliberately hidden items.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Towards_Real-World_Prohibited_Item_Detection_A_Large-Scale_X-Ray_Benchmark_ICCV_2021_paper.pdf", @@ -42537,14 +45411,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Towards_Real-World_Prohibited_Item_Detection_A_Large-Scale_X-Ray_Benchmark_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1+2;3;4;0", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Hangzhou Institute for Advanced Study;JD;Beihang University", - "aff_unique_dep": "Institute of Software;;;JD Finance America Corporation;", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Hangzhou Institute for Advanced Study;JD Finance America Corporation;Beihang University", + "aff_unique_dep": "Institute of Software;;;;", "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;;;http://www.buaa.edu.cn/", "aff_unique_abbr": "CAS;UCAS;HIFAS;;BUAA", "aff_campus_unique_index": "0+0;0+0+1;2;0;0", "aff_campus_unique": "Beijing;Hangzhou;Mountain View", "aff_country_unique_index": "0+0;0+0+0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Boying and Zhang,\n Libo and Wen,\n Longyin and Liu,\n Xianglong and Wu,\n Yanjun\n},\n title = {\n Towards Real-World Prohibited Item Detection: A Large-Scale X-Ray Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5412-5421\n} \n}" }, { "title": "Towards Real-World X-Ray Security Inspection: A High-Quality Benchmark and Lateral 
Inhibition Module for Prohibited Items Detection", @@ -42552,6 +45427,7 @@ "status": "Poster", "track": "main", "pid": 3021, + "author_site": "Renshuai Tao; Yanlu Wei; Xiangjian Jiang; Hainan Li; Haotong Qin; Jiakai Wang; Yuqing Ma; Libo Zhang; Xianglong Liu", "author": "Renshuai Tao; Yanlu Wei; Xiangjian Jiang; Hainan Li; Haotong Qin; Jiakai Wang; Yuqing Ma; Libo Zhang; Xianglong Liu", "abstract": "Prohibited items detection in X-ray images often plays an important role in protecting public safety, which often deals with color-monotonous and luster-insufficient objects, resulting in unsatisfactory performance. Till now, there have been rare studies touching this topic due to the lack of specialized high-quality datasets. In this work, we first present a High-quality X-ray (HiXray) security inspection image dataset, which contains 102,928 common prohibited items of 8 categories. It is the largest dataset of high quality for prohibited items detection, gathered from the real-world airport security inspection and annotated by professional security inspectors. Besides, for accurate prohibited item detection, we further propose the Lateral Inhibition Module (LIM) inspired by the fact that humans recognize these items by ignoring irrelevant information and focusing on identifiable characteristics, especially when objects are overlapped with each other. Specifically, LIM, the elaborately designed flexible additional module, suppresses the noisy information flowing maximumly by the Bidirectional Propagation (BP) module and activates the most identifiable charismatic, boundary, from four directions by Boundary Activation (BA) module. 
We evaluate our method extensively on HiXray and OPIXray and the results demonstrate that it outperforms SOTA detection methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tao_Towards_Real-World_X-Ray_Security_Inspection_A_High-Quality_Benchmark_and_Lateral_ICCV_2021_paper.pdf", @@ -42566,7 +45442,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tao_Towards_Real-World_X-Ray_Security_Inspection_A_High-Quality_Benchmark_and_Lateral_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tao_Towards_Real-World_X-Ray_Security_Inspection_A_High-Quality_Benchmark_and_Lateral_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Tao_2021_ICCV,\n \n author = {\n Tao,\n Renshuai and Wei,\n Yanlu and Jiang,\n Xiangjian and Li,\n Hainan and Qin,\n Haotong and Wang,\n Jiakai and Ma,\n Yuqing and Zhang,\n Libo and Liu,\n Xianglong\n},\n title = {\n Towards Real-World X-Ray Security Inspection: A High-Quality Benchmark and Lateral Inhibition Module for Prohibited Items Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10923-10932\n} \n}" }, { "title": "Towards Robustness of Deep Neural Networks via Regularization", @@ -42574,6 +45451,7 @@ "status": "Poster", "track": "main", "pid": 9425, + "author_site": "Yao Li; Martin Renqiang Min; Thomas Lee; Wenchao Yu; Erik Kruus; Wei Wang; Cho-Jui Hsieh", "author": "Yao Li; Martin Renqiang Min; Thomas Lee; Wenchao Yu; Erik Kruus; Wei Wang; Cho-Jui Hsieh", "abstract": "Recent studies have demonstrated the vulnerability of deep neural networks against adversarial examples. 
Inspired by the observation that adversarial examples often lie outside the natural image data manifold and the intrinsic dimension of image data is much smaller than its pixel space dimension, we propose to embed high-dimensional input images into a low-dimensional space and apply regularization on the embedding space to push the adversarial examples back to the manifold. The proposed framework is called Embedding Regularized Classifier (ER-Classifier), which improves the adversarial robustness of the classifier through embedding regularization. Besides improving classification accuracy against adversarial examples, the framework can be combined with detection methods to detect adversarial examples. Experimental results on several benchmark datasets show that, our proposed framework achieves good performance against strong adversarial attack methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Towards_Robustness_of_Deep_Neural_Networks_via_Regularization_ICCV_2021_paper.pdf", @@ -42597,7 +45475,8 @@ "aff_campus_unique_index": "0;1;2;1;1;3;3", "aff_campus_unique": "Chapel Hill;Princeton;Davis;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yao and Min,\n Martin Renqiang and Lee,\n Thomas and Yu,\n Wenchao and Kruus,\n Erik and Wang,\n Wei and Hsieh,\n Cho-Jui\n},\n title = {\n Towards Robustness of Deep Neural Networks via Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7496-7505\n} \n}" }, { "title": "Towards Rotation Invariance in Object Detection", @@ -42605,6 +45484,7 @@ "status": "Poster", "track": "main", "pid": 8247, + "author_site": "Agastya Kalra; Guy Stoppi; Bradley Brown; Rishav Agarwal; Achuta Kadambi", "author": "Agastya Kalra; Guy Stoppi; 
Bradley Brown; Rishav Agarwal; Achuta Kadambi", "abstract": "Rotation augmentations generally improve a model's invariance/equivariance to rotation - except in object detection. In object detection the shape is not known, therefore rotation creates a label ambiguity. We show that the de-facto method for bounding box label rotation, the Largest Box Method, creates very large labels, leading to poor performance and in many cases worse performance than using no rotation at all. We propose a new method of rotation augmentation that can be implemented in a few lines of code. First, we create a differentiable approximation of label accuracy and show that axis-aligning the bounding box around an ellipse is optimal. We then introduce Rotation Uncertainty (RU) Loss, allowing the model to adapt to the uncertainty of the labels. On five different datasets (including COCO, PascalVOC, and Transparent Object Bin Picking), this approach improves the rotational invariance of both one-stage and two-stage architectures when measured with AP, AP50, and AP75.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kalra_Towards_Rotation_Invariance_in_Object_Detection_ICCV_2021_paper.pdf", @@ -42628,7 +45508,8 @@ "aff_campus_unique_index": "0;0;0;0;0+1", "aff_campus_unique": "Palo Alto;Los Angeles", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kalra_2021_ICCV,\n \n author = {\n Kalra,\n Agastya and Stoppi,\n Guy and Brown,\n Bradley and Agarwal,\n Rishav and Kadambi,\n Achuta\n},\n title = {\n Towards Rotation Invariance in Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3530-3540\n} \n}" }, { "title": "Towards Understanding the Generative Capability of Adversarially Robust Classifiers", @@ -42636,10 +45517,11 @@ "status": "Poster", 
"track": "main", "pid": 11410, + "author_site": "Yao Zhu; Jiacheng Ma; Jiacheng Sun; Zewei Chen; Rongxin Jiang; Yaowu Chen; Zhenguo Li", "author": "Yao Zhu; Jiacheng Ma; Jiacheng Sun; Zewei Chen; Rongxin Jiang; Yaowu Chen; Zhenguo Li", "abstract": "Recently, some works found an interesting phenomenon that adversarially robust classifiers can generate good images comparable to generative models. We investigate this phenomenon from an energy perspective and provide a novel explanation. We reformulate adversarial example generation, adversarial training, and image generation in terms of an energy function. We find that adversarial training contributes to obtaining an energy function that is flat and has low energy around the real data, which is the key for generative capability. Based on our new understanding, we further propose a better adversarial training method, Joint Energy Adversarial Training (JEAT), which can generate high-quality images and achieve new state-of-the-art robustness under a wide range of attacks. The Inception Score of the images (CIFAR-10) generated by JEAT is 8.80, much better than original robust classifiers (7.50). 
In particular, we achieve new state-of-the-art robustness on CIFAR-10 (from 57.20% to 62.04%) and CIFAR-100 (from 30.03% to 30.18%) without extra training data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Towards_Understanding_the_Generative_Capability_of_Adversarially_Robust_Classifiers_ICCV_2021_paper.pdf", - "aff": "Zhejiang University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Zhejiang University; Zhejiang University; Huawei Noah\u2019s Ark Lab", + "aff": "Zhejiang University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Zhejiang University; Zhejiang University; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhu_Towards_Understanding_the_ICCV_2021_supplemental.pdf", @@ -42653,13 +45535,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_Towards_Understanding_the_Generative_Capability_of_Adversarially_Robust_Classifiers_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1;0;0;1", "aff_unique_norm": "Zhejiang University;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.zju.edu.cn;https://www.huawei.com", "aff_unique_abbr": "ZJU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Yao and Ma,\n Jiacheng and Sun,\n Jiacheng and Chen,\n Zewei and Jiang,\n Rongxin and Chen,\n Yaowu and Li,\n Zhenguo\n},\n title = {\n Towards Understanding the Generative Capability of Adversarially Robust Classifiers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7728-7737\n} \n}" }, { "title": "Towards Vivid and Diverse Image 
Colorization With Generative Color Prior", @@ -42667,6 +45550,7 @@ "status": "Poster", "track": "main", "pid": 6095, + "author_site": "Yanze Wu; Xintao Wang; Yu Li; Honglun Zhang; Xun Zhao; Ying Shan", "author": "Yanze Wu; Xintao Wang; Yu Li; Honglun Zhang; Xun Zhao; Ying Shan", "abstract": "Colorization has attracted increasing interest in recent years. Classic reference-based methods usually rely on external color images for plausible results. A large image database or online search engine is inevitably required for retrieving such exemplars. Recent deep-learning-based methods could automatically colorize images at a low cost. However, unsatisfactory artifacts and incoherent colors are always accompanied. In this work, we aim at recovering vivid colors by leveraging the rich and diverse color priors encapsulated in a pretrained Generative Adversarial Networks (GAN). Specifically, we first \"retrieve\" matched features (similar to exemplars) via a GAN encoder and then incorporate these features into the colorization process with feature modulations. Thanks to the powerful generative color prior and delicate designs, our method could produce vivid colors with a single forward pass. Moreover, it is highly convenient to obtain diverse results by modifying GAN latent codes. Our method also inherits the merit of interpretable controls of GANs and could attain controllable and smooth transitions by walking through GAN latent space. 
Extensive experiments and user studies demonstrate that our method achieves superior performance than previous works.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Towards_Vivid_and_Diverse_Image_Colorization_With_Generative_Color_Prior_ICCV_2021_paper.pdf", @@ -42690,7 +45574,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Yanze and Wang,\n Xintao and Li,\n Yu and Zhang,\n Honglun and Zhao,\n Xun and Shan,\n Ying\n},\n title = {\n Towards Vivid and Diverse Image Colorization With Generative Color Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14377-14386\n} \n}" }, { "title": "Towards a Universal Model for Cross-Dataset Crowd Counting", @@ -42698,6 +45583,7 @@ "status": "Poster", "track": "main", "pid": 3020, + "author_site": "Zhiheng Ma; Xiaopeng Hong; Xing Wei; Yunfeng Qiu; Yihong Gong", "author": "Zhiheng Ma; Xiaopeng Hong; Xing Wei; Yunfeng Qiu; Yihong Gong", "abstract": "This paper proposes to handle the practical problem of learning a universal model for crowd counting across scenes and datasets. We dissect that the crux of this problem is the catastrophic sensitivity of crowd counters to scale shift, which is very common in the real world and caused by factors such as different scene layouts and image resolutions. Therefore it is difficult to train a universal model that can be applied to various scenes. To address this problem, we propose scale alignment as a prime module for establishing a novel crowd counting framework. We derive a closed-form solution to get the optimal image rescaling factors for alignment by minimizing the distances between their scale distributions. 
A novel neural network together with a loss function based on an efficient sliced Wasserstein distance is also proposed for scale distribution estimation. Benefiting from the proposed method, we have learned a universal model that generally works well on several datasets where can even outperform state-of-the-art models that are particularly fine-tuned for each dataset significantly. Experiments also demonstrate the much better generalizability of our model to unseen scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ma_Towards_a_Universal_Model_for_Cross-Dataset_Crowd_Counting_ICCV_2021_paper.pdf", @@ -42714,14 +45600,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ma_Towards_a_Universal_Model_for_Cross-Dataset_Crowd_Counting_ICCV_2021_paper.html", "aff_unique_index": "0;0+1;0;0;0", - "aff_unique_norm": "Xi'an Jiao Tong University;Pengcheng Laboratory", + "aff_unique_norm": "Xi'an Jiaotong University;Peng Cheng Laboratory", "aff_unique_dep": "College of Artificial Intelligence;Research Center for Artificial Intelligence", "aff_unique_url": "http://www.xjtu.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "XJTU;PCL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2021_ICCV,\n \n author = {\n Ma,\n Zhiheng and Hong,\n Xiaopeng and Wei,\n Xing and Qiu,\n Yunfeng and Gong,\n Yihong\n},\n title = {\n Towards a Universal Model for Cross-Dataset Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3205-3214\n} \n}" }, { "title": "Towards the Unseen: Iterative Text Recognition by Distilling From Errors", @@ -42729,6 +45616,7 @@ "status": "Poster", "track": "main", "pid": 3747, + "author_site": "Ayan Kumar Bhunia; Pinaki Nath 
Chowdhury; Aneeshan Sain; Yi-Zhe Song", "author": "Ayan Kumar Bhunia; Pinaki Nath Chowdhury; Aneeshan Sain; Yi-Zhe Song", "abstract": "Visual text recognition is undoubtedly one of the most extensively researched topics in computer vision. Great progress have been made to date, with the latest models starting to focus on the more practical \"in-the-wild\" setting. However, a salient problem still hinders practical deployment -- prior arts mostly struggle with recognising unseen (or rarely seen) character sequences. In this paper, we put forward a novel framework to specifically tackle this \"unseen\" problem. Our framework is iterative in nature, in that it utilises predicted knowledge of character sequences from a previous iteration, to augment the main network in improving the next prediction. Key to our success is a unique cross-modal variational autoencoder to act as a feedback module, which is trained with the presence of textual error distribution data. This module importantly translate a discrete predicted character space, to a continuous affine transformation parameter space used to condition the visual feature map at next iteration. Experiments on common datasets have shown competitive performance over state-of-the-arts under the conventional setting. 
Most importantly, under the new disjoint setup where train-test labels are mutually exclusive, ours offers the best performance thus showcasing the capability of generalising onto unseen words.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhunia_Towards_the_Unseen_Iterative_Text_Recognition_by_Distilling_From_Errors_ICCV_2021_paper.pdf", @@ -42752,7 +45640,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bhunia_2021_ICCV,\n \n author = {\n Bhunia,\n Ayan Kumar and Chowdhury,\n Pinaki Nath and Sain,\n Aneeshan and Song,\n Yi-Zhe\n},\n title = {\n Towards the Unseen: Iterative Text Recognition by Distilling From Errors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14950-14959\n} \n}" }, { "title": "Towers of Babel: Combining Images, Language, and 3D Geometry for Learning Multimodal Vision", @@ -42760,6 +45649,7 @@ "status": "Poster", "track": "main", "pid": 3383, + "author_site": "Xiaoshi Wu; Hadar Averbuch-Elor; Jin Sun; Noah Snavely", "author": "Xiaoshi Wu; Hadar Averbuch-Elor; Jin Sun; Noah Snavely", "abstract": "The abundance and richness of Internet photos of landmarks and cities has led to significant progress in 3D vision over the past two decades, including automated 3D reconstructions of the world's landmarks from tourist photos. However, a major source of information available for these 3D-augmented collections---language, e.g., from image captions---has been virtually untapped. In this work, we present WikiScenes, a new, large-scale dataset of landmark photo collections that contains descriptive text in the form of captions and hierarchical category names. 
WikiScenes forms a new testbed for multimodal reasoning involving images, text, and 3D geometry. We demonstrate the utility of WikiScenes for learning semantic concepts over images and 3D models. Our weakly-supervised framework connects images, 3D structure and semantics---utilizing the strong constraints provided by 3D geometry---to associate semantic concepts to image pixels and points in 3D space.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Towers_of_Babel_Combining_Images_Language_and_3D_Geometry_for_ICCV_2021_paper.pdf", @@ -42774,7 +45664,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Towers_of_Babel_Combining_Images_Language_and_3D_Geometry_for_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Towers_of_Babel_Combining_Images_Language_and_3D_Geometry_for_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Xiaoshi and Averbuch-Elor,\n Hadar and Sun,\n Jin and Snavely,\n Noah\n},\n title = {\n Towers of Babel: Combining Images,\n Language,\n and 3D Geometry for Learning Multimodal Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 428-437\n} \n}" }, { "title": "Track Without Appearance: Learn Box and Tracklet Embedding With Local and Global Motion Patterns for Vehicle Tracking", @@ -42782,6 +45673,7 @@ "status": "Poster", "track": "main", "pid": 3118, + "author_site": "Gaoang Wang; Renshu Gu; Zuozhu Liu; Weijie Hu; Mingli Song; Jenq-Neng Hwang", "author": "Gaoang Wang; Renshu Gu; Zuozhu Liu; Weijie Hu; Mingli Song; Jenq-Neng Hwang", "abstract": "Vehicle tracking is an essential task in the multi-object tracking (MOT) field. 
A distinct characteristic in vehicle tracking is that the trajectories of vehicles are fairly smooth in both the world coordinate and the image coordinate. Hence, models that capture motion consistencies are of high necessity. However, tracking with the standalone motion-based trackers is quite challenging because targets could get lost easily due to limited information, detection error and occlusion. Leveraging appearance information to assist object re-identification could resolve this challenge to some extent. However, doing so requires extra computation while appearance information is sensitive to occlusion as well. In this paper, we try to explore the significance of motion patterns for vehicle tracking without appearance information. We propose a novel approach that tackles the association issue for long-term tracking with the exclusive fully-exploited motion information. We address the tracklet embedding issue with the proposed reconstruct-to-embed strategy based on deep graph convolutional neural networks (GCN). Comprehensive experiments on the KITTI-car tracking dataset and UA-Detrac dataset show that the proposed method, though without appearance information, could achieve competitive performance with the state-of-the-art (SOTA) trackers. 
The source code will be available at https://github.com/GaoangW/LGMTracker.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Track_Without_Appearance_Learn_Box_and_Tracklet_Embedding_With_Local_ICCV_2021_paper.pdf", @@ -42805,7 +45697,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Gaoang and Gu,\n Renshu and Liu,\n Zuozhu and Hu,\n Weijie and Song,\n Mingli and Hwang,\n Jenq-Neng\n},\n title = {\n Track Without Appearance: Learn Box and Tracklet Embedding With Local and Global Motion Patterns for Vehicle Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9876-9886\n} \n}" }, { "title": "Training Multi-Object Detector by Estimating Bounding Box Distribution for Input Image", @@ -42813,6 +45706,7 @@ "status": "Poster", "track": "main", "pid": 8969, + "author_site": "Jaeyoung Yoo; Hojun Lee; Inseop Chung; Geonseok Seo; Nojun Kwak", "author": "Jaeyoung Yoo; Hojun Lee; Inseop Chung; Geonseok Seo; Nojun Kwak", "abstract": "In multi-object detection using neural networks, the fundamental problem is, \"How should the network learn a variable number of bounding boxes in different input images?\". Previous methods train a multi-object detection network through a procedure that directly assigns the ground truth bounding boxes to the specific locations of the network's output. However, this procedure makes the training of a multi-object detection network too heuristic and complicated. In this paper, we reformulate the multi-object detection task as a problem of density estimation of bounding boxes. 
Instead of assigning each ground truth to specific locations of network's output, we train a network by estimating the probability density of bounding boxes in an input image using a mixture model. For this purpose, we propose a novel network for object detection called Mixture Density Object Detector (MDOD), and the corresponding objective function for the density-estimation-based training. We applied MDOD to MS COCO dataset. Our proposed method not only deals with multi-object detection problems in a new approach, but also improves detection performances through MDOD. The code is available: https://github.com/yoojy31/MDOD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yoo_Training_Multi-Object_Detector_by_Estimating_Bounding_Box_Distribution_for_Input_ICCV_2021_paper.pdf", @@ -42829,14 +45723,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yoo_Training_Multi-Object_Detector_by_Estimating_Bounding_Box_Distribution_for_Input_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Seoul National University;Samsung", - "aff_unique_dep": ";Samsung Advanced Institute of Technology", + "aff_unique_norm": "Seoul National University;Samsung Advanced Institute of Technology", + "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.sait.samsung.com", "aff_unique_abbr": "SNU;SAIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yoo_2021_ICCV,\n \n author = {\n Yoo,\n Jaeyoung and Lee,\n Hojun and Chung,\n Inseop and Seo,\n Geonseok and Kwak,\n Nojun\n},\n title = {\n Training Multi-Object Detector by Estimating Bounding Box Distribution for Input Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3437-3446\n} 
\n}" }, { "title": "Training Weakly Supervised Video Frame Interpolation With Events", @@ -42844,6 +45739,7 @@ "status": "Poster", "track": "main", "pid": 6187, + "author_site": "Zhiyang Yu; Yu Zhang; Deyuan Liu; Dongqing Zou; Xijun Chen; Yebin Liu; Jimmy S. Ren", "author": "Zhiyang Yu; Yu Zhang; Deyuan Liu; Dongqing Zou; Xijun Chen; Yebin Liu; Jimmy S. Ren", "abstract": "Event-based video frame interpolation is promising as event cameras capture dense motion signals that can greatly facilitate motion-aware synthesis. However, training existing frameworks for this task requires high frame-rate videos with synchronized events, posing challenges to collect real training data. In this work we show event-based frame interpolation can be trained without the need of high framerate videos. This is achieved via a novel weakly supervised framework that 1) corrects image appearance by extracting complementary information from events and 2) supplants motion dynamics modeling with attention mechanisms. For the latter we propose subpixel attention learning, which supports searching high-resolution correspondence efficiently on low-resolution feature grid. Though trained on low frame-rate videos, our framework outperforms existing models trained with full high frame-rate videos (and events) on both GoPro dataset and a new real event-based dataset. 
Codes, models and dataset will be made available at: https://github.com/YU-Zhiyang/WEVI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Training_Weakly_Supervised_Video_Frame_Interpolation_With_Events_ICCV_2021_paper.pdf", @@ -42867,7 +45763,8 @@ "aff_campus_unique_index": "0;;;2;0;2", "aff_campus_unique": "Harbin;;Shanghai", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Zhiyang and Zhang,\n Yu and Liu,\n Deyuan and Zou,\n Dongqing and Chen,\n Xijun and Liu,\n Yebin and Ren,\n Jimmy S.\n},\n title = {\n Training Weakly Supervised Video Frame Interpolation With Events\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14589-14598\n} \n}" }, { "title": "TransFER: Learning Relation-Aware Facial Expression Representations With Transformers", @@ -42875,6 +45772,7 @@ "status": "Poster", "track": "main", "pid": 11288, + "author_site": "Fanglei Xue; Qiangchang Wang; Guodong Guo", "author": "Fanglei Xue; Qiangchang Wang; Guodong Guo", "abstract": "Facial expression recognition (FER) has received increasing interest in computer vision. We propose the TransFER model which can learn rich relation-aware local representations. It mainly consists of three components: Multi-Attention Dropping (MAD), ViT-FER, and Multi-head Self-Attention Dropping (MSAD). First, local patches play an important role in distinguishing various expressions, however, few existing works can locate discriminative and diverse local patches. This can cause serious problems when some patches are invisible due to pose variations or viewpoint changes. To address this issue, the MAD is proposed to randomly drop an attention map. Consequently, models are pushed to explore diverse local patches adaptively. 
Second, to build rich relations between different local patches, the Vision Transformers (ViT) are used in FER, called ViT-FER. Since the global scope is used to reinforce each local patch, a better representation is obtained to boost the FER performance. Thirdly, the multi-head self-attention allows ViT to jointly attend to features from different information subspaces at different positions. Given no explicit guidance, however, multiple self-attentions may extract similar relations. To address this, the MSAD is proposed to randomly drop one self-attention module. As a result, models are forced to learn rich relations among diverse local patches. Our proposed TransFER model outperforms the state-of-the-art methods on several FER benchmarks, showing its effectiveness and usefulness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xue_TransFER_Learning_Relation-Aware_Facial_Expression_Representations_With_Transformers_ICCV_2021_paper.pdf", @@ -42891,14 +45789,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xue_TransFER_Learning_Relation-Aware_Facial_Expression_Representations_With_Transformers_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;3+4", - "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;West Virginia University;Baidu;National Engineering Laboratory for Deep Learning Technology and Application", + "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;West Virginia University;Baidu Research;National Engineering Laboratory for Deep Learning Technology and Application", "aff_unique_dep": ";Key Laboratory of Space Utilization, Technology and Engineering Center for Space Utilization;;Institute of Deep Learning;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.cas.ac.cn;https://www.wvu.edu;https://baidu.com;", "aff_unique_abbr": "UCAS;CAS;WVU;Baidu;", "aff_campus_unique_index": ";", "aff_campus_unique": "", 
"aff_country_unique_index": "0+0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xue_2021_ICCV,\n \n author = {\n Xue,\n Fanglei and Wang,\n Qiangchang and Guo,\n Guodong\n},\n title = {\n TransFER: Learning Relation-Aware Facial Expression Representations With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3601-3610\n} \n}" }, { "title": "TransForensics: Image Forgery Localization With Dense Self-Attention", @@ -42906,6 +45805,7 @@ "status": "Poster", "track": "main", "pid": 11189, + "author_site": "Jing Hao; Zhixin Zhang; Shicai Yang; Di Xie; Shiliang Pu", "author": "Jing Hao; Zhixin Zhang; Shicai Yang; Di Xie; Shiliang Pu", "abstract": "Nowadays advanced image editing tools and technical skills produce tampered images more realistically, which can easily evade image forensic systems and make authenticity verification of images more difficult. To tackle this challenging problem, we introduce TransForensics, a novel image forgery localization method inspired by Transformers. The two major components in our framework are dense self-attention encoders and dense correction modules. The former is to model global context and all pairwise interactions between local patches at different scales, while the latter is used for improving the transparency of the hidden layers and correcting the outputs from different branches. Compared to previous traditional and deep learning methods, TransForensics not only can capture discriminative representations and obtain high-quality mask predictions but is also not limited by tampering types and patch sequence orders. 
By conducting experiments on main benchmarks, we show that TransForensics outperforms the state-of-the-art methods by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hao_TransForensics_Image_Forgery_Localization_With_Dense_Self-Attention_ICCV_2021_paper.pdf", @@ -42929,7 +45829,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hao_2021_ICCV,\n \n author = {\n Hao,\n Jing and Zhang,\n Zhixin and Yang,\n Shicai and Xie,\n Di and Pu,\n Shiliang\n},\n title = {\n TransForensics: Image Forgery Localization With Dense Self-Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15055-15064\n} \n}" }, { "title": "TransPose: Keypoint Localization via Transformer", @@ -42937,6 +45838,7 @@ "status": "Poster", "track": "main", "pid": 6717, + "author_site": "Sen Yang; Zhibin Quan; Mu Nie; Wankou Yang", "author": "Sen Yang; Zhibin Quan; Mu Nie; Wankou Yang", "abstract": "While CNN-based models have made remarkable progress on human pose estimation, what spatial dependencies they capture to localize keypoints remains unclear. In this work, we propose a model called TransPose, which introduces Transformer for human pose estimation. The attention layers built in Transformer enable our model to capture long-range relationships efficiently and also can reveal what dependencies the predicted keypoints rely on. To predict keypoint heatmaps, the last attention layer acts as an aggregator, which collects contributions from image clues and forms maximum positions of keypoints. Such a heatmap-based localization approach via Transformer conforms to the principle of Activation Maximization. 
And the revealed dependencies are image-specific and fine-grained, which also can provide evidence of how the model handles special cases, e.g., occlusion. The experiments show that TransPose achieves 75.8 AP and 75.0 AP on COCO validation and test-dev sets, while being more lightweight and faster than mainstream CNN architectures. The TransPose model also transfers very well on MPII benchmark, achieving superior performance on the test set when fine-tuned with small training costs. Code and pre-trained models are publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_TransPose_Keypoint_Localization_via_Transformer_ICCV_2021_paper.pdf", @@ -42960,7 +45862,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Sen and Quan,\n Zhibin and Nie,\n Mu and Yang,\n Wankou\n},\n title = {\n TransPose: Keypoint Localization via Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11802-11812\n} \n}" }, { "title": "TransReID: Transformer-Based Object Re-Identification", @@ -42968,6 +45871,7 @@ "status": "Poster", "track": "main", "pid": 3846, + "author_site": "Shuting He; Hao Luo; Pichao Wang; Fan Wang; Hao Li; Wei Jiang", "author": "Shuting He; Hao Luo; Pichao Wang; Fan Wang; Hao Li; Wei Jiang", "abstract": "Extracting robust feature representation is one of the key challenges in object re-identification (ReID). 
Although convolution neural network (CNN)-based methods have achieved great success, they only process one local neighborhood at a time and suffer from information loss on details caused by convolution and downsampling operators pooling and strided convolution).To overcome these limitations, we propose a pure transformer-based object ReID framework named TransReID. Specifically, we first encode an image as a sequence of patches and build a transformer-based strong baseline with a few critical improvements, which achieves competitive results on several ReID benchmarks with CNN-based methods. To further enhance the robust feature learning in the context of transformers, two novel modules are carefully designed. (i) The jigsaw patch module (JPM) is proposed to rearrange the patch embeddings via shift and patch shuffle operations which generates robust features with improved discrimination ability and more diversified coverage. (ii) The side information embeddings (SIE) is introduced to mitigate feature bias towards camera/view variations by plugging in learnable embeddings to incorporate these non-visual clues. To the best of our knowledge, this is the first work to adopt a pure transformer for ReID research. Experimental results of TransReID are superior promising, which achieve state-of-the-art performance on both person and vehicle ReID benchmarks. 
Code is available at https://github.com/heshuting555/TransReID", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/He_TransReID_Transformer-Based_Object_Re-Identification_ICCV_2021_paper.pdf", @@ -42991,7 +45895,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2021_ICCV,\n \n author = {\n He,\n Shuting and Luo,\n Hao and Wang,\n Pichao and Wang,\n Fan and Li,\n Hao and Jiang,\n Wei\n},\n title = {\n TransReID: Transformer-Based Object Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15013-15022\n} \n}" }, { "title": "TransVG: End-to-End Visual Grounding With Transformers", @@ -42999,6 +45904,7 @@ "status": "Poster", "track": "main", "pid": 2560, + "author_site": "Jiajun Deng; Zhengyuan Yang; Tianlang Chen; Wengang Zhou; Houqiang Li", "author": "Jiajun Deng; Zhengyuan Yang; Tianlang Chen; Wengang Zhou; Houqiang Li", "abstract": "In this paper, we present a neat yet effective transformer-based framework for visual grounding, namely TransVG, to address the task of grounding a language query to the corresponding region onto an image. The state-of-the-art methods, including two-stage or one-stage ones, rely on a complex module with manually-designed mechanisms to perform the query reasoning and multi-modal fusion. However, the involvement of certain mechanisms in fusion module design, such as query decomposition and image scene graph, makes the models easily overfit to datasets with specific scenarios, and limits the plenitudinous interaction between the visual-linguistic context. 
To avoid this caveat, we propose to establish the multi-modal correspondence by leveraging transformers, and empirically show that the complex fusion modules (e.g., modular attention network, dynamic graph, and multi-modal tree) can be replaced by a simple stack of transformer encoder layers with higher performance. Moreover, we re-formulate the visual grounding as a direct coordinates regression problem and avoid making predictions out of a set of candidates (i.e., region proposals or anchor boxes). Extensive experiments are conducted on five widely used datasets, and a series of state-of-the-art records are set by our TransVG. We build the benchmark of transformer-based visual grounding framework and make the code available at https://github.com/djiajunustc/TransVG.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Deng_TransVG_End-to-End_Visual_Grounding_With_Transformers_ICCV_2021_paper.pdf", @@ -43022,7 +45928,8 @@ "aff_campus_unique_index": "0;0+0;0+0", "aff_campus_unique": "Hefei;", "aff_country_unique_index": "0;1;1;0+0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Deng_2021_ICCV,\n \n author = {\n Deng,\n Jiajun and Yang,\n Zhengyuan and Chen,\n Tianlang and Zhou,\n Wengang and Li,\n Houqiang\n},\n title = {\n TransVG: End-to-End Visual Grounding With Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1769-1779\n} \n}" }, { "title": "TransView: Inside, Outside, and Across the Cropping View Boundaries", @@ -43030,6 +45937,7 @@ "status": "Poster", "track": "main", "pid": 6179, + "author_site": "Zhiyu Pan; Zhiguo Cao; Kewei Wang; Hao Lu; Weicai Zhong", "author": "Zhiyu Pan; Zhiguo Cao; Kewei Wang; Hao Lu; Weicai Zhong", "abstract": "We show that relation modeling between visual elements matters in cropping view recommendation. 
Cropping view recommendation addresses the problem of image recomposition conditioned on the composition quality and the ranking of views (cropped sub-regions). This task is challenging because the visual difference is subtle when a visual element is reserved or removed. Existing methods represent visual elements by extracting region-based convolutional features inside and outside the cropping view boundaries, without probing a fundamental question: why some visual elements are of interest or of discard? In this work, we observe that the relation between different visual elements significantly affects their relative positions to the desired cropping view, and such relation can be characterized by the attraction inside/outside the cropping view boundaries and the repulsion across the boundaries. By instantiating a transformer-based solution that represents visual elements as visual words and that models the dependencies between visual words, we report not only state of-the-art performance on public benchmarks, but also interesting visualizations that depict the attraction and repulsion between visual elements, which may shed light on what makes for effective cropping view recommendation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Pan_TransView_Inside_Outside_and_Across_the_Cropping_View_Boundaries_ICCV_2021_paper.pdf", @@ -43053,7 +45961,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pan_2021_ICCV,\n \n author = {\n Pan,\n Zhiyu and Cao,\n Zhiguo and Wang,\n Kewei and Lu,\n Hao and Zhong,\n Weicai\n},\n title = {\n TransView: Inside,\n Outside,\n and Across the Cropping View Boundaries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4218-4227\n} \n}" }, { "title": "Transductive 
Few-Shot Classification on the Oblique Manifold", @@ -43061,6 +45970,7 @@ "status": "Poster", "track": "main", "pid": 2532, + "author_site": "Guodong Qi; Huimin Yu; Zhaohui Lu; Shuzhao Li", "author": "Guodong Qi; Huimin Yu; Zhaohui Lu; Shuzhao Li", "abstract": "Few-shot learning (FSL) attempts to learn with limited data. In this work, we perform the feature extraction in the Euclidean space and the geodesic distance metric on the Oblique Manifold (OM). Specially, for better feature extraction, we propose a non-parametric Region Self-attention with Spatial Pyramid Pooling (RSSPP), which realizes a trade-off between the generalization and the discriminative ability of the single image feature. Then, we embed the feature to OM as a point. Furthermore, we design an Oblique Distance-based Classifier (ODC) that achieves classification in the tangent spaces which better approximate OM locally by learnable tangency points. Finally, we introduce a new method for parameters initialization and a novel loss function in the transductive settings. 
Extensive experiments demonstrate the effectiveness of our algorithm and it outperforms state-of-the-art methods on the popular benchmarks: mini-ImageNet, tiered-ImageNet, and Caltech-UCSD Birds-200-2011 (CUB).", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Qi_Transductive_Few-Shot_Classification_on_the_Oblique_Manifold_ICCV_2021_paper.pdf", @@ -43084,7 +45994,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qi_2021_ICCV,\n \n author = {\n Qi,\n Guodong and Yu,\n Huimin and Lu,\n Zhaohui and Li,\n Shuzhao\n},\n title = {\n Transductive Few-Shot Classification on the Oblique Manifold\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8412-8422\n} \n}" }, { "title": "TransferI2I: Transfer Learning for Image-to-Image Translation From Small Datasets", @@ -43092,7 +46003,8 @@ "status": "Poster", "track": "main", "pid": 1948, - "author": "Yaxing Wang; H\u00e9ctor Laria; Joost van de Weijer; Laura Lopez-Fuentes; Bogdan Raducanu", + "author_site": "Yaxing Wang; Héctor Laria; Joost van de Weijer; Laura Lopez-Fuentes; Bogdan Raducanu", + "author": "Yaxing Wang; Héctor Laria; Joost van de Weijer; Laura Lopez-Fuentes; Bogdan Raducanu", "abstract": "Image-to-image (I2I) translation has matured in recent years and is able to generate high-quality realistic images. However, despite current success, it still faces important challenges when applied to small domains. Existing methods use transfer learning for I2I translation, but they still require the learning of millions of parameters from scratch. This drawback severely limits its application on small domains. In this paper, we propose a new transfer learning for I2I translation (TransferI2I). 
We decouple our learning process into the image generation step and the I2I translation step. In the first step we propose two novel techniques: source-target initialization and self-initialization of the adaptor layer. The former finetunes the pretrained generative model (e.g., StyleGAN) on source and target data. The latter allows to initialize all non-pretrained network parameters without the need of any data. These techniques provide a better initialization for the I2I translation. Second step performs the actual I2I translation using the learned weights in the first step. In addition, we introduce an auxiliary GAN that further facilitates the training of deep I2I systems even from small datasets. In extensive experiments on three datasets, (Animal faces, Birds, and Foods), we show that we outperform existing methods and that mFID improves on several datasets with over 25 points.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_TransferI2I_Transfer_Learning_for_Image-to-Image_Translation_From_Small_Datasets_ICCV_2021_paper.pdf", "aff": "PCALab, Nanjing University of Science and Technology, China+Computer Vision Center, Universitat Aut `onoma de Barcelona, Spain; Computer Vision Center, Universitat Aut `onoma de Barcelona, Spain; Computer Vision Center, Universitat Aut `onoma de Barcelona, Spain; Universitat de les Illes Balears, Spain; Computer Vision Center, Universitat Aut `onoma de Barcelona, Spain", @@ -43112,10 +46024,11 @@ "aff_unique_dep": "PCALab;Computer Vision Center;", "aff_unique_url": ";https://www.uab.cat;https://www UIB.es", "aff_unique_abbr": ";;UIB", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": ";1", + "aff_campus_unique": ";Palma de Mallorca", "aff_country_unique_index": "0+1;1;1;1;1", - "aff_country_unique": "China;Spain" + "aff_country_unique": "China;Spain", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yaxing and Laria,\n H\\'ector and van de Weijer,\n 
Joost and Lopez-Fuentes,\n Laura and Raducanu,\n Bogdan\n},\n title = {\n TransferI2I: Transfer Learning for Image-to-Image Translation From Small Datasets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14010-14019\n} \n}" }, { "title": "Transformer-Based Attention Networks for Continuous Pixel-Wise Prediction", @@ -43123,6 +46036,7 @@ "status": "Poster", "track": "main", "pid": 2248, + "author_site": "Guanglei Yang; Hao Tang; Mingli Ding; Nicu Sebe; Elisa Ricci", "author": "Guanglei Yang; Hao Tang; Mingli Ding; Nicu Sebe; Elisa Ricci", "abstract": "While convolutional neural networks have shown a tremendous impact on various computer vision tasks, they generally demonstrate limitations in explicitly modeling long-range dependencies due to the intrinsic locality of the convolution operation. Initially designed for natural language processing tasks, Transformers have emerged as alternative architectures with innate global self-attention mechanisms to capture long-range dependencies. In this paper, we propose TransDepth, an architecture that benefits from both convolutional neural networks and transformers. To avoid the network losing its ability to capture local-level details due to the adoption of transformers, we propose a novel decoder that employs attention mechanisms based on gates. Notably, this is the first paper that applies transformers to pixel-wise prediction problems involving continuous labels (i.e., monocular depth prediction and surface normal estimation). Extensive experiments demonstrate that the proposed TransDepth achieves state-of-the-art performance on three challenging datasets. 
Our code is available at: https://github.com/ygjwd12345/TransDepth.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Transformer-Based_Attention_Networks_for_Continuous_Pixel-Wise_Prediction_ICCV_2021_paper.pdf", @@ -43146,7 +46060,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2;0;1+1;1+1", - "aff_country_unique": "China;Italy;Switzerland" + "aff_country_unique": "China;Italy;Switzerland", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Guanglei and Tang,\n Hao and Ding,\n Mingli and Sebe,\n Nicu and Ricci,\n Elisa\n},\n title = {\n Transformer-Based Attention Networks for Continuous Pixel-Wise Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16269-16279\n} \n}" }, { "title": "Transformer-Based Dual Relation Graph for Multi-Label Image Recognition", @@ -43154,6 +46069,7 @@ "status": "Poster", "track": "main", "pid": 2387, + "author_site": "Jiawei Zhao; Ke Yan; Yifan Zhao; Xiaowei Guo; Feiyue Huang; Jia Li", "author": "Jiawei Zhao; Ke Yan; Yifan Zhao; Xiaowei Guo; Feiyue Huang; Jia Li", "abstract": "The simultaneous recognition of multiple objects in one image remains a challenging task, spanning multiple events in the recognition field such as various object scales, inconsistent appearances, and confused inter-class relationships. Recent research efforts mainly resort to the statistic label co-occurrences and linguistic word embedding to enhance the unclear semantics. Different from these researches, in this paper, we propose a novel Transformer-based Dual Relation learning framework, constructing complementary relationships by exploring two aspects of correlation, i.e., structural relation graph and semantic relation graph. 
The structural relation graph aims to capture long-range correlations from object context, by developing a cross-scale transformer-based architecture. The semantic graph dynamically models the semantic meanings of image objects with explicit semantic-aware constraints. In addition, we also incorporate the learnt structural relationship into the semantic graph, constructing a joint relation graph for robust representations. With the collaborative learning of these two effective relation graphs, our approach achieves new state-of-the-art on two popular multi-label recognition benchmarks, i.e. MS-COCO and VOC 2007 dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Transformer-Based_Dual_Relation_Graph_for_Multi-Label_Image_Recognition_ICCV_2021_paper.pdf", @@ -43170,14 +46086,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Transformer-Based_Dual_Relation_Graph_for_Multi-Label_Image_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;1;1;0+2", - "aff_unique_norm": "Beihang University;Tencent;Pengcheng Laboratory", - "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;Youtu Lab;Peng Cheng Laboratory", + "aff_unique_norm": "Beihang University;Tencent;Peng Cheng Laboratory", + "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;Youtu Lab;", "aff_unique_url": "http://www.buaa.edu.cn;https://www.tencent.com;", "aff_unique_abbr": "Beihang;Tencent;", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Shanghai;Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Jiawei and Yan,\n Ke and Zhao,\n Yifan and Guo,\n Xiaowei and Huang,\n Feiyue and Li,\n Jia\n},\n title = {\n Transformer-Based Dual Relation Graph for Multi-Label Image Recognition\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 163-172\n} \n}" }, { "title": "Transforms Based Tensor Robust PCA: Corrupted Low-Rank Tensors Recovery via Convex Optimization", @@ -43185,6 +46102,7 @@ "status": "Poster", "track": "main", "pid": 10248, + "author_site": "Canyi Lu", "author": "Canyi Lu", "abstract": "This work studies the Tensor Robust Principal Component Analysis (TRPCA) problem, which aims to exactly recover the low-rank and sparse components from their sum. Our model is motivated by the recently proposed linear transforms based tensor-tensor product and tensor SVD. We define a new transforms depended tensor rank and the corresponding tensor nuclear norm. Then we solve the TRPCA problem by convex optimization whose objective is a weighted combination of the new tensor nuclear norm and l_1-norm. In theory, we prove that under some incoherence conditions, the convex program exactly recovers the underlying low-rank and sparse components with high probability. Our new TRPCA is much more general since it allows to use any invertible linear transforms. Thus, we have more choices in practice for different tasks and different type of data. 
Numerical experiments verify our results and the application on image recovery demonstrates the superiority of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lu_Transforms_Based_Tensor_Robust_PCA_Corrupted_Low-Rank_Tensors_Recovery_via_ICCV_2021_paper.pdf", @@ -43208,7 +46126,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lu_2021_ICCV,\n \n author = {\n Lu,\n Canyi\n},\n title = {\n Transforms Based Tensor Robust PCA: Corrupted Low-Rank Tensors Recovery via Convex Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1145-1152\n} \n}" }, { "title": "Transfusion: A Novel SLAM Method Focused on Transparent Objects", @@ -43216,6 +46135,7 @@ "status": "Poster", "track": "main", "pid": 3859, + "author_site": "Yifan Zhu; Jiaxiong Qiu; Bo Ren", "author": "Yifan Zhu; Jiaxiong Qiu; Bo Ren", "abstract": "Recently RGB-D sensors have become very popular in the area of Simultaneous Localisation and Mapping (SLAM). The RGB-D SLAM approach relies heavily on the accuracy of the input depth map. However, refraction and reflection of transparent objects will result in false depth input of RGB-D cameras, which makes the traditional RGB-D SLAM algorithm unable to work correctly in the presence of transparent objects. In this paper, we propose a novel SLAM approach called transfusion that allows transparent object existence and recovery in the video input. Our method is composed of two parts. Transparent Objects Cut Iterative Closest Points (TC-ICP)is first used to recover camera pose, detecting and removing transparent objects from input to reduce the trajectory errors. 
Then Transparent Objects Reconstruction (TO-Reconstruction) is used to reconstruct the transparent objects and opaque objects separately. The opaque objects are reconstructed with the traditional method, and the transparent objects are reconstructed with the visual hull-based method. To evaluate our algorithm, we construct a new RGB-D SLAM database containing 25 video sequences. Each sequence has at least one transparent object. Experiments show that our approach can work adequately in scenes contain transparent objects while the existing approach can not handle them. Our approach significantly improves the accuracy of the camera trajectory and the quality of environment reconstruction.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Transfusion_A_Novel_SLAM_Method_Focused_on_Transparent_Objects_ICCV_2021_paper.pdf", @@ -43239,7 +46159,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Yifan and Qiu,\n Jiaxiong and Ren,\n Bo\n},\n title = {\n Transfusion: A Novel SLAM Method Focused on Transparent Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6019-6028\n} \n}" }, { "title": "Transparent Object Tracking Benchmark", @@ -43247,6 +46168,7 @@ "status": "Poster", "track": "main", "pid": 7992, + "author_site": "Heng Fan; Halady Akhilesha Miththanthaya; Harshit; Siranjiv Ramana Rajan; Xiaoqiong Liu; Zhilin Zou; Yuewei Lin; Haibin Ling", "author": "Heng Fan; Halady Akhilesha Miththanthaya; Harshit; Siranjiv Ramana Rajan; Xiaoqiong Liu; Zhilin Zou; Yuewei Lin; Haibin Ling", "abstract": "Visual tracking has achieved considerable progress in recent years. 
However, current research in the field mainly focuses on tracking of opaque objects, while little attention is paid to transparent object tracking. In this paper, we make the first attempt in exploring this problem by proposing a Transparent Object Tracking Benchmark (TOTB). Specifically, TOTB consists of 225 videos (86K frames) from 15 diverse transparent object categories. Each sequence is manually labeled with axis-aligned bounding boxes. To the best of our knowledge, TOTB is the first benchmark dedicated to transparent object tracking. In order to understand how existing trackers perform and to provide comparison for future research on TOTB, we extensively evaluate 25 state-of-the-art tracking algorithms. The evaluation results exhibit that more efforts are needed to improve transparent object tracking. Besides, we observe some nontrivial findings from the evaluation that are discrepant with some common beliefs in opaque object tracking. For example, we find that deep(er) features are not always good for improvements. Moreover, to encourage future research, we introduce a novel tracker, named TransATOM, which leverages transparency features for tracking and surpasses all 25 evaluated approaches by a large margin. By releasing TOTB, we expect to facilitate future research and application of transparent object tracking in both the academia and industry. 
The TOTB and evaluation results as well as TransATOM are available at https://hengfan2010.github.io/projects/TOTB/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fan_Transparent_Object_Tracking_Benchmark_ICCV_2021_paper.pdf", @@ -43261,7 +46183,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Transparent_Object_Tracking_Benchmark_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Fan_Transparent_Object_Tracking_Benchmark_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Fan_2021_ICCV,\n \n author = {\n Fan,\n Heng and Miththanthaya,\n Halady Akhilesha and Harshit and Rajan,\n Siranjiv Ramana and Liu,\n Xiaoqiong and Zou,\n Zhilin and Lin,\n Yuewei and Ling,\n Haibin\n},\n title = {\n Transparent Object Tracking Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10734-10743\n} \n}" }, { "title": "Transporting Causal Mechanisms for Unsupervised Domain Adaptation", @@ -43269,6 +46192,7 @@ "status": "Poster", "track": "main", "pid": 9727, + "author_site": "Zhongqi Yue; Qianru Sun; Xian-Sheng Hua; Hanwang Zhang", "author": "Zhongqi Yue; Qianru Sun; Xian-Sheng Hua; Hanwang Zhang", "abstract": "Existing Unsupervised Domain Adaptation (UDA) literature adopts the covariate shift and conditional shift assumptions, which essentially encourage models to learn common features across domains. However, due to the lack of supervision in the target domain, they suffer from the semantic loss: the feature will inevitably lose non-discriminative semantics in source domain, which is however discriminative in target domain. We use a causal view---transportability theory---to identify that such loss is in fact a confounding effect, which can only be removed by causal intervention. 
However, the theoretical solution provided by transportability is far from practical for UDA, because it requires the stratification and representation of the unobserved confounder that is the cause of the domain gap. To this end, we propose a practical solution: Transporting Causal Mechanisms (TCM), to identify the confounder stratum and representations by using the domain-invariant disentangled causal mechanisms, which are discovered in an unsupervised fashion. Our TCM is both theoretically and empirically grounded. Extensive experiments show that TCM achieves state-of-the-art performance on three challenging UDA benchmarks: ImageCLEF-DA, Office-Home, and VisDA-2017. Codes are available at https://github.com/yue-zhongqi/tcm.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yue_Transporting_Causal_Mechanisms_for_Unsupervised_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -43292,7 +46216,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Yue_2021_ICCV,\n \n author = {\n Yue,\n Zhongqi and Sun,\n Qianru and Hua,\n Xian-Sheng and Zhang,\n Hanwang\n},\n title = {\n Transporting Causal Mechanisms for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8599-8608\n} \n}" }, { "title": "Trash To Treasure: Harvesting OOD Data With Cross-Modal Matching for Open-Set Semi-Supervised Learning", @@ -43300,6 +46225,7 @@ "status": "Poster", "track": "main", "pid": 2731, + "author_site": "Junkai Huang; Chaowei Fang; Weikai Chen; Zhenhua Chai; Xiaolin Wei; Pengxu Wei; Liang Lin; Guanbin Li", "author": "Junkai Huang; Chaowei Fang; Weikai Chen; Zhenhua Chai; Xiaolin Wei; Pengxu Wei; Liang Lin; Guanbin Li", "abstract": "Open-set semi-supervised learning (open-set SSL) 
investigates a challenging but practical scenario where out-of-distribution (OOD) samples are contained in the unlabeled data. While the mainstream technique seeks to completely filter out the OOD samples for semi-supervised learning (SSL), we propose a novel training mechanism that could effectively exploit the presence of OOD data for enhanced feature learning while avoiding its adverse impact on the SSL. We achieve this goal by first introducing a warm-up training that leverages all the unlabeled data, including both the in-distribution (ID) and OOD samples. Specifically, we perform a pretext task that enforces our feature extractor to obtain a high-level semantic understanding of the training images, leading to more discriminative features that can benefit the downstream tasks. Since the OOD samples are inevitably detrimental to SSL, we propose a novel cross-modal matching strategy to detect OOD samples. Instead of directly applying binary classification, we train the network to predict whether the data sample is matched to an assigned one-hot class label. The appeal of the proposed cross-modal matching over binary classification is the ability to generate a compatible feature space that aligns with the core classification task. 
Extensive experiments show that our approach substantially lifts the performance on open-set SSL and outperforms the state-of-the-art by a large margin.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_Trash_To_Treasure_Harvesting_OOD_Data_With_Cross-Modal_Matching_for_ICCV_2021_paper.pdf", @@ -43316,14 +46242,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Huang_Trash_To_Treasure_Harvesting_OOD_Data_With_Cross-Modal_Matching_for_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;3;0;0;0", - "aff_unique_norm": "Sun Yat-sen University;Xidian University;Tencent;Meituan", - "aff_unique_dep": ";;Tencent America;", + "aff_unique_norm": "Sun Yat-sen University;Xidian University;Tencent America;Meituan", + "aff_unique_dep": ";;;", "aff_unique_url": "http://www.sysu.edu.cn/;http://www.xidian.edu.cn/;https://www.tencent.com/en-us;https://www.meituan.com", "aff_unique_abbr": "SYSU;Xidian;Tencent America;Meituan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2021_ICCV,\n \n author = {\n Huang,\n Junkai and Fang,\n Chaowei and Chen,\n Weikai and Chai,\n Zhenhua and Wei,\n Xiaolin and Wei,\n Pengxu and Lin,\n Liang and Li,\n Guanbin\n},\n title = {\n Trash To Treasure: Harvesting OOD Data With Cross-Modal Matching for Open-Set Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8310-8319\n} \n}" }, { "title": "TravelNet: Self-Supervised Physically Plausible Hand Motion Learning From Monocular Color Images", @@ -43331,6 +46258,7 @@ "status": "Poster", "track": "main", "pid": 5727, + "author_site": "Zimeng Zhao; Xi Zhao; Yangang Wang", "author": "Zimeng Zhao; Xi Zhao; Yangang Wang", "abstract": "This paper 
aims to reconstruct physically plausible hand motion from monocular color images. Existing frame-by-frame estimating approaches can not guarantee the physical plausibility (e.g. penetration, jittering) directly. In this paper, we embed physical constraints on the per-frame estimated motions in both spatial and temporal space. Our key idea is to adopt a self-supervised learning strategy to train a novel encoder-decoder, named TravelNet, whose training motion data is prepared by the physics engine using discrete pose states. TravelNet captures key pose states from hand motion sequences as compact motion descriptors, inspired by the concept of keyframes in animation. Finally, it manages to extract those key states out of perturbations without manual annotations, and reconstruct the motions preserving details and physical plausibility. In the experiments, we show that the outputs of the TravelNet contain both finger synergism and time consistency. Through the proposed framework, hand motions can be accurately reconstructed and flexibly re-edited, which is superior to the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_TravelNet_Self-Supervised_Physically_Plausible_Hand_Motion_Learning_From_Monocular_Color_ICCV_2021_paper.pdf", @@ -43354,7 +46282,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Zimeng and Zhao,\n Xi and Wang,\n Yangang\n},\n title = {\n TravelNet: Self-Supervised Physically Plausible Hand Motion Learning From Monocular Color Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11666-11676\n} \n}" }, { "title": "Triggering Failures: Out-of-Distribution Detection by Learning From Local Adversarial Attacks in 
Semantic Segmentation", @@ -43362,10 +46291,11 @@ "status": "Poster", "track": "main", "pid": 3734, + "author_site": "Victor Besnier; Andrei Bursuc; David Picard; Alexandre Briot", "author": "Victor Besnier; Andrei Bursuc; David Picard; Alexandre Briot", "abstract": "In this paper, we tackle the detection of out-of-distribution (OOD) objects in semantic segmentation. By analyzing the literature, we found that current methods are either accurate or fast but not both which limits their usability in real world applications. To get the best of both aspects, we propose to mitigate the common shortcomings by following four design principles: decoupling the OOD detection from the segmentation task, observing the entire segmentation network instead of just its output, generating training data for the OOD detector by leveraging blind spots in the segmentation network and focusing the generated data on localized regions in the image to simulate OOD objects. Our main contribution is a new OOD detection architecture called ObsNet associated with a dedicated training scheme based on Local Adversarial Attacks (LAA). We validate the soundness of our approach across numerous ablation studies. 
We also show it obtains top performances both in speed and accuracy when compared to ten recent methods of the literature on three different datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Besnier_Triggering_Failures_Out-of-Distribution_Detection_by_Learning_From_Local_Adversarial_Attacks_ICCV_2021_paper.pdf", - "aff": "Valeo, Cr\u00e9teil, France+LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vall\u00e9e, France+ETIS UMR8051, CY Universit\u00e9, ENSEA, CNRS, Cergy France; Valeo.ai, Paris, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vall\u00e9e, France; Valeo, Cr\u00e9teil, France", + "aff": "Valeo, Créteil, France+LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vallée, France+ETIS UMR8051, CY Université, ENSEA, CNRS, Cergy France; Valeo.ai, Paris, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vallée, France; Valeo, Créteil, France", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Besnier_Triggering_Failures_Out-of-Distribution_ICCV_2021_supplemental.pdf", @@ -43378,14 +46308,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Besnier_Triggering_Failures_Out-of-Distribution_Detection_by_Learning_From_Local_Adversarial_Attacks_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;3;1;0", - "aff_unique_norm": "Valeo;Ecole des Ponts ParisTech;CY Universit\u00e9;Valeo.ai", + "aff_unique_norm": "Valeo;Ecole des Ponts ParisTech;CY Université;Valeo.ai", "aff_unique_dep": ";LIGM;ETIS UMR8051;", "aff_unique_url": "https://www.valeo.com;https://www.ponts.fr;https://www.univ-cergy.fr;https://www.valeo.ai", "aff_unique_abbr": ";ENPC;CYU;", "aff_campus_unique_index": "1+2;3;1", - "aff_campus_unique": ";Marne-la-Vall\u00e9e;Cergy;Paris", + "aff_campus_unique": ";Marne-la-Vallée;Cergy;Paris", "aff_country_unique_index": "0+0+0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": 
"@InProceedings{Besnier_2021_ICCV,\n \n author = {\n Besnier,\n Victor and Bursuc,\n Andrei and Picard,\n David and Briot,\n Alexandre\n},\n title = {\n Triggering Failures: Out-of-Distribution Detection by Learning From Local Adversarial Attacks in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15701-15710\n} \n}" }, { "title": "Tripartite Information Mining and Integration for Image Matting", @@ -43393,6 +46324,7 @@ "status": "Poster", "track": "main", "pid": 8717, + "author_site": "Yuhao Liu; Jiake Xie; Xiao Shi; Yu Qiao; Yujie Huang; Yong Tang; Xin Yang", "author": "Yuhao Liu; Jiake Xie; Xiao Shi; Yu Qiao; Yujie Huang; Yong Tang; Xin Yang", "abstract": "With the development of deep convolutional neural networks, image matting has ushered in a new phase. Regarding the nature of image matting, most researches have focused on solutions for transition regions. However, we argue that many existing approaches are excessively focused on transition-dominant local fields and ignored the inherent coordination between global information and transition optimisation. In this paper, we propose the Tripartite Information Mining and Integration Network (TIMI-Net) to harmonize the coordination between global and local attributes formally. Specifically, we resort to a novel 3-branch encoder to accomplish comprehensive mining of the input information, which can supplement the neglected coordination between global and local fields. In order to achieve effective and complete interaction between such multi-branches information, we develop the Tripartite Information Integration (TI^2) Module to transform and integrate the interconnections between the different branches. 
In addition, we built a large-scale human matting dataset (Human-2K) to advance human image matting, which consists of 2100 high-precision human images (2000 images for training and 100 images for test). Finally, we conduct extensive experiments to prove the performance of our proposed TIMI-Net, which demonstrates that our method performs favourably against the SOTA approaches on the alphamatting.com (Rank First), Composition-1K (MSE-0.006, Grad-11.5), Distinctions-646 and our Human-2K. Also, we have developed an online evaluation website to perform natural image matting. Project page: https://wukaoliu.github.io/TIMI-Net.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Tripartite_Information_Mining_and_Integration_for_Image_Matting_ICCV_2021_paper.pdf", @@ -43407,7 +46339,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Tripartite_Information_Mining_and_Integration_for_Image_Matting_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Tripartite_Information_Mining_and_Integration_for_Image_Matting_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yuhao and Xie,\n Jiake and Shi,\n Xiao and Qiao,\n Yu and Huang,\n Yujie and Tang,\n Yong and Yang,\n Xin\n},\n title = {\n Tripartite Information Mining and Integration for Image Matting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7555-7564\n} \n}" }, { "title": "TrivialAugment: Tuning-Free Yet State-of-the-Art Data Augmentation", @@ -43415,7 +46348,8 @@ "status": "Poster", "track": "main", "pid": 6598, - "author": "Samuel G. M\u00fcller; Frank Hutter", + "author_site": "Samuel G. Müller; Frank Hutter", + "author": "Samuel G. 
Müller; Frank Hutter", "abstract": "Automatic augmentation methods have recently become a crucial pillar for strong model performance in vision tasks. While existing automatic augmentation methods need to trade off simplicity, cost and performance, we present a most simple baseline, TrivialAugment, that outperforms previous methods for almost free. TrivialAugment is parameter-free and only applies a single augmentation to each image. Thus, TrivialAugment's effectiveness is very unexpected to us and we performed very thorough experiments to study its performance. First, we compare TrivialAugment to previous state-of-the-art methods in a variety of image classification scenarios. Then, we perform multiple ablation studies with different augmentation spaces, augmentation methods and setups to understand the crucial requirements for its performance. Additionally, we provide a simple interface to facilitate the widespread adoption of automatic augmentation methods, as well as our full code base for reproducibility. Since our work reveals a stagnation in many parts of automatic augmentation research, we end with a short proposal of best practices for sustained future progress in automatic augmentation methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Muller_TrivialAugment_Tuning-Free_Yet_State-of-the-Art_Data_Augmentation_ICCV_2021_paper.pdf", "aff": "University of Freiburg; University of Freiburg & Bosch Center for Artificial Intelligence, Germany", @@ -43438,7 +46372,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Muller_2021_ICCV,\n \n author = {\n M\\"uller,\n Samuel G. 
and Hutter,\n Frank\n},\n title = {\n TrivialAugment: Tuning-Free Yet State-of-the-Art Data Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 774-782\n} \n}" }, { "title": "Tune It the Right Way: Unsupervised Validation of Domain Adaptation via Soft Neighborhood Density", @@ -43446,6 +46381,7 @@ "status": "Poster", "track": "main", "pid": 6193, + "author_site": "Kuniaki Saito; Donghyun Kim; Piotr Teterwak; Stan Sclaroff; Trevor Darrell; Kate Saenko", "author": "Kuniaki Saito; Donghyun Kim; Piotr Teterwak; Stan Sclaroff; Trevor Darrell; Kate Saenko", "abstract": "Unsupervised domain adaptation (UDA) methods can dramatically improve generalization on unlabeled target domains. However, optimal hyper-parameter selection is critical to achieving high accuracy and avoiding negative transfer. Supervised hyper-parameter validation is not possible without labeled target data, which raises the question: How can we validate unsupervised adaptation techniques in a realistic way? We first empirically analyze existing criteria and demonstrate that they are not very effective for tuning hyper-parameters. Intuitively, a well-trained source classifier should embed target samples of the same class nearby, forming dense neighborhoods in feature space. Based on this assumption, we propose a novel unsupervised validation criterion that measures the density of soft neighborhoods by computing the entropy of the similarity distribution between points. 
Our criterion is simpler than competing validation methods, yet more effective; it can tune hyper-parameters and the number of training iterations in both image classification and semantic segmentation models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Saito_Tune_It_the_Right_Way_Unsupervised_Validation_of_Domain_Adaptation_ICCV_2021_paper.pdf", @@ -43469,7 +46405,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Saito_2021_ICCV,\n \n author = {\n Saito,\n Kuniaki and Kim,\n Donghyun and Teterwak,\n Piotr and Sclaroff,\n Stan and Darrell,\n Trevor and Saenko,\n Kate\n},\n title = {\n Tune It the Right Way: Unsupervised Validation of Domain Adaptation via Soft Neighborhood Density\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9184-9193\n} \n}" }, { "title": "UASNet: Uncertainty Adaptive Sampling Network for Deep Stereo Matching", @@ -43477,6 +46414,7 @@ "status": "Poster", "track": "main", "pid": 5578, + "author_site": "Yamin Mao; Zhihua Liu; Weiming Li; Yuchao Dai; Qiang Wang; Yun-Tae Kim; Hong-Seok Lee", "author": "Yamin Mao; Zhihua Liu; Weiming Li; Yuchao Dai; Qiang Wang; Yun-Tae Kim; Hong-Seok Lee", "abstract": "Recent studies have shown that cascade cost volume can play a vital role in deep stereo matching to achieve high resolution depth map with efficient hardware usage. However, how to construct good cascade volume as well as effective sampling for them are still under in-depth study. Previous cascade-based methods usually perform uniform sampling in a predicted disparity range based on variance, which easily misses the ground truth disparity and decreases disparity map accuracy. 
In this paper, we propose an uncertainty adaptive sampling network (UASNet) featuring two modules: an uncertainty distribution-guided range prediction (URP) model and an uncertainty-based disparity sampler (UDS) module. The URP explores the more discriminative uncertainty distribution to handle the complex matching ambiguities and to improve disparity range prediction. The UDS adaptively adjusts sampling interval to localize disparity with improved accuracy. With the proposed modules, our UASNet learns to construct cascade cost volume and predict full-resolution disparity map directly. Extensive experiments show that the proposed method achieves the highest ground truth covering ratio compared with other cascade cost volume based stereo matching methods. Our method also achieves top performance on both SceneFlow dataset and KITTI benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mao_UASNet_Uncertainty_Adaptive_Sampling_Network_for_Deep_Stereo_Matching_ICCV_2021_paper.pdf", @@ -43491,7 +46429,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mao_UASNet_Uncertainty_Adaptive_Sampling_Network_for_Deep_Stereo_Matching_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mao_UASNet_Uncertainty_Adaptive_Sampling_Network_for_Deep_Stereo_Matching_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Mao_2021_ICCV,\n \n author = {\n Mao,\n Yamin and Liu,\n Zhihua and Li,\n Weiming and Dai,\n Yuchao and Wang,\n Qiang and Kim,\n Yun-Tae and Lee,\n Hong-Seok\n},\n title = {\n UASNet: Uncertainty Adaptive Sampling Network for Deep Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6311-6319\n} \n}" }, { "title": "UNISURF: Unifying Neural Implicit Surfaces and Radiance Fields for Multi-View Reconstruction", @@ -43499,10 
+46438,11 @@ "status": "Poster", "track": "main", "pid": 1194, + "author_site": "Michael Oechsle; Songyou Peng; Andreas Geiger", "author": "Michael Oechsle; Songyou Peng; Andreas Geiger", "abstract": "Neural implicit 3D representations have emerged as a powerful paradigm for reconstructing surfaces from multi-view images and synthesizing novel views. Unfortunately, existing methods such as DVR or IDR require accurate per-pixel object masks as supervision. At the same time, neural radiance fields have revolutionized novel view synthesis. However, NeRF's estimated volume density does not admit accurate surface reconstruction. Our key insight is that implicit surface models and radiance fields can be formulated in a unified way, enabling both surface and volume rendering using the same model. This unified perspective enables novel, more efficient sampling procedures and the ability to reconstruct accurate surfaces without input masks. We compare our method on the DTU, BlendedMVS, and a synthetic indoor dataset. 
Our experiments demonstrate that we outperform NeRF in terms of reconstruction quality while performing on par with IDR without requiring masks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Oechsle_UNISURF_Unifying_Neural_Implicit_Surfaces_and_Radiance_Fields_for_Multi-View_ICCV_2021_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen + University of T\u00fcbingen + ETAS GmbH, Stuttgart; Max Planck Institute for Intelligent Systems, T\u00fcbingen + ETH Zurich; Max Planck Institute for Intelligent Systems, T\u00fcbingen + University of T\u00fcbingen", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen + University of Tübingen + ETAS GmbH, Stuttgart; Max Planck Institute for Intelligent Systems, Tübingen + ETH Zurich; Max Planck Institute for Intelligent Systems, Tübingen + University of Tübingen", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Oechsle_UNISURF_Unifying_Neural_ICCV_2021_supplemental.pdf", @@ -43515,14 +46455,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Oechsle_UNISURF_Unifying_Neural_Implicit_Surfaces_and_Radiance_Fields_for_Multi-View_ICCV_2021_paper.html", "aff_unique_index": "0+1+2;0+3;0+1", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of T\u00fcbingen;ETAS GmbH;ETH Zurich", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Tübingen;ETAS GmbH;ETH Zurich", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/;https://www.etas.com;https://www.ethz.ch", - "aff_unique_abbr": "MPI-IS;Uni T\u00fcbingen;;ETHZ", + "aff_unique_abbr": "MPI-IS;Uni Tübingen;;ETHZ", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+0+0;0+1;0+0", - "aff_country_unique": "Germany;Switzerland" + "aff_country_unique": 
"Germany;Switzerland", + "bibtex": "@InProceedings{Oechsle_2021_ICCV,\n \n author = {\n Oechsle,\n Michael and Peng,\n Songyou and Geiger,\n Andreas\n},\n title = {\n UNISURF: Unifying Neural Implicit Surfaces and Radiance Fields for Multi-View Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5589-5599\n} \n}" }, { "title": "UVStyle-Net: Unsupervised Few-Shot Learning of 3D Style Similarity Measure for B-Reps", @@ -43530,6 +46471,7 @@ "status": "Poster", "track": "main", "pid": 8737, + "author_site": "Peter Meltzer; Hooman Shayani; Amir Khasahmadi; Pradeep Kumar Jayaraman; Aditya Sanghi; Joseph Lambourne", "author": "Peter Meltzer; Hooman Shayani; Amir Khasahmadi; Pradeep Kumar Jayaraman; Aditya Sanghi; Joseph Lambourne", "abstract": "Boundary Representations (B-Reps) are the industry standard in 3D Computer Aided Design/Manufacturing (CAD/CAM) and industrial design due to their fidelity in representing stylistic details. However, they have been ignored in the 3D style research. Existing 3D style metrics typically operate on meshes or point clouds, and fail to account for end-user subjectivity by adopting fixed definitions of style, either through crowd-sourcing for style labels or hand-crafted features. We propose UVStyle-Net, a style similarity measure for B-Reps that leverages the style signals in the second order statistics of the activations in a pre-trained (unsupervised) 3D encoder, and learns their relative importance to a subjective end-user through few-shot learning. Our approach differs from all existing data-driven 3D style methods since it may be used in completely unsupervised settings, which is desirable given the lack of publicly available labeled B-Rep datasets. More importantly, the few-shot learning accounts for the inherent subjectivity associated with style. 
We show quantitatively that our proposed method with B-Reps is able to capture stronger style signals than alternative methods on meshes and point clouds despite its significantly greater computational efficiency. We also show it is able to generate meaningful style gradients with respect to the input shape, and that few-shot learning with as few as two positive examples selected by an end-user is sufficient to significantly improve the style measure. Finally, we demonstrate its efficacy on a large unlabeled public dataset of CAD models. Source code and data are available at https://github.com/AutodeskAILab/UVStyle-Net.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Meltzer_UVStyle-Net_Unsupervised_Few-Shot_Learning_of_3D_Style_Similarity_Measure_for_ICCV_2021_paper.pdf", @@ -43544,7 +46486,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meltzer_UVStyle-Net_Unsupervised_Few-Shot_Learning_of_3D_Style_Similarity_Measure_for_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Meltzer_UVStyle-Net_Unsupervised_Few-Shot_Learning_of_3D_Style_Similarity_Measure_for_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Meltzer_2021_ICCV,\n \n author = {\n Meltzer,\n Peter and Shayani,\n Hooman and Khasahmadi,\n Amir and Jayaraman,\n Pradeep Kumar and Sanghi,\n Aditya and Lambourne,\n Joseph\n},\n title = {\n UVStyle-Net: Unsupervised Few-Shot Learning of 3D Style Similarity Measure for B-Reps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9690-9699\n} \n}" }, { "title": "Ultra-High-Definition Image HDR Reconstruction via Collaborative Bilateral Learning", @@ -43552,10 +46495,11 @@ "status": "Poster", "track": "main", "pid": 5573, + "author_site": "Zhuoran Zheng; Wenqi Ren; Xiaochun Cao; Tao Wang; Xiuyi Jia", "author": "Zhuoran Zheng; 
Wenqi Ren; Xiaochun Cao; Tao Wang; Xiuyi Jia", "abstract": "Existing single image high dynamic range (HDR) reconstruction attempt to expand the range of luminance. They are not effective to generate plausible textures and colors in the reconstructed results, especially for high-density pixels in ultra-high-definition (UHD) images.To address these problems, we propose a new HDR reconstruction network for UHD images by collaboratively learning color and texture details. First, we propose a dual-path network to extract content and chromatic features at a reduced resolution of the low dynamic range (LDR) input. These two types features are used to fit bilatera-space affine models for real-time HDR reconstruction. To extract the main data structure of the LDR input, we propose to use 3D Tucker decomposition and reconstruction to prevents false edges and noise amplification in the learned bilateral grid. As a result, the high-quality content and chromatic features can be reconstructed capitalized on guided bilateral upsampling. 
Finally, we fuse these two full-resolution feature maps into the HDR reconstructed results.Our proposed method can achieve real-time processing for UHD image (about 160 fps).Experimental results demonstrate that the proposed algorithm performs favorably against the state-of-the-art HDR reconstruction approaches on public benchmarks and real-world UHD images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Ultra-High-Definition_Image_HDR_Reconstruction_via_Collaborative_Bilateral_Learning_ICCV_2021_paper.pdf", - "aff": "School of Computer Science and Engineering, Nanjing University of Science and Technology + Jiangsu Key Laboratory of Image and Video Understanding for Social Safety, Nanjing University of Science and Technology + SKLOIS, IIE, CAS; Jiangsu Key Laboratory of Image and Video Understanding for Social Safety, Nanjing University of Science and Technology + SKLOIS, IIE, CAS; SKLOIS, IIE, CAS; Huawei Noah\u2019s Ark Lab; School of Computer Science and Engineering, Nanjing University of Science and Technology + Jiangsu Key Laboratory of Image and Video Understanding for Social Safety, Nanjing University of Science and Technology", + "aff": "School of Computer Science and Engineering, Nanjing University of Science and Technology + Jiangsu Key Laboratory of Image and Video Understanding for Social Safety, Nanjing University of Science and Technology + SKLOIS, IIE, CAS; Jiangsu Key Laboratory of Image and Video Understanding for Social Safety, Nanjing University of Science and Technology + SKLOIS, IIE, CAS; SKLOIS, IIE, CAS; Huawei Noah’s Ark Lab; School of Computer Science and Engineering, Nanjing University of Science and Technology + Jiangsu Key Laboratory of Image and Video Understanding for Social Safety, Nanjing University of Science and Technology", "project": "", "github": "", "supp": "", @@ -43569,13 +46513,14 @@ "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Ultra-High-Definition_Image_HDR_Reconstruction_via_Collaborative_Bilateral_Learning_ICCV_2021_paper.html", "aff_unique_index": "0+0+1;0+1;1;2;0+0", "aff_unique_norm": "Nanjing University of Science and Technology;Institute of Information Engineering, Chinese Academy of Sciences;Huawei", - "aff_unique_dep": "School of Computer Science and Engineering;SKLOIS (State Key Laboratory of Information Security);Noah\u2019s Ark Lab", + "aff_unique_dep": "School of Computer Science and Engineering;SKLOIS (State Key Laboratory of Information Security);Noah’s Ark Lab", "aff_unique_url": "http://www.nust.edu.cn;http://www.iie.cas.cn;https://www.huawei.com", "aff_unique_abbr": "NUST;IIE, CAS;Huawei", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0+0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Zhuoran and Ren,\n Wenqi and Cao,\n Xiaochun and Wang,\n Tao and Jia,\n Xiuyi\n},\n title = {\n Ultra-High-Definition Image HDR Reconstruction via Collaborative Bilateral Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4449-4458\n} \n}" }, { "title": "UltraPose: Synthesizing Dense Pose With 1 Billion Points by Human-Body Decoupling 3D Model", @@ -43583,6 +46528,7 @@ "status": "Poster", "track": "main", "pid": 6361, + "author_site": "Haonan Yan; Jiaqi Chen; Xujie Zhang; Shengkai Zhang; Nianhong Jiao; Xiaodan Liang; Tianxiang Zheng", "author": "Haonan Yan; Jiaqi Chen; Xujie Zhang; Shengkai Zhang; Nianhong Jiao; Xiaodan Liang; Tianxiang Zheng", "abstract": "Recovering dense human poses from images plays a critical role in establishing an image-to-surface correspondence between RGB images and the 3D surface of the human body, serving the foundation of 
rich real-world applications, such as virtual humans, monocular-to-3d reconstruction. However, the popular DensePose-COCO dataset relies on a sophisticated manual annotation system, leading to severe limitations in acquiring the denser and more accurate annotated pose resources. In this work, we introduce a new 3D human-body model with a series of decoupled parameters that could freely control the generation of the body. Furthermore, we build a data generation system based on this decoupling 3D model, and construct an ultra dense synthetic benchmark UltraPose, containing around 1.3 billion corresponding points. Compared to the existing manually annotated DensePose-COCO dataset, the synthetic UltraPose has ultra dense image-to-surface correspondences without annotation cost and error. Our proposed UltraPose provides the largest benchmark and data resources for lifting the model capability in predicting more accurate dense poses. To promote future researches in this field, we also propose a transformer-based method to model the dense correspondence between 2D and 3D worlds. 
The proposed model trained on synthetic UltraPose can be applied to real-world scenarios, indicating the effectiveness of our benchmark and model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yan_UltraPose_Synthesizing_Dense_Pose_With_1_Billion_Points_by_Human-Body_ICCV_2021_paper.pdf", @@ -43606,7 +46552,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2021_ICCV,\n \n author = {\n Yan,\n Haonan and Chen,\n Jiaqi and Zhang,\n Xujie and Zhang,\n Shengkai and Jiao,\n Nianhong and Liang,\n Xiaodan and Zheng,\n Tianxiang\n},\n title = {\n UltraPose: Synthesizing Dense Pose With 1 Billion Points by Human-Body Decoupling 3D Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10891-10900\n} \n}" }, { "title": "Unaligned Image-to-Image Translation by Learning to Reweight", @@ -43614,6 +46561,7 @@ "status": "Poster", "track": "main", "pid": 3993, + "author_site": "Shaoan Xie; Mingming Gong; Yanwu Xu; Kun Zhang", "author": "Shaoan Xie; Mingming Gong; Yanwu Xu; Kun Zhang", "abstract": "Unsupervised image-to-image translation aims at learning the mapping from the source to target domain without using paired images for training. An essential yet restrictive assumption for unsupervised image translation is that the two domains are aligned, e.g., for the selfie2anime task, the anime (selfie) domain must contain only anime (selfie) face images that can be translated to some images in the other domain. Collecting aligned domains can be laborious and needs lots of attention. In this paper, we consider the task of image translation between two unaligned domains, which may arise for various possible reasons. 
To solve this problem, we propose to select images based on importance reweighting and develop a method to learn the weights and perform translation simultaneously and automatically. We compare the proposed method with state-of-the-art image translation approaches and present qualitative and quantitative results on different tasks with unaligned domains. Extensive empirical evidence demonstrates the usefulness of the proposed problem formulation and the superiority of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_Unaligned_Image-to-Image_Translation_by_Learning_to_Reweight_ICCV_2021_paper.pdf", @@ -43628,7 +46576,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xie_Unaligned_Image-to-Image_Translation_by_Learning_to_Reweight_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xie_Unaligned_Image-to-Image_Translation_by_Learning_to_Reweight_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Shaoan and Gong,\n Mingming and Xu,\n Yanwu and Zhang,\n Kun\n},\n title = {\n Unaligned Image-to-Image Translation by Learning to Reweight\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14174-14184\n} \n}" }, { "title": "Uncertainty-Aware Human Mesh Recovery From Video by Learning Part-Based 3D Dynamics", @@ -43636,6 +46585,7 @@ "status": "Poster", "track": "main", "pid": 2388, + "author_site": "Gun-Hee Lee; Seong-Whan Lee", "author": "Gun-Hee Lee; Seong-Whan Lee", "abstract": "Despite the recent success of 3D human reconstruction methods, recovering the accurate and smooth 3D human motion from video is still challenging. Designing a temporal model in the encoding stage is not sufficient enough to settle the trade-off problem between the per-frame accuracy and the motion smoothness. 
To address this problem, we approach some of the fundamental problems of 3D reconstruction tasks, simultaneously predicting 3D pose and 3D motion dynamics. First, we utilize the power of uncertainty to address the problem of multiple 3D configurations resulting in the same 2D projections. Second, we confirmed that dividing the body into local regions shows outstanding results for estimating 3D motion dynamics. In this paper, we propose (i) an encoder that makes two different estimations: a static feature that presents 2D pose feature as distribution and a dynamic feature that includes optical flow information and (ii) a decoder that divides the body into five different local regions to estimate the 3D motion dynamics of each region. We demonstrate how our method recovers the accurate and smooth motion and achieves the state-of-the-art results for both constrained and in-the-wild videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Uncertainty-Aware_Human_Mesh_Recovery_From_Video_by_Learning_Part-Based_3D_ICCV_2021_paper.pdf", @@ -43659,7 +46609,8 @@ "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Gun-Hee and Lee,\n Seong-Whan\n},\n title = {\n Uncertainty-Aware Human Mesh Recovery From Video by Learning Part-Based 3D Dynamics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12375-12384\n} \n}" }, { "title": "Uncertainty-Aware Pseudo Label Refinery for Domain Adaptive Semantic Segmentation", @@ -43667,6 +46618,7 @@ "status": "Poster", "track": "main", "pid": 5676, + "author_site": "Yuxi Wang; Junran Peng; ZhaoXiang Zhang", "author": "Yuxi Wang; Junran Peng; ZhaoXiang Zhang", "abstract": "Unsupervised domain adaptation for 
semantic segmentation aims to assign the pixel-level labels for unlabeled target domain by transferring knowledge from the labeled source domain. A typical self-supervised learning approach generates pseudo labels from the source model and then re-trains the model to fit the target distribution. However, it suffers from noisy pseudo labels due to the existence of domain shift. Related works alleviate this problem by selecting high-confidence predictions, but uncertain classes with low confidence scores have rarely been considered. This informative uncertainty is essential to enhance feature representation and align source and target domains. In this paper, we propose a novel uncertainty-aware pseudo label refinery framework considering two crucial factors simultaneously. First, we progressively enhance the feature alignment model via the target-guided uncertainty rectifying framework. Second, we provide an uncertainty-aware pseudo label assignment strategy without any manually designed threshold to reduce the noisy labels. 
Extensive experiments demonstrate the effectiveness of our proposed approach and achieve state-of-the-art performance on two standard synthetic-2-real tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Uncertainty-Aware_Pseudo_Label_Refinery_for_Domain_Adaptive_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -43690,7 +46642,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0+0+0+0;0;0+0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yuxi and Peng,\n Junran and Zhang,\n ZhaoXiang\n},\n title = {\n Uncertainty-Aware Pseudo Label Refinery for Domain Adaptive Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9092-9101\n} \n}" }, { "title": "Uncertainty-Guided Transformer Reasoning for Camouflaged Object Detection", @@ -43698,6 +46651,7 @@ "status": "Poster", "track": "main", "pid": 5583, + "author_site": "Fan Yang; Qiang Zhai; Xin Li; Rui Huang; Ao Luo; Hong Cheng; Deng-Ping Fan", "author": "Fan Yang; Qiang Zhai; Xin Li; Rui Huang; Ao Luo; Hong Cheng; Deng-Ping Fan", "abstract": "Spotting objects that are visually adapted to their surroundings is challenging for both humans and AI. Conventional generic / salient object detection techniques are suboptimal for this task because they tend to only discover easy and clear objects, while overlooking the difficult-to-detect ones with inherent uncertainties derived from indistinguishable textures. In this work, we contribute a novel approach using a probabilistic representational model in combination with transformers to explicitly reason under uncertainties, namely uncertainty-guided transformer reasoning (UGTR), for camouflaged object detection. 
The core idea is to first learn a conditional distribution over the backbone's output to obtain initial estimates and associated uncertainties, and then reason over these uncertain regions with attention mechanism to produce final predictions. Our approach combines the benefits of both Bayesian learning and Transformer-based reasoning, allowing the model to handle camouflaged object detection by leveraging both deterministic and probabilistic information. We empirically demonstrate that our proposed approach can achieve higher accuracy than existing state-of-the-art models on CHAMELEON, CAMO and COD10K datasets. Code is available at https://github.com/fanyang587/UGTR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Uncertainty-Guided_Transformer_Reasoning_for_Camouflaged_Object_Detection_ICCV_2021_paper.pdf", @@ -43721,7 +46675,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1;2;1;1;2", - "aff_country_unique": ";China;United States" + "aff_country_unique": ";China;United States", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Fan and Zhai,\n Qiang and Li,\n Xin and Huang,\n Rui and Luo,\n Ao and Cheng,\n Hong and Fan,\n Deng-Ping\n},\n title = {\n Uncertainty-Guided Transformer Reasoning for Camouflaged Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4146-4155\n} \n}" }, { "title": "Unconditional Scene Graph Generation", @@ -43729,10 +46684,11 @@ "status": "Poster", "track": "main", "pid": 1554, + "author_site": "Sarthak Garg; Helisa Dhamo; Azade Farshad; Sabrina Musatian; Nassir Navab; Federico Tombari", "author": "Sarthak Garg; Helisa Dhamo; Azade Farshad; Sabrina Musatian; Nassir Navab; Federico Tombari", "abstract": "Despite recent advancements in single-domain or single-object image generation, it is still challenging to generate complex 
scenes containing diverse, multiple objects and their interactions. Scene graphs, composed of nodes as objects and directed-edges as relationships among objects, offer an alternative representation of a scene that is more semantically grounded than images. We hypothesize that a generative model for scene graphs might be able to learn the underlying semantic structure of real-world scenes more effectively than images, and hence, generate realistic novel scenes in the form of scene graphs. In this work, we explore a new task for the unconditional generation of semantic scene graphs. We develop a deep auto-regressive model called SceneGraphGen which can directly learn the probability distribution over labelled and directed graphs using a hierarchical recurrent architecture. The model takes a seed object as input and generates a scene graph in a sequence of steps, each step generating an object node, followed by a sequence of relationship edges connecting to the previous nodes. We show that the scene graphs generated by SceneGraphGen are diverse and follow the semantic patterns of real-world scenes. 
Additionally, we demonstrate the application of the generated graphs in image synthesis, anomaly detection and scene graph completion.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Garg_Unconditional_Scene_Graph_Generation_ICCV_2021_paper.pdf", - "aff": "Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen; Technische Universit \u00a8at M \u00a8unchen+Google", + "aff": "Technische Universität München; Technische Universität München; Technische Universität München; Technische Universität München; Technische Universität München; Technische Universität München+Google", "project": "https://SceneGraphGen.github.io/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Garg_Unconditional_Scene_Graph_ICCV_2021_supplemental.pdf", @@ -43745,14 +46701,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Garg_Unconditional_Scene_Graph_Generation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0+1", - "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Google", - "aff_unique_dep": ";Google", + "aff_unique_norm": "Technische Universität München;Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Garg_2021_ICCV,\n \n author = {\n Garg,\n Sarthak and Dhamo,\n Helisa and Farshad,\n Azade and Musatian,\n Sabrina and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n Unconditional Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16362-16371\n} \n}" }, { "title": "Unconstrained Scene Generation With Locally Conditioned Radiance Fields", @@ -43760,6 +46717,7 @@ "status": "Poster", "track": "main", "pid": 2128, + "author_site": "Terrance DeVries; Miguel Angel Bautista; Nitish Srivastava; Graham W. Taylor; Joshua M. Susskind", "author": "Terrance DeVries; Miguel Angel Bautista; Nitish Srivastava; Graham W. Taylor; Joshua M. Susskind", "abstract": "We tackle the challenge of learning a distribution over complex, realistic, indoor scenes. In this paper, we introduce Generative Scene Networks (GSN), which learns to decompose scenes into a collection of many local radiance fields that can be rendered from a free moving camera. Our model can be used as a prior to generate new scenes, or to complete a scene given only sparse 2D observations. Recent work has shown that generative models of radiance fields can capture properties such as multi-view consistency and view-dependent lighting. However, these models are specialized for constrained viewing of single objects, such as cars or faces. Due to the size and complexity of realistic indoor environments, existing models lack the representational capacity to adequately capture them. Our decomposition scheme scales to larger and more complex scenes while preserving details and diversity, and the learned prior enables high-quality rendering from view-points that are significantly different from observed viewpoints. 
When compared to existing models, GSN produces quantitatively higher quality scene renderings across several different scene datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/DeVries_Unconstrained_Scene_Generation_With_Locally_Conditioned_Radiance_Fields_ICCV_2021_paper.pdf", @@ -43776,14 +46734,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/DeVries_Unconstrained_Scene_Generation_With_Locally_Conditioned_Radiance_Fields_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1+2;0", - "aff_unique_norm": "Apple;University of Guelph;Vector Institute", - "aff_unique_dep": "Apple Inc.;;", + "aff_unique_norm": "Apple Inc.;University of Guelph;Vector Institute", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.apple.com;https://www.uoguelph.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "Apple;U of G;Vector Institute", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1+1;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{DeVries_2021_ICCV,\n \n author = {\n DeVries,\n Terrance and Bautista,\n Miguel Angel and Srivastava,\n Nitish and Taylor,\n Graham W. 
and Susskind,\n Joshua M.\n},\n title = {\n Unconstrained Scene Generation With Locally Conditioned Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14304-14313\n} \n}" }, { "title": "Understanding Robustness of Transformers for Image Classification", @@ -43791,6 +46750,7 @@ "status": "Poster", "track": "main", "pid": 10364, + "author_site": "Srinadh Bhojanapalli; Ayan Chakrabarti; Daniel Glasner; Daliang Li; Thomas Unterthiner; Andreas Veit", "author": "Srinadh Bhojanapalli; Ayan Chakrabarti; Daniel Glasner; Daliang Li; Thomas Unterthiner; Andreas Veit", "abstract": "Deep Convolutional Neural Networks (CNNs) have long been the architecture of choice for computer vision tasks. Recently, Transformer-based architectures like Vision Transformer (ViT) have matched or even surpassed ResNets for image classification. However, details of the Transformer architecture such as the use of non-overlapping patches lead one to wonder whether these networks are as robust. In this paper, we perform an extensive study of a variety of different measures of robustness of ViT models and compare the findings to ResNet baselines. We investigate robustness to input perturbations as well as robustness to model perturbations. We find that when pre-trained with a sufficient amount of data, ViT models are at least as robust as the ResNet counterparts on a broad range of perturbations. 
We also find that Transformers are robust to the removal of almost any single layer, and that while activations from later layers are highly correlated with each other, they nevertheless play an important role in classification.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bhojanapalli_Understanding_Robustness_of_Transformers_for_Image_Classification_ICCV_2021_paper.pdf", @@ -43814,7 +46774,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bhojanapalli_2021_ICCV,\n \n author = {\n Bhojanapalli,\n Srinadh and Chakrabarti,\n Ayan and Glasner,\n Daniel and Li,\n Daliang and Unterthiner,\n Thomas and Veit,\n Andreas\n},\n title = {\n Understanding Robustness of Transformers for Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10231-10241\n} \n}" }, { "title": "Understanding and Evaluating Racial Biases in Image Captioning", @@ -43822,6 +46783,7 @@ "status": "Poster", "track": "main", "pid": 7878, + "author_site": "Dora Zhao; Angelina Wang; Olga Russakovsky", "author": "Dora Zhao; Angelina Wang; Olga Russakovsky", "abstract": "Image captioning is an important task for benchmarking visual reasoning and for enabling accessibility for people with vision impairments. However, as in many machine learning settings, social biases can influence image captioning in undesirable ways. In this work, we study bias propagation pathways within image captioning, focusing specifically on the COCO dataset. Prior work has analyzed gender bias in captions using automatically-derived gender labels; here we examine racial and intersectional biases using manual annotations. 
Our first contribution is in annotating the perceived gender and skin color of 28,315 of the depicted people after obtaining IRB approval. Using these annotations, we compare racial biases present in both manual and automatically-generated image captions. We demonstrate differences in caption performance, sentiment, and word choice between images of lighter versus darker-skinned people. Further, we find the magnitude of these differences to be greater in modern captioning systems compared to older ones, thus leading to concerns that without proper consideration and mitigation these differences will only become increasingly prevalent. Code and data is available at https://princetonvisualai.github.io/imagecaptioning-bias/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Understanding_and_Evaluating_Racial_Biases_in_Image_Captioning_ICCV_2021_paper.pdf", @@ -43845,7 +46807,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Dora and Wang,\n Angelina and Russakovsky,\n Olga\n},\n title = {\n Understanding and Evaluating Racial Biases in Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14830-14840\n} \n}" }, { "title": "Understanding and Mitigating Annotation Bias in Facial Expression Recognition", @@ -43853,6 +46816,7 @@ "status": "Poster", "track": "main", "pid": 7255, + "author_site": "Yunliang Chen; Jungseock Joo", "author": "Yunliang Chen; Jungseock Joo", "abstract": "The performance of a computer vision model depends on the size and quality of its training data. 
Recent studies have unveiled previously-unknown composition biases in common image datasets which then lead to skewed model outputs, and have proposed methods to mitigate these biases. However, most existing works assume that human-generated annotations can be considered gold-standard and unbiased. In this paper, we reveal that this assumption can be problematic, and that special care should be taken to prevent models from learning such annotation biases. We focus on facial expression recognition and compare the label biases between lab-controlled and in-the-wild datasets. We demonstrate that many expression datasets contain significant annotation biases between genders, especially when it comes to the happy and angry expressions, and that traditional methods cannot fully mitigate such biases in trained models. To remove expression annotation bias, we propose an AU-Calibrated Facial Expression Recognition (AUC-FER) framework that utilizes facial action units (AUs) and incorporates the triplet loss into the objective function. 
Experimental results suggest that the proposed method is more effective in removing expression annotation bias than existing techniques.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Understanding_and_Mitigating_Annotation_Bias_in_Facial_Expression_Recognition_ICCV_2021_paper.pdf", @@ -43876,7 +46840,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Yunliang and Joo,\n Jungseock\n},\n title = {\n Understanding and Mitigating Annotation Bias in Facial Expression Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14980-14991\n} \n}" }, { "title": "UniT: Multimodal Multitask Learning With a Unified Transformer", @@ -43884,6 +46849,7 @@ "status": "Poster", "track": "main", "pid": 5428, + "author_site": "Ronghang Hu; Amanpreet Singh", "author": "Ronghang Hu; Amanpreet Singh", "abstract": "We propose UniT, a Unified Transformer model to simultaneously learn the most prominent tasks across different domains, ranging from object detection to natural language understanding and multimodal reasoning. Based on the transformer encoder-decoder architecture, our UniT model encodes each input modality with an encoder and makes predictions on each task with a shared decoder over the encoded input representations, followed by task-specific output heads. The entire model is jointly trained end-to-end with losses from each task. Compared to previous efforts on multi-task learning with transformers, we share the same model parameters across all tasks instead of separately fine-tuning task-specific models and handle a much higher variety of tasks across different domains. 
In our experiments, we learn 7 tasks jointly over 8 datasets, achieving strong performance on each task with significantly fewer parameters. Our code is available in MMF at https://mmf.sh.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_UniT_Multimodal_Multitask_Learning_With_a_Unified_Transformer_ICCV_2021_paper.pdf", @@ -43900,14 +46866,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_UniT_Multimodal_Multitask_Learning_With_a_Unified_Transformer_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Ronghang and Singh,\n Amanpreet\n},\n title = {\n UniT: Multimodal Multitask Learning With a Unified Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1439-1449\n} \n}" }, { "title": "Unidentified Video Objects: A Benchmark for Dense, Open-World Segmentation", @@ -43915,6 +46882,7 @@ "status": "Poster", "track": "main", "pid": 8101, + "author_site": "Weiyao Wang; Matt Feiszli; Heng Wang; Du Tran", "author": "Weiyao Wang; Matt Feiszli; Heng Wang; Du Tran", "abstract": "Current state-of-the-art object detection and segmentation methods work well under the closed-world assumption. This closed-world setting assumes that the list of object categories is available during training and deployment. However, many real-world applications require detecting or segmenting novel objects, i.e., object categories never seen during training. 
In this paper, we present, UVO (Unidentified Video Objects), a new benchmark for open-world class-agnostic object segmentation in videos. Besides shifting the focus to the open-world setup, UVO is significantly larger, providing approximately 6 times more videos compared with DAVIS, and 7 times more mask (instance) annotations per video compared with YouTube-VO(I)S. UVO is also more challenging as it includes many videos with crowded scenes and complex background motions. We also demonstrated that UVO can be used for other applications, such as object tracking and super-voxel segmentation. We believe that UVO is a versatile testbed for researchers to develop novel approaches for open-world class-agnostic object segmentation, and inspires new research directions towards a more comprehensive video understanding beyond classification and detection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Unidentified_Video_Objects_A_Benchmark_for_Dense_Open-World_Segmentation_ICCV_2021_paper.pdf", @@ -43931,14 +46899,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Unidentified_Video_Objects_A_Benchmark_for_Dense_Open-World_Segmentation_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Facebook", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Weiyao and Feiszli,\n Matt and Wang,\n Heng and Tran,\n Du\n},\n title = {\n Unidentified Video Objects: A Benchmark for Dense,\n Open-World Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages 
= {\n 10776-10785\n} \n}" }, { "title": "Unified Graph Structured Models for Video Understanding", @@ -43946,6 +46915,7 @@ "status": "Poster", "track": "main", "pid": 3807, + "author_site": "Anurag Arnab; Chen Sun; Cordelia Schmid", "author": "Anurag Arnab; Chen Sun; Cordelia Schmid", "abstract": "Accurate video understanding involves reasoning about the relationships between actors, objects and their environment, often over long temporal intervals. In this paper, we propose a message passing graph neural network that explicitly models these spatio-temporal relations and can use explicit representations of objects, when supervision is available, and implicit representations otherwise. Our formulation generalises previous structured models for video understanding, and allows us to study how different design choices in graph structure and representation affect the model's performance. We demonstrate our method on two different tasks requiring relational reasoning in videos -- spatio-temporal action detection on AVA and UCF101-24, and video scene graph classification on the recent Action Genome dataset -- and achieve state-of-the-art results on all three datasets. 
Furthermore, we show quantitatively and qualitatively how our method is able to more effectively model relationships between relevant entities in the scene.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Arnab_Unified_Graph_Structured_Models_for_Video_Understanding_ICCV_2021_paper.pdf", @@ -43969,7 +46939,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Arnab_2021_ICCV,\n \n author = {\n Arnab,\n Anurag and Sun,\n Chen and Schmid,\n Cordelia\n},\n title = {\n Unified Graph Structured Models for Video Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8117-8126\n} \n}" }, { "title": "Unified Questioner Transformer for Descriptive Question Generation in Goal-Oriented Visual Dialogue", @@ -43977,6 +46948,7 @@ "status": "Poster", "track": "main", "pid": 7620, + "author_site": "Shoya Matsumori; Kosuke Shingyouchi; Yuki Abe; Yosuke Fukuchi; Komei Sugiura; Michita Imai", "author": "Shoya Matsumori; Kosuke Shingyouchi; Yuki Abe; Yosuke Fukuchi; Komei Sugiura; Michita Imai", "abstract": "Building an interactive artificial intelligence that can ask questions about the real world is one of the biggest challenges for vision and language problems. In particular, goal-oriented visual dialogue, where the aim of the agent is to seek information by asking questions during a turn-taking dialogue, has been gaining scholarly attention recently. While several existing models based on the GuessWhat?! dataset have been proposed, the Questioner typically asks simple category-based questions or absolute spatial questions. 
This might be problematic for complex scenes where the objects share attributes or in cases where descriptive questions are required to distinguish objects. In this paper, we propose a novel Questioner architecture, called Unified Questioner Transformer (UniQer), for descriptive question generation with referring expressions. In addition, we build a goal-oriented visual dialogue task called CLEVR Ask. It synthesizes complex scenes that require the Questioner to generate descriptive questions. We train our model with two variants of CLEVR Ask datasets. The results of the quantitative and qualitative evaluations show that UniQer outperforms the baseline.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Matsumori_Unified_Questioner_Transformer_for_Descriptive_Question_Generation_in_Goal-Oriented_Visual_ICCV_2021_paper.pdf", @@ -44000,7 +46972,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Matsumori_2021_ICCV,\n \n author = {\n Matsumori,\n Shoya and Shingyouchi,\n Kosuke and Abe,\n Yuki and Fukuchi,\n Yosuke and Sugiura,\n Komei and Imai,\n Michita\n},\n title = {\n Unified Questioner Transformer for Descriptive Question Generation in Goal-Oriented Visual Dialogue\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1898-1907\n} \n}" }, { "title": "Uniformity in Heterogeneity: Diving Deep Into Count Interval Partition for Crowd Counting", @@ -44008,6 +46981,7 @@ "status": "Poster", "track": "main", "pid": 6147, + "author_site": "Changan Wang; Qingyu Song; Boshen Zhang; Yabiao Wang; Ying Tai; Xuyi Hu; Chengjie Wang; Jilin Li; Jiayi Ma; Yang Wu", "author": "Changan Wang; Qingyu Song; Boshen Zhang; Yabiao Wang; Ying Tai; Xuyi Hu; Chengjie Wang; Jilin Li; Jiayi Ma; Yang Wu", "abstract": "Recently, 
the problem of inaccurate learning targets in crowd counting draws increasing attention. Inspired by a few pioneering work, we solve this problem by trying to predict the indices of pre-defined interval bins of counts instead of the count values themselves. However, an inappropriate interval setting might make the count error contributions from different intervals extremely imbalanced, leading to inferior counting performance. Therefore, we propose a novel count interval partition criterion called Uniform Error Partition (UEP), which always keeps the expected counting error contributions equal for all intervals to minimize the prediction risk. Then to mitigate the inevitably introduced discretization errors in the count quantization process, we propose another criterion called Mean Count Proxies (MCP). The MCP criterion selects the best count proxy for each interval to represent its count value during inference, making the overall expected discretization error of an image nearly negligible. As far as we are aware, this work is the first to delve into such a classification task and ends up with a promising solution for count interval partition. Following the above two theoretically demonstrated criterions, we propose a simple yet effective model termed Uniform Error Partition Network (UEPNet), which achieves state-of-the-art performance on several challenging datasets. 
The codes will be available at: https://github.com/TencentYoutuResearch/CrowdCounting-UEPNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Uniformity_in_Heterogeneity_Diving_Deep_Into_Count_Interval_Partition_for_ICCV_2021_paper.pdf", @@ -44031,7 +47005,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";London;Wuhan", "aff_country_unique_index": "0;0;0;0;0;0+1;0;0;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Changan and Song,\n Qingyu and Zhang,\n Boshen and Wang,\n Yabiao and Tai,\n Ying and Hu,\n Xuyi and Wang,\n Chengjie and Li,\n Jilin and Ma,\n Jiayi and Wu,\n Yang\n},\n title = {\n Uniformity in Heterogeneity: Diving Deep Into Count Interval Partition for Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3234-3242\n} \n}" }, { "title": "Unifying Nonlocal Blocks for Neural Networks", @@ -44039,6 +47014,7 @@ "status": "Poster", "track": "main", "pid": 5528, + "author_site": "Lei Zhu; Qi She; Duo Li; Yanye Lu; Xuejing Kang; Jie Hu; Changhu Wang", "author": "Lei Zhu; Qi She; Duo Li; Yanye Lu; Xuejing Kang; Jie Hu; Changhu Wang", "abstract": "The nonlocal-based blocks are designed for capturing long-range spatial-temporal dependencies in computer vision tasks. Although having shown excellent performance, they still lack the mechanism to encode the rich, structured information among elements in an image or video. In this paper, to theoretically analyze the property of these nonlocal-based blocks, we provide a new perspective to interpret them, where we view them as a set of graph filters generated on a fully-connected graph. 
Specifically, when choosing the Chebyshev graph filter, a unified formulation can be derived for explaining and analyzing the existing nonlocal-based blocks (e.g., nonlocal block, nonlocal stage, double attention block). Furthermore, by concerning the property of spectral, we propose an efficient and robust spectral nonlocal block, which can be more robust and flexible to catch long-range dependencies when inserted into deep neural networks than the existing nonlocal blocks. Experimental results demonstrate the clear-cut improvements and practical applicabilities of our method on image classification, action recognition, semantic segmentation, and person re-identification tasks. Code are available at https://github.com/zh460045050/SNL_ICCV2021.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Unifying_Nonlocal_Blocks_for_Neural_Networks_ICCV_2021_paper.pdf", @@ -44055,14 +47031,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhu_Unifying_Nonlocal_Blocks_for_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0+0+0;1;2;0+0+0;3;4;1", - "aff_unique_norm": "Peking University;ByteDance;Hong Kong University of Science and Technology;Beijing University of Posts and Telecommunications;University of Chinese Academy of Sciences", + "aff_unique_norm": "Peking University;Bytedance;Hong Kong University of Science and Technology;Beijing University of Posts and Telecommunications;University of Chinese Academy of Sciences", "aff_unique_dep": "Institute of Medical Technology;AI Lab;;;SKLCS", "aff_unique_url": "http://www.pku.edu.cn;https://www.bytedance.com;https://www.ust.hk;http://www.bupt.edu.cn/;http://www.ucas.ac.cn", - "aff_unique_abbr": "PKU;Bytedance AI Lab;HKUST;BUPT;UCAS", + "aff_unique_abbr": "PKU;Bytedance;HKUST;BUPT;UCAS", "aff_campus_unique_index": "0+2;3;0+2;4", "aff_campus_unique": "Health Science Center;;Shenzhen;Hong Kong SAR;Beijing", "aff_country_unique_index": "0+0+0;0;0;0+0+0;0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Lei and She,\n Qi and Li,\n Duo and Lu,\n Yanye and Kang,\n Xuejing and Hu,\n Jie and Wang,\n Changhu\n},\n title = {\n Unifying Nonlocal Blocks for Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12292-12301\n} \n}" }, { "title": "Universal Cross-Domain Retrieval: Generalizing Across Classes and Domains", @@ -44070,6 +47047,7 @@ "status": "Poster", "track": "main", "pid": 1784, + "author_site": "Soumava Paul; Titir Dutta; Soma Biswas", "author": "Soumava Paul; Titir Dutta; Soma Biswas", "abstract": "In this work, for the first time, we address the problem of universal cross-domain retrieval, where the test data can belong to classes or domains which are unseen during training. Due to dynamically increasing number of categories and practical constraint of training on every possible domain, which requires large amounts of data, generalizing to both unseen classes and domains is important. Towards that goal, we propose SnMpNet (Semantic Neighbourhood and Mixture Prediction Network), which incorporates two novel losses to account for the unseen classes and domains encountered during testing. Specifically, we introduce a novel Semantic Neighborhood loss to bridge the knowledge gap between seen and unseen classes and ensure that the latent space embedding of the unseen classes is semantically meaningful with respect to its neighboring classes. We also introduce a mix-up based supervision at image-level as well as semantic-level of the data for training with the Mixture Prediction loss, which helps in efficient retrieval when the query belongs to an unseen domain. These losses are incorporated on the SE-ResNet50 backbone to obtain SnMpNet. 
Extensive experiments on two large-scale datasets, Sketchy Extended and DomainNet, and thorough comparisons with state-of-the-art justify the effectiveness of the proposed model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Paul_Universal_Cross-Domain_Retrieval_Generalizing_Across_Classes_and_Domains_ICCV_2021_paper.pdf", @@ -44093,7 +47071,8 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Kharagpur;Bangalore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Paul_2021_ICCV,\n \n author = {\n Paul,\n Soumava and Dutta,\n Titir and Biswas,\n Soma\n},\n title = {\n Universal Cross-Domain Retrieval: Generalizing Across Classes and Domains\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12056-12064\n} \n}" }, { "title": "Universal Representation Learning From Multiple Domains for Few-Shot Classification", @@ -44101,6 +47080,7 @@ "status": "Poster", "track": "main", "pid": 6478, + "author_site": "Wei-Hong Li; Xialei Liu; Hakan Bilen", "author": "Wei-Hong Li; Xialei Liu; Hakan Bilen", "abstract": "In this paper, we look at the problem of few-shot image classification that aims to learn a classifier for previously unseen classes and domains from few labeled samples. Recent methods use various adaptation strategies for aligning their visual representations to new domains or select the relevant ones from multiple domain-specific feature extractors. In this work, we present URL, which learns a single set of universal visual representations by distilling knowledge of multiple domain-specific networks after co-aligning their features with the help of adapters and centered kernel alignment. 
We show that the universal representations can be further refined for previously unseen domains by an efficient adaptation step in a similar spirit to distance learning methods. We rigorously evaluate our model in the recent Meta-Dataset benchmark and demonstrate that it significantly outperforms the previous methods while being more efficient.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Universal_Representation_Learning_From_Multiple_Domains_for_Few-Shot_Classification_ICCV_2021_paper.pdf", @@ -44124,7 +47104,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Wei-Hong and Liu,\n Xialei and Bilen,\n Hakan\n},\n title = {\n Universal Representation Learning From Multiple Domains for Few-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9526-9535\n} \n}" }, { "title": "Universal and Flexible Optical Aberration Correction Using Deep-Prior Based Deconvolution", @@ -44132,6 +47113,7 @@ "status": "Poster", "track": "main", "pid": 7010, + "author_site": "Xiu Li; Jinli Suo; Weihang Zhang; Xin Yuan; Qionghai Dai", "author": "Xiu Li; Jinli Suo; Weihang Zhang; Xin Yuan; Qionghai Dai", "abstract": "High quality imaging usually requires bulky and expensive lenses to compensate geometric and chromatic aberrations. This poses high constraints on the optical hash or low cost applications. Although one can utilize algorithmic reconstruction to remove the artifacts of low-end lenses, the degeneration from optical aberrations is spatially varying and the computation has to trade off efficiency for performance. 
For example, we need to conduct patch-wise optimization or train a large set of local deep neural networks to achieve high reconstruction performance across the whole image. In this paper, we propose a PSF aware plug-and-play deep network, which takes the aberrant image and PSF map as input and produces the latent high quality version via incorporating lens-specific deep priors, thus leading to a universal and flexible optical aberration correction method. Specifically, we pre-train a base model from a set of diverse lenses and then adapt it to a given lens by quickly refining the parameters, which largely alleviates the time and memory consumption of model learning. The approach is of high efficiency in both training and testing stages. Extensive results verify the promising applications of our proposed approach for compact low-end cameras.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Universal_and_Flexible_Optical_Aberration_Correction_Using_Deep-Prior_Based_Deconvolution_ICCV_2021_paper.pdf", @@ -44155,7 +47137,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Xiu and Suo,\n Jinli and Zhang,\n Weihang and Yuan,\n Xin and Dai,\n Qionghai\n},\n title = {\n Universal and Flexible Optical Aberration Correction Using Deep-Prior Based Deconvolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2613-2621\n} \n}" }, { "title": "Universal-Prototype Enhancing for Few-Shot Object Detection", @@ -44163,10 +47146,11 @@ "status": "Poster", "track": "main", "pid": 11284, + "author_site": "Aming Wu; Yahong Han; Linchao Zhu; Yi Yang", "author": "Aming Wu; Yahong Han; Linchao Zhu; Yi Yang", "abstract": "Few-shot object detection (FSOD) aims to strengthen the 
performance of novel object detection with few labeled samples. To alleviate the constraint of few samples, enhancing the generalization ability of learned features for novel objects plays a key role. Thus, the feature learning process of FSOD should focus more on intrinsical object characteristics, which are invariant under different visual changes and therefore are helpful for feature generalization. Unlike previous attempts of the meta-learning paradigm, in this paper, we explore how to enhance object features with intrinsical characteristics that are universal across different object categories. We propose a new prototype, namely universal prototype, that is learned from all object categories. Besides the advantage of characterizing invariant characteristics, the universal prototypes alleviate the impact of unbalanced object categories. After enhancing object features with the universal prototypes, we impose a consistency loss to maximize the agreement between the enhanced features and the original ones, which is beneficial for learning invariant object characteristics. Thus, we develop a new framework of few-shot object detection with universal prototypes ( FSOD ^ up ) that owns the merit of feature generalization towards novel objects. Experimental results on PASCAL VOC and MS COCO show the effectiveness of FSOD ^ up . 
Particularly, for the 1-shot case of VOC Split2, FSOD ^ up outperforms the baseline by 6.8% in terms of mAP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Universal-Prototype_Enhancing_for_Few-Shot_Object_Detection_ICCV_2021_paper.pdf", - "aff": "School of Electronic Engineering, Xidian University, Xi\u2019an, China; College of Intelligence and Computing, Tianjin University, Tianjin, China+Tianjin Key Lab of Machine Learning, Tianjin University, Tianjin, China+Peng Cheng Laboratory, Shenzhen, China; ReLER Lab, AAII, University of Technology Sydney; ReLER Lab, AAII, University of Technology Sydney", + "aff": "School of Electronic Engineering, Xidian University, Xi’an, China; College of Intelligence and Computing, Tianjin University, Tianjin, China+Tianjin Key Lab of Machine Learning, Tianjin University, Tianjin, China+Peng Cheng Laboratory, Shenzhen, China; ReLER Lab, AAII, University of Technology Sydney; ReLER Lab, AAII, University of Technology Sydney", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Wu_Universal-Prototype_Enhancing_for_ICCV_2021_supplemental.pdf", @@ -44179,14 +47163,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Universal-Prototype_Enhancing_for_Few-Shot_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;1+1+2;3;3", - "aff_unique_norm": "Xidian University;Tianjin University;Pengcheng Laboratory;University of Technology Sydney", - "aff_unique_dep": "School of Electronic Engineering;College of Intelligence and Computing;Peng Cheng Laboratory;ReLER Lab, AAII", + "aff_unique_norm": "Xidian University;Tianjin University;Peng Cheng Laboratory;University of Technology Sydney", + "aff_unique_dep": "School of Electronic Engineering;College of Intelligence and Computing;;ReLER Lab, AAII", "aff_unique_url": "http://www.xidian.edu.cn;http://www.tju.edu.cn;;https://www.uts.edu.au", "aff_unique_abbr": "Xidian;Tianjin University;;UTS", - 
"aff_campus_unique_index": "0;1+1+2", - "aff_campus_unique": "Xi'an;Tianjin;Shenzhen;", + "aff_campus_unique_index": "0;1+1+2;3;3", + "aff_campus_unique": "Xi'an;Tianjin;Shenzhen;Sydney", "aff_country_unique_index": "0;0+0+0;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Aming and Han,\n Yahong and Zhu,\n Linchao and Yang,\n Yi\n},\n title = {\n Universal-Prototype Enhancing for Few-Shot Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9567-9576\n} \n}" }, { "title": "Unlimited Neighborhood Interaction for Heterogeneous Trajectory Prediction", @@ -44194,10 +47179,11 @@ "status": "Poster", "track": "main", "pid": 6216, + "author_site": "Fang Zheng; Le Wang; Sanping Zhou; Wei Tang; Zhenxing Niu; Nanning Zheng; Gang Hua", "author": "Fang Zheng; Le Wang; Sanping Zhou; Wei Tang; Zhenxing Niu; Nanning Zheng; Gang Hua", "abstract": "Understanding complex social interactions among agents is a key challenge for trajectory prediction. Most existing methods consider the interactions between pairwise traffic agents or in a local area, while the nature of interactions is unlimited, involving an uncertain number of agents and non-local areas simultaneously. Besides, they treat heterogeneous traffic agents the same, namely those among agents of different categories, while neglecting people's diverse reaction patterns toward traffic agents in different categories. To address these problems, we propose a simple yet effective Unlimited Neighborhood Interaction Network (UNIN), which predicts trajectories of heterogeneous agents in multiple categories. 
Specifically, the proposed unlimited neighborhood interaction module generates the fused-features of all agents involved in an interaction simultaneously, which is adaptive to any number of agents and any range of interaction area. Meanwhile, a hierarchical graph attention module is proposed to obtain category-to-category interaction and agent-to-agent interaction. Finally, parameters of a Gaussian Mixture Model are estimated for generating the future trajectories. Extensive experimental results on benchmark datasets demonstrate a significant performance improvement of our method over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Unlimited_Neighborhood_Interaction_for_Heterogeneous_Trajectory_Prediction_ICCV_2021_paper.pdf", - "aff": "School of Software Engineering, Xi\u2019an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; University of Illinois at Chicago; School of Computer Science and Technology, Xidian University; Wormpex AI Research; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Wormpex AI Research", + "aff": "School of Software Engineering, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; University of Illinois at Chicago; School of Computer Science and Technology, Xidian University; Wormpex AI Research; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Wormpex AI Research", "project": "", "github": "", "supp": "", @@ -44210,14 +47196,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Unlimited_Neighborhood_Interaction_for_Heterogeneous_Trajectory_Prediction_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;2;3;0;3", - "aff_unique_norm": "Xi'an Jiao Tong University;University of Illinois at Chicago;Xidian University;Wormpex AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;University 
of Illinois at Chicago;Xidian University;Wormpex AI Research", "aff_unique_dep": "School of Software Engineering;;School of Computer Science and Technology;AI Research", - "aff_unique_url": "http://www.xjtu.edu.cn;https://www.uic.edu;http://www.xidian.edu.cn/;", + "aff_unique_url": "http://www.xjtu.edu.cn;https://www.uic.edu;http://www.xidian.edu.cn;", "aff_unique_abbr": "XJTU;UIC;Xidian;Wormpex AI", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Xi'an;Chicago;", "aff_country_unique_index": "0;0;1;0;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Fang and Wang,\n Le and Zhou,\n Sanping and Tang,\n Wei and Niu,\n Zhenxing and Zheng,\n Nanning and Hua,\n Gang\n},\n title = {\n Unlimited Neighborhood Interaction for Heterogeneous Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13168-13177\n} \n}" }, { "title": "Unlocking the Potential of Ordinary Classifier: Class-Specific Adversarial Erasing Framework for Weakly Supervised Semantic Segmentation", @@ -44225,6 +47212,7 @@ "status": "Poster", "track": "main", "pid": 5568, + "author_site": "Hyeokjun Kweon; Sung-Hoon Yoon; Hyeonseong Kim; Daehee Park; Kuk-Jin Yoon", "author": "Hyeokjun Kweon; Sung-Hoon Yoon; Hyeonseong Kim; Daehee Park; Kuk-Jin Yoon", "abstract": "Weakly supervised semantic segmentation (WSSS) using image-level classification labels usually utilizes the Class Activation Maps (CAMs) to localize objects of interest in images. While pointing out that CAMs only highlight the most discriminative regions of the classes of interest, adversarial erasing (AE) methods have been proposed to further explore the less discriminative regions. In this paper, we review the potential of the pre-trained classifier which is trained on the raw images. 
We experimentally verify that the ordinary classifier already has the capability to activate the less discriminative regions if the most discriminative regions are erased to some extent. Based on that, we propose a class-specific AE-based framework that fully exploits the potential of an ordinary classifier. Our framework (1) adopts the ordinary classifier to notify the regions to be erased and (2) generates a class-specific mask for erasing by randomly sampling a single specific class to be erased (target class) among the existing classes on the image for obtaining more precise CAMs. Specifically, with the guidance of the ordinary classifier, the proposed CAMs Generation Network (CGNet) is enforced to generate a CAM of the target class while constraining the CAM not to intrude the object regions of the other classes. Along with the pseudo-labels refined from our CAMs, we achieve the state-of-the-art WSSS performance on both PASCAL VOC 2012 and MS-COCO dataset only with image-level supervision. 
The code is available at https://github.com/KAIST-vilab/OC-CSE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kweon_Unlocking_the_Potential_of_Ordinary_Classifier_Class-Specific_Adversarial_Erasing_Framework_ICCV_2021_paper.pdf", @@ -44248,7 +47236,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kweon_2021_ICCV,\n \n author = {\n Kweon,\n Hyeokjun and Yoon,\n Sung-Hoon and Kim,\n Hyeonseong and Park,\n Daehee and Yoon,\n Kuk-Jin\n},\n title = {\n Unlocking the Potential of Ordinary Classifier: Class-Specific Adversarial Erasing Framework for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6994-7003\n} \n}" }, { "title": "Unpaired Learning for Deep Image Deraining With Rain Direction Regularizer", @@ -44256,6 +47245,7 @@ "status": "Poster", "track": "main", "pid": 8909, + "author_site": "Yang Liu; Ziyu Yue; Jinshan Pan; Zhixun Su", "author": "Yang Liu; Ziyu Yue; Jinshan Pan; Zhixun Su", "abstract": "We present a simple yet effective unpaired learning based image rain removal method from an unpaired set of synthetic images and real rainy images by exploring the properties of rain maps. The proposed algorithm mainly consists of a semi-supervised learning part and a knowledge distillation part. The semi-supervised part estimates the rain map and reconstructs the derained image based on the well-established layer separation principle. To facilitate rain removal, we develop a rain direction regularizer to constrain the rain estimation network in the semi-supervised learning part. 
With the estimated rain maps from the semi-supervised learning part, we first synthesize a new paired set by adding to rain-free images based on the superimposition model. The real rainy images and the derained results constitute another paired set. Then we develop an effective knowledge distillation method to explore such two paired sets so that the deraining model in the semi-supervised learning part is distilled. We propose two new rainy datasets, named RainDirection and Real3000, to validate the effectiveness of the proposed method. Both quantitative and qualitative experimental results demonstrate that the proposed method achieves favorable results against state-of-the-art methods in benchmark datasets and real-world images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Unpaired_Learning_for_Deep_Image_Deraining_With_Rain_Direction_Regularizer_ICCV_2021_paper.pdf", @@ -44270,7 +47260,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Unpaired_Learning_for_Deep_Image_Deraining_With_Rain_Direction_Regularizer_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Unpaired_Learning_for_Deep_Image_Deraining_With_Rain_Direction_Regularizer_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Yang and Yue,\n Ziyu and Pan,\n Jinshan and Su,\n Zhixun\n},\n title = {\n Unpaired Learning for Deep Image Deraining With Rain Direction Regularizer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4753-4761\n} \n}" }, { "title": "Unpaired Learning for High Dynamic Range Image Tone Mapping", @@ -44278,6 +47269,7 @@ "status": "Poster", "track": "main", "pid": 3926, + "author_site": "Yael Vinker; Inbar Huberman-Spiegelglas; Raanan Fattal", "author": "Yael Vinker; Inbar Huberman-Spiegelglas; Raanan 
Fattal", "abstract": "High dynamic range (HDR) photography is becoming increasingly popular and available by DSLR and mobile-phone cameras. While deep neural networks (DNN) have greatly impacted other domains of image manipulation, their use for HDR tone-mapping is limited due to the lack of a definite notion of ground-truth solution, which is needed for producing training data. In this paper we describe a new tone-mapping approach guided by the distinct goal of producing low dynamic range (LDR) renditions that best reproduce the visual characteristics of native LDR images. This goal enables the use of an unpaired adversarial training based on unrelated sets of HDR and LDR images, both of which are widely available and easy to acquire. In order to achieve an effective training under this minimal requirements, we introduce the following new steps and components: (i) a range-normalizing pre-process which estimates and applies a different level of curve-based compression, (ii) a loss that preserves the input content while allowing the network to achieve its goal, and (iii) the use of a more concise discriminator network, designed to promote the reproduction of low-level attributes native LDR possess. 
Evaluation of the resulting network demonstrates its ability to produce photo-realistic artifact-free tone-mapped images, and state-of-the-art performance on different image fidelity indices and visual distances.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Vinker_Unpaired_Learning_for_High_Dynamic_Range_Image_Tone_Mapping_ICCV_2021_paper.pdf", @@ -44294,14 +47286,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Vinker_Unpaired_Learning_for_High_Dynamic_Range_Image_Tone_Mapping_ICCV_2021_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Hebrew University of Jerusalem", + "aff_unique_norm": "The Hebrew University of Jerusalem", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "http://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Vinker_2021_ICCV,\n \n author = {\n Vinker,\n Yael and Huberman-Spiegelglas,\n Inbar and Fattal,\n Raanan\n},\n title = {\n Unpaired Learning for High Dynamic Range Image Tone Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14657-14666\n} \n}" }, { "title": "Unshuffling Data for Improved Generalization in Visual Question Answering", @@ -44309,6 +47302,7 @@ "status": "Poster", "track": "main", "pid": 1225, + "author_site": "Damien Teney; Ehsan Abbasnejad; Anton van den Hengel", "author": "Damien Teney; Ehsan Abbasnejad; Anton van den Hengel", "abstract": "Generalization beyond the training distribution is a core challenge in machine learning. The common practice of mixing and shuffling examples when training neural networks may not be optimal in this regard. We show that partitioning the data into well-chosen, non-i.i.d. 
subsets treated as multiple training environments can guide the learning of models with better out-of-distribution generalization. We describe a training procedure to capture the patterns that are stable across environments while discarding spurious ones. The method makes a step beyond correlation-based learning: the choice of the partitioning allows injecting information about the task that cannot be otherwise recovered from the joint distribution of the training data. We demonstrate multiple use cases with the task of visual question answering, which is notorious for dataset biases. We obtain significant improvements on VQA-CP, using environments built from prior knowledge, existing meta data, or unsupervised clustering. We also get improvements on GQA using annotations of \"equivalent questions\", and on multi-dataset training (VQA v2 / Visual Genome) by treating them as distinct environments.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Teney_Unshuffling_Data_for_Improved_Generalization_in_Visual_Question_Answering_ICCV_2021_paper.pdf", @@ -44332,7 +47326,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Teney_2021_ICCV,\n \n author = {\n Teney,\n Damien and Abbasnejad,\n Ehsan and van den Hengel,\n Anton\n},\n title = {\n Unshuffling Data for Improved Generalization in Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1417-1427\n} \n}" }, { "title": "Unsupervised 3D Pose Estimation for Hierarchical Dance Video Recognition", @@ -44340,6 +47335,7 @@ "status": "Poster", "track": "main", "pid": 10918, + "author_site": "Xiaodan Hu; Narendra Ahuja", "author": "Xiaodan Hu; Narendra Ahuja", "abstract": "Dance experts often view dance as a hierarchy of 
information, spanning low-level (raw images, image sequences), mid-levels (human poses and bodypart movements), and high-level (dance genre). We propose a Hierarchical Dance Video Recognition framework (HDVR). HDVR estimates 2D pose sequences, tracks dancers, and then simultaneously estimates corresponding 3D poses and 3D-to-2D imaging parameters, without requiring ground truth for 3D poses. Unlike most methods that work on a single person, our tracking works on multiple dancers, under occlusions. From the estimated 3D pose sequence, HDVR extracts body part movements, and therefrom dance genre. The resulting hierarchical dance representation is explainable to experts. To overcome noise and interframe correspondence ambiguities, we enforce spatial and temporal motion smoothness and photometric continuity over time. We use an LSTM network to extract 3D movement subsequences from which we recognize dance genre. For experiments, we have identified 154 movement types, of 16 body parts, and assembled a new University of Illinois Dance (UID) Dataset, containing 1143 video clips of 9 genres covering 30 hours, annotated with movement and genre labels. 
Our experimental results demonstrate that our algorithms outperform the state-of-the-art 3D pose estimation methods, which also enhances our dance recognition performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_Unsupervised_3D_Pose_Estimation_for_Hierarchical_Dance_Video_Recognition_ICCV_2021_paper.pdf", @@ -44356,14 +47352,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_Unsupervised_3D_Pose_Estimation_for_Hierarchical_Dance_Video_Recognition_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Xiaodan and Ahuja,\n Narendra\n},\n title = {\n Unsupervised 3D Pose Estimation for Hierarchical Dance Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11015-11024\n} \n}" }, { "title": "Unsupervised Curriculum Domain Adaptation for No-Reference Video Quality Assessment", @@ -44371,6 +47368,7 @@ "status": "Poster", "track": "main", "pid": 7252, + "author_site": "Pengfei Chen; Leida Li; Jinjian Wu; Weisheng Dong; Guangming Shi", "author": "Pengfei Chen; Leida Li; Jinjian Wu; Weisheng Dong; Guangming Shi", "abstract": "During the last years, convolutional neural networks (CNNs) have triumphed over video quality assessment (VQA) tasks. 
However, CNN-based approaches heavily rely on annotated data which are typically not available in VQA, leading to the difficulty of model generalization. Recent advances in domain adaptation technique makes it possible to adapt models trained on source data to unlabeled target data. However, due to the distortion diversity and content variation of the collected videos, the intrinsic subjectivity of VQA tasks hampers the adaptation performance. In this work, we propose a curriculum-style unsupervised domain adaptation to handle the cross-domain no-reference VQA problem. The proposed approach could be divided into two stages. In the first stage, we conduct an adaptation between source and target domains to predict the rating distribution for target samples, which can better reveal the subjective nature of VQA. From this adaptation, we split the data in target domain into confident and uncertain subdomains using the proposed uncertainty-based ranking function, through measuring their prediction confidences. In the second stage, by regarding samples in confident subdomain as the easy tasks in the curriculum, a fine-level adaptation is conducted between two subdomains to fine-tune the prediction model. Extensive experimental results on benchmark datasets highlight the superiority of the proposed method over the competing methods in both accuracy and speed. 
The source code is released at https://github.com/cpf0079/UCDA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Unsupervised_Curriculum_Domain_Adaptation_for_No-Reference_Video_Quality_Assessment_ICCV_2021_paper.pdf", @@ -44394,7 +47392,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Pengfei and Li,\n Leida and Wu,\n Jinjian and Dong,\n Weisheng and Shi,\n Guangming\n},\n title = {\n Unsupervised Curriculum Domain Adaptation for No-Reference Video Quality Assessment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5178-5187\n} \n}" }, { "title": "Unsupervised Deep Video Denoising", @@ -44402,6 +47401,7 @@ "status": "Poster", "track": "main", "pid": 10671, + "author_site": "Dev Yashpal Sheth; Sreyas Mohan; Joshua L. Vincent; Ramon Manzorro; Peter A. Crozier; Mitesh M. Khapra; Eero P. Simoncelli; Carlos Fernandez-Granda", "author": "Dev Yashpal Sheth; Sreyas Mohan; Joshua L. Vincent; Ramon Manzorro; Peter A. Crozier; Mitesh M. Khapra; Eero P. Simoncelli; Carlos Fernandez-Granda", "abstract": "Deep convolutional neural networks (CNNs) for video denoising are typically trained with supervision, assuming the availability of clean videos. However, in many applications, such as microscopy, noiseless videos are not available. To address this, we propose an Unsupervised Deep Video Denoiser (UDVD), a CNN architecture designed to be trained exclusively with noisy data. The performance of UDVD is comparable to the supervised state-of-the-art, even when trained only on a single short noisy video. We demonstrate the promise of our approach in real-world imaging applications by denoising raw video, fluorescence-microscopy and electron-microscopy data. 
In contrast to many current approaches to video denoising, UDVD does not require explicit motion compensation. This is advantageous because motion compensation is computationally expensive, and can be unreliable when the input data are noisy. A gradient-based analysis reveals that UDVD automatically tracks the motion of objects in the input noisy videos. Thus, the network learns to perform implicit motion compensation, even though it is only trained for denoising.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sheth_Unsupervised_Deep_Video_Denoising_ICCV_2021_paper.pdf", @@ -44416,7 +47416,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sheth_Unsupervised_Deep_Video_Denoising_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sheth_Unsupervised_Deep_Video_Denoising_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sheth_2021_ICCV,\n \n author = {\n Sheth,\n Dev Yashpal and Mohan,\n Sreyas and Vincent,\n Joshua L. and Manzorro,\n Ramon and Crozier,\n Peter A. and Khapra,\n Mitesh M. and Simoncelli,\n Eero P. and Fernandez-Granda,\n Carlos\n},\n title = {\n Unsupervised Deep Video Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1759-1768\n} \n}" }, { "title": "Unsupervised Dense Deformation Embedding Network for Template-Free Shape Correspondence", @@ -44424,6 +47425,7 @@ "status": "Poster", "track": "main", "pid": 2416, + "author_site": "Ronghan Chen; Yang Cong; Jiahua Dong", "author": "Ronghan Chen; Yang Cong; Jiahua Dong", "abstract": "Shape correspondence from 3D deformation learning has attracted appealing academy interests recently. 
Nevertheless, current deep learning based methods require the supervision of dense annotations to learn per-point translations, which severely over-parameterize the deformation process. Moreover, they fail to capture local geometric details of original shape via global feature embedding. To address these challenges, we develop a new Unsupervised Dense Deformation Embedding Network (i.e., UD2E-Net), which learns to predict deformations between non-rigid shapes from dense local features. Since it is non-trivial to match deformation-variant local features for deformation prediction, we develop an Extrinsic-Intrinsic Autoencoder to first encode extrinsic geometric features from source into intrinsic coordinates in a shared canonical shape, with which the decoder then synthesizes corresponding target features. Moreover, a bounded maximum mean discrepancy loss is developed to mitigate the distribution divergence between the synthesized and original features. To learn natural deformation without dense supervision, we introduce a coarse parameterized deformation graph, for which a novel trace and propagation algorithm is proposed to improve both the quality and efficiency of the deformation. 
Our UD2E-Net outperforms state-of-the-art unsupervised methods by 24% on Faust Inter challenge and even supervised methods by 13% on Faust Intra challenge.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Unsupervised_Dense_Deformation_Embedding_Network_for_Template-Free_Shape_Correspondence_ICCV_2021_paper.pdf", @@ -44447,7 +47449,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Ronghan and Cong,\n Yang and Dong,\n Jiahua\n},\n title = {\n Unsupervised Dense Deformation Embedding Network for Template-Free Shape Correspondence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8361-8370\n} \n}" }, { "title": "Unsupervised Depth Completion With Calibrated Backprojection Layers", @@ -44455,6 +47458,7 @@ "status": "Poster", "track": "main", "pid": 6271, + "author_site": "Alex Wong; Stefano Soatto", "author": "Alex Wong; Stefano Soatto", "abstract": "We propose a deep neural network architecture to infer dense depth from an image and a sparse point cloud. It is trained using a video stream and corresponding synchronized sparse point cloud, as obtained from a LIDAR or other range sensor, along with the intrinsic calibration parameters of the camera. At inference time, the calibration of the camera, which can be different than the one used for training, is fed as an input to the network along with the sparse point cloud and a single image. A Calibrated Backprojection Layer backprojects each pixel in the image to three-dimensional space using the calibration matrix and a depth feature descriptor. 
The resulting 3D positional encoding is concatenated with the image descriptor and the previous layer output to yield the input to the next layer of the encoder. A decoder, exploiting skip-connections, produces a dense depth map. The resulting Calibrated Backprojection Network, or KBNet, is trained without supervision by minimizing the photometric reprojection error. KBNet imputes missing depth value based on the training set, rather than on generic regularization. We test KBNet on public depth completion benchmarks, where it outperforms the state of the art by 30% indoor and 8% outdoor when the same camera is used for training and testing. When the test camera is different, the improvement reaches 62%.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wong_Unsupervised_Depth_Completion_With_Calibrated_Backprojection_Layers_ICCV_2021_paper.pdf", @@ -44478,7 +47482,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wong_2021_ICCV,\n \n author = {\n Wong,\n Alex and Soatto,\n Stefano\n},\n title = {\n Unsupervised Depth Completion With Calibrated Backprojection Layers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12747-12756\n} \n}" }, { "title": "Unsupervised Domain Adaptive 3D Detection With Multi-Level Consistency", @@ -44486,6 +47491,7 @@ "status": "Poster", "track": "main", "pid": 2035, + "author_site": "Zhipeng Luo; Zhongang Cai; Changqing Zhou; Gongjie Zhang; Haiyu Zhao; Shuai Yi; Shijian Lu; Hongsheng Li; Shanghang Zhang; Ziwei Liu", "author": "Zhipeng Luo; Zhongang Cai; Changqing Zhou; Gongjie Zhang; Haiyu Zhao; Shuai Yi; Shijian Lu; Hongsheng Li; Shanghang Zhang; Ziwei Liu", "abstract": "Deep learning-based 3D object detection has achieved unprecedented 
success with the advent of large-scale autonomous driving datasets. However, drastic performance degradation remains a critical challenge for cross-domain deployment. In addition, existing 3D domain adaptive detection methods often assume prior access to the target domain annotations, which is rarely feasible in the real world. To address this challenge, we study a more realistic setting, unsupervised 3D domain adaptive detection, which only utilizes source domain annotations. 1) We first comprehensively investigate the major underlying factors of the domain gap in 3D detection. Our key insight is that geometric mismatch is the key factor of domain shift. 2) Then, we propose a novel and unified framework, Multi-Level Consistency Network (MLC-Net), which employs a teacher-student paradigm to generate adaptive and reliable pseudo-targets. MLC-Net exploits point-, instance- and neural statistics-level consistency to facilitate cross-domain transfer. Extensive experiments demonstrate that MLC-Net outperforms existing state-of-the-art methods (including those using additional target domain information) on standard benchmarks. Notably, our approach is detector-agnostic, which achieves consistent gains on both single- and two-stage 3D detectors. 
Code will be released.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Luo_Unsupervised_Domain_Adaptive_3D_Detection_With_Multi-Level_Consistency_ICCV_2021_paper.pdf", @@ -44500,7 +47506,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Luo_Unsupervised_Domain_Adaptive_3D_Detection_With_Multi-Level_Consistency_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Luo_Unsupervised_Domain_Adaptive_3D_Detection_With_Multi-Level_Consistency_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Luo_2021_ICCV,\n \n author = {\n Luo,\n Zhipeng and Cai,\n Zhongang and Zhou,\n Changqing and Zhang,\n Gongjie and Zhao,\n Haiyu and Yi,\n Shuai and Lu,\n Shijian and Li,\n Hongsheng and Zhang,\n Shanghang and Liu,\n Ziwei\n},\n title = {\n Unsupervised Domain Adaptive 3D Detection With Multi-Level Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8866-8875\n} \n}" }, { "title": "Unsupervised Few-Shot Action Recognition via Action-Appearance Aligned Meta-Adaptation", @@ -44508,10 +47515,11 @@ "status": "Poster", "track": "main", "pid": 6622, + "author_site": "Jay Patravali; Gaurav Mittal; Ye Yu; Fuxin Li; Mei Chen", "author": "Jay Patravali; Gaurav Mittal; Ye Yu; Fuxin Li; Mei Chen", "abstract": "We present MetaUVFS as the first Unsupervised Meta-learning algorithm for Video Few-Shot action recognition. MetaUVFS leverages over 550K unlabeled videos to train a two-stream 2D and 3D CNN architecture via contrastive learning to capture the appearance-specific spatial and action-specific spatio-temporal video features respectively. 
MetaUVFS comprises a novel Action-Appearance Aligned Meta-adaptation (A3M) module that learns to focus on the action-oriented video features in relation to the appearance features via explicit few-shot episodic meta-learning over unsupervised hard-mined episodes. Our action-appearance alignment and explicit few-shot learner conditions the unsupervised training to mimic the downstream few-shot task, enabling MetaUVFS to significantly outperform all unsupervised methods on few-shot benchmarks. Moreover, unlike previous few-shot action recognition methods that are supervised, MetaUVFS needs neither base-class labels nor a supervised pretrained backbone. Thus, we need to train MetaUVFS just once to perform competitively or sometimes even outperform state-of-the-art supervised methods on popular HMDB51, UCF101, and Kinetics100 few-shot datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Patravali_Unsupervised_Few-Shot_Action_Recognition_via_Action-Appearance_Aligned_Meta-Adaptation_ICCV_2021_paper.pdf", - "aff": "Microsoft\u2021Oregon State University; Microsoft\u2021Oregon State University; Microsoft; Oregon State University; Microsoft", + "aff": "Microsoft‡Oregon State University; Microsoft‡Oregon State University; Microsoft; Oregon State University; Microsoft", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Patravali_Unsupervised_Few-Shot_Action_ICCV_2021_supplemental.pdf", @@ -44523,15 +47531,16 @@ "email": "oregonstate.edu;microsoft.com;microsoft.com;oregonstate.edu;microsoft.com", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Patravali_Unsupervised_Few-Shot_Action_Recognition_via_Action-Appearance_Aligned_Meta-Adaptation_ICCV_2021_paper.html", - "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Microsoft;Oregon State University", - "aff_unique_dep": "Microsoft;", - "aff_unique_url": "https://www.microsoft.com;https://oregonstate.edu", - "aff_unique_abbr": 
"Microsoft;OSU", + "aff_unique_index": "0;0;1;2;1", + "aff_unique_norm": "Microsoft;Microsoft Corporation;Oregon State University", + "aff_unique_dep": ";;", + "aff_unique_url": "https://www.microsoft.com;https://www.microsoft.com;https://oregonstate.edu", + "aff_unique_abbr": "Microsoft;Microsoft;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Patravali_2021_ICCV,\n \n author = {\n Patravali,\n Jay and Mittal,\n Gaurav and Yu,\n Ye and Li,\n Fuxin and Chen,\n Mei\n},\n title = {\n Unsupervised Few-Shot Action Recognition via Action-Appearance Aligned Meta-Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8484-8494\n} \n}" }, { "title": "Unsupervised Image Generation With Infinite Generative Adversarial Networks", @@ -44539,6 +47548,7 @@ "status": "Poster", "track": "main", "pid": 11132, + "author_site": "Hui Ying; He Wang; Tianjia Shao; Yin Yang; Kun Zhou", "author": "Hui Ying; He Wang; Tianjia Shao; Yin Yang; Kun Zhou", "abstract": "Image generation has been heavily investigated in computer vision, where one core research challenge is to generate images from arbitrarily complex distributions with little supervision. Generative Adversarial Networks (GANs) as an implicit approach have achieved great successes in this direction and therefore been employed widely. However, GANs are known to suffer from issues such as mode collapse, non-structured latent space, being unable to compute likelihoods, etc. In this paper, we propose a new unsupervised non-parametric method named mixture of infinite conditional GANs or MIC-GANs, to tackle several GAN issues together, aiming for image generation with parsimonious prior knowledge. 
Through comprehensive evaluations across different datasets, we show that MIC-GANs are effective in structuring the latent space and avoiding mode collapse, and outperform state-of-the-art methods. MICGANs are adaptive, versatile, and robust. They offer a promising solution to several well-known GAN issues. Code available: github.com/yinghdb/MICGANs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ying_Unsupervised_Image_Generation_With_Infinite_Generative_Adversarial_Networks_ICCV_2021_paper.pdf", @@ -44562,7 +47572,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0", - "aff_country_unique": "China;United Kingdom;United States" + "aff_country_unique": "China;United Kingdom;United States", + "bibtex": "@InProceedings{Ying_2021_ICCV,\n \n author = {\n Ying,\n Hui and Wang,\n He and Shao,\n Tianjia and Yang,\n Yin and Zhou,\n Kun\n},\n title = {\n Unsupervised Image Generation With Infinite Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14284-14293\n} \n}" }, { "title": "Unsupervised Layered Image Decomposition Into Object Prototypes", @@ -44570,6 +47581,7 @@ "status": "Poster", "track": "main", "pid": 4323, + "author_site": "Tom Monnier; Elliot Vincent; Jean Ponce; Mathieu Aubry", "author": "Tom Monnier; Elliot Vincent; Jean Ponce; Mathieu Aubry", "abstract": "We present an unsupervised learning framework for decomposing images into layers of automatically discovered object models. Contrary to recent approaches that model image layers with autoencoder networks, we represent them as explicit transformations of a small set of prototypical images. 
Our model has three main components: (i) a set of object prototypes in the form of learnable images with a transparency channel, which we refer to as sprites; (ii) differentiable parametric functions predicting occlusions and transformation parameters necessary to instantiate the sprites in a given image; (iii) a layered image formation model with occlusion for compositing these instances into complete images including background. By jointly learning the sprites and occlusion/transformation predictors to reconstruct images, our approach not only yields accurate layered image decompositions, but also identifies object categories and instance parameters. We first validate our approach by providing results on par with the state of the art on standard multi-object synthetic benchmarks (Tetrominoes, Multi-dSprites, CLEVR6). We then demonstrate the applicability of our model to real images in tasks that include clustering (SVHN, GTSRB), cosegmentation (Weizmann Horse) and object discovery from unfiltered social network images. 
To the best of our knowledge, our approach is the first layered image decomposition algorithm that learns an explicit and shared concept of object type, and is robust enough to be applied to real images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Monnier_Unsupervised_Layered_Image_Decomposition_Into_Object_Prototypes_ICCV_2021_paper.pdf", @@ -44584,7 +47596,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Monnier_Unsupervised_Layered_Image_Decomposition_Into_Object_Prototypes_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Monnier_Unsupervised_Layered_Image_Decomposition_Into_Object_Prototypes_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Monnier_2021_ICCV,\n \n author = {\n Monnier,\n Tom and Vincent,\n Elliot and Ponce,\n Jean and Aubry,\n Mathieu\n},\n title = {\n Unsupervised Layered Image Decomposition Into Object Prototypes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8640-8650\n} \n}" }, { "title": "Unsupervised Learning of Fine Structure Generation for 3D Point Clouds by 2D Projections Matching", @@ -44592,6 +47605,7 @@ "status": "Poster", "track": "main", "pid": 1173, + "author_site": "Chao Chen; Zhizhong Han; Yu-Shen Liu; Matthias Zwicker", "author": "Chao Chen; Zhizhong Han; Yu-Shen Liu; Matthias Zwicker", "abstract": "Learning to generate 3D point clouds without 3D supervision is an important but challenging problem. Current solutions leverage various differentiable renderers to project the generated 3D point clouds onto a 2D image plane, and train deep neural networks using the per-pixel difference with 2D ground truth images. However, these solutions are still struggling to fully recover fine structures of 3D shapes, such as thin tubes or planes. 
To resolve this issue, we propose an unsupervised approach for 3D point cloud generation with fine structures. Specifically, we cast 3D point cloud learning as a 2D projection matching problem. Rather than using entire 2D silhouette images as a regular pixel supervision, we introduce structure adaptive sampling to randomly sample 2D points within the silhouettes as an irregular point supervision, which alleviates the consistency issue of sampling from different view angles. Our method pushes the neural network to generate a 3D point cloud whose 2D projections match the irregular point supervision from different view angles. Our 2D projection matching approach enables the neural network to learn more accurate structure information than using the per-pixel difference, especially for fine and thin 3D structures. Our method can recover fine 3D structures from 2D silhouette images at different resolutions, and is robust to different sampling methods and point number in irregular point supervision. Our method outperforms others under widely used benchmarks. 
Our code, data and models are available at https://github.com/chenchao15/2D_projection_matching.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Unsupervised_Learning_of_Fine_Structure_Generation_for_3D_Point_Clouds_ICCV_2021_paper.pdf", @@ -44606,7 +47620,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Unsupervised_Learning_of_Fine_Structure_Generation_for_3D_Point_Clouds_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Unsupervised_Learning_of_Fine_Structure_Generation_for_3D_Point_Clouds_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Chao and Han,\n Zhizhong and Liu,\n Yu-Shen and Zwicker,\n Matthias\n},\n title = {\n Unsupervised Learning of Fine Structure Generation for 3D Point Clouds by 2D Projections Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12466-12477\n} \n}" }, { "title": "Unsupervised Non-Rigid Image Distortion Removal via Grid Deformation", @@ -44614,6 +47629,7 @@ "status": "Poster", "track": "main", "pid": 6286, + "author_site": "Nianyi Li; Simron Thapa; Cameron Whyte; Albert W. Reed; Suren Jayasuriya; Jinwei Ye", "author": "Nianyi Li; Simron Thapa; Cameron Whyte; Albert W. Reed; Suren Jayasuriya; Jinwei Ye", "abstract": "Many computer vision problems face difficulties when imaging through turbulent refractive media (e.g., air and water) due to the refraction and scattering of light. These effects cause geometric distortion that requires either handcrafted physical priors or supervised learning methods to remove. In this paper, we present a novel unsupervised network to recover the latent distortion-free image. The key idea is to model non-rigid distortions as deformable grids. 
Our network consists of a grid deformer that estimates the distortion field and an image generator that outputs the distortion-free image. By leveraging the positional encoding operator, we can simplify the network structure while maintaining fine spatial details in the recovered images. Our method doesn't need to be trained on labeled data and has good transferability across various turbulent image datasets with different types of distortions. Extensive experiments on both simulated and real-captured turbulent images demonstrate that our method can remove both air and water distortions without much customization.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Unsupervised_Non-Rigid_Image_Distortion_Removal_via_Grid_Deformation_ICCV_2021_paper.pdf", @@ -44637,7 +47653,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Nianyi and Thapa,\n Simron and Whyte,\n Cameron and Reed,\n Albert W. and Jayasuriya,\n Suren and Ye,\n Jinwei\n},\n title = {\n Unsupervised Non-Rigid Image Distortion Removal via Grid Deformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2522-2532\n} \n}" }, { "title": "Unsupervised Point Cloud Object Co-Segmentation by Co-Contrastive Learning and Mutual Attention Sampling", @@ -44645,6 +47662,7 @@ "status": "Poster", "track": "main", "pid": 3098, + "author_site": "Cheng-Kun Yang; Yung-Yu Chuang; Yen-Yu Lin", "author": "Cheng-Kun Yang; Yung-Yu Chuang; Yen-Yu Lin", "abstract": "This paper presents a new task, point cloud object co-segmentation, aiming to segment the common 3D objects in a set of point clouds. 
We formulate this task as an object point sampling problem, and develop two techniques, the mutual attention module and co-contrastive learning, to enable it. The proposed method employs two point samplers based on deep neural networks, the object sampler and the background sampler. The former targets at sampling points of common objects while the latter focuses on the rest. The mutual attention module explores point-wise correlation across point clouds. It is embedded in both samplers and can identify points with strong cross-cloud correlation from the rest. After extracting features for points selected by the two samplers, we optimize the networks by developing the co-contrastive loss, which minimizes feature discrepancy of the estimated object points while maximizing feature separation between the estimated object and background points. Our method works on point clouds of an arbitrary object class. It is end-to-end trainable and does not need point-level annotations. It is evaluated on the ScanObjectNN and S3DIS datasets and achieves promising results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yang_Unsupervised_Point_Cloud_Object_Co-Segmentation_by_Co-Contrastive_Learning_and_Mutual_ICCV_2021_paper.pdf", @@ -44668,7 +47686,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2021_ICCV,\n \n author = {\n Yang,\n Cheng-Kun and Chuang,\n Yung-Yu and Lin,\n Yen-Yu\n},\n title = {\n Unsupervised Point Cloud Object Co-Segmentation by Co-Contrastive Learning and Mutual Attention Sampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7335-7344\n} \n}" }, { "title": "Unsupervised Point Cloud Pre-Training via Occlusion Completion", @@ -44676,6 +47695,7 @@ "status": "Poster", 
"track": "main", "pid": 1727, + "author_site": "Hanchen Wang; Qi Liu; Xiangyu Yue; Joan Lasenby; Matt J. Kusner", "author": "Hanchen Wang; Qi Liu; Xiangyu Yue; Joan Lasenby; Matt J. Kusner", "abstract": "We describe a simple pre-training approach for point clouds. It works in three steps: 1. Mask all points occluded in a camera view; 2. Learn an encoder-decoder model to reconstruct the occluded points; 3. Use the encoder weights as initialisation for downstream point cloud tasks. We find that even when we pre-train on a single dataset (ModelNet40), this method improves accuracy across different datasets and encoders, on a wide range of downstream tasks. Specifically, we show that our method outperforms previous pre-training methods in object classification, and both part-based and semantic segmentation tasks. We study the pre-trained features and find that they lead to wide downstream minima, have high transformation invariance, and have activations that are highly correlated with part labels. 
Code and data are available at https://github.com/hansen7/OcCo", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Unsupervised_Point_Cloud_Pre-Training_via_Occlusion_Completion_ICCV_2021_paper.pdf", @@ -44699,7 +47719,8 @@ "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Cambridge;;Berkeley", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Hanchen and Liu,\n Qi and Yue,\n Xiangyu and Lasenby,\n Joan and Kusner,\n Matt J.\n},\n title = {\n Unsupervised Point Cloud Pre-Training via Occlusion Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9782-9792\n} \n}" }, { "title": "Unsupervised Real-World Super-Resolution: A Domain Adaptation Perspective", @@ -44707,6 +47728,7 @@ "status": "Poster", "track": "main", "pid": 8230, + "author_site": "Wei Wang; Haochen Zhang; Zehuan Yuan; Changhu Wang", "author": "Wei Wang; Haochen Zhang; Zehuan Yuan; Changhu Wang", "abstract": "Most existing convolution neural network (CNN) based super-resolution (SR) methods generate their paired training dataset by artificially synthesizing low-resolution (LR) images from the high-resolution (HR) ones. However, this dataset preparation strategy harms the application of these CNNs in real-world scenarios due to the inherent domain gap between the training and testing data. A popular attempts towards the challenge is unpaired generative adversarial networks, which generate \"real\" LR counterparts from real HR images using image-to-image translation and then perform super-resolution from \"real\" LR->SR. Despite great progress, it is still difficult to synthesize perfect \"real\" LR images for super-resolution. 
In this paper, we firstly consider the real-world SR problem from the traditional domain adaptation perspective. We propose a novel unpaired SR training framework based on feature distribution alignment, with which we can obtain degradation-indistinguishable feature maps and then map them to HR images. In order to generate better SR images for target LR domain, we introduce several regularization losses to force the aligned feature to locate around the target domain. Our experiments indicate that our SR network obtains the state-of-the-art performance over both blind and unpaired SR methods on diverse datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Unsupervised_Real-World_Super-Resolution_A_Domain_Adaptation_Perspective_ICCV_2021_paper.pdf", @@ -44730,7 +47752,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0+1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Wei and Zhang,\n Haochen and Yuan,\n Zehuan and Wang,\n Changhu\n},\n title = {\n Unsupervised Real-World Super-Resolution: A Domain Adaptation Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4318-4327\n} \n}" }, { "title": "Unsupervised Segmentation Incorporating Shape Prior via Generative Adversarial Networks", @@ -44738,6 +47761,7 @@ "status": "Poster", "track": "main", "pid": 9869, + "author_site": "Dahye Kim; Byung-Woo Hong", "author": "Dahye Kim; Byung-Woo Hong", "abstract": "We present an image segmentation algorithm that is developed in an unsupervised deep learning framework. The delineation of object boundaries often fails due to the nuisance factors such as illumination changes and occlusions. 
Thus, we initially propose an unsupervised image decomposition algorithm to obtain an intrinsic representation that is robust with respect to undesirable bias fields based on a multiplicative image model. The obtained intrinsic image is subsequently provided to an unsupervised segmentation procedure that is developed based on a piecewise smooth model. The segmentation model is further designed to incorporate a geometric constraint imposed in the generative adversarial network framework where the discrepancy between the distribution of partitioning functions and the distribution of prior shapes is minimized. We demonstrate the effectiveness and robustness of the proposed algorithm in particular with bias fields and occlusions using simple yet illustrative synthetic examples and a benchmark dataset for image segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Unsupervised_Segmentation_Incorporating_Shape_Prior_via_Generative_Adversarial_Networks_ICCV_2021_paper.pdf", @@ -44761,7 +47785,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Dahye and Hong,\n Byung-Woo\n},\n title = {\n Unsupervised Segmentation Incorporating Shape Prior via Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7324-7334\n} \n}" }, { "title": "Unsupervised Semantic Segmentation by Contrasting Object Mask Proposals", @@ -44769,6 +47794,7 @@ "status": "Poster", "track": "main", "pid": 3456, + "author_site": "Wouter Van Gansbeke; Simon Vandenhende; Stamatios Georgoulis; Luc Van Gool", "author": "Wouter Van Gansbeke; Simon Vandenhende; Stamatios Georgoulis; Luc Van Gool", "abstract": "Being able to learn dense semantic 
representations of images without supervision is an important problem in computer vision. However, despite its significance, this problem remains rather unexplored, with a few exceptions that considered unsupervised semantic segmentation on small-scale datasets with a narrow visual domain. In this paper, we make a first attempt to tackle the problem on datasets that have been traditionally utilized for the supervised case. To achieve this, we introduce a two-step framework that adopts a predetermined mid-level prior in a contrastive optimization objective to learn pixel embeddings. This marks a large deviation from existing works that relied on proxy tasks or end-to-end clustering. Additionally, we argue about the importance of having a prior that contains information about objects, or their parts, and discuss several possibilities to obtain such a prior in an unsupervised manner. Experimental evaluation shows that our method comes with key advantages over existing works. First, the learned pixel embeddings can be directly clustered in semantic groups using K-Means on PASCAL. Under the fully unsupervised setting, there is no precedent in solving the semantic segmentation task on such a challenging benchmark. Second, our representations can improve over strong baselines when transferred to new datasets, e.g. COCO and DAVIS. 
The code is available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Van_Gansbeke_Unsupervised_Semantic_Segmentation_by_Contrasting_Object_Mask_Proposals_ICCV_2021_paper.pdf", @@ -44792,7 +47818,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+1", - "aff_country_unique": "Belgium;Switzerland" + "aff_country_unique": "Belgium;Switzerland", + "bibtex": "@InProceedings{Van_Gansbeke_2021_ICCV,\n \n author = {\n Van Gansbeke,\n Wouter and Vandenhende,\n Simon and Georgoulis,\n Stamatios and Van Gool,\n Luc\n},\n title = {\n Unsupervised Semantic Segmentation by Contrasting Object Mask Proposals\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10052-10062\n} \n}" }, { "title": "V-DESIRR: Very Fast Deep Embedded Single Image Reflection Removal", @@ -44800,6 +47827,7 @@ "status": "Poster", "track": "main", "pid": 9614, + "author_site": "B H Pawan Prasad; Green Rosh K S; Lokesh R. Boregowda; Kaushik Mitra; Sanjoy Chowdhury", "author": "B H Pawan Prasad; Green Rosh K S; Lokesh R. Boregowda; Kaushik Mitra; Sanjoy Chowdhury", "abstract": "Real world images often gets corrupted due to unwanted reflections and their removal is highly desirable. A major share of such images originate from smart phone cameras capable of very high resolution captures. Most of the existing methods either focus on restoration quality by compromising on processing speed and memory requirements or, focus on removing reflections at very low resolutions, there by limiting their practical deploy-ability. We propose a light weight deep learning model for reflection removal using a novel scale space architecture. Our method processes the corrupted image in two stages, a Low Scale Sub-network (LSSNet) to process the lowest scale and a Progressive Inference (PI) stage to process all the higher scales. 
In order to reduce the computational complexity, the sub-networks in PI stage are designed to be much shallower than LSSNet. Moreover, we employ weight sharing between various scales within the PI stage to limit the model size. This also allows our method to generalize to very high resolutions without explicit retraining. Our method is superior both qualitatively and quantitatively compared to the state of the art methods and at the same time 20x faster with 50x less number of parameters compared to the most recent state-of-the-art algorithm RAGNet. We implemented our method on an android smart phone, where a high resolution 12 MP image is restored in under 5 seconds.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Prasad_V-DESIRR_Very_Fast_Deep_Embedded_Single_Image_Reflection_Removal_ICCV_2021_paper.pdf", @@ -44816,14 +47844,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Prasad_V-DESIRR_Very_Fast_Deep_Embedded_Single_Image_Reflection_Removal_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Samsung;Indian Institute of Technology Madras", - "aff_unique_dep": "Samsung R&D Institute;", + "aff_unique_norm": "Samsung R&D Institute;Indian Institute of Technology Madras", + "aff_unique_dep": ";", "aff_unique_url": "https://www.samsung.com/in/;https://www.iitm.ac.in", "aff_unique_abbr": "Samsung R&D;IIT Madras", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Bangalore;Chennai", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Prasad_2021_ICCV,\n \n author = {\n Prasad,\n B H Pawan and S,\n Green Rosh K and Boregowda,\n Lokesh R. 
and Mitra,\n Kaushik and Chowdhury,\n Sanjoy\n},\n title = {\n V-DESIRR: Very Fast Deep Embedded Single Image Reflection Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2390-2399\n} \n}" }, { "title": "VENet: Voting Enhancement Network for 3D Object Detection", @@ -44831,6 +47860,7 @@ "status": "Poster", "track": "main", "pid": 6157, + "author_site": "Qian Xie; Yu-Kun Lai; Jing Wu; Zhoutao Wang; Dening Lu; Mingqiang Wei; Jun Wang", "author": "Qian Xie; Yu-Kun Lai; Jing Wu; Zhoutao Wang; Dening Lu; Mingqiang Wei; Jun Wang", "abstract": "Hough voting, as has been demonstrated in VoteNet, is effective for 3D object detection, where voting is a key step. In this paper, we propose a novel VoteNet-based 3D detector with vote enhancement to improve the detection accuracy in cluttered indoor scenes. It addresses the limitations of current voting schemes, i.e., votes from neighboring objects and background have significant negative impacts Specifically, before voting, we replace the classic MLP with the proposed Attentive MLP (AMLP) in the backbone network to get better feature description of seed points. During voting, we design a new vote attraction loss (VALoss) to enforce vote centers to locate closely and compactly to the corresponding object centers. After voting, we then devise a vote weighting module to integrate the foreground/background prediction into the vote aggregation process to enhance the capability of the original VoteNet to handle noise from background voting. The three proposed strategies all contribute to more effective voting and improved performance, resulting in a novel 3D object detector, termed VENet. Experiments show that our method outperforms state-of-the-art methods on benchmark datasets. 
Ablation studies demonstrate the effectiveness of the proposed components.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xie_VENet_Voting_Enhancement_Network_for_3D_Object_Detection_ICCV_2021_paper.pdf", @@ -44854,7 +47884,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Xie_2021_ICCV,\n \n author = {\n Xie,\n Qian and Lai,\n Yu-Kun and Wu,\n Jing and Wang,\n Zhoutao and Lu,\n Dening and Wei,\n Mingqiang and Wang,\n Jun\n},\n title = {\n VENet: Voting Enhancement Network for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3712-3721\n} \n}" }, { "title": "VIL-100: A New Dataset and a Baseline Model for Video Instance Lane Detection", @@ -44862,10 +47893,11 @@ "status": "Poster", "track": "main", "pid": 2714, + "author_site": "Yujun Zhang; Lei Zhu; Wei Feng; Huazhu Fu; Mingqian Wang; Qingxia Li; Cheng Li; Song Wang", "author": "Yujun Zhang; Lei Zhu; Wei Feng; Huazhu Fu; Mingqian Wang; Qingxia Li; Cheng Li; Song Wang", "abstract": "Lane detection plays a key role in autonomous driving. While car cameras always take streaming videos on the way, current lane detection works mainly focus on individual images (frames) by ignoring dynamics along the video. In this work, we collect a new video instance lane detection (VIL-100) dataset, which contains 100 videos with in total 10,000 frames, acquired from different real traffic scenarios. All the frames in each video are manually annotated to a high-quality instance-level lane annotation, and a set of frame-level and video-level metrics are included for quantitative performance evaluation. 
Moreover, we propose a new baseline model, named multi-level memory aggregation network (MMA-Net), for video instance lane detection. In our approach, the representation of current frame is enhanced by attentively aggregating both local and global memory features from other frames. Experiments on the new collected dataset show that the proposed MMA-Net outperforms state-of-the-art lane detection methods and video object segmentation methods. We release our dataset and code at https://github.com/yujun0-0/MMA-Net.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_VIL-100_A_New_Dataset_and_a_Baseline_Model_for_Video_ICCV_2021_paper.pdf", - "aff": "Tianjin University; University of Cambridge; Tianjin University+Inception Institute of Arti\ufb01cial Intelligence; Inception Institute of Arti\ufb01cial Intelligence; Tianjin University; Automotive Data of China (Tianjin) Co., Ltd; Tianjin University; Tianjin University+University of South Carolina", + "aff": "Tianjin University; University of Cambridge; Tianjin University+Inception Institute of Artificial Intelligence; Inception Institute of Artificial Intelligence; Tianjin University; Automotive Data of China (Tianjin) Co., Ltd; Tianjin University; Tianjin University+University of South Carolina", "project": "", "github": "https://github.com/yujun0-0/MMA-Net", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhang_VIL-100_A_New_ICCV_2021_supplemental.pdf", @@ -44882,10 +47914,11 @@ "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.tju.edu.cn;https://www.cam.ac.uk;https://www.inceptionai.org;;https://www.sc.edu", "aff_unique_abbr": "TJU;Cambridge;;;USC", - "aff_campus_unique_index": "1;;", - "aff_campus_unique": ";Cambridge", + "aff_campus_unique_index": "1;;2;", + "aff_campus_unique": ";Cambridge;Tianjin", "aff_country_unique_index": "0;1;0+0;0;0;0;0;0+2", - "aff_country_unique": "China;United Kingdom;United States" + "aff_country_unique": "China;United Kingdom;United States", 
+ "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yujun and Zhu,\n Lei and Feng,\n Wei and Fu,\n Huazhu and Wang,\n Mingqian and Li,\n Qingxia and Li,\n Cheng and Wang,\n Song\n},\n title = {\n VIL-100: A New Dataset and a Baseline Model for Video Instance Lane Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15681-15690\n} \n}" }, { "title": "VLGrammar: Grounded Grammar Induction of Vision and Language", @@ -44893,6 +47926,7 @@ "status": "Poster", "track": "main", "pid": 9974, + "author_site": "Yining Hong; Qing Li; Song-Chun Zhu; Siyuan Huang", "author": "Yining Hong; Qing Li; Song-Chun Zhu; Siyuan Huang", "abstract": "Cognitive grammar suggests that the acquisition of language grammar is grounded within visual structures. While grammar is an essential representation of natural language, it also exists ubiquitously in vision to represent the hierarchical part-whole structure. In this work, we study grounded grammar induction of vision and language in a joint learning framework. Specifically, we present VLGrammar, a method that uses compound probabilistic context-free grammars (compound PCFGs) to induce the language grammar and the image grammar simultaneously. We propose a novel contrastive learning framework to guide the joint learning of both modules. To provide a benchmark for the grounded grammar induction task, we collect a large-scale dataset, PartIt, which contains human-written sentences that describe part-level semantics for 3D objects. Experiments on the PartIt dataset show that VLGrammar outperforms all baselines in image grammar induction and language grammar induction. The learned VLGrammar naturally benefits related downstream tasks. Specifically, it improves the image unsupervised clustering accuracy by 30%, and performs well in image retrieval and text retrieval. 
Notably, the induced grammar shows superior generalizability by easily generalizing to unseen categories.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hong_VLGrammar_Grounded_Grammar_Induction_of_Vision_and_Language_ICCV_2021_paper.pdf", @@ -44916,7 +47950,8 @@ "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;1+1+1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Hong_2021_ICCV,\n \n author = {\n Hong,\n Yining and Li,\n Qing and Zhu,\n Song-Chun and Huang,\n Siyuan\n},\n title = {\n VLGrammar: Grounded Grammar Induction of Vision and Language\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1665-1674\n} \n}" }, { "title": "VMNet: Voxel-Mesh Network for Geodesic-Aware 3D Semantic Segmentation", @@ -44924,6 +47959,7 @@ "status": "Poster", "track": "main", "pid": 7221, + "author_site": "Zeyu Hu; Xuyang Bai; Jiaxiang Shang; Runze Zhang; Jiayu Dong; Xin Wang; Guangyuan Sun; Hongbo Fu; Chiew-Lan Tai", "author": "Zeyu Hu; Xuyang Bai; Jiaxiang Shang; Runze Zhang; Jiayu Dong; Xin Wang; Guangyuan Sun; Hongbo Fu; Chiew-Lan Tai", "abstract": "In recent years, sparse voxel-based methods have become the state-of-the-arts for 3D semantic segmentation of indoor scenes, thanks to the powerful 3D CNNs. Nevertheless, being oblivious to the underlying geometry, voxel-based methods suffer from ambiguous features on spatially close objects and struggle with handling complex and irregular geometries due to the lack of geodesic information. In view of this, we present Voxel-Mesh Network (VMNet), a novel 3D deep architecture that operates on the voxel and mesh representations leveraging both the Euclidean and geodesic information. 
Intuitively, the Euclidean information extracted from voxels can offer contextual cues representing interactions between nearby objects, while the geodesic information extracted from meshes can help separate objects that are spatially close but have disconnected surfaces. To incorporate such information from the two domains, we design an intra-domain attentive module for effective feature aggregation and an inter-domain attentive module for adaptive feature fusion. Experimental results validate the effectiveness of VMNet: specifically, on the challenging ScanNet dataset for large-scale segmentation of indoor scenes, it outperforms the state-of-the-art SparseConvNet and MinkowskiNet (74.6% vs 72.5% and 73.6% in mIoU) with a simpler network structure (17M vs 30M and 38M parameters). Code release: https://github.com/hzykent/VMNet", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_VMNet_Voxel-Mesh_Network_for_Geodesic-Aware_3D_Semantic_Segmentation_ICCV_2021_paper.pdf", @@ -44947,7 +47983,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Zeyu and Bai,\n Xuyang and Shang,\n Jiaxiang and Zhang,\n Runze and Dong,\n Jiayu and Wang,\n Xin and Sun,\n Guangyuan and Fu,\n Hongbo and Tai,\n Chiew-Lan\n},\n title = {\n VMNet: Voxel-Mesh Network for Geodesic-Aware 3D Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15488-15498\n} \n}" }, { "title": "VSAC: Efficient and Accurate Estimator for H and F", @@ -44955,10 +47992,11 @@ "status": "Poster", "track": "main", "pid": 3463, - "author": "Maksym Ivashechkin; Daniel Barath; Ji\u0159\u00ed Matas", + "author_site": "Maksym Ivashechkin; Daniel Barath; Jiří Matas", + 
"author": "Maksym Ivashechkin; Daniel Barath; Jiří Matas", "abstract": "We present VSAC, a RANSAC-type robust estimator with a number of novelties. It benefits from the introduction of the concept of independent inliers that improves significantly the efficacy of the dominant plane handling and also allows near error-free rejection of incorrect models, without false positives. The local optimization process and its application is improved so that it is run on average only once. Further technical improvements include adaptive sequential hypothesis verification and efficient model estimation via Gaussian elimination. Experiments on four standard datasets show that VSAC is significantly faster than all its predecessors and runs on average in 1-2 ms, on a CPU. It is two orders of magnitude faster and yet as precise as MAGSAC++, the currently most accurate estimator of two-view geometry. In the repeated runs on EVD, HPatches, PhotoTourism, and Kusvod2 datasets, it never failed.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ivashechkin_VSAC_Efficient_and_Accurate_Estimator_for_H_and_F_ICCV_2021_paper.pdf", - "aff": "Centre for Machine Perception, Czech Technical University in Prague, Czech Republic; Computer Vision and Geometry Group, Department of Computer Science, ETH Z\u00fcrich; Centre for Machine Perception, Czech Technical University in Prague, Czech Republic", + "aff": "Centre for Machine Perception, Czech Technical University in Prague, Czech Republic; Computer Vision and Geometry Group, Department of Computer Science, ETH Zürich; Centre for Machine Perception, Czech Technical University in Prague, Czech Republic", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Ivashechkin_VSAC_Efficient_and_ICCV_2021_supplemental.pdf", @@ -44971,14 +48009,15 @@ "author_num": 3, "oa": 
"https://openaccess.thecvf.com/content/ICCV2021/html/Ivashechkin_VSAC_Efficient_and_Accurate_Estimator_for_H_and_F_ICCV_2021_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Czech Technical University in Prague;ETH Zurich", + "aff_unique_norm": "Czech Technical University in Prague;ETH Zürich", "aff_unique_dep": "Centre for Machine Perception;Department of Computer Science", "aff_unique_url": "https://www.cvut.cz;https://www.ethz.ch", "aff_unique_abbr": "CTU;ETHZ", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Czech Republic;Switzerland" + "aff_country_unique": "Czech Republic;Switzerland", + "bibtex": "@InProceedings{Ivashechkin_2021_ICCV,\n \n author = {\n Ivashechkin,\n Maksym and Barath,\n Daniel and Matas,\n Ji\\v{r\n}{\\'\\i\n}\n},\n title = {\n VSAC: Efficient and Accurate Estimator for H and F\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15243-15252\n} \n}" }, { "title": "VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers", @@ -44986,6 +48025,7 @@ "status": "Poster", "track": "main", "pid": 7103, + "author_site": "Shichen Liu; Yichao Zhou; Yajie Zhao", "author": "Shichen Liu; Yichao Zhou; Yajie Zhao", "abstract": "Being able to infer 3D structures from 2D images with geometric principles, vanishing points have been a well-recognized concept in 3D vision research. It has been widely used in autonomous driving, SLAM, and AR/VR for applications including road direction estimation, camera calibration, and camera pose estimation. Existing vanishing point detection methods often need to trade off between robustness, precision, and inference speed. In this paper, we introduce VaPiD, a novel neural network-based rapid Vanishing Point Detector that achieves unprecedented efficiency with learned vanishing point optimizers. 
The core of our method contains two components: a vanishing point proposal network that gives a set of vanishing point proposals as coarse estimations; and a neural vanishing point optimizer that iteratively optimizes the positions of the vanishing point proposals to achieve high-precision levels. Extensive experiments on both synthetic and real-world datasets show that our method provides competitive, if not better, performance as compared to the previous state-of-the-art vanishing point detection approaches, while being significantly faster.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_VaPiD_A_Rapid_Vanishing_Point_Detector_via_Learned_Optimizers_ICCV_2021_paper.pdf", @@ -45009,7 +48049,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "Los Angeles;;Berkeley", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Shichen and Zhou,\n Yichao and Zhao,\n Yajie\n},\n title = {\n VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12859-12868\n} \n}" }, { "title": "VariTex: Variational Neural Face Textures", @@ -45017,7 +48058,8 @@ "status": "Poster", "track": "main", "pid": 9106, - "author": "Marcel C. B\u00fchler; Abhimitra Meka; Gengyan Li; Thabo Beeler; Otmar Hilliges", + "author_site": "Marcel C. Bühler; Abhimitra Meka; Gengyan Li; Thabo Beeler; Otmar Hilliges", + "author": "Marcel C. 
Bühler; Abhimitra Meka; Gengyan Li; Thabo Beeler; Otmar Hilliges", "abstract": "Deep generative models can synthesize photorealistic images of human faces with novel identities.However, a key challenge to the wide applicability of such techniques is to provide independent control over semantically meaningful parameters: appearance, head pose, face shape, and facial expressions. In this paper, we propose VariTex - to the best of our knowledge the first method that learns a variational latent feature space of neural face textures, which allows sampling of novel identities. We combine this generative model with a parametric face model and gain explicit control over head pose and facial expressions. To generate complete images of human heads, we propose an additive decoder that adds plausible details such as hair. A novel training scheme enforces a pose-independent latent space and in consequence, allows learning a one-to-many mapping between latent codes and pose-conditioned exterior regions. The resulting method can generate geometrically consistent images of novel identities under fine-grained control over head pose, face shape, and facial expressions. 
This facilitates a broad range of downstream tasks, like sampling novel identities, changing the head pose, expression transfer, and more.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Buhler_VariTex_Variational_Neural_Face_Textures_ICCV_2021_paper.pdf", "aff": "ETH Zurich; Google; ETH Zurich + Google; Google; ETH Zurich", @@ -45034,13 +48076,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Buhler_VariTex_Variational_Neural_Face_Textures_ICCV_2021_paper.html", "aff_unique_index": "0;1;0+1;1;0", "aff_unique_norm": "ETH Zurich;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.google.com", "aff_unique_abbr": "ETHZ;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0+1;1;0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Buhler_2021_ICCV,\n \n author = {\n B\\\"uhler,\n Marcel C. and Meka,\n Abhimitra and Li,\n Gengyan and Beeler,\n Thabo and Hilliges,\n Otmar\n},\n title = {\n VariTex: Variational Neural Face Textures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13890-13899\n} \n}" }, { "title": "Variable-Rate Deep Image Compression Through Spatially-Adaptive Feature Transform", @@ -45048,6 +48091,7 @@ "status": "Poster", "track": "main", "pid": 5705, + "author_site": "Myungseo Song; Jinyoung Choi; Bohyung Han", "author": "Myungseo Song; Jinyoung Choi; Bohyung Han", "abstract": "We propose a versatile deep image compression network based on Spatial Feature Transform (SFT), which takes a source image and a corresponding quality map as inputs and produce a compressed image with variable rates.
Our model covers a wide range of compression rates using a single model, which is controlled by arbitrary pixel-wise quality maps. In addition, the proposed framework allows us to perform task-aware image compressions for various tasks, e.g., classification, by efficiently estimating optimized quality maps specific to target tasks for our encoding network. This is even possible with a pretrained network without learning separate models for individual tasks. Our algorithm achieves outstanding rate-distortion trade-off compared to the approaches based on multiple models that are optimized separately for several different target rates. At the same level of compression, the proposed approach successfully improves performance on image classification and text region quality preservation via task-aware quality map estimation without additional model training. The code is available at the project website https://github.com/micmic123/QmapCompression.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_Variable-Rate_Deep_Image_Compression_Through_Spatially-Adaptive_Feature_Transform_ICCV_2021_paper.pdf", @@ -45071,7 +48115,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Myungseo and Choi,\n Jinyoung and Han,\n Bohyung\n},\n title = {\n Variable-Rate Deep Image Compression Through Spatially-Adaptive Feature Transform\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2380-2389\n} \n}" }, { "title": "Variational Attention: Propagating Domain-Specific Knowledge for Multi-Domain Learning in Crowd Counting", @@ -45079,6 +48124,7 @@ "status": "Poster", "track": "main", "pid": 7198, + "author_site": "Binghui Chen; Zhaoyi Yan; Ke Li; Pengyu Li; Biao 
Wang; Wangmeng Zuo; Lei Zhang", "author": "Binghui Chen; Zhaoyi Yan; Ke Li; Pengyu Li; Biao Wang; Wangmeng Zuo; Lei Zhang", "abstract": "In crowd counting, due to the problem of laborious labelling, it is perceived intractability of collecting a new large-scale dataset which has plentiful images with large diversity in density, scene, etc. Thus, for learning a general model, training with data from multiple different datasets might be a remedy and be of great value. In this paper, we resort to the multi-domain joint learning and propose a simple but effective Domain-specific Knowledge Propagating Network (DKPNet) for unbiasedly learning the knowledge from multiple diverse data domains at the same time. It is mainly achieved by proposing the novel Variational Attention(VA) technique for explicitly modeling the attention distributions for different domains. And as an extension to VA, Intrinsic Variational Attention(InVA) is proposed to handle the problems of over-lapped domains and sub-domains. 
Extensive experiments have been conducted to validate the superiority of our DKPNet over several popular datasets, including ShanghaiTech A/B, UCF-QNRF and NWPU.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Variational_Attention_Propagating_Domain-Specific_Knowledge_for_Multi-Domain_Learning_in_Crowd_ICCV_2021_paper.pdf", @@ -45095,14 +48141,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Variational_Attention_Propagating_Domain-Specific_Knowledge_for_Multi-Domain_Learning_in_Crowd_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Harbin Institute of Technology;Hong Kong Polytechnic University", + "aff_unique_norm": "Harbin Institute of Technology;The Hong Kong Polytechnic University", "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;https://www.polyu.edu.hk", "aff_unique_abbr": "HIT;PolyU", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Harbin;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Binghui and Yan,\n Zhaoyi and Li,\n Ke and Li,\n Pengyu and Wang,\n Biao and Zuo,\n Wangmeng and Zhang,\n Lei\n},\n title = {\n Variational Attention: Propagating Domain-Specific Knowledge for Multi-Domain Learning in Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16065-16075\n} \n}" }, { "title": "Variational Feature Disentangling for Fine-Grained Few-Shot Classification", @@ -45110,6 +48157,7 @@ "status": "Poster", "track": "main", "pid": 2008, + "author_site": "Jingyi Xu; Hieu Le; Mingzhen Huang; ShahRukh Athar; Dimitris Samaras", "author": "Jingyi Xu; Hieu Le; Mingzhen Huang; ShahRukh Athar; Dimitris Samaras", "abstract": "Data augmentation is an intuitive step towards solving the 
problem of few-shot classification. However, ensuring both discriminability and diversity in the augmented samples is challenging. To address this, we propose a feature disentanglement framework that allows us to augment features with randomly sampled intra-class variations while preserving their class-discriminative features. Specifically, we disentangle a feature representation into two components: one represents the intra-class variance and the other encodes the class-discriminative information. We assume that the intra-class variance induced by variations in poses, backgrounds, or illumination conditions is shared across all classes and can be modelled via a common distribution. Then we sample features repeatedly from the learned intra-class variability distribution and add them to the class-discriminative features to get the augmented features. Such a data augmentation scheme ensures that the augmented features inherit crucial class-discriminative features while exhibiting large intra-class variance. Our method significantly outperforms the state-of-the-art methods on multiple challenging fine-grained few-shot image classification benchmarks. 
Code is available at: https://github.com/cvlab-stonybrook/vfd-iccv21", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Variational_Feature_Disentangling_for_Fine-Grained_Few-Shot_Classification_ICCV_2021_paper.pdf", @@ -45124,7 +48172,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Variational_Feature_Disentangling_for_Fine-Grained_Few-Shot_Classification_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Xu_Variational_Feature_Disentangling_for_Fine-Grained_Few-Shot_Classification_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Jingyi and Le,\n Hieu and Huang,\n Mingzhen and Athar,\n ShahRukh and Samaras,\n Dimitris\n},\n title = {\n Variational Feature Disentangling for Fine-Grained Few-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8812-8821\n} \n}" }, { "title": "Vector Neurons: A General Framework for SO(3)-Equivariant Networks", @@ -45132,6 +48181,7 @@ "status": "Poster", "track": "main", "pid": 6964, + "author_site": "Congyue Deng; Or Litany; Yueqi Duan; Adrien Poulenard; Andrea Tagliasacchi; Leonidas J. Guibas", "author": "Congyue Deng; Or Litany; Yueqi Duan; Adrien Poulenard; Andrea Tagliasacchi; Leonidas J. Guibas", "abstract": "Invariance and equivariance to the rotation group have been widely discussed in the 3D deep learning community for pointclouds. Yet most proposed methods either use complex mathematical tools that may limit their accessibility, or are tied to specific input data types and network architectures. In this paper, we introduce a general framework built on top of what we call Vector Neuron representations for creating SO(3)-equivariant neural networks for pointcloud processing. 
Extending neurons from 1D scalars to 3D vectors, our vector neurons enable a simple mapping of SO(3) actions to latent spaces thereby providing a framework for building equivariance in common neural operations -- including linear layers, non-linearities, pooling, and normalizations. Due to their simplicity, vector neurons are versatile and, as we demonstrate, can be incorporated into diverse network architecture backbones, allowing them to process geometry inputs in arbitrary poses. Despite its simplicity, our method performs comparably well in accuracy and generalization with other more complex and specialized state-of-the-art methods on classification and segmentation tasks. We also show for the first time a rotation equivariant reconstruction network. Source code is available at https://github.com/FlyingGiraffe/vnn.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Deng_Vector_Neurons_A_General_Framework_for_SO3-Equivariant_Networks_ICCV_2021_paper.pdf", @@ -45148,14 +48198,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Deng_Vector_Neurons_A_General_Framework_for_SO3-Equivariant_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;0;2+3;0", - "aff_unique_norm": "Stanford University;NVIDIA;Google;University of Toronto", - "aff_unique_dep": ";NVIDIA Corporation;Google Research;", + "aff_unique_norm": "Stanford University;NVIDIA Corporation;Google;University of Toronto", + "aff_unique_dep": ";;Google Research;", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com;https://research.google;https://www.utoronto.ca", "aff_unique_abbr": "Stanford;NVIDIA;Google Research;U of T", "aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0;0+1;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Deng_2021_ICCV,\n \n author = {\n Deng,\n Congyue and Litany,\n Or and 
Duan,\n Yueqi and Poulenard,\n Adrien and Tagliasacchi,\n Andrea and Guibas,\n Leonidas J.\n},\n title = {\n Vector Neurons: A General Framework for SO(3)-Equivariant Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12200-12209\n} \n}" }, { "title": "Vector-Decomposed Disentanglement for Domain-Invariant Object Detection", @@ -45163,6 +48214,7 @@ "status": "Poster", "track": "main", "pid": 8166, + "author_site": "Aming Wu; Rui Liu; Yahong Han; Linchao Zhu; Yi Yang", "author": "Aming Wu; Rui Liu; Yahong Han; Linchao Zhu; Yi Yang", "abstract": "To improve the generalization of detectors, for domain adaptive object detection (DAOD), recent advances mainly explore aligning feature-level distributions between the source and single-target domain, which may neglect the impact of domain-specific information existing in the aligned features. Towards DAOD, it is important to extract domain-invariant object representations. To this end, in this paper, we try to disentangle domain-invariant representations from domain-specific representations. And we propose a novel disentangled method based on vector decomposition. Firstly, an extractor is devised to separate domain-invariant representations from the input, which are used for extracting object proposals. Secondly, domain-specific representations are introduced as the differences between the input and domain-invariant representations. Through the difference operation, the gap between the domain-specific and domain-invariant representations is enlarged, which promotes domain-invariant representations to contain more domain-irrelevant information. In the experiment, we separately evaluate our method on the single- and compound-target case. For the single-target case, experimental results of four domain-shift scenes show our method obtains a significant performance gain over baseline methods. 
Moreover, for the compound-target case (i.e., the target is a compound of two different domains without domain labels), our method outperforms baseline methods by around 4%, which demonstrates the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Vector-Decomposed_Disentanglement_for_Domain-Invariant_Object_Detection_ICCV_2021_paper.pdf", @@ -45179,14 +48231,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Vector-Decomposed_Disentanglement_for_Domain-Invariant_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0+0;0+0+1;0+0+1;2;2", - "aff_unique_norm": "Tianjin University;Pengcheng Laboratory;University of Technology Sydney", - "aff_unique_dep": "College of Intelligence and Computing;Peng Cheng Laboratory;ReLER Lab, AAII", + "aff_unique_norm": "Tianjin University;Peng Cheng Laboratory;University of Technology Sydney", + "aff_unique_dep": "College of Intelligence and Computing;;ReLER Lab, AAII", "aff_unique_url": "http://www.tju.edu.cn;;https://www.uts.edu.au", "aff_unique_abbr": "Tianjin University;;UTS", - "aff_campus_unique_index": "0+0;0+0+1;0+0+1", - "aff_campus_unique": "Tianjin;Shenzhen;", + "aff_campus_unique_index": "0+0;0+0+1;0+0+1;2;2", + "aff_campus_unique": "Tianjin;Shenzhen;Sydney", "aff_country_unique_index": "0+0;0+0+0;0+0+0;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Aming and Liu,\n Rui and Han,\n Yahong and Zhu,\n Linchao and Yang,\n Yi\n},\n title = {\n Vector-Decomposed Disentanglement for Domain-Invariant Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9342-9351\n} \n}" }, { "title": "Vi2CLR: Video and Image for Visual Contrastive Learning of Representation", @@ -45194,6 +48247,7 @@ "status": "Poster", 
"track": "main", "pid": 8546, + "author_site": "Ali Diba; Vivek Sharma; Reza Safdari; Dariush Lotfi; Saquib Sarfraz; Rainer Stiefelhagen; Luc Van Gool", "author": "Ali Diba; Vivek Sharma; Reza Safdari; Dariush Lotfi; Saquib Sarfraz; Rainer Stiefelhagen; Luc Van Gool", "abstract": "In this paper, we introduce a novel self-supervised visual representation learning method which understands both images and videos in a joint learning fashion. The proposed neural network architecture and objectives are designed to obtain two different Convolutional Neural Networks for solving visual recognition tasks in the domain of videos and images. Our method called Video/Image for Visual Contrastive Learning of Representation(Vi2CLR) uses unlabeled videos to exploit dynamic and static visual cues for self-supervised and instances similarity/dissimilarity learning. Vi2CLR optimization pipeline consists of visual clustering part and representation learning based on groups of similar positive instances within a cluster and negative ones from other clusters and learning visual clusters and their distances. We show how a joint self-supervised visual clustering and instance similarity learning with 2D (image) and 3D (video) CovNet encoders yields such robust and near to supervised learning performance. We extensively evaluate the method on downstream tasks like large scale action recognition and image and object classification on datasets like Kinetics, ImageNet, Pascal VOC'07 and UCF101 and achieve outstanding results compared to state-of-the-art self-supervised methods. 
To the best of our knowledge, the Vi2CLR is the first of its kind self-supervised neural network to tackle both video and image recognition task simultaneously by only using one source of data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Diba_Vi2CLR_Video_and_Image_for_Visual_Contrastive_Learning_of_Representation_ICCV_2021_paper.pdf", @@ -45211,13 +48265,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Diba_Vi2CLR_Video_and_Image_for_Visual_Contrastive_Learning_of_Representation_ICCV_2021_paper.html", "aff_unique_index": "0+1;2+3;4;4;5+6;5;0+4+1", "aff_unique_norm": "Katholieke Universiteit Leuven;ETH Zurich;Massachusetts Institute of Technology;Harvard University;Sensifai;Karlsruhe Institute of Technology;Daimler AG", - "aff_unique_dep": ";;;Medical School;;;TSS (Testing and Simulation Services)", + "aff_unique_dep": ";;;Medical School;;;TSS (Technology Solutions & Services)", "aff_unique_url": "https://www.kuleuven.be;https://www.ethz.ch;https://web.mit.edu;https://hms.harvard.edu;;https://www.kit.edu;https://www.daimler.com", "aff_unique_abbr": "KU Leuven;ETHZ;MIT;HMS;;KIT;Daimler", "aff_campus_unique_index": ";1;;", "aff_campus_unique": ";Boston", "aff_country_unique_index": "0+1;2+2;3;3;4+4;4;0+3+1", - "aff_country_unique": "Belgium;Switzerland;United States;Unknown;Germany" + "aff_country_unique": "Belgium;Switzerland;United States;Unknown;Germany", + "bibtex": "@InProceedings{Diba_2021_ICCV,\n \n author = {\n Diba,\n Ali and Sharma,\n Vivek and Safdari,\n Reza and Lotfi,\n Dariush and Sarfraz,\n Saquib and Stiefelhagen,\n Rainer and Van Gool,\n Luc\n},\n title = {\n Vi2CLR: Video and Image for Visual Contrastive Learning of Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1502-1512\n} \n}" }, { "title": "ViViT: A Video Vision Transformer", @@ -45225,7 +48280,8 @@ "status": "Poster", "track": 
"main", "pid": 2188, - "author": "Anurag Arnab; Mostafa Dehghani; Georg Heigold; Chen Sun; Mario Lu\u010di\u0107; Cordelia Schmid", + "author_site": "Anurag Arnab; Mostafa Dehghani; Georg Heigold; Chen Sun; Mario Lučić; Cordelia Schmid", + "author": "Anurag Arnab; Mostafa Dehghani; Georg Heigold; Chen Sun; Mario Lučić; Cordelia Schmid", "abstract": "We present pure-transformer based models for video classification, drawing upon the recent success of such models in image classification. Our model extracts spatio-temporal tokens from the input video, which are then encoded by a series of transformer layers. In order to handle the long sequences of tokens encountered in video, we propose several, efficient variants of our model which factorise the spatial- and temporal-dimensions of the input. Although transformer-based models are known to only be effective when large training datasets are available, we show how we can effectively regularise the model during training and leverage pretrained image models to be able to train on comparatively small datasets. We conduct thorough ablation studies, and achieve state-of-the-art results on multiple video classification benchmarks including Kinetics 400 and 600, Epic Kitchens, Something-Something v2 and Moments in Time, outperforming prior methods based on deep 3D convolutional networks. 
To facilitate further research, we will release code and models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Arnab_ViViT_A_Video_Vision_Transformer_ICCV_2021_paper.pdf", "aff": "Google Research; Google Research; Google Research; Google Research; Google Research; Google Research", @@ -45248,7 +48304,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Arnab_2021_ICCV,\n \n author = {\n Arnab,\n Anurag and Dehghani,\n Mostafa and Heigold,\n Georg and Sun,\n Chen and Lu\\v{c\n}i\\'c,\n Mario and Schmid,\n Cordelia\n},\n title = {\n ViViT: A Video Vision Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6836-6846\n} \n}" }, { "title": "VidTr: Video Transformer Without Convolutions", @@ -45256,6 +48313,7 @@ "status": "Poster", "track": "main", "pid": 6737, + "author_site": "Yanyi Zhang; Xinyu Li; Chunhui Liu; Bing Shuai; Yi Zhu; Biagio Brattoli; Hao Chen; Ivan Marsic; Joseph Tighe", "author": "Yanyi Zhang; Xinyu Li; Chunhui Liu; Bing Shuai; Yi Zhu; Biagio Brattoli; Hao Chen; Ivan Marsic; Joseph Tighe", "abstract": "We introduce Video Transformer (VidTr) with separable-attention for video classification. Comparing with commonly used 3D networks, VidTr is able to aggregate spatio-temporal information via stacked attentions and provide better performance with higher efficiency. We first introduce the vanilla video transformer and show that the transformer module is able to perform spatio-temporal modeling from raw pixels, but with heavy memory usage. We then present VidTr which reduces the memory cost by 3.3xwhile keeping the same performance. 
To further optimize the model, we propose the standard deviation based topK pooling for attention, which reduces the computation by dropping non-informative features along temporal dimension. VidTr achieves state-of-the-art performance on five commonly used datasets with lower computational requirements, showing both the efficiency and effectiveness of our design. Finally, error analysis and visualization show that VidTr is especially good at predicting actions that require long-term temporal reasoning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_VidTr_Video_Transformer_Without_Convolutions_ICCV_2021_paper.pdf", @@ -45272,14 +48330,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhang_VidTr_Video_Transformer_Without_Convolutions_ICCV_2021_paper.html", "aff_unique_index": "0+1;0+1;0;0;0;0;0;1;0", - "aff_unique_norm": "Amazon;Rutgers University", - "aff_unique_dep": "Amazon Web Services;", + "aff_unique_norm": "Amazon Web Services;Rutgers University", + "aff_unique_dep": ";", "aff_unique_url": "https://aws.amazon.com;https://www.rutgers.edu", "aff_unique_abbr": "AWS;Rutgers", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Yanyi and Li,\n Xinyu and Liu,\n Chunhui and Shuai,\n Bing and Zhu,\n Yi and Brattoli,\n Biagio and Chen,\n Hao and Marsic,\n Ivan and Tighe,\n Joseph\n},\n title = {\n VidTr: Video Transformer Without Convolutions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13577-13587\n} \n}" }, { "title": "Video Annotation for Visual Tracking via Selection and Refinement", @@ -45287,6 +48346,7 @@ "status": "Poster", "track": "main", "pid": 3624, + "author_site": "Kenan 
Dai; Jie Zhao; Lijun Wang; Dong Wang; Jianhua Li; Huchuan Lu; Xuesheng Qian; Xiaoyun Yang", "author": "Kenan Dai; Jie Zhao; Lijun Wang; Dong Wang; Jianhua Li; Huchuan Lu; Xuesheng Qian; Xiaoyun Yang", "abstract": "Deep learning based visual trackers entail offline pre-training on large volumes of video datasets with accurate bounding box annotations that are labor-expensive to achieve. We present a new framework to facilitate bounding box annotations for video sequences, which investigates a selection-and-refinement strategy to automatically improve the preliminary annotations generated by tracking algorithms. A temporal assessment network (T-Assess Net) is proposed which is able to capture the temporal coherence of target locations and select reliable tracking results by measuring their quality. Meanwhile, a visual-geometry refinement network (VG-Refine Net) is also designed to further enhance the selected tracking results by considering both target appearance and temporal geometry constraints, allowing inaccurate tracking results to be corrected. The combination of the above two networks provides a principled approach to ensure the quality of automatic video annotation. 
Experiments on large scale tracking benchmarks demonstrate that our method can deliver highly accurate bounding box annotations and significantly reduce human labor by 94.0%, yielding an effective means to further boost tracking performance with augmented training data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dai_Video_Annotation_for_Visual_Tracking_via_Selection_and_Refinement_ICCV_2021_paper.pdf", @@ -45303,14 +48363,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Video_Annotation_for_Visual_Tracking_via_Selection_and_Refinement_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0+1;2;3", - "aff_unique_norm": "Dalian University of Technology;Pengcheng Laboratory;CSA Intellicloud;Remark Holdings", - "aff_unique_dep": ";Peng Cheng Lab;;", + "aff_unique_norm": "Dalian University of Technology;Peng Cheng Lab;CSA Intellicloud;Remark Holdings", + "aff_unique_dep": ";;;", "aff_unique_url": "http://www.dlut.edu.cn/;;;", "aff_unique_abbr": "DUT;;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Dai_2021_ICCV,\n \n author = {\n Dai,\n Kenan and Zhao,\n Jie and Wang,\n Lijun and Wang,\n Dong and Li,\n Jianhua and Lu,\n Huchuan and Qian,\n Xuesheng and Yang,\n Xiaoyun\n},\n title = {\n Video Annotation for Visual Tracking via Selection and Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10296-10305\n} \n}" }, { "title": "Video Autoencoder: Self-Supervised Disentanglement of Static 3D Structure and Motion", @@ -45318,6 +48379,7 @@ "status": "Poster", "track": "main", "pid": 1710, + "author_site": "Zihang Lai; Sifei Liu; Alexei A. Efros; Xiaolong Wang", "author": "Zihang Lai; Sifei Liu; Alexei A. 
Efros; Xiaolong Wang", "abstract": "We present Video Autoencoder for learning disentangled representations of 3D structure and camera pose from videos in a self-supervised manner. Relying on temporal continuity in videos, our work assumes that the 3D scene structure in nearby video frames remains static. Given a sequence of video frames as input, the Video Autoencoder extracts a disentangled representation of the scene including: (i) a temporally-consistent deep voxel feature to represent the 3D structure and (ii) a 3D trajectory of camera poses for each frame. These two representations will then be re-entangled for rendering the input video frames. Video Autoencoder can be trained directly using a pixel reconstruction loss, without any ground truth 3D or camera pose annotations. The disentangled representation can be applied to a range of tasks, including novel view synthesis, camera pose estimation, and video generation by motion following. We evaluate our method on several large-scale natural video datasets, and show generalization results on out-of-domain images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lai_Video_Autoencoder_Self-Supervised_Disentanglement_of_Static_3D_Structure_and_Motion_ICCV_2021_paper.pdf", @@ -45332,7 +48394,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lai_Video_Autoencoder_Self-Supervised_Disentanglement_of_Static_3D_Structure_and_Motion_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lai_Video_Autoencoder_Self-Supervised_Disentanglement_of_Static_3D_Structure_and_Motion_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Lai_2021_ICCV,\n \n author = {\n Lai,\n Zihang and Liu,\n Sifei and Efros,\n Alexei A. 
and Wang,\n Xiaolong\n},\n title = {\n Video Autoencoder: Self-Supervised Disentanglement of Static 3D Structure and Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9730-9740\n} \n}" }, { "title": "Video Geo-Localization Employing Geo-Temporal Feature Learning and GPS Trajectory Smoothing", @@ -45340,6 +48403,7 @@ "status": "Poster", "track": "main", "pid": 6672, + "author_site": "Krishna Regmi; Mubarak Shah", "author": "Krishna Regmi; Mubarak Shah", "abstract": "In this paper, we address the problem of video geo-localization by proposing a Geo-Temporal Feature Learning (GTFL) Network to simultaneously learn the discriminative features between the query videos and gallery images for estimating the geo-spatial trajectory of a query video. Based on a transformer encoder architecture, our GTFL model encodes query and gallery data separately, via two dedicated branches. The proposed GPS Loss and Clip Triplet Loss exploit the geographical and temporal proximity between the frames and the clips to jointly learn the query and gallery features. We also propose a deep learning approach to trajectory smoothing by predicting the outliers in the estimated GPS positions and learning the offsets to smooth the trajectory. We build a large dataset from four different regions of USA; New York, San Francisco, Berkeley and Bay Area using BDD driving videos as query, and by collecting corresponding Google StreetView (GSV) Images for gallery. Extensive evaluations of proposed method on this new dataset are provided. 
Code and dataset details will be made publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Regmi_Video_Geo-Localization_Employing_Geo-Temporal_Feature_Learning_and_GPS_Trajectory_Smoothing_ICCV_2021_paper.pdf", @@ -45363,7 +48427,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Orlando", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Regmi_2021_ICCV,\n \n author = {\n Regmi,\n Krishna and Shah,\n Mubarak\n},\n title = {\n Video Geo-Localization Employing Geo-Temporal Feature Learning and GPS Trajectory Smoothing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12126-12135\n} \n}" }, { "title": "Video Instance Segmentation With a Propose-Reduce Paradigm", @@ -45371,6 +48436,7 @@ "status": "Poster", "track": "main", "pid": 7359, + "author_site": "Huaijia Lin; Ruizheng Wu; Shu Liu; Jiangbo Lu; Jiaya Jia", "author": "Huaijia Lin; Ruizheng Wu; Shu Liu; Jiangbo Lu; Jiaya Jia", "abstract": "Video instance segmentation (VIS) aims to segment and associate all instances of predefined classes for each frame in videos. Prior methods usually obtain segmentation for a frame or clip first, and merge the incomplete results by tracking or matching. These methods may cause error accumulation in the merging step. Contrarily, we propose a new paradigm -- Propose-Reduce, to generate complete sequences for input videos by a single step. We further build a sequence propagation head on the existing image-level instance segmentation network for long-term propagation. To ensure robustness and high recall of our proposed framework, multiple sequences are proposed where redundant sequences of the same instance are reduced. 
We achieve state-of-the-art performance on two representative benchmark datasets -- we obtain 47.6% in terms of AP on YouTube-VIS validation set and 70.4% for J&F on DAVIS-UVOS validation set.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Video_Instance_Segmentation_With_a_Propose-Reduce_Paradigm_ICCV_2021_paper.pdf", @@ -45387,14 +48453,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lin_Video_Instance_Segmentation_With_a_Propose-Reduce_Paradigm_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Huaijia and Wu,\n Ruizheng and Liu,\n Shu and Lu,\n Jiangbo and Jia,\n Jiaya\n},\n title = {\n Video Instance Segmentation With a Propose-Reduce Paradigm\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1739-1748\n} \n}" }, { "title": "Video Matting via Consistency-Regularized Graph Neural Networks", @@ -45402,6 +48469,7 @@ "status": "Poster", "track": "main", "pid": 9687, + "author_site": "Tiantian Wang; Sifei Liu; Yapeng Tian; Kai Li; Ming-Hsuan Yang", "author": "Tiantian Wang; Sifei Liu; Yapeng Tian; Kai Li; Ming-Hsuan Yang", "abstract": "Learning temporally consistent foreground opacity from videos, i.e., video matting, has drawn great attention due to the blossoming of video conferencing. 
Previous approaches are built on top of image matting models, which fail in maintaining the temporal coherence when being adapted to videos. They either utilize the optical flow to smooth frame-wise prediction, where the performance is dependent on the selected optical flow model; or naively combine feature maps from multiple frames, which does not model well the correspondence of pixels in adjacent frames. In this paper, we propose to enhance the temporal coherence by Consistency-Regularized Graph Neural Networks (CRGNN) with the aid of a synthesized video matting dataset. CRGNN utilizes Graph Neural Networks (GNN) to relate adjacent frames such that pixels or regions that are incorrectly predicted in one frame can be corrected by leveraging information from its neighboring frames. To generalize our model from synthesized videos to real-world videos, we propose a consistency regularization technique to enforce the consistency on the alpha and foreground when blending them with different backgrounds. To evaluate the efficacy of CRGNN, we further collect a real-world dataset with annotated alpha mattes. 
Compared with state-of-the-art methods that require hand-crafted trimaps or backgrounds for modeling training, CRGNN generates favorably results with the help of unlabeled real training dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Video_Matting_via_Consistency-Regularized_Graph_Neural_Networks_ICCV_2021_paper.pdf", @@ -45418,14 +48486,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Video_Matting_via_Consistency-Regularized_Graph_Neural_Networks_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4+5", - "aff_unique_norm": "University of California, Merced;NVIDIA;University of Rochester;Northeastern University;Google;Yonsei University", - "aff_unique_dep": ";NVIDIA Corporation;;;Google Research;", + "aff_unique_norm": "University of California, Merced;NVIDIA Corporation;University of Rochester;Northeastern University;Google;Yonsei University", + "aff_unique_dep": ";;;;Google Research;", "aff_unique_url": "https://www.ucmerced.edu;https://www.nvidia.com;https://www.rochester.edu;https://www.northeastern.edu;https://research.google;https://www.yonsei.ac.kr", "aff_unique_abbr": "UCM;NVIDIA;U of R;NEU;Google Research;Yonsei", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Merced;;Mountain View", "aff_country_unique_index": "0;0;0;0;0+1", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Tiantian and Liu,\n Sifei and Tian,\n Yapeng and Li,\n Kai and Yang,\n Ming-Hsuan\n},\n title = {\n Video Matting via Consistency-Regularized Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4902-4911\n} \n}" }, { "title": "Video Object Segmentation With Dynamic Memory Networks and Adaptive Object Alignment", @@ -45433,6 +48502,7 @@ "status": 
"Poster", "track": "main", "pid": 2999, + "author_site": "Shuxian Liang; Xu Shen; Jianqiang Huang; Xian-Sheng Hua", "author": "Shuxian Liang; Xu Shen; Jianqiang Huang; Xian-Sheng Hua", "abstract": "In this paper, we propose a novel solution for object-matching based semi-supervised video object segmentation, where the target object masks in the first frame are provided. Existing object-matching based methods focus on the matching between the raw object features of the current frame and the first/previous frames. However, two issues are still not solved by these object-matching based methods. As the appearance of the video object changes drastically over time, 1) unseen parts/details of the object present in the current frame, resulting in incomplete annotation in the first annotated frame (e.g., view/scale changes). 2) even for the seen parts/details of the object in the current frame, their positions change relatively (e.g., pose changes/camera motion), leading to a misalignment for the object matching. To obtain the complete information of the target object, we propose a novel object-based dynamic memory network that exploits visual contents of all the past frames. To solve the misalignment problem caused by position changes of visual contents, we propose an adaptive object alignment module by incorporating a region translation function that aligns object proposals towards templates in the feature space. Our method achieves state-of-the-art results on latest benchmark datasets DAVIS 2017 (J of 81.4% and F of 87.5% on the validation set) and YouTube-VOS (the overall score of 82.7% on the validation set) with a very efficient inference time (0.16 second/frame on DAVIS 2017 validation set). 
Code is available at: https://github.com/liang4sx/DMN-AOA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liang_Video_Object_Segmentation_With_Dynamic_Memory_Networks_and_Adaptive_Object_ICCV_2021_paper.pdf", @@ -45456,7 +48526,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liang_2021_ICCV,\n \n author = {\n Liang,\n Shuxian and Shen,\n Xu and Huang,\n Jianqiang and Hua,\n Xian-Sheng\n},\n title = {\n Video Object Segmentation With Dynamic Memory Networks and Adaptive Object Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8065-8074\n} \n}" }, { "title": "Video Pose Distillation for Few-Shot, Fine-Grained Sports Action Recognition", @@ -45464,7 +48535,8 @@ "status": "Poster", "track": "main", "pid": 8427, - "author": "James Hong; Matthew Fisher; Micha\u00ebl Gharbi; Kayvon Fatahalian", + "author_site": "James Hong; Matthew Fisher; Michaël Gharbi; Kayvon Fatahalian", + "author": "James Hong; Matthew Fisher; Michaël Gharbi; Kayvon Fatahalian", "abstract": "Human pose is a useful feature for fine-grained sports action understanding. However, pose estimators are often unreliable when run on sports video due to domain shift and factors such as motion blur and occlusions. This leads to poor accuracy when downstream tasks, such as action recognition, depend on pose. End-to-end learning circumvents pose, but requires more labels to generalize. We introduce Video Pose Distillation (VPD), a weakly-supervised technique to learn features for new video domains, such as individual sports that challenge pose estimation. 
Under VPD, a student network learns to extract robust pose features from RGB frames in the sports video, such that, whenever pose is considered reliable, the features match the output of a pretrained teacher pose detector. Our strategy retains the best of both pose and end-to-end worlds, exploiting the rich visual patterns in raw video frames, while learning features that agree with the athletes' pose and motion in the target video domain to avoid over-fitting to patterns unrelated to athletes' motion. VPD features improve performance on few-shot, fine-grained action recognition, retrieval, and detection tasks in four real-world sports video datasets, without requiring additional ground-truth pose annotations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hong_Video_Pose_Distillation_for_Few-Shot_Fine-Grained_Sports_Action_Recognition_ICCV_2021_paper.pdf", "aff": ";;;", @@ -45478,7 +48550,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hong_Video_Pose_Distillation_for_Few-Shot_Fine-Grained_Sports_Action_Recognition_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hong_Video_Pose_Distillation_for_Few-Shot_Fine-Grained_Sports_Action_Recognition_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Hong_2021_ICCV,\n \n author = {\n Hong,\n James and Fisher,\n Matthew and Gharbi,\n Micha\\"el and Fatahalian,\n Kayvon\n},\n title = {\n Video Pose Distillation for Few-Shot,\n Fine-Grained Sports Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9254-9263\n} \n}" }, { "title": "Video Question Answering Using Language-Guided Deep Compressed-Domain Video Feature", @@ -45486,6 +48559,7 @@ "status": "Poster", "track": "main", "pid": 6557, + "author_site": "Nayoung Kim; Seong Jong Ha; Je-Won Kang", "author": "Nayoung Kim; 
Seong Jong Ha; Je-Won Kang", "abstract": "Video Question Answering (Video QA) aims to give an answer to the question through semantic reasoning between visual and linguistic information. Recently, handling large amounts of multi-modal video and language information of a video is considered important in the industry. However, the current video QA models use deep features, suffered from significant computational complexity and insufficient representation capability both in training and testing. Existing features are extracted using pre-trained networks after all the frames are decoded, which is not always suitable for video QA tasks. In this paper, we develop a novel deep neural network to provide video QA features obtained from coded video bit-stream to reduce the complexity. The proposed network includes several dedicated deep modules to both the video QA and the video compression system, which is the first attempt at the video QA task. The proposed network is predominantly model-agnostic. It is integrated into the state-of-the-art networks for improved performance without any computationally expensive motion-related deep models. 
The experimental results demonstrate that the proposed network outperforms the previous studies at lower complexity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Video_Question_Answering_Using_Language-Guided_Deep_Compressed-Domain_Video_Feature_ICCV_2021_paper.pdf", @@ -45509,7 +48583,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Nayoung and Ha,\n Seong Jong and Kang,\n Je-Won\n},\n title = {\n Video Question Answering Using Language-Guided Deep Compressed-Domain Video Feature\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1708-1717\n} \n}" }, { "title": "Video Self-Stitching Graph Network for Temporal Action Localization", @@ -45517,6 +48592,7 @@ "status": "Poster", "track": "main", "pid": 1236, + "author_site": "Chen Zhao; Ali K. Thabet; Bernard Ghanem", "author": "Chen Zhao; Ali K. Thabet; Bernard Ghanem", "abstract": "Temporal action localization (TAL) in videos is a challenging task, especially due to the large variation in action temporal scales. Short actions usually occupy a major proportion in the datasets, but tend to have the lowest performance. In this paper, we confront the challenge of short actions and propose a multi-level cross-scale solution dubbed as video self-stitching graph network (VSGN). We have two key components in VSGN: video self-stitching (VSS) and cross-scale graph pyramid network (xGPN). In VSS, we focus on a short period of a video and magnify it along the temporal dimension to obtain a larger scale. We stitch the original clip and its magnified counterpart in one input sequence to take advantage of the complementary properties of both scales. 
The xGPN component further exploits the cross-scale correlations by a pyramid of cross-scale graph networks, each containing a hybrid module to aggregate features from across scales as well as within the same scale. Our VSGN not only enhances the feature representations, but also generates more positive anchors for short actions and more short training samples. Experiments demonstrate that VSGN obviously improves the localization performance of short actions as well as achieving the state-of-the-art overall performance on THUMOS-14 and ActivityNet-v1.3.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Video_Self-Stitching_Graph_Network_for_Temporal_Action_Localization_ICCV_2021_paper.pdf", @@ -45540,7 +48616,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Thuwal", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Chen and Thabet,\n Ali K. and Ghanem,\n Bernard\n},\n title = {\n Video Self-Stitching Graph Network for Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13658-13667\n} \n}" }, { "title": "Video-Based Person Re-Identification With Spatial and Temporal Memory Networks", @@ -45548,6 +48625,7 @@ "status": "Poster", "track": "main", "pid": 7307, + "author_site": "Chanho Eom; Geon Lee; Junghyup Lee; Bumsub Ham", "author": "Chanho Eom; Geon Lee; Junghyup Lee; Bumsub Ham", "abstract": "Video-based person re-identification (reID) aims to retrieve person videos with the same identity as a query person across multiple cameras. Spatial and temporal distractors in person videos, such as background clutter and partial occlusions over frames, respectively, make this task much more challenging than image-based person reID. 
We observe that spatial distractors appear consistently in a particular location, and temporal distractors show several patterns, e.g., partial occlusions occur in the first few frames, where such patterns provide informative cues for predicting which frames to focus on (i.e., temporal attentions). Based on this, we introduce a novel Spatial and Temporal Memory Networks (STMN). The spatial memory stores features for spatial distractors that frequently emerge across video frames, while the temporal memory saves attentions which are optimized for typical temporal patterns in person videos. We leverage the spatial and temporal memories to refine frame-level person representations and to aggregate the refined frame-level features into a sequence-level person representation, respectively, effectively handling spatial and temporal distractors in person videos. We also introduce a memory spread loss preventing our model from addressing particular items only in the memories. Experimental results on standard benchmarks, including MARS, DukeMTMC-VideoReID, and LS-VID, demonstrate the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Eom_Video-Based_Person_Re-Identification_With_Spatial_and_Temporal_Memory_Networks_ICCV_2021_paper.pdf", @@ -45562,7 +48640,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Eom_Video-Based_Person_Re-Identification_With_Spatial_and_Temporal_Memory_Networks_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Eom_Video-Based_Person_Re-Identification_With_Spatial_and_Temporal_Memory_Networks_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Eom_2021_ICCV,\n \n author = {\n Eom,\n Chanho and Lee,\n Geon and Lee,\n Junghyup and Ham,\n Bumsub\n},\n title = {\n Video-Based Person Re-Identification With Spatial and Temporal Memory Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12036-12045\n} \n}" }, { "title": "VideoLT: Large-Scale Long-Tailed Video Recognition", @@ -45570,10 +48649,11 @@ "status": "Poster", "track": "main", "pid": 2509, + "author_site": "Xing Zhang; Zuxuan Wu; Zejia Weng; Huazhu Fu; Jingjing Chen; Yu-Gang Jiang; Larry S. Davis", "author": "Xing Zhang; Zuxuan Wu; Zejia Weng; Huazhu Fu; Jingjing Chen; Yu-Gang Jiang; Larry S. Davis", "abstract": "Label distributions in real-world are oftentimes long-tailed and imbalanced, resulting in biased models towards dominant labels. While long-tailed recognition has been extensively studied for image classification tasks, limited effort has been made for video domain. In this paper, we introduce VideoLT, a large-scale long-tailed video recognition dataset, as a step toward real-world video recognition. VideoLT contains 256,218 untrimmed videos, annotated into 1,004 classes with a long-tailed distribution. Through extensive studies, we demonstrate that state-of-the-art methods used for long-tailed image recognition do not perform well in the video domain due to the additional temporal dimension in video data. This motivates us to propose FrameStack, a simple yet effective method for long-tailed video recognition task. In particular, FrameStack performs sampling at the frame-level in order to balance class distributions, and the sampling ratio is dynamically determined using knowledge derived from the network during training. Experimental results demonstrate that FrameStack can improve classification performance without sacrificing overall accuracy. Code and dataset are available at: https://github.com/17Skye17/VideoLT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_VideoLT_Large-Scale_Long-Tailed_Video_Recognition_ICCV_2021_paper.pdf", - "aff": "Academy for Engineering and Technology, Fudan University; Shanghai Key Lab of Intel. Info. 
Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; Shanghai Key Lab of Intel. Info. Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; Inception Institute of Arti\ufb01cial Intelligence; Shanghai Key Lab of Intel. Info. Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; Shanghai Key Lab of Intel. Info. Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; University of Maryland", + "aff": "Academy for Engineering and Technology, Fudan University; Shanghai Key Lab of Intel. Info. Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; Shanghai Key Lab of Intel. Info. Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; Inception Institute of Artificial Intelligence; Shanghai Key Lab of Intel. Info. Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; Shanghai Key Lab of Intel. Info. 
Processing, School of Computer Science, Fudan University+Shanghai Collaborative Innovation Center on Intelligent Visual Computing; University of Maryland", "project": "", "github": "https://github.com/17Skye17/VideoLT", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhang_VideoLT_Large-Scale_Long-Tailed_ICCV_2021_supplemental.pdf", @@ -45593,7 +48673,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0+0;0+0;0;0+0;0+0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Xing and Wu,\n Zuxuan and Weng,\n Zejia and Fu,\n Huazhu and Chen,\n Jingjing and Jiang,\n Yu-Gang and Davis,\n Larry S.\n},\n title = {\n VideoLT: Large-Scale Long-Tailed Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7960-7969\n} \n}" }, { "title": "ViewNet: Unsupervised Viewpoint Estimation From Conditional Generation", @@ -45601,6 +48682,7 @@ "status": "Poster", "track": "main", "pid": 5693, + "author_site": "Octave Mariotti; Oisin Mac Aodha; Hakan Bilen", "author": "Octave Mariotti; Oisin Mac Aodha; Hakan Bilen", "abstract": "Understanding the 3D world without supervision is currently a major challenge in computer vision as the annotations required to supervise deep networks for tasks in this domain are expensive to obtain on a large scale. In this paper, we address the problem of unsupervised viewpoint estimation. We formulate this as a self-supervised learning task, where image reconstruction provides the supervision needed to predict the camera viewpoint. 
Specifically, we make use of pairs of images of the same object at training time, from unknown viewpoints, to self-supervise training by combining the viewpoint information from one image with the appearance information from the other. We demonstrate that using a perspective spatial transformer allows efficient viewpoint learning, outperforming existing unsupervised approaches on synthetic data, and obtains competitive results on the challenging PASCAL3D+ dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mariotti_ViewNet_Unsupervised_Viewpoint_Estimation_From_Conditional_Generation_ICCV_2021_paper.pdf", @@ -45615,7 +48697,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mariotti_ViewNet_Unsupervised_Viewpoint_Estimation_From_Conditional_Generation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mariotti_ViewNet_Unsupervised_Viewpoint_Estimation_From_Conditional_Generation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Mariotti_2021_ICCV,\n \n author = {\n Mariotti,\n Octave and Mac Aodha,\n Oisin and Bilen,\n Hakan\n},\n title = {\n ViewNet: Unsupervised Viewpoint Estimation From Conditional Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10418-10428\n} \n}" }, { "title": "Viewing Graph Solvability via Cycle Consistency", @@ -45623,6 +48706,7 @@ "status": "Poster", "track": "main", "pid": 1139, + "author_site": "Federica Arrigoni; Andrea Fusiello; Elisa Ricci; Tomas Pajdla", "author": "Federica Arrigoni; Andrea Fusiello; Elisa Ricci; Tomas Pajdla", "abstract": "In structure-from-motion the viewing graph is a graph where vertices correspond to cameras and edges represent fundamental matrices. We provide a new formulation and an algorithm for establishing whether a viewing graph is solvable, i.e. 
it uniquely determines a set of projective cameras. Known theoretical conditions either do not fully characterize the solvability of all viewing graphs, or are exceedingly hard to compute for they involve solving a system of polynomial equations with a large number of unknowns. The main result of this paper is a method for reducing the number of unknowns by exploiting the cycle consistency. We advance the understanding of the solvability by (i) finishing the classification of all previously undecided minimal graphs up to 9 nodes, (ii) extending the practical solvability testing up to minimal graphs with up to 90 nodes, and (iii) definitely answering an open research question by showing that the finite solvability is not equivalent to the solvability. Finally, we present an experiment on real data showing that unsolvable graphs are appearing in practical situations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Arrigoni_Viewing_Graph_Solvability_via_Cycle_Consistency_ICCV_2021_paper.pdf", @@ -45646,7 +48730,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;0;0+0;1", - "aff_country_unique": "Italy;Czech Republic" + "aff_country_unique": "Italy;Czech Republic", + "bibtex": "@InProceedings{Arrigoni_2021_ICCV,\n \n author = {\n Arrigoni,\n Federica and Fusiello,\n Andrea and Ricci,\n Elisa and Pajdla,\n Tomas\n},\n title = {\n Viewing Graph Solvability via Cycle Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5540-5549\n} \n}" }, { "title": "Viewpoint Invariant Dense Matching for Visual Geolocalization", @@ -45654,6 +48739,7 @@ "status": "Poster", "track": "main", "pid": 8952, + "author_site": "Gabriele Berton; Carlo Masone; Valerio Paolicelli; Barbara Caputo", "author": "Gabriele Berton; Carlo Masone; Valerio Paolicelli; Barbara Caputo", "abstract": "In this paper we 
propose a novel method for image matching based on dense local features and tailored for visual geolocalization. Dense local features matching is robust against changes in illumination and occlusions, but not against viewpoint shifts which are a fundamental aspect of geolocalization. Our method, called GeoWarp, directly embeds invariance to viewpoint shifts in the process of extracting dense features. This is achieved via a trainable module which learns from the data an invariance that is meaningful for the task of recognizing places. We also devise a new self-supervised loss and two new weakly supervised losses to train this module using only unlabeled data and weak labels. GeoWarp is implemented efficiently as a re-ranking method that can be easily embedded into pre-existing visual geolocalization pipelines. Experimental validation on standard geolocalization benchmarks demonstrates that GeoWarp boosts the accuracy of state-of-the-art retrieval architectures. The code and trained models will be released upon acceptance of this paper.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Berton_Viewpoint_Invariant_Dense_Matching_for_Visual_Geolocalization_ICCV_2021_paper.pdf", @@ -45677,7 +48763,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Berton_2021_ICCV,\n \n author = {\n Berton,\n Gabriele and Masone,\n Carlo and Paolicelli,\n Valerio and Caputo,\n Barbara\n},\n title = {\n Viewpoint Invariant Dense Matching for Visual Geolocalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12169-12178\n} \n}" }, { "title": "Viewpoint-Agnostic Change Captioning With Cycle Consistency", @@ -45685,6 +48772,7 @@ "status": "Poster", "track": "main", "pid": 8806, + "author_site": "Hoeseong Kim; 
Jongseok Kim; Hyungseok Lee; Hyunsung Park; Gunhee Kim", "author": "Hoeseong Kim; Jongseok Kim; Hyungseok Lee; Hyunsung Park; Gunhee Kim", "abstract": "Change captioning is the task of identifying the change and describing it with a concise caption. Despite recent advancements, filtering out insignificant changes still remains as a challenge. Namely, images from different camera perspectives can cause issues; a mere change in viewpoint should be disregarded while still capturing the actual changes. In order to tackle this problem, we present a new Viewpoint-Agnostic change captioning network with Cycle Consistency (VACC) that requires only one image each for the before and after scene, without depending on any other information. We achieve this by devising a new difference encoder module which can encode viewpoint information and model the difference more effectively. In addition, we propose a cycle consistency module that can potentially improve the performance of any change captioning networks in general by matching the composite feature of the generated caption and before image with the after image feature. We evaluate the performance of our proposed model across three datasets for change captioning, including a novel dataset we introduce here that contains images with changes under extreme viewpoint shifts. Through our experiments, we show the excellence of our method with respect to the CIDEr, BLEU-4, METEOR and SPICE scores. 
Moreover, we demonstrate that attaching our proposed cycle consistency module yields a performance boost for existing change captioning networks, even with varying image encoding mechanisms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kim_Viewpoint-Agnostic_Change_Captioning_With_Cycle_Consistency_ICCV_2021_paper.pdf", @@ -45708,7 +48796,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2021_ICCV,\n \n author = {\n Kim,\n Hoeseong and Kim,\n Jongseok and Lee,\n Hyungseok and Park,\n Hyunsung and Kim,\n Gunhee\n},\n title = {\n Viewpoint-Agnostic Change Captioning With Cycle Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2095-2104\n} \n}" }, { "title": "Virtual Light Transport Matrices for Non-Line-of-Sight Imaging", @@ -45716,7 +48805,8 @@ "status": "Poster", "track": "main", "pid": 10117, - "author": "Julio Marco; Adrian Jarabo; Ji Hyun Nam; Xiaochun Liu; Miguel \u00c1ngel Cosculluela; Andreas Velten; Diego Gutierrez", + "author_site": "Julio Marco; Adrian Jarabo; Ji Hyun Nam; Xiaochun Liu; Miguel Ángel Cosculluela; Andreas Velten; Diego Gutierrez", + "author": "Julio Marco; Adrian Jarabo; Ji Hyun Nam; Xiaochun Liu; Miguel Ángel Cosculluela; Andreas Velten; Diego Gutierrez", "abstract": "The light transport matrix (LTM) is an instrumental tool in line-of-sight (LOS) imaging, describing how light interacts with the scene and enabling applications such as relighting or separation of illumination components. We introduce a framework to estimate the LTM of non-line-of-sight (NLOS) scenarios, coupling recent virtual forward light propagation models for NLOS imaging with the LOS light transport equation. 
We design computational projector-camera setups, and use these virtual imaging systems to estimate the transport matrix of hidden scenes. We introduce the specific illumination functions to compute the different elements of the matrix, overcoming the challenging wide-aperture conditions of NLOS setups. Our NLOS light transport matrix allows us to (re)illuminate specific locations of a hidden scene, and separate direct, first-order indirect, and higher-order indirect illumination of complex cluttered hidden scenes, similar to existing LOS techniques.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Marco_Virtual_Light_Transport_Matrices_for_Non-Line-of-Sight_Imaging_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -45730,7 +48820,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Marco_Virtual_Light_Transport_Matrices_for_Non-Line-of-Sight_Imaging_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Marco_Virtual_Light_Transport_Matrices_for_Non-Line-of-Sight_Imaging_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Marco_2021_ICCV,\n \n author = {\n Marco,\n Julio and Jarabo,\n Adrian and Nam,\n Ji Hyun and Liu,\n Xiaochun and Cosculluela,\n Miguel \\'Angel and Velten,\n Andreas and Gutierrez,\n Diego\n},\n title = {\n Virtual Light Transport Matrices for Non-Line-of-Sight Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2440-2449\n} \n}" }, { "title": "Virtual Multi-Modality Self-Supervised Foreground Matting for Human-Object Interaction", @@ -45738,6 +48829,7 @@ "status": "Poster", "track": "main", "pid": 5927, + "author_site": "Bo Xu; Han Huang; Cheng Lu; Ziwen Li; Yandong Guo", "author": "Bo Xu; Han Huang; Cheng Lu; Ziwen Li; Yandong Guo", "abstract": "Most existing human matting algorithms tried to separate pure human-only 
foreground from the background. In this paper, we propose a Virtual Multi-modality Foreground Matting (VMFM) method to learn human-object interactive foreground (human and objects interacted with him or her) from a raw RGB image. The VMFM method requires no additional inputs, e.g. trimap or known background. We reformulate foreground matting as a self-supervised multi-modality problem: factor each input image into estimated depth map, segmentation mask, and interaction heatmap using three auto-encoders. In order to fully utilize the characteristics of each modality, we first train a dual encoder-to-decoder network to estimate the same alpha matte. Then we introduce a self-supervised method: Complementary Learning(CL) to predict deviation probability map and exchange reliable gradients across modalities without label. We conducted extensive experiments to analyze the effectiveness of each modality and the significance of different components in complementary learning. We demonstrate that our model outperforms the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Virtual_Multi-Modality_Self-Supervised_Foreground_Matting_for_Human-Object_Interaction_ICCV_2021_paper.pdf", @@ -45761,7 +48853,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0+2;0+2;0", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Bo and Huang,\n Han and Lu,\n Cheng and Li,\n Ziwen and Guo,\n Yandong\n},\n title = {\n Virtual Multi-Modality Self-Supervised Foreground Matting for Human-Object Interaction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 438-447\n} \n}" }, { "title": "Vis2Mesh: Efficient Mesh Reconstruction From Unstructured Point Clouds of Large Scenes With 
Learned Virtual View Visibility", @@ -45769,10 +48862,11 @@ "status": "Poster", "track": "main", "pid": 5970, + "author_site": "Shuang Song; Zhaopeng Cui; Rongjun Qin", "author": "Shuang Song; Zhaopeng Cui; Rongjun Qin", "abstract": "We present a novel framework for mesh reconstruction from unstructured point clouds by taking advantage of the learned visibility of the 3D points in the virtual views and traditional graph-cut based mesh generation. Specifically, we first propose a three-step network that explicitly employs depth completion for visibility prediction. Then the visibility information of multiple views is aggregated to generate a 3D mesh model by solving an optimization problem considering visibility in which a novel adaptive visibility weighting term in surface determination is also introduced to suppress line of sight with a large incident angle. Compared to other learning-based approaches, our pipeline only exercises the learning on a 2D binary classification task, i.e., points visible or not in a view, which is much more generalizable and practically more efficient and capable to deal with a large number of points. Experiments demonstrate that our method with favorable transferability and robustness, and achieve competing performances w.r.t. state-of-the-art learning-based approaches on small complex objects and outperforms on large indoor and outdoor scenes. 
Code is available at https://github.com/GDAOSU/vis2mesh.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_Vis2Mesh_Efficient_Mesh_Reconstruction_From_Unstructured_Point_Clouds_of_Large_ICCV_2021_paper.pdf", - "aff": "The Ohio State University; Zhejiang University+ETH Z\u00fcrich; The Ohio State University", + "aff": "The Ohio State University; Zhejiang University+ETH Zürich; The Ohio State University", "project": "", "github": "https://github.com/GDAOSU/vis2mesh", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Song_Vis2Mesh_Efficient_Mesh_ICCV_2021_supplemental.pdf", @@ -45785,14 +48879,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Song_Vis2Mesh_Efficient_Mesh_Reconstruction_From_Unstructured_Point_Clouds_of_Large_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;0", - "aff_unique_norm": "Ohio State University;Zhejiang University;ETH Zurich", + "aff_unique_norm": "The Ohio State University;Zhejiang University;ETH Zürich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.osu.edu;https://www.zju.edu.cn;https://www.ethz.ch", "aff_unique_abbr": "OSU;ZJU;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2;0", - "aff_country_unique": "United States;China;Switzerland" + "aff_country_unique": "United States;China;Switzerland", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Shuang and Cui,\n Zhaopeng and Qin,\n Rongjun\n},\n title = {\n Vis2Mesh: Efficient Mesh Reconstruction From Unstructured Point Clouds of Large Scenes With Learned Virtual View Visibility\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6514-6524\n} \n}" }, { "title": "Visformer: The Vision-Friendly Transformer", @@ -45800,6 +48895,7 @@ "status": "Poster", "track": "main", "pid": 2849, + "author_site": "Zhengsu Chen; Lingxi Xie; Jianwei 
Niu; Xuefeng Liu; Longhui Wei; Qi Tian", "author": "Zhengsu Chen; Lingxi Xie; Jianwei Niu; Xuefeng Liu; Longhui Wei; Qi Tian", "abstract": "The past year has witnessed the rapid development of applying the Transformer module to vision problems. While some researchers have demonstrated that Transformer-based models enjoy a favorable ability of fitting data, there are still growing number of evidences showing that these models suffer over-fitting especially when the training data is limited. This paper offers an empirical study by performing step-by-step operations to gradually transit a Transformer-based model to a convolution-based model. The results we obtain during the transition process deliver useful messages for improving visual recognition. Based on these observations, we propose a new architecture named Visformer, which is abbreviated from the 'Vision-friendly Transformer'. With the same computational complexity, Visformer outperforms both the Transformer-based and convolution-based models in terms of ImageNet classification accuracy, and the advantage becomes more significant when the model complexity is lower or the training set is smaller. 
The code is available at https://github.com/danczs/Visformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Visformer_The_Vision-Friendly_Transformer_ICCV_2021_paper.pdf", @@ -45823,7 +48919,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hangzhou", "aff_country_unique_index": "0;1;0+0+0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Zhengsu and Xie,\n Lingxi and Niu,\n Jianwei and Liu,\n Xuefeng and Wei,\n Longhui and Tian,\n Qi\n},\n title = {\n Visformer: The Vision-Friendly Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 589-598\n} \n}" }, { "title": "Visio-Temporal Attention for Multi-Camera Multi-Target Association", @@ -45831,6 +48928,7 @@ "status": "Poster", "track": "main", "pid": 1984, + "author_site": "Yu-Jhe Li; Xinshuo Weng; Yan Xu; Kris M. Kitani", "author": "Yu-Jhe Li; Xinshuo Weng; Yan Xu; Kris M. Kitani", "abstract": "We address the task of Re-Identification (Re-ID) in multi-target multi-camera (MTMC) tracking where we track multiple pedestrians using multiple overlapping uncalibrated (unknown pose) cameras. Since the videos are temporally synchronized and spatially overlapping, we can see a person from multiple views and associate their trajectory across cameras. In order to find the correct association between pedestrians visible from multiple views during the same time window, we extract a visual feature from a tracklet (sequence of pedestrian images) that encodes its similarity and dissimilarity to all other candidate tracklets. We propose a inter-tracklet (person to person) attention mechanism that learns a representation for a target tracklet while taking into account other tracklets across multiple views. 
Furthermore, to encode the gait and motion of a person, we introduce second intra-tracklet (person-specific) attention module with position embeddings. This second module employs a transformer encoder to learn a feature from a sequence of features over one tracklet. Experimental results on WILDTRACK and our new dataset `ConstructSite' confirm the superiority of our model over state-of-the-art ReID methods (5% and 10% performance gain respectively) in the context of uncalibrated MTMC tracking. While our model is designed for overlapping cameras, we also obtain state-of-the-art results on two other benchmark datasets (MARS and DukeMTMC) with non-overlapping cameras.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Visio-Temporal_Attention_for_Multi-Camera_Multi-Target_Association_ICCV_2021_paper.pdf", @@ -45854,7 +48952,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Yu-Jhe and Weng,\n Xinshuo and Xu,\n Yan and Kitani,\n Kris M.\n},\n title = {\n Visio-Temporal Attention for Multi-Camera Multi-Target Association\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9834-9844\n} \n}" }, { "title": "Vision Transformer With Progressive Sampling", @@ -45862,6 +48961,7 @@ "status": "Poster", "track": "main", "pid": 7920, + "author_site": "Xiaoyu Yue; Shuyang Sun; Zhanghui Kuang; Meng Wei; Philip H.S. Torr; Wayne Zhang; Dahua Lin", "author": "Xiaoyu Yue; Shuyang Sun; Zhanghui Kuang; Meng Wei; Philip H.S. Torr; Wayne Zhang; Dahua Lin", "abstract": "Transformers with powerful global relation modeling abilities have been introduced to fundamental computer vision tasks recently. 
As a typical example, the Vision Transformer (ViT) directly applies a pure transformer architecture on image classification, by simply splitting images into tokens with a fixed length, and employing transformers to learn relations between these tokens. However, such naive tokenization could destruct object structures, assign grids to uninterested regions such as background, and introduce interference signals. To mitigate the above issues, in this paper, we propose an iterative and progressive sampling strategy to locate discriminative regions. At each iteration, embeddings of the current sampling step are fed into a transformer encoder layer, and a group of sampling offsets is predicted to update the sampling locations for the next step. The progressive sampling is differentiable. When combined with the Vision Transformer, the obtained PS-ViT network can adaptively learn where to look. The proposed PS-ViT is both effective and efficient. When trained from scratch on ImageNet, PS-ViT performs 3.8% higher than the vanilla ViT in terms of top-1 accuracy with about 4x fewer parameters and 10x fewer FLOPs. 
Code is available at https://github.com/yuexy/PS-ViT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yue_Vision_Transformer_With_Progressive_Sampling_ICCV_2021_paper.pdf", @@ -45878,14 +48978,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yue_Vision_Transformer_With_Progressive_Sampling_ICCV_2021_paper.html", "aff_unique_index": "0+1;2+3;4;5;2;4+3;0+1", - "aff_unique_norm": "Centre for Perceptual and Interactive Intelligence;Chinese University of Hong Kong;University of Oxford;Shanghai Jiao Tong University;SenseTime;Tsinghua University", + "aff_unique_norm": "Centre for Perceptual and Interactive Intelligence;The Chinese University of Hong Kong;University of Oxford;Shanghai Jiao Tong University;SenseTime;Tsinghua University", "aff_unique_dep": ";;;Qing Yuan Research Institute;SenseTime Research;", "aff_unique_url": ";https://www.cuhk.edu.hk;https://www.ox.ac.uk;https://www.sjtu.edu.cn;https://www.sensetime.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": ";CUHK;Oxford;SJTU;SenseTime;THU", "aff_campus_unique_index": "1;2;2;1", "aff_campus_unique": ";Hong Kong SAR;Shanghai", "aff_country_unique_index": "1;2+1;1;1;2;1+1;1", - "aff_country_unique": ";China;United Kingdom" + "aff_country_unique": ";China;United Kingdom", + "bibtex": "@InProceedings{Yue_2021_ICCV,\n \n author = {\n Yue,\n Xiaoyu and Sun,\n Shuyang and Kuang,\n Zhanghui and Wei,\n Meng and Torr,\n Philip H.S. 
and Zhang,\n Wayne and Lin,\n Dahua\n},\n title = {\n Vision Transformer With Progressive Sampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 387-396\n} \n}" }, { "title": "Vision Transformers for Dense Prediction", @@ -45893,7 +48994,8 @@ "status": "Poster", "track": "main", "pid": 1593, - "author": "Ren\u00e9 Ranftl; Alexey Bochkovskiy; Vladlen Koltun", + "author_site": "René Ranftl; Alexey Bochkovskiy; Vladlen Koltun", + "author": "René Ranftl; Alexey Bochkovskiy; Vladlen Koltun", "abstract": "We introduce dense prediction transformers, an architecture that leverages vision transformers in place of convolutional networks as a backbone for dense prediction tasks. We assemble tokens from various stages of the vision transformer into image-like representations at various resolutions and progressively combine them into full resolution predictions using a convolutional decoder. The transformer backbone processes representations at a constant and relatively high resolution and has a global receptive field at every stage. These properties allow the dense prediction transformer to provide finer-grained and more globally coherent predictions when compared to fully-convolutional networks. Our experiments show that this architecture yields substantial improvements on dense prediction tasks, especially when a large amount of training data is available. For monocular depth estimation, we observe an improvement of up to 28% in relative performance when compared to a state-of-the-art fully-convolutional network. When applied to semantic segmentation, dense prediction transformers set a new state of the art on ADE20K with 49.02% mIoU. We further show that the architecture can be fine-tuned on smaller datasets such as NYUv2, KITTI, and Pascal Context where it also sets the new state of the art. 
Our models are available at https://github.com/intel-isl/DPT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ranftl_Vision_Transformers_for_Dense_Prediction_ICCV_2021_paper.pdf", "aff": "Intel Labs; Intel Labs; ", @@ -45909,14 +49011,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Ranftl_Vision_Transformers_for_Dense_Prediction_ICCV_2021_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Intel", + "aff_unique_norm": "Intel Corporation", "aff_unique_dep": "Intel Labs", "aff_unique_url": "https://www.intel.com", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ranftl_2021_ICCV,\n \n author = {\n Ranftl,\n Ren\\'e and Bochkovskiy,\n Alexey and Koltun,\n Vladlen\n},\n title = {\n Vision Transformers for Dense Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12179-12188\n} \n}" }, { "title": "Vision-Language Navigation With Random Environmental Mixup", @@ -45924,6 +49027,7 @@ "status": "Poster", "track": "main", "pid": 5652, + "author_site": "Chong Liu; Fengda Zhu; Xiaojun Chang; Xiaodan Liang; Zongyuan Ge; Yi-Dong Shen", "author": "Chong Liu; Fengda Zhu; Xiaojun Chang; Xiaodan Liang; Zongyuan Ge; Yi-Dong Shen", "abstract": "Vision-language Navigation (VLN) task requires an agent to perceive both the visual scene and natural language and navigate step-by-step. Large data bias makes the VLN task challenging, which is caused by the disparity ratio between small data scale and large navigation space. Previous works have proposed many data augmentation methods to reduce data bias. However, these works do not explicitly reduce the data bias across different house scenes. 
Therefore, the agent would be overfitting to the seen scenes and perform navigation poorly in the unseen scenes. To tackle this problem, we propose the random environmental mixup (REM) method, which generates augmentation data in cross-connected house scenes. This method consists of three steps: 1) we select the key viewpoints according to the room connection graph for each scene in the training split; 2) we cross-connect the key views of different scenes to construct augmented scenes; 3) we generate augmentation data triplets (environment, path, instruction) in the cross-connected scenes. Our experiments prove that the augmentation data helps the agent reduce its performance gap between the seen and unseen environment and improve its performance, making our model be the best existing approach on the standard benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Vision-Language_Navigation_With_Random_Environmental_Mixup_ICCV_2021_paper.pdf", @@ -45947,7 +49051,8 @@ "aff_campus_unique_index": "1;2;2;3;2", "aff_campus_unique": ";Beijing;Melbourne;Guangzhou", "aff_country_unique_index": "0+0;1;1;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Chong and Zhu,\n Fengda and Chang,\n Xiaojun and Liang,\n Xiaodan and Ge,\n Zongyuan and Shen,\n Yi-Dong\n},\n title = {\n Vision-Language Navigation With Random Environmental Mixup\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1644-1654\n} \n}" }, { "title": "Vision-Language Transformer and Query Generation for Referring Segmentation", @@ -45955,6 +49060,7 @@ "status": "Poster", "track": "main", "pid": 1756, + "author_site": "Henghui Ding; Chang Liu; Suchen Wang; Xudong Jiang", "author": "Henghui Ding; Chang Liu; Suchen Wang; Xudong Jiang", "abstract": "In this work, we 
address the challenging task of referring segmentation. The query expression in referring segmentation typically indicates the target object by describing its relationship with others. Therefore, to find the target one among all instances in the image, the model must have a holistic understanding of the whole image. To achieve this, we reformulate referring segmentation as a direct attention problem: finding the region in the image where the query language expression is most attended to. We introduce transformer and multi-head attention to build a network with an encoder-decoder attention mechanism architecture that \"queries\" the given image with the language expression. Furthermore, we propose a Query Generation Module, which produces multiple sets of queries with different attention weights that represent the diversified comprehensions of the language expression from different aspects. At the same time, to find the best way from these diversified comprehensions based on visual clues, we further propose a Query Balance Module to adaptively select the output features of these queries for a better mask generation. Without bells and whistles, our approach is light-weight and achieves new state-of-the-art performance consistently on three referring segmentation datasets, RefCOCO, RefCOCO+, and G-Ref. 
Our code is available at https://github.com/henghuiding/Vision-Language-Transformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ding_Vision-Language_Transformer_and_Query_Generation_for_Referring_Segmentation_ICCV_2021_paper.pdf", @@ -45978,7 +49084,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Ding_2021_ICCV,\n \n author = {\n Ding,\n Henghui and Liu,\n Chang and Wang,\n Suchen and Jiang,\n Xudong\n},\n title = {\n Vision-Language Transformer and Query Generation for Referring Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16321-16330\n} \n}" }, { "title": "Visual Alignment Constraint for Continuous Sign Language Recognition", @@ -45986,6 +49093,7 @@ "status": "Poster", "track": "main", "pid": 3096, + "author_site": "Yuecong Min; Aiming Hao; Xiujuan Chai; Xilin Chen", "author": "Yuecong Min; Aiming Hao; Xiujuan Chai; Xilin Chen", "abstract": "Vision-based Continuous Sign Language Recognition (CSLR) aims to recognize unsegmented signs from image streams. Overfitting is one of the most critical problems in CSLR training, and previous works show that the iterative training scheme can partially solve this problem while also costing more training time. In this study, we revisit the iterative training scheme in recent CSLR works and realize that sufficient training of the feature extractor is critical to solving the overfitting problem. Therefore, we propose a Visual Alignment Constraint (VAC) to enhance the feature extractor with alignment supervision. Specifically, the proposed VAC comprises two auxiliary losses: one focuses on visual features only, and the other enforces prediction alignment between the feature extractor and the alignment module. 
Moreover, we propose two metrics to reflect overfitting by measuring the prediction inconsistency between the feature extractor and the alignment module. Experimental results on two challenging CSLR datasets show that the proposed VAC makes CSLR networks end-to-end trainable and achieves competitive performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Min_Visual_Alignment_Constraint_for_Continuous_Sign_Language_Recognition_ICCV_2021_paper.pdf", @@ -46009,7 +49117,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Min_2021_ICCV,\n \n author = {\n Min,\n Yuecong and Hao,\n Aiming and Chai,\n Xiujuan and Chen,\n Xilin\n},\n title = {\n Visual Alignment Constraint for Continuous Sign Language Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11542-11551\n} \n}" }, { "title": "Visual Distant Supervision for Scene Graph Generation", @@ -46017,6 +49126,7 @@ "status": "Poster", "track": "main", "pid": 6903, + "author_site": "Yuan Yao; Ao Zhang; Xu Han; Mengdi Li; Cornelius Weber; Zhiyuan Liu; Stefan Wermter; Maosong Sun", "author": "Yuan Yao; Ao Zhang; Xu Han; Mengdi Li; Cornelius Weber; Zhiyuan Liu; Stefan Wermter; Maosong Sun", "abstract": "Scene graph generation aims to identify objects and their relations in images, providing structured image representations that can facilitate numerous applications in computer vision. However, scene graph models usually require supervised learning on large quantities of labeled data with intensive human annotation. In this work, we propose visual distant supervision, a novel paradigm of visual relation learning, which can train scene graph models without any human-labeled data. 
The intuition is that by aligning commonsense knowledge bases and images, we can automatically create large-scale labeled data to provide distant supervision for visual relation learning. To alleviate the noise in distantly labeled data, we further propose a framework that iteratively estimates the probabilistic relation labels and eliminates the noisy ones. Comprehensive experimental results show that our distantly supervised model outperforms strong weakly supervised and semi-supervised baselines. By further incorporating human-labeled data in a semi-supervised fashion, our model outperforms state-of-the-art fully supervised models by a large margin (e.g., 8.3 micro- and 7.8 macro-recall@50 improvements for predicate classification in Visual Genome evaluation). We make the data and code for this paper publicly available at https://github.com/thunlp/VisualDS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yao_Visual_Distant_Supervision_for_Scene_Graph_Generation_ICCV_2021_paper.pdf", @@ -46031,7 +49141,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yao_Visual_Distant_Supervision_for_Scene_Graph_Generation_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Yao_Visual_Distant_Supervision_for_Scene_Graph_Generation_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Yao_2021_ICCV,\n \n author = {\n Yao,\n Yuan and Zhang,\n Ao and Han,\n Xu and Li,\n Mengdi and Weber,\n Cornelius and Liu,\n Zhiyuan and Wermter,\n Stefan and Sun,\n Maosong\n},\n title = {\n Visual Distant Supervision for Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15816-15826\n} \n}" }, { "title": "Visual Graph Memory With Unsupervised Representation for Visual Navigation", @@ -46039,6 +49150,7 @@ "status": "Poster", "track": 
"main", "pid": 3694, + "author_site": "Obin Kwon; Nuri Kim; Yunho Choi; Hwiyeon Yoo; Jeongho Park; Songhwai Oh", "author": "Obin Kwon; Nuri Kim; Yunho Choi; Hwiyeon Yoo; Jeongho Park; Songhwai Oh", "abstract": "We present a novel graph-structured memory for visual navigation, called visual graph memory (VGM), which consists of unsupervised image representations obtained from navigation history. The proposed VGM is constructed incrementally based on the similarities among the unsupervised representations of observed images, and these representations are learned from an unlabeled image dataset. We also propose a navigation framework that can utilize the proposed VGM to tackle visual navigation problems. By incorporating a graph convolutional network and the attention mechanism, the proposed agent refers to the VGM to navigate the environment while simultaneously building the VGM. Using the VGM, the agent can embed its navigation history and other useful task-related information. We validate our approach on the visual navigation tasks using the Habitat simulator with the Gibson dataset, which provides a photo-realistic simulation environment. 
The extensive experimental results show that the proposed navigation agent with VGM surpasses the state-of-the-art approaches on image-goal navigation tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Kwon_Visual_Graph_Memory_With_Unsupervised_Representation_for_Visual_Navigation_ICCV_2021_paper.pdf", @@ -46062,7 +49174,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kwon_2021_ICCV,\n \n author = {\n Kwon,\n Obin and Kim,\n Nuri and Choi,\n Yunho and Yoo,\n Hwiyeon and Park,\n Jeongho and Oh,\n Songhwai\n},\n title = {\n Visual Graph Memory With Unsupervised Representation for Visual Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15890-15899\n} \n}" }, { "title": "Visual Relationship Detection Using Part-and-Sum Transformers With Composite Queries", @@ -46070,6 +49183,7 @@ "status": "Poster", "track": "main", "pid": 9886, + "author_site": "Qi Dong; Zhuowen Tu; Haofu Liao; Yuting Zhang; Vijay Mahadevan; Stefano Soatto", "author": "Qi Dong; Zhuowen Tu; Haofu Liao; Yuting Zhang; Vijay Mahadevan; Stefano Soatto", "abstract": "Computer vision applications such as visual relationship detection and human object interaction can be formulated as a composite (structured) set detection problem in which both the parts (subject, object, and predicate) and the sum (triplet as a whole) are to be detected in a hierarchical fashion. In this paper, we present a new approach, denoted Part-and-Sum detection Transformer (PST), to perform end-to-end visual composite set detection. 
Different from existing Transformers in which queries are at a single level, we simultaneously model the joint part and sum hypotheses/interactions with composite queries and attention modules. We explicitly incorporate sum queries to enable better modeling of the part-and-sum relations that are absent in the standard Transformers. Our approach also uses novel tensor-based part queries and vector-based sum queries, and models their joint interaction. We report experiments on two vision tasks, visual relationship detection and human object interaction and demonstrate that PST achieves state of the art results among single-stage models, while nearly matching the results of custom designed two-stage models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dong_Visual_Relationship_Detection_Using_Part-and-Sum_Transformers_With_Composite_Queries_ICCV_2021_paper.pdf", @@ -46086,14 +49200,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Dong_Visual_Relationship_Detection_Using_Part-and-Sum_Transformers_With_Composite_Queries_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Amazon Web Services", + "aff_unique_norm": "Amazon Web Services", + "aff_unique_dep": "", "aff_unique_url": "https://aws.amazon.com", "aff_unique_abbr": "AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dong_2021_ICCV,\n \n author = {\n Dong,\n Qi and Tu,\n Zhuowen and Liao,\n Haofu and Zhang,\n Yuting and Mahadevan,\n Vijay and Soatto,\n Stefano\n},\n title = {\n Visual Relationship Detection Using Part-and-Sum Transformers With Composite Queries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3550-3559\n} \n}" }, { 
"title": "Visual Saliency Transformer", @@ -46101,6 +49216,7 @@ "status": "Poster", "track": "main", "pid": 8501, + "author_site": "Nian Liu; Ni Zhang; Kaiyuan Wan; Ling Shao; Junwei Han", "author": "Nian Liu; Ni Zhang; Kaiyuan Wan; Ling Shao; Junwei Han", "abstract": "Existing state-of-the-art saliency detection methods heavily rely on CNN-based architectures. Alternatively, we rethink this task from a convolution-free sequence-to-sequence perspective and predict saliency by modeling long-range dependencies, which can not be achieved by convolution. Specifically, we develop a novel unified model based on a pure transformer, namely, Visual Saliency Transformer (VST), for both RGB and RGB-D salient object detection (SOD). It takes image patches as inputs and leverages the transformer to propagate global contexts among image patches. Unlike conventional architectures used in Vision Transformer (ViT), we leverage multi-level token fusion and propose a new token upsampling method under the transformer framework to get high-resolution detection results. We also develop a token-based multi-task decoder to simultaneously perform saliency and boundary detection by introducing task-related tokens and a novel patch-task-attention mechanism. Experimental results show that our model outperforms existing methods on both RGB and RGB-D SOD benchmark datasets. Most importantly, our whole framework not only provides a new perspective for the SOD field but also shows a new paradigm for transformer-based dense prediction models. 
Code is available at https://github.com/nnizhang/VST.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_Visual_Saliency_Transformer_ICCV_2021_paper.pdf", @@ -46124,7 +49240,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "United Arab Emirates;China" + "aff_country_unique": "United Arab Emirates;China", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Nian and Zhang,\n Ni and Wan,\n Kaiyuan and Shao,\n Ling and Han,\n Junwei\n},\n title = {\n Visual Saliency Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4722-4732\n} \n}" }, { "title": "Visual Scene Graphs for Audio Source Separation", @@ -46132,6 +49249,7 @@ "status": "Poster", "track": "main", "pid": 10834, + "author_site": "Moitreya Chatterjee; Jonathan Le Roux; Narendra Ahuja; Anoop Cherian", "author": "Moitreya Chatterjee; Jonathan Le Roux; Narendra Ahuja; Anoop Cherian", "abstract": "State-of-the-art approaches for visually-guided audio source separation typically assume sources that have characteristic sounds, such as musical instruments. These approaches often ignore the visual context of these sound sources or avoid modeling object interactions that may be useful to better characterize the sources, especially when the same object class may produce varied sounds from distinct interactions. To address this challenging problem, we propose Audio Visual Scene Graph Segmenter (AVSGS), a novel deep learning model that embeds the visual structure of the scene as a graph and segments this graph into subgraphs, each subgraph being associated with a unique sound obtained by co-segmenting the audio spectrogram. At its core, AVSGS uses a recursive neural network that emits mutually-orthogonal sub-graph embeddings of the visual graph using multi-head attention. 
These embeddings are used for conditioning an audio encoder-decoder towards source separation. Our pipeline is trained end-to-end via a self-supervised task consisting of separating audio sources using the visual graph from artificially mixed sounds. In this paper, we also introduce an \"\"in the wild\" video dataset for sound source separation that contains multiple non-musical sources, which we call Audio Separation in the Wild (ASIW). This dataset is adapted from the AudioCaps dataset, and provides a challenging, natural, and daily-life setting for source separation. Thorough experiments on the proposed ASIW and the standard MUSIC datasets demonstrate state-of-the-art sound separation performance of our method against recent prior approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chatterjee_Visual_Scene_Graphs_for_Audio_Source_Separation_ICCV_2021_paper.pdf", @@ -46148,14 +49266,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chatterjee_Visual_Scene_Graphs_for_Audio_Source_Separation_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;1", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Mitsubishi Electric Research Laboratories", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Mitsubishi Electric Research Laboratories", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.merl.com", "aff_unique_abbr": "UIUC;MERL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chatterjee_2021_ICCV,\n \n author = {\n Chatterjee,\n Moitreya and Le Roux,\n Jonathan and Ahuja,\n Narendra and Cherian,\n Anoop\n},\n title = {\n Visual Scene Graphs for Audio Source Separation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2021\n},\n pages = {\n 1204-1213\n} \n}" }, { "title": "Visual Transformers: Where Do Transformers Really Belong in Vision Models?", @@ -46163,6 +49282,7 @@ "status": "Poster", "track": "main", "pid": 9872, + "author_site": "Bichen Wu; Chenfeng Xu; Xiaoliang Dai; Alvin Wan; Peizhao Zhang; Zhicheng Yan; Masayoshi Tomizuka; Joseph E. Gonzalez; Kurt Keutzer; Peter Vajda", "author": "Bichen Wu; Chenfeng Xu; Xiaoliang Dai; Alvin Wan; Peizhao Zhang; Zhicheng Yan; Masayoshi Tomizuka; Joseph E. Gonzalez; Kurt Keutzer; Peter Vajda", "abstract": "A recent trend in computer vision is to replace convolutions with transformers. However, the performance gain of transformers is attained at a steep cost, requiring GPU years and hundreds of millions of samples for training. This excessive resource usage compensates for a misuse of transformers: Transformers densely model relationships between its inputs -- ideal for late stages of a neural network, when concepts are sparse and spatially-distant, but extremely inefficient for early stages of a network, when patterns are redundant and localized. To address these issues, we leverage the respective strengths of both operations, building convolution-transformer hybrids. Critically, in sharp contrast to pixel-space transformers, our Visual Transformer (VT) operates in a semantic token space, judiciously attending to different image parts based on context. Our VTs significantly outperforms baselines: On ImageNet, our VT-ResNets outperform convolution-only ResNet by 4.6 to 7 points and transformer-only ViT-B by 2.6 points with 2.5 times fewer FLOPs, 2.1 times fewer parameters. 
For semantic segmentation on LIP and COCO-stuff, VT-based feature pyramid networks (FPN) achieve 0.35 points higher mIoU while reducing the FPN module's FLOPs by 6.5x.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wu_Visual_Transformers_Where_Do_Transformers_Really_Belong_in_Vision_Models_ICCV_2021_paper.pdf", @@ -46179,14 +49299,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wu_Visual_Transformers_Where_Do_Transformers_Really_Belong_in_Vision_Models_ICCV_2021_paper.html", "aff_unique_index": "0;1;0;1;0;0;1;1;1;0", - "aff_unique_norm": "Meta;University of California, Berkeley", - "aff_unique_dep": "Facebook;", + "aff_unique_norm": "Facebook;University of California, Berkeley", + "aff_unique_dep": ";", "aff_unique_url": "https://www.facebook.com;https://www.berkeley.edu", "aff_unique_abbr": "FB;UC Berkeley", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2021_ICCV,\n \n author = {\n Wu,\n Bichen and Xu,\n Chenfeng and Dai,\n Xiaoliang and Wan,\n Alvin and Zhang,\n Peizhao and Yan,\n Zhicheng and Tomizuka,\n Masayoshi and Gonzalez,\n Joseph E. 
and Keutzer,\n Kurt and Vajda,\n Peter\n},\n title = {\n Visual Transformers: Where Do Transformers Really Belong in Vision Models?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 599-609\n} \n}" }, { "title": "Visual-Textual Attentive Semantic Consistency for Medical Report Generation", @@ -46194,10 +49315,11 @@ "status": "Poster", "track": "main", "pid": 3310, + "author_site": "Yi Zhou; Lei Huang; Tao Zhou; Huazhu Fu; Ling Shao", "author": "Yi Zhou; Lei Huang; Tao Zhou; Huazhu Fu; Ling Shao", "abstract": "Diagnosing diseases from medical radiographs and writing reports requires professional knowledge and is time-consuming. To address this, automatic medical report generation approaches have recently gained interest. However, identifying diseases as well as correctly predicting their corresponding sizes, locations and other medical description patterns, which is essential for generating high-quality reports, is challenging. Although previous methods focused on producing readable reports, how to accurately detect and describe findings that match with the query X-Ray has not been successfully addressed. In this paper, we propose a multi-modality semantic attention model to integrate visual features, predicted key finding embeddings, as well as clinical features, and progressively decode reports with visual-textual semantic consistency. First, multi-modality features are extracted and attended with the hidden states from the sentence decoder, to encode enriched context vectors for better decoding a report. These modalities include regional visual features of scans, semantic word embeddings of the top-K findings predicted with high probabilities, and clinical features of indications. 
Second, the progressive report decoder consists of a sentence decoder and a word decoder, where we propose image-sentence matching and description accuracy losses to constrain the visual-textual semantic consistency. Extensive experiments on the public MIMIC-CXR and IU X-Ray datasets show that our model achieves consistent improvements over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhou_Visual-Textual_Attentive_Semantic_Consistency_for_Medical_Report_Generation_ICCV_2021_paper.pdf", - "aff": "School of Computer Science and Engineering, Southeast University, Nanjing, China; SKLSDE, Institute of Arti\ufb01cial Intelligence, Beihang University, Beijing, China; School of Computer Science and Technology, Nanjing University of Science and Technology, China; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE; Inception Institute of Arti\ufb01cial Intelligence, Abu Dhabi, UAE", + "aff": "School of Computer Science and Engineering, Southeast University, Nanjing, China; SKLSDE, Institute of Artificial Intelligence, Beihang University, Beijing, China; School of Computer Science and Technology, Nanjing University of Science and Technology, China; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE; Inception Institute of Artificial Intelligence, Abu Dhabi, UAE", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Zhou_Visual-Textual_Attentive_Semantic_ICCV_2021_supplemental.pdf", @@ -46211,13 +49333,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhou_Visual-Textual_Attentive_Semantic_Consistency_for_Medical_Report_Generation_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "Southeast University;Beihang University;Nanjing University of Science and Technology;Inception Institute of Artificial Intelligence", - "aff_unique_dep": "School of Computer Science and Engineering;Institute of Arti\ufb01cial 
Intelligence;School of Computer Science and Technology;", - "aff_unique_url": "https://www.seu.edu.cn/;http://www.buaa.edu.cn;http://www.nust.edu.cn;", - "aff_unique_abbr": "SEU;BUAA;NUST;", + "aff_unique_dep": "School of Computer Science and Engineering;Institute of Artificial Intelligence;School of Computer Science and Technology;", + "aff_unique_url": "https://www.seu.edu.cn/;http://www.buaa.edu.cn;;", + "aff_unique_abbr": "SEU;BUAA;;", "aff_campus_unique_index": "0;1;3;3", "aff_campus_unique": "Nanjing;Beijing;;Abu Dhabi", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Zhou_2021_ICCV,\n \n author = {\n Zhou,\n Yi and Huang,\n Lei and Zhou,\n Tao and Fu,\n Huazhu and Shao,\n Ling\n},\n title = {\n Visual-Textual Attentive Semantic Consistency for Medical Report Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3985-3994\n} \n}" }, { "title": "VolumeFusion: Deep Depth Fusion for 3D Scene Reconstruction", @@ -46225,6 +49348,7 @@ "status": "Poster", "track": "main", "pid": 9550, + "author_site": "Jaesung Choe; Sunghoon Im; Francois Rameau; Minjun Kang; In So Kweon", "author": "Jaesung Choe; Sunghoon Im; Francois Rameau; Minjun Kang; In So Kweon", "abstract": "To reconstruct a 3D scene from a set of calibrated views, traditional multi-view stereo techniques rely on two distinct stages: local depth maps computation and global depth maps fusion. Recent studies concentrate on deep neural architectures for depth estimation by using conventional depth fusion method or direct 3D reconstruction network by regressing Truncated Signed Distance Function (TSDF). 
In this paper, we advocate that replicating the traditional two stages framework with deep neural networks improves both the interpretability and the accuracy of the results. As mentioned, our network operates in two steps: 1) the local computation of the local depth maps with a deep MVS technique, and, 2) the depth maps and images' features fusion to build a single TSDF volume. In order to improve the matching performance between images acquired from very different viewpoints (e.g., large-baseline and rotations), we introduce a rotation-invariant 3D convolution kernel called PosedConv. The effectiveness of the proposed architecture is underlined via a large series of experiments conducted on the ScanNet dataset where our approach compares favorably against both traditional and deep learning techniques.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Choe_VolumeFusion_Deep_Depth_Fusion_for_3D_Scene_Reconstruction_ICCV_2021_paper.pdf", @@ -46248,7 +49372,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choe_2021_ICCV,\n \n author = {\n Choe,\n Jaesung and Im,\n Sunghoon and Rameau,\n Francois and Kang,\n Minjun and Kweon,\n In So\n},\n title = {\n VolumeFusion: Deep Depth Fusion for 3D Scene Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 16086-16095\n} \n}" }, { "title": "Voxel Transformer for 3D Object Detection", @@ -46256,10 +49381,11 @@ "status": "Poster", "track": "main", "pid": 3825, + "author_site": "Jiageng Mao; Yujing Xue; Minzhe Niu; Haoyue Bai; Jiashi Feng; Xiaodan Liang; Hang Xu; Chunjing Xu", "author": "Jiageng Mao; Yujing Xue; Minzhe Niu; Haoyue Bai; Jiashi Feng; Xiaodan Liang; Hang Xu; Chunjing Xu", "abstract": "We present Voxel Transformer (VoTr), 
a novel and effective voxel-based Transformer backbone for 3D object detection from point clouds. Conventional 3D convolutional backbones in voxel-based 3D detectors cannot efficiently capture large context information, which is crucial for object recognition and localization, owing to the limited receptive fields. In this paper, we resolve the problem by introducing a Transformer-based architecture that enables long-range relationships between voxels by self-attention. Given the fact that non-empty voxels are naturally sparse but numerous, directly applying standard Transformer on voxels is non-trivial. To this end, we propose the sparse voxel module and the submanifold voxel module, which can operate on the empty and non-empty voxel positions effectively. To further enlarge the attention range while maintaining comparable computational overhead to the convolutional counterparts, we propose two attention mechanisms for multi-head attention in those two modules: Local Attention and Dilated Attention, and we further propose Fast Voxel Query to accelerate the querying process in multi-head attention. VoTr contains a series of sparse and submanifold voxel modules and can be applied in most voxel-based detectors. 
Our proposed VoTr shows consistent improvement over the convolutional baselines while maintaining computational efficiency on the KITTI dataset and the Waymo Open dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mao_Voxel_Transformer_for_3D_Object_Detection_ICCV_2021_paper.pdf", - "aff": "The Chinese University of Hong Kong; National University of Singapore; Huawei Noah\u2019s Ark Lab; HKUST; Sun Yat-Sen University; National University of Singapore; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "The Chinese University of Hong Kong; National University of Singapore; Huawei Noah’s Ark Lab; HKUST; Sun Yat-Sen University; National University of Singapore; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -46272,14 +49398,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mao_Voxel_Transformer_for_3D_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;1;2;3;4;1;2;2", - "aff_unique_norm": "Chinese University of Hong Kong;National University of Singapore;Huawei;Hong Kong University of Science and Technology;Sun Yat-sen University", - "aff_unique_dep": ";;Noah\u2019s Ark Lab;;", + "aff_unique_norm": "The Chinese University of Hong Kong;National University of Singapore;Huawei;Hong Kong University of Science and Technology;Sun Yat-Sen University", + "aff_unique_dep": ";;Noah’s Ark Lab;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.nus.edu.sg;https://www.huawei.com;https://www.ust.hk;http://www.sysu.edu.cn/", "aff_unique_abbr": "CUHK;NUS;Huawei;HKUST;SYSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0;0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Mao_2021_ICCV,\n \n author = {\n Mao,\n Jiageng and Xue,\n Yujing and Niu,\n Minzhe and Bai,\n Haoyue and Feng,\n Jiashi and Liang,\n Xiaodan and Xu,\n Hang 
and Xu,\n Chunjing\n},\n title = {\n Voxel Transformer for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3164-3173\n} \n}" }, { "title": "Voxel-Based Network for Shape Completion by Leveraging Edge Generation", @@ -46287,6 +49414,7 @@ "status": "Poster", "track": "main", "pid": 8459, + "author_site": "Xiaogang Wang; Marcelo H Ang; Gim Hee Lee", "author": "Xiaogang Wang; Marcelo H Ang; Gim Hee Lee", "abstract": "Deep learning technique has yielded significant improvements in point cloud completion with the aim of completing missing object shapes from partial inputs. However, most existing methods fail to recover realistic structures due to over-smoothing of fine-grained details. In this paper, we develop a voxel-based network for point cloud completion by leveraging edge generation (VE-PCN). We first embed point clouds into regular voxel grids, and then generate complete objects with the help of the hallucinated shape edges. This decoupled architecture together with a multi-scale grid feature learning is able to generate more realistic on-surface details. We evaluate our model on the publicly available completion datasets and show that it outperforms existing state-of-the-art approaches quantitatively and qualitatively. 
Our source code is available at https://github.com/xiaogangw/VE-PCN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Voxel-Based_Network_for_Shape_Completion_by_Leveraging_Edge_Generation_ICCV_2021_paper.pdf", @@ -46310,7 +49438,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Xiaogang and Ang,\n Marcelo H and Lee,\n Gim Hee\n},\n title = {\n Voxel-Based Network for Shape Completion by Leveraging Edge Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 13189-13198\n} \n}" }, { "title": "WB-DETR: Transformer-Based Detector Without Backbone", @@ -46318,6 +49447,7 @@ "status": "Poster", "track": "main", "pid": 3873, + "author_site": "Fanfan Liu; Haoran Wei; Wenzhe Zhao; Guozhen Li; Jingquan Peng; Zihao Li", "author": "Fanfan Liu; Haoran Wei; Wenzhe Zhao; Guozhen Li; Jingquan Peng; Zihao Li", "abstract": "Transformer-based detector is a new paradigm in object detection, which aims to achieve pretty-well performance while eliminates the priori knowledge driven components, e.g., anchors, proposals and the NMS. DETR, the state-of-the-art model among them, is composed of three sub-modules, i.e., a CNN-based backbone and paired transformer encoder-decoder. The CNN is applied to extract local features and the transformer is used to capture global contexts. This pipeline, however, is not concise enough. In this paper, we propose WB-DETR (DETR-based detector Without Backbone) to prove that the reliance on CNN features extraction for a transformer-based detector is not necessary. Unlike the original DETR, WB-DETR is composed of only an encoder and a decoder without CNN backbone. 
For an input image, WB-DETR serializes it directly to encode the local features into each individual token. To make up the deficiency of transformer in modeling local information, we design an LIE-T2T (local information enhancement tokens to token) module to enhance the internal information of tokens after unfolding. Experimental results demonstrate that WB-DETR, the first pure-transformer detector without CNN to our knowledge, yields on par accuracy and faster inference speed with only half number of parameters compared with DETR baseline.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Liu_WB-DETR_Transformer-Based_Detector_Without_Backbone_ICCV_2021_paper.pdf", @@ -46341,7 +49471,8 @@ "aff_campus_unique_index": "1;1;1;2;1;1", "aff_campus_unique": ";Beijing;Dalian", "aff_country_unique_index": "0+1+0+0;0+1+0+0;0+1+0+0;0;0+1+0+0;0+1+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2021_ICCV,\n \n author = {\n Liu,\n Fanfan and Wei,\n Haoran and Zhao,\n Wenzhe and Li,\n Guozhen and Peng,\n Jingquan and Li,\n Zihao\n},\n title = {\n WB-DETR: Transformer-Based Detector Without Backbone\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2979-2987\n} \n}" }, { "title": "Walk in the Cloud: Learning Curves for Point Clouds Shape Analysis", @@ -46349,6 +49480,7 @@ "status": "Poster", "track": "main", "pid": 3184, + "author_site": "Tiange Xiang; Chaoyi Zhang; Yang Song; Jianhui Yu; Weidong Cai", "author": "Tiange Xiang; Chaoyi Zhang; Yang Song; Jianhui Yu; Weidong Cai", "abstract": "Discrete point cloud objects lack sufficient shape descriptors of 3D geometries. In this paper, we present a novel method for aggregating hypothetical curves in point clouds. 
Sequences of connected points (curves) are initially grouped by taking guided walks in the point clouds, and then subsequently aggregated back to augment their point-wise features. We provide an effective implementation of the proposed aggregation strategy including a novel curve grouping operator followed by a curve aggregation operator. Our method was benchmarked on several point cloud analysis tasks where we achieved the state-of-the-art classification accuracy of 94.2% on the ModelNet40 classification task, instance IoU of 86.8% on the ShapeNetPart segmentation task and cosine error of 0.11 on the ModelNet40 normal estimation task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xiang_Walk_in_the_Cloud_Learning_Curves_for_Point_Clouds_Shape_ICCV_2021_paper.pdf", @@ -46372,7 +49504,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Xiang_2021_ICCV,\n \n author = {\n Xiang,\n Tiange and Zhang,\n Chaoyi and Song,\n Yang and Yu,\n Jianhui and Cai,\n Weidong\n},\n title = {\n Walk in the Cloud: Learning Curves for Point Clouds Shape Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 915-924\n} \n}" }, { "title": "Wanderlust: Online Continual Object Detection in the Real World", @@ -46380,6 +49513,7 @@ "status": "Poster", "track": "main", "pid": 5409, + "author_site": "Jianren Wang; Xin Wang; Yue Shang-Guan; Abhinav Gupta", "author": "Jianren Wang; Xin Wang; Yue Shang-Guan; Abhinav Gupta", "abstract": "Online continual learning from data streams in dynamic environments is a critical direction in the computer vision field. However, realistic benchmarks and fundamental studies in this line are still missing. 
To bridge the gap, we present a new online continual object detection benchmark with an egocentric video dataset, Objects Around Krishna (OAK). OAK adopts the KrishnaCAM videos, an ego-centric video stream collected over nine months by a graduate student. OAK provides exhaustive bounding box annotations of 80 video snippets (~17.5 hours) for 105 object categories in outdoor scenes. The emergence of new object categories in our benchmark follows a pattern similar to what a single person might see in their day-to-day life. The dataset also captures the natural distribution shifts as the person travels to different places. These egocentric long running videos provide a realistic playground for continual learning algorithms, especially in online embodied settings. We also introduce new evaluation metrics to evaluate the model performance and catastrophic forgetting and provide baseline studies for online continual object detection. We believe this benchmark will pose new exciting challenges for learning from non-stationary data in continual learning. 
The OAK dataset and the associated benchmark are released at https://oakdata.github.io/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Wanderlust_Online_Continual_Object_Detection_in_the_Real_World_ICCV_2021_paper.pdf", @@ -46394,7 +49528,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Wanderlust_Online_Continual_Object_Detection_in_the_Real_World_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Wanderlust_Online_Continual_Object_Detection_in_the_Real_World_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Jianren and Wang,\n Xin and Shang-Guan,\n Yue and Gupta,\n Abhinav\n},\n title = {\n Wanderlust: Online Continual Object Detection in the Real World\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10829-10838\n} \n}" }, { "title": "Warp Consistency for Unsupervised Learning of Dense Correspondences", @@ -46402,6 +49537,7 @@ "status": "Poster", "track": "main", "pid": 2724, + "author_site": "Prune Truong; Martin Danelljan; Fisher Yu; Luc Van Gool", "author": "Prune Truong; Martin Danelljan; Fisher Yu; Luc Van Gool", "abstract": "The key challenge in learning dense correspondences lies in the lack of ground-truth matches for real image pairs. While photometric consistency losses provide unsupervised alternatives, they struggle with large appearance changes, which are ubiquitous in geometric and semantic matching tasks. Moreover, methods relying on synthetic training pairs often suffer from poor generalisation to real data. We propose Warp Consistency, an unsupervised learning objective for dense correspondence regression. Our objective is effective even in settings with large appearance and view-point changes. 
Given a pair of real images, we first construct an image triplet by applying a randomly sampled warp to one of the original images. We derive and analyze all flow-consistency constraints arising between the triplet. From our observations and empirical results, we design a general unsupervised objective employing two of the derived constraints. We validate our warp consistency loss by training three recent dense correspondence networks for the geometric and semantic matching tasks. Our approach sets a new state-of-the-art on several challenging benchmarks, including MegaDepth, RobotCar and TSS. Code and models are at github.com/PruneTruong/DenseMatching.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Truong_Warp_Consistency_for_Unsupervised_Learning_of_Dense_Correspondences_ICCV_2021_paper.pdf", @@ -46425,7 +49561,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Truong_2021_ICCV,\n \n author = {\n Truong,\n Prune and Danelljan,\n Martin and Yu,\n Fisher and Van Gool,\n Luc\n},\n title = {\n Warp Consistency for Unsupervised Learning of Dense Correspondences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10346-10356\n} \n}" }, { "title": "Warp-Refine Propagation: Semi-Supervised Auto-Labeling via Cycle-Consistency", @@ -46433,6 +49570,7 @@ "status": "Poster", "track": "main", "pid": 8564, + "author_site": "Aditya Ganeshan; Alexis Vallet; Yasunori Kudo; Shin-ichi Maeda; Tommi Kerola; Rares Ambrus; Dennis Park; Adrien Gaidon", "author": "Aditya Ganeshan; Alexis Vallet; Yasunori Kudo; Shin-ichi Maeda; Tommi Kerola; Rares Ambrus; Dennis Park; Adrien Gaidon", "abstract": "Deep learning models for semantic segmentation rely on expensive, large-scale, manually annotated datasets. 
Labelling is a tedious process that can take hours per image. Automatically annotating video sequences by propagating sparsely labeled frames through time is a more scalable alternative. In this work, we propose a novel label propagation method, termed Warp-Refine Propagation, that combines semantic cues with geometric cues to efficiently auto-label videos. Our method learns to refine geometrically-warped labels and infuse them with learned semantic priors in a semi-supervised setting by leveraging cycle consistency across time. We quantitatively show that our method improves label-propagation by a noteworthy margin of 13.1 mIoU on the ApolloScape dataset. Furthermore, by training with the auto-labelled frames, we achieve competitive results on three semantic-segmentation benchmarks, improving the state-of-the-art by a large margin of 1.8 and 3.61 mIoU on NYU-V2 and KITTI, while matching the current best results on Cityscapes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Ganeshan_Warp-Refine_Propagation_Semi-Supervised_Auto-Labeling_via_Cycle-Consistency_ICCV_2021_paper.pdf", @@ -46456,7 +49594,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;0;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Ganeshan_2021_ICCV,\n \n author = {\n Ganeshan,\n Aditya and Vallet,\n Alexis and Kudo,\n Yasunori and Maeda,\n Shin-ichi and Kerola,\n Tommi and Ambrus,\n Rares and Park,\n Dennis and Gaidon,\n Adrien\n},\n title = {\n Warp-Refine Propagation: Semi-Supervised Auto-Labeling via Cycle-Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15499-15509\n} \n}" }, { "title": "WarpedGANSpace: Finding Non-Linear RBF Paths in GAN Latent Space", @@ -46464,6 +49603,7 @@ "status": "Poster", "track": "main", "pid": 8992, 
+ "author_site": "Christos Tzelepis; Georgios Tzimiropoulos; Ioannis Patras", "author": "Christos Tzelepis; Georgios Tzimiropoulos; Ioannis Patras", "abstract": "This work addresses the problem of discovering, in an unsupervised manner, interpretable paths in the latent space of pretrained GANs, so as to provide an intuitive and easy way of controlling the underlying generative factors. In doing so, it addresses some of the limitations of the state-of-the-art works, namely, a) that they discover directions that are independent of the latent code, i.e., paths that are linear, and b) that their evaluation relies either on visual inspection or on laborious human labeling. More specifically, we propose to learn non-linear warpings on the latent space, each one parametrized by a set of RBF-based latent space warping functions, and where each warping gives rise to a family of non-linear paths via the gradient of the function. Building on the work of Voynov and Babenko, that discovers linear paths, we optimize the trainable parameters of the set of RBFs, so as that images that are generated by codes along different paths, are easily distinguishable by a discriminator network. This leads to easily distinguishable image transformations, such as pose and facial expressions in facial images. We show that linear paths can be derived as a special case of our method, and show experimentally that non-linear paths in the latent space lead to steeper, more disentangled and interpretable changes in the image space than in state-of-the art methods, both qualitatively and quantitatively. 
We make the code and the pretrained models publicly available at: https://github.com/chi0tzp/WarpedGANSpace.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tzelepis_WarpedGANSpace_Finding_Non-Linear_RBF_Paths_in_GAN_Latent_Space_ICCV_2021_paper.pdf", @@ -46487,7 +49627,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Tzelepis_2021_ICCV,\n \n author = {\n Tzelepis,\n Christos and Tzimiropoulos,\n Georgios and Patras,\n Ioannis\n},\n title = {\n WarpedGANSpace: Finding Non-Linear RBF Paths in GAN Latent Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6393-6402\n} \n}" }, { "title": "Wasserstein Coupled Graph Learning for Cross-Modal Retrieval", @@ -46495,6 +49636,7 @@ "status": "Poster", "track": "main", "pid": 8916, + "author_site": "Yun Wang; Tong Zhang; Xueya Zhang; Zhen Cui; Yuge Huang; Pengcheng Shen; Shaoxin Li; Jian Yang", "author": "Yun Wang; Tong Zhang; Xueya Zhang; Zhen Cui; Yuge Huang; Pengcheng Shen; Shaoxin Li; Jian Yang", "abstract": "Graphs play an important role in cross-modal image-text understanding as they characterize the intrinsic structure which is robust and crucial for the measurement of cross-modal similarity. In this work, we propose a Wasserstein Coupled Graph Learning (WCGL) method to deal with the cross-modal retrieval task. First, graphs are constructed according to two input cross-modal samples separately, and passed through the corresponding graph encoders to extract robust features. Then, a Wasserstein coupled dictionary, containing multiple pairs of counterpart graph keys with each key corresponding to one modality, is constructed for further feature learning. 
Based on this dictionary, the input graphs can be transformed into the dictionary space to facilitate the similarity measurement through a Wasserstein Graph Embedding (WGE) process. The WGE could capture the graph correlation between the input and each corresponding key through optimal transport, and hence well characterize the inter-graph structural relationship. To further achieve discriminant graph learning, we specifically define a Wasserstein discriminant loss on the coupled graph keys to make the intra-class (counterpart) keys more compact and inter-class (non-counterpart) keys more dispersed, which further promotes the final cross-modal retrieval task. Experimental results demonstrate the effectiveness and state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Wang_Wasserstein_Coupled_Graph_Learning_for_Cross-Modal_Retrieval_ICCV_2021_paper.pdf", @@ -46509,7 +49651,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Wasserstein_Coupled_Graph_Learning_for_Cross-Modal_Retrieval_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Wang_Wasserstein_Coupled_Graph_Learning_for_Cross-Modal_Retrieval_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Wang_2021_ICCV,\n \n author = {\n Wang,\n Yun and Zhang,\n Tong and Zhang,\n Xueya and Cui,\n Zhen and Huang,\n Yuge and Shen,\n Pengcheng and Li,\n Shaoxin and Yang,\n Jian\n},\n title = {\n Wasserstein Coupled Graph Learning for Cross-Modal Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1813-1822\n} \n}" }, { "title": "Watch Only Once: An End-to-End Video Action Detection Framework", @@ -46517,6 +49660,7 @@ "status": "Poster", "track": "main", "pid": 5723, + "author_site": "Shoufa Chen; Peize Sun; Enze Xie; Chongjian Ge; Jiannan Wu; Lan Ma; 
Jiajun Shen; Ping Luo", "author": "Shoufa Chen; Peize Sun; Enze Xie; Chongjian Ge; Jiannan Wu; Lan Ma; Jiajun Shen; Ping Luo", "abstract": "We propose an end-to-end pipeline, named Watch Once Only (WOO), for video action detection. Current methods either decouple video action detection task into separated stages of actor localization and action classification or train two separated models within one stage. In contrast, our approach solves the actor localization and action classification simultaneously in a unified network. The whole pipeline is significantly simplified by unifying the backbone network and eliminating many hand-crafted components. WOO takes a unified video backbone to simultaneously extract features for actor location and action classification. In addition, we introduce spatial-temporal action embeddings into our framework and design a spatial-temporal fusion module to obtain more discriminative features with richer information, which further boosts the action classification performance. Extensive experiments on AVA and JHMDB datasets show that WOO achieves state-of-the-art performance, while still reduces up to 16.7% GFLOPs compared with existing methods. We hope our work can inspire rethinking the convention of action detection and serve as a solid baseline for end-to-end action detection. 
Code is available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_Watch_Only_Once_An_End-to-End_Video_Action_Detection_Framework_ICCV_2021_paper.pdf", @@ -46531,7 +49675,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Watch_Only_Once_An_End-to-End_Video_Action_Detection_Framework_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Watch_Only_Once_An_End-to-End_Video_Action_Detection_Framework_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Shoufa and Sun,\n Peize and Xie,\n Enze and Ge,\n Chongjian and Wu,\n Jiannan and Ma,\n Lan and Shen,\n Jiajun and Luo,\n Ping\n},\n title = {\n Watch Only Once: An End-to-End Video Action Detection Framework\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8178-8187\n} \n}" }, { "title": "WaveFill: A Wavelet-Based Generation Network for Image Inpainting", @@ -46539,6 +49684,7 @@ "status": "Poster", "track": "main", "pid": 3194, + "author_site": "Yingchen Yu; Fangneng Zhan; Shijian Lu; Jianxiong Pan; Feiying Ma; Xuansong Xie; Chunyan Miao", "author": "Yingchen Yu; Fangneng Zhan; Shijian Lu; Jianxiong Pan; Feiying Ma; Xuansong Xie; Chunyan Miao", "abstract": "Image inpainting aims to complete the missing or corrupted regions of images with realistic contents. The prevalent approaches adopt a hybrid objective of reconstruction and perceptual quality by using generative adversarial networks. However, the reconstruction loss and adversarial loss focus on synthesizing contents of different frequencies and simply applying them together often leads to inter-frequency conflicts and compromised inpainting. 
This paper presents WaveFill, a wavelet-based inpainting network that decomposes images into multiple frequency bands and fills the missing regions in each frequency band separately and explicitly. WaveFill decomposes images by using discrete wavelet transform (DWT) that preserves spatial information naturally. It applies L1 reconstruction loss to the decomposed low-frequency bands and adversarial loss to high-frequency bands, hence effectively mitigate inter-frequency conflicts while completing images in spatial domain. To address the inpainting inconsistency in different frequency bands and fuse features with distinct statistics, we design a novel normalization scheme that aligns and fuses the multi-frequency features effectively. Extensive experiments over multiple datasets show that WaveFill achieves superior image inpainting qualitatively and quantitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_WaveFill_A_Wavelet-Based_Generation_Network_for_Image_Inpainting_ICCV_2021_paper.pdf", @@ -46562,7 +49708,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;1;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Yu_2021_ICCV,\n \n author = {\n Yu,\n Yingchen and Zhan,\n Fangneng and Lu,\n Shijian and Pan,\n Jianxiong and Ma,\n Feiying and Xie,\n Xuansong and Miao,\n Chunyan\n},\n title = {\n WaveFill: A Wavelet-Based Generation Network for Image Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14114-14123\n} \n}" }, { "title": "Waypoint Models for Instruction-Guided Navigation in Continuous Environments", @@ -46570,6 +49717,7 @@ "status": "Poster", "track": "main", "pid": 10739, + "author_site": "Jacob Krantz; Aaron Gokaslan; Dhruv Batra; Stefan Lee; Oleksandr Maksymets", "author": "Jacob Krantz; 
Aaron Gokaslan; Dhruv Batra; Stefan Lee; Oleksandr Maksymets", "abstract": "Little inquiry has explicitly addressed the role of action spaces in language-guided visual navigation -- either in terms of its effect on navigation success or the efficiency with which a robotic agent could execute the resulting trajectory. Building on the recently released VLN-CE setting for instruction following in continuous environments, we develop a class of language-conditioned waypoint prediction networks to examine this question. We vary the expressivity of these models to explore a spectrum between low-level actions and continuous waypoint prediction. We measure task performance and estimated execution time on a profiled LoCoBot robot. We find more expressive models result in simpler, faster to execute trajectories, but lower-level actions can achieve better navigation metrics by approximating shortest paths better. Further, our models outperform prior work in VLN-CE and set a new state-of-the-art on the public leaderboard -- increasing success rate by 4% with our best model on this challenging task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Krantz_Waypoint_Models_for_Instruction-Guided_Navigation_in_Continuous_Environments_ICCV_2021_paper.pdf", @@ -46586,14 +49734,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Krantz_Waypoint_Models_for_Instruction-Guided_Navigation_in_Continuous_Environments_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;1+3;0;1", - "aff_unique_norm": "Oregon State University;Meta;Cornell University;Georgia Institute of Technology", + "aff_unique_norm": "Oregon State University;Facebook;Cornell University;Georgia Institute of Technology", "aff_unique_dep": ";Facebook AI Research;;", "aff_unique_url": "https://oregonstate.edu;https://research.facebook.com;https://www.cornell.edu;https://www.gatech.edu", "aff_unique_abbr": "OSU;FAIR;Cornell;Georgia Tech", "aff_campus_unique_index": ";", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Krantz_2021_ICCV,\n \n author = {\n Krantz,\n Jacob and Gokaslan,\n Aaron and Batra,\n Dhruv and Lee,\n Stefan and Maksymets,\n Oleksandr\n},\n title = {\n Waypoint Models for Instruction-Guided Navigation in Continuous Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 15162-15171\n} \n}" }, { "title": "Weak Adaptation Learning: Addressing Cross-Domain Data Insufficiency With Weak Annotator", @@ -46601,6 +49750,7 @@ "status": "Poster", "track": "main", "pid": 2249, + "author_site": "Shichao Xu; Lixu Wang; Yixuan Wang; Qi Zhu", "author": "Shichao Xu; Lixu Wang; Yixuan Wang; Qi Zhu", "abstract": "Data quantity and quality are crucial factors for data-driven learning methods. In some target problem domains, there are not many data samples available, which could significantly hinder the learning process. While data from similar domains may be leveraged to help through domain adaptation, obtaining high-quality labeled data for those source domains themselves could be difficult or costly. To address such challenges on data insufficiency for classification problem in a target domain, we propose a weak adaptation learning (WAL) approach that leverages unlabeled data from a similar source domain, a low-cost weak annotator that produces labels based on task-specific heuristics, labeling rules, or other methods (albeit with inaccuracy), and a small amount of labeled data in the target domain. 
Our approach first conducts a theoretical analysis on the error bound of the trained classifier with respect to the data quantity and the performance of the weak annotator, and then introduces a multi-stage weak adaptation learning method to learn an accurate classifier by lowering the error bound. Our experiments demonstrate the effectiveness of our approach in learning an accurate classifier with limited labeled data in the target domain and unlabeled data in the source domain.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Weak_Adaptation_Learning_Addressing_Cross-Domain_Data_Insufficiency_With_Weak_Annotator_ICCV_2021_paper.pdf", @@ -46624,7 +49774,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Shichao and Wang,\n Lixu and Wang,\n Yixuan and Zhu,\n Qi\n},\n title = {\n Weak Adaptation Learning: Addressing Cross-Domain Data Insufficiency With Weak Annotator\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8917-8926\n} \n}" }, { "title": "Weakly Supervised 3D Semantic Segmentation Using Cross-Image Consensus and Inter-Voxel Affinity Relations", @@ -46632,6 +49783,7 @@ "status": "Poster", "track": "main", "pid": 5419, + "author_site": "Xiaoyu Zhu; Jeffrey Chen; Xiangrui Zeng; Junwei Liang; Chengqi Li; Sinuo Liu; Sima Behpour; Min Xu", "author": "Xiaoyu Zhu; Jeffrey Chen; Xiangrui Zeng; Junwei Liang; Chengqi Li; Sinuo Liu; Sima Behpour; Min Xu", "abstract": "We propose a novel weakly supervised approach for 3D semantic segmentation on volumetric images. 
Unlike most existing methods that require voxel-wise densely labeled training data, our weakly-supervised CIVA-Net is the first model that only needs image-level class labels as guidance to learn accurate volumetric segmentation. Our model learns from cross-image co-occurrence for integral region generation, and explores inter-voxel affinity relations to predict segmentation with accurate boundaries. We empirically validate our model on both simulated and real cryo-ET datasets. Our experiments show that CIVA-Net achieves comparable performance to the state-of-the-art models trained with stronger supervision.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhu_Weakly_Supervised_3D_Semantic_Segmentation_Using_Cross-Image_Consensus_and_Inter-Voxel_ICCV_2021_paper.pdf", @@ -46655,7 +49807,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2021_ICCV,\n \n author = {\n Zhu,\n Xiaoyu and Chen,\n Jeffrey and Zeng,\n Xiangrui and Liang,\n Junwei and Li,\n Chengqi and Liu,\n Sinuo and Behpour,\n Sima and Xu,\n Min\n},\n title = {\n Weakly Supervised 3D Semantic Segmentation Using Cross-Image Consensus and Inter-Voxel Affinity Relations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2834-2844\n} \n}" }, { "title": "Weakly Supervised Contrastive Learning", @@ -46663,6 +49816,7 @@ "status": "Poster", "track": "main", "pid": 2614, + "author_site": "Mingkai Zheng; Fei Wang; Shan You; Chen Qian; Changshui Zhang; Xiaogang Wang; Chang Xu", "author": "Mingkai Zheng; Fei Wang; Shan You; Chen Qian; Changshui Zhang; Xiaogang Wang; Chang Xu", "abstract": "Unsupervised visual representation learning has gained much attention from the computer vision community because of the 
recent achievement of contrastive learning. Most of the existing contrastive learning frameworks adopt the instance discrimination as the pretext task, which treating every single instance as a different class. However, such method will inevitably cause class collision problems, which hurts the quality of the learned representation. Motivated by this observation, we introduced a weakly supervised contrastive learning framework (WCL) to tackle this issue. Specifically, our proposed framework is based on two projection heads, one of which will perform the regular instance discrimination task. The other head will use a graph-based method to explore similar samples and generate a weak label, then perform a supervised contrastive learning task based on the weak label to pull the similar images closer. We further introduced a K-Nearest Neighbor based multi-crop strategy to expand the number of positive samples. Extensive experimental results demonstrate WCL improves the quality of self-supervised representations across different datasets. Notably, we get a new state-of-the-art result for semi-supervised learning. 
With only 1% and 10% labeled examples, WCL achieves 65% and 72% ImageNet Top-1 Accuracy using ResNet50, which is even higher than SimCLRv2 with ResNet101.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zheng_Weakly_Supervised_Contrastive_Learning_ICCV_2021_paper.pdf", @@ -46679,14 +49833,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zheng_Weakly_Supervised_Contrastive_Learning_ICCV_2021_paper.html", "aff_unique_index": "0+1;2;0+1+3;0;1;3;4", - "aff_unique_norm": "SenseTime;Tsinghua University;University of Science and Technology of China;Chinese University of Hong Kong;University of Sydney", + "aff_unique_norm": "SenseTime;Tsinghua University;University of Science and Technology of China;The Chinese University of Hong Kong;The University of Sydney", "aff_unique_dep": "SenseTime Research;Department of Automation;;;School of Computer Science", "aff_unique_url": "https://www.sensetime.com;https://www.tsinghua.edu.cn;http://www.ustc.edu.cn;https://www.cuhk.edu.hk;https://www.sydney.edu.au", "aff_unique_abbr": "SenseTime;THU;USTC;CUHK;USYD", "aff_campus_unique_index": "1;1+2;1;2", "aff_campus_unique": ";Beijing;Hong Kong SAR", "aff_country_unique_index": "0+0;0;0+0+0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zheng_2021_ICCV,\n \n author = {\n Zheng,\n Mingkai and Wang,\n Fei and You,\n Shan and Qian,\n Chen and Zhang,\n Changshui and Wang,\n Xiaogang and Xu,\n Chang\n},\n title = {\n Weakly Supervised Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10042-10051\n} \n}" }, { "title": "Weakly Supervised Human-Object Interaction Detection in Video via Contrastive Spatiotemporal Regions", @@ -46694,6 +49849,7 @@ "status": "Poster", "track": "main", "pid": 5475, + "author_site": "Shuang Li; Yilun Du; 
Antonio Torralba; Josef Sivic; Bryan Russell", "author": "Shuang Li; Yilun Du; Antonio Torralba; Josef Sivic; Bryan Russell", "abstract": "We introduce the task of weakly supervised learning for detecting human and object interactions in videos. Our task poses unique challenges as a system does not know what types of human-object interactions are present in a video or the actual spatiotemporal location of the human and object. To address these challenges, we introduce a contrastive weakly supervised training loss that aims to jointly associate spatiotemporal regions in a video with an action and object vocabulary and encourage temporal continuity of the visual appearance of moving objects as a form of self-supervision. To train our model, we introduce a dataset comprising over 6.5k videos with human-object interaction annotations that have been semi-automatically curated from sentence captions associated with the videos. We demonstrate improved performance over weakly supervised baselines adapted to our task on our video dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Li_Weakly_Supervised_Human-Object_Interaction_Detection_in_Video_via_Contrastive_Spatiotemporal_ICCV_2021_paper.pdf", @@ -46710,14 +49866,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Li_Weakly_Supervised_Human-Object_Interaction_Detection_in_Video_via_Contrastive_Spatiotemporal_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;2", - "aff_unique_norm": "Massachusetts Institute of Technology;Czech Institute of Informatics, Robotics, and Cybernetics;Adobe", - "aff_unique_dep": ";;Adobe Inc.", + "aff_unique_norm": "Massachusetts Institute of Technology;Czech Institute of Informatics, Robotics, and Cybernetics;Adobe Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.ciirc.cvut.cz/;https://www.adobe.com", "aff_unique_abbr": "MIT;CIIRC;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "United States;Czech Republic" + "aff_country_unique": "United States;Czech Republic", + "bibtex": "@InProceedings{Li_2021_ICCV,\n \n author = {\n Li,\n Shuang and Du,\n Yilun and Torralba,\n Antonio and Sivic,\n Josef and Russell,\n Bryan\n},\n title = {\n Weakly Supervised Human-Object Interaction Detection in Video via Contrastive Spatiotemporal Regions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1845-1855\n} \n}" }, { "title": "Weakly Supervised Person Search With Region Siamese Networks", @@ -46725,6 +49882,7 @@ "status": "Poster", "track": "main", "pid": 1072, + "author_site": "Chuchu Han; Kai Su; Dongdong Yu; Zehuan Yuan; Changxin Gao; Nong Sang; Yi Yang; Changhu Wang", "author": "Chuchu Han; Kai Su; Dongdong Yu; Zehuan Yuan; Changxin Gao; Nong Sang; Yi Yang; Changhu Wang", "abstract": "Supervised learning is dominant in person search, but it requires elaborate labeling of bounding boxes and identities. Large-scale labeled training data is often difficult to collect, especially for person identities. A natural question is whether a good person search model can be trained without the need of identity supervision. In this paper, we present a weakly supervised setting where only bounding box annotations are available. Based on this new setting, we provide an effective baseline model termed Region Siamese Networks (R-SiamNets). Towards learning useful representations for recognition in the absence of identity labels, we supervise the R-SiamNet with instance-level consistency loss and cluster-level contrastive loss. For instance-level consistency learning, the R-SiamNet is constrained to extract consistent features from each person region with or without out-of-region context. 
For cluster-level contrastive learning, we enforce the aggregation of closest instances and the separation of dissimilar ones in feature space. Extensive experiments validate the utility of our weakly supervised method. Our model achieves the rank-1 of 87.1% and mAP of 86.0% on CUHK-SYSU benchmark, which surpasses several fully supervised methods, such as OIM and MGTS, by a clear margin. More promising performance can be reached by incorporating extra training data. We hope this work could encourage the future research in this field.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Han_Weakly_Supervised_Person_Search_With_Region_Siamese_Networks_ICCV_2021_paper.pdf", @@ -46748,7 +49906,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Han_2021_ICCV,\n \n author = {\n Han,\n Chuchu and Su,\n Kai and Yu,\n Dongdong and Yuan,\n Zehuan and Gao,\n Changxin and Sang,\n Nong and Yang,\n Yi and Wang,\n Changhu\n},\n title = {\n Weakly Supervised Person Search With Region Siamese Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12006-12015\n} \n}" }, { "title": "Weakly Supervised Relative Spatial Reasoning for Visual Question Answering", @@ -46756,6 +49915,7 @@ "status": "Poster", "track": "main", "pid": 10857, + "author_site": "Pratyay Banerjee; Tejas Gokhale; Yezhou Yang; Chitta Baral", "author": "Pratyay Banerjee; Tejas Gokhale; Yezhou Yang; Chitta Baral", "abstract": "Vision-and-language (V&L) reasoning necessitates perception of visual concepts such as objects and actions, understanding semantics and language grounding, and reasoning about the interplay between the two modalities. 
One crucial aspect of visual reasoning is spatial understanding, which involves understanding relative locations of objects, i.e. implicitly learning the geometry of the scene. In this work, we evaluate the faithfulness of V&L models to such geometric understanding, by formulating the prediction of pair-wise relative locations of objects as a classification as well as a regression task. Our findings suggest that state-of-the-art transformer-based V&L models lack sufficient abilities to excel at this task. Motivated by this, we design two objectives as proxies for 3D spatial reasoning (SR) -- object centroid estimation, and relative position estimation, and train V&L with weak supervision from off-the-shelf depth estimators. This leads to considerable improvements in accuracy for the \"GQA\" visual question answering challenge (in fully supervised, few-shot, and O.O.D settings) as well as improvements in relative spatial reasoning. Code and data will be released here.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Banerjee_Weakly_Supervised_Relative_Spatial_Reasoning_for_Visual_Question_Answering_ICCV_2021_paper.pdf", @@ -46770,7 +49930,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Banerjee_Weakly_Supervised_Relative_Spatial_Reasoning_for_Visual_Question_Answering_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Banerjee_Weakly_Supervised_Relative_Spatial_Reasoning_for_Visual_Question_Answering_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Banerjee_2021_ICCV,\n \n author = {\n Banerjee,\n Pratyay and Gokhale,\n Tejas and Yang,\n Yezhou and Baral,\n Chitta\n},\n title = {\n Weakly Supervised Relative Spatial Reasoning for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1908-1918\n} \n}" }, { 
"title": "Weakly Supervised Representation Learning With Coarse Labels", @@ -46778,6 +49939,7 @@ "status": "Poster", "track": "main", "pid": 9709, + "author_site": "Yuanhong Xu; Qi Qian; Hao Li; Rong Jin; Juhua Hu", "author": "Yuanhong Xu; Qi Qian; Hao Li; Rong Jin; Juhua Hu", "abstract": "With the development of computational power and techniques for data collection, deep learning demonstrates a superior performance over most existing algorithms on visual benchmark data sets. Many efforts have been devoted to studying the mechanism of deep learning. One important observation is that deep learning can learn the discriminative patterns from raw materials directly in a task-dependent manner. Therefore, the representations obtained by deep learning outperform hand-crafted features significantly. However, for some real-world applications, it is too expensive to collect the task-specific labels, such as visual search in online shopping. Compared to the limited availability of these task-specific labels, their coarse-class labels are much more affordable, but representations learned from them can be suboptimal for the target task. To mitigate this challenge, we propose an algorithm to learn the fine-grained patterns for the target task, when only its coarse-class labels are available. More importantly, we provide a theoretical guarantee for this. 
Extensive experiments on real-world data sets demonstrate that the proposed method can significantly improve the performance of learned representations on the target task, when only coarse-class information is available for training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Xu_Weakly_Supervised_Representation_Learning_With_Coarse_Labels_ICCV_2021_paper.pdf", @@ -46801,7 +49963,8 @@ "aff_campus_unique_index": "0;1;0;1;2", "aff_campus_unique": "Hangzhou;Bellevue;Tacoma", "aff_country_unique_index": "0;1;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2021_ICCV,\n \n author = {\n Xu,\n Yuanhong and Qian,\n Qi and Li,\n Hao and Jin,\n Rong and Hu,\n Juhua\n},\n title = {\n Weakly Supervised Representation Learning With Coarse Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10593-10601\n} \n}" }, { "title": "Weakly Supervised Segmentation of Small Buildings With Point Labels", @@ -46809,6 +49972,7 @@ "status": "Poster", "track": "main", "pid": 5738, + "author_site": "Jae-Hun Lee; ChanYoung Kim; Sanghoon Sull", "author": "Jae-Hun Lee; ChanYoung Kim; Sanghoon Sull", "abstract": "Most supervised image segmentation methods require delicate and time-consuming pixel-level labeling of building or objects, especially for small objects. In this paper, we present a weakly supervised segmentation network for aerial/satellite images, separately considering small and large objects. First, we propose a simple point labeling method for small objects, while large objects are fully labeled. Then, we present a segmentation network trained with a small object mask to separate small and large objects in the loss function. During training, we employ a memory bank to cope with the limited number of point labels. 
Experiments results with three public datasets demonstrate the feasibility of our approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Weakly_Supervised_Segmentation_of_Small_Buildings_With_Point_Labels_ICCV_2021_paper.pdf", @@ -46832,7 +49996,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Jae-Hun and Kim,\n ChanYoung and Sull,\n Sanghoon\n},\n title = {\n Weakly Supervised Segmentation of Small Buildings With Point Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7406-7415\n} \n}" }, { "title": "Weakly Supervised Temporal Anomaly Segmentation With Dynamic Time Warping", @@ -46840,6 +50005,7 @@ "status": "Poster", "track": "main", "pid": 5792, + "author_site": "Dongha Lee; Sehun Yu; Hyunjun Ju; Hwanjo Yu", "author": "Dongha Lee; Sehun Yu; Hyunjun Ju; Hwanjo Yu", "abstract": "Most recent studies on detecting and localizing temporal anomalies have mainly employed deep neural networks to learn the normal patterns of temporal data in an unsupervised manner. Unlike them, the goal of our work is to fully utilize instance-level (or weak) anomaly labels, which only indicate whether any anomalous events occurred or not in each instance of temporal data. In this paper, we present WETAS, a novel framework that effectively identifies anomalous temporal segments (i.e., consecutive time points) in an input instance. WETAS learns discriminative features from the instance-level labels so that it infers the sequential order of normal and anomalous segments within each instance, which can be used as a rough segmentation mask. 
Based on the dynamic time warping (DTW) alignment between the input instance and its segmentation mask, WETAS obtains the result of temporal segmentation, and simultaneously, it further enhances itself by using the mask as additional supervision. Our experiments show that WETAS considerably outperforms other baselines in terms of the localization of temporal anomalies, and also it provides more informative results than point-level detection methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lee_Weakly_Supervised_Temporal_Anomaly_Segmentation_With_Dynamic_Time_Warping_ICCV_2021_paper.pdf", @@ -46856,14 +50022,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Lee_Weakly_Supervised_Temporal_Anomaly_Segmentation_With_Dynamic_Time_Warping_ICCV_2021_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Pohang University of Science and Technology", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.postech.ac.kr", "aff_unique_abbr": "UIUC;POSTECH", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Urbana;Pohang", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Lee_2021_ICCV,\n \n author = {\n Lee,\n Dongha and Yu,\n Sehun and Ju,\n Hyunjun and Yu,\n Hwanjo\n},\n title = {\n Weakly Supervised Temporal Anomaly Segmentation With Dynamic Time Warping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7355-7364\n} \n}" }, { "title": "Weakly Supervised Text-Based Person Re-Identification", @@ -46871,10 +50038,11 @@ "status": "Poster", "track": "main", "pid": 2501, + "author_site": "Shizhen Zhao; 
Changxin Gao; Yuanjie Shao; Wei-Shi Zheng; Nong Sang", "author": "Shizhen Zhao; Changxin Gao; Yuanjie Shao; Wei-Shi Zheng; Nong Sang", "abstract": "The conventional text-based person re-identification methods heavily rely on identity annotations. However, this labeling process is costly and time-consuming. In this paper, we consider a more practical setting called weakly supervised text-based person re-identification, where only the text-image pairs are available without the requirement of annotating identities during the training phase. To this end, we propose a Cross-Modal Mutual Training (CMMT) framework. Specifically, to alleviate the intra-class variations, a clustering method is utilized to generate pseudo labels for both visual and textual instances. To further refine the clustering results, CMMT provides a Mutual Pseudo Label Refinement module, which leverages the clustering results in one modality to refine that in the other modality constrained by the text-image pairwise relationship. Meanwhile, CMMT introduces a Text-IoU Guided Cross-Modal Projection Matching loss to resolve the cross-modal matching ambiguity problem. A Text-IoU Guided Hard Sample Mining method is also proposed for learning discriminative textual-visual joint embeddings. 
We conduct extensive experiments to demonstrate the effectiveness of the proposed CMMT, and the results show that CMMT performs favorably against existing text-based person re-identification methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Weakly_Supervised_Text-Based_Person_Re-Identification_ICCV_2021_paper.pdf", - "aff": "Key Laboratory of Image Processing and Intelligent Control, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology; Key Laboratory of Image Processing and Intelligent Control, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology; Key Laboratory of Image Processing and Intelligent Control, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology; Sun Yat-sen University; Key Laboratory of Image Processing and Intelligent Control, School of Arti\ufb01cial Intelligence and Automation, Huazhong University of Science and Technology", + "aff": "Key Laboratory of Image Processing and Intelligent Control, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology; Key Laboratory of Image Processing and Intelligent Control, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology; Key Laboratory of Image Processing and Intelligent Control, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology; Sun Yat-sen University; Key Laboratory of Image Processing and Intelligent Control, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology", "project": "", "github": "https://github.com/X-BrainLab/WS_Text-ReID", "supp": "", @@ -46888,13 +50056,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Zhao_Weakly_Supervised_Text-Based_Person_Re-Identification_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Huazhong 
University of Science and Technology;Sun Yat-sen University", - "aff_unique_dep": "School of Arti\ufb01cial Intelligence and Automation;", + "aff_unique_dep": "School of Artificial Intelligence and Automation;", "aff_unique_url": "http://www.hust.edu.cn;http://www.sysu.edu.cn/", "aff_unique_abbr": "HUST;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n Shizhen and Gao,\n Changxin and Shao,\n Yuanjie and Zheng,\n Wei-Shi and Sang,\n Nong\n},\n title = {\n Weakly Supervised Text-Based Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 11395-11404\n} \n}" }, { "title": "Weakly-Supervised Action Segmentation and Alignment via Transcript-Aware Union-of-Subspaces Learning", @@ -46902,6 +50071,7 @@ "status": "Poster", "track": "main", "pid": 3497, + "author_site": "Zijia Lu; Ehsan Elhamifar", "author": "Zijia Lu; Ehsan Elhamifar", "abstract": "We address the problem of learning to segment actions from weakly-annotated videos, i.e., videos accompanied by transcripts (ordered list of actions). We propose a framework in which we model actions with a union of low-dimensional subspaces, learn the subspaces using transcripts and refine video features that lend themselves to action subspaces. To do so, we design an architecture consisting of a Union-of-Subspace Network, which is an ensemble of autoencoders, each modeling a low-dimensional action subspace and can capture variations of an action within and across videos. For learning, at each iteration, we generate positive and negative soft alignment matrices using the segmentations from the previous iteration, which we use for discriminative training of our model. 
To regularize the learning, we introduce a constraint loss that prevents imbalanced segmentations and enforces relatively similar duration of each action across videos. To have a real-time inference, we develop a hierarchical segmentation framework that uses subset selection to find representative transcripts and hierarchically align a test video with increasingly refined representative transcripts. Our experiments on three datasets show that our method improves the state-of-the-art action segmentation and alignment, while speeding up the inference time by a factor of 4 to 13.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lu_Weakly-Supervised_Action_Segmentation_and_Alignment_via_Transcript-Aware_Union-of-Subspaces_Learning_ICCV_2021_paper.pdf", @@ -46925,7 +50095,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lu_2021_ICCV,\n \n author = {\n Lu,\n Zijia and Elhamifar,\n Ehsan\n},\n title = {\n Weakly-Supervised Action Segmentation and Alignment via Transcript-Aware Union-of-Subspaces Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8085-8095\n} \n}" }, { "title": "Weakly-Supervised Video Anomaly Detection With Robust Temporal Feature Magnitude Learning", @@ -46933,6 +50104,7 @@ "status": "Poster", "track": "main", "pid": 8292, + "author_site": "Yu Tian; Guansong Pang; Yuanhong Chen; Rajvinder Singh; Johan W. Verjans; Gustavo Carneiro", "author": "Yu Tian; Guansong Pang; Yuanhong Chen; Rajvinder Singh; Johan W. 
Verjans; Gustavo Carneiro", "abstract": "Anomaly detection with weakly supervised video-level labels is typically formulated as a multiple instance learning (MIL) problem, in which we aim to identify snippets containing abnormal events, with each video represented as a bag of video snippets. Although current methods show effective detection performance, their recognition of the positive instances, i.e., rare abnormal snippets in the abnormal videos, is largely biased by the dominant negative instances, especially when the abnormal events are subtle anomalies that exhibit only small differences compared with normal events. This issue is exacerbated in many methods that ignore important video temporal dependencies. To address this issue, we introduce a novel and theoretically sound method, named Robust Temporal Feature Magnitude learning (RTFM), which trains a feature magnitude learning function to effectively recognise the positive instances, substantially improving the robustness of the MIL approach to the negative instances from abnormal videos. RTFM also adapts dilated convolutions and self-attention mechanisms to capture long- and short-range temporal dependencies to learn the feature magnitude more faithfully. 
Extensive experiments show that the RTFM-enabled MIL model (i) outperforms several state-of-the-art methods by a large margin on four benchmark data sets (ShanghaiTech, UCF-Crime, XD-Violence and UCSD-Peds) and (ii) achieves significantly improved subtle anomaly discriminability and sample efficiency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Tian_Weakly-Supervised_Video_Anomaly_Detection_With_Robust_Temporal_Feature_Magnitude_Learning_ICCV_2021_paper.pdf", @@ -46947,7 +50119,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tian_Weakly-Supervised_Video_Anomaly_Detection_With_Robust_Temporal_Feature_Magnitude_Learning_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Tian_Weakly-Supervised_Video_Anomaly_Detection_With_Robust_Temporal_Feature_Magnitude_Learning_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Tian_2021_ICCV,\n \n author = {\n Tian,\n Yu and Pang,\n Guansong and Chen,\n Yuanhong and Singh,\n Rajvinder and Verjans,\n Johan W. and Carneiro,\n Gustavo\n},\n title = {\n Weakly-Supervised Video Anomaly Detection With Robust Temporal Feature Magnitude Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4975-4986\n} \n}" }, { "title": "Webly Supervised Fine-Grained Recognition: Benchmark Datasets and an Approach", @@ -46955,6 +50128,7 @@ "status": "Poster", "track": "main", "pid": 8484, + "author_site": "Zeren Sun; Yazhou Yao; Xiu-Shen Wei; Yongshun Zhang; Fumin Shen; Jianxin Wu; Jian Zhang; Heng Tao Shen", "author": "Zeren Sun; Yazhou Yao; Xiu-Shen Wei; Yongshun Zhang; Fumin Shen; Jianxin Wu; Jian Zhang; Heng Tao Shen", "abstract": "Learning from the web can ease the extreme dependence of deep learning on large-scale manually labeled datasets. 
Especially for fine-grained recognition, which targets at distinguishing subordinate categories, it will significantly reduce the labeling costs by leveraging free web data. Despite its significant practical and research value, the webly supervised fine-grained recognition problem is not extensively studied in the computer vision community, largely due to the lack of high-quality datasets. To fill this gap, in this paper we construct two new benchmark webly supervised fine-grained datasets, termed WebFG-496 and WebiNat-5089, respectively. In concretely, WebFG-496 consists of three sub-datasets containing a total of 53,339 web training images with 200 species of birds (Web-bird), 100 types of aircrafts (Web-aircraft), and 196 models of cars (Web-car). For WebiNat-5089, it contains 5089 sub-categories and more than 1.1 million web training images, which is the largest webly supervised fine-grained dataset ever. As a minor contribution, we also propose a novel webly supervised method (termed \"Peer-learning\") for benchmarking these datasets. 
Comprehensive experimental results and analyses on two new benchmark datasets demonstrate that the proposed method achieves superior performance over the competing baseline models and states-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_Webly_Supervised_Fine-Grained_Recognition_Benchmark_Datasets_and_an_Approach_ICCV_2021_paper.pdf", @@ -46969,7 +50143,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sun_Webly_Supervised_Fine-Grained_Recognition_Benchmark_Datasets_and_an_Approach_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sun_Webly_Supervised_Fine-Grained_Recognition_Benchmark_Datasets_and_an_Approach_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Zeren and Yao,\n Yazhou and Wei,\n Xiu-Shen and Zhang,\n Yongshun and Shen,\n Fumin and Wu,\n Jianxin and Zhang,\n Jian and Shen,\n Heng Tao\n},\n title = {\n Webly Supervised Fine-Grained Recognition: Benchmark Datasets and an Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10602-10611\n} \n}" }, { "title": "What You Can Learn by Staring at a Blank Wall", @@ -46977,7 +50152,8 @@ "status": "Poster", "track": "main", "pid": 2599, - "author": "Prafull Sharma; Miika Aittala; Yoav Y. Schechner; Antonio Torralba; Gregory W. Wornell; William T. Freeman; Fr\u00e9do Durand", + "author_site": "Prafull Sharma; Miika Aittala; Yoav Y. Schechner; Antonio Torralba; Gregory W. Wornell; William T. Freeman; Frédo Durand", + "author": "Prafull Sharma; Miika Aittala; Yoav Y. Schechner; Antonio Torralba; Gregory W. Wornell; William T. 
Freeman; Frédo Durand", "abstract": "We present a passive non-line-of-sight method that infers the number of people or activity of a person from the observation of a blank wall in an unknown room. Our technique analyzes complex imperceptible changes in indirect illumination in a video of the wall to reveal a signal that is correlated with motion in the hidden part of a scene. We use this signal to classify between zero, one, or two moving people, or the activity of a person in the hidden scene. We train two convolutional neural networks using data collected from 20 different scenes, and achieve an accuracy of approximately 94% for both tasks in unseen test environments and real-time online settings. Unlike other passive non-line-of-sight methods, the technique does not rely on known occluders or controllable light sources, and generalizes to unknown rooms with no recalibration. We analyze the generalization and robustness of our method with both real and synthetic data, and study the effect of the scene parameters on the signal quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sharma_What_You_Can_Learn_by_Staring_at_a_Blank_Wall_ICCV_2021_paper.pdf", "aff": ";;;;;;", @@ -46991,7 +50167,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sharma_What_You_Can_Learn_by_Staring_at_a_Blank_Wall_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Sharma_What_You_Can_Learn_by_Staring_at_a_Blank_Wall_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Sharma_2021_ICCV,\n \n author = {\n Sharma,\n Prafull and Aittala,\n Miika and Schechner,\n Yoav Y. and Torralba,\n Antonio and Wornell,\n Gregory W. and Freeman,\n William T. 
and Durand,\n Fr\\'edo\n},\n title = {\n What You Can Learn by Staring at a Blank Wall\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 2330-2339\n} \n}" }, { "title": "When Do GANs Replicate? On the Choice of Dataset Size", @@ -46999,6 +50176,7 @@ "status": "Poster", "track": "main", "pid": 7035, + "author_site": "Qianli Feng; Chenqi Guo; Fabian Benitez-Quiroz; Aleix M. Martinez", "author": "Qianli Feng; Chenqi Guo; Fabian Benitez-Quiroz; Aleix M. Martinez", "abstract": "Do GANs replicate training images? Previous studies have shown that GANs do not seem to replicate training data without significant change in the training procedure. This leads to a series of research on the exact condition needed for GANs to overfit to the training data. Although a number of factors has been theoretically or empirically identified, the effect of dataset size and complexity on GANs replication is still unknown. With empirical evidence from BigGAN and StyleGAN2, on datasets CelebA, Flower and LSUN-bedroom, we show that dataset size and its complexity play an important role in GANs replication and perceptual quality of the generated images. We further quantify this relationship, discovering that replication percentage decays exponentially with respect to dataset size and complexity, with a shared decaying factor across GAN-dataset combinations. Meanwhile, the perceptual image quality follows a U-shape trend w.r.t dataset size. 
This finding leads to a practical tool for one-shot estimation on minimal dataset size to prevent GAN replication which can be used to guide datasets construction and selection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Feng_When_Do_GANs_Replicate_On_the_Choice_of_Dataset_Size_ICCV_2021_paper.pdf", @@ -47015,14 +50193,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Feng_When_Do_GANs_Replicate_On_the_Choice_of_Dataset_Size_ICCV_2021_paper.html", "aff_unique_index": "0+1;0;0;0+1", - "aff_unique_norm": "Ohio State University;Amazon", - "aff_unique_dep": ";Amazon.com, Inc.", + "aff_unique_norm": "The Ohio State University;Amazon.com, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.osu.edu;https://www.amazon.com", "aff_unique_abbr": "OSU;Amazon", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Feng_2021_ICCV,\n \n author = {\n Feng,\n Qianli and Guo,\n Chenqi and Benitez-Quiroz,\n Fabian and Martinez,\n Aleix M.\n},\n title = {\n When Do GANs Replicate? On the Choice of Dataset Size\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6701-6710\n} \n}" }, { "title": "When Pigs Fly: Contextual Reasoning in Synthetic and Natural Scenes", @@ -47030,10 +50209,11 @@ "status": "Poster", "track": "main", "pid": 1840, + "author_site": "Philipp Bomatter; Mengmi Zhang; Dimitar Karev; Spandan Madan; Claire Tseng; Gabriel Kreiman", "author": "Philipp Bomatter; Mengmi Zhang; Dimitar Karev; Spandan Madan; Claire Tseng; Gabriel Kreiman", "abstract": "Context is of fundamental importance to both human and machine vision; e.g., an object in the air is more likely to be an airplane than a pig. 
The rich notion of context incorporates several aspects including physics rules, statistical co-occurrences, and relative object sizes, among others. While previous work has focused on crowd-sourced out-of-context photographs from the web to study scene context, controlling the nature and extent of contextual violations has been a daunting task. Here we introduce a diverse, synthetic Out-of-Context Dataset (OCD) with fine-grained control over scene context. By leveraging a 3D simulation engine, we systematically control the gravity, object co-occurrences and relative sizes across 36 object categories in a virtual household environment. We conducted a series of experiments to gain insights into the impact of contextual cues on both human and machine vision using OCD. We conducted psychophysics experiments to establish a human benchmark for out-of-context recognition and then compared it with state-of-the-art computer vision models to quantify the gap between the two. We propose a context-aware recognition transformer model, fusing object and contextual information via multi-head attention. Our model captures useful information for contextual reasoning, enabling human-level performance and better robustness in out-of-context conditions compared to baseline models across OCD and other out-of-context datasets. 
All source code and data are publicly available at https://github.com/kreimanlab/WhenPigsFlyContext", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Bomatter_When_Pigs_Fly_Contextual_Reasoning_in_Synthetic_and_Natural_Scenes_ICCV_2021_paper.pdf", - "aff": "ETH Z\u00fcrich; Children\u2019s Hospital, Harvard Medical School+Center for Brains, Minds and Machines; Harvard College, Harvard University; Center for Brains, Minds and Machines+School of Engineering and Applied Sciences, Harvard University; Harvard College, Harvard University; Children\u2019s Hospital, Harvard Medical School+Center for Brains, Minds and Machines", + "aff": "ETH Zürich; Children’s Hospital, Harvard Medical School+Center for Brains, Minds and Machines; Harvard College, Harvard University; Center for Brains, Minds and Machines+School of Engineering and Applied Sciences, Harvard University; Harvard College, Harvard University; Children’s Hospital, Harvard Medical School+Center for Brains, Minds and Machines", "project": "", "github": "https://github.com/kreimanlab/WhenPigsFlyContext", "supp": "https://openaccess.thecvf.com/content/ICCV2021/supplemental/Bomatter_When_Pigs_Fly_ICCV_2021_supplemental.pdf", @@ -47046,14 +50226,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Bomatter_When_Pigs_Fly_Contextual_Reasoning_in_Synthetic_and_Natural_Scenes_ICCV_2021_paper.html", "aff_unique_index": "0;1+2;3;2+3;3;1+2", - "aff_unique_norm": "ETH Zurich;Harvard Medical School;Center for Brains, Minds and Machines;Harvard University", - "aff_unique_dep": ";Children\u2019s Hospital;;Harvard College", + "aff_unique_norm": "ETH Zürich;Harvard Medical School;Center for Brains, Minds and Machines;Harvard University", + "aff_unique_dep": ";Children’s Hospital;;Harvard College", "aff_unique_url": "https://www.ethz.ch;https://hms.harvard.edu;http://cbmm.mit.edu/;https://www.harvard.edu", "aff_unique_abbr": "ETHZ;HMS;CBMM;Harvard", "aff_campus_unique_index": ";1;1;1;", 
"aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1+1;1;1+1;1;1+1", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Bomatter_2021_ICCV,\n \n author = {\n Bomatter,\n Philipp and Zhang,\n Mengmi and Karev,\n Dimitar and Madan,\n Spandan and Tseng,\n Claire and Kreiman,\n Gabriel\n},\n title = {\n When Pigs Fly: Contextual Reasoning in Synthetic and Natural Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 255-264\n} \n}" }, { "title": "Where Are You Heading? Dynamic Trajectory Prediction With Expert Goal Examples", @@ -47061,6 +50242,7 @@ "status": "Poster", "track": "main", "pid": 6217, + "author_site": "He Zhao; Richard P. Wildes", "author": "He Zhao; Richard P. Wildes", "abstract": "Goal-conditioned approaches recently have been found very useful to human trajectory prediction, when adequate goal estimates are provided. Yet, goal inference is difficult in itself and often incurs extra learning efforts. We propose to predict pedestrian trajectories via the guidance of goal expertise, which can be obtained with modest expense through a novel goal-search mechanism on already seen training examples. There are three key contributions in our study. First, we devise a framework that exploits the nearest examples for high-quality goal position inquiry. This approach naturally considers multi-modality, physical constraints, compatibility with existing methods and is model-free; it therefore does not require additional learning efforts typical in goal inference. Second, we present an end-to-end trajectory predictor that can efficiently associate goal retrievals to past motion information and dynamically infer possible future trajectories. 
Third, with these two novel techniques in hand, we conduct a series of experiments on two broadly explored datasets (SDD and ETH/UCY) and show that our approach surpasses previous state-of-the-art performance by notable margins and reduces the need for additional parameters.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhao_Where_Are_You_Heading_Dynamic_Trajectory_Prediction_With_Expert_Goal_ICCV_2021_paper.pdf", @@ -47084,7 +50266,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Zhao_2021_ICCV,\n \n author = {\n Zhao,\n He and Wildes,\n Richard P.\n},\n title = {\n Where Are You Heading? Dynamic Trajectory Prediction With Expert Goal Examples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 7629-7638\n} \n}" }, { "title": "Where2Act: From Pixels to Actions for Articulated 3D Objects", @@ -47092,6 +50275,7 @@ "status": "Poster", "track": "main", "pid": 1027, + "author_site": "Kaichun Mo; Leonidas J. Guibas; Mustafa Mukadam; Abhinav Gupta; Shubham Tulsiani", "author": "Kaichun Mo; Leonidas J. Guibas; Mustafa Mukadam; Abhinav Gupta; Shubham Tulsiani", "abstract": "One of the fundamental goals of visual perception is to allow agents to meaningfully interact with their environment. In this paper, we take a step towards that long-term goal -- we extract highly localized actionable information related to elementary actions such as pushing or pulling for articulated objects with movable parts. For example, given a drawer, our network predicts that applying a pulling force on the handle opens the drawer. 
We propose, discuss, and evaluate novel network architectures that given image and depth data, predict the set of actions possible at each pixel, and the regions over articulated parts that are likely to move under the force. We propose a learning-from-interaction framework with an online data sampling strategy that allows us to train the network in simulation (SAPIEN) and generalizes across categories. Check the website for code and data release.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Mo_Where2Act_From_Pixels_to_Actions_for_Articulated_3D_Objects_ICCV_2021_paper.pdf", @@ -47108,14 +50292,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Mo_Where2Act_From_Pixels_to_Actions_for_Articulated_3D_Objects_ICCV_2021_paper.html", "aff_unique_index": "0;0;1;1;1", - "aff_unique_norm": "Stanford University;Meta", + "aff_unique_norm": "Stanford University;Facebook", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.stanford.edu;https://research.facebook.com", "aff_unique_abbr": "Stanford;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mo_2021_ICCV,\n \n author = {\n Mo,\n Kaichun and Guibas,\n Leonidas J. and Mukadam,\n Mustafa and Gupta,\n Abhinav and Tulsiani,\n Shubham\n},\n title = {\n Where2Act: From Pixels to Actions for Articulated 3D Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6813-6823\n} \n}" }, { "title": "Who's Waldo? 
Linking People Across Text and Images", @@ -47123,6 +50308,7 @@ "status": "Poster", "track": "main", "pid": 3381, + "author_site": "Yuqing Cui; Apoorv Khandelwal; Yoav Artzi; Noah Snavely; Hadar Averbuch-Elor", "author": "Yuqing Cui; Apoorv Khandelwal; Yoav Artzi; Noah Snavely; Hadar Averbuch-Elor", "abstract": "We present a task and benchmark dataset for person-centric visual grounding, the problem of linking between people named in a caption and people pictured in an image. In contrast to prior work in visual grounding, which is predominantly object-based, our new task masks out the names of people in captions in order to encourage methods trained on such image--caption pairs to focus on contextual cues (such as rich interactions between multiple people), rather than learning associations between names and appearances. To facilitate this task, we introduce a new dataset, Who's Waldo, mined automatically from image--caption data on Wikimedia Commons. We propose a Transformer-based method that outperforms several strong baselines on this task, and are releasing our data to the research community to spur work on contextual models that consider both vision and language.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Cui_Whos_Waldo_Linking_People_Across_Text_and_Images_ICCV_2021_paper.pdf", @@ -47146,7 +50332,8 @@ "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";New York City", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cui_2021_ICCV,\n \n author = {\n Cui,\n Yuqing and Khandelwal,\n Apoorv and Artzi,\n Yoav and Snavely,\n Noah and Averbuch-Elor,\n Hadar\n},\n title = {\n Who's Waldo? 
Linking People Across Text and Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1374-1384\n} \n}" }, { "title": "Why Approximate Matrix Square Root Outperforms Accurate SVD in Global Covariance Pooling?", @@ -47154,6 +50341,7 @@ "status": "Poster", "track": "main", "pid": 7519, + "author_site": "Yue Song; Nicu Sebe; Wei Wang", "author": "Yue Song; Nicu Sebe; Wei Wang", "abstract": "Global Covariance Pooling (GCP) aims at exploiting the second-order statistics of the convolutional feature. Its effectiveness has been demonstrated in boosting the classification performance of Convolutional Neural Networks (CNNs). Singular Value Decomposition (SVD) is used in GCP to compute the matrix square root. However, the approximate matrix square root calculated using Newton-Schulz iteration [??] outperforms the accurate one computed via SVD [??]. We empirically analyze the reason behind the performance gap from the perspectives of data precision and gradient smoothness. Various remedies for computing smooth SVD gradients are investigated. Based on our observation and analyses, a hybrid training protocol is proposed for SVD-based GCP meta-layers such that competitive performances can be achieved against Newton-Schulz iteration. Moreover, we propose a new GCP meta-layer that uses SVD in the forward pass, and Pade approximants in the backward propagation to compute the gradients. 
The proposed meta-layer has been integrated into different CNN models and achieves state-of-the-art performances on both large-scale and fine-grained datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Song_Why_Approximate_Matrix_Square_Root_Outperforms_Accurate_SVD_in_Global_ICCV_2021_paper.pdf", @@ -47177,7 +50365,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Trento", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Song_2021_ICCV,\n \n author = {\n Song,\n Yue and Sebe,\n Nicu and Wang,\n Wei\n},\n title = {\n Why Approximate Matrix Square Root Outperforms Accurate SVD in Global Covariance Pooling?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1115-1123\n} \n}" }, { "title": "With a Little Help From My Friends: Nearest-Neighbor Contrastive Learning of Visual Representations", @@ -47185,6 +50374,7 @@ "status": "Poster", "track": "main", "pid": 2259, + "author_site": "Debidatta Dwibedi; Yusuf Aytar; Jonathan Tompson; Pierre Sermanet; Andrew Zisserman", "author": "Debidatta Dwibedi; Yusuf Aytar; Jonathan Tompson; Pierre Sermanet; Andrew Zisserman", "abstract": "Self-supervised learning algorithms based on instance discrimination train encoders to be invariant to pre-defined transformations of the same instance. While most methods treat different views of the same image as positives for a contrastive loss, we are interested in using positives from other instances in the dataset. Our method, Nearest-Neighbor Contrastive Learning of visual Representations (NNCLR), samples the nearest neighbors from the dataset in the latent space, and treats them as positives. This provides more semantic variations than pre-defined transformations. 
We find that using the nearest-neighbor as positive in contrastive losses improves performance significantly on ImageNet classification, from 71.7% to 75.6%, outperforming previous state-of-the-art methods. On semi-supervised learning benchmarks we improve performance significantly when only 1% ImageNet labels are available, from 53.8% to 56.5%. On transfer learning benchmarks our method outperforms state-of-the-art methods (including supervised learning with ImageNet) on 8 out of 12 downstream datasets. Furthermore, we demonstrate empirically that our method is less reliant on complex data augmentations. We see a relative reduction of only 2.1% ImageNet Top-1 accuracy when we train using only random crops.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Dwibedi_With_a_Little_Help_From_My_Friends_Nearest-Neighbor_Contrastive_Learning_ICCV_2021_paper.pdf", @@ -47208,7 +50398,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Dwibedi_2021_ICCV,\n \n author = {\n Dwibedi,\n Debidatta and Aytar,\n Yusuf and Tompson,\n Jonathan and Sermanet,\n Pierre and Zisserman,\n Andrew\n},\n title = {\n With a Little Help From My Friends: Nearest-Neighbor Contrastive Learning of Visual Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9588-9597\n} \n}" }, { "title": "Worldsheet: Wrapping the World in a 3D Sheet for View Synthesis From a Single Image", @@ -47216,6 +50407,7 @@ "status": "Poster", "track": "main", "pid": 5636, + "author_site": "Ronghang Hu; Nikhila Ravi; Alexander C. Berg; Deepak Pathak", "author": "Ronghang Hu; Nikhila Ravi; Alexander C. 
Berg; Deepak Pathak", "abstract": "We present Worldsheet, a method for novel view synthesis using just a single RGB image as input. The main insight is that simply shrink-wrapping a planar mesh sheet onto the input image, consistent with the learned intermediate depth, captures underlying geometry sufficient to generate photorealistic unseen views with large viewpoint changes. To operationalize this, we propose a novel differentiable texture sampler that allows our wrapped mesh sheet to be textured and rendered differentiably into an image from a target viewpoint. Our approach is category-agnostic, end-to-end trainable without using any 3D supervision, and requires a single image at test time. We also explore a simple extension by stacking multiple layers of Worldsheets to better handle occlusions. Worldsheet consistently outperforms prior state-of-the-art methods on single-image view synthesis across several datasets. Furthermore, this simple idea captures novel views surprisingly well on a wide range of high-resolution in-the-wild images, converting them into navigable 3D pop-ups. Video results and code are available at https://worldsheet.github.io.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Hu_Worldsheet_Wrapping_the_World_in_a_3D_Sheet_for_View_ICCV_2021_paper.pdf", @@ -47230,7 +50422,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_Worldsheet_Wrapping_the_World_in_a_3D_Sheet_for_View_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Hu_Worldsheet_Wrapping_the_World_in_a_3D_Sheet_for_View_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Hu_2021_ICCV,\n \n author = {\n Hu,\n Ronghang and Ravi,\n Nikhila and Berg,\n Alexander C. 
and Pathak,\n Deepak\n},\n title = {\n Worldsheet: Wrapping the World in a 3D Sheet for View Synthesis From a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 12528-12537\n} \n}" }, { "title": "X-World: Accessibility, Vision, and Autonomy Meet", @@ -47238,6 +50431,7 @@ "status": "Poster", "track": "main", "pid": 1449, + "author_site": "Jimuyang Zhang; Minglan Zheng; Matthew Boyd; Eshed Ohn-Bar", "author": "Jimuyang Zhang; Minglan Zheng; Matthew Boyd; Eshed Ohn-Bar", "abstract": "An important issue facing vision-based intelligent systems today is the lack of accessibility-aware development. A main reason for this issue is the absence of any large-scale, standardized vision benchmarks that incorporate relevant tasks and scenarios related to people with disabilities. This lack of representation hinders even preliminary analysis with respect to underlying pose, appearance, and occlusion characteristics of diverse pedestrians. What is the impact of significant occlusion from a wheelchair on instance segmentation quality? How can interaction with mobility aids, e.g., a long and narrow walking cane, be recognized robustly? To begin addressing such questions, we introduce X-World, an accessibility-centered development environment for vision-based autonomous systems. We tackle inherent data scarcity by leveraging a simulation environment to spawn dynamic agents with various mobility aids. The simulation supports generation of ample amounts of finely annotated, multi-modal data in a safe, cheap, and privacy-preserving manner. Our analysis highlights novel challenges introduced by our benchmark and tasks, as well as numerous opportunities for future developments. We further broaden our analysis using a complementary real-world evaluation benchmark of in-situ navigation by pedestrians with disabilities. 
Our contributions provide an initial step towards widespread deployment of vision-based agents that can perceive and model the interaction needs of diverse people with disabilities.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Zhang_X-World_Accessibility_Vision_and_Autonomy_Meet_ICCV_2021_paper.pdf", @@ -47261,7 +50455,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2021_ICCV,\n \n author = {\n Zhang,\n Jimuyang and Zheng,\n Minglan and Boyd,\n Matthew and Ohn-Bar,\n Eshed\n},\n title = {\n X-World: Accessibility,\n Vision,\n and Autonomy Meet\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 9762-9771\n} \n}" }, { "title": "XVFI: eXtreme Video Frame Interpolation", @@ -47269,6 +50464,7 @@ "status": "Poster", "track": "main", "pid": 9691, + "author_site": "Hyeonjun Sim; Jihyong Oh; Munchurl Kim", "author": "Hyeonjun Sim; Jihyong Oh; Munchurl Kim", "abstract": "In this paper, we firstly present a dataset (X4K1000FPS) of 4K videos of 1000 fps with the extreme motion to the research community for video frame interpolation (VFI), and propose an extreme VFI network, called XVFI-Net, that first handles the VFI for 4K videos with large motion. The XVFI-Net is based on a recursive multi-scale shared structure that consists of two cascaded modules for bidirectional optical flow learning between two input frames (BiOF-I) and for bidirectional optical flow learning from target to input frames (BiOF-T). The optical flows are stably approximated by a complementary flow reversal (CFR) proposed in BiOF-T module. 
During inference, the BiOF-I module can start at any scale of input while the BiOF-T module only operates at the original input scale so that the inference can be accelerated while maintaining highly accurate VFI performance. Extensive experimental results show that our XVFI-Net can successfully capture the essential information of objects with extremely large motions and complex textures while the state-of-the-art methods exhibit poor performance. Furthermore, our XVFI-Net framework also performs comparably on the previous lower resolution benchmark dataset, which shows a robustness of our algorithm as well. All source codes, pre-trained models, and proposed X4K1000FPS datasets are publicly available at https://github.com/JihyongOh/XVFI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sim_XVFI_eXtreme_Video_Frame_Interpolation_ICCV_2021_paper.pdf", @@ -47292,7 +50488,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Sim_2021_ICCV,\n \n author = {\n Sim,\n Hyeonjun and Oh,\n Jihyong and Kim,\n Munchurl\n},\n title = {\n XVFI: eXtreme Video Frame Interpolation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14489-14498\n} \n}" }, { "title": "You Don't Only Look Once: Constructing Spatial-Temporal Memory for Integrated 3D Object Detection and Tracking", @@ -47300,6 +50497,7 @@ "status": "Poster", "track": "main", "pid": 7049, + "author_site": "Jiaming Sun; Yiming Xie; Siyu Zhang; Linghao Chen; Guofeng Zhang; Hujun Bao; Xiaowei Zhou", "author": "Jiaming Sun; Yiming Xie; Siyu Zhang; Linghao Chen; Guofeng Zhang; Hujun Bao; Xiaowei Zhou", "abstract": "Humans are able to continuously detect and track surrounding objects by constructing a spatial-temporal memory of the objects when looking 
around. In contrast, 3D object detectors in existing tracking-by-detection systems often search for objects in every new video frame from scratch, without fully leveraging memory from previous detection results. In this work, we propose a novel system for integrated 3D object detection and tracking, which uses a dynamic object occupancy map and previous object states as spatial-temporal memory to assist object detection in future frames. This memory, together with the ego-motion from back-end odometry, guides the detector to achieve more efficient object proposal generation and more accurate object state estimation. The experiments demonstrate the effectiveness of the proposed system and its performance on the ScanNet and KITTI datasets. Moreover, the proposed system produces stable bounding boxes and pose trajectories over time, while being able to handle occluded and truncated objects. Code is available at the project page: https://zju3dv.github.io/UDOLO.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sun_You_Dont_Only_Look_Once_Constructing_Spatial-Temporal_Memory_for_Integrated_ICCV_2021_paper.pdf", @@ -47323,7 +50521,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2021_ICCV,\n \n author = {\n Sun,\n Jiaming and Xie,\n Yiming and Zhang,\n Siyu and Chen,\n Linghao and Zhang,\n Guofeng and Bao,\n Hujun and Zhou,\n Xiaowei\n},\n title = {\n You Don't Only Look Once: Constructing Spatial-Temporal Memory for Integrated 3D Object Detection and Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 3185-3194\n} \n}" }, { "title": "YouRefIt: Embodied Reference Understanding With Language and Gesture", @@ -47331,6 +50530,7 @@ "status": "Poster", "track": "main", "pid": 6741, + 
"author_site": "Yixin Chen; Qing Li; Deqian Kong; Yik Lun Kei; Song-Chun Zhu; Tao Gao; Yixin Zhu; Siyuan Huang", "author": "Yixin Chen; Qing Li; Deqian Kong; Yik Lun Kei; Song-Chun Zhu; Tao Gao; Yixin Zhu; Siyuan Huang", "abstract": "We study the machine's understanding of embodied reference: One agent uses both language and gesture to refer to an object to another agent in a shared physical environment. Of note, this new visual task requires understanding multimodal cues with perspective-taking to identify which object is being referred to. To tackle this problem, we introduce YouRefIt, a new crowd-sourced dataset of embodied reference collected in various physical scenes; the dataset contains 4,195 unique reference clips in 432 indoor scenes. To the best of our knowledge, this is the first embodied reference dataset that allows us to study referring expressions in daily physical scenes to understand referential behavior, human communication, and human-robot interaction. We further devise two benchmarks for image-based and video-based embodied reference understanding. Comprehensive baselines and extensive experiments provide the very first result of machine perception on how the referring expressions and gestures affect the embodied reference understanding. 
Our results provide essential evidence that gestural cues are as critical as language cues in understanding the embodied reference.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chen_YouRefIt_Embodied_Reference_Understanding_With_Language_and_Gesture_ICCV_2021_paper.pdf", @@ -47345,7 +50545,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_YouRefIt_Embodied_Reference_Understanding_With_Language_and_Gesture_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Chen_YouRefIt_Embodied_Reference_Understanding_With_Language_and_Gesture_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Chen_2021_ICCV,\n \n author = {\n Chen,\n Yixin and Li,\n Qing and Kong,\n Deqian and Kei,\n Yik Lun and Zhu,\n Song-Chun and Gao,\n Tao and Zhu,\n Yixin and Huang,\n Siyuan\n},\n title = {\n YouRefIt: Embodied Reference Understanding With Language and Gesture\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1385-1395\n} \n}" }, { "title": "Z-Score Normalization, Hubness, and Few-Shot Learning", @@ -47353,6 +50554,7 @@ "status": "Poster", "track": "main", "pid": 9144, + "author_site": "Nanyi Fei; Yizhao Gao; Zhiwu Lu; Tao Xiang", "author": "Nanyi Fei; Yizhao Gao; Zhiwu Lu; Tao Xiang", "abstract": "The goal of few-shot learning (FSL) is to recognize a set of novel classes with only few labeled samples by exploiting a large set of abundant base class samples. Adopting a meta-learning framework, most recent FSL methods meta-learn a deep feature embedding network, and during inference classify novel class samples using nearest neighbor in the learned high-dimensional embedding space. 
This means that these methods are prone to the hubness problem, that is, a certain class prototype becomes the nearest neighbor of many test instances regardless which classes they belong to. However, this problem is largely ignored in existing FSL studies. In this work, for the first time we show that many FSL methods indeed suffer from the hubness problem. To mitigate its negative effects, we further propose to employ z-score feature normalization, a simple yet effective transformation, during meta-training. A theoretical analysis is provided on why it helps. Extensive experiments are then conducted to show that with z-score normalization, the performance of many recent FSL methods can be boosted, resulting in new state-of-the-art on three benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Fei_Z-Score_Normalization_Hubness_and_Few-Shot_Learning_ICCV_2021_paper.pdf", @@ -47376,7 +50578,8 @@ "aff_campus_unique_index": "0+0;0;0;2", "aff_campus_unique": "Beijing;;Guildford", "aff_country_unique_index": "0+0+0;0+0;0+0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Fei_2021_ICCV,\n \n author = {\n Fei,\n Nanyi and Gao,\n Yizhao and Lu,\n Zhiwu and Xiang,\n Tao\n},\n title = {\n Z-Score Normalization,\n Hubness,\n and Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 142-151\n} \n}" }, { "title": "ZFlow: Gated Appearance Flow-Based Virtual Try-On With 3D Priors", @@ -47384,6 +50587,7 @@ "status": "Poster", "track": "main", "pid": 10355, + "author_site": "Ayush Chopra; Rishabh Jain; Mayur Hemani; Balaji Krishnamurthy", "author": "Ayush Chopra; Rishabh Jain; Mayur Hemani; Balaji Krishnamurthy", "abstract": "Image-based virtual try-on involves synthesizing perceptually convincing images of a model wearing a particular garment and has 
garnered significant research interest due to its immense practical applicability. Recent methods involve a two-stage process: i) warping of the garment to align with the model ii) texture fusion of the warped garment and target model to generate the try-on output. Issues arise due to the non-rigid nature of garments and the lack of geometric information about the model or the garment. It often results in improper rendering of granular details. We propose ZFlow, an end-to-end framework, which seeks to alleviate these concerns regarding geometric and textural integrity (such as pose, depth-ordering, skin and neckline reproduction) through a combination of gated aggregation of hierarchical flow estimates termed Gated Appearance Flow, and dense structural priors at various stage of the network. ZFlow achieves state-of-the-art results as observed qualitatively, and on benchmark image quality measures (PSNR, SSIM, and FID scores). The paper also presents extensive comparisons with existing state-of-the-art including a detailed user study and ablation studies to gauge the effectiveness of each of our contributions on multiple datasets", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Chopra_ZFlow_Gated_Appearance_Flow-Based_Virtual_Try-On_With_3D_Priors_ICCV_2021_paper.pdf", @@ -47407,7 +50611,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "Cambridge;;Pilani", "aff_country_unique_index": "0+0;1+0;0;0", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Chopra_2021_ICCV,\n \n author = {\n Chopra,\n Ayush and Jain,\n Rishabh and Hemani,\n Mayur and Krishnamurthy,\n Balaji\n},\n title = {\n ZFlow: Gated Appearance Flow-Based Virtual Try-On With 3D Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5433-5442\n} \n}" }, { "title": "Zen-NAS: A Zero-Shot NAS for 
High-Performance Image Recognition", @@ -47415,6 +50620,7 @@ "status": "Poster", "track": "main", "pid": 5417, + "author_site": "Ming Lin; Pichao Wang; Zhenhong Sun; Hesen Chen; Xiuyu Sun; Qi Qian; Hao Li; Rong Jin", "author": "Ming Lin; Pichao Wang; Zhenhong Sun; Hesen Chen; Xiuyu Sun; Qi Qian; Hao Li; Rong Jin", "abstract": "Accuracy predictor is a key component in Neural Architecture Search (NAS) for ranking architectures. Building a high-quality accuracy predictor usually costs enormous computation. To address this issue, instead of using an accuracy predictor, we propose a novel zero-shot index dubbed Zen-Score to rank the architectures. The Zen-Score represents the network expressivity and positively correlates with the model accuracy. The calculation of Zen-Score only takes a few forward inferences through a randomly initialized network, without training network parameters. Built upon the Zen-Score, we further propose a new NAS algorithm, termed as Zen-NAS, by maximizing the Zen-Score of the target network under given inference budgets. Within less than half GPU day, Zen-NAS is able to directly search high performance architectures in a data-free style. Comparing with previous NAS methods, the proposed Zen-NAS is magnitude times faster on multiple server-side and mobile-side GPU platforms with state-of-the-art accuracy on ImageNet. 
Searching and training code as well as pre-trained models are available from https://github.com/idstcv/ZenNAS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_Zen-NAS_A_Zero-Shot_NAS_for_High-Performance_Image_Recognition_ICCV_2021_paper.pdf", @@ -47438,7 +50644,8 @@ "aff_campus_unique_index": "0;0;1;1;1;0;1;1", "aff_campus_unique": "Bellevue;Hangzhou", "aff_country_unique_index": "0;0;1;1;1;0;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Lin_2021_ICCV,\n \n author = {\n Lin,\n Ming and Wang,\n Pichao and Sun,\n Zhenhong and Chen,\n Hesen and Sun,\n Xiuyu and Qian,\n Qi and Li,\n Hao and Jin,\n Rong\n},\n title = {\n Zen-NAS: A Zero-Shot NAS for High-Performance Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 347-356\n} \n}" }, { "title": "Zero-Shot Day-Night Domain Adaptation With a Physics Prior", @@ -47446,6 +50653,7 @@ "status": "Poster", "track": "main", "pid": 8606, + "author_site": "Attila Lengyel; Sourav Garg; Michael Milford; Jan C. van Gemert", "author": "Attila Lengyel; Sourav Garg; Michael Milford; Jan C. van Gemert", "abstract": "We explore the zero-shot setting for day-night domain adaptation. The traditional domain adaptation setting is to train on one domain and adapt to the target domain by exploiting unlabeled data samples from the test set. As gathering relevant test data is expensive and sometimes even impossible, we do not rely on test data and instead exploit a visual inductive prior derived from physics-based reflection models for domain adaptation. We cast a number of color invariant edge detectors as trainable layers in a convolutional neural network and evaluate their robustness to illumination changes. 
We show that the color invariant layer reduces the day-night distribution shift in feature map activations throughout the network. We demonstrate improved performance for zero-shot day to night domain adaptation on both synthetic as well as natural datasets in various tasks, including classification, segmentation and place recognition.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Lengyel_Zero-Shot_Day-Night_Domain_Adaptation_With_a_Physics_Prior_ICCV_2021_paper.pdf", @@ -47465,11 +50673,12 @@ "aff_unique_norm": "Delft University of Technology;Queensland University of Technology", "aff_unique_dep": ";Centre for Robotics", "aff_unique_url": "https://www.tudelft.nl;https://www.qut.edu.au", - "aff_unique_abbr": "TU Delft;QUT", + "aff_unique_abbr": "TUDelft;QUT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "Netherlands;Australia" + "aff_country_unique": "Netherlands;Australia", + "bibtex": "@InProceedings{Lengyel_2021_ICCV,\n \n author = {\n Lengyel,\n Attila and Garg,\n Sourav and Milford,\n Michael and van Gemert,\n Jan C.\n},\n title = {\n Zero-Shot Day-Night Domain Adaptation With a Physics Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4399-4409\n} \n}" }, { "title": "Zero-Shot Natural Language Video Localization", @@ -47477,6 +50686,7 @@ "status": "Poster", "track": "main", "pid": 1518, + "author_site": "Jinwoo Nam; Daechul Ahn; Dongyeop Kang; Seong Jong Ha; Jonghyun Choi", "author": "Jinwoo Nam; Daechul Ahn; Dongyeop Kang; Seong Jong Ha; Jonghyun Choi", "abstract": "Understanding videos to localize moments with natural language often requires large expensive annotated video regions paired with language queries. To eliminate the annotation costs, we make a first attempt to train a natural language video localization model in zero-shot manner. 
Inspired by unsupervised image captioning setup, we merely require random text corpora, unlabeled video collections, and an off-the-shelf object detector to train a model. With the unrelated and unpaired data, we propose to generate pseudo-supervision of candidate temporal regions and corresponding query sentences, and develop a simple NLVL model to train with the pseudo-supervision. Our empirical validations show that the proposed pseudo-supervised method outperforms several baseline approaches and a number of methods using stronger supervision on Charades-STA and ActivityNet-Captions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Nam_Zero-Shot_Natural_Language_Video_Localization_ICCV_2021_paper.pdf", @@ -47500,7 +50710,8 @@ "aff_campus_unique_index": "0;0;1+2;0", "aff_campus_unique": "Gwangju;Berkeley;Twin Cities;", "aff_country_unique_index": "0;0;1+1;0;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Nam_2021_ICCV,\n \n author = {\n Nam,\n Jinwoo and Ahn,\n Daechul and Kang,\n Dongyeop and Ha,\n Seong Jong and Choi,\n Jonghyun\n},\n title = {\n Zero-Shot Natural Language Video Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 1470-1479\n} \n}" }, { "title": "iMAP: Implicit Mapping and Positioning in Real-Time", @@ -47508,6 +50719,7 @@ "status": "Poster", "track": "main", "pid": 8134, + "author_site": "Edgar Sucar; Shikun Liu; Joseph Ortiz; Andrew J. Davison", "author": "Edgar Sucar; Shikun Liu; Joseph Ortiz; Andrew J. Davison", "abstract": "We show for the first time that a multilayer perceptron (MLP) can serve as the only scene representation in a real-time SLAM system for a handheld RGB-D camera. 
Our network is trained in live operation without prior data, building a dense, scene-specific implicit 3D model of occupancy and colour which is also immediately used for tracking. Achieving real-time SLAM via continual training of a neural network against a live image stream requires significant innovation. Our iMAP algorithm uses a keyframe structure and multi-processing computation flow, with dynamic information-guided pixel sampling for speed, with tracking at 10 Hz and global map updating at 2 Hz. The advantages of an implicit MLP over standard dense SLAM techniques include efficient geometry representation with automatic detail control and smooth, plausible filling-in of unobserved regions such as the back surfaces of objects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Sucar_iMAP_Implicit_Mapping_and_Positioning_in_Real-Time_ICCV_2021_paper.pdf", @@ -47531,7 +50743,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Sucar_2021_ICCV,\n \n author = {\n Sucar,\n Edgar and Liu,\n Shikun and Ortiz,\n Joseph and Davison,\n Andrew J.\n},\n title = {\n iMAP: Implicit Mapping and Positioning in Real-Time\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 6229-6238\n} \n}" }, { "title": "iNAS: Integral NAS for Device-Aware Salient Object Detection", @@ -47539,6 +50752,7 @@ "status": "Poster", "track": "main", "pid": 3856, + "author_site": "Yu-Chao Gu; Shang-Hua Gao; Xu-Sheng Cao; Peng Du; Shao-Ping Lu; Ming-Ming Cheng", "author": "Yu-Chao Gu; Shang-Hua Gao; Xu-Sheng Cao; Peng Du; Shao-Ping Lu; Ming-Ming Cheng", "abstract": "Existing salient object detection (SOD) models usually focus on either backbone feature extractors or saliency heads, ignoring their relations. 
A powerful backbone could still achieve sub-optimal performance with a weak saliency head and vice versa. Moreover, the balance between model performance and inference latency poses a great challenge to model design, especially when considering different deployment scenarios. Considering all components in an integral neural architecture search (iNAS) space, we propose a flexible device-aware search scheme that only trains the SOD model once and quickly finds high-performance but low-latency models on multiple devices. An evolution search with latency-group sampling (LGS) is proposed to explore the entire latency area of our enlarged search space. Models searched by iNAS achieve similar performance with SOTA methods but reduce the 3.8x, 3.3x, 2.6x, 1.9x latency on Huawei Nova6 SE, Intel Core CPU, the Jetson Nano, and Nvidia Titan Xp. The code is released at https://mmcheng.net/inas/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_iNAS_Integral_NAS_for_Device-Aware_Salient_Object_Detection_ICCV_2021_paper.pdf", @@ -47555,14 +50769,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Gu_iNAS_Integral_NAS_for_Device-Aware_Salient_Object_Detection_ICCV_2021_paper.html", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "Nankai University;Huawei", - "aff_unique_dep": "Computer Science;Huawei Technologies", + "aff_unique_norm": "Nankai University;Huawei Technologies", + "aff_unique_dep": "Computer Science;", "aff_unique_url": "http://www.nankai.edu.cn;https://www.huawei.com", "aff_unique_abbr": "Nankai U;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gu_2021_ICCV,\n \n author = {\n Gu,\n Yu-Chao and Gao,\n Shang-Hua and Cao,\n Xu-Sheng and Du,\n Peng and Lu,\n Shao-Ping and Cheng,\n Ming-Ming\n},\n title = {\n iNAS: Integral NAS for Device-Aware Salient Object 
Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 4934-4944\n} \n}" }, { "title": "iPOKE: Poking a Still Image for Controlled Stochastic Video Synthesis", @@ -47570,7 +50785,8 @@ "status": "Poster", "track": "main", "pid": 4284, - "author": "Andreas Blattmann; Timo Milbich; Michael Dorkenwald; Bj\u00f6rn Ommer", + "author_site": "Andreas Blattmann; Timo Milbich; Michael Dorkenwald; Björn Ommer", + "author": "Andreas Blattmann; Timo Milbich; Michael Dorkenwald; Björn Ommer", "abstract": "How would a static scene react to a local poke? What are the effects on other parts of an object if you could locally push it? There will be distinctive movement, despite evident variations caused by the stochastic nature of our world. These outcomes are governed by the characteristic kinematics of objects that dictate their overall motion caused by a local interaction. Conversely, the movement of an object provides crucial information about its underlying distinctive kinematics and the interdependencies between its parts. This two-way relation motivates learning a bijective mapping between object kinematics and plausible future image sequences. Therefore, we propose iPOKE -- invertible Prediction of Object Kinematics -- that, conditioned on an initial frame and a local poke, allows to sample object kinematics and establishes a one-to-one correspondence to the corresponding plausible videos, thereby providing a controlled stochastic video synthesis. In contrast to previous works, we do not generate arbitrary realistic videos, but provide efficient control of movements, while still capturing the stochastic nature of our environment and the diversity of plausible outcomes it entails. Moreover, our approach can transfer kinematics onto novel object instances and is not confined to particular object classes. 
Our project page is available at https://bit.ly/3dJN4Lf.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Blattmann_iPOKE_Poking_a_Still_Image_for_Controlled_Stochastic_Video_Synthesis_ICCV_2021_paper.pdf", "aff": ";;;", @@ -47584,7 +50800,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Blattmann_iPOKE_Poking_a_Still_Image_for_Controlled_Stochastic_Video_Synthesis_ICCV_2021_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2021/html/Blattmann_iPOKE_Poking_a_Still_Image_for_Controlled_Stochastic_Video_Synthesis_ICCV_2021_paper.html", + "bibtex": "@InProceedings{Blattmann_2021_ICCV,\n \n author = {\n Blattmann,\n Andreas and Milbich,\n Timo and Dorkenwald,\n Michael and Ommer,\n Bj\\"orn\n},\n title = {\n iPOKE: Poking a Still Image for Controlled Stochastic Video Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 14707-14717\n} \n}" }, { "title": "imGHUM: Implicit Generative Models of 3D Human Shape and Articulated Pose", @@ -47592,6 +50809,7 @@ "status": "Poster", "track": "main", "pid": 6149, + "author_site": "Thiemo Alldieck; Hongyi Xu; Cristian Sminchisescu", "author": "Thiemo Alldieck; Hongyi Xu; Cristian Sminchisescu", "abstract": "We present imGHUM, the first holistic generative model of 3D human shape and articulated pose, represented as a signed distance function. In contrast to prior work, we model the full human body implicitly as a function zero-level-set and without the use of an explicit template mesh. We propose a novel network architecture and a learning paradigm, which make it possible to learn a detailed implicit generative model of human pose, shape, and semantics, on par with state-of-the-art mesh-based models. 
Our model features desired detail for human models, such as articulated pose including hand motion and facial expressions, a broad spectrum of shape variations, and can be queried at arbitrary resolutions and spatial locations. Additionally, our model has attached spatial semantics making it straightforward to establish correspondences between different shape instances, thus enabling applications that are difficult to tackle using classical implicit representations. In extensive experiments, we demonstrate the model accuracy and its applicability to current research problems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Alldieck_imGHUM_Implicit_Generative_Models_of_3D_Human_Shape_and_Articulated_ICCV_2021_paper.pdf", @@ -47615,7 +50833,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Alldieck_2021_ICCV,\n \n author = {\n Alldieck,\n Thiemo and Xu,\n Hongyi and Sminchisescu,\n Cristian\n},\n title = {\n imGHUM: Implicit Generative Models of 3D Human Shape and Articulated Pose\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 5461-5470\n} \n}" }, { "title": "mDALU: Multi-Source Domain Adaptation and Label Unification With Partial Datasets", @@ -47623,6 +50842,7 @@ "status": "Poster", "track": "main", "pid": 3953, + "author_site": "Rui Gong; Dengxin Dai; Yuhua Chen; Wen Li; Luc Van Gool", "author": "Rui Gong; Dengxin Dai; Yuhua Chen; Wen Li; Luc Van Gool", "abstract": "One challenge of object recognition is to generalize to new domains, to more classes and/or to new modalities. This necessitates methods to combine and reuse existing datasets that may belong to different domains, have partial annotations, and/or have different data modalities. 
This paper formulates this as a multi-source domain adaptation and label unification problem, and proposes a novel method for it. Our method consists of a partially-supervised adaptation stage and a fully-supervised adaptation stage. In the former, partial knowledge is transferred from multiple source domains to the target domain and fused therein. Negative transfer between unmatching label spaces is mitigated via three new modules: domain attention, uncertainty maximization and attention-guided adversarial alignment. In the latter, knowledge is transferred in the unified label space after a label completion process with pseudo-labels. Extensive experiments on three different tasks - image classification, 2D semantic image segmentation, and joint 2D-3D semantic segmentation - show that our method outperforms all competing methods significantly.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Gong_mDALU_Multi-Source_Domain_Adaptation_and_Label_Unification_With_Partial_Datasets_ICCV_2021_paper.pdf", @@ -47646,7 +50866,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0;0+1;0;2;0+3", - "aff_country_unique": "Switzerland;Germany;China;Belgium" + "aff_country_unique": "Switzerland;Germany;China;Belgium", + "bibtex": "@InProceedings{Gong_2021_ICCV,\n \n author = {\n Gong,\n Rui and Dai,\n Dengxin and Chen,\n Yuhua and Li,\n Wen and Van Gool,\n Luc\n},\n title = {\n mDALU: Multi-Source Domain Adaptation and Label Unification With Partial Datasets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 8876-8885\n} \n}" }, { "title": "von Mises-Fisher Loss: An Exploration of Embedding Geometries for Supervised Learning", @@ -47654,6 +50875,7 @@ "status": "Poster", "track": "main", "pid": 9099, + "author_site": "Tyler R. Scott; Andrew C. Gallagher; Michael C. Mozer", "author": "Tyler R. 
Scott; Andrew C. Gallagher; Michael C. Mozer", "abstract": "Recent work has argued that classification losses utilizing softmax cross-entropy are superior not only for fixed-set classification tasks, but also by outperforming losses developed specifically for open-set tasks including few-shot learning and retrieval. Softmax classifiers have been studied using different embedding geometries---Euclidean, hyperbolic, and spherical---and claims have been made about the superiority of one or another, but they have not been systematically compared with careful controls. We conduct an empirical investigation of embedding geometry on softmax losses for a variety of fixed-set classification and image retrieval tasks. An interesting property observed for the spherical losses lead us to propose a probabilistic classifier based on the von Mises-Fisher distribution, and we show that it is competitive with state-of-the-art methods while producing improved out-of-the-box calibration. We provide guidance regarding the trade-offs between losses and how to choose among them.", "pdf": "https://openaccess.thecvf.com/content/ICCV2021/papers/Scott_von_Mises-Fisher_Loss_An_Exploration_of_Embedding_Geometries_for_Supervised_ICCV_2021_paper.pdf", @@ -47677,6 +50899,7 @@ "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Boulder;Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Scott_2021_ICCV,\n \n author = {\n Scott,\n Tyler R. and Gallagher,\n Andrew C. 
and Mozer,\n Michael C.\n},\n title = {\n von Mises-Fisher Loss: An Exploration of Embedding Geometries for Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2021\n},\n pages = {\n 10612-10622\n} \n}" } ] \ No newline at end of file diff --git a/iccv/iccv2023.json b/iccv/iccv2023.json index 3ee7ff0..b603fcd 100644 --- a/iccv/iccv2023.json +++ b/iccv/iccv2023.json @@ -20,7 +20,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_2D-3D_Interlaced_Transformer_for_Point_Cloud_Segmentation_with_Scene-Level_Supervision_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_2D-3D_Interlaced_Transformer_for_Point_Cloud_Segmentation_with_Scene-Level_Supervision_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Cheng-Kun and Chen,\n Min-Hung and Chuang,\n Yung-Yu and Lin,\n Yen-Yu\n},\n title = {\n 2D-3D Interlaced Transformer for Point Cloud Segmentation with Scene-Level Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 977-987\n} \n}" }, { "title": "2D3D-MATR: 2D-3D Matching Transformer for Detection-Free Registration Between Images and Point Clouds", @@ -52,7 +53,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Minhao and Qin,\n Zheng and Gao,\n Zhirui and Yi,\n Renjiao and Zhu,\n Chenyang and Guo,\n Yulan and Xu,\n Kai\n},\n title = {\n 2D3D-MATR: 2D-3D Matching Transformer for Detection-Free Registration Between Images and Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14128-14138\n} \n}" }, { "title": "360VOT: A New Benchmark Dataset for Omnidirectional Visual Object Tracking", @@ -84,7 +86,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Huajian and Xu,\n Yinzhe and Chen,\n Yingshu and Yeung,\n Sai-Kit\n},\n title = {\n 360VOT: A New Benchmark Dataset for Omnidirectional Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20566-20576\n} \n}" }, { "title": "3D Distillation: Improving Self-Supervised Monocular Depth Estimation on Reflective Surfaces", @@ -116,7 +119,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;1+2;0", - "aff_country_unique": "United States;United Kingdom;South Korea" + "aff_country_unique": "United States;United Kingdom;South Korea", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Xuepeng and Dikov,\n Georgi and Reitmayr,\n Gerhard and Kim,\n Tae-Kyun and Ghafoorian,\n Mohsen\n},\n title = {\n 3D Distillation: Improving Self-Supervised Monocular Depth Estimation on Reflective Surfaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9133-9143\n} \n}" }, { "title": "3D Human Mesh Recovery with Sequentially Global Rotation Estimation", @@ -148,7 +152,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n 
Wang,\n Dongkai and Zhang,\n Shiliang\n},\n title = {\n 3D Human Mesh Recovery with Sequentially Global Rotation Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14953-14962\n} \n}" }, { "title": "3D Implicit Transporter for Temporally Consistent Keypoint Discovery", @@ -171,7 +176,8 @@ "aff_domain": ";;;;;;;;;;;", "email": ";;;;;;;;;;;", "author_num": 12, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhong_3D_Implicit_Transporter_for_Temporally_Consistent_Keypoint_Discovery_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhong_3D_Implicit_Transporter_for_Temporally_Consistent_Keypoint_Discovery_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhong_2023_ICCV,\n \n author = {\n Zhong,\n Chengliang and Zheng,\n Yuhang and Zheng,\n Yupeng and Zhao,\n Hao and Yi,\n Li and Mu,\n Xiaodong and Wang,\n Ling and Li,\n Pengfei and Zhou,\n Guyue and Yang,\n Chao and Zhang,\n Xinliang and Zhao,\n Jian\n},\n title = {\n 3D Implicit Transporter for Temporally Consistent Keypoint Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3869-3880\n} \n}" }, { "title": "3D Instance Segmentation via Enhanced Spatial and Semantic Supervision", @@ -183,7 +189,7 @@ "author": "Salwa Al Khatib; Mohamed El Amine Boudjoghra; Jean Lahoud; Fahad Shahbaz Khan", "abstract": "3D instance segmentation has recently garnered increased attention. Typical deep learning methods adopt point grouping schemes followed by hand-designed geometric clustering. Inspired by the success of transformers for various 3D tasks, newer hybrid approaches have utilized transformer decoders coupled with convolutional backbones that operate on voxelized scenes. 
However, due to the nature of sparse feature backbones, the extracted features provided to the transformer decoder are lacking in spatial understanding. Thus, such approaches often predict spatially separate objects as single instances. To this end, we introduce a novel approach for 3D point clouds instance segmentation that addresses the challenge of generating distinct instance masks for objects that share similar appearances but are spatially separated. Our method leverages spatial and semantic supervision with query refinement to improve the performance of hybrid 3D instance segmentation models. Specifically, we provide the transformer block with spatial features to facilitate differentiation between similar object queries and incorporate semantic supervision to enhance prediction accuracy based on object class. Our proposed approach outperforms existing methods on the validation sets of ScanNet V2 and ScanNet200 datasets, establishing a new state-of-the-art for this task.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Al_Khatib_3D_Instance_Segmentation_via_Enhanced_Spatial_and_Semantic_Supervision_ICCV_2023_paper.pdf", - "aff": "Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE; Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE; Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE; Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE + Link\u00f6ping University, Sweden", + "aff": "Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE; Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE; Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE; Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), UAE + Linköping University, Sweden", "project": "", "github": "", "supp": "", @@ -196,14 +202,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Al_Khatib_3D_Instance_Segmentation_via_Enhanced_Spatial_and_Semantic_Supervision_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Link\u00f6ping University", + "aff_unique_norm": "Mohamed Bin Zayed University of Artificial Intelligence;Linköping University", "aff_unique_dep": ";", "aff_unique_url": "https://www.mbzuai.ac.ae;https://www.liu.se", "aff_unique_abbr": "MBZUAI;LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1", - "aff_country_unique": "United Arab Emirates;Sweden" + "aff_country_unique": "United Arab Emirates;Sweden", + "bibtex": "@InProceedings{Al_Khatib_2023_ICCV,\n \n author = {\n Al Khatib,\n Salwa and El Amine Boudjoghra,\n Mohamed and Lahoud,\n Jean and Khan,\n Fahad Shahbaz\n},\n title = {\n 3D Instance Segmentation via Enhanced Spatial and Semantic Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 541-550\n} \n}" }, { "title": "3D Motion Magnification: Visualizing Subtle Motions from Time-Varying Radiance Fields", @@ -235,7 +242,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Cambridge", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Brandon Y. and Alzayer,\n Hadi and Rubinstein,\n Michael and Freeman,\n William T. 
and Huang,\n Jia-bin\n},\n title = {\n 3D Motion Magnification: Visualizing Subtle Motions from Time-Varying Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9837-9846\n} \n}" }, { "title": "3D Neural Embedding Likelihood: Probabilistic Inverse Graphics for Robust 6D Pose Estimation", @@ -243,8 +251,8 @@ "status": "Poster", "track": "main", "pid": "10277", - "author_site": "Guangyao Zhou, Nishad Gothoskar, Lirui Wang, Joshua B. Tenenbaum, Dan Gutfreund, Miguel L\u00e1zaro-Gredilla, Dileep George, Vikash K. Mansinghka", - "author": "Guangyao Zhou; Nishad Gothoskar; Lirui Wang; Joshua B. Tenenbaum; Dan Gutfreund; Miguel L\u00e1zaro-Gredilla; Dileep George; Vikash K. Mansinghka", + "author_site": "Guangyao Zhou, Nishad Gothoskar, Lirui Wang, Joshua B. Tenenbaum, Dan Gutfreund, Miguel Lázaro-Gredilla, Dileep George, Vikash K. Mansinghka", + "author": "Guangyao Zhou; Nishad Gothoskar; Lirui Wang; Joshua B. Tenenbaum; Dan Gutfreund; Miguel Lázaro-Gredilla; Dileep George; Vikash K. Mansinghka", "abstract": "The ability to perceive and understand 3D scenes is crucial for many applications in computer vision and robotics. Inverse graphics is an appealing approach to 3D scene understanding that aims to infer the 3D scene structure from 2D images. In this paper, we introduce probabilistic modeling to the inverse graphics framework to quantify uncertainty and achieve robustness in 6D pose estimation tasks. Specifically, we propose 3D Neural Embedding Likelihood (3DNEL) as a unified probabilistic model over RGB-D images, and develop efficient inference procedures on 3D scene descriptions. 3DNEL effectively combines learned neural embeddings from RGB with depth information to improve robustness in sim-to-real 6D object pose estimation from RGB-D images. 
Performance on the YCB-Video dataset is on par with state-of-the-art yet is much more robust in challenging regimes. In contrast to discriminative approaches, 3DNEL's probabilistic generative formulation jointly models multiple objects in a scene, quantifies uncertainty in a principled way, and handles object pose tracking under heavy occlusion. Finally, 3DNEL provides a principled framework for incorporating prior knowledge about the scene and objects, which allows natural extension to additional tasks like camera pose tracking from video.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhou_3D_Neural_Embedding_Likelihood_Probabilistic_Inverse_Graphics_for_Robust_6D_ICCV_2023_paper.pdf", "aff": "Google DeepMind; MIT; MIT; MIT; MIT-IBM Watson AI Lab; Google DeepMind; Google DeepMind; MIT", @@ -261,13 +269,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_3D_Neural_Embedding_Likelihood_Probabilistic_Inverse_Graphics_for_Robust_6D_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;0;0;1", "aff_unique_norm": "Google;Massachusetts Institute of Technology", - "aff_unique_dep": "Google DeepMind;", + "aff_unique_dep": "DeepMind;", "aff_unique_url": "https://deepmind.com;https://web.mit.edu", "aff_unique_abbr": "DeepMind;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;0;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Guangyao and Gothoskar,\n Nishad and Wang,\n Lirui and Tenenbaum,\n Joshua B. 
and Gutfreund,\n Dan and L\\'azaro-Gredilla,\n Miguel and George,\n Dileep and Mansinghka,\n Vikash K.\n},\n title = {\n 3D Neural Embedding Likelihood: Probabilistic Inverse Graphics for Robust 6D Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21625-21636\n} \n}" }, { "title": "3D Segmentation of Humans in Point Clouds with Synthetic Data", @@ -275,11 +284,11 @@ "status": "Poster", "track": "main", "pid": "4949", - "author_site": "Ay\u00e7a Takmaz, Jonas Schult, Irem Kaftan, Mertcan Ak\u00e7ay, Bastian Leibe, Robert Sumner, Francis Engelmann, Siyu Tang", - "author": "Ay\u00e7a Takmaz; Jonas Schult; Irem Kaftan; Mertcan Ak\u00e7ay; Bastian Leibe; Robert Sumner; Francis Engelmann; Siyu Tang", + "author_site": "Ayça Takmaz, Jonas Schult, Irem Kaftan, Mertcan Akçay, Bastian Leibe, Robert Sumner, Francis Engelmann, Siyu Tang", + "author": "Ayça Takmaz; Jonas Schult; Irem Kaftan; Mertcan Akçay; Bastian Leibe; Robert Sumner; Francis Engelmann; Siyu Tang", "abstract": "Segmenting humans in 3D indoor scenes has become increasingly important with the rise of human-centered robotics and AR/VR applications. To this end, we propose the task of joint 3D human semantic segmentation, instance segmentation and multi-human body-part segmentation. Few works have attempted to directly segment humans in cluttered 3D scenes, which is largely due to the lack of annotated training data of humans interacting with 3D scenes. We address this challenge and propose a framework for generating training data of synthetic humans interacting with real 3D scenes. Furthermore, we propose a novel transformer-based model, Human3D, which is the first end-to-end model for segmenting multiple human instances and their body-parts in a unified manner. 
The key advantage of our synthetic data generation framework is its ability to generate diverse and realistic human-scene interactions, with highly accurate ground truth. Our experiments show that pre-training on synthetic data improves performance on a wide variety of 3D human segmentation tasks. Finally, we demonstrate that Human3D outperforms even task-specific state-of-the-art 3D segmentation methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Takmaz_3D_Segmentation_of_Humans_in_Point_Clouds_with_Synthetic_Data_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich, Switzerland; RWTH Aachen University, Germany; ETH Z\u00fcrich, Switzerland; ETH Z\u00fcrich, Switzerland; RWTH Aachen University, Germany; ETH Z\u00fcrich, Switzerland; ETH AI Center, Switzerland; ETH Z\u00fcrich, Switzerland", + "aff": "ETH Zürich, Switzerland; RWTH Aachen University, Germany; ETH Zürich, Switzerland; ETH Zürich, Switzerland; RWTH Aachen University, Germany; ETH Zürich, Switzerland; ETH AI Center, Switzerland; ETH Zürich, Switzerland", "project": "https://human-3d.github.io", "github": "https://github.com/human-3d", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Takmaz_3D_Segmentation_of_ICCV_2023_supplemental.zip", @@ -291,15 +300,16 @@ "email": "ethz.ch;rwth-aachen.de;ethz.ch;ethz.ch;rwth-aachen.de;ethz.ch;ethz.ch;ethz.ch", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Takmaz_3D_Segmentation_of_Humans_in_Point_Clouds_with_Synthetic_Data_ICCV_2023_paper.html", - "aff_unique_index": "0;1;0;0;1;0;0;0", - "aff_unique_norm": "ETH Zurich;RWTH Aachen University", - "aff_unique_dep": ";", - "aff_unique_url": "https://www.ethz.ch;https://www.rwth-aachen.de", - "aff_unique_abbr": "ETHZ;RWTH", + "aff_unique_index": "0;1;0;0;1;0;2;0", + "aff_unique_norm": "ETH Zürich;RWTH Aachen University;ETH Zurich", + "aff_unique_dep": ";;ETH AI Center", + "aff_unique_url": 
"https://www.ethz.ch;https://www.rwth-aachen.de;https://www.ethz.ch", + "aff_unique_abbr": "ETHZ;RWTH;ETH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0;0;0", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Takmaz_2023_ICCV,\n \n author = {\n Takmaz,\n Ay\\c{c\n}a and Schult,\n Jonas and Kaftan,\n Irem and Ak\\c{c\n}ay,\n Mertcan and Leibe,\n Bastian and Sumner,\n Robert and Engelmann,\n Francis and Tang,\n Siyu\n},\n title = {\n 3D Segmentation of Humans in Point Clouds with Synthetic Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1292-1304\n} \n}" }, { "title": "3D Semantic Subspace Traverser: Empowering 3D Generative Model with Shape Editing Capability", @@ -331,7 +341,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ruowei and Liu,\n Yu and Su,\n Pei and Zhang,\n Jianwei and Zhao,\n Qijun\n},\n title = {\n 3D Semantic Subspace Traverser: Empowering 3D Generative Model with Shape Editing Capability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14406-14417\n} \n}" }, { "title": "3D VR Sketch Guided 3D Shape Prototyping and Exploration", @@ -363,7 +374,8 @@ "aff_campus_unique_index": ";;;;1", "aff_campus_unique": ";Surrey", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Ling and Chowdhury,\n Pinaki Nath and Xiang,\n Tao and Song,\n Yi-Zhe and Gryaditskaya,\n 
Yulia\n},\n title = {\n 3D VR Sketch Guided 3D Shape Prototyping and Exploration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9267-9276\n} \n}" }, { "title": "3D-Aware Generative Model for Improved Side-View Image Synthesis", @@ -395,7 +407,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jo_2023_ICCV,\n \n author = {\n Jo,\n Kyungmin and Jin,\n Wonjoon and Choo,\n Jaegul and Lee,\n Hyunjoon and Cho,\n Sunghyun\n},\n title = {\n 3D-Aware Generative Model for Improved Side-View Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22862-22872\n} \n}" }, { "title": "3D-Aware Neural Body Fitting for Occlusion Robust 3D Human Pose Estimation", @@ -427,7 +440,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;2+2;0", - "aff_country_unique": "United States;China;Germany" + "aff_country_unique": "United States;China;Germany", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yi and Ji,\n Pengliang and Wang,\n Angtian and Mei,\n Jieru and Kortylewski,\n Adam and Yuille,\n Alan\n},\n title = {\n 3D-Aware Neural Body Fitting for Occlusion Robust 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9399-9410\n} \n}" }, { "title": "3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment", @@ -459,7 +473,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Ziyu and Ma,\n Xiaojian and Chen,\n Yixin and Deng,\n Zhidong and Huang,\n Siyuan and Li,\n Qing\n},\n title = {\n 3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2911-2921\n} \n}" }, { "title": "3D-aware Blending with Generative NeRFs", @@ -491,7 +506,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Hyunsu and Lee,\n Gayoung and Choi,\n Yunjey and Kim,\n Jin-Hwa and Zhu,\n Jun-Yan\n},\n title = {\n 3D-aware Blending with Generative NeRFs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22906-22918\n} \n}" }, { "title": "3D-aware Image Generation using 2D Diffusion Models", @@ -516,14 +532,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiang_3D-aware_Image_Generation_using_2D_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;2+1;1", - "aff_unique_norm": "Tsinghua University;Microsoft;ShanghaiTech University", + "aff_unique_norm": "Tsinghua University;Microsoft Research;ShanghaiTech University", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "THU;MSR Asia;ShanghaiTech", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Xiang_2023_ICCV,\n \n author = {\n Xiang,\n Jianfeng and Yang,\n Jiaolong and Huang,\n Binbin and Tong,\n Xin\n},\n title = {\n 3D-aware Image Generation using 2D Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2383-2393\n} \n}" }, { "title": "3DHacker: Spectrum-based Decision Boundary Generation for Hard-label 3D Point Cloud Attack", @@ -531,6 +548,7 @@ "status": "Poster", "track": "main", "pid": "2530", + "author_site": "Yunbo Tao, Daizong Liu, Pan Zhou, Yulai Xie, Wei Du, Wei Hu", "author": "Yunbo Tao, Daizong Liu, Pan Zhou, Yulai Xie, Wei Du, Wei Hu", "abstract": "With the maturity of depth sensors, the vulnerability of 3D point cloud models has received increasing attention in various applications such as autonomous driving and robot navigation. Previous 3D adversarial attackers either follow the white-box setting to iteratively update the coordinate perturbations based on gradients, or utilize the output model logits to estimate noisy gradients in the black-box setting. However, these attack methods are hard to be deployed in real-world scenarios since realistic 3D applications will not share any model details to users. Therefore, we explore a more challenging yet practical 3D attack setting, i.e., attacking point clouds with black-box hard labels, in which the attacker can only have access to the prediction label of the input. To tackle this setting, we propose a novel 3D attack method, termed 3D Hard-label attacker (3DHacker), based on the developed decision boundary algorithm to generate adversarial samples solely with the knowledge of class labels. 
Specifically, to construct the class-aware model decision boundary, 3DHacker first randomly fuses two point clouds of different classes in the spectral domain to craft their intermediate sample with high imperceptibility, then projects it onto the decision boundary via binary search. To restrict the final perturbation size, 3DHacker further introduces an iterative optimization strategy to move the intermediate sample along the decision boundary for generating adversarial point clouds with smallest trivial perturbations. Extensive evaluations show that, even in the challenging hard-label setting, 3DHacker still competitively outperforms existing 3D attacks regarding the attack performance as well as adversary quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tao_3DHacker_Spectrum-based_Decision_Boundary_Generation_for_Hard-label_3D_Point_Cloud_ICCV_2023_paper.pdf", @@ -542,7 +560,8 @@ "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15468857805593799613&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tao_3DHacker_Spectrum-based_Decision_Boundary_Generation_for_Hard-label_3D_Point_Cloud_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tao_3DHacker_Spectrum-based_Decision_Boundary_Generation_for_Hard-label_3D_Point_Cloud_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Tao_2023_ICCV,\n \n author = {\n Tao,\n Yunbo and Liu,\n Daizong and Zhou,\n Pan and Xie,\n Yulai and Du,\n Wei and Hu,\n Wei\n},\n title = {\n 3DHacker: Spectrum-based Decision Boundary Generation for Hard-label 3D Point Cloud Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14340-14350\n} \n}" }, { "title": "3DHumanGAN: 3D-Aware Human Image Generation with 3D Pose Mapping", @@ -574,7 +593,8 @@ "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Zhuoqian and Li,\n Shikai and Wu,\n Wayne and Dai,\n Bo\n},\n title = {\n 3DHumanGAN: 3D-Aware Human Image Generation with 3D Pose Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23008-23019\n} \n}" }, { "title": "3DMOTFormer: Graph Transformer for Online 3D Multi-Object Tracking", @@ -606,7 +626,8 @@ "aff_campus_unique_index": "0+1;2;0;0;1", "aff_campus_unique": "Sindelfingen;Bonn;Stuttgart;", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n Shuxiao and Rehder,\n Eike and Schneider,\n Lukas and Cordts,\n Marius and Gall,\n Juergen\n},\n title = {\n 3DMOTFormer: Graph Transformer for Online 3D Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9784-9794\n} \n}" }, { "title": "3DMiner: Discovering Shapes from Large-Scale Unannotated Image Datasets", @@ -614,8 +635,8 @@ "status": "Poster", "track": "main", "pid": "6423", - "author_site": "Ta-Ying Cheng, Matheus Gadelha, S\u00f6ren Pirk, Thibault Groueix, Radom\u00edr M?ch, Andrew Markham, Niki Trigoni", - "author": "Ta-Ying Cheng; Matheus Gadelha; S\u00f6ren Pirk; Thibault Groueix; Radom\u00edr M\u011bch; Andrew Markham; Niki Trigoni", + "author_site": "Ta-Ying Cheng, Matheus Gadelha, Sören Pirk, Thibault Groueix, Radomír M?ch, Andrew Markham, Niki Trigoni", + "author": "Ta-Ying Cheng; Matheus Gadelha; Sören Pirk; Thibault Groueix; Radomír Měch; Andrew Markham; Niki Trigoni", "abstract": "We present 
3DMiner -- a pipeline for mining 3D shapes from challenging large-scale unannotated image datasets. Unlike other unsupervised 3D reconstruction methods, we assume that, within a large-enough dataset, there must exist images of objects with similar shapes but varying backgrounds, textures, and viewpoints. Our approach leverages the recent advances in learning self-supervised image representations to cluster images with geometrically similar shapes and find common image correspondences between them. We then exploit these correspondences to obtain rough camera estimates as initialization for bundle-adjustment. Finally, for every image cluster, we apply a progressive bundle-adjusting reconstruction method to learn a neural occupancy field representing the underlying shape. We show that this procedure is robust to several types of errors introduced in previous steps (e.g., wrong camera poses, images containing dissimilar shapes, etc.), allowing us to obtain shape and pose annotations for images in-the-wild.\n When using images from Pix3D chairs, our method is capable of producing significantly better results than state-of-the-art unsupervised 3D\n reconstruction techniques, both quantitatively and qualitatively. 
Furthermore, we show how 3DMiner can be applied to in-the-wild data by reconstructing shapes present in images from the LAION-5B dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Cheng_3DMiner_Discovering_Shapes_from_Large-Scale_Unannotated_Image_Datasets_ICCV_2023_paper.pdf", "aff": "University of Oxford; Adobe Research; Adobe Research; Adobe Research; Adobe Research; University of Oxford; University of Oxford", @@ -638,7 +659,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;0", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Ta-Ying and Gadelha,\n Matheus and Pirk,\n S\\"oren and Groueix,\n Thibault and M\\v{e\n}ch,\n Radom{\\'\\i\n}r and Markham,\n Andrew and Trigoni,\n Niki\n},\n title = {\n 3DMiner: Discovering Shapes from Large-Scale Unannotated Image Datasets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9331-9341\n} \n}" }, { "title": "3DPPE: 3D Point Positional Encoding for Transformer-based Multi-Camera 3D Object Detection", @@ -650,7 +672,7 @@ "author": "Changyong Shu; Jiajun Deng; Fisher Yu; Yifan Liu", "abstract": "Transformer-based methods have swept the benchmarks on 2D and 3D detection on images. Because tokenization before the attention mechanism drops the spatial information, positional encoding becomes critical for those methods. Recent works found that encodings based on samples of the 3D viewing rays can significantly improve the quality of multi-camera 3D object detection. We hypothesize that 3D point locations can provide more information than rays. Therefore, we introduce 3D point positional encoding, 3DPPE, to the 3D detection Transformer decoder. 
Although 3D measurements are not available at the inference time of monocular 3D object detection, 3DPPE uses predicted depth to approximate the real point positions. Our hybrid-depth module combines direct and categorical depth to estimate the refined depth of each pixel. Despite the approximation, 3DPPE achieves 46.0 mAP and 51.4 NDS on the competitive nuScenes dataset, significantly outperforming encodings based on ray samples. The codes are available at https://github.com/drilistbox/3DPPE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shu_3DPPE_3D_Point_Positional_Encoding_for_Transformer-based_Multi-Camera_3D_Object_ICCV_2023_paper.pdf", - "aff": "Houmo AI; University of Sydney; ETH Z\u00fcrich; University of Adelaide", + "aff": "Houmo AI; University of Sydney; ETH Zürich; University of Adelaide", "project": "", "github": "https://github.com/drilistbox/3DPPE", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Shu_3DPPE_3D_Point_ICCV_2023_supplemental.pdf", @@ -663,14 +685,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shu_3DPPE_3D_Point_Positional_Encoding_for_Transformer-based_Multi-Camera_3D_Object_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3", - "aff_unique_norm": "Houmo AI;University of Sydney;ETH Zurich;University of Adelaide", + "aff_unique_norm": "Houmo AI;University of Sydney;ETH Zürich;University of Adelaide", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.houmo.ai;https://www.sydney.edu.au;https://www.ethz.ch;https://www.adelaide.edu.au", "aff_unique_abbr": "Houmo AI;USYD;ETHZ;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1", - "aff_country_unique": "China;Australia;Switzerland" + "aff_country_unique": "China;Australia;Switzerland", + "bibtex": "@InProceedings{Shu_2023_ICCV,\n \n author = {\n Shu,\n Changyong and Deng,\n Jiajun and Yu,\n Fisher and Liu,\n Yifan\n},\n title = {\n 3DPPE: 3D Point Positional 
Encoding for Transformer-based Multi-Camera 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3580-3589\n} \n}" }, { "title": "4D Myocardium Reconstruction with Decoupled Motion and Shape Model", @@ -702,7 +725,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Xiaohan and Liu,\n Cong and Wang,\n Yangang\n},\n title = {\n 4D Myocardium Reconstruction with Decoupled Motion and Shape Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21252-21262\n} \n}" }, { "title": "4D Panoptic Segmentation as Invariant and Equivariant Field Prediction", @@ -734,7 +758,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Minghan and Han,\n Shizhong and Cai,\n Hong and Borse,\n Shubhankar and Ghaffari,\n Maani and Porikli,\n Fatih\n},\n title = {\n 4D Panoptic Segmentation as Invariant and Equivariant Field Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22488-22498\n} \n}" }, { "title": "A 5-Point Minimal Solver for Event Camera Relative Motion Estimation", @@ -766,7 +791,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;1;1;1;0", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": 
"@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Ling and Su,\n Hang and Gehrig,\n Daniel and Cannici,\n Marco and Scaramuzza,\n Davide and Kneip,\n Laurent\n},\n title = {\n A 5-Point Minimal Solver for Event Camera Relative Motion Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8049-8059\n} \n}" }, { "title": "A Benchmark for Chinese-English Scene Text Image Super-Resolution", @@ -791,14 +817,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_A_Benchmark_for_Chinese-English_Scene_Text_Image_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0", - "aff_unique_norm": "Hong Kong Polytechnic University;OPPO", + "aff_unique_norm": "The Hong Kong Polytechnic University;OPPO", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.polyu.edu.hk;https://www.oppo.com", "aff_unique_abbr": "PolyU;OPPO", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Jianqi and Liang,\n Zhetong and Xiang,\n Wangmeng and Yang,\n Xi and Zhang,\n Lei\n},\n title = {\n A Benchmark for Chinese-English Scene Text Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19452-19461\n} \n}" }, { "title": "A Complete Recipe for Diffusion Generative Models", @@ -824,13 +851,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pandey_A_Complete_Recipe_for_Diffusion_Generative_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Irvine", - "aff_unique_dep": "Dept. 
of Computer Science", + "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.uci.edu", "aff_unique_abbr": "UCI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pandey_2023_ICCV,\n \n author = {\n Pandey,\n Kushagra and Mandt,\n Stephan\n},\n title = {\n A Complete Recipe for Diffusion Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4261-4272\n} \n}" }, { "title": "A Dynamic Dual-Processing Object Detection Framework Inspired by the Brain's Recognition Mechanism", @@ -853,7 +881,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_A_Dynamic_Dual-Processing_Object_Detection_Framework_Inspired_by_the_Brains_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_A_Dynamic_Dual-Processing_Object_Detection_Framework_Inspired_by_the_Brains_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Minying and Bu,\n Tianpeng and Hu,\n Lulu\n},\n title = {\n A Dynamic Dual-Processing Object Detection Framework Inspired by the Brain's Recognition Mechanism\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6264-6274\n} \n}" }, { "title": "A Fast Unified System for 3D Object Detection and Tracking", @@ -885,7 +914,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Wien", "aff_country_unique_index": "0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Heitzinger_2023_ICCV,\n \n author = {\n Heitzinger,\n Thomas and Kampel,\n Martin\n},\n 
title = {\n A Fast Unified System for 3D Object Detection and Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17044-17054\n} \n}" }, { "title": "A Game of Bundle Adjustment - Learning Efficient Convergence", @@ -917,7 +947,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Belder_2023_ICCV,\n \n author = {\n Belder,\n Amir and Vivanti,\n Refael and Tal,\n Ayellet\n},\n title = {\n A Game of Bundle Adjustment - Learning Efficient Convergence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8428-8437\n} \n}" }, { "title": "A Generalist Framework for Panoptic Segmentation of Images and Videos", @@ -949,7 +980,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ting and Li,\n Lala and Saxena,\n Saurabh and Hinton,\n Geoffrey and Fleet,\n David J.\n},\n title = {\n A Generalist Framework for Panoptic Segmentation of Images and Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 909-919\n} \n}" }, { "title": "A Good Student is Cooperative and Reliable: CNN-Transformer Collaborative Learning for Semantic Segmentation", @@ -981,7 +1013,8 @@ "aff_campus_unique_index": "0;0;0+2", "aff_campus_unique": "Guangzhou;;Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": 
"China;United States", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Jinjing and Luo,\n Yunhao and Zheng,\n Xu and Wang,\n Hao and Wang,\n Lin\n},\n title = {\n A Good Student is Cooperative and Reliable: CNN-Transformer Collaborative Learning for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11720-11730\n} \n}" }, { "title": "A Large-Scale Outdoor Multi-Modal Dataset and Benchmark for Novel View Synthesis and Implicit Scene Reconstruction", @@ -1013,7 +1046,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Chongshan and Yin,\n Fukun and Chen,\n Xin and Liu,\n Wen and Chen,\n Tao and Yu,\n Gang and Fan,\n Jiayuan\n},\n title = {\n A Large-Scale Outdoor Multi-Modal Dataset and Benchmark for Novel View Synthesis and Implicit Scene Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7557-7567\n} \n}" }, { "title": "A Large-scale Study of Spatiotemporal Representation Learning with a New Benchmark on Action Recognition", @@ -1042,10 +1076,11 @@ "aff_unique_dep": "Center for Research in Computer Vision", "aff_unique_url": "https://www.ucf.edu", "aff_unique_abbr": "UCF", - "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Central Florida", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n Andong and Yang,\n Taojiannan and Chen,\n Chen\n},\n title = {\n A Large-scale Study of Spatiotemporal 
Representation Learning with a New Benchmark on Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20519-20531\n} \n}" }, { "title": "A Latent Space of Stochastic Diffusion Models for Zero-Shot Image Editing and Guidance", @@ -1077,7 +1112,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Chen Henry and De la Torre,\n Fernando\n},\n title = {\n A Latent Space of Stochastic Diffusion Models for Zero-Shot Image Editing and Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7378-7387\n} \n}" }, { "title": "A Low-Shot Object Counting Network With Iterative Prototype Adaptation", @@ -1085,8 +1121,8 @@ "status": "Poster", "track": "main", "pid": "12275", - "author_site": "Nikola ?uki?, Alan Luke\u009ei?, Vitjan Zavrtanik, Matej Kristan", - "author": "Nikola \u0110uki\u0107; Alan Luke\u017ei\u010d; Vitjan Zavrtanik; Matej Kristan", + "author_site": "Nikola ?uki?, Alan Lukeži?, Vitjan Zavrtanik, Matej Kristan", + "author": "Nikola Đukić; Alan Lukežič; Vitjan Zavrtanik; Matej Kristan", "abstract": "We consider low-shot counting of arbitrary semantic categories in the image using only few annotated exemplars (few-shot) or no exemplars (no-shot). The standard few-shot pipeline follows extraction of appearance queries from exemplars and matching them with image features to infer the object counts. Existing methods extract queries by feature pooling which neglects the shape information (e.g., size and aspect) and leads to a reduced object localization accuracy and count estimates. 
We propose a Low-shot Object Counting network with iterative prototype Adaptation (LOCA). Our main contribution is the new object prototype extraction module, which iteratively fuses the exemplar shape and appearance information with image features. The module is easily adapted to zero-shot scenarios, enabling LOCA to cover the entire spectrum of low-shot counting problems. LOCA outperforms all recent state-of-the-art methods on FSC147 benchmark by 20-30% in RMSE on one-shot and few-shot and achieves state-of-the-art on zero-shot scenarios, while demonstrating better generalization capabilities. The code and models are available here: https://github.com/djukicn/loca.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Dukic_A_Low-Shot_Object_Counting_Network_With_Iterative_Prototype_Adaptation_ICCV_2023_paper.pdf", "aff": "Faculty of Computer and Information Science, University of Ljubljana, Slovenia; Faculty of Computer and Information Science, University of Ljubljana, Slovenia; Faculty of Computer and Information Science, University of Ljubljana, Slovenia; Faculty of Computer and Information Science, University of Ljubljana, Slovenia", @@ -1109,7 +1145,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Slovenia" + "aff_country_unique": "Slovenia", + "bibtex": "@InProceedings{Dukic_2023_ICCV,\n \n author = {\n {\\DJ\n}uki\\'c,\n Nikola and Luke\\v{z\n}i\\v{c\n},\n Alan and Zavrtanik,\n Vitjan and Kristan,\n Matej\n},\n title = {\n A Low-Shot Object Counting Network With Iterative Prototype Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18872-18881\n} \n}" }, { "title": "A Multidimensional Analysis of Social Biases in Vision Transformers", @@ -1141,7 +1178,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", 
- "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Brinkmann_2023_ICCV,\n \n author = {\n Brinkmann,\n Jannik and Swoboda,\n Paul and Bartelt,\n Christian\n},\n title = {\n A Multidimensional Analysis of Social Biases in Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4914-4923\n} \n}" }, { "title": "A Parse-Then-Place Approach for Generating Graphic Layouts from Textual Descriptions", @@ -1153,7 +1191,7 @@ "author": "Jiawei Lin; Jiaqi Guo; Shizhao Sun; Weijiang Xu; Ting Liu; Jian-Guang Lou; Dongmei Zhang", "abstract": "Creating layouts is a fundamental step in graphic design. In this work, we propose to use text as the guidance to create graphic layouts, i.e., Text-to-Layout, aiming to lower the design barriers. Text-to-Layout is a challenging task, because it needs to consider the implicit, combined, and incomplete layout constraints from text, each of which has not been studied in previous work. To address this, we present a two-stage approach, named parse-then-place. The approach introduces an intermediate representation (IR) between text and layout to represent diverse layout constraints. With IR, Text-to-Layout is decomposed into a parse stage and a place stage. The parse stage takes a textual description as input and generates an IR, in which the implicit constraints from the text are transformed into explicit ones. The place stage generates layouts based on the IR. To model combined and incomplete constraints, we use a Transformer-based layout generation model and carefully design a way to represent constraints and layouts as sequences. Besides, we adopt the pretrain-then-finetune strategy to boost the performance of the layout generation model with large-scale unlabeled layouts. 
To evaluate our approach, we construct two Text-to-Layout datasets and conduct experiments on them. Quantitative results, qualitative analysis, and user studies demonstrate our approach's effectiveness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lin_A_Parse-Then-Place_Approach_for_Generating_Graphic_Layouts_from_Textual_Descriptions_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; Microsoft Research Asia; Xi\u2019an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia", + "aff": "Xi’an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; Microsoft Research Asia; Xi’an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Lin_A_Parse-Then-Place_Approach_ICCV_2023_supplemental.pdf", @@ -1166,14 +1204,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_A_Parse-Then-Place_Approach_for_Generating_Graphic_Layouts_from_Textual_Descriptions_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;0;1;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Microsoft", + "aff_unique_norm": "Xi'an Jiaotong University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "XJTU;MSR Asia", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Jiawei and Guo,\n Jiaqi and Sun,\n Shizhao and Xu,\n Weijiang and Liu,\n Ting and Lou,\n Jian-Guang and Zhang,\n Dongmei\n},\n title = {\n A Parse-Then-Place Approach for Generating Graphic Layouts from Textual Descriptions\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23622-23631\n} \n}" }, { "title": "A Retrospect to Multi-prompt Learning across Vision and Language", @@ -1205,7 +1244,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ziliang and Huang,\n Xin and Guan,\n Quanlong and Lin,\n Liang and Luo,\n Weiqi\n},\n title = {\n A Retrospect to Multi-prompt Learning across Vision and Language\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22190-22201\n} \n}" }, { "title": "A Sentence Speaks a Thousand Images: Domain Generalization through Distilling CLIP with Language Guidance", @@ -1230,14 +1270,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_A_Sentence_Speaks_a_Thousand_Images_Domain_Generalization_through_Distilling_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0;1;0", - "aff_unique_norm": "University of Wisconsin-Madison;University of Illinois Urbana-Champaign;Imperial College London", + "aff_unique_norm": "University of Wisconsin-Madison;University of Illinois at Urbana-Champaign;Imperial College London", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://illinois.edu;https://www.imperial.ac.uk", "aff_unique_abbr": "UW-Madison;UIUC;ICL", "aff_campus_unique_index": "0;1;0;1;0", "aff_campus_unique": "Madison;Urbana-Champaign;", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zeyi and Zhou,\n Andy and Ling,\n Zijian and Cai,\n Mu and Wang,\n Haohan and Lee,\n Yong 
Jae\n},\n title = {\n A Sentence Speaks a Thousand Images: Domain Generalization through Distilling CLIP with Language Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11685-11695\n} \n}" }, { "title": "A Simple Framework for Open-Vocabulary Segmentation and Detection", @@ -1262,14 +1303,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_A_Simple_Framework_for_Open-Vocabulary_Segmentation_and_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;4;1", - "aff_unique_norm": "Hong Kong University of Science and Technology;International Digital Economy Academy;University of Wisconsin-Madison;Tsinghua University;Microsoft", + "aff_unique_norm": "Hong Kong University of Science and Technology;International Digital Economy Academy;University of Wisconsin-Madison;Tsinghua University;Microsoft Research", "aff_unique_dep": ";;;Dept. 
of CST.;Research", "aff_unique_url": "https://www.ust.hk;;https://www.wisc.edu;https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "HKUST;IDEA;UW-Madison;THU;MSR", "aff_campus_unique_index": "0;2;3;3", "aff_campus_unique": "Hong Kong SAR;;Madison;Redmond", "aff_country_unique_index": "0;2;0;2;2", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Hao and Li,\n Feng and Zou,\n Xueyan and Liu,\n Shilong and Li,\n Chunyuan and Yang,\n Jianwei and Zhang,\n Lei\n},\n title = {\n A Simple Framework for Open-Vocabulary Segmentation and Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1020-1031\n} \n}" }, { "title": "A Simple Recipe to Meta-Learn Forward and Backward Transfer", @@ -1281,7 +1323,7 @@ "author": "Edoardo Cetin; Antonio Carta; Oya Celiktutan", "abstract": "Meta-learning holds the potential to provide a general and explicit solution to tackle interference and forgetting in continual learning. However, many popular algorithms introduce expensive and unstable optimization processes with new key hyper-parameters and requirements, hindering their applicability. We propose a new, general, and simple meta-learning algorithm for continual learning (SiM4C) that explicitly optimizes to minimize forgetting and facilitate forward transfer. We show our method is stable, introduces only minimal computational overhead, and can be integrated with any memory-based continual learning algorithm in only a few lines of code. SiM4C meta-learns how to effectively continually learn even on very long task sequences, largely outperforming prior meta-approaches. 
Naively integrating with existing memory-based algorithms, we also record universal performance benefits and state-of-the-art results across different visual classification benchmarks without introducing new hyper-parameters.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Cetin_A_Simple_Recipe_to_Meta-Learn_Forward_and_Backward_Transfer_ICCV_2023_paper.pdf", - "aff": "King\u2019s College London, Department of Engineering; University of Pisa, Department of Computer Science; King\u2019s College London, Department of Engineering", + "aff": "King’s College London, Department of Engineering; University of Pisa, Department of Computer Science; King’s College London, Department of Engineering", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Cetin_A_Simple_Recipe_ICCV_2023_supplemental.pdf", @@ -1301,7 +1343,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United Kingdom;Italy" + "aff_country_unique": "United Kingdom;Italy", + "bibtex": "@InProceedings{Cetin_2023_ICCV,\n \n author = {\n Cetin,\n Edoardo and Carta,\n Antonio and Celiktutan,\n Oya\n},\n title = {\n A Simple Recipe to Meta-Learn Forward and Backward Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18732-18742\n} \n}" }, { "title": "A Simple Vision Transformer for Weakly Semi-supervised 3D Object Detection", @@ -1326,14 +1369,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_A_Simple_Vision_Transformer_for_Weakly_Semi-supervised_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;1;0;1;0", - "aff_unique_norm": "Huazhong University of Science and Technology;Baidu", - "aff_unique_dep": ";Baidu Inc.", + "aff_unique_norm": "Huazhong University of Science and Technology;Baidu Inc.", + 
"aff_unique_dep": ";", "aff_unique_url": "http://www.hust.edu.cn;https://www.baidu.com", "aff_unique_abbr": "HUST;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Dingyuan and Liang,\n Dingkang and Zou,\n Zhikang and Li,\n Jingyu and Ye,\n Xiaoqing and Liu,\n Zhe and Tan,\n Xiao and Bai,\n Xiang\n},\n title = {\n A Simple Vision Transformer for Weakly Semi-supervised 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8373-8383\n} \n}" }, { "title": "A Skeletonization Algorithm for Gradient-Based Optimization", @@ -1365,7 +1409,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;0;0;1;0;0+1;0+1", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Menten_2023_ICCV,\n \n author = {\n Menten,\n Martin J. and Paetzold,\n Johannes C. and Zimmer,\n Veronika A. and Shit,\n Suprosanna and Ezhov,\n Ivan and Holland,\n Robbie and Probst,\n Monika and Schnabel,\n Julia A. 
and Rueckert,\n Daniel\n},\n title = {\n A Skeletonization Algorithm for Gradient-Based Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21394-21403\n} \n}" }, { "title": "A Soft Nearest-Neighbor Framework for Continual Semi-Supervised Learning", @@ -1390,14 +1435,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kang_A_Soft_Nearest-Neighbor_Framework_for_Continual_Semi-Supervised_Learning_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;2+4;0", - "aff_unique_norm": "INRIA;Universite Grenoble Alpes;University of Trento;SAP;Fondazione Bruno Kessler", + "aff_unique_norm": "Inria;Universite Grenoble Alpes;University of Trento;SAP;Fondazione Bruno Kessler", "aff_unique_dep": ";Laboratoire Jean Kuntzmann (LJK);;SAP AI Research;", "aff_unique_url": "https://www.inria.fr;https://www.univ-grenoble-alpes.fr;https://www.unitn.it;https://www.sap.com;https://www.fbk.eu", "aff_unique_abbr": "Inria;UGA;UniTN;SAP;FBK", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Grenoble", "aff_country_unique_index": "0+0;1;2;1+1;0", - "aff_country_unique": "France;Italy;Germany" + "aff_country_unique": "France;Italy;Germany", + "bibtex": "@InProceedings{Kang_2023_ICCV,\n \n author = {\n Kang,\n Zhiqi and Fini,\n Enrico and Nabi,\n Moin and Ricci,\n Elisa and Alahari,\n Karteek\n},\n title = {\n A Soft Nearest-Neighbor Framework for Continual Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11868-11877\n} \n}" }, { "title": "A Theory of Topological Derivatives for Inverse Rendering of Geometry", @@ -1420,7 +1466,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Mehta_A_Theory_of_Topological_Derivatives_for_Inverse_Rendering_of_Geometry_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mehta_A_Theory_of_Topological_Derivatives_for_Inverse_Rendering_of_Geometry_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Mehta_2023_ICCV,\n \n author = {\n Mehta,\n Ishit and Chandraker,\n Manmohan and Ramamoorthi,\n Ravi\n},\n title = {\n A Theory of Topological Derivatives for Inverse Rendering of Geometry\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 419-429\n} \n}" }, { "title": "A Unified Continual Learning Framework with General Parameter-Efficient Tuning", @@ -1443,7 +1490,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_A_Unified_Continual_Learning_Framework_with_General_Parameter-Efficient_Tuning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_A_Unified_Continual_Learning_Framework_with_General_Parameter-Efficient_Tuning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Qiankun and Zhao,\n Chen and Sun,\n Yifan and Xi,\n Teng and Zhang,\n Gang and Ghanem,\n Bernard and Zhang,\n Jian\n},\n title = {\n A Unified Continual Learning Framework with General Parameter-Efficient Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11483-11493\n} \n}" }, { "title": "A Unified Framework for Robustness on Diverse Sampling Errors", @@ -1475,7 +1523,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": 
"@InProceedings{Jeon_2023_ICCV,\n \n author = {\n Jeon,\n Myeongho and Kang,\n Myungjoo and Lee,\n Joonseok\n},\n title = {\n A Unified Framework for Robustness on Diverse Sampling Errors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1464-1472\n} \n}" }, { "id": "9c5e84c27f", @@ -1494,7 +1543,8 @@ "gs_version_total": 7, "aff_domain": ";;;", "email": ";;;", - "author_num": 4 + "author_num": 4, + "bibtex": "@InProceedings{Pintea_2023_ICCV,\n \n author = {\n Pintea,\n Silvia L. and Lin,\n Yancong and Dijkstra,\n Jouke and van Gemert,\n Jan C.\n},\n title = {\n A step towards understanding why classification helps regression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19972-19981\n} \n}" }, { "title": "A-STAR: Test-time Attention Segregation and Retention for Text-to-image Synthesis", @@ -1519,14 +1569,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Agarwal_A-STAR_Test-time_Attention_Segregation_and_Retention_for_Text-to-image_Synthesis_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Adobe", + "aff_unique_norm": "Adobe Research", "aff_unique_dep": "Research", "aff_unique_url": "https://research.adobe.com", "aff_unique_abbr": "Adobe", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Bengaluru", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Agarwal_2023_ICCV,\n \n author = {\n Agarwal,\n Aishwarya and Karanam,\n Srikrishna and Joseph,\n K J and Saxena,\n Apoorv and Goswami,\n Koustava and Srinivasan,\n Balaji Vasan\n},\n title = {\n A-STAR: Test-time Attention Segregation and Retention for Text-to-image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2283-2293\n} \n}" }, { "title": "A2Q: Accumulator-Aware Quantization with Guaranteed Overflow Avoidance", @@ -1558,7 +1609,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Colbert_2023_ICCV,\n \n author = {\n Colbert,\n Ian and Pappalardo,\n Alessandro and Petri-Koenig,\n Jakoba\n},\n title = {\n A2Q: Accumulator-Aware Quantization with Guaranteed Overflow Avoidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16989-16998\n} \n}" }, { "title": "ACLS: Adaptive and Conditional Label Smoothing for Network Calibration", @@ -1590,7 +1642,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Hyekang and Noh,\n Jongyoun and Oh,\n Youngmin and Baek,\n Donghyeon and Ham,\n Bumsub\n},\n title = {\n ACLS: Adaptive and Conditional Label Smoothing for Network Calibration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3936-3945\n} \n}" }, { "title": "ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal and Robust Vehicle Evasion", @@ -1622,7 +1675,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0+0;0;0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Suryanto_2023_ICCV,\n \n author = {\n Suryanto,\n Naufal and Kim,\n Yongsu and Larasati,\n 
Harashta Tatimma and Kang,\n Hyoeun and Le,\n Thi-Thu-Huong and Hong,\n Yoonyoung and Yang,\n Hunmin and Oh,\n Se-Yoon and Kim,\n Howon\n},\n title = {\n ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal and Robust Vehicle Evasion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4305-4314\n} \n}" }, { "title": "ADAPT: Efficient Multi-Agent Trajectory Prediction with Adaptation", @@ -1630,11 +1684,11 @@ "status": "Poster", "track": "main", "pid": "1528", - "author_site": "G\u00f6rkay Aydemir, Adil Kaan Akan, Fatma G\u00fcney", - "author": "G\u00f6rkay Aydemir; Adil Kaan Akan; Fatma G\u00fcney", + "author_site": "Görkay Aydemir, Adil Kaan Akan, Fatma Güney", + "author": "Görkay Aydemir; Adil Kaan Akan; Fatma Güney", "abstract": "Forecasting future trajectories of agents in complex traffic scenes requires reliable and efficient predictions for all agents in the scene. However, existing methods for trajectory prediction are either inefficient or sacrifice accuracy. To address this challenge, we propose ADAPT, a novel approach for jointly predicting the trajectories of all agents in the scene with dynamic weight learning. Our approach outperforms state-of-the-art methods in both single-agent and multi-agent settings on the Argoverse and Interaction datasets, with a fraction of their computational overhead. We attribute the improvement in our performance: first, to the adaptive head augmenting the model capacity without increasing the model size; second, to our design choices in the endpoint-conditioned prediction, reinforced by gradient stopping. Our analyses show that ADAPT can focus on each agent with adaptive prediction, allowing for accurate predictions efficiently. 
https://KUIS-AI.github.io/adapt", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Aydemir_ADAPT_Efficient_Multi-Agent_Trajectory_Prediction_with_Adaptation_ICCV_2023_paper.pdf", - "aff": "KUIS AI Center; KUIS AI Center + Department of Computer Engineering, Koc \u00b8 University; KUIS AI Center + Department of Computer Engineering, Koc \u00b8 University", + "aff": "KUIS AI Center; KUIS AI Center + Department of Computer Engineering, Koc ¸ University; KUIS AI Center + Department of Computer Engineering, Koc ¸ University", "project": "https://KUIS-AI.github.io/adapt", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Aydemir_ADAPT_Efficient_Multi-Agent_ICCV_2023_supplemental.pdf", @@ -1654,7 +1708,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0+1", - "aff_country_unique": "Japan;T\u00fcrkiye" + "aff_country_unique": "Japan;Turkey", + "bibtex": "@InProceedings{Aydemir_2023_ICCV,\n \n author = {\n Aydemir,\n G\\"orkay and Akan,\n Adil Kaan and G\\"uney,\n Fatma\n},\n title = {\n ADAPT: Efficient Multi-Agent Trajectory Prediction with Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8295-8305\n} \n}" }, { "title": "ADNet: Lane Shape Prediction via Anchor Decomposition", @@ -1686,7 +1741,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiao_2023_ICCV,\n \n author = {\n Xiao,\n Lingyu and Li,\n Xiang and Yang,\n Sen and Yang,\n Wankou\n},\n title = {\n ADNet: Lane Shape Prediction via Anchor Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6404-6413\n} \n}" }, { "title": "AG3D: 
Learning to Generate 3D Avatars from 2D Image Collections", @@ -1698,7 +1754,7 @@ "author": "Zijian Dong; Xu Chen; Jinlong Yang; Michael J. Black; Otmar Hilliges; Andreas Geiger", "abstract": "While progress in 2D generative models of human appearance has been rapid, many applications require 3D avatars that can be animated and rendered. Unfortunately, most existing methods for learning generative models of 3D humans with diverse shape and appearance require 3D training data, which is limited and expensive to acquire. The key to progress is hence to learn generative models of 3D avatars from abundant unstructured 2D image collections. However, learning realistic and complete 3D appearance and geometry in this under-constrained setting remains challenging, especially in the presence of loose clothing such as dresses. In this paper, we propose a new adversarial generative model of realistic 3D people from 2D images. Our method captures shape and deformation of the body and loose clothing by adopting a holistic 3D generator and integrating an efficient, flexible, articulation module. To improve realism, we train our model using multiple discriminators while also integrating geometric cues in the form of predicted 2D normal maps. We experimentally find that our method outperforms previous 3D- and articulation-aware methods in terms of geometry and appearance. 
We validate the effectiveness of our model and the importance of each component via systematic ablation studies.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Dong_AG3D_Learning_to_Generate_3D_Avatars_from_2D_Image_Collections_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich, Department of Computer Science; University of T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen; Max Planck Institute for Intelligent Systems, T\u00fcbingen; ETH Z\u00fcrich, Department of Computer Science; University of T\u00fcbingen", + "aff": "ETH Zürich, Department of Computer Science; University of Tübingen; Max Planck Institute for Intelligent Systems, Tübingen; Max Planck Institute for Intelligent Systems, Tübingen; ETH Zürich, Department of Computer Science; University of Tübingen", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Dong_AG3D_Learning_to_ICCV_2023_supplemental.pdf", @@ -1711,14 +1767,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_AG3D_Learning_to_Generate_3D_Avatars_from_2D_Image_Collections_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;0;1", - "aff_unique_norm": "ETH Zurich;University of T\u00fcbingen;Max Planck Institute for Intelligent Systems", + "aff_unique_norm": "ETH Zürich;University of Tübingen;Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Department of Computer Science;;", "aff_unique_url": "https://www.ethz.ch;https://www.uni-tuebingen.de/;https://www.mpi-is.mpg.de", - "aff_unique_abbr": "ETHZ;Uni T\u00fcbingen;MPI-IS", + "aff_unique_abbr": "ETHZ;Uni Tübingen;MPI-IS", "aff_campus_unique_index": "1;1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0;1;1;1;0;1", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Zijian and Chen,\n Xu and 
Yang,\n Jinlong and Black,\n Michael J. and Hilliges,\n Otmar and Geiger,\n Andreas\n},\n title = {\n AG3D: Learning to Generate 3D Avatars from 2D Image Collections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14916-14927\n} \n}" }, { "title": "AGG-Net: Attention Guided Gated-Convolutional Network for Depth Image Completion", @@ -1750,7 +1807,8 @@ "aff_campus_unique_index": "0+0+1;0;0;0+0;0+1", "aff_campus_unique": "Shenyang;Foshan", "aff_country_unique_index": "0+0+0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Dongyue and Huang,\n Tingxuan and Song,\n Zhimin and Deng,\n Shizhuo and Jia,\n Tong\n},\n title = {\n AGG-Net: Attention Guided Gated-Convolutional Network for Depth Image Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8853-8862\n} \n}" }, { "title": "AIDE: A Vision-Driven Multi-View, Multi-Modal, Multi-Tasking Dataset for Assistive Driving Perception", @@ -1775,14 +1833,15 @@ "author_num": 15, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_AIDE_A_Vision-Driven_Multi-View_Multi-Modal_Multi-Tasking_Dataset_for_Assistive_Driving_ICCV_2023_paper.html", "aff_unique_index": "0+1+2+3;0;0;0;0;0;0;0;0;0;0;0;4;0;0+1+2+3", - "aff_unique_norm": "Fudan University;Meta;Engineering Research Center of AI and Robotics;Jilin Province AI and Unmanned Systems Engineering Research Center;Boli Technology Co., Ltd.", - "aff_unique_dep": "Academy for Engineering and Technology;Institute of Meta-Medical;Ministry of Education;AI and Unmanned Systems Engineering;", + "aff_unique_norm": "Fudan University;Institute of Meta-Medical;Engineering Research Center of AI and Robotics;Jilin Province AI and Unmanned Systems 
Engineering Research Center;Boli Technology Co., Ltd.", + "aff_unique_dep": "Academy for Engineering and Technology;;Ministry of Education;AI and Unmanned Systems Engineering;", "aff_unique_url": "https://www.fudan.edu.cn;;;;", "aff_unique_abbr": "Fudan;;;;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Changchun", "aff_country_unique_index": "0+0+0;0;0;0;0;0;0;0;0;0;0;0;0;0;0+0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Dingkang and Huang,\n Shuai and Xu,\n Zhi and Li,\n Zhenpeng and Wang,\n Shunli and Li,\n Mingcheng and Wang,\n Yuzheng and Liu,\n Yang and Yang,\n Kun and Chen,\n Zhaoyu and Wang,\n Yan and Liu,\n Jing and Zhang,\n Peixuan and Zhai,\n Peng and Zhang,\n Lihua\n},\n title = {\n AIDE: A Vision-Driven Multi-View,\n Multi-Modal,\n Multi-Tasking Dataset for Assistive Driving Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20459-20470\n} \n}" }, { "title": "ALIP: Adaptive Language-Image Pre-Training with Synthetic Caption", @@ -1814,7 +1873,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;3", - "aff_country_unique": "China;United Kingdom;;Australia" + "aff_country_unique": "China;United Kingdom;;Australia", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Kaicheng and Deng,\n Jiankang and An,\n Xiang and Li,\n Jiawei and Feng,\n Ziyong and Guo,\n Jia and Yang,\n Jing and Liu,\n Tongliang\n},\n title = {\n ALIP: Adaptive Language-Image Pre-Training with Synthetic Caption\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2922-2931\n} \n}" }, { "title": "ALWOD: Active Learning for Weakly-Supervised Object Detection", @@ -1823,7 +1883,7 
@@ "track": "main", "pid": "9085", "author_site": "Yuting Wang, Velibor Ilic, Jiatong Li, Branislav Kisa?anin, Vladimir Pavlovic", - "author": "Yuting Wang; Velibor Ilic; Jiatong Li; Branislav Kisa\u010danin; Vladimir Pavlovic", + "author": "Yuting Wang; Velibor Ilic; Jiatong Li; Branislav Kisačanin; Vladimir Pavlovic", "abstract": "Object detection (OD), a crucial vision task, remains challenged by the lack of large training datasets with precise object localization labels. In this work, we propose ALWOD, a new framework that addresses this problem by fusing active learning (AL) with weakly and semi-supervised object detection paradigms. Because the performance of AL critically depends on the model initialization, we propose a new auxiliary image generator strategy that utilizes an extremely small labeled set, coupled with a large weakly tagged set of images, as a warm-start for AL. We then propose a new AL acquisition function, another critical factor in AL success, that leverages the student-teacher OD pair disagreement and uncertainty to effectively propose the most informative images to annotate. Finally, to complete the AL loop, we introduce a new labeling task delegated to human annotators, based on selection and correction of model-proposed detections, which is both rapid and effective in labeling the informative images. We demonstrate, across several challenging benchmarks, that ALWOD significantly narrows the gap between the ODs trained on few partially labeled but strategically selected image instances and those that rely on the fully-labeled data. 
Our code is publicly available on https://github.com/seqam-lab/ALWOD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_ALWOD_Active_Learning_for_Weakly-Supervised_Object_Detection_ICCV_2023_paper.pdf", "aff": "Rutgers University, NJ, USA; The Institute for Artificial Intelligence Research and Development of Serbia, Novi Sad, Serbia; Rutgers University, NJ, USA; Nvidia Corporation, TX, USA + The Institute for Artificial Intelligence Research and Development of Serbia, Novi Sad, Serbia; Rutgers University, NJ, USA", @@ -1839,14 +1899,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_ALWOD_Active_Learning_for_Weakly-Supervised_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2+1;0", - "aff_unique_norm": "Rutgers University;Institute for Artificial Intelligence Research and Development of Serbia;NVIDIA", - "aff_unique_dep": ";;Nvidia Corporation", + "aff_unique_norm": "Rutgers University;Institute for Artificial Intelligence Research and Development of Serbia;Nvidia Corporation", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.rutgers.edu;;https://www.nvidia.com", "aff_unique_abbr": "Rutgers;;NVidia", "aff_campus_unique_index": "0;1;0;1;0", "aff_campus_unique": "New Brunswick;Novi Sad;", "aff_country_unique_index": "0;1;0;0+1;0", - "aff_country_unique": "United States;Serbia" + "aff_country_unique": "United States;Serbia", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yuting and Ilic,\n Velibor and Li,\n Jiatong and Kisa\\v{c\n}anin,\n Branislav and Pavlovic,\n Vladimir\n},\n title = {\n ALWOD: Active Learning for Weakly-Supervised Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6459-6469\n} \n}" }, { "title": "AREA: Adaptive Reweighting via Effective Area for Long-Tailed Classification", @@ -1878,7 +1939,8 @@ 
"aff_campus_unique_index": "0+0;0+0;0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xiaohua and Zhou,\n Yucan and Wu,\n Dayan and Yang,\n Chule and Li,\n Bo and Hu,\n Qinghua and Wang,\n Weiping\n},\n title = {\n AREA: Adaptive Reweighting via Effective Area for Long-Tailed Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19277-19287\n} \n}" }, { "title": "ARNOLD: A Benchmark for Language-Grounded Task Learning with Continuous States in Realistic 3D Scenes", @@ -1910,7 +1972,8 @@ "aff_campus_unique_index": "0;;0;;0;0;0;0;", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1+1;0;1+1;0;0;0;0;0;1+1+1;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Gong_2023_ICCV,\n \n author = {\n Gong,\n Ran and Huang,\n Jiangyong and Zhao,\n Yizhou and Geng,\n Haoran and Gao,\n Xiaofeng and Wu,\n Qingyang and Ai,\n Wensi and Zhou,\n Ziheng and Terzopoulos,\n Demetri and Zhu,\n Song-Chun and Jia,\n Baoxiong and Huang,\n Siyuan\n},\n title = {\n ARNOLD: A Benchmark for Language-Grounded Task Learning with Continuous States in Realistic 3D Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20483-20495\n} \n}" }, { "title": "ASAG: Building Strong One-Decoder-Layer Sparse Detectors via Adaptive Sparse Anchor Generation", @@ -1942,7 +2005,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0+0;0+0+0+0;0+0+0+0;0+0+0+0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Shenghao and Yan,\n Junkai and Gao,\n Yipeng and Xie,\n Xiaohua and Zheng,\n Wei-Shi\n},\n title = {\n ASAG: Building Strong One-Decoder-Layer Sparse Detectors via Adaptive Sparse Anchor Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6328-6338\n} \n}" }, { "title": "ASIC: Aligning Sparse in-the-wild Image Collections", @@ -1965,7 +2029,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gupta_ASIC_Aligning_Sparse_in-the-wild_Image_Collections_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gupta_ASIC_Aligning_Sparse_in-the-wild_Image_Collections_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Gupta_2023_ICCV,\n \n author = {\n Gupta,\n Kamal and Jampani,\n Varun and Esteves,\n Carlos and Shrivastava,\n Abhinav and Makadia,\n Ameesh and Snavely,\n Noah and Kar,\n Abhishek\n},\n title = {\n ASIC: Aligning Sparse in-the-wild Image Collections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4134-4145\n} \n}" }, { "title": "ASM: Adaptive Skinning Model for High-Quality 3D Face Modeling", @@ -1997,7 +2062,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Kai and Shang,\n Hong and Shi,\n Tianyang and Chen,\n Xinghan and Zhou,\n Jingkai and Sun,\n Zhongqian and Yang,\n Wei\n},\n title = {\n ASM: Adaptive Skinning Model for High-Quality 3D Face Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 20708-20717\n} \n}" }, { "title": "ATT3D: Amortized Text-to-3D Object Synthesis", @@ -2022,14 +2088,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lorraine_ATT3D_Amortized_Text-to-3D_Object_Synthesis_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "NVIDIA", - "aff_unique_dep": "NVIDIA Corporation", + "aff_unique_norm": "NVIDIA Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lorraine_2023_ICCV,\n \n author = {\n Lorraine,\n Jonathan and Xie,\n Kevin and Zeng,\n Xiaohui and Lin,\n Chen-Hsuan and Takikawa,\n Towaki and Sharp,\n Nicholas and Lin,\n Tsung-Yi and Liu,\n Ming-Yu and Fidler,\n Sanja and Lucas,\n James\n},\n title = {\n ATT3D: Amortized Text-to-3D Object Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17946-17956\n} \n}" }, { "title": "Ablating Concepts in Text-to-Image Diffusion Models", @@ -2052,7 +2119,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kumari_Ablating_Concepts_in_Text-to-Image_Diffusion_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kumari_Ablating_Concepts_in_Text-to-Image_Diffusion_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kumari_2023_ICCV,\n \n author = {\n Kumari,\n Nupur and Zhang,\n Bingliang and Wang,\n Sheng-Yu and Shechtman,\n Eli and Zhang,\n Richard and Zhu,\n Jun-Yan\n},\n title = {\n Ablating Concepts in Text-to-Image Diffusion Models\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22691-22702\n} \n}" }, { "title": "AccFlow: Backward Accumulation for Long-Range Optical Flow", @@ -2077,14 +2145,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_AccFlow_Backward_Accumulation_for_Long-Range_Optical_Flow_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;2;1;3;1;4;0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;University of Electronic Science and Technology of China;Hong Kong University of Science and Technology;Shenzhen Institute of Advanced Technology;Microsoft", + "aff_unique_norm": "Shanghai Jiao Tong University;University of Electronic Science and Technology of China;Hong Kong University of Science and Technology;Shenzhen Institute of Advanced Technology;Microsoft Research", "aff_unique_dep": ";;;;Research", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.uestc.edu.cn;https://www.ust.hk;http://www.siat.ac.cn;https://www.microsoft.com/en-us/research/group/asia", - "aff_unique_abbr": "SJTU;UESTC;HKUST;;MSR Asia", + "aff_unique_abbr": "SJTU;UESTC;HKUST;SIAT;MSR Asia", "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Hong Kong SAR;Asia", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Guangyang and Liu,\n Xiaohong and Luo,\n Kunming and Liu,\n Xi and Zheng,\n Qingqing and Liu,\n Shuaicheng and Jiang,\n Xinyang and Zhai,\n Guangtao and Wang,\n Wenyi\n},\n title = {\n AccFlow: Backward Accumulation for Long-Range Optical Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12119-12128\n} \n}" }, { "title": "Accurate 3D Face Reconstruction with Facial Component Tokens", @@ -2107,7 +2176,8 @@ "aff_domain": 
";;;;;;;;;;", "email": ";;;;;;;;;;", "author_num": 11, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Accurate_3D_Face_Reconstruction_with_Facial_Component_Tokens_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Accurate_3D_Face_Reconstruction_with_Facial_Component_Tokens_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Tianke and Chu,\n Xuangeng and Liu,\n Yunfei and Lin,\n Lijian and Yang,\n Zhendong and Xu,\n Zhengzhuo and Cao,\n Chengkun and Yu,\n Fei and Zhou,\n Changyin and Yuan,\n Chun and Li,\n Yu\n},\n title = {\n Accurate 3D Face Reconstruction with Facial Component Tokens\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9033-9042\n} \n}" }, { "title": "Accurate and Fast Compressed Video Captioning", @@ -2132,14 +2202,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shen_Accurate_and_Fast_Compressed_Video_Captioning_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2;3;2;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Bytedance Inc.;University of North Texas", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;ByteDance Inc.;University of North Texas", "aff_unique_dep": "Institute of Software;;;Department of Computer Science and Engineering", "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;https://www.bytedance.com;https://www.unt.edu", "aff_unique_abbr": "CAS;UCAS;ByteDance;UNT", "aff_campus_unique_index": "0+0;0+0;1;2;1;0+0", "aff_campus_unique": "Beijing;San Jose;Denton", "aff_country_unique_index": "0+0;0+0;1;1;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Shen_2023_ICCV,\n \n author = {\n Shen,\n Yaojie and 
Gu,\n Xin and Xu,\n Kai and Fan,\n Heng and Wen,\n Longyin and Zhang,\n Libo\n},\n title = {\n Accurate and Fast Compressed Video Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15558-15567\n} \n}" }, { "title": "Achievement-Based Training Progress Balancing for Multi-Task Learning", @@ -2164,14 +2235,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yun_Achievement-Based_Training_Progress_Balancing_for_Multi-Task_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Samsung", - "aff_unique_dep": "Samsung Research", + "aff_unique_norm": "Samsung Research", + "aff_unique_dep": "", "aff_unique_url": "https://research.samsung.com", "aff_unique_abbr": "Samsung", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yun_2023_ICCV,\n \n author = {\n Yun,\n Hayoung and Cho,\n Hanjoo\n},\n title = {\n Achievement-Based Training Progress Balancing for Multi-Task Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16935-16944\n} \n}" }, { "title": "ActFormer: A GAN-based Transformer towards General Action-Conditioned 3D Human Motion Generation", @@ -2196,14 +2268,15 @@ "author_num": 12, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_ActFormer_A_GAN-based_Transformer_towards_General_Action-Conditioned_3D_Human_Motion_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0;2;3;4+5;3;4+5;0+6", - "aff_unique_norm": "SenseTime;Hong Kong Polytechnic University;Mashang Consumer Finance;Shanghai Jiao Tong University;Eastern Institute of Technology;Ningbo Institute of Digital Twin;Shanghai AI Laboratory", + 
"aff_unique_norm": "SenseTime;The Hong Kong Polytechnic University;Mashang Consumer Finance;Shanghai Jiao Tong University;Eastern Institute of Technology;Ningbo Institute of Digital Twin;Shanghai AI Laboratory", "aff_unique_dep": "SenseTime Research;;Consumer Finance;;;;", "aff_unique_url": "https://www.sensetime.com;https://www.polyu.edu.hk;;https://www.sjtu.edu.cn;https://www.eit.edu.cn;;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "SenseTime;PolyU;;SJTU;;;SAIL", "aff_campus_unique_index": "1;2;2;", "aff_campus_unique": ";Hong Kong SAR;Ningbo", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Liang and Song,\n Ziyang and Wang,\n Dongliang and Su,\n Jing and Fang,\n Zhicheng and Ding,\n Chenjing and Gan,\n Weihao and Yan,\n Yichao and Jin,\n Xin and Yang,\n Xiaokang and Zeng,\n Wenjun and Wu,\n Wei\n},\n title = {\n ActFormer: A GAN-based Transformer towards General Action-Conditioned 3D Human Motion Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2228-2238\n} \n}" }, { "title": "Action Sensitivity Learning for Temporal Action Localization", @@ -2235,7 +2308,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Jiayi and Wang,\n Xiaohan and Quan,\n Ruijie and Zheng,\n Junjun and Yang,\n Jiang and Yang,\n Yi\n},\n title = {\n Action Sensitivity Learning for Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13457-13469\n} \n}" }, { "title": "Activate and Reject: 
Towards Safe Domain Generalization under Category Shift", @@ -2260,14 +2334,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Activate_and_Reject_Towards_Safe_Domain_Generalization_under_Category_Shift_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0;1;3;0", - "aff_unique_norm": "University of Hong Kong;Xiamen University;University of Wisconsin-Madison;Chinese University of Hong Kong", + "aff_unique_norm": "The University of Hong Kong;Xiamen University;University of Wisconsin-Madison;The Chinese University of Hong Kong", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hku.hk;https://www.xmu.edu.cn;https://www.wisc.edu;https://www.cuhk.edu.cn", "aff_unique_abbr": "HKU;XMU;UW-Madison;CUHK", "aff_campus_unique_index": "0;2;0;3;0", "aff_campus_unique": "Hong Kong SAR;;Madison;Shenzhen", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Chaoqi and Tang,\n Luyao and Tao,\n Leitian and Zhou,\n Hong-Yu and Huang,\n Yue and Han,\n Xiaoguang and Yu,\n Yizhou\n},\n title = {\n Activate and Reject: Towards Safe Domain Generalization under Category Shift\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11552-11563\n} \n}" }, { "title": "Active Neural Mapping", @@ -2299,7 +2374,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Zike and Yang,\n Haoxiang and Zha,\n Hongbin\n},\n title = {\n Active Neural Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
10981-10992\n} \n}" }, { "title": "Active Self-Supervised Learning: A Few Low-Cost Relationships Are All You Need", @@ -2324,14 +2400,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cabannes_Active_Self-Supervised_Learning_A_Few_Low-Cost_Relationships_Are_All_You_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Meta Platforms, Inc.", "aff_unique_dep": "Meta AI", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cabannes_2023_ICCV,\n \n author = {\n Cabannes,\n Vivien and Bottou,\n Leon and Lecun,\n Yann and Balestriero,\n Randall\n},\n title = {\n Active Self-Supervised Learning: A Few Low-Cost Relationships Are All You Need\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16274-16283\n} \n}" }, { "title": "Active Stereo Without Pattern Projector", @@ -2357,13 +2434,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bartolomei_Active_Stereo_Without_Pattern_Projector_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;0+1", "aff_unique_norm": "ARCES;Dipartimento di Scienze dell'Informazione", - "aff_unique_dep": ";", - "aff_unique_url": ";http://www.disi.unige.it", + "aff_unique_dep": ";Scienze dell'Informazione", + "aff_unique_url": ";https://www.disi.unige.it", "aff_unique_abbr": ";DISI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1;1;1", - "aff_country_unique": ";Italy" + "aff_country_unique": ";Italy", + "bibtex": "@InProceedings{Bartolomei_2023_ICCV,\n \n author = {\n Bartolomei,\n Luca and Poggi,\n Matteo and Tosi,\n Fabio and Conti,\n Andrea and Mattoccia,\n Stefano\n},\n title 
= {\n Active Stereo Without Pattern Projector\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18470-18482\n} \n}" }, { "title": "ActorsNeRF: Animatable Few-shot Human Rendering with Generalizable NeRFs", @@ -2386,7 +2464,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mu_ActorsNeRF_Animatable_Few-shot_Human_Rendering_with_Generalizable_NeRFs_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mu_ActorsNeRF_Animatable_Few-shot_Human_Rendering_with_Generalizable_NeRFs_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Mu_2023_ICCV,\n \n author = {\n Mu,\n Jiteng and Sang,\n Shen and Vasconcelos,\n Nuno and Wang,\n Xiaolong\n},\n title = {\n ActorsNeRF: Animatable Few-shot Human Rendering with Generalizable NeRFs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18391-18401\n} \n}" }, { "title": "AdVerb: Visually Guided Audio Dereverberation", @@ -2398,7 +2477,7 @@ "author": "Sanjoy Chowdhury; Sreyan Ghosh; Subhrajyoti Dasgupta; Anton Ratnarajah; Utkarsh Tyagi; Dinesh Manocha", "abstract": "We present AdVerb, a novel audio-visual dereverberation framework that uses visual cues in addition to the reverberant sound to estimate clean audio. Although audio-only dereverberation is a well-studied problem, our approach incorporates the complementary visual modality to perform audio dereverberation. Given an image of the environment where the reverberated sound signal has been recorded, AdVerb employs a novel geometry-aware cross-modal transformer architecture that captures scene geometry and audio-visual cross-modal relationship to generate a complex ideal ratio mask, which, when applied to the reverberant audio predicts the clean sound. 
The effectiveness of our method is demonstrated through extensive quantitative and qualitative evaluations. Our approach significantly outperforms traditional audio-only and audio-visual baselines on three downstream tasks: speech enhancement, speech recognition, and speaker verification, with relative improvements in the range of 18% - 82% on the LibriSpeech test-clean set. We also achieve highly satisfactory RT60 error scores on the AVSpeech dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chowdhury_AdVerb_Visually_Guided_Audio_Dereverberation_ICCV_2023_paper.pdf", - "aff": "University of Maryland, College Park; University of Maryland, College Park; Mila and Universit\u00e9 de Montr\u00e9al; University of Maryland, College Park; University of Maryland, College Park; University of Maryland, College Park", + "aff": "University of Maryland, College Park; University of Maryland, College Park; Mila and Université de Montréal; University of Maryland, College Park; University of Maryland, College Park; University of Maryland, College Park", "project": "https://gamma.umd.edu/researchdirections/speech/adverb", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Chowdhury_AdVerb_Visually_Guided_ICCV_2023_supplemental.pdf", @@ -2411,14 +2490,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chowdhury_AdVerb_Visually_Guided_Audio_Dereverberation_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0;0", - "aff_unique_norm": "University of Maryland;Universit\u00e9 de Montr\u00e9al", + "aff_unique_norm": "University of Maryland;Université de Montréal", "aff_unique_dep": ";Mila", "aff_unique_url": "https://www/umd.edu;https://www.umontreal.ca", "aff_unique_abbr": "UMD;UdeM", "aff_campus_unique_index": "0;0;1;0;0;0", - "aff_campus_unique": "College Park;Montr\u00e9al", + "aff_campus_unique": "College Park;Montréal", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "United 
States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Chowdhury_2023_ICCV,\n \n author = {\n Chowdhury,\n Sanjoy and Ghosh,\n Sreyan and Dasgupta,\n Subhrajyoti and Ratnarajah,\n Anton and Tyagi,\n Utkarsh and Manocha,\n Dinesh\n},\n title = {\n AdVerb: Visually Guided Audio Dereverberation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7884-7896\n} \n}" }, { "title": "Ada3D : Exploiting the Spatial Redundancy with Adaptive Inference for Efficient 3D Object Detection", @@ -2450,7 +2530,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Tianchen and Ning,\n Xuefei and Hong,\n Ke and Qiu,\n Zhongyuan and Lu,\n Pu and Zhao,\n Yali and Zhang,\n Linfeng and Zhou,\n Lipu and Dai,\n Guohao and Yang,\n Huazhong and Wang,\n Yu\n},\n title = {\n Ada3D : Exploiting the Spatial Redundancy with Adaptive Inference for Efficient 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17728-17738\n} \n}" }, { "title": "AdaMV-MoE: Adaptive Multi-Task Vision Mixture-of-Experts", @@ -2475,14 +2556,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_AdaMV-MoE_Adaptive_Multi-Task_Vision_Mixture-of-Experts_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;2;2;0;2", - "aff_unique_norm": "University of Texas at Austin;Apple;Google", - "aff_unique_dep": ";Apple Inc.;Google", + "aff_unique_norm": "University of Texas at Austin;Apple Inc.;Google", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.utexas.edu;https://www.apple.com;https://www.google.com", "aff_unique_abbr": 
"UT Austin;Apple;Google", "aff_campus_unique_index": "0;0;2;2;2;0;2", "aff_campus_unique": "Austin;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Tianlong and Chen,\n Xuxi and Du,\n Xianzhi and Rashwan,\n Abdullah and Yang,\n Fan and Chen,\n Huizhong and Wang,\n Zhangyang and Li,\n Yeqing\n},\n title = {\n AdaMV-MoE: Adaptive Multi-Task Vision Mixture-of-Experts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17346-17357\n} \n}" }, { "title": "AdaNIC: Towards Practical Neural Image Compression via Dynamic Transform Routing", @@ -2514,7 +2596,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tao_2023_ICCV,\n \n author = {\n Tao,\n Lvfang and Gao,\n Wei and Li,\n Ge and Zhang,\n Chenhao\n},\n title = {\n AdaNIC: Towards Practical Neural Image Compression via Dynamic Transform Routing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16879-16888\n} \n}" }, { "title": "AdaptGuard: Defending Against Universal Attacks for Model Adaptation", @@ -2522,7 +2605,11 @@ "author": "Lijun Sheng, Jian Liang, Ran He, Zilei Wang, Tieniu Tan", "status": "Poster", "track": "main", - "pid": "6912" + "pid": "6912", + "gs_citation": 10, + "bibtex": "@misc{sheng2023,\n title={AdaptGuard: Defending Against Universal Attacks for Model Adaptation},\n author={Lijun Sheng and Jian Liang and Ran He and Zilei Wang and Tieniu Tan},\n year={2023},\n eprint={2303.10594v2},\n archivePrefix={arXiv},\n primaryClass={cs.CV},\n 
url={https://arxiv.org/abs/2303.10594v2}\n}", + "abstract": "Model adaptation aims at solving the domain transfer problem under the\nconstraint of only accessing the pretrained source models. With the increasing\nconsiderations of data privacy and transmission efficiency, this paradigm has\nbeen gaining recent popularity. This paper studies the vulnerability to\nuniversal attacks transferred from the source domain during model adaptation\nalgorithms due to the existence of malicious providers. We explore both\nuniversal adversarial perturbations and backdoor attacks as loopholes on the\nsource side and discover that they still survive in the target models after\nadaptation. To address this issue, we propose a model preprocessing framework,\nnamed AdaptGuard, to improve the security of model adaptation algorithms.\nAdaptGuard avoids direct use of the risky source parameters through knowledge\ndistillation and utilizes the pseudo adversarial samples under adjusted radius\nto enhance the robustness. AdaptGuard is a plug-and-play module that requires\nneither robust pretrained models nor any changes for the following model\nadaptation algorithms. Extensive results on three commonly used datasets and\ntwo popular adaptation methods validate that AdaptGuard can effectively defend\nagainst universal attacks and maintain clean accuracy in the target domain\nsimultaneously. We hope this research will shed light on the safety and\nrobustness of transfer learning. 
Code is available at\nhttps://github.com/TomSheng21/AdaptGuard.", + "pdf_url": "http://arxiv.org/pdf/2303.10594v2" }, { "title": "Adaptive Calibrator Ensemble: Navigating Test Set Difficulty in Out-of-Distribution Scenarios", @@ -2547,14 +2634,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zou_Adaptive_Calibrator_Ensemble_Navigating_Test_Set_Difficulty_in_Out-of-Distribution_Scenarios_ICCV_2023_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "Hong Kong Polytechnic University;Australian National University", + "aff_unique_norm": "The Hong Kong Polytechnic University;Australian National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.polyu.edu.hk;https://www.anu.edu.au", "aff_unique_abbr": "PolyU;ANU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zou_2023_ICCV,\n \n author = {\n Zou,\n Yuli and Deng,\n Weijian and Zheng,\n Liang\n},\n title = {\n Adaptive Calibrator Ensemble: Navigating Test Set Difficulty in Out-of-Distribution Scenarios\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19333-19342\n} \n}" }, { "title": "Adaptive Frequency Filters As Efficient Global Token Mixers", @@ -2579,14 +2667,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Adaptive_Frequency_Filters_As_Efficient_Global_Token_Mixers_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;0;1;1", - "aff_unique_norm": "University of Science and Technology of China;Microsoft", + "aff_unique_norm": "University of Science and Technology of China;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia", 
"aff_unique_abbr": "USTC;MSR Asia", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zhipeng and Zhang,\n Zhizheng and Lan,\n Cuiling and Zha,\n Zheng-Jun and Lu,\n Yan and Guo,\n Baining\n},\n title = {\n Adaptive Frequency Filters As Efficient Global Token Mixers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6049-6059\n} \n}" }, { "title": "Adaptive Illumination Mapping for Shadow Detection in Raw Images", @@ -2618,7 +2707,8 @@ "aff_campus_unique_index": "0+1;1;0;0;0;1;1", "aff_campus_unique": "Dalian;Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Jiayu and Xu,\n Ke and Pang,\n Youwei and Zhang,\n Lihe and Lu,\n Huchuan and Hancke,\n Gerhard and Lau,\n Rynson\n},\n title = {\n Adaptive Illumination Mapping for Shadow Detection in Raw Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12709-12718\n} \n}" }, { "title": "Adaptive Image Anonymization in the Context of Image Classification with Neural Networks", @@ -2630,7 +2720,7 @@ "author": "Nadiya Shvai; Arcadi Llanza Carmona; Amir Nakib", "abstract": "Deep learning based methods have become the de-facto standard for various computer vision tasks. Nevertheless, they have repeatedly shown their vulnerability to various form of input perturbations such as pixels modification, region anonymization, etc. which are closely related to the adversarial attacks. 
This research particularly addresses the case of image anonymization, which is significantly important to preserve privacy and hence to secure digitized form of personal information from being exposed and potentially misused by different services that have captured it for various purposes. However, applying anonymization causes the classifier to provide different class decisions before and after applying it and therefore reduces the classifier's reliability and usability. In order to achieve a robust solution to this problem we propose a novel anonymization procedure that allows the existing classifiers to become class decision invariant on the anonymized images without any modification requires to apply on the classification models. We conduct numerous experiments on the popular ImageNet benchmark as well as on a large scale industrial toll classification problem's dataset. Obtained results confirm the efficiency and effectiveness of the proposed method as it obtained 0% rate of class decision change for both datasets compared to 15.95% on ImageNet and 0.18% on toll dataset obtained by applying the naive anonymization approaches. 
Moreover, it has shown a great potential to be applied to similar problems from different domains.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shvai_Adaptive_Image_Anonymization_in_the_Context_of_Image_Classification_with_ICCV_2023_paper.pdf", - "aff": "Cyclope.ai, Paris, France+University Paris Est Cr\u00e9teil, Laboratoire LISSI, Paris, France; Cyclope.ai, Paris, France+University Paris Est Cr\u00e9teil, Laboratoire LISSI, Paris, France; Cyclope.ai, Paris, France+University Paris Est Cr\u00e9teil, Laboratoire LISSI, Paris, France", + "aff": "Cyclope.ai, Paris, France+University Paris Est Créteil, Laboratoire LISSI, Paris, France; Cyclope.ai, Paris, France+University Paris Est Créteil, Laboratoire LISSI, Paris, France; Cyclope.ai, Paris, France+University Paris Est Créteil, Laboratoire LISSI, Paris, France", "project": "", "github": "", "supp": "", @@ -2643,14 +2733,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shvai_Adaptive_Image_Anonymization_in_the_Context_of_Image_Classification_with_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1", - "aff_unique_norm": "Cyclope.ai;University Paris Est Cr\u00e9teil", + "aff_unique_norm": "Cyclope.ai;University Paris Est Créteil", "aff_unique_dep": ";Laboratoire LISSI", "aff_unique_url": ";https://www.univ-pec.fr", "aff_unique_abbr": ";UPEC", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0+0;0+0;0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Shvai_2023_ICCV,\n \n author = {\n Shvai,\n Nadiya and Carmona,\n Arcadi Llanza and Nakib,\n Amir\n},\n title = {\n Adaptive Image Anonymization in the Context of Image Classification with Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5074-5083\n} \n}" }, { "title": "Adaptive Nonlinear 
Latent Transformation for Conditional Face Editing", @@ -2675,14 +2766,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Adaptive_Nonlinear_Latent_Transformation_for_Conditional_Face_Editing_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "Fudan University;Shanghai Center for Brain Science and Brain-Inspired Technology", + "aff_unique_norm": "Fudan University;Shanghai Center for Brain Science and Brain-inspired Technology", "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "https://www.fudan.edu.cn;", "aff_unique_abbr": "Fudan;", "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zhizhong and Ma,\n Siteng and Zhang,\n Junping and Shan,\n Hongming\n},\n title = {\n Adaptive Nonlinear Latent Transformation for Conditional Face Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21022-21031\n} \n}" }, { "title": "Adaptive Positional Encoding for Bundle-Adjusting Neural Radiance Fields", @@ -2714,7 +2806,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hangzhou", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Zelin and Dai,\n Weichen and Zhang,\n Yu\n},\n title = {\n Adaptive Positional Encoding for Bundle-Adjusting Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3284-3294\n} \n}" }, { "title": "Adaptive Reordering Sampler with Neurally Guided MAGSAC", @@ -2746,7 +2839,8 @@ 
"aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague;", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Czech Republic;Switzerland" + "aff_country_unique": "Czech Republic;Switzerland", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Tong and Matas,\n Jiri and Barath,\n Daniel\n},\n title = {\n Adaptive Reordering Sampler with Neurally Guided MAGSAC\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18163-18173\n} \n}" }, { "title": "Adaptive Rotated Convolution for Rotated Object Detection", @@ -2778,7 +2872,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pu_2023_ICCV,\n \n author = {\n Pu,\n Yifan and Wang,\n Yiru and Xia,\n Zhuofan and Han,\n Yizeng and Wang,\n Yulin and Gan,\n Weihao and Wang,\n Zidong and Song,\n Shiji and Huang,\n Gao\n},\n title = {\n Adaptive Rotated Convolution for Rotated Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6589-6600\n} \n}" }, { "title": "Adaptive Similarity Bootstrapping for Self-Distillation Based Representation Learning", @@ -2786,8 +2881,8 @@ "status": "Poster", "track": "main", "pid": "7566", - "author_site": "Tim Lebailly, Thomas Stegm\u00fcller, Behzad Bozorgtabar, Jean-Philippe Thiran, Tinne Tuytelaars", - "author": "Tim Lebailly; Thomas Stegm\u00fcller; Behzad Bozorgtabar; Jean-Philippe Thiran; Tinne Tuytelaars", + "author_site": "Tim Lebailly, Thomas Stegmüller, Behzad Bozorgtabar, Jean-Philippe Thiran, Tinne Tuytelaars", + "author": "Tim Lebailly; Thomas Stegmüller; Behzad Bozorgtabar; Jean-Philippe Thiran; Tinne Tuytelaars", "abstract": "Most self-supervised methods for 
representation learning leverage a cross-view consistency objective i.e., they maximize the representation similarity of a given image's augmented views. Recent work NNCLR goes beyond the cross-view paradigm and uses positive pairs from different images obtained via nearest neighbor bootstrapping in a contrastive setting. We empirically show that as opposed to the contrastive learning setting which relies on negative samples, incorporating nearest neighbor bootstrapping in a self-distillation scheme can lead to a performance drop or even collapse. We scrutinize the reason for this unexpected behavior and provide a solution. We propose to adaptively bootstrap neighbors based on the estimated quality of the latent space. We report consistent improvements compared to the naive bootstrapping approach and the original baselines. Our approach leads to performance improvements for various self-distillation method/backbone combinations and standard downstream tasks. Our code is publicly available at https://github.com/tileb1/AdaSim.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lebailly_Adaptive_Similarity_Bootstrapping_for_Self-Distillation_Based_Representation_Learning_ICCV_2023_paper.pdf", "aff": "KU Leuven; EPFL; EPFL+CHUV; EPFL+CHUV; KU Leuven", @@ -2803,14 +2898,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lebailly_Adaptive_Similarity_Bootstrapping_for_Self-Distillation_Based_Representation_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1;1+2;1+2;0", - "aff_unique_norm": "Katholieke Universiteit Leuven;EPFL;Centre Hospitalier Universitaire Vaudois", + "aff_unique_norm": "Katholieke Universiteit Leuven;Ecole Polytechnique Fédérale de Lausanne;Centre Hospitalier Universitaire Vaudois", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kuleuven.be;https://www.epfl.ch;https://www.chuv.ch", "aff_unique_abbr": "KU Leuven;EPFL;CHUV", "aff_campus_unique_index": ";", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;1+1;1+1;0", - "aff_country_unique": "Belgium;Switzerland" + "aff_country_unique": "Belgium;Switzerland", + "bibtex": "@InProceedings{Lebailly_2023_ICCV,\n \n author = {\n Lebailly,\n Tim and Stegm\\"uller,\n Thomas and Bozorgtabar,\n Behzad and Thiran,\n Jean-Philippe and Tuytelaars,\n Tinne\n},\n title = {\n Adaptive Similarity Bootstrapping for Self-Distillation Based Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16505-16514\n} \n}" }, { "title": "Adaptive Spiral Layers for Efficient 3D Representation Learning on Meshes", @@ -2822,7 +2918,7 @@ "author": "Francesca Babiloni; Matteo Maggioni; Thomas Tanay; Jiankang Deng; Ales Leonardis; Stefanos Zafeiriou", "abstract": "The success of deep learning models on structured data has generated significant interest in extending their application to non-Euclidean domains. In this work, we introduce a novel intrinsic operator suitable for representation learning on 3D meshes. Our operator is specifically tailored to adapt its behavior to the irregular structure of the underlying graph and effectively utilize its long-range dependencies, while at the same time ensuring computational efficiency and ease of optimization. In particular, inspired by the framework of Spiral Convolution, which extracts and transforms the vertices in the 3D mesh following a local spiral ordering, we propose a general operator that dynamically adjusts the length of the spiral trajectory and the parameters of the transformation for each processed vertex and mesh. Then, we use polyadic decomposition to factorize its dense weight tensor into a sequence of lighter linear layers that separately process features and vertices information, hence significantly reducing the computational complexity without introducing any stringent inductive biases. 
Notably, we leverage dynamic gating to achieve spatial adaptivity and induce global reasoning with constant time complexity benefitting from an efficient dynamic pooling mechanism based on Summed-Area-tables. Used as a drop-in replacement on existing architectures for shape correspondence our operator significantly improves the performance-efficiency trade-off, and in 3D shape generation with morphable models achieves state-of-the-art performance with a three-fold reduction in the number of parameters required. Project page: https://github.com/Fb2221/DFC", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Babiloni_Adaptive_Spiral_Layers_for_Efficient_3D_Representation_Learning_on_Meshes_ICCV_2023_paper.pdf", - "aff": "Huawei, Noah\u2019s Ark Lab+Imperial College London; Huawei, Noah\u2019s Ark Lab; Huawei, Noah\u2019s Ark Lab; Huawei, Noah\u2019s Ark Lab+Imperial College London; Huawei, Noah\u2019s Ark Lab; Imperial College London", + "aff": "Huawei, Noah’s Ark Lab+Imperial College London; Huawei, Noah’s Ark Lab; Huawei, Noah’s Ark Lab; Huawei, Noah’s Ark Lab+Imperial College London; Huawei, Noah’s Ark Lab; Imperial College London", "project": "", "github": "https://github.com/Fb2221/DFC", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Babiloni_Adaptive_Spiral_Layers_ICCV_2023_supplemental.pdf", @@ -2836,13 +2932,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Babiloni_Adaptive_Spiral_Layers_for_Efficient_3D_Representation_Learning_on_Meshes_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0+1;0;1", "aff_unique_norm": "Huawei;Imperial College London", - "aff_unique_dep": "Noah\u2019s Ark Lab;", + "aff_unique_dep": "Noah’s Ark Lab;", "aff_unique_url": "https://www.huawei.com;https://www.imperial.ac.uk", "aff_unique_abbr": "Huawei;ICL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0+1;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": 
"China;United Kingdom", + "bibtex": "@InProceedings{Babiloni_2023_ICCV,\n \n author = {\n Babiloni,\n Francesca and Maggioni,\n Matteo and Tanay,\n Thomas and Deng,\n Jiankang and Leonardis,\n Ales and Zafeiriou,\n Stefanos\n},\n title = {\n Adaptive Spiral Layers for Efficient 3D Representation Learning on Meshes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14620-14631\n} \n}" }, { "title": "Adaptive Superpixel for Active Learning in Semantic Segmentation", @@ -2874,7 +2971,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Hoyoung and Oh,\n Minhyeon and Hwang,\n Sehyun and Kwak,\n Suha and Ok,\n Jungseul\n},\n title = {\n Adaptive Superpixel for Active Learning in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 943-953\n} \n}" }, { "title": "Adaptive Template Transformer for Mitochondria Segmentation in Electron Microscopy Images", @@ -2882,6 +2980,7 @@ "status": "Poster", "track": "main", "pid": "5497", + "author_site": "Yuwen Pan, Naisong Luo, Rui Sun, Meng Meng, Tianzhu Zhang, Zhiwei Xiong, Yongdong Zhang", "author": "Yuwen Pan, Naisong Luo, Rui Sun, Meng Meng, Tianzhu Zhang, Zhiwei Xiong, Yongdong Zhang", "abstract": "Mitochondria, as tiny structures within the cell, are of significant importance to study cell functions for biological and clinical analysis. And exploring how to automatically segment mitochondria in electron microscopy (EM) images has attracted increasing attention. 
However, most of existing methods struggle to adapt to different scales and appearances of the input due to the inherent limitations of the traditional CNN architecture. To mitigate these limitations, we propose a novel adaptive template transformer (ATFormer) for mitochondria segmentation. The proposed ATFormer model enjoys several merits. First, the designed structural template learning module can acquire appearance-adaptive templates of background, foreground and contour to sense the characteristics of different shapes of mitochondria. And we further adopt an optimal transport algorithm to enlarge the discrepancy among diverse templates to fully activate corresponding regions. Second, we introduce a hierarchical attention learning mechanism to absorb multi-level information for templates to be adaptive scale-aware classifiers for dense prediction. Extensive experimental results on three challenging benchmarks including MitoEM, Lucchi and NucMM-Z datasets demonstrate that our ATFormer performs favorably against state-of-the-art mitochondria segmentation methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Pan_Adaptive_Template_Transformer_for_Mitochondria_Segmentation_in_Electron_Microscopy_Images_ICCV_2023_paper.pdf", @@ -2893,7 +2992,8 @@ "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4049000543664783034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Adaptive_Template_Transformer_for_Mitochondria_Segmentation_in_Electron_Microscopy_Images_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Adaptive_Template_Transformer_for_Mitochondria_Segmentation_in_Electron_Microscopy_Images_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Yuwen and Luo,\n Naisong and Sun,\n Rui and Meng,\n Meng and Zhang,\n Tianzhu and Xiong,\n Zhiwei and Zhang,\n Yongdong\n},\n title = {\n 
Adaptive Template Transformer for Mitochondria Segmentation in Electron Microscopy Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21474-21484\n} \n}" }, { "title": "Adaptive Testing of Computer Vision Models", @@ -2905,7 +3005,7 @@ "author": "Irena Gao; Gabriel Ilharco; Scott Lundberg; Marco Tulio Ribeiro", "abstract": "Vision models often fail systematically on groups of data that share common semantic characteristics (e.g., rare objects or unusual scenes), but identifying these failure modes is a challenge. We introduce AdaVision, an interactive process for testing vision models which helps users identify and fix coherent failure modes. Given a natural language description of a coherent group, AdaVision retrieves relevant images from LAION-5B with CLIP. The user then labels a small amount of data for model correctness, which is used in successive retrieval rounds to hill-climb towards high-error regions, refining the group definition. Once a group is saturated, AdaVision uses GPT-3 to suggest new group descriptions for the user to explore. We demonstrate the usefulness and generality of AdaVision in user studies, where users find major bugs in state-of-the-art classification, object detection, and image captioning models. These user-discovered groups have failure rates 2-3x higher than those surfaced by automatic error clustering methods. 
Finally, finetuning on examples found with AdaVision fixes the discovered bugs when evaluated on unseen examples, without degrading in-distribution accuracy, and while also improving performance on out-of-distribution datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Gao_Adaptive_Testing_of_Computer_Vision_Models_ICCV_2023_paper.pdf", - "aff": "Stanford University\u2217; University of Washington; Microsoft Research; Microsoft Research", + "aff": "Stanford University∗; University of Washington; Microsoft Research; Microsoft Research", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Gao_Adaptive_Testing_of_ICCV_2023_supplemental.pdf", @@ -2918,14 +3018,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_Adaptive_Testing_of_Computer_Vision_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2", - "aff_unique_norm": "Stanford University;University of Washington;Microsoft", + "aff_unique_norm": "Stanford University;University of Washington;Microsoft Corporation", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.stanford.edu;https://www.washington.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Stanford;UW;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Irena and Ilharco,\n Gabriel and Lundberg,\n Scott and Ribeiro,\n Marco Tulio\n},\n title = {\n Adaptive Testing of Computer Vision Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4003-4014\n} \n}" }, { "title": "Adaptive and Background-Aware Vision Transformer for Real-Time UAV Tracking", @@ -2957,7 +3058,8 @@ "aff_campus_unique_index": 
"", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Shuiwang and Yang,\n Yangxiang and Zeng,\n Dan and Wang,\n Xucheng\n},\n title = {\n Adaptive and Background-Aware Vision Transformer for Real-Time UAV Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13989-14000\n} \n}" }, { "title": "Adding Conditional Control to Text-to-Image Diffusion Models", @@ -2989,7 +3091,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Lvmin and Rao,\n Anyi and Agrawala,\n Maneesh\n},\n title = {\n Adding Conditional Control to Text-to-Image Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3836-3847\n} \n}" }, { "title": "AdvDiffuser: Natural Adversarial Example Synthesis with Diffusion Models", @@ -2997,6 +3100,7 @@ "status": "Poster", "track": "main", "pid": "7043", + "author_site": "Xinquan Chen, Xitong Gao, Juanjuan Zhao, Kejiang Ye, Cheng-Zhong Xu", "author": "Xinquan Chen, Xitong Gao, Juanjuan Zhao, Kejiang Ye, Cheng-Zhong Xu", "abstract": "Previous work on adversarial examples typically involves a fixed norm perturbation budget, which fails to capture the way humans perceive perturbations. Recent work has shifted towards investigating natural unrestricted adversarial examples (UAEs) that breaks l_p perturbation bounds but nonetheless remain semantically plausible. Current methods use GAN or VAE to generate UAEs by perturbing latent codes. 
However, this leads to loss of high-level information, resulting in low-quality and unnatural UAEs. In light of this, we propose AddDiffuser, a new method for synthesizing natural UAEs using diffusion models. It can generate UAEs from scratch or conditionally based on reference images. To generate natural UAEs, we perturb predicted images to steer their latent code towards the adversarial sample space of a particular classifier. In addition, we propose adversarial inpainting based on class activation mapping to retain the salient regions of the image while perturbing less important areas. Our method achieves impressive results on CIFAR-10, CelebA and ImageNet, and we demonstrate that it can defeat the most robust models on the RobustBench leaderboard with near 100% success rates. Furthermore, The synthesized UAEs are not only more natural but also stronger compared to the current state-of-the-art attacks. Specifically, compared with GA-attack, the UAEs generated with AdvDiffuser exhibit 6xsmaller LPIPS perturbations, 2 ~ 3 xsmaller FID scores and 0.28 higher in SSIM metrics, making them perceptually stealthier. Lastly, it is capable of generating an unlimited number of natural adversarial examples. 
For more please visit our project page: Link to follow.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_AdvDiffuser_Natural_Adversarial_Example_Synthesis_with_Diffusion_Models_ICCV_2023_paper.pdf", @@ -3008,7 +3112,8 @@ "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10896555983133589313&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_AdvDiffuser_Natural_Adversarial_Example_Synthesis_with_Diffusion_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_AdvDiffuser_Natural_Adversarial_Example_Synthesis_with_Diffusion_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xinquan and Gao,\n Xitong and Zhao,\n Juanjuan and Ye,\n Kejiang and Xu,\n Cheng-Zhong\n},\n title = {\n AdvDiffuser: Natural Adversarial Example Synthesis with Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4562-4572\n} \n}" }, { "title": "Advancing Example Exploitation Can Alleviate Critical Challenges in Adversarial Training", @@ -3040,7 +3145,8 @@ "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Nanjing;Boston", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ge_2023_ICCV,\n \n author = {\n Ge,\n Yao and Li,\n Yun and Han,\n Keji and Zhu,\n Junyi and Long,\n Xianzhong\n},\n title = {\n Advancing Example Exploitation Can Alleviate Critical Challenges in Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 145-154\n} \n}" }, { "title": "Advancing Referring Expression Segmentation Beyond Single 
Image", @@ -3072,7 +3178,8 @@ "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0+0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Yixuan and Zhang,\n Zhao and Xie,\n Chi and Zhu,\n Feng and Zhao,\n Rui\n},\n title = {\n Advancing Referring Expression Segmentation Beyond Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2628-2638\n} \n}" }, { "title": "Adversarial Bayesian Augmentation for Single-Source Domain Generalization", @@ -3104,7 +3211,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Baltimore County", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Sheng and Gokhale,\n Tejas and Yang,\n Yezhou\n},\n title = {\n Adversarial Bayesian Augmentation for Single-Source Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11400-11410\n} \n}" }, { "title": "Adversarial Finetuning with Latent Representation Constraint to Mitigate Accuracy-Robustness Tradeoff", @@ -3136,7 +3244,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0+0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Suzuki_2023_ICCV,\n \n author = {\n Suzuki,\n Satoshi and Yamaguchi,\n Shin'ya and Takeda,\n Shoichiro and Kanai,\n Sekitoshi and Makishima,\n Naoki and Ando,\n Atsushi and Masumura,\n Ryo\n},\n title = {\n Adversarial Finetuning with Latent Representation Constraint to Mitigate Accuracy-Robustness Tradeoff\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4390-4401\n} \n}" }, { "title": "Adverse Weather Removal with Codebook Priors", @@ -3161,14 +3270,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Adverse_Weather_Removal_with_Codebook_Priors_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2;3;1;1;1;1;4", - "aff_unique_norm": "Hong Kong University of Science and Technology;Jimei University;National University of Singapore;Xinjiang University;Southwest University", + "aff_unique_norm": "The Hong Kong University of Science and Technology;Jimei University;National University of Singapore;Xinjiang University;Southwest University", "aff_unique_dep": ";School of Ocean Information Engineering;;;College of Artificial Intelligence", "aff_unique_url": "https://www.ust.hk;http://www.jimei.edu.cn;https://www.nus.edu.sg;http://www.xju.edu.cn;https://www.swu.edu.cn", "aff_unique_abbr": "HKUST;;NUS;XJU;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Guangzhou;", "aff_country_unique_index": "0+0;0+0;1;0;0;0;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Tian and Chen,\n Sixiang and Bai,\n Jinbin and Shi,\n Jun and Xue,\n Chenghao and Jiang,\n Jingxia and Yin,\n Junjie and Chen,\n Erkang and Liu,\n Yun\n},\n title = {\n Adverse Weather Removal with Codebook Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12653-12664\n} \n}" }, { "title": "AerialVLN: Vision-and-Language Navigation for UAVs", @@ -3200,7 +3310,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": 
"@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Shubo and Zhang,\n Hongsheng and Qi,\n Yuankai and Wang,\n Peng and Zhang,\n Yanning and Wu,\n Qi\n},\n title = {\n AerialVLN: Vision-and-Language Navigation for UAVs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15384-15394\n} \n}" }, { "title": "AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks", @@ -3232,7 +3343,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Kibeom and Jeon,\n Seogkyu and Lee,\n Junsoo and Ahn,\n Namhyuk and Kim,\n Kunhee and Lee,\n Pilhyeon and Kim,\n Daesik and Uh,\n Youngjung and Byun,\n Hyeran\n},\n title = {\n AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22758-22767\n} \n}" }, { "title": "Affective Image Filter: Reflecting Emotions from Text to Images", @@ -3264,7 +3376,8 @@ "aff_campus_unique_index": ";1;1;1;", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Weng_2023_ICCV,\n \n author = {\n Weng,\n Shuchen and Zhang,\n Peixuan and Chang,\n Zheng and Wang,\n Xinlong and Li,\n Si and Shi,\n Boxin\n},\n title = {\n Affective Image Filter: Reflecting Emotions from Text to Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10810-10819\n} \n}" }, { "title": "Affine-Consistent Transformer for Multi-Class Cell Nuclei Detection", @@ 
-3289,14 +3402,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Affine-Consistent_Transformer_for_Multi-Class_Cell_Nuclei_Detection_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;0", - "aff_unique_norm": "Sun Yat-sen University;Chinese University of Hong Kong", + "aff_unique_norm": "Sun Yat-sen University;The Chinese University of Hong Kong", "aff_unique_dep": "School of Computer Science and Engineering;Shenzhen Research Institute of Big Data", "aff_unique_url": "http://www.sysu.edu.cn;https://www.cuhk.edu.cn", "aff_unique_abbr": "SYSU;CUHK", "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Junjia and Li,\n Haofeng and Wan,\n Xiang and Li,\n Guanbin\n},\n title = {\n Affine-Consistent Transformer for Multi-Class Cell Nuclei Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21384-21393\n} \n}" }, { "title": "AffordPose: A Large-Scale Dataset of Hand-Object Interactions with Affordance-Driven Hand Pose", @@ -3328,7 +3442,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jian_2023_ICCV,\n \n author = {\n Jian,\n Juntao and Liu,\n Xiuping and Li,\n Manyi and Hu,\n Ruizhen and Liu,\n Jian\n},\n title = {\n AffordPose: A Large-Scale Dataset of Hand-Object Interactions with Affordance-Driven Hand Pose\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14713-14724\n} \n}" }, { "title": "Agglomerative Transformer for Human-Object Interaction 
Detection", @@ -3357,10 +3472,11 @@ "aff_unique_dep": "Institute of Image Communication and Network Engineering", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", - "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Shanghai;", + "aff_campus_unique_index": "0;0;0+0;0", + "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tu_2023_ICCV,\n \n author = {\n Tu,\n Danyang and Sun,\n Wei and Zhai,\n Guangtao and Shen,\n Wei\n},\n title = {\n Agglomerative Transformer for Human-Object Interaction Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21614-21624\n} \n}" }, { "title": "Aggregating Feature Point Cloud for Depth Completion", @@ -3392,7 +3508,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Ningbo", "aff_country_unique_index": "0;0;0;0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Zhu and Sheng,\n Zehua and Zhou,\n Zili and Luo,\n Lun and Cao,\n Si-Yuan and Gu,\n Hong and Zhang,\n Huaqi and Shen,\n Hui-Liang\n},\n title = {\n Aggregating Feature Point Cloud for Depth Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8732-8743\n} \n}" }, { "title": "Agile Modeling: From Concept to Classifier in Minutes", @@ -3401,7 +3518,7 @@ "track": "main", "pid": "11123", "author_site": "Otilia Stretcu, Edward Vendrow, Kenji Hata, Krishnamurthy Viswanathan, Vittorio Ferrari, Sasan Tavakkol, Wenlei Zhou, Aditya Avinash, Emming Luo, Neil Gordon Alldrin, MohammadHossein Bateni, Gabriel Berger, Andrew Bunner, Chun-Ta Lu, Javier Rey, Giulia DeSalvo, Ranjay Krishna, Ariel Fuxman?", 
- "author": "Otilia Stretcu; Edward Vendrow; Kenji Hata; Krishnamurthy Viswanathan; Vittorio Ferrari; Sasan Tavakkol; Wenlei Zhou; Aditya Avinash; Emming Luo; Neil Gordon Alldrin; MohammadHossein Bateni; Gabriel Berger; Andrew Bunner; Chun-Ta Lu; Javier Rey; Giulia DeSalvo; Ranjay Krishna; Ariel Fuxman\u200e", + "author": "Otilia Stretcu; Edward Vendrow; Kenji Hata; Krishnamurthy Viswanathan; Vittorio Ferrari; Sasan Tavakkol; Wenlei Zhou; Aditya Avinash; Emming Luo; Neil Gordon Alldrin; MohammadHossein Bateni; Gabriel Berger; Andrew Bunner; Chun-Ta Lu; Javier Rey; Giulia DeSalvo; Ranjay Krishna; Ariel Fuxman‎", "abstract": "The application of computer vision methods to nuanced, subjective concepts is growing. While crowdsourcing has served the vision community well for most objective tasks (such as labeling a \"zebra\"), it now falters on tasks where there is substantial subjectivity in the concept (such as identifying \"gourmet tuna\"). However, empowering any user to develop a classifier for their concept is technically difficult: users are neither machine learning experts nor have the patience to label thousands of examples. In reaction, we introduce the problem of Agile Modeling: the process of turning any subjective visual concept into a computer vision model through a real-time user-in-the-loop interactions. We instantiate an Agile Modeling prototype for image classification and show through a user study (N=14) that users can create classifiers with minimal effort under 30 minutes. We compare this user driven process with the traditional crowdsourcing paradigm and find that the crowd's notion often differs from that of the user's, especially as the concepts become more subjective. 
Finally, we scale our experiments with simulations of users training classifiers for ImageNet21k categories to further demonstrate the efficacy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Stretcu_Agile_Modeling_From_Concept_to_Classifier_in_Minutes_ICCV_2023_paper.pdf", "aff": "Google Research; Google Research + Stanford University; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; University of Washington; Google Research", @@ -3424,7 +3541,8 @@ "aff_campus_unique_index": "0;0+1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;Stanford;", "aff_country_unique_index": "0;0+0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Stretcu_2023_ICCV,\n \n author = {\n Stretcu,\n Otilia and Vendrow,\n Edward and Hata,\n Kenji and Viswanathan,\n Krishnamurthy and Ferrari,\n Vittorio and Tavakkol,\n Sasan and Zhou,\n Wenlei and Avinash,\n Aditya and Luo,\n Emming and Alldrin,\n Neil Gordon and Bateni,\n MohammadHossein and Berger,\n Gabriel and Bunner,\n Andrew and Lu,\n Chun-Ta and Rey,\n Javier and DeSalvo,\n Giulia and Krishna,\n Ranjay and Fuxman‎,\n Ariel\n},\n title = {\n Agile Modeling: From Concept to Classifier in Minutes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22323-22334\n} \n}" }, { "title": "Algebraically Rigorous Quaternion Framework for the Neural Network Pose Estimation Problem", @@ -3456,7 +3574,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "New York;Bloomington", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Chen and Hanson,\n Andrew J. and Hanson,\n Sonya M.\n},\n title = {\n Algebraically Rigorous Quaternion Framework for the Neural Network Pose Estimation Problem\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14097-14106\n} \n}" }, { "title": "AlignDet: Aligning Pre-training and Fine-tuning in Object Detection", @@ -3488,7 +3607,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Orlando", "aff_country_unique_index": "0;0+1;0;1;0;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ming and Wu,\n Jie and Wang,\n Xionghui and Chen,\n Chen and Qin,\n Jie and Xiao,\n Xuefeng and Wang,\n Rui and Zheng,\n Min and Pan,\n Xin\n},\n title = {\n AlignDet: Aligning Pre-training and Fine-tuning in Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6866-6876\n} \n}" }, { "title": "Alignment Before Aggregation: Trajectory Memory Retrieval Network for Video Object Segmentation", @@ -3520,7 +3640,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Rui and Wang,\n Yuan and Mai,\n Huayu and Zhang,\n Tianzhu and Wu,\n Feng\n},\n title = {\n Alignment Before Aggregation: Trajectory Memory Retrieval Network for Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1218-1228\n} \n}" }, { "title": "Alignment-free HDR Deghosting 
with Semantics Consistent Transformer", @@ -3528,8 +3649,8 @@ "status": "Poster", "track": "main", "pid": "10478", - "author_site": "Steven Tel, Zongwei Wu, Yulun Zhang, Barth\u00e9l\u00e9my Heyrman, C\u00e9dric Demonceaux, Radu Timofte, Dominique Ginhac", - "author": "Steven Tel; Zongwei Wu; Yulun Zhang; Barth\u00e9l\u00e9my Heyrman; C\u00e9dric Demonceaux; Radu Timofte; Dominique Ginhac", + "author_site": "Steven Tel, Zongwei Wu, Yulun Zhang, Barthélémy Heyrman, Cédric Demonceaux, Radu Timofte, Dominique Ginhac", + "author": "Steven Tel; Zongwei Wu; Yulun Zhang; Barthélémy Heyrman; Cédric Demonceaux; Radu Timofte; Dominique Ginhac", "abstract": "High dynamic range (HDR) imaging aims to retrieve information from multiple low-dynamic range inputs to generate realistic output. The essence is to leverage the contextual information, including both dynamic and static semantics, for better image generation. Existing methods often focus on the spatial misalignment across input frames caused by the foreground and/or camera motion. However, there is no research on jointly leveraging the dynamic and static context in a simultaneous manner. To delve into this problem, we propose a novel alignment-free network with a Semantics Consistent Transformer (SCTNet) with both spatial and channel attention modules in the network. The spatial attention aims to deal with the intra-image correlation to model the dynamic motion, while the channel attention enables the inter-image intertwining to enhance the semantic consistency across frames. Aside from this, we introduce a novel realistic HDR dataset with more variations in foreground objects, environmental factors, and larger motions. Extensive comparisons on both conventional datasets and ours validate the effectiveness of our method, achieving the best trade-off on the performance and the computational cost. 
The source code and dataset are available at https://github.com/Zongwei97/SCTNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tel_Alignment-free_HDR_Deghosting_with_Semantics_Consistent_Transformer_ICCV_2023_paper.pdf", "aff": ";;;;;;", @@ -3543,7 +3664,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tel_Alignment-free_HDR_Deghosting_with_Semantics_Consistent_Transformer_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tel_Alignment-free_HDR_Deghosting_with_Semantics_Consistent_Transformer_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Tel_2023_ICCV,\n \n author = {\n Tel,\n Steven and Wu,\n Zongwei and Zhang,\n Yulun and Heyrman,\n Barth\\'el\\'emy and Demonceaux,\n C\\'edric and Timofte,\n Radu and Ginhac,\n Dominique\n},\n title = {\n Alignment-free HDR Deghosting with Semantics Consistent Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12836-12845\n} \n}" }, { "title": "All in Tokens: Unifying Output Space of Visual Tasks via Soft Token", @@ -3555,7 +3677,7 @@ "author": "Jia Ning; Chen Li; Zheng Zhang; Chunyu Wang; Zigang Geng; Qi Dai; Kun He; Han Hu", "abstract": "We introduce AiT, a unified output representation for various vision tasks, which is a crucial step towards general-purpose vision task solvers. Despite the challenges posed by the high-dimensional and task-specific outputs, we showcase the potential of using discrete representation (VQ-VAE) to model the dense outputs of many computer vision tasks as a sequence of discrete tokens. This is inspired by the established ability of VQ-VAE to conserve the structures spanning multiple pixels using few discrete codes. To that end, we present a modified shallower architecture for VQ-VAE that improves efficiency while keeping prediction accuracy. 
Our approach also incorporates uncertainty into the decoding process by using a soft fusion of the codebook entries, providing a more stable training process, which notably improved prediction accuracy. Our evaluation of AiT on depth estimation and instance segmentation tasks, with both continuous and discrete labels, demonstrates its superiority compared to other unified models. The code and models are available at https://github.com/SwinTransformer/AiT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ning_All_in_Tokens_Unifying_Output_Space_of_Visual_Tasks_via_ICCV_2023_paper.pdf", - "aff": "Huazhong University of Science and Technology; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; University of Science and Technology of China; Microsoft Research Asia; Microsoft Research Asia; Microsoft Research Asia; Huazhong University of Science and Technology; Microsoft Research Asia", + "aff": "Huazhong University of Science and Technology; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; University of Science and Technology of China; Microsoft Research Asia; Microsoft Research Asia; Microsoft Research Asia; Huazhong University of Science and Technology; Microsoft Research Asia", "project": "", "github": "https://github.com/SwinTransformer/AiT", "supp": "", @@ -3568,14 +3690,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ning_All_in_Tokens_Unifying_Output_Space_of_Visual_Tasks_via_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;3;3;0;3", - "aff_unique_norm": "Huazhong University of Science and Technology;Xi'an Jiao Tong University;University of Science 
and Technology of China;Microsoft", + "aff_unique_norm": "Huazhong University of Science and Technology;Xi'an Jiaotong University;University of Science and Technology of China;Microsoft Research", "aff_unique_dep": ";Institute of Artificial Intelligence and Robotics;;Research", "aff_unique_url": "http://www.hust.edu.cn;http://www.xjtu.edu.cn;http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "HUST;XJTU;USTC;MSR Asia", "aff_campus_unique_index": "1;2;2;2;2", "aff_campus_unique": ";Xi'an;Asia", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ning_2023_ICCV,\n \n author = {\n Ning,\n Jia and Li,\n Chen and Zhang,\n Zheng and Wang,\n Chunyu and Geng,\n Zigang and Dai,\n Qi and He,\n Kun and Hu,\n Han\n},\n title = {\n All in Tokens: Unifying Output Space of Visual Tasks via Soft Token\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19900-19910\n} \n}" }, { "title": "All-to-Key Attention for Arbitrary Style Transfer", @@ -3600,14 +3723,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_All-to-Key_Attention_for_Arbitrary_Style_Transfer_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;2", - "aff_unique_norm": "Xidian University;Hong Kong University of Science and Technology;Chongqing University of Post and Telecommunications", + "aff_unique_norm": "Xidian University;The Hong Kong University of Science and Technology;Chongqing University of Post and Telecommunications", "aff_unique_dep": ";;", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.ust.hk;http://www.cqupt.edu.cn", "aff_unique_abbr": "Xidian;HKUST;CQUPT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": 
"China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Mingrui and He,\n Xiao and Wang,\n Nannan and Wang,\n Xiaoyu and Gao,\n Xinbo\n},\n title = {\n All-to-Key Attention for Arbitrary Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23109-23119\n} \n}" }, { "title": "All4One: Symbiotic Neighbour Contrastive Learning via Self-Attention and Redundancy Reduction", @@ -3632,14 +3756,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Estepa_All4One_Symbiotic_Neighbour_Contrastive_Learning_via_Self-Attention_and_Redundancy_Reduction_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0+2", - "aff_unique_norm": "Universitat de Barcelona;NVIDIA;Computer Vision Center", - "aff_unique_dep": ";NVIDIA Corporation;", + "aff_unique_norm": "Universitat de Barcelona;NVIDIA Corporation;Computer Vision Center", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.ub.edu;https://www.nvidia.com;", "aff_unique_abbr": "UB;NVIDIA;", "aff_campus_unique_index": "0;0;0+2", "aff_campus_unique": "Barcelona;;Cerdanyola (Barcelona)", "aff_country_unique_index": "0;1;0;0+0", - "aff_country_unique": "Spain;United States" + "aff_country_unique": "Spain;United States", + "bibtex": "@InProceedings{Estepa_2023_ICCV,\n \n author = {\n Estepa,\n Imanol G. 
and Sarasua,\n Ignacio and Nagarajan,\n Bhalaji and Radeva,\n Petia\n},\n title = {\n All4One: Symbiotic Neighbour Contrastive Learning via Self-Attention and Redundancy Reduction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16243-16253\n} \n}" }, { "title": "Alleviating Catastrophic Forgetting of Incremental Object Detection via Within-Class and Between-Class Knowledge Distillation", @@ -3671,7 +3796,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Kang_2023_ICCV,\n \n author = {\n Kang,\n Mengxue and Zhang,\n Jinpeng and Zhang,\n Jinming and Wang,\n Xiashuang and Chen,\n Yang and Ma,\n Zhe and Huang,\n Xuhui\n},\n title = {\n Alleviating Catastrophic Forgetting of Incremental Object Detection via Within-Class and Between-Class Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18894-18904\n} \n}" }, { "title": "Among Us: Adversarially Robust Collaborative Perception by Consensus", @@ -3696,14 +3822,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Among_Us_Adversarially_Robust_Collaborative_Perception_by_Consensus_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1+2;3;0", - "aff_unique_norm": "New York University;Shanghai Jiao Tong University;Shanghai AI Laboratory;Meta", + "aff_unique_norm": "New York University;Shanghai Jiao Tong University;Shanghai AI Laboratory;Meta Platforms, Inc.", "aff_unique_dep": ";;;Meta AI", "aff_unique_url": "https://www.nyu.edu;https://www.sjtu.edu.cn;https://www.shanghai-ai-lab.com;https://meta.com", "aff_unique_abbr": "NYU;SJTU;SAIL;Meta", "aff_campus_unique_index": "", "aff_campus_unique": 
"", "aff_country_unique_index": "0;0;0;1+1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yiming and Fang,\n Qi and Bai,\n Jiamu and Chen,\n Siheng and Juefei-Xu,\n Felix and Feng,\n Chen\n},\n title = {\n Among Us: Adversarially Robust Collaborative Perception by Consensus\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 186-195\n} \n}" }, { "title": "An Adaptive Model Ensemble Adversarial Attack for Boosting Adversarial Transferability", @@ -3735,7 +3862,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Bin and Yin,\n Jiali and Chen,\n Shukai and Chen,\n Bohao and Liu,\n Ximeng\n},\n title = {\n An Adaptive Model Ensemble Adversarial Attack for Boosting Adversarial Transferability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4489-4498\n} \n}" }, { "title": "An Embarrassingly Simple Backdoor Attack on Self-supervised Learning", @@ -3767,7 +3895,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Changjiang and Pang,\n Ren and Xi,\n Zhaohan and Du,\n Tianyu and Ji,\n Shouling and Yao,\n Yuan and Wang,\n Ting\n},\n title = {\n An Embarrassingly Simple Backdoor Attack on Self-supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 4367-4378\n} \n}" }, { "title": "Anatomical Invariance Modeling and Semantic Alignment for Self-supervised Learning in 3D Medical Image Analysis", @@ -3799,7 +3928,8 @@ "aff_campus_unique_index": ";1;;;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Yankai and Sun,\n Mingze and Guo,\n Heng and Bai,\n Xiaoyu and Yan,\n Ke and Lu,\n Le and Xu,\n Minfeng\n},\n title = {\n Anatomical Invariance Modeling and Semantic Alignment for Self-supervised Learning in 3D Medical Image Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15859-15869\n} \n}" }, { "title": "Anchor Structure Regularization Induced Multi-view Subspace Clustering via Enhanced Tensor Rank Minimization", @@ -3824,14 +3954,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ji_Anchor_Structure_Regularization_Induced_Multi-view_Subspace_Clustering_via_Enhanced_Tensor_ICCV_2023_paper.html", "aff_unique_index": "0+0;0+0", - "aff_unique_norm": "Beijing Jiao Tong University", + "aff_unique_norm": "Beijing Jiaotong University", "aff_unique_dep": "Key Laboratory of Big Data & Artificial Intelligence in Transportation", "aff_unique_url": "http://www.bjtu.edu.cn", "aff_unique_abbr": "BJTU", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ji_2023_ICCV,\n \n author = {\n Ji,\n Jintian and Feng,\n Songhe\n},\n title = {\n Anchor Structure Regularization Induced Multi-view Subspace Clustering via Enhanced Tensor Rank Minimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19343-19352\n} \n}" }, { "title": "Anchor-Intermediate Detector: Decoupling and Coupling Bounding Boxes for Accurate Object Detection", @@ -3843,7 +3974,7 @@ "author": "Yilong Lv; Min Li; Yujie He; Shaopeng Li; Zhuzhen He; Aitao Yang", "abstract": "Anchor-based detectors have been continuously developed for object detection. However, the individual anchor box makes it difficult to predict the boundary's offset accurately. Instead of taking each bounding box as a closed individual, we consider using multiple boxes together to get prediction boxes. To this end, this paper proposes the Box Decouple-Couple(BDC) strategy in the inference, which no longer discards the overlapping boxes, but decouples the corner points of these boxes. Then, according to each corner's score, we couple the corner points to select the most accurate corner pairs. To meet the BDC strategy, a simple but novel model is designed named the Anchor-Intermediate Detector(AID), which contains two head networks, i.e., an anchor-based head and an anchor-free Corner-aware head. The corner-aware head is able to score the corners of each bounding box to facilitate the coupling between corner points. 
Extensive experiments on MS COCO show that the proposed anchor-intermediate detector respectively outperforms their baseline RetinaNet and GFL method by 2.4 and 1.2 AP on the MS COCO test-dev dataset without any bells and whistles.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lv_Anchor-Intermediate_Detector_Decoupling_and_Coupling_Bounding_Boxes_for_Accurate_Object_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Institute of High Technology; Xi\u2019an Institute of High Technology; Xi\u2019an Institute of High Technology; Tsinghua University+Department of Automation; National University of Defense Technology; Xi\u2019an Institute of High Technology", + "aff": "Xi’an Institute of High Technology; Xi’an Institute of High Technology; Xi’an Institute of High Technology; Tsinghua University+Department of Automation; National University of Defense Technology; Xi’an Institute of High Technology", "project": "", "github": "", "supp": "", @@ -3863,7 +3994,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Lv_2023_ICCV,\n \n author = {\n Lv,\n Yilong and Li,\n Min and He,\n Yujie and Li,\n Shaopeng and He,\n Zhuzhen and Yang,\n Aitao\n},\n title = {\n Anchor-Intermediate Detector: Decoupling and Coupling Bounding Boxes for Accurate Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6275-6284\n} \n}" }, { "title": "Animal3D: A Comprehensive Dataset of 3D Animal Pose and Shape", @@ -3886,7 +4018,8 @@ "aff_domain": ";;;;;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;;;;", "author_num": 20, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Animal3D_A_Comprehensive_Dataset_of_3D_Animal_Pose_and_Shape_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Animal3D_A_Comprehensive_Dataset_of_3D_Animal_Pose_and_Shape_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Jiacong and Zhang,\n Yi and Peng,\n Jiawei and Ma,\n Wufei and Jesslen,\n Artur and Ji,\n Pengliang and Hu,\n Qixin and Zhang,\n Jiehua and Liu,\n Qihao and Wang,\n Jiahao and Ji,\n Wei and Wang,\n Chen and Yuan,\n Xiaoding and Kaushik,\n Prakhar and Zhang,\n Guofeng and Liu,\n Jie and Xie,\n Yushan and Cui,\n Yawen and Yuille,\n Alan and Kortylewski,\n Adam\n},\n title = {\n Animal3D: A Comprehensive Dataset of 3D Animal Pose and Shape\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9099-9109\n} \n}" }, { "title": "Anomaly Detection Under Distribution Shift", @@ -3918,7 +4051,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Tri and Zhu,\n Jiawen and Pang,\n Guansong\n},\n title = {\n Anomaly Detection Under Distribution Shift\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6511-6523\n} \n}" }, { "title": "Anomaly Detection using Score-based Perturbation Resilience", @@ -3950,7 +4084,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Shin_2023_ICCV,\n \n author = {\n Shin,\n Woosang and Lee,\n Jonghyeon and Lee,\n Taehan and Lee,\n Sangmoon and Yun,\n Jong Pil\n},\n title = {\n Anomaly Detection using Score-based Perturbation Resilience\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23372-23382\n} \n}" }, { "title": "Anti-DreamBooth: Protecting Users from Personalized Text-to-image Synthesis", @@ -3977,12 +4112,13 @@ "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "VinAI Research;Vanderbilt University", "aff_unique_dep": ";", - "aff_unique_url": "https://www.vinai.io/;https://www.vanderbilt.edu", + "aff_unique_url": "https://www.vinai.io;https://www.vanderbilt.edu", "aff_unique_abbr": "VinAI;Vanderbilt", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "Vietnam;United States" + "aff_country_unique": "Vietnam;United States", + "bibtex": "@InProceedings{Van_Le_2023_ICCV,\n \n author = {\n Van Le,\n Thanh and Phung,\n Hao and Nguyen,\n Thuan Hoang and Dao,\n Quan and Tran,\n Ngoc N. and Tran,\n Anh\n},\n title = {\n Anti-DreamBooth: Protecting Users from Personalized Text-to-image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2116-2127\n} \n}" }, { "title": "Aperture Diffraction for Compact Snapshot Spectral Imaging", @@ -4014,7 +4150,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lv_2023_ICCV,\n \n author = {\n Lv,\n Tao and Ye,\n Hao and Yuan,\n Quan and Shi,\n Zhan and Wang,\n Yibo and Wang,\n Shuming and Cao,\n Xun\n},\n title = {\n Aperture Diffraction for Compact Snapshot Spectral Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10574-10584\n} \n}" }, { "title": "Aria Digital Twin: A New Benchmark Dataset for 
Egocentric 3D Machine Perception", @@ -4039,14 +4176,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Aria_Digital_Twin_A_New_Benchmark_Dataset_for_Egocentric_3D_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "Meta", - "aff_unique_dep": "Meta Reality Labs", + "aff_unique_norm": "Meta Reality Labs", + "aff_unique_dep": "", "aff_unique_url": "https://www.meta.com", "aff_unique_abbr": "MRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Xiaqing and Charron,\n Nicholas and Yang,\n Yongqian and Peters,\n Scott and Whelan,\n Thomas and Kong,\n Chen and Parkhi,\n Omkar and Newcombe,\n Richard and Ren,\n Yuheng (Carl)\n},\n title = {\n Aria Digital Twin: A New Benchmark Dataset for Egocentric 3D Machine Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20133-20143\n} \n}" }, { "title": "AssetField: Assets Mining and Reconfiguration in Ground Feature Plane Representation", @@ -4071,14 +4209,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiangli_AssetField_Assets_Mining_and_Reconfiguration_in_Ground_Feature_Plane_Representation_ICCV_2023_paper.html", "aff_unique_index": "0;0;1+2;3;4+0;0+4", - "aff_unique_norm": "Chinese University of Hong Kong;Max Planck Institute for Informatics;Nanyang Technological University;Adobe;Shanghai AI Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;Max Planck Institute for Informatics;Nanyang Technological University;Adobe;Shanghai AI Laboratory", "aff_unique_dep": ";;;Adobe Research;", "aff_unique_url": 
"https://www.cuhk.edu.hk;https://mpi-inf.mpg.de;https://www.ntu.edu.sg;https://research.adobe.com;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "CUHK;MPII;NTU;Adobe;SAIL", "aff_campus_unique_index": "0;0;;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1+2;3;0+0;0+0", - "aff_country_unique": "China;Germany;Singapore;United States" + "aff_country_unique": "China;Germany;Singapore;United States", + "bibtex": "@InProceedings{Xiangli_2023_ICCV,\n \n author = {\n Xiangli,\n Yuanbo and Xu,\n Linning and Pan,\n Xingang and Zhao,\n Nanxuan and Dai,\n Bo and Lin,\n Dahua\n},\n title = {\n AssetField: Assets Mining and Reconfiguration in Ground Feature Plane Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3251-3261\n} \n}" }, { "title": "Atmospheric Transmission and Thermal Inertia Induced Blind Road Segmentation with a Large-Scale Dataset TBRSD", @@ -4110,7 +4249,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Junzhang and Bai,\n Xiangzhi\n},\n title = {\n Atmospheric Transmission and Thermal Inertia Induced Blind Road Segmentation with a Large-Scale Dataset TBRSD\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1053-1063\n} \n}" }, { "title": "AttT2M: Text-Driven Human Motion Generation with Multi-Perspective Attention Mechanism", @@ -4142,7 +4282,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhong_2023_ICCV,\n \n author = {\n Zhong,\n Chongyang and Hu,\n 
Lei and Zhang,\n Zihao and Xia,\n Shihong\n},\n title = {\n AttT2M: Text-Driven Human Motion Generation with Multi-Perspective Attention Mechanism\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 509-519\n} \n}" }, { "title": "Attention Discriminant Sampling for Point Clouds", @@ -4174,7 +4315,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Cheng-Yao and Chou,\n Yu-Ying and Liu,\n Tyng-Luh\n},\n title = {\n Attention Discriminant Sampling for Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14429-14440\n} \n}" }, { "title": "Attention Where It Matters: Rethinking Visual Document Understanding with Selective Region Concentration", @@ -4206,7 +4348,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Haoyu and Bao,\n Changcun and Liu,\n Chaohu and Chen,\n Huang and Yin,\n Kun and Liu,\n Hao and Liu,\n Yinsong and Jiang,\n Deqiang and Sun,\n Xing\n},\n title = {\n Attention Where It Matters: Rethinking Visual Document Understanding with Selective Region Concentration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19517-19527\n} \n}" }, { "title": "Attentive Mask CLIP", @@ -4231,14 +4374,15 @@ "author_num": 11, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Attentive_Mask_CLIP_ICCV_2023_paper.html", 
"aff_unique_index": "0;1;2;0;0;0;0;1;0;0;0", - "aff_unique_norm": "Microsoft;Tongji University;Tsinghua University", + "aff_unique_norm": "Microsoft Research;Tongji University;Tsinghua University", "aff_unique_dep": "Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia;https://www.tongji.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MSR Asia;Tongji;THU", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Asia;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yifan and Huang,\n Weiquan and Wei,\n Yixuan and Peng,\n Houwen and Jiang,\n Xinyang and Jiang,\n Huiqiang and Wei,\n Fangyun and Wang,\n Yin and Hu,\n Han and Qiu,\n Lili and Yang,\n Yuqing\n},\n title = {\n Attentive Mask CLIP\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2771-2781\n} \n}" }, { "title": "Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature Alignment", @@ -4270,7 +4414,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", - "aff_country_unique": "Netherlands;United States" + "aff_country_unique": "Netherlands;United States", + "bibtex": "@InProceedings{Ibrahimi_2023_ICCV,\n \n author = {\n Ibrahimi,\n Sarah and Sun,\n Xiaohang and Wang,\n Pichao and Garg,\n Amanmeet and Sanan,\n Ashutosh and Omar,\n Mohamed\n},\n title = {\n Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12054-12064\n} \n}" }, { "title": "Audio-Visual Class-Incremental Learning", @@ -4302,7 +4447,8 @@ "aff_campus_unique_index": "0;0;0", 
"aff_campus_unique": "Dallas;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pian_2023_ICCV,\n \n author = {\n Pian,\n Weiguo and Mo,\n Shentong and Guo,\n Yunhui and Tian,\n Yapeng\n},\n title = {\n Audio-Visual Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7799-7811\n} \n}" }, { "title": "Audio-Visual Deception Detection: DOLOS Dataset and Parameter-Efficient Crossmodal Learning", @@ -4325,7 +4471,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Audio-Visual_Deception_Detection_DOLOS_Dataset_and_Parameter-Efficient_Crossmodal_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Audio-Visual_Deception_Detection_DOLOS_Dataset_and_Parameter-Efficient_Crossmodal_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Xiaobao and Selvaraj,\n Nithish Muthuchamy and Yu,\n Zitong and Kong,\n Adams Wai-Kin and Shen,\n Bingquan and Kot,\n Alex\n},\n title = {\n Audio-Visual Deception Detection: DOLOS Dataset and Parameter-Efficient Crossmodal Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22135-22145\n} \n}" }, { "title": "Audio-Visual Glance Network for Efficient Video Recognition", @@ -4357,7 +4504,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Nugroho_2023_ICCV,\n \n author = {\n Nugroho,\n Muhammad Adi and Woo,\n Sangmin and Lee,\n Sumin and Kim,\n Changick\n},\n title = {\n 
Audio-Visual Glance Network for Efficient Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10150-10159\n} \n}" }, { "title": "Audiovisual Masked Autoencoders", @@ -4389,7 +4537,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;0;1;0;0;0", - "aff_country_unique": "United States;Romania" + "aff_country_unique": "United States;Romania", + "bibtex": "@InProceedings{Georgescu_2023_ICCV,\n \n author = {\n Georgescu,\n Mariana-Iuliana and Fonseca,\n Eduardo and Ionescu,\n Radu Tudor and Lucic,\n Mario and Schmid,\n Cordelia and Arnab,\n Anurag\n},\n title = {\n Audiovisual Masked Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16144-16154\n} \n}" }, { "title": "Augmented Box Replay: Overcoming Foreground Shift for Incremental Object Detection", @@ -4421,7 +4570,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;1;0;1+1", - "aff_country_unique": "China;Spain" + "aff_country_unique": "China;Spain", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yuyang and Cong,\n Yang and Goswami,\n Dipam and Liu,\n Xialei and van de Weijer,\n Joost\n},\n title = {\n Augmented Box Replay: Overcoming Foreground Shift for Incremental Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11367-11377\n} \n}" }, { "title": "Augmenting and Aligning Snippets for Few-Shot Video Domain Adaptation", @@ -4453,7 +4603,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": 
"Singapore", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yuecong and Yang,\n Jianfei and Zhou,\n Yunjiao and Chen,\n Zhenghua and Wu,\n Min and Li,\n Xiaoli\n},\n title = {\n Augmenting and Aligning Snippets for Few-Shot Video Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13445-13456\n} \n}" }, { "title": "AutoAD II: The Sequel - Who, When, and What in Movie Audio Description", @@ -4465,7 +4616,7 @@ "author": "Tengda Han; Max Bain; Arsha Nagrani; Gul Varol; Weidi Xie; Andrew Zisserman", "abstract": "Audio Description (AD) is the task of generating descriptions of visual content, at suitable time intervals, for the benefit of visually impaired audiences. For movies, this presents notable challenges -- AD must occur only during existing pauses in dialogue, should refer to characters by name, and ought to aid understanding of the storyline as a whole.\n To this end, we develop a new model for automatically generating movie AD, given CLIP visual features of the frames, the cast list, and the temporal locations of the speech; addressing all three of the `who', `when', and `what' questions: (i) who -- we introduce a character bank consisting of the character's name, the actor that played the part, and a CLIP feature of their face, for the principal cast of each movie, and demonstrate how this can be used to improve naming in the generated AD; (ii) when -- we investigate several models for determining whether an AD should be generated for a time interval or not, based on the visual content of the interval and its neighbours; and (iii) what -- we implement a new vision-language model for this task, that can ingest the proposals from the character bank, whilst conditioning on the visual features using cross-attention, and demonstrate how this improves over previous architectures for AD text generation in an 
apples-to-apples comparison.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Han_AutoAD_II_The_Sequel_-_Who_When_and_What_in_ICCV_2023_paper.pdf", - "aff": "Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford\u2020; Visual Geometry Group, University of Oxford+LIGM, \u00b4Ecole des Ponts ParisTech; Visual Geometry Group, University of Oxford+CMIC, Shanghai Jiao Tong University; Visual Geometry Group, University of Oxford", + "aff": "Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford; Visual Geometry Group, University of Oxford†; Visual Geometry Group, University of Oxford+LIGM, École des Ponts ParisTech; Visual Geometry Group, University of Oxford+CMIC, Shanghai Jiao Tong University; Visual Geometry Group, University of Oxford", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Han_AutoAD_II_The_ICCV_2023_supplemental.pdf", @@ -4485,7 +4636,8 @@ "aff_campus_unique_index": "0;0;0;0;0+2;0", "aff_campus_unique": "Oxford;;Shanghai", "aff_country_unique_index": "0;0;0;0+1;0+2;0", - "aff_country_unique": "United Kingdom;France;China" + "aff_country_unique": "United Kingdom;France;China", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Tengda and Bain,\n Max and Nagrani,\n Arsha and Varol,\n Gul and Xie,\n Weidi and Zisserman,\n Andrew\n},\n title = {\n AutoAD II: The Sequel - Who,\n When,\n and What in Movie Audio Description\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13645-13655\n} \n}" }, { "title": "AutoDiffusion: Training-Free Optimization of Time Steps and Architectures for Automated Diffusion Model Acceleration", @@ -4510,14 +4662,15 @@ "author_num": 10, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Li_AutoDiffusion_Training-Free_Optimization_of_Time_Steps_and_Architectures_for_Automated_ICCV_2023_paper.html", "aff_unique_index": "0+1+0+0;2;2;2;2;2;2;2;0+1+0+0;0+1+0+0", - "aff_unique_norm": "Xiamen University;Pengcheng Laboratory;ByteDance", - "aff_unique_dep": "Department of Artificial Intelligence, School of Informatics;Peng Cheng Laboratory;", + "aff_unique_norm": "Xiamen University;Peng Cheng Laboratory;ByteDance", + "aff_unique_dep": "Department of Artificial Intelligence, School of Informatics;;", "aff_unique_url": "https://www.xmu.edu.cn;http://www.pcl.ac.cn;https://www.bytedance.com", "aff_unique_abbr": "XMU;PCL;ByteDance", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0+0;0;0;0;0;0;0;0;0+0+0+0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Lijiang and Li,\n Huixia and Zheng,\n Xiawu and Wu,\n Jie and Xiao,\n Xuefeng and Wang,\n Rui and Zheng,\n Min and Pan,\n Xin and Chao,\n Fei and Ji,\n Rongrong\n},\n title = {\n AutoDiffusion: Training-Free Optimization of Time Steps and Architectures for Automated Diffusion Model Acceleration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7105-7114\n} \n}" }, { "title": "AutoReP: Automatic ReLU Replacement for Fast Private Network Inference", @@ -4549,7 +4702,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Hongwu and Huang,\n Shaoyi and Zhou,\n Tong and Luo,\n Yukui and Wang,\n Chenghong and Wang,\n Zigeng and Zhao,\n Jiahui and Xie,\n Xi and Li,\n Ang and Geng,\n Tony and Mahmood,\n 
Kaleel and Wen,\n Wujie and Xu,\n Xiaolin and Ding,\n Caiwen\n},\n title = {\n AutoReP: Automatic ReLU Replacement for Fast Private Network Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5178-5188\n} \n}" }, { "title": "AutoSynth: Learning to Generate 3D Training Data for Object Point Cloud Registration", @@ -4574,14 +4728,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dang_AutoSynth_Learning_to_Generate_3D_Training_Data_for_Object_Point_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1", - "aff_unique_norm": "EPFL;ClearSpace", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;ClearSpace", "aff_unique_dep": "CVLab;", "aff_unique_url": "https://cvlab.epfl.ch;", "aff_unique_abbr": "EPFL;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Dang_2023_ICCV,\n \n author = {\n Dang,\n Zheng and Salzmann,\n Mathieu\n},\n title = {\n AutoSynth: Learning to Generate 3D Training Data for Object Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9009-9019\n} \n}" }, { "title": "Automated Knowledge Distillation via Monte Carlo Tree Search", @@ -4613,7 +4768,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Lujun and Dong,\n Peijie and Wei,\n Zimian and Yang,\n Ya\n},\n title = {\n Automated Knowledge Distillation via Monte Carlo Tree Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17413-17424\n} \n}" }, { "title": "Automatic Animation of Hair Blowing in Still Portrait Photos", @@ -4645,7 +4801,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "China;Saudi Arabia" + "aff_country_unique": "China;Saudi Arabia", + "bibtex": "@InProceedings{Xiao_2023_ICCV,\n \n author = {\n Xiao,\n Wenpeng and Liu,\n Wentao and Wang,\n Yitong and Ghanem,\n Bernard and Li,\n Bing\n},\n title = {\n Automatic Animation of Hair Blowing in Still Portrait Photos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22963-22975\n} \n}" }, { "title": "Automatic Network Pruning via Hilbert-Schmidt Independence Criterion Lasso under Information Bottleneck Principle", @@ -4668,7 +4825,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Automatic_Network_Pruning_via_Hilbert-Schmidt_Independence_Criterion_Lasso_under_Information_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Automatic_Network_Pruning_via_Hilbert-Schmidt_Independence_Criterion_Lasso_under_Information_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Song and Zhang,\n Lei and Zheng,\n Xiawu and Wang,\n Yan and Li,\n Yuchao and Chao,\n Fei and Wu,\n Chenglin and Zhang,\n Shengchuan and Ji,\n Rongrong\n},\n title = {\n Automatic Network Pruning via Hilbert-Schmidt Independence Criterion Lasso under Information Bottleneck Principle\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17458-17469\n} \n}" }, { "title": "Auxiliary Tasks Benefit 3D Skeleton-based Human 
Motion Prediction", @@ -4700,7 +4858,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0;0+0;1;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Chenxin and Tan,\n Robby T. and Tan,\n Yuhong and Chen,\n Siheng and Wang,\n Xinchao and Wang,\n Yanfeng\n},\n title = {\n Auxiliary Tasks Benefit 3D Skeleton-based Human Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9509-9520\n} \n}" }, { "title": "AvatarCraft: Transforming Text into Neural Human Avatars with Parameterized Shape and Pose Control", @@ -4725,14 +4884,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jiang_AvatarCraft_Transforming_Text_into_Neural_Human_Avatars_with_Parameterized_Shape_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;3;4;1", - "aff_unique_norm": "Hong Kong Polytechnic University;City University of Hong Kong;Google;Netflix;Microsoft", - "aff_unique_dep": ";;Google;;Cloud AI", + "aff_unique_norm": "The Hong Kong Polytechnic University;City University of Hong Kong;Google;Netflix;Microsoft", + "aff_unique_dep": ";;;;Cloud AI", "aff_unique_url": "https://www.polyu.edu.hk;https://www.cityu.edu.hk;https://www.google.com;https://www.netflix.com;https://www.microsoft.com/en-us/research/group/cloud-ai", "aff_unique_abbr": "PolyU;CityU;Google;Netflix;Microsoft Cloud AI", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Hong Kong SAR;Mountain View;", "aff_country_unique_index": "0;0;0;1;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Ruixiang and Wang,\n Can and Zhang,\n Jingbo and Chai,\n Menglei and He,\n Mingming and Chen,\n Dongdong and 
Liao,\n Jing\n},\n title = {\n AvatarCraft: Transforming Text into Neural Human Avatars with Parameterized Shape and Pose Control\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14371-14382\n} \n}" }, { "title": "BANSAC: A Dynamic BAyesian Network for Adaptive SAmple Consensus", @@ -4744,7 +4904,7 @@ "author": "Valter Piedade; Pedro Miraldo", "abstract": "RANSAC-based algorithms are the standard techniques for robust estimation in computer vision. These algorithms are iterative and computationally expensive; they alternate between random sampling of data, computing hypotheses, and running inlier counting. Many authors tried different approaches to improve efficiency. One of the major improvements is having a guided sampling, letting the RANSAC cycle stop sooner. This paper presents a new adaptive sampling process for RANSAC. Previous methods either assume no prior information about the inlier/outlier classification of data points or use some previously computed scores in the sampling. In this paper, we derive a dynamic Bayesian network that updates individual data points' inlier scores while iterating RANSAC. At each iteration, we apply weighted sampling using the updated scores. Our method works with or without prior data point scorings. In addition, we use the updated inlier/outlier scoring for deriving a new stopping criterion for the RANSAC loop. We test our method in multiple real-world datasets for several applications and obtain state-of-the-art results. 
Our method outperforms the baselines in accuracy while needing less computational time.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Piedade_BANSAC_A_Dynamic_BAyesian_Network_for_Adaptive_SAmple_Consensus_ICCV_2023_paper.pdf", - "aff": "Instituto Superior T \u00b4ecnico, Lisboa; Mitsubishi Electric Research Labs", + "aff": "Instituto Superior Técnico, Lisboa; Mitsubishi Electric Research Labs", "project": "", "github": "https://github.com/merlresearch/bansac", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Piedade_BANSAC_A_Dynamic_ICCV_2023_supplemental.pdf", @@ -4757,14 +4917,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Piedade_BANSAC_A_Dynamic_BAyesian_Network_for_Adaptive_SAmple_Consensus_ICCV_2023_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Instituto Superior T\u00e9cnico;Mitsubishi Electric Research Laboratories", + "aff_unique_norm": "Instituto Superior Técnico;Mitsubishi Electric Research Labs", "aff_unique_dep": ";", "aff_unique_url": "https://www.ist.utl.pt;https://www.merl.com", "aff_unique_abbr": "IST;MERL", "aff_campus_unique_index": "0", "aff_campus_unique": "Lisboa;", "aff_country_unique_index": "0;1", - "aff_country_unique": "Portugal;United States" + "aff_country_unique": "Portugal;United States", + "bibtex": "@InProceedings{Piedade_2023_ICCV,\n \n author = {\n Piedade,\n Valter and Miraldo,\n Pedro\n},\n title = {\n BANSAC: A Dynamic BAyesian Network for Adaptive SAmple Consensus\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3738-3747\n} \n}" }, { "title": "BEV-DG: Cross-Modal Learning under Bird's-Eye View for Domain Generalization of 3D Semantic Segmentation", @@ -4796,7 +4957,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;1;0;1", - "aff_country_unique": "China;United States" 
+ "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Miaoyu and Zhang,\n Yachao and Ma,\n Xu and Qu,\n Yanyun and Fu,\n Yun\n},\n title = {\n BEV-DG: Cross-Modal Learning under Bird's-Eye View for Domain Generalization of 3D Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11632-11642\n} \n}" }, { "title": "BEVBert: Multimodal Map Pre-training for Language-guided Navigation", @@ -4804,7 +4966,11 @@ "author": "Dong An, Yuankai Qi, Yangguang Li, Yan Huang, Liang Wang, Tieniu Tan, Jing Shao", "status": "Poster", "track": "main", - "pid": "1519" + "pid": "1519", + "gs_citation": 97, + "bibtex": "@misc{an2022,\n title={BEVBert: Multimodal Map Pre-training for Language-guided Navigation},\n author={Dong An and Yuankai Qi and Yangguang Li and Yan Huang and Liang Wang and Tieniu Tan and Jing Shao},\n year={2022},\n eprint={2212.04385v2},\n archivePrefix={arXiv},\n primaryClass={cs.CV},\n url={https://arxiv.org/abs/2212.04385v2}\n}", + "abstract": "Large-scale pre-training has shown promising results on the\nvision-and-language navigation (VLN) task. However, most existing pre-training\nmethods employ discrete panoramas to learn visual-textual associations. This\nrequires the model to implicitly correlate incomplete, duplicate observations\nwithin the panoramas, which may impair an agent's spatial understanding. Thus,\nwe propose a new map-based pre-training paradigm that is spatial-aware for use\nin VLN. Concretely, we build a local metric map to explicitly aggregate\nincomplete observations and remove duplicates, while modeling navigation\ndependency in a global topological map. This hybrid design can balance the\ndemand of VLN for both short-term reasoning and long-term planning. 
Then, based\non the hybrid map, we devise a pre-training framework to learn a multimodal map\nrepresentation, which enhances spatial-aware cross-modal reasoning thereby\nfacilitating the language-guided navigation goal. Extensive experiments\ndemonstrate the effectiveness of the map-based pre-training route for VLN, and\nthe proposed method achieves state-of-the-art on four VLN benchmarks.", + "pdf_url": "http://arxiv.org/pdf/2212.04385v2" }, { "title": "BEVPlace: Learning LiDAR-based Place Recognition using Bird's Eye View Images", @@ -4836,7 +5002,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Ningbo;", "aff_country_unique_index": "0+0;0;0;0;0;0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Lun and Zheng,\n Shuhang and Li,\n Yixuan and Fan,\n Yongzhi and Yu,\n Beinan and Cao,\n Si-Yuan and Li,\n Junwei and Shen,\n Hui-Liang\n},\n title = {\n BEVPlace: Learning LiDAR-based Place Recognition using Bird's Eye View Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8700-8709\n} \n}" }, { "title": "BT^2: Backward-compatible Training with Basis Transformation", @@ -4861,14 +5028,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_BT2_Backward-compatible_Training_with_Basis_Transformation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;5;5", - "aff_unique_norm": "University of California, Berkeley;Cornell University;University of Maryland;University of Hong Kong;Massachusetts Institute of Technology;Meta", + "aff_unique_norm": "University of California, Berkeley;Cornell University;University of Maryland;University of Hong Kong;Massachusetts Institute of Technology;Meta Platforms, Inc.", "aff_unique_dep": ";;;;;Meta AI", "aff_unique_url": 
"https://www.berkeley.edu;https://www.cornell.edu;https://www/umd.edu;https://www.hku.hk;https://web.mit.edu;https://meta.com", "aff_unique_abbr": "UC Berkeley;Cornell;UMD;HKU;MIT;Meta", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "Berkeley;;College Park;Hong Kong SAR", "aff_country_unique_index": "0;0;0;1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Yifei and Li,\n Zilu and Shrivastava,\n Abhinav and Zhao,\n Hengshuang and Torralba,\n Antonio and Tian,\n Taipeng and Lim,\n Ser-Nam\n},\n title = {\n BT{\\textasciicircum\n}2: Backward-compatible Training with Basis Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11229-11238\n} \n}" }, { "title": "BUS: Efficient and Effective Vision-Language Pre-Training with Bottom-Up Patch Summarization.", @@ -4900,7 +5068,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Chaoya and Xu,\n Haiyang and Ye,\n Wei and Ye,\n Qinghao and Li,\n Chenliang and Yan,\n Ming and Bi,\n Bin and Zhang,\n Shikun and Huang,\n Fei and Huang,\n Songfang\n},\n title = {\n BUS: Efficient and Effective Vision-Language Pre-Training with Bottom-Up Patch Summarization.\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2900-2910\n} \n}" }, { "title": "BaRe-ESA: A Riemannian Framework for Unregistered Human Body Shapes", @@ -4923,7 +5092,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Hartman_BaRe-ESA_A_Riemannian_Framework_for_Unregistered_Human_Body_Shapes_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hartman_BaRe-ESA_A_Riemannian_Framework_for_Unregistered_Human_Body_Shapes_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Hartman_2023_ICCV,\n \n author = {\n Hartman,\n Emmanuel and Pierson,\n Emery and Bauer,\n Martin and Charon,\n Nicolas and Daoudi,\n Mohamed\n},\n title = {\n BaRe-ESA: A Riemannian Framework for Unregistered Human Body Shapes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14181-14191\n} \n}" }, { "title": "Backpropagation Path Search On Adversarial Transferability", @@ -4948,14 +5118,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Backpropagation_Path_Search_On_Adversarial_Transferability_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0;0", - "aff_unique_norm": "Ant Group;Chinese University of Hong Kong", + "aff_unique_norm": "Ant Group;The Chinese University of Hong Kong", "aff_unique_dep": "Tiansuan Lab;Department of Computer Science and Engineering", "aff_unique_url": "https://www.antgroup.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "Ant Group;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Zhuoer and Gu,\n Zhangxuan and Zhang,\n Jianping and Cui,\n Shiwen and Meng,\n Changhua and Wang,\n Weiqiang\n},\n title = {\n Backpropagation Path Search On Adversarial Transferability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4663-4673\n} \n}" }, { 
"title": "BallGAN: 3D-aware Image Synthesis with a Spherical Background", @@ -4987,7 +5158,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Shin_2023_ICCV,\n \n author = {\n Shin,\n Minjung and Seo,\n Yunji and Bae,\n Jeongmin and Choi,\n Young Sun and Kim,\n Hyunsu and Byun,\n Hyeran and Uh,\n Youngjung\n},\n title = {\n BallGAN: 3D-aware Image Synthesis with a Spherical Background\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7268-7279\n} \n}" }, { "title": "Batch-based Model Registration for Fast 3D Sherd Reconstruction", @@ -5012,14 +5184,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Batch-based_Model_Registration_for_Fast_3D_Sherd_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0;2;1", - "aff_unique_norm": "University of Hong Kong;Texas A&M University;Max Planck Institute for Informatics", + "aff_unique_norm": "The University of Hong Kong;Texas A&M University;Max Planck Institute for Informatics", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hku.hk;https://www.tamu.edu;https://mpi-inf.mpg.de", "aff_unique_abbr": "HKU;TAMU;MPII", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;1;0;2;1", - "aff_country_unique": "China;United States;Germany" + "aff_country_unique": "China;United States;Germany", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jiepeng and Zhang,\n Congyi and Wang,\n Peng and Li,\n Xin and Cobb,\n Peter J. 
and Theobalt,\n Christian and Wang,\n Wenping\n},\n title = {\n Batch-based Model Registration for Fast 3D Sherd Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14519-14529\n} \n}" }, { "title": "Bayesian Optimization Meets Self-Distillation", @@ -5051,7 +5224,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n HyunJae and Song,\n Heon and Lee,\n Hyeonsoo and Lee,\n Gi-hyeon and Park,\n Suyeong and Yoo,\n Donggeun\n},\n title = {\n Bayesian Optimization Meets Self-Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1696-1705\n} \n}" }, { "title": "Bayesian Prompt Learning for Image-Language Model Generalization", @@ -5076,14 +5250,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Derakhshani_Bayesian_Prompt_Learning_for_Image-Language_Model_Generalization_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;0+1;3+1;1", - "aff_unique_norm": "University of Amsterdam;Samsung;University of Trento;Queen Mary University of London", + "aff_unique_norm": "University of Amsterdam;Samsung AI;University of Trento;Queen Mary University of London", "aff_unique_dep": ";Samsung AI;;", "aff_unique_url": "https://www.uva.nl;https://www.samsung.com;https://www.unitn.it;https://www.qmul.ac.uk", "aff_unique_abbr": "UvA;Samsung AI;UniTN;QMUL", "aff_campus_unique_index": "1;1;1;2+1;1", "aff_campus_unique": ";Cambridge;London", "aff_country_unique_index": "0;1;1;2;0+1;3+1;1", - "aff_country_unique": "Netherlands;South Korea;Italy;United Kingdom" + "aff_country_unique": "Netherlands;South Korea;Italy;United 
Kingdom", + "bibtex": "@InProceedings{Derakhshani_2023_ICCV,\n \n author = {\n Derakhshani,\n Mohammad Mahdi and Sanchez,\n Enrique and Bulat,\n Adrian and da Costa,\n Victor G. Turrisi and Snoek,\n Cees G.M. and Tzimiropoulos,\n Georgios and Martinez,\n Brais\n},\n title = {\n Bayesian Prompt Learning for Image-Language Model Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15237-15246\n} \n}" }, { "title": "Be Everywhere - Hear Everything (BEE): Audio Scene Reconstruction by Sparse Audio-Visual Samples", @@ -5115,7 +5290,8 @@ "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Mingfei and Su,\n Kun and Shlizerman,\n Eli\n},\n title = {\n Be Everywhere - Hear Everything (BEE): Audio Scene Reconstruction by Sparse Audio-Visual Samples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7853-7862\n} \n}" }, { "title": "BeLFusion: Latent Diffusion for Behavior-Driven Human Motion Prediction", @@ -5147,7 +5323,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Barquero_2023_ICCV,\n \n author = {\n Barquero,\n German and Escalera,\n Sergio and Palmero,\n Cristina\n},\n title = {\n BeLFusion: Latent Diffusion for Behavior-Driven Human Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2317-2327\n} \n}" }, { "title": "Beating Backdoor Attack at Its Own 
Game", @@ -5172,14 +5349,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Beating_Backdoor_Attack_at_Its_Own_Game_ICCV_2023_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Carnegie Mellon University;University of California, Berkeley;Chinese University of Hong Kong", + "aff_unique_norm": "Carnegie Mellon University;University of California, Berkeley;The Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.berkeley.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "CMU;UC Berkeley;CUHK", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Hong Kong SAR", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Min and Sangiovanni-Vincentelli,\n Alberto and Yue,\n Xiangyu\n},\n title = {\n Beating Backdoor Attack at Its Own Game\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4620-4629\n} \n}" }, { "title": "Benchmarking Algorithmic Bias in Face Recognition: An Experimental Approach Using Synthetic Faces and Human Evaluation", @@ -5202,7 +5380,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_Benchmarking_Algorithmic_Bias_in_Face_Recognition_An_Experimental_Approach_Using_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_Benchmarking_Algorithmic_Bias_in_Face_Recognition_An_Experimental_Approach_Using_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Hao and Perona,\n Pietro and Balakrishnan,\n Guha\n},\n title = {\n Benchmarking Algorithmic Bias in Face Recognition: An Experimental Approach Using Synthetic Faces and Human 
Evaluation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4977-4987\n} \n}" }, { "title": "Benchmarking Low-Shot Robustness to Natural Distribution Shifts", @@ -5227,14 +5406,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Singh_Benchmarking_Low-Shot_Robustness_to_Natural_Distribution_Shifts_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "Georgia Institute of Technology;Amazon", + "aff_unique_norm": "Georgia Institute of Technology;Amazon Web Services", "aff_unique_dep": ";AWS AI Labs", "aff_unique_url": "https://www.gatech.edu;https://aws.amazon.com", "aff_unique_abbr": "Georgia Tech;AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Singh_2023_ICCV,\n \n author = {\n Singh,\n Aaditya and Sarangmath,\n Kartik and Chattopadhyay,\n Prithvijit and Hoffman,\n Judy\n},\n title = {\n Benchmarking Low-Shot Robustness to Natural Distribution Shifts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16232-16242\n} \n}" }, { "title": "Benchmarking and Analyzing Robust Point Cloud Recognition: Bag of Tricks for Defending Adversarial Examples", @@ -5266,7 +5446,8 @@ "aff_campus_unique_index": "1+2", "aff_campus_unique": ";Guangzhou;Hong Kong SAR", "aff_country_unique_index": "0;1+1;0;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Ji_2023_ICCV,\n \n author = {\n Ji,\n Qiufan and Wang,\n Lin and Shi,\n Cong and Hu,\n Shengshan and Chen,\n Yingying and Sun,\n Lichao\n},\n title = {\n Benchmarking and Analyzing Robust Point Cloud Recognition: Bag 
of Tricks for Defending Adversarial Examples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4295-4304\n} \n}" }, { "title": "Betrayed by Captions: Joint Caption Grounding and Generation for Open Vocabulary Instance Segmentation", @@ -5298,7 +5479,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0;1;0;1", - "aff_country_unique": "China;Singapore;Switzerland" + "aff_country_unique": "China;Singapore;Switzerland", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Jianzong and Li,\n Xiangtai and Ding,\n Henghui and Li,\n Xia and Cheng,\n Guangliang and Tong,\n Yunhai and Loy,\n Chen Change\n},\n title = {\n Betrayed by Captions: Joint Caption Grounding and Generation for Open Vocabulary Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21938-21948\n} \n}" }, { "title": "Better May Not Be Fairer: A Study on Subgroup Discrepancy in Image Classification", @@ -5330,7 +5512,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chiu_2023_ICCV,\n \n author = {\n Chiu,\n Ming-Chang and Chen,\n Pin-Yu and Ma,\n Xuezhe\n},\n title = {\n Better May Not Be Fairer: A Study on Subgroup Discrepancy in Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4956-4966\n} \n}" }, { "title": "Beyond Image Borders: Learning Feature Extrapolation for Unbounded Image Composition", @@ -5355,14 +5538,15 @@ "author_num": 7, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Beyond_Image_Borders_Learning_Feature_Extrapolation_for_Unbounded_Image_Composition_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;0+1", - "aff_unique_norm": "Harbin Institute of Technology;Pengcheng Laboratory", - "aff_unique_dep": ";Peng Cheng Laboratory", + "aff_unique_norm": "Harbin Institute of Technology;Peng Cheng Laboratory", + "aff_unique_dep": ";", "aff_unique_url": "http://www.hit.edu.cn/;", "aff_unique_abbr": "HIT;", "aff_campus_unique_index": "0+1;0+1;0;0+1", "aff_campus_unique": "Harbin;Shenzhen", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xiaoyu and Liu,\n Ming and Li,\n Junyi and Liu,\n Shuai and Wang,\n Xiaotao and Lei,\n Lei and Zuo,\n Wangmeng\n},\n title = {\n Beyond Image Borders: Learning Feature Extrapolation for Unbounded Image Composition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13023-13032\n} \n}" }, { "title": "Beyond Object Recognition: A New Benchmark towards Object Concept Learning", @@ -5394,7 +5578,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yong-Lu and Xu,\n Yue and Xu,\n Xinyu and Mao,\n Xiaohan and Yao,\n Yuan and Liu,\n Siqi and Lu,\n Cewu\n},\n title = {\n Beyond Object Recognition: A New Benchmark towards Object Concept Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20029-20040\n} \n}" }, { "title": "Beyond One-to-One: Rethinking the Referring Image Segmentation", @@ -5406,7 +5591,7 @@ 
"author": "Yutao Hu; Qixiong Wang; Wenqi Shao; Enze Xie; Zhenguo Li; Jungong Han; Ping Luo", "abstract": "Referring image segmentation aims to segment the target object referred by a natural language expression. However, previous methods rely on the strong assumption that one sentence must describe one target in the image, which is often not the case in real-world applications. As a result, such methods fail when the expressions refer to either no objects or multiple objects. In this paper, we address this issue from two perspectives. First, we propose a Dual Multi-Modal Interaction (DMMI) Network, which contains two decoder branches and enables information flow in two directions. In the text-to-image decoder, text embedding is utilized to query the visual feature and localize the corresponding target. Meanwhile, the image-to-text decoder is implemented to reconstruct the erased entity-phrase conditioned on the visual feature. In this way, visual features are encouraged to contain the critical semantic information about target entity, which supports the accurate segmentation in the text-to-image decoder in turn. Secondly, we collect a new challenging but realistic dataset called Ref-ZOM, which includes image-text pairs under different settings. Extensive experiments demonstrate our method achieves state-of-the-art performance on different datasets, and the Ref-ZOM-trained model performs well on various types of text inputs. 
Codes and datasets are available at https://github.com/toggle1995/RIS-DMMI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hu_Beyond_One-to-One_Rethinking_the_Referring_Image_Segmentation_ICCV_2023_paper.pdf", - "aff": "The University of Hong Kong; Shanghai AI Laboratory; Huawei Noah\u2019s Ark Lab; The University of Sheffield; Huawei Noah\u2019s Ark Lab; The University of Sheffield; The University of Hong Kong+Shanghai AI Laboratory", + "aff": "The University of Hong Kong; Shanghai AI Laboratory; Huawei Noah’s Ark Lab; The University of Sheffield; Huawei Noah’s Ark Lab; The University of Sheffield; The University of Hong Kong+Shanghai AI Laboratory", "project": "", "github": "https://github.com/toggle1995/RIS-DMMI", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Hu_Beyond_One-to-One_Rethinking_the_Referring_Image_Segmentation_ICCV_2023_supplemental.pdf", @@ -5419,14 +5604,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Beyond_One-to-One_Rethinking_the_Referring_Image_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;2;3;0+1", - "aff_unique_norm": "University of Hong Kong;Shanghai AI Laboratory;Huawei;University of Sheffield", - "aff_unique_dep": ";;Noah\u2019s Ark Lab;", + "aff_unique_norm": "The University of Hong Kong;Shanghai AI Laboratory;Huawei;University of Sheffield", + "aff_unique_dep": ";;Noah’s Ark Lab;", "aff_unique_url": "https://www.hku.hk;https://www.shanghai-ai-lab.com;https://www.huawei.com;https://www.sheffield.ac.uk", "aff_unique_abbr": "HKU;SAIL;Huawei;Sheffield", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;1;0;1;0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Yutao and Wang,\n Qixiong and Shao,\n Wenqi and Xie,\n Enze and Li,\n Zhenguo and Han,\n Jungong and Luo,\n Ping\n},\n 
title = {\n Beyond One-to-One: Rethinking the Referring Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4067-4077\n} \n}" }, { "title": "Beyond Single Path Integrated Gradients for Reliable Input Attribution via Randomized Path Sampling", @@ -5438,7 +5624,7 @@ "author": "Giyoung Jeon; Haedong Jeong; Jaesik Choi", "abstract": "Input attribution is a widely used explanation method for deep neural networks, especially in visual tasks. Among various attribution methods, Integrated Gradients (IG) is frequently used because of its model-agnostic applicability and desirable axioms. However, previous work has shown that such method often produces noisy and unreliable attributions during the integration of the gradients over the path defined in the input space. In this paper, we tackle this issue by estimating the distribution of the possible attributions according to the integrating path selection. We show that such noisy attribution can be reduced by aggregating attributions from the multiple paths instead of using a single path. Inspired by Stick-Breaking Process (SBP), we suggest a random process to generate rich and various sampling of the gradient integrating path. Using multiple input attributions obtained from randomized path, we propose a novel attribution measure using the distribution of attributions at each input features. 
We identify proposed method qualitatively show less-noisy and object-aligned attribution and its feasibility through the quantitative evaluations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Jeon_Beyond_Single_Path_Integrated_Gradients_for_Reliable_Input_Attribution_via_ICCV_2023_paper.pdf", - "aff": "LG AI Research*; UNIST+KAIST; KAIST+INEEJI\u2020", + "aff": "LG AI Research*; UNIST+KAIST; KAIST+INEEJI†", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Jeon_Beyond_Single_Path_ICCV_2023_supplemental.pdf", @@ -5451,14 +5637,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jeon_Beyond_Single_Path_Integrated_Gradients_for_Reliable_Input_Attribution_via_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;2+3", - "aff_unique_norm": "LG;Ulsan National Institute of Science and Technology;Korea Advanced Institute of Science and Technology;INEEJI", - "aff_unique_dep": "AI Research;;;", - "aff_unique_url": "https://www.lg.com/global/ai-research;https://www.unist.ac.kr;https://www.kaist.ac.kr;", + "aff_unique_norm": "LG AI Research;Ulsan National Institute of Science and Technology;Korea Advanced Institute of Science and Technology;INEEJI", + "aff_unique_dep": "LG AI Research;;;", + "aff_unique_url": "https://www.lgaires.com;https://www.unist.ac.kr;https://www.kaist.ac.kr;", "aff_unique_abbr": "LG AI;UNIST;KAIST;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", - "aff_country_unique": "South Korea;" + "aff_country_unique": "South Korea;", + "bibtex": "@InProceedings{Jeon_2023_ICCV,\n \n author = {\n Jeon,\n Giyoung and Jeong,\n Haedong and Choi,\n Jaesik\n},\n title = {\n Beyond Single Path Integrated Gradients for Reliable Input Attribution via Randomized Path Sampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages 
= {\n 2052-2061\n} \n}" }, { "title": "Beyond Skin Tone: A Multidimensional Measure of Apparent Skin Color", @@ -5490,7 +5677,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Thong_2023_ICCV,\n \n author = {\n Thong,\n William and Joniak,\n Przemyslaw and Xiang,\n Alice\n},\n title = {\n Beyond Skin Tone: A Multidimensional Measure of Apparent Skin Color\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4903-4913\n} \n}" }, { "title": "Beyond the Limitation of Monocular 3D Detector via Knowledge Distillation", @@ -5522,7 +5710,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yiran and Yin,\n Dongshuo and Rong,\n Xuee and Sun,\n Xian and Diao,\n Wenhui and Li,\n Xinming\n},\n title = {\n Beyond the Limitation of Monocular 3D Detector via Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9077-9086\n} \n}" }, { "title": "Beyond the Pixel: a Photometrically Calibrated HDR Dataset for Luminance and Color Prediction", @@ -5530,11 +5719,11 @@ "status": "Oral", "track": "main", "pid": "6978", - "author_site": "Christophe Bolduc, Justine Giroux, Marc H\u00e9bert, Claude Demers, Jean-Fran\u00e7ois Lalonde", - "author": "Christophe Bolduc; Justine Giroux; Marc H\u00e9bert; Claude Demers; Jean-Fran\u00e7ois Lalonde", + "author_site": "Christophe Bolduc, Justine Giroux, Marc Hébert, Claude Demers, Jean-François Lalonde", + "author": "Christophe Bolduc; Justine Giroux; Marc Hébert; Claude 
Demers; Jean-François Lalonde", "abstract": "Light plays an important role in human well-being. However, most computer vision tasks treat pixels without considering their relationship to physical luminance. To address this shortcoming, we introduce the Laval Photometric Indoor HDR Dataset, the first large-scale photometrically calibrated dataset of high dynamic range 360deg panoramas. Our key contribution is the calibration of an existing, uncalibrated HDR Dataset. We do so by accurately capturing RAW bracketed exposures simultaneously with a professional photometric measurement device (chroma meter) for multiple scenes across a variety of lighting conditions. Using the resulting measurements, we establish the calibration coefficients to be applied to the HDR images. The resulting dataset is a rich representation of indoor scenes which displays a wide range of illuminance and color, and varied types of light sources. We exploit the dataset to introduce three novel tasks, where: per-pixel luminance, per-pixel color and planar illuminance can be predicted from a single input image. Finally, we also capture another smaller photometric dataset with a commercial 360deg camera, to experiment on generalization across cameras. We are optimistic that the release of our datasets and associated code will spark interest in physically accurate light estimation within the community. 
Dataset and code are available at https://lvsn.github.io/beyondthepixel/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bolduc_Beyond_the_Pixel_a_Photometrically_Calibrated_HDR_Dataset_for_Luminance_ICCV_2023_paper.pdf", - "aff": "Universit \u00b4e Laval; Universit \u00b4e Laval; Universit \u00b4e Laval; Universit \u00b4e Laval; Universit \u00b4e Laval", + "aff": "Universit ´e Laval; Universit ´e Laval; Universit ´e Laval; Universit ´e Laval; Universit ´e Laval", "project": "https://lvsn.github.io/beyondthepixel/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Bolduc_Beyond_the_Pixel_ICCV_2023_supplemental.pdf", @@ -5547,14 +5736,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bolduc_Beyond_the_Pixel_a_Photometrically_Calibrated_HDR_Dataset_for_Luminance_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Universit\u00e9 Laval", + "aff_unique_norm": "Université Laval", "aff_unique_dep": "", "aff_unique_url": "https://www.ulaval.ca", "aff_unique_abbr": "UL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Bolduc_2023_ICCV,\n \n author = {\n Bolduc,\n Christophe and Giroux,\n Justine and H\\'ebert,\n Marc and Demers,\n Claude and Lalonde,\n Jean-Fran\\c{c\n}ois\n},\n title = {\n Beyond the Pixel: a Photometrically Calibrated HDR Dataset for Luminance and Color Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8071-8081\n} \n}" }, { "title": "BiFF: Bi-level Future Fusion with Polyline-based Coordinate for Interactive Trajectory Prediction", @@ -5586,7 +5776,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Yiyao and Luan,\n Di and Shen,\n Shaojie\n},\n title = {\n BiFF: Bi-level Future Fusion with Polyline-based Coordinate for Interactive Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8260-8271\n} \n}" }, { "title": "BiViT: Extremely Compressed Binary Vision Transformers", @@ -5618,7 +5809,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0+1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Yefei and Lou,\n Zhenyu and Zhang,\n Luoming and Liu,\n Jing and Wu,\n Weijia and Zhou,\n Hong and Zhuang,\n Bohan\n},\n title = {\n BiViT: Extremely Compressed Binary Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5651-5663\n} \n}" }, { "title": "Bidirectional Alignment for Domain Adaptive Detection with Transformers", @@ -5643,14 +5835,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/He_Bidirectional_Alignment_for_Domain_Adaptive_Detection_with_Transformers_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;0", - "aff_unique_norm": "Oregon State University;Amazon", - "aff_unique_dep": ";Amazon.com, Inc.", + "aff_unique_norm": "Oregon State University;Amazon.com, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://oregonstate.edu;https://www.amazon.com", "aff_unique_abbr": "OSU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Liqiang and Wang,\n Wei and Chen,\n Albert and Sun,\n Min and Kuo,\n Cheng-Hao and Todorovic,\n Sinisa\n},\n title = {\n Bidirectional Alignment for Domain Adaptive Detection with Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18775-18785\n} \n}" }, { "title": "Bidirectionally Deformable Motion Modulation For Video-based Human Pose Transfer", @@ -5682,7 +5875,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Wing-Yin and Po,\n Lai-Man and Cheung,\n Ray C.C. and Zhao,\n Yuzhi and Xue,\n Yu and Li,\n Kun\n},\n title = {\n Bidirectionally Deformable Motion Modulation For Video-based Human Pose Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7502-7512\n} \n}" }, { "title": "Bird's-Eye-View Scene Graph for Vision-Language Navigation", @@ -5705,7 +5899,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Birds-Eye-View_Scene_Graph_for_Vision-Language_Navigation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Birds-Eye-View_Scene_Graph_for_Vision-Language_Navigation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Rui and Wang,\n Xiaohan and Wang,\n Wenguan and Yang,\n Yi\n},\n title = {\n Bird's-Eye-View Scene Graph for Vision-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
10968-10980\n} \n}" }, { "title": "Black Box Few-Shot Adaptation for Vision-Language Models", @@ -5730,14 +5925,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ouali_Black_Box_Few-Shot_Adaptation_for_Vision-Language_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "Samsung;Queen Mary University of London", + "aff_unique_norm": "Samsung AI;Queen Mary University of London", "aff_unique_dep": "Samsung AI;", "aff_unique_url": "https://www.samsung.com/uk;https://www.qmul.ac.uk", "aff_unique_abbr": "Samsung AI;QMUL", "aff_campus_unique_index": "0;0;0;0+1", "aff_campus_unique": "Cambridge;London", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Ouali_2023_ICCV,\n \n author = {\n Ouali,\n Yassine and Bulat,\n Adrian and Matinez,\n Brais and Tzimiropoulos,\n Georgios\n},\n title = {\n Black Box Few-Shot Adaptation for Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15534-15546\n} \n}" }, { "title": "Black-Box Unsupervised Domain Adaptation with Bi-Directional Atkinson-Shiffrin Memory", @@ -5769,7 +5965,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jingyi and Huang,\n Jiaxing and Jiang,\n Xueying and Lu,\n Shijian\n},\n title = {\n Black-Box Unsupervised Domain Adaptation with Bi-Directional Atkinson-Shiffrin Memory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11771-11782\n} \n}" }, { "title": "BlendFace: Re-designing Identity Encoders for 
Face-Swapping", @@ -5801,7 +5998,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Shiohara_2023_ICCV,\n \n author = {\n Shiohara,\n Kaede and Yang,\n Xingchao and Taketomi,\n Takafumi\n},\n title = {\n BlendFace: Re-designing Identity Encoders for Face-Swapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7634-7644\n} \n}" }, { "title": "Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields", @@ -5826,14 +6024,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Song_Blending-NeRF_Text-Driven_Localized_Editing_in_Neural_Radiance_Fields_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1", - "aff_unique_norm": "LG;Seoul National University", + "aff_unique_norm": "LG Electronics;Seoul National University", "aff_unique_dep": "AI Lab, CTO Division;Dept. 
of Biosystems Engineering", "aff_unique_url": "https://www.lg.com;https://www.snu.ac.kr", "aff_unique_abbr": "LG;SNU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seoul", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Hyeonseop and Choi,\n Seokhun and Do,\n Hoseok and Lee,\n Chul and Kim,\n Taehyeong\n},\n title = {\n Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14383-14393\n} \n}" }, { "title": "BlindHarmony: \"Blind\" Harmonization for MR Images via Flow Model", @@ -5865,7 +6064,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jeong_2023_ICCV,\n \n author = {\n Jeong,\n Hwihun and Byun,\n Heejoon and Kang,\n Dong Un and Lee,\n Jongho\n},\n title = {\n BlindHarmony: ''Blind'' Harmonization for MR Images via Flow Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21129-21139\n} \n}" }, { "title": "BoMD: Bag of Multi-label Descriptors for Noisy Chest X-ray Classification", @@ -5893,11 +6093,12 @@ "aff_unique_norm": "University of Adelaide;Harvard University;University of Surrey", "aff_unique_dep": "Australian Institute for Machine Learning;Harvard Medical School;Centre for Vision, Speech and Signal Processing", "aff_unique_url": "https://www.adelaide.edu.au;https://www.harvard.edu;https://www.surrey.ac.uk", - "aff_unique_abbr": "Adelaide;Harvard;", + "aff_unique_abbr": "Adelaide;Harvard;Surrey", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;0;0;1;2", - "aff_country_unique": "Australia;United States;United Kingdom" + "aff_country_unique": "Australia;United States;United Kingdom", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yuanhong and Liu,\n Fengbei and Wang,\n Hu and Wang,\n Chong and Liu,\n Yuyuan and Tian,\n Yu and Carneiro,\n Gustavo\n},\n title = {\n BoMD: Bag of Multi-label Descriptors for Noisy Chest X-ray Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21284-21295\n} \n}" }, { "title": "Body Knowledge and Uncertainty Modeling for Monocular 3D Human Body Reconstruction", @@ -5929,7 +6130,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yufei and Wang,\n Hanjing and Kephart,\n Jeffrey O. 
and Ji,\n Qiang\n},\n title = {\n Body Knowledge and Uncertainty Modeling for Monocular 3D Human Body Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9020-9032\n} \n}" }, { "title": "Bold but Cautious: Unlocking the Potential of Personalized Federated Learning through Cautiously Aggressive Collaboration", @@ -5953,15 +6155,16 @@ "email": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;utdallas.edu", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Bold_but_Cautious_Unlocking_the_Potential_of_Personalized_Federated_Learning_ICCV_2023_paper.html", - "aff_unique_index": "0+1+2;0+1;0+1+2;0;3", - "aff_unique_norm": "Beihang University;Zhongguancun Laboratory;Zhengzhou University;University of Texas at Dallas", - "aff_unique_dep": "School of Computer Science and Engineering;;School of Information Engineering;Jindal School of Management", - "aff_unique_url": "http://www.buaa.edu.cn;;http://www.zzu.edu.cn;https://www.jindal.utdallas.edu", - "aff_unique_abbr": "Beihang;;ZZU;UT Dallas", - "aff_campus_unique_index": "0+2;0;0+2;0;3", - "aff_campus_unique": "Beijing;;Zhengzhou;Dallas", - "aff_country_unique_index": "0+0+0;0+0;0+0+0;0;1", - "aff_country_unique": "China;United States" + "aff_unique_index": "0+1;0+1;0+1;0;3", + "aff_unique_norm": "Beihang University;Zhongguancun Laboratory;;University of Texas at Dallas", + "aff_unique_dep": "School of Computer Science and Engineering;;;Jindal School of Management", + "aff_unique_url": "http://www.buaa.edu.cn;;;https://www.jindal.utdallas.edu", + "aff_unique_abbr": "Beihang;;;UT Dallas", + "aff_campus_unique_index": "0;0;0;0;2", + "aff_campus_unique": "Beijing;;Dallas", + "aff_country_unique_index": "0+0;0+0;0+0;0;2", + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Xinghao and Liu,\n Xuefeng and Niu,\n 
Jianwei and Zhu,\n Guogang and Tang,\n Shaojie\n},\n title = {\n Bold but Cautious: Unlocking the Potential of Personalized Federated Learning through Cautiously Aggressive Collaboration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19375-19384\n} \n}" }, { "title": "Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via Geometry-Guided Cross-View Transformer", @@ -5993,7 +6196,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Yujiao and Wu,\n Fei and Perincherry,\n Akhil and Vora,\n Ankit and Li,\n Hongdong\n},\n title = {\n Boosting 3-DoF Ground-to-Satellite Camera Localization Accuracy via Geometry-Guided Cross-View Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21516-21526\n} \n}" }, { "title": "Boosting Adversarial Transferability via Gradient Relevance Attack", @@ -6025,7 +6229,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenyang;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Hegui and Ren,\n Yuchen and Sui,\n Xiaoyan and Yang,\n Lianping and Jiang,\n Wuming\n},\n title = {\n Boosting Adversarial Transferability via Gradient Relevance Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4741-4750\n} \n}" }, { "title": "Boosting Few-shot Action Recognition with Graph-guided Hybrid Matching", @@ -6050,14 +6255,15 @@ 
"author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xing_Boosting_Few-shot_Action_Recognition_with_Graph-guided_Hybrid_Matching_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;1+2;3;0", - "aff_unique_norm": "Zhejiang University;SGIT AI Lab;State Grid Corporation of China;Baidu", - "aff_unique_dep": ";AI Lab;;Baidu Inc.", + "aff_unique_norm": "Zhejiang University;SGIT AI Lab;State Grid Corporation of China;Baidu Inc.", + "aff_unique_dep": ";AI Lab;;", "aff_unique_url": "https://www.zju.edu.cn;;http://www.sgcc.com.cn;https://www.baidu.com", "aff_unique_abbr": "ZJU;;SGCC;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Xing_2023_ICCV,\n \n author = {\n Xing,\n Jiazheng and Wang,\n Mengmeng and Ruan,\n Yudi and Chen,\n Bofan and Guo,\n Yaowei and Mu,\n Boyu and Dai,\n Guang and Wang,\n Jingdong and Liu,\n Yong\n},\n title = {\n Boosting Few-shot Action Recognition with Graph-guided Hybrid Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1740-1750\n} \n}" }, { "title": "Boosting Long-tailed Object Detection via Step-wise Learning on Smooth-tail Data", @@ -6089,7 +6295,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0+1;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Na and Zhang,\n Yongqiang and Ding,\n Mingli and Lee,\n Gim Hee\n},\n title = {\n Boosting Long-tailed Object Detection via Step-wise Learning on Smooth-tail Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
6940-6949\n} \n}" }, { "title": "Boosting Multi-modal Model Performance with Adaptive Gradient Modulation", @@ -6121,7 +6328,8 @@ "aff_campus_unique_index": "0;0;2+2;0;2;2+2", "aff_campus_unique": "Shanghai;;Hefei", "aff_country_unique_index": "0+0;0;0+0+0;0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Hong and Li,\n Xingyu and Hu,\n Pengbo and Lei,\n Yinuo and Li,\n Chunxiao and Zhou,\n Yi\n},\n title = {\n Boosting Multi-modal Model Performance with Adaptive Gradient Modulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22214-22224\n} \n}" }, { "title": "Boosting Novel Category Discovery Over Domains with Soft Contrastive Learning and All in One Classifier", @@ -6153,7 +6361,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zang_2023_ICCV,\n \n author = {\n Zang,\n Zelin and Shang,\n Lei and Yang,\n Senqiao and Wang,\n Fei and Sun,\n Baigui and Xie,\n Xuansong and Li,\n Stan Z.\n},\n title = {\n Boosting Novel Category Discovery Over Domains with Soft Contrastive Learning and All in One Classifier\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11858-11867\n} \n}" }, { "title": "Boosting Positive Segments for Weakly-Supervised Audio-Visual Video Parsing", @@ -6185,7 +6394,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Rachavarapu_2023_ICCV,\n \n author = {\n Rachavarapu,\n Kranthi Kumar and N.,\n Rajagopalan A.\n},\n title = {\n 
Boosting Positive Segments for Weakly-Supervised Audio-Visual Video Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10192-10202\n} \n}" }, { "title": "Boosting Semantic Segmentation from the Perspective of Explicit Class Embeddings", @@ -6197,7 +6407,7 @@ "author": "Yuhe Liu; Chuanjian Liu; Kai Han; Quan Tang; Zengchang Qin", "abstract": "Semantic segmentation is a computer vision task that associates a label with each pixel in an image. Modern approaches tend to introduce class embeddings into semantic segmentation for deeply utilizing category semantics, and regard supervised class masks as final predictions. In this paper, we explore the mechanism of class embeddings and have an insight that more explicit and meaningful class embeddings can be generated based on class masks purposely. Following this observation, we propose ECENet, a new segmentation paradigm, in which class embeddings are obtained and enhanced explicitly during interacting with multi-stage image features. Based on this, we revisit the traditional decoding process and explore inverted information flow between segmentation masks and class embeddings. Furthermore, to ensure the discriminability and informativity of features from backbone, we propose a Feature Reconstruction module, which combines intrinsic and diverse branches together to ensure the concurrence of diversity and redundancy in features. Experiments show that our ECENet outperforms its counterparts on the ADE20K dataset with much less computational cost and achieves new state-of-the-art results on PASCAL-Context dataset. 
The code will be released at https://gitee.com/mindspore/models and https://github.com/Carol-lyh/ECENet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_Boosting_Semantic_Segmentation_from_the_Perspective_of_Explicit_Class_Embeddings_ICCV_2023_paper.pdf", - "aff": "Beihang University+Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; South China University of Technology; Beihang University+Huawei Noah\u2019s Ark Lab", + "aff": "Beihang University+Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; South China University of Technology; Beihang University+Huawei Noah’s Ark Lab", "project": "https://gitee.com/mindspore/models", "github": "https://github.com/Carol-lyh/ECENet", "supp": "", @@ -6211,13 +6421,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Boosting_Semantic_Segmentation_from_the_Perspective_of_Explicit_Class_Embeddings_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;2;0+1", "aff_unique_norm": "Beihang University;Huawei;South China University of Technology", - "aff_unique_dep": ";Noah\u2019s Ark Lab;", + "aff_unique_dep": ";Noah’s Ark Lab;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.huawei.com;https://www.scut.edu.cn", "aff_unique_abbr": "BUAA;Huawei;SCUT", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yuhe and Liu,\n Chuanjian and Han,\n Kai and Tang,\n Quan and Qin,\n Zengchang\n},\n title = {\n Boosting Semantic Segmentation from the Perspective of Explicit Class Embeddings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 821-831\n} \n}" }, { "title": "Boosting Single Image Super-Resolution via Partial Channel Shifting", @@ -6242,14 +6453,15 @@ 
"author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Boosting_Single_Image_Super-Resolution_via_Partial_Channel_Shifting_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Southwest Jiao Tong University", + "aff_unique_norm": "Southwest Jiaotong University", "aff_unique_dep": "", "aff_unique_url": "https://www.swjtu.edu.cn", "aff_unique_abbr": "SWJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xiaoming and Li,\n Tianrui and Zhao,\n Xiaole\n},\n title = {\n Boosting Single Image Super-Resolution via Partial Channel Shifting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13223-13232\n} \n}" }, { "title": "Boosting Whole Slide Image Classification from the Perspectives of Distribution, Correlation and Magnification", @@ -6281,7 +6493,8 @@ "aff_campus_unique_index": ";;;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qu_2023_ICCV,\n \n author = {\n Qu,\n Linhao and Yang,\n Zhiwei and Duan,\n Minghong and Ma,\n Yingfan and Wang,\n Shuo and Wang,\n Manning and Song,\n Zhijian\n},\n title = {\n Boosting Whole Slide Image Classification from the Perspectives of Distribution,\n Correlation and Magnification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21463-21473\n} \n}" }, { "title": "Bootstrap Motion Forecasting With Self-Consistent Constraints", @@ -6313,7 +6526,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", 
"aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Maosheng and Xu,\n Jiamiao and Xu,\n Xunnong and Wang,\n Tengfei and Cao,\n Tongyi and Chen,\n Qifeng\n},\n title = {\n Bootstrap Motion Forecasting With Self-Consistent Constraints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8504-8514\n} \n}" }, { "title": "Borrowing Knowledge From Pre-trained Language Model: A New Data-efficient Visual Learning Paradigm", @@ -6325,7 +6539,7 @@ "author": "Wenxuan Ma; Shuang Li; JinMing Zhang; Chi Harold Liu; Jingxuan Kang; Yulin Wang; Gao Huang", "abstract": "The development of vision models for real-world applications is hindered by the challenge of annotated data scarcity, which has necessitated the adoption of data-efficient visual learning techniques such as semi-supervised learning. Unfortunately, the prevalent cross-entropy supervision is limited by its focus on category discrimination while disregarding the semantic connection between concepts, which ultimately results in the suboptimal exploitation of scarce labeled data. To address this issue, this paper presents a novel approach that seeks to leverage linguistic knowledge for data-efficient visual learning. The proposed approac, BorLan, Borrows knowledge from off-the-shelf pretrained Language models that are already endowed with rich semantics extracted from large corpora, to compensate the semantic deficiency due to limited annotation in visual training. Specifically, we design a distribution alignment objective, which guides the vision model to learn both semantic-aware and domain-agnostic representations for the task through linguistic knowledge. One significant advantage of this paradigm is its flexibility in combining various visual and linguistic models. 
Extensive experiments on semi-supervised learning, single domain generalization and few-shot learning validate its effectiveness.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ma_Borrowing_Knowledge_From_Pre-trained_Language_Model_A_New_Data-efficient_Visual_ICCV_2023_paper.pdf", - "aff": "Beijing Institute of Technology; Beijing Institute of Technology\u2021; Beijing Institute of Technology; Beijing Institute of Technology; University of Liverpool; Tsinghua University; Tsinghua University", + "aff": "Beijing Institute of Technology; Beijing Institute of Technology‡; Beijing Institute of Technology; Beijing Institute of Technology; University of Liverpool; Tsinghua University; Tsinghua University", "project": "", "github": "https://github.com/BIT-DA/BorLan", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Ma_Borrowing_Knowledge_From_ICCV_2023_supplemental.pdf", @@ -6345,7 +6559,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Wenxuan and Li,\n Shuang and Zhang,\n JinMing and Liu,\n Chi Harold and Kang,\n Jingxuan and Wang,\n Yulin and Huang,\n Gao\n},\n title = {\n Borrowing Knowledge From Pre-trained Language Model: A New Data-efficient Visual Learning Paradigm\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18786-18797\n} \n}" }, { "title": "Both Diverse and Realism Matter: Physical Attribute and Style Alignment for Rainy Image Generation", @@ -6372,12 +6587,13 @@ "aff_unique_index": "0;1+2;0;3;0", "aff_unique_norm": "Huazhong University of Science and Technology;Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;Fudan University", "aff_unique_dep": "School of 
Artificial Intelligence and Automation;;;AI3Institute", - "aff_unique_url": "http://www.hust.edu.cn;https://www.cmu.edu;https://mbzuai.ac.ae;https://www.fudan.edu.cn", + "aff_unique_url": "http://www.hust.edu.cn;https://www.cmu.edu;https://www.mbzuai.ac.ae;https://www.fudan.edu.cn", "aff_unique_abbr": "HUST;CMU;MBZUAI;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+2;0;0;0", - "aff_country_unique": "China;United States;United Arab Emirates" + "aff_country_unique": "China;United States;United Arab Emirates", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Changfeng and Chen,\n Shiming and Chang,\n Yi and Song,\n Yibing and Yan,\n Luxin\n},\n title = {\n Both Diverse and Realism Matter: Physical Attribute and Style Alignment for Rainy Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12387-12397\n} \n}" }, { "title": "Boundary-Aware Divide and Conquer: A Diffusion-Based Solution for Unsupervised Shadow Removal", @@ -6402,14 +6618,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Boundary-Aware_Divide_and_Conquer_A_Diffusion-Based_Solution_for_Unsupervised_Shadow_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Nanyang Technological University;Pengcheng Laboratory", - "aff_unique_dep": ";Peng Cheng Laboratory", + "aff_unique_norm": "Nanyang Technological University;Peng Cheng Laboratory", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;", "aff_unique_abbr": "NTU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Lanqing and Wang,\n Chong and Yang,\n Wenhan and Wang,\n Yufei and Wen,\n 
Bihan\n},\n title = {\n Boundary-Aware Divide and Conquer: A Diffusion-Based Solution for Unsupervised Shadow Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13045-13054\n} \n}" }, { "title": "Box-based Refinement for Weakly Supervised and Unsupervised Localization Tasks", @@ -6441,7 +6658,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Gomel_2023_ICCV,\n \n author = {\n Gomel,\n Eyal and Shaharbany,\n Tal and Wolf,\n Lior\n},\n title = {\n Box-based Refinement for Weakly Supervised and Unsupervised Localization Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16044-16054\n} \n}" }, { "title": "BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained Diffusion", @@ -6473,7 +6691,8 @@ "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;1;1;1+2;1;1;0", - "aff_country_unique": "Singapore;China;Saudi Arabia" + "aff_country_unique": "Singapore;China;Saudi Arabia", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Jinheng and Li,\n Yuexiang and Huang,\n Yawen and Liu,\n Haozhe and Zhang,\n Wentian and Zheng,\n Yefeng and Shou,\n Mike Zheng\n},\n title = {\n BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7452-7461\n} \n}" }, { "title": "BoxSnake: Polygonal Instance Segmentation with Box Supervision", @@ -6505,7 +6724,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", 
"aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Rui and Song,\n Lin and Ge,\n Yixiao and Li,\n Xiu\n},\n title = {\n BoxSnake: Polygonal Instance Segmentation with Box Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 766-776\n} \n}" }, { "title": "Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of Synthetic and Compositional Images", @@ -6530,14 +6750,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bitton-Guetta_Breaking_Common_Sense_WHOOPS_A_Vision-and-Language_Benchmark_of_Synthetic_and_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0;1;1", - "aff_unique_norm": "Ben Gurion University of the Negev;Hebrew University of Jerusalem;Allen Institute for Artificial Intelligence;University of Washington", + "aff_unique_norm": "Ben Gurion University of the Negev;The Hebrew University of Jerusalem;Allen Institute for Artificial Intelligence;University of Washington", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.bgu.ac.il;https://www.huji.ac.il;https://allenai.org;https://www.washington.edu", "aff_unique_abbr": "BGU;HUJI;AI2;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0;0", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Bitton-Guetta_2023_ICCV,\n \n author = {\n Bitton-Guetta,\n Nitzan and Bitton,\n Yonatan and Hessel,\n Jack and Schmidt,\n Ludwig and Elovici,\n Yuval and Stanovsky,\n Gabriel and Schwartz,\n Roy\n},\n title = {\n Breaking Common Sense: WHOOPS! 
A Vision-and-Language Benchmark of Synthetic and Compositional Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2616-2627\n} \n}" }, { "title": "Breaking Temporal Consistency: Generating Video Universal Adversarial Perturbations Using Image Models", @@ -6569,7 +6790,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Hee-Seon and Son,\n Minji and Kim,\n Minbeom and Kwon,\n Myung-Joon and Kim,\n Changick\n},\n title = {\n Breaking Temporal Consistency: Generating Video Universal Adversarial Perturbations Using Image Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4325-4334\n} \n}" }, { "title": "Breaking The Limits of Text-conditioned 3D Motion Synthesis with Elaborative Descriptions", @@ -6592,7 +6814,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qian_Breaking_The_Limits_of_Text-conditioned_3D_Motion_Synthesis_with_Elaborative_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qian_Breaking_The_Limits_of_Text-conditioned_3D_Motion_Synthesis_with_Elaborative_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Qian_2023_ICCV,\n \n author = {\n Qian,\n Yijun and Urbanek,\n Jack and Hauptmann,\n Alexander G. 
and Won,\n Jungdam\n},\n title = {\n Breaking The Limits of Text-conditioned 3D Motion Synthesis with Elaborative Descriptions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2306-2316\n} \n}" }, { "title": "Bridging Cross-task Protocol Inconsistency for Distillation in Dense Object Detection", @@ -6604,7 +6827,7 @@ "author": "Longrong Yang; Xianpan Zhou; Xuewei Li; Liang Qiao; Zheyang Li; Ziwei Yang; Gaoang Wang; Xi Li", "abstract": "Knowledge distillation (KD) has shown potential for learning compact models in dense object detection. However, the commonly used softmax-based distillation ignores the absolute classification scores for individual categories. Thus, the optimum of the distillation loss does not necessarily lead to the optimal student classification scores for dense object detectors. This cross-task protocol inconsistency is critical, especially for dense object detectors, since the foreground categories are extremely imbalanced. To address the issue of protocol differences between distillation and classification, we propose a novel distillation method with cross-task consistent protocols, tailored for the dense object detection. For classification distillation, we address the cross-task protocol inconsistency problem by formulating the classification logit maps in both teacher and student models as multiple binary-classification maps and applying a binary-classification distillation loss to each map. For localization distillation, we design an IoU-based Localization Distillation Loss that is free from specific network structures and can be compared with existing localization distillation losses. 
Our proposed method is simple but effective, and experimental results demonstrate its superiority over existing methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yang_Bridging_Cross-task_Protocol_Inconsistency_for_Distillation_in_Dense_Object_Detection_ICCV_2023_paper.pdf", - "aff": "College of Computer Science & Technology, Zhejiang University; Polytechnic Institute, Zhejiang University; College of Computer Science & Technology, Zhejiang University + Hikvision Research Institute; College of Computer Science & Technology, Zhejiang University + Hikvision Research Institute; Hikvision Research Institute; Hikvision Research Institute; ZJU \u2013 UIUC Institute, Zhejiang University; College of Computer Science & Technology, Zhejiang University + Shanghai Institute for Advanced Study of Zhejiang University + Zhejiang \u2013 Singapore Innovation and AI Joint Research Lab, Hangzhou", + "aff": "College of Computer Science & Technology, Zhejiang University; Polytechnic Institute, Zhejiang University; College of Computer Science & Technology, Zhejiang University + Hikvision Research Institute; College of Computer Science & Technology, Zhejiang University + Hikvision Research Institute; Hikvision Research Institute; Hikvision Research Institute; ZJU – UIUC Institute, Zhejiang University; College of Computer Science & Technology, Zhejiang University + Shanghai Institute for Advanced Study of Zhejiang University + Zhejiang – Singapore Innovation and AI Joint Research Lab, Hangzhou", "project": "", "github": "https://github.com/TinyTigerPan/BCKD", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Yang_Bridging_Cross-task_Protocol_Inconsistency_for_Distillation_in_Dense_Object_Detection_ICCV_2023_supplemental.pdf", @@ -6624,7 +6847,8 @@ "aff_campus_unique_index": "1;;;2+1", "aff_campus_unique": ";Hangzhou;Shanghai", "aff_country_unique_index": "0;0;0+0;0+0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + 
"bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Longrong and Zhou,\n Xianpan and Li,\n Xuewei and Qiao,\n Liang and Li,\n Zheyang and Yang,\n Ziwei and Wang,\n Gaoang and Li,\n Xi\n},\n title = {\n Bridging Cross-task Protocol Inconsistency for Distillation in Dense Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17175-17184\n} \n}" }, { "title": "Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation", @@ -6647,7 +6871,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Bridging_Vision_and_Language_Encoders_Parameter-Efficient_Tuning_for_Referring_Image_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Bridging_Vision_and_Language_Encoders_Parameter-Efficient_Tuning_for_Referring_Image_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Zunnan and Chen,\n Zhihong and Zhang,\n Yong and Song,\n Yibing and Wan,\n Xiang and Li,\n Guanbin\n},\n title = {\n Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17503-17512\n} \n}" }, { "title": "Bring Clipart to Life", @@ -6679,7 +6904,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Nanxuan and Dang,\n Shengqi and Lin,\n Hexun and Shi,\n Yang and Cao,\n Nan\n},\n title = {\n Bring Clipart to Life\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23341-23350\n} \n}" }, { "title": "Building Bridge Across the Time: Disruption and Restoration of Murals In the Wild", @@ -6687,6 +6913,7 @@ "status": "Poster", "track": "main", "pid": "9367", + "author_site": "Huiyang Shao, Qianqian Xu, Peisong Wen, Peifeng Gao, Zhiyong Yang, Qingming Huang", "author": "Huiyang Shao, Qianqian Xu, Peisong Wen, Peifeng Gao, Zhiyong Yang, Qingming Huang", "abstract": "In this paper, we focus on the mural-restoration task, which aims to detect damaged regions in the mural and repaint them automatically. Different from traditional image restoration tasks like in/out/blind-painting and image renovation, the corrupted mural suffers from more complicated degradation. However, existing mural-restoration methods and datasets still focus on simple degradation like masking. Such a significant gap prevents mural-restoration from being applied to real scenarios. To fill this gap, in this work, we propose a systematic framework to simulate the physical process for damaged murals and provide a new benchmark dataset for mural-restoration. Limited by the simplification of the data synthesis process, the previous mural-restoration methods suffer from poor performance in our proposed dataset. To handle this problem, we propose the Attention Diffusion Framework (ADF) for this challenging task. Within the framework, a damage attention map module is proposed to estimate the damage extent. Facing the diversity of defects, we propose a series of loss functions to choose repair strategies adaptively. 
Finally, experimental results support the effectiveness of the proposed framework in terms of both mural synthesis and restoration.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shao_Building_Bridge_Across_the_Time_Disruption_and_Restoration_of_Murals_ICCV_2023_paper.pdf", @@ -6698,7 +6925,8 @@ "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11131685926767149196&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shao_Building_Bridge_Across_the_Time_Disruption_and_Restoration_of_Murals_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shao_Building_Bridge_Across_the_Time_Disruption_and_Restoration_of_Murals_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Huiyang and Xu,\n Qianqian and Wen,\n Peisong and Gao,\n Peifeng and Yang,\n Zhiyong and Huang,\n Qingming\n},\n title = {\n Building Bridge Across the Time: Disruption and Restoration of Murals In the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20259-20269\n} \n}" }, { "title": "Building Vision Transformers with Hierarchy Aware Feature Aggregation", @@ -6730,7 +6958,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yongjie and Liu,\n Hongmin and Yin,\n Haoran and Fan,\n Bin\n},\n title = {\n Building Vision Transformers with Hierarchy Aware Feature Aggregation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5908-5918\n} \n}" }, { "title": "Building a Winning Team: Selecting Source Model 
Ensembles using a Submodular Transferability Estimation Approach", @@ -6762,7 +6991,8 @@ "aff_campus_unique_index": "0;0+1;0;0", "aff_campus_unique": "Hyderabad;Riverside;", "aff_country_unique_index": "0;0+1;0;1;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{B_2023_ICCV,\n \n author = {\n B,\n Vimal K and Bachu,\n Saketh and Garg,\n Tanmay and Narasimhan,\n Niveditha Lakshmi and Konuru,\n Raghavan and Balasubramanian,\n Vineeth N\n},\n title = {\n Building a Winning Team: Selecting Source Model Ensembles using a Submodular Transferability Estimation Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11609-11620\n} \n}" }, { "title": "Building3D: A Urban-Scale Dataset and Benchmarks for Learning Roof Structures from Point Clouds", @@ -6794,7 +7024,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Calgary", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ruisheng and Huang,\n Shangfeng and Yang,\n Hongxin\n},\n title = {\n Building3D: A Urban-Scale Dataset and Benchmarks for Learning Roof Structures from Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20076-20086\n} \n}" }, { "title": "C2F2NeUS: Cascade Cost Frustum Fusion for High Fidelity and Generalizable Neural Surface Reconstruction", @@ -6819,14 +7050,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_C2F2NeUS_Cascade_Cost_Frustum_Fusion_for_High_Fidelity_and_Generalizable_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;1;0", - "aff_unique_norm": "Huazhong University of Science and Technology;Tencent", - 
"aff_unique_dep": "School of Computer Science and Technology;Tencent Holdings Limited", + "aff_unique_norm": "Huazhong University of Science and Technology;Tencent Holdings Limited", + "aff_unique_dep": "School of Computer Science and Technology;", "aff_unique_url": "http://www.hust.edu.cn;https://www.tencent.com", "aff_unique_abbr": "HUST;Tencent", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Luoyuan and Guan,\n Tao and Wang,\n Yuesong and Liu,\n Wenkai and Zeng,\n Zhaojie and Wang,\n Junle and Yang,\n Wei\n},\n title = {\n C2F2NeUS: Cascade Cost Frustum Fusion for High Fidelity and Generalizable Neural Surface Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18291-18301\n} \n}" }, { "title": "C2ST: Cross-Modal Contextualized Sequence Transduction for Continuous Sign Language Recognition", @@ -6858,7 +7090,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hohhot", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Huaiwen and Guo,\n Zihang and Yang,\n Yang and Liu,\n Xin and Hu,\n De\n},\n title = {\n C2ST: Cross-Modal Contextualized Sequence Transduction for Continuous Sign Language Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21053-21062\n} \n}" }, { "title": "CAD-Estate: Large-scale CAD Model Annotation in RGB Videos", @@ -6866,8 +7099,8 @@ "status": "Poster", "track": "main", "pid": "7535", - "author_site": "Kevis-Kokitsi Maninis, Stefan Popov, Matthias Nie\u00dfner, 
Vittorio Ferrari", - "author": "Kevis-Kokitsi Maninis; Stefan Popov; Matthias Nie\u00dfner; Vittorio Ferrari", + "author_site": "Kevis-Kokitsi Maninis, Stefan Popov, Matthias Nießner, Vittorio Ferrari", + "author": "Kevis-Kokitsi Maninis; Stefan Popov; Matthias Nießner; Vittorio Ferrari", "abstract": "We propose a method for annotating videos of complex multi-object scenes with a globally-consistent 3D representation of the objects. We annotate each object with a CAD model from a database, and place it in the 3D coordinate frame of the scene with a 9-DoF pose transformation. Our method is semi-automatic and works on commonly-available RGB videos, without requiring a depth sensor. Many steps are performed automatically, and the tasks performed by humans are simple, well-specified, and require only limited reasoning in 3D. This makes them feasible for crowd-sourcing and has allowed us to construct a large-scale dataset by annotating real-estate videos from YouTube. Our dataset CAD-Estate offers 101k instances of 12k unique CAD models placed in the 3D representations of 20k videos. In comparison to Scan2CAD, the largest existing dataset with CAD model annotations on real \n scenes, CAD-Estate has 7x more instances and 4x more unique CAD models. We showcase the benefits of pre-training a Mask2CAD model on CAD-Estate for the task of automatic \n 3D object reconstruction and pose estimation, demonstrating that it leads to performance improvements on the popular Scan2CAD benchmark. 
The dataset is available at https://github.com/google-research/cad-estate.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Maninis_CAD-Estate_Large-scale_CAD_Model_Annotation_in_RGB_Videos_ICCV_2023_paper.pdf", "aff": "Google Research; Google Research; TUM; Google Research", @@ -6883,14 +7116,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Maninis_CAD-Estate_Large-scale_CAD_Model_Annotation_in_RGB_Videos_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Google;Technische Universit\u00e4t M\u00fcnchen", + "aff_unique_norm": "Google;Technische Universität München", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.tum.de", "aff_unique_abbr": "Google Research;TUM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Maninis_2023_ICCV,\n \n author = {\n Maninis,\n Kevis-Kokitsi and Popov,\n Stefan and Nie{\\ss\n}ner,\n Matthias and Ferrari,\n Vittorio\n},\n title = {\n CAD-Estate: Large-scale CAD Model Annotation in RGB Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20189-20199\n} \n}" }, { "title": "CAFA: Class-Aware Feature Alignment for Test-Time Adaptation", @@ -6913,7 +7147,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jung_CAFA_Class-Aware_Feature_Alignment_for_Test-Time_Adaptation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jung_CAFA_Class-Aware_Feature_Alignment_for_Test-Time_Adaptation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Jung_2023_ICCV,\n \n author = {\n Jung,\n Sanghun and Lee,\n Jungsoo and Kim,\n Nanhee 
and Shaban,\n Amirreza and Boots,\n Byron and Choo,\n Jaegul\n},\n title = {\n CAFA: Class-Aware Feature Alignment for Test-Time Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19060-19071\n} \n}" }, { "title": "CAME: Contrastive Automated Model Evaluation", @@ -6936,7 +7171,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Peng_CAME_Contrastive_Automated_Model_Evaluation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Peng_CAME_Contrastive_Automated_Model_Evaluation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Ru and Duan,\n Qiuyang and Wang,\n Haobo and Ma,\n Jiachen and Jiang,\n Yanbo and Tu,\n Yongjun and Jiang,\n Xiu and Zhao,\n Junbo\n},\n title = {\n CAME: Contrastive Automated Model Evaluation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20121-20132\n} \n}" }, { "title": "CASSPR: Cross Attention Single Scan Place Recognition", @@ -6944,8 +7180,8 @@ "status": "Poster", "track": "main", "pid": "4208", - "author_site": "Yan Xia, Mariia Gladkova, Rui Wang, Qianyun Li, Uwe Stilla, Jo\u00e3o F Henriques, Daniel Cremers", - "author": "Yan Xia; Mariia Gladkova; Rui Wang; Qianyun Li; Uwe Stilla; Jo\u00e3o F Henriques; Daniel Cremers", + "author_site": "Yan Xia, Mariia Gladkova, Rui Wang, Qianyun Li, Uwe Stilla, João F Henriques, Daniel Cremers", + "author": "Yan Xia; Mariia Gladkova; Rui Wang; Qianyun Li; Uwe Stilla; João F Henriques; Daniel Cremers", "abstract": "Place recognition based on point clouds (LiDAR) is an important component for autonomous robots or self-driving vehicles. 
Current SOTA performance is achieved on accumulated LiDAR submaps using either point-based or voxel-based structures. While voxel-based approaches nicely integrate spatial context across multiple scales, they do not exhibit the local precision of point-based methods. As a result, existing methods struggle with fine-grained matching of subtle geometric features in sparse single-shot LiDAR scans. To overcome these limitations, we propose CASSPR as a method to fuse point-based and voxel-based approaches using cross attention transformers. CASSPR leverages a sparse voxel branch for extracting and aggregating information at lower resolution and a point-wise branch for obtaining fine-grained local information. CASSPR uses queries from one branch to try to match structures in the other branch, ensuring that both extract self-contained descriptors of the point cloud (rather than one branch dominating), but using both to inform the output global descriptor of the point cloud. Extensive experiments show that CASSPR surpasses the state-of-the-art by a large margin on several datasets (Oxford RobotCar, TUM, USyd). For instance, it achieves AR@1 of 85.6% on the TUM dataset, surpassing the strongest prior model by 15%. 
Our code is publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Xia_CASSPR_Cross_Attention_Single_Scan_Place_Recognition_ICCV_2023_paper.pdf", "aff": "Technical University of Munich+Munich Center for Machine Learning (MCML)+Visual Geometry Group, University of Oxford; Technical University of Munich+Munich Data Science Institute; Microsoft Zurich; Technical University of Munich; Technical University of Munich+Munich Center for Machine Learning (MCML)+Munich Data Science Institute; Visual Geometry Group, University of Oxford; Technical University of Munich+Munich Center for Machine Learning (MCML)+Visual Geometry Group, University of Oxford+Munich Data Science Institute", @@ -6961,14 +7197,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xia_CASSPR_Cross_Attention_Single_Scan_Place_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0+3;4;0;0+1+3;2;0+1+2+3", - "aff_unique_norm": "Technical University of Munich;Munich Center for Machine Learning;University of Oxford;Munich Data Science Institute;Microsoft", - "aff_unique_dep": ";Center for Machine Learning;Visual Geometry Group;;Microsoft Corporation", + "aff_unique_norm": "Technical University of Munich;Munich Center for Machine Learning;University of Oxford;Munich Data Science Institute;Microsoft Corporation", + "aff_unique_dep": ";Center for Machine Learning;Visual Geometry Group;;", "aff_unique_url": "https://www.tum.de;https://www.munich-center-for-machine-learning.de;https://www.ox.ac.uk;https://www.mdsi.de;https://www.microsoft.com/en-gb", "aff_unique_abbr": "TUM;MCML;Oxford;;Microsoft", "aff_campus_unique_index": "1+2;;3;1;2;1+2", "aff_campus_unique": ";Munich;Oxford;Zurich", "aff_country_unique_index": "0+0+1;0+0;2;0;0+0+0;1;0+0+1+0", - "aff_country_unique": "Germany;United Kingdom;Switzerland" + "aff_country_unique": "Germany;United Kingdom;Switzerland", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Yan and 
Gladkova,\n Mariia and Wang,\n Rui and Li,\n Qianyun and Stilla,\n Uwe and Henriques,\n Jo\\~ao F and Cremers,\n Daniel\n},\n title = {\n CASSPR: Cross Attention Single Scan Place Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8461-8472\n} \n}" }, { "title": "CBA: Improving Online Continual Learning via Continual Bias Adaptor", @@ -6980,7 +7217,7 @@ "author": "Quanziang Wang; Renzhen Wang; Yichen Wu; Xixi Jia; Deyu Meng", "abstract": "Online continual learning (CL) aims to learn new knowledge and consolidate previously learned knowledge from non-stationary data streams. Due to the time-varying training setting, the model learned from a changing distribution easily forgets the previously learned knowledge and biases towards the newly received task. To address this problem, we propose a Continual Bias Adaptor (CBA) module to augment the classifier network to adapt to catastrophic distribution change during training, such that the classifier network is able to learn a stable consolidation of previously learned tasks. In the testing stage, CBA can be removed which introduces no additional computation cost and memory overhead. 
We theoretically reveal the reason why the proposed method can effectively alleviate catastrophic distribution shifts, and empirically demonstrate its effectiveness through extensive experiments based on four rehearsal-based baselines and three public continual learning benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_CBA_Improving_Online_Continual_Learning_via_Continual_Bias_Adaptor_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; City University of Hong Kong; Xidian University; Xi\u2019an Jiaotong University+Macau University of Science and Technology", + "aff": "Xi’an Jiaotong University; Xi’an Jiaotong University; City University of Hong Kong; Xidian University; Xi’an Jiaotong University+Macau University of Science and Technology", "project": "", "github": "https://github.com/wqza/CBA-online-CL", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Wang_CBA_Improving_Online_ICCV_2023_supplemental.pdf", @@ -6993,14 +7230,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_CBA_Improving_Online_Continual_Learning_via_Continual_Bias_Adaptor_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0+3", - "aff_unique_norm": "Xi'an Jiao Tong University;City University of Hong Kong;Xidian University;Macau University of Science and Technology", + "aff_unique_norm": "Xi'an Jiaotong University;City University of Hong Kong;Xidian University;Macau University of Science and Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.cityu.edu.hk;http://www.xidian.edu.cn/;https://www.must.edu.mo", "aff_unique_abbr": "XJTU;CityU;Xidian;MUST", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Macau SAR", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Quanziang and Wang,\n 
Renzhen and Wu,\n Yichen and Jia,\n Xixi and Meng,\n Deyu\n},\n title = {\n CBA: Improving Online Continual Learning via Continual Bias Adaptor\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19082-19092\n} \n}" }, { "title": "CC3D: Layout-Conditioned Generation of Compositional 3D Scenes", @@ -7032,7 +7270,8 @@ "aff_campus_unique_index": "1;1;1;1;2", "aff_campus_unique": ";Stanford;Mountain View", "aff_country_unique_index": "0;1;1;0;1;1;0+1+0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Bahmani_2023_ICCV,\n \n author = {\n Bahmani,\n Sherwin and Park,\n Jeong Joon and Paschalidou,\n Despoina and Yan,\n Xingguang and Wetzstein,\n Gordon and Guibas,\n Leonidas and Tagliasacchi,\n Andrea\n},\n title = {\n CC3D: Layout-Conditioned Generation of Compositional 3D Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7171-7181\n} \n}" }, { "title": "CDAC: Cross-domain Attention Consistency in Transformer for Domain Adaptive Semantic Segmentation", @@ -7064,7 +7303,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Kaihong and Kim,\n Donghyun and Feris,\n Rogerio and Betke,\n Margrit\n},\n title = {\n CDAC: Cross-domain Attention Consistency in Transformer for Domain Adaptive Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11519-11529\n} \n}" }, { "title": "CDFSL-V: Cross-Domain Few-Shot Learning for 
Videos", @@ -7096,7 +7336,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Samarasinghe_2023_ICCV,\n \n author = {\n Samarasinghe,\n Sarinda and Rizve,\n Mamshad Nayeem and Kardan,\n Navid and Shah,\n Mubarak\n},\n title = {\n CDFSL-V: Cross-Domain Few-Shot Learning for Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11643-11652\n} \n}" }, { "title": "CDUL: CLIP-Driven Unsupervised Learning for Multi-Label Image Classification", @@ -7128,7 +7369,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "United States;Singapore" + "aff_country_unique": "United States;Singapore", + "bibtex": "@InProceedings{Abdelfattah_2023_ICCV,\n \n author = {\n Abdelfattah,\n Rabab and Guo,\n Qing and Li,\n Xiaoguang and Wang,\n Xiaofeng and Wang,\n Song\n},\n title = {\n CDUL: CLIP-Driven Unsupervised Learning for Multi-Label Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1348-1357\n} \n}" }, { "title": "CFCG: Semi-Supervised Semantic Segmentation via Cross-Fusion and Contour Guidance Supervision", @@ -7154,13 +7396,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_CFCG_Semi-Supervised_Semantic_Segmentation_via_Cross-Fusion_and_Contour_Guidance_Supervision_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Baidu", - "aff_unique_dep": "Baidu", + "aff_unique_dep": "", "aff_unique_url": "https://www.baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Shuo and He,\n Yue and Zhang,\n Weiming and Zhang,\n Wei and Tan,\n Xiao and Han,\n Junyu and Ding,\n Errui and Wang,\n Jingdong\n},\n title = {\n CFCG: Semi-Supervised Semantic Segmentation via Cross-Fusion and Contour Guidance Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16348-16358\n} \n}" }, { "title": "CGBA: Curvature-aware Geometric Black-box Attack", @@ -7192,7 +7435,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Reza_2023_ICCV,\n \n author = {\n Reza,\n Md Farhamdur and Rahmati,\n Ali and Wu,\n Tianfu and Dai,\n Huaiyu\n},\n title = {\n CGBA: Curvature-aware Geometric Black-box Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 124-133\n} \n}" }, { "title": "CHAMPAGNE: Learning Real-world Conversation from Large-Scale Web Videos", @@ -7215,7 +7459,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_CHAMPAGNE_Learning_Real-world_Conversation_from_Large-Scale_Web_Videos_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_CHAMPAGNE_Learning_Real-world_Conversation_from_Large-Scale_Web_Videos_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Seungju and Hessel,\n Jack and Dziri,\n Nouha and Choi,\n Yejin and Yu,\n Youngjae\n},\n title = {\n CHAMPAGNE: Learning Real-world Conversation from Large-Scale Web Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15498-15509\n} \n}" }, { "title": "CHORD: Category-level Hand-held Object Reconstruction via Shape Deformation", @@ -7247,7 +7492,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Kailin and Yang,\n Lixin and Zhen,\n Haoyu and Lin,\n Zenan and Zhan,\n Xinyu and Zhong,\n Licheng and Xu,\n Jian and Wu,\n Kejian and Lu,\n Cewu\n},\n title = {\n CHORD: Category-level Hand-held Object Reconstruction via Shape Deformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9444-9454\n} \n}" }, { "title": "CHORUS : Learning Canonicalized 3D Human-Object Spatial Relations from Unbounded Synthesized Images", @@ -7279,7 +7525,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Sookwan and Joo,\n Hanbyul\n},\n title = {\n CHORUS : Learning Canonicalized 3D Human-Object Spatial Relations from Unbounded Synthesized Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15835-15846\n} \n}" }, { "title": "CIRI: Curricular Inactivation for Residue-aware One-shot Video Inpainting", @@ -7306,12 +7553,13 @@ "aff_unique_index": "0;0;0+1+2+3;4;5", "aff_unique_norm": "South China University of Technology;State Key Laboratory of Subtropical Building Science;Ministry of Education;Guangdong Provincial Key Lab of Computational Intelligence and Cyberspace Information;Fuzhou University;Singapore Management 
University", "aff_unique_dep": ";;Key Laboratory of Big Data and Intelligent Robot;Computational Intelligence and Cyberspace Information;;", - "aff_unique_url": "https://www.scut.edu.cn;;;;https://www.fznu.edu.cn;https://www.smu.edu.sg", + "aff_unique_url": "https://www.scut.edu.cn;;;;https://www.fzu.edu.cn;https://www.smu.edu.sg", "aff_unique_abbr": "SCUT;;;;FZU;SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0+0+0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Weiying and Xu,\n Cheng and Xu,\n Xuemiao and Liu,\n Wenxi and He,\n Shengfeng\n},\n title = {\n CIRI: Curricular Inactivation for Residue-aware One-shot Video Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13012-13022\n} \n}" }, { "title": "CL-MVSNet: Unsupervised Multi-View Stereo with Dual-Level Contrastive Learning", @@ -7336,14 +7584,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiong_CL-MVSNet_Unsupervised_Multi-View_Stereo_with_Dual-Level_Contrastive_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0;0+2+3", - "aff_unique_norm": "Peking University;University of Birmingham;Pengcheng Laboratory;Migu Culture Technology", - "aff_unique_dep": "School of Electronic and Computer Engineering;School of Computer Science;Peng Cheng Laboratory;Technology", + "aff_unique_norm": "Peking University;University of Birmingham;Peng Cheng Laboratory;Migu Culture Technology", + "aff_unique_dep": "School of Electronic and Computer Engineering;School of Computer Science;;Technology", "aff_unique_url": "http://www.pku.edu.cn;https://www.birmingham.ac.uk;http://www.pcl.ac.cn;http://www.migu.cn", "aff_unique_abbr": "PKU;UoB;PCL;Migu", "aff_campus_unique_index": "1;", 
"aff_campus_unique": ";Birmingham", "aff_country_unique_index": "0;0;0;0;1;0;0+0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Xiong_2023_ICCV,\n \n author = {\n Xiong,\n Kaiqiang and Peng,\n Rui and Zhang,\n Zhe and Feng,\n Tianxing and Jiao,\n Jianbo and Gao,\n Feng and Wang,\n Ronggang\n},\n title = {\n CL-MVSNet: Unsupervised Multi-View Stereo with Dual-Level Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3769-3780\n} \n}" }, { "title": "CLIP-Cluster: CLIP-Guided Attribute Hallucination for Face Clustering", @@ -7351,6 +7600,7 @@ "status": "Poster", "track": "main", "pid": "3054", + "author_site": "Shuai Shen, Wanhua Li, Xiaobing Wang, Dafeng Zhang, Zhezhu Jin, Jie Zhou, Jiwen Lu", "author": "Shuai Shen, Wanhua Li, Xiaobing Wang, Dafeng Zhang, Zhezhu Jin, Jie Zhou, Jiwen Lu", "abstract": "One of the most important yet rarely studied challenges for supervised face clustering is the large intra-class variance caused by different face attributes such as age, pose, and expression. Images of the same identity but with different face attributes usually tend to be clustered into different sub-clusters. For the first time, we proposed an attribute hallucination framework named CLIP-Cluster to address this issue, which first hallucinates multiple representations for different attributes with the powerful CLIP model and then pools them by learning neighbor-adaptive attention. Specifically, CLIP-Cluster first introduces a text-driven attribute hallucination module, which allows one to use natural language as the interface to hallucinate novel attributes for a given face image based on the well-aligned image-language CLIP space. 
Furthermore, we develop a neighbor-aware proxy generator that fuses the features describing various attributes into a proxy feature to build a bridge among different sub-clusters and reduce the intra-class variance. The proxy feature is generated by adaptively attending to the hallucinated visual features and the source one based on the local neighbor information. On this basis, a graph built with the proxy representations is used for subsequent clustering operations. Extensive experiments show our proposed approach outperforms state-of-the-art face clustering methods with high inference efficiency.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shen_CLIP-Cluster_CLIP-Guided_Attribute_Hallucination_for_Face_Clustering_ICCV_2023_paper.pdf", @@ -7362,7 +7612,8 @@ "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9041835060050253916&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shen_CLIP-Cluster_CLIP-Guided_Attribute_Hallucination_for_Face_Clustering_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shen_CLIP-Cluster_CLIP-Guided_Attribute_Hallucination_for_Face_Clustering_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Shen_2023_ICCV,\n \n author = {\n Shen,\n Shuai and Li,\n Wanhua and Wang,\n Xiaobing and Zhang,\n Dafeng and Jin,\n Zhezhu and Zhou,\n Jie and Lu,\n Jiwen\n},\n title = {\n CLIP-Cluster: CLIP-Guided Attribute Hallucination for Face Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20786-20795\n} \n}" }, { "title": "CLIP-Driven Universal Model for Organ Segmentation and Tumor Detection", @@ -7387,14 +7638,15 @@ "author_num": 10, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Liu_CLIP-Driven_Universal_Model_for_Organ_Segmentation_and_Tumor_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;2;3+3;1;2+4;1", - "aff_unique_norm": "City University of Hong Kong;Johns Hopkins University;Vanderbilt University;Chinese University of Hong Kong;NVIDIA", - "aff_unique_dep": ";;;;NVIDIA Corporation", + "aff_unique_norm": "City University of Hong Kong;Johns Hopkins University;Vanderbilt University;Chinese University of Hong Kong;NVIDIA Corporation", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.jhu.edu;https://www.vanderbilt.edu;https://www.cuhk.edu.hk;https://www.nvidia.com", "aff_unique_abbr": "CityU;JHU;Vanderbilt;CUHK;NVIDIA", "aff_campus_unique_index": "0;0+2;", "aff_campus_unique": "Hong Kong SAR;;Shenzhen", "aff_country_unique_index": "0;1;1;1;1;1;0+0;1;1+1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jie and Zhang,\n Yixiao and Chen,\n Jie-Neng and Xiao,\n Junfei and Lu,\n Yongyi and A Landman,\n Bennett and Yuan,\n Yixuan and Yuille,\n Alan and Tang,\n Yucheng and Zhou,\n Zongwei\n},\n title = {\n CLIP-Driven Universal Model for Organ Segmentation and Tumor Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21152-21164\n} \n}" }, { "title": "CLIP2Point: Transfer CLIP to Point Cloud Classification with Image-Depth Pre-Training", @@ -7417,7 +7669,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_CLIP2Point_Transfer_CLIP_to_Point_Cloud_Classification_with_Image-Depth_Pre-Training_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Huang_CLIP2Point_Transfer_CLIP_to_Point_Cloud_Classification_with_Image-Depth_Pre-Training_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Tianyu and Dong,\n Bowen and Yang,\n Yunhan and Huang,\n Xiaoshui and Lau,\n Rynson W.H. and Ouyang,\n Wanli and Zuo,\n Wangmeng\n},\n title = {\n CLIP2Point: Transfer CLIP to Point Cloud Classification with Image-Depth Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22157-22167\n} \n}" }, { "title": "CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No", @@ -7442,14 +7695,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_CLIPN_for_Zero-Shot_OOD_Detection_Teaching_CLIP_to_Say_No_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Hong Kong University of Science and Technology", + "aff_unique_norm": "The Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Electronic and Computer Engineering", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Hualiang and Li,\n Yi and Yao,\n Huifeng and Li,\n Xiaomeng\n},\n title = {\n CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1802-1812\n} \n}" }, { "title": "CLIPTER: Looking at the Bigger Picture in Scene Text Recognition", @@ -7474,14 +7728,15 @@ "author_num": 8, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Aberdam_CLIPTER_Looking_at_the_Bigger_Picture_in_Scene_Text_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1;0;0;0;0", - "aff_unique_norm": "Amazon;Technion - Israel Institute of Technology", + "aff_unique_norm": "Amazon Web Services;Technion - Israel Institute of Technology", "aff_unique_dep": "AWS AI Labs;", "aff_unique_url": "https://aws.amazon.com;https://www.technion.ac.il/en/", "aff_unique_abbr": "AWS;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;0;0;0", - "aff_country_unique": "United States;Israel" + "aff_country_unique": "United States;Israel", + "bibtex": "@InProceedings{Aberdam_2023_ICCV,\n \n author = {\n Aberdam,\n Aviad and Bensaid,\n David and Golts,\n Alona and Ganz,\n Roy and Nuriel,\n Oren and Tichauer,\n Royee and Mazor,\n Shai and Litman,\n Ron\n},\n title = {\n CLIPTER: Looking at the Bigger Picture in Scene Text Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21706-21717\n} \n}" }, { "title": "CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for Multimodal Machine Translation", @@ -7506,14 +7761,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gupta_CLIPTrans_Transferring_Visual_Knowledge_with_Pre-trained_Models_for_Multimodal_Machine_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;3;3;0", - "aff_unique_norm": "Boston College;Birla Institute of Technology and Science, Pilani;Microsoft;Harvard University", - "aff_unique_dep": ";;Microsoft Corporation;", + "aff_unique_norm": "Boston College;Birla Institute of Technology and Science, Pilani;Microsoft Corporation;Harvard University", + "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.bostoncollege.edu;https://www.bits-pilani.ac.in;https://www.microsoft.com/en-in;https://www.harvard.edu", "aff_unique_abbr": "BC;BITS Pilani;Microsoft;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pilani", "aff_country_unique_index": "0+1;1;0;0;0;0", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Gupta_2023_ICCV,\n \n author = {\n Gupta,\n Devaansh and Kharbanda,\n Siddhant and Zhou,\n Jiawei and Li,\n Wanhua and Pfister,\n Hanspeter and Wei,\n Donglai\n},\n title = {\n CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for Multimodal Machine Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2875-2886\n} \n}" }, { "title": "CLIPascene: Scene Sketching with Different Types and Levels of Abstraction", @@ -7545,7 +7801,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Vinker_2023_ICCV,\n \n author = {\n Vinker,\n Yael and Alaluf,\n Yuval and Cohen-Or,\n Daniel and Shamir,\n Ariel\n},\n title = {\n CLIPascene: Scene Sketching with Different Types and Levels of Abstraction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4146-4156\n} \n}" }, { "title": "CLNeRF: Continual Learning Meets NeRF", @@ -7553,8 +7810,8 @@ "status": "Poster", "track": "main", "pid": "2987", - "author_site": "Zhipeng Cai, Matthias M\u00fcller", - "author": "Zhipeng Cai; Matthias M\u00fcller", + "author_site": "Zhipeng Cai, Matthias Müller", + "author": "Zhipeng Cai; Matthias Müller", "abstract": "Novel view synthesis aims to render unseen views given a set of calibrated images. 
In practical applications, the coverage, appearance or geometry of the scene may change over time, with new images continuously being captured. Efficiently incorporating such continuous change is an open challenge. Standard NeRF benchmarks only involve scene coverage expansion. To study other practical scene changes, we propose a new dataset, World Across Time (WAT), consisting of scenes that change in appearance and geometry over time. We also propose a simple yet effective method, CLNeRF, which introduces continual learning (CL) to Neural Radiance Fields (NeRFs). CLNeRF combines generative replay and the Instant Neural Graphics Primitives (NGP) architecture to effectively prevent catastrophic forgetting and efficiently update the model when new data arrives. We also add trainable appearance and geometry embeddings to NGP, allowing a single compact model to handle complex scene changes. Without the need to store historical images, CLNeRF trained sequentially over multiple scans of a changing scene performs on-par with the upper bound model trained on all scans at once. Compared to other CL baselines CLNeRF performs much better across standard benchmarks and WAT. 
The source code, a demo, and the WAT dataset are available at https://github.com/IntelLabs/CLNeRF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Cai_CLNeRF_Continual_Learning_Meets_NeRF_ICCV_2023_paper.pdf", "aff": "Intel Labs; Intel Labs", @@ -7570,14 +7827,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_CLNeRF_Continual_Learning_Meets_NeRF_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Intel", + "aff_unique_norm": "Intel Corporation", "aff_unique_dep": "Intel Labs", "aff_unique_url": "https://www.intel.com", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Zhipeng and M\\"uller,\n Matthias\n},\n title = {\n CLNeRF: Continual Learning Meets NeRF\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23185-23194\n} \n}" }, { "title": "CLR: Channel-wise Lightweight Reprogramming for Continual Learning", @@ -7609,7 +7867,8 @@ "aff_campus_unique_index": "0;0;0;1;1;0", "aff_campus_unique": "Los Angeles;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ge_2023_ICCV,\n \n author = {\n Ge,\n Yunhao and Li,\n Yuecheng and Ni,\n Shuo and Zhao,\n Jiaping and Yang,\n Ming-Hsuan and Itti,\n Laurent\n},\n title = {\n CLR: Channel-wise Lightweight Reprogramming for Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18798-18808\n} \n}" }, { "title": "CMDA: Cross-Modality Domain Adaptation for Nighttime Semantic Segmentation", 
@@ -7641,7 +7900,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Ruihao and Zhao,\n Chaoqiang and Zheng,\n Meng and Wu,\n Ziyan and Sun,\n Qiyu and Tang,\n Yang\n},\n title = {\n CMDA: Cross-Modality Domain Adaptation for Nighttime Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21572-21581\n} \n}" }, { "title": "CO-Net: Learning Multiple Point Cloud Tasks at Once with A Cohesive Network", @@ -7673,7 +7933,8 @@ "aff_campus_unique_index": "0;0;0;0;0;2;0", "aff_campus_unique": "Harbin;;Zhengzhou", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Tao and Wang,\n Ke and Lu,\n Siyi and Zhang,\n Yukun and Dai,\n Kun and Li,\n Xiaoyu and Xu,\n Jie and Wang,\n Li and Zhao,\n Lijun and Zhang,\n Xinyu and Li,\n Ruifeng\n},\n title = {\n CO-Net: Learning Multiple Point Cloud Tasks at Once with A Cohesive Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3523-3533\n} \n}" }, { "title": "CO-PILOT: Dynamic Top-Down Point Cloud with Conditional Neighborhood Aggregation for Multi-Gigapixel Histopathology Image Representation", @@ -7696,7 +7957,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nakhli_CO-PILOT_Dynamic_Top-Down_Point_Cloud_with_Conditional_Neighborhood_Aggregation_for_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Nakhli_CO-PILOT_Dynamic_Top-Down_Point_Cloud_with_Conditional_Neighborhood_Aggregation_for_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Nakhli_2023_ICCV,\n \n author = {\n Nakhli,\n Ramin and Zhang,\n Allen and Mirabadi,\n Ali and Rich,\n Katherine and Asadi,\n Maryam and Gilks,\n Blake and Farahani,\n Hossein and Bashashati,\n Ali\n},\n title = {\n CO-PILOT: Dynamic Top-Down Point Cloud with Conditional Neighborhood Aggregation for Multi-Gigapixel Histopathology Image Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21063-21073\n} \n}" }, { "title": "COCO-O: A Benchmark for Object Detectors under Natural Distribution Shifts", @@ -7728,7 +7990,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Mao_2023_ICCV,\n \n author = {\n Mao,\n Xiaofeng and Chen,\n Yuefeng and Zhu,\n Yao and Chen,\n Da and Su,\n Hang and Zhang,\n Rong and Xue,\n Hui\n},\n title = {\n COCO-O: A Benchmark for Object Detectors under Natural Distribution Shifts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6339-6350\n} \n}" }, { "title": "COMPASS: High-Efficiency Deep Image Compression with Arbitrary-scale Spatial Scalability", @@ -7760,7 +8023,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Jongmin and Lee,\n Jooyoung and Kim,\n Munchurl\n},\n title = {\n COMPASS: High-Efficiency Deep Image Compression with 
Arbitrary-scale Spatial Scalability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12826-12835\n} \n}" }, { "title": "COOL-CHIC: Coordinate-based Low Complexity Hierarchical Image Codec", @@ -7768,8 +8032,8 @@ "status": "Poster", "track": "main", "pid": "8861", - "author_site": "Th\u00e9o Ladune, Pierrick Philippe, F\u00e9lix Henry, Gordon Clare, Thomas Leguay", - "author": "Th\u00e9o Ladune; Pierrick Philippe; F\u00e9lix Henry; Gordon Clare; Thomas Leguay", + "author_site": "Théo Ladune, Pierrick Philippe, Félix Henry, Gordon Clare, Thomas Leguay", + "author": "Théo Ladune; Pierrick Philippe; Félix Henry; Gordon Clare; Thomas Leguay", "abstract": "We introduce COOL-CHIC, a Coordinate-based Low Complexity Hierarchical Image Codec. It is a learned alternative to autoencoders with 629 parameters and 680 multiplications per decoded pixel. COOL-CHIC offers compression performance close to modern conventional MPEG codecs such as HEVC and is competitive with popular autoencoder-based systems. This method is inspired by Coordinate-based Neural Representations, where an image is represented as a learned function which maps pixel coordinates to RGB values. The parameters of the mapping function are then sent using entropy coding. At the receiver side, the compressed image is obtained by evaluating the mapping function for all pixel coordinates. 
COOL-CHIC implementation is made open-source.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ladune_COOL-CHIC_Coordinate-based_Low_Complexity_Hierarchical_Image_Codec_ICCV_2023_paper.pdf", "aff": "Orange Innovation, France; Orange Innovation, France; Orange Innovation, France; Orange Innovation, France; Orange Innovation, France", @@ -7792,7 +8056,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Ladune_2023_ICCV,\n \n author = {\n Ladune,\n Th\\'eo and Philippe,\n Pierrick and Henry,\n F\\'elix and Clare,\n Gordon and Leguay,\n Thomas\n},\n title = {\n COOL-CHIC: Coordinate-based Low Complexity Hierarchical Image Codec\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13515-13522\n} \n}" }, { "title": "COOP: Decoupling and Coupling of Whole-Body Grasping Pose Generation", @@ -7824,7 +8089,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Yanzhao and Shi,\n Yunzhou and Cui,\n Yuhao and Zhao,\n Zhongzhou and Luo,\n Zhiling and Zhou,\n Wei\n},\n title = {\n COOP: Decoupling and Coupling of Whole-Body Grasping Pose Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2163-2173\n} \n}" }, { "title": "COPILOT: Human-Environment Collision Prediction and Localization from Egocentric Videos", @@ -7849,14 +8115,15 @@ "author_num": 7, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Pan_COPILOT_Human-Environment_Collision_Prediction_and_Localization_from_Egocentric_Videos_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1;0;0+1;0+2;0", - "aff_unique_norm": "Stanford University;NVIDIA;University of Hong Kong", + "aff_unique_norm": "Stanford University;NVIDIA Corporation;The University of Hong Kong", "aff_unique_dep": ";NVIDIA Research;", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com/research;https://www.hku.hk", "aff_unique_abbr": "Stanford;NVIDIA;HKU", "aff_campus_unique_index": "0;0;0;0;0;0+2;0", "aff_campus_unique": "Stanford;;Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0;0+0;0+1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Boxiao and Shen,\n Bokui and Rempe,\n Davis and Paschalidou,\n Despoina and Mo,\n Kaichun and Yang,\n Yanchao and Guibas,\n Leonidas J.\n},\n title = {\n COPILOT: Human-Environment Collision Prediction and Localization from Egocentric Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5262-5272\n} \n}" }, { "title": "CORE: Co-planarity Regularized Monocular Geometry Estimation with Weak Supervision", @@ -7868,7 +8135,7 @@ "author": "Yuguang Li; Kai Wang; Hui Li; Seon-Min Rhee; Seungju Han; Jihye Kim; Min Yang; Ran Yang; Feng Zhu", "abstract": "The ill-posed nature of monocular 3D geometry (depth map and surface normals) estimation makes it rely mostly on data-driven approaches such as Deep Neural Networks (DNN). However, data acquisition of surface normals, especially the reliable normals, is acknowledged difficult. Commonly, reconstruction of surface normals with high quality is heuristic and time-consuming. 
Such fact urges methodologies to minimize dependency on ground-truth normals when predicting 3D geometry. In this work, we devise CO-planarity REgularized (CORE) loss functions and Structure-Aware Normal Estimator (SANE). Without involving any knowledge of ground-truth normals, these two designs enable pixel-wise 3D geometry estimation weakly supervised by only ground-truth depth map. For CORE loss functions, the key idea is to exploit locally linear depth-normal orthogonality under spherical coordinates as pixel-level constraints, and utilize our designed Adaptive Polar Regularization (APR) to resolve underlying numerical degeneracies. Meanwhile, SANE easily establishes multi-task learning with CORE loss functions on both depth and surface normal estimation, leading to the whole performance leap. Extensive experiments present the effectiveness of our method on various DNN architectures and data benchmarks. The experimental results demonstrate that our depth estimation achieves the state-of-the-art performance across all metrics on indoor scenes and comparable performance on outdoor scenes. 
In addition, our surface normal estimation is overall superior.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_CORE_Co-planarity_Regularized_Monocular_Geometry_Estimation_with_Weak_Supervision_ICCV_2023_paper.pdf", - "aff": "Samsung R&D Institute China Xi\u2019an (SRCX); Samsung R&D Institute China Xi\u2019an (SRCX); Samsung R&D Institute China Xi\u2019an (SRCX); Samsung Advanced Institute of Technology (SAIT), South Korea; Samsung Advanced Institute of Technology (SAIT), South Korea; Samsung Advanced Institute of Technology (SAIT), South Korea; Samsung R&D Institute China Xi\u2019an (SRCX); Samsung R&D Institute China Xi\u2019an (SRCX); Samsung R&D Institute China Xi\u2019an (SRCX)", + "aff": "Samsung R&D Institute China Xi’an (SRCX); Samsung R&D Institute China Xi’an (SRCX); Samsung R&D Institute China Xi’an (SRCX); Samsung Advanced Institute of Technology (SAIT), South Korea; Samsung Advanced Institute of Technology (SAIT), South Korea; Samsung Advanced Institute of Technology (SAIT), South Korea; Samsung R&D Institute China Xi’an (SRCX); Samsung R&D Institute China Xi’an (SRCX); Samsung R&D Institute China Xi’an (SRCX)", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_CORE_Co-planarity_Regularized_Monocular_Geometry_Estimation_with_Weak_Supervision_ICCV_2023_supplemental.pdf", @@ -7880,15 +8147,16 @@ "email": "samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com;samsung.com", "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_CORE_Co-planarity_Regularized_Monocular_Geometry_Estimation_with_Weak_Supervision_ICCV_2023_paper.html", - "aff_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "Samsung", - "aff_unique_dep": "R&D", - "aff_unique_url": "https://www.samsung.com/cn", - "aff_unique_abbr": "SRCX", + "aff_unique_index": "0;0;0;1;1;1;0;0;0", + "aff_unique_norm": "Samsung R&D Institute China;Samsung Advanced 
Institute of Technology", + "aff_unique_dep": "R&D;", + "aff_unique_url": "https://www.samsung.com/cn;https://www.sait.samsung.com", + "aff_unique_abbr": "SRCX;SAIT", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;1;1;1;0;0;0", - "aff_country_unique": "China;South Korea" + "aff_country_unique": "China;South Korea", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yuguang and Wang,\n Kai and Li,\n Hui and Rhee,\n Seon-Min and Han,\n Seungju and Kim,\n Jihye and Yang,\n Min and Yang,\n Ran and Zhu,\n Feng\n},\n title = {\n CORE: Co-planarity Regularized Monocular Geometry Estimation with Weak Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8796-8805\n} \n}" }, { "title": "CORE: Cooperative Reconstruction for Multi-Agent Perception", @@ -7920,7 +8188,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0+0;0;0;0;0+1", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Binglu and Zhang,\n Lei and Wang,\n Zhaozhong and Zhao,\n Yongqiang and Zhou,\n Tianfei\n},\n title = {\n CORE: Cooperative Reconstruction for Multi-Agent Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8710-8720\n} \n}" }, { "title": "CPCM: Contextual Point Cloud Modeling for Weakly-supervised Point Cloud Semantic Segmentation", @@ -7945,14 +8214,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_CPCM_Contextual_Point_Cloud_Modeling_for_Weakly-supervised_Point_Cloud_Semantic_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0;2;0+1", - "aff_unique_norm": "South China University of 
Technology;Pazhou Lab;Baidu", - "aff_unique_dep": ";;Baidu Inc.", + "aff_unique_norm": "South China University of Technology;Pazhou Lab;Baidu Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.scut.edu.cn;;https://www.baidu.com", "aff_unique_abbr": "SCUT;;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Lizhao and Zhuang,\n Zhuangwei and Huang,\n Shangxin and Xiao,\n Xunlong and Xiang,\n Tianhang and Chen,\n Cen and Wang,\n Jingdong and Tan,\n Mingkui\n},\n title = {\n CPCM: Contextual Point Cloud Modeling for Weakly-supervised Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18413-18422\n} \n}" }, { "title": "CRN: Camera Radar Net for Accurate, Robust, Efficient 3D Perception", @@ -7984,7 +8254,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Youngseok and Shin,\n Juyeb and Kim,\n Sanmin and Lee,\n In-Jae and Choi,\n Jun Won and Kum,\n Dongsuk\n},\n title = {\n CRN: Camera Radar Net for Accurate,\n Robust,\n Efficient 3D Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17615-17626\n} \n}" }, { "title": "CROSSFIRE: Camera Relocalization On Self-Supervised Features from an Implicit Representation", @@ -7996,7 +8267,7 @@ "author": "Arthur Moreau; Nathan Piasco; Moussab Bennehar; Dzmitry Tsishkou; Bogdan Stanciulescu; Arnaud de La Fortelle", "abstract": "Beyond novel view synthesis, Neural 
Radiance Fields are useful for applications that interact with the real world. In this paper, we use them as an implicit map of a given scene and propose a camera relocalization algorithm tailored for this representation. The proposed method enables to compute in real-time the precise position of a device using a single RGB camera, during its navigation. In contrast with previous work, we do not rely on pose regression or photometric alignment but rather use dense local features obtained through volumetric rendering which are specialized on the scene with a self-supervised objective. As a result, our algorithm is more accurate than competitors, able to operate in dynamic outdoor environments with changing lightning conditions and can be readily integrated in any volumetric neural renderer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Moreau_CROSSFIRE_Camera_Relocalization_On_Self-Supervised_Features_from_an_Implicit_Representation_ICCV_2023_paper.pdf", - "aff": "Noah\u2019s Ark IoV team, Huawei France+Mines Paris, PSL University, Centre for robotics; Noah\u2019s Ark IoV team, Huawei France; Noah\u2019s Ark IoV team, Huawei France; Noah\u2019s Ark IoV team, Huawei France; Mines Paris, PSL University, Centre for robotics; Mines Paris, PSL University, Centre for robotics", + "aff": "Noah’s Ark IoV team, Huawei France+Mines Paris, PSL University, Centre for robotics; Noah’s Ark IoV team, Huawei France; Noah’s Ark IoV team, Huawei France; Noah’s Ark IoV team, Huawei France; Mines Paris, PSL University, Centre for robotics; Mines Paris, PSL University, Centre for robotics", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Moreau_CROSSFIRE_Camera_Relocalization_ICCV_2023_supplemental.zip", @@ -8010,13 +8281,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Moreau_CROSSFIRE_Camera_Relocalization_On_Self-Supervised_Features_from_an_Implicit_Representation_ICCV_2023_paper.html", 
"aff_unique_index": "0+1;0;0;0;1;1", "aff_unique_norm": "Huawei;Mines Paris", - "aff_unique_dep": "Noah\u2019s Ark IoV team;Centre for robotics", + "aff_unique_dep": "Noah’s Ark IoV team;Centre for robotics", "aff_unique_url": "https://www.huawei.com;https://www.minesparis.psl.eu", "aff_unique_abbr": "Huawei;Mines Paris", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Moreau_2023_ICCV,\n \n author = {\n Moreau,\n Arthur and Piasco,\n Nathan and Bennehar,\n Moussab and Tsishkou,\n Dzmitry and Stanciulescu,\n Bogdan and de La Fortelle,\n Arnaud\n},\n title = {\n CROSSFIRE: Camera Relocalization On Self-Supervised Features from an Implicit Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 252-262\n} \n}" }, { "title": "CSDA: Learning Category-Scale Joint Feature for Domain Adaptive Object Detection", @@ -8028,7 +8300,7 @@ "author": "Changlong Gao; Chengxu Liu; Yujie Dun; Xueming Qian", "abstract": "Domain Adaptive Object Detection (DAOD) aims to improve the detection performance of target domains by minimizing the feature distribution between the source and target domain. Recent approaches usually align such distributions in terms of categories through adversarial learning and some progress has been made. However, when objects are non-uniformly distributed at different scales, such category-level alignment causes imbalanced object feature learning, refer as the inconsistency of category alignment at different scales. For better category-level feature alignment, we propose a novel DAOD framework of joint category and scale information, dubbed CSDA, such a design enables effective object learning for different scales. 
Specifically, our framework is implemented by two closely-related modules: 1) SGFF (Scale-Guided Feature Fusion) fuses the category representations of different domains to learn category-specific features, where the features are aligned by discriminators at three scales. 2) SAFE (Scale-Auxiliary Feature Enhancement) encodes scale coordinates into a group of tokens and enhances the representation of category-specific features at different scales by self-attention. Based on the anchor-based Faster-RCNN and anchor-free FCOS detectors, experiments show that our method achieves state-of-the-art results on three DAOD benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Gao_CSDA_Learning_Category-Scale_Joint_Feature_for_Domain_Adaptive_Object_Detection_ICCV_2023_paper.pdf", - "aff": "1Xi\u2019an Jiaotong University + 2Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd; 1Xi\u2019an Jiaotong University + 2Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd; 1Xi\u2019an Jiaotong University; 1Xi\u2019an Jiaotong University + 2Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", + "aff": "1Xi’an Jiaotong University + 2Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd; 1Xi’an Jiaotong University + 2Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd; 1Xi’an Jiaotong University; 1Xi’an Jiaotong University + 2Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Gao_CSDA_Learning_Category-Scale_Joint_Feature_for_Domain_Adaptive_Object_Detection_ICCV_2023_supplemental.pdf", @@ -8041,14 +8313,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_CSDA_Learning_Category-Scale_Joint_Feature_for_Domain_Adaptive_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;0+1", - "aff_unique_norm": "Xi'an Jiao Tong 
University;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", + "aff_unique_norm": "Xi'an Jiaotong University;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", "aff_unique_dep": ";", "aff_unique_url": "https://www.xjtu.edu.cn;", "aff_unique_abbr": "XJTU;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Changlong and Liu,\n Chengxu and Dun,\n Yujie and Qian,\n Xueming\n},\n title = {\n CSDA: Learning Category-Scale Joint Feature for Domain Adaptive Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11421-11430\n} \n}" }, { "title": "CTP:Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation", @@ -8056,6 +8329,7 @@ "status": "Poster", "track": "main", "pid": "8664", + "author_site": "Hongguang Zhu, Yunchao Wei, Xiaodan Liang, Chunjie Zhang, Yao Zhao", "author": "Hongguang Zhu, Yunchao Wei, Xiaodan Liang, Chunjie Zhang, Yao Zhao", "abstract": "Vision-Language Pretraining (VLP) has shown impressive results on diverse downstream tasks by offline training on large-scale datasets. Regarding the growing nature of real-world data, such an offline training paradigm on ever-expanding data is unsustainable, because models lack the continual learning ability to accumulate knowledge constantly. However, most continual learning studies are limited to uni-modal classification and existing multi-modal datasets cannot simulate continual non-stationary data stream scenarios. 
To support the study of Vision-Language Continual Pretraining (VLCP), we first contribute a comprehensive and unified benchmark dataset P9D which contains over one million product image-text pairs from 9 industries. The data from each industry as an independent task supports continual learning and conforms to the real-world long-tail nature to simulate pretraining on web data. We comprehensively study the characteristics and challenges of VLCP, and propose a new algorithm: Compatible momentum contrast with Topology Preservation, dubbed CTP. The compatible momentum model absorbs the knowledge of the current and previous-task models to flexibly update the modal feature. Moreover, Topology Preservation transfers the knowledge of embedding across tasks while preserving the flexibility of feature adjustment. The experimental results demonstrate our method not only achieves superior performance compared with other baselines but also does not bring an expensive training burden. Dataset and codes are available at https://github.com/KevinLight831/CTP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhu_CTPTowards_Vision-Language_Continual_Pretraining_via_Compatible_Momentum_Contrast_and_Topology_ICCV_2023_paper.pdf", @@ -8067,7 +8341,8 @@ "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8023255851578988526&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_CTPTowards_Vision-Language_Continual_Pretraining_via_Compatible_Momentum_Contrast_and_Topology_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_CTPTowards_Vision-Language_Continual_Pretraining_via_Compatible_Momentum_Contrast_and_Topology_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Hongguang and Wei,\n Yunchao and Liang,\n Xiaodan and Zhang,\n Chunjie and Zhao,\n Yao\n},\n title = {\n CTP:Towards Vision-Language Continual 
Pretraining via Compatible Momentum Contrast and Topology Preservation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22257-22267\n} \n}" }, { "title": "CTVIS: Consistent Training for Online Video Instance Segmentation", @@ -8092,14 +8367,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ying_CTVIS_Consistent_Training_for_Online_Video_Instance_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;0;3;1;0;1;0", - "aff_unique_norm": "Zhejiang University;University of Adelaide;Northwest A&F University;Swansea University", + "aff_unique_norm": "Zhejiang University;The University of Adelaide;Northwest A&F University;Swansea University", "aff_unique_dep": ";;College of Information Engineering;", "aff_unique_url": "https://www.zju.edu.cn;https://www.adelaide.edu.au;http://www.nwsuaf.edu.cn;https://www.swansea.ac.uk", "aff_unique_abbr": "ZJU;Adelaide;;Swansea U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;2;1;0;1;0", - "aff_country_unique": "China;Australia;United Kingdom" + "aff_country_unique": "China;Australia;United Kingdom", + "bibtex": "@InProceedings{Ying_2023_ICCV,\n \n author = {\n Ying,\n Kaining and Zhong,\n Qing and Mao,\n Weian and Wang,\n Zhenhua and Chen,\n Hao and Wu,\n Lin Yuanbo and Liu,\n Yifan and Fan,\n Chengxiang and Zhuge,\n Yunzhi and Shen,\n Chunhua\n},\n title = {\n CTVIS: Consistent Training for Online Video Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 899-908\n} \n}" }, { "title": "CVRecon: Rethinking 3D Geometric Feature Learning For Neural Reconstruction", @@ -8122,7 +8398,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Feng_CVRecon_Rethinking_3D_Geometric_Feature_Learning_For_Neural_Reconstruction_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Feng_CVRecon_Rethinking_3D_Geometric_Feature_Learning_For_Neural_Reconstruction_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Ziyue and Yang,\n Liang and Guo,\n Pengsheng and Li,\n Bing\n},\n title = {\n CVRecon: Rethinking 3D Geometric Feature Learning For Neural Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17750-17760\n} \n}" }, { "title": "CVSformer: Cross-View Synthesis Transformer for Semantic Scene Completion", @@ -8147,14 +8424,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_CVSformer_Cross-View_Synthesis_Transformer_for_Semantic_Scene_Completion_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;1;2;3;4;5;0", - "aff_unique_norm": "Tianjin University;Shenzhen University;Hong Kong Polytechnic University;Agency for Science, Technology and Research;South China University of Technology;Alibaba Group", + "aff_unique_norm": "Tianjin University;Shenzhen University;The Hong Kong Polytechnic University;Agency for Science, Technology and Research;South China University of Technology;Alibaba Group", "aff_unique_dep": ";;;IHPC and CFAR;Pazhou Lab;Damo Academy", - "aff_unique_url": "http://www.tju.edu.cn;https://www.szu.edu.cn;https://www.polyu.edu.hk;https://www.a-star.edu.sg;https://www.scut.edu.cn;https://www.alibaba-group.com", + "aff_unique_url": "http://www.tju.edu.cn;https://www.szu.edu.cn;https://www.polyu.edu.hk;https://www.a-star.edu.sg;https://www.scut.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "TJU;SZU;PolyU;A*STAR;SCUT;Alibaba", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Pazhou", 
"aff_country_unique_index": "0;0;0;0;0;0;1;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Haotian and Ma,\n Enhui and Wang,\n Lubo and Wang,\n Miaohui and Xie,\n Wuyuan and Guo,\n Qing and Li,\n Ping and Liang,\n Lingyu and Yang,\n Kairui and Lin,\n Di\n},\n title = {\n CVSformer: Cross-View Synthesis Transformer for Semantic Scene Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8874-8883\n} \n}" }, { "title": "CaPhy: Capturing Physical Properties for Animatable Human Avatars", @@ -8166,7 +8444,7 @@ "author": "Zhaoqi Su; Liangxiao Hu; Siyou Lin; Hongwen Zhang; Shengping Zhang; Justus Thies; Yebin Liu", "abstract": "We present CaPhy, a novel method for reconstructing animatable human avatars with realistic dynamic properties for clothing. Specifically, we aim for capturing the geometric and physical properties of the clothing from real observations. This allows us to apply novel poses to the human avatar with physically correct deformations and wrinkles of the clothing. To this end, we combine unsupervised training with physics-based losses and 3D-supervised training using scanned data to reconstruct a dynamic model of clothing that is physically realistic and conforms to the human scans. We also optimize the physical parameters of the underlying physical model from the scans by introducing gradient constraints of the physics-based losses. In contrast to previous work on 3D avatar reconstruction, our method is able to generalize to novel poses with realistic dynamic cloth deformations. 
Experiments on several subjects demonstrate that our method can estimate the physical properties of the garments, resulting in superior quantitative and qualitative results compared with previous methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Su_CaPhy_Capturing_Physical_Properties_for_Animatable_Human_Avatars_ICCV_2023_paper.pdf", - "aff": "Tsinghua University, Beijing, China; Harbin Institute of Technology, Weihai, Shandong, China; Tsinghua University, Beijing, China; Tsinghua University, Beijing, China; Harbin Institute of Technology, Weihai, Shandong, China; Max Planck Institute for Intelligent Systems, T\u00a8ubingen, Germany; Tsinghua University, Beijing, China", + "aff": "Tsinghua University, Beijing, China; Harbin Institute of Technology, Weihai, Shandong, China; Tsinghua University, Beijing, China; Tsinghua University, Beijing, China; Harbin Institute of Technology, Weihai, Shandong, China; Max Planck Institute for Intelligent Systems, T¨ubingen, Germany; Tsinghua University, Beijing, China", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Su_CaPhy_Capturing_Physical_ICCV_2023_supplemental.zip", @@ -8184,9 +8462,10 @@ "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.hit.edu.cn/;https://www.mpi-is.mpg.de", "aff_unique_abbr": "THU;HIT;MPI-IS", "aff_campus_unique_index": "0;1;0;0;1;2;0", - "aff_campus_unique": "Beijing;Weihai;T\u00fcbingen", + "aff_campus_unique": "Beijing;Weihai;Tübingen", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Su_2023_ICCV,\n \n author = {\n Su,\n Zhaoqi and Hu,\n Liangxiao and Lin,\n Siyou and Zhang,\n Hongwen and Zhang,\n Shengping and Thies,\n Justus and Liu,\n Yebin\n},\n title = {\n CaPhy: Capturing Physical Properties for Animatable Human Avatars\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14150-14160\n} \n}" }, { "title": "Calibrating Panoramic Depth Estimation for Practical Localization and Mapping", @@ -8218,7 +8497,8 @@ "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Junho and Lee,\n Eun Sun and Kim,\n Young Min\n},\n title = {\n Calibrating Panoramic Depth Estimation for Practical Localization and Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8830-8840\n} \n}" }, { "title": "Calibrating Uncertainty for Semi-Supervised Crowd Counting", @@ -8241,7 +8521,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/LI_Calibrating_Uncertainty_for_Semi-Supervised_Crowd_Counting_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/LI_Calibrating_Uncertainty_for_Semi-Supervised_Crowd_Counting_ICCV_2023_paper.html", + "bibtex": "@InProceedings{LI_2023_ICCV,\n \n author = {\n LI,\n Chen and Hu,\n Xiaoling and Abousamra,\n Shahira and Chen,\n Chao\n},\n title = {\n Calibrating Uncertainty for Semi-Supervised Crowd Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16731-16741\n} \n}" }, { "title": "Camera-Driven Representation Learning for Unsupervised Domain Adaptive Person Re-identification", @@ -8273,7 +8554,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n 
Lee,\n Geon and Lee,\n Sanghoon and Kim,\n Dohyung and Shin,\n Younghoon and Yoon,\n Yongsang and Ham,\n Bumsub\n},\n title = {\n Camera-Driven Representation Learning for Unsupervised Domain Adaptive Person Re-identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11453-11462\n} \n}" }, { "title": "Can Language Models Learn to Listen?", @@ -8296,7 +8578,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ng_Can_Language_Models_Learn_to_Listen_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ng_Can_Language_Models_Learn_to_Listen_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ng_2023_ICCV,\n \n author = {\n Ng,\n Evonne and Subramanian,\n Sanjay and Klein,\n Dan and Kanazawa,\n Angjoo and Darrell,\n Trevor and Ginosar,\n Shiry\n},\n title = {\n Can Language Models Learn to Listen?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10083-10093\n} \n}" }, { "title": "CancerUniT: Towards a Single Unified Model for Effective Detection, Segmentation, and Diagnosis of Eight Major Cancers Using a Large Collection of CT Scans", @@ -8319,7 +8602,8 @@ "aff_domain": ";;;;;;;;;;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;;;;;;;;;", "author_num": 25, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_CancerUniT_Towards_a_Single_Unified_Model_for_Effective_Detection_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_CancerUniT_Towards_a_Single_Unified_Model_for_Effective_Detection_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Jieneng and Xia,\n Yingda and Yao,\n Jiawen and Yan,\n Ke and Zhang,\n Jianpeng and 
Lu,\n Le and Wang,\n Fakai and Zhou,\n Bo and Qiu,\n Mingyan and Yu,\n Qihang and Yuan,\n Mingze and Fang,\n Wei and Tang,\n Yuxing and Xu,\n Minfeng and Zhou,\n Jian and Zhao,\n Yuqian and Wang,\n Qifeng and Ye,\n Xianghua and Yin,\n Xiaoli and Shi,\n Yu and Chen,\n Xin and Zhou,\n Jingren and Yuille,\n Alan and Liu,\n Zaiyi and Zhang,\n Ling\n},\n title = {\n CancerUniT: Towards a Single Unified Model for Effective Detection,\n Segmentation,\n and Diagnosis of Eight Major Cancers Using a Large Collection of CT Scans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21327-21338\n} \n}" }, { "title": "Candidate-aware Selective Disambiguation Based On Normalized Entropy for Instance-dependent Partial-label Learning", @@ -8351,7 +8635,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Shuo and Yang,\n Guowu and Feng,\n Lei\n},\n title = {\n Candidate-aware Selective Disambiguation Based On Normalized Entropy for Instance-dependent Partial-label Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1792-1801\n} \n}" }, { "title": "Canonical Factors for Hybrid Neural Fields", @@ -8374,7 +8659,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yi_Canonical_Factors_for_Hybrid_Neural_Fields_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yi_Canonical_Factors_for_Hybrid_Neural_Fields_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yi_2023_ICCV,\n \n author = {\n Yi,\n Brent and Zeng,\n Weijia and Buchanan,\n Sam and Ma,\n Yi\n},\n title = 
{\n Canonical Factors for Hybrid Neural Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3414-3426\n} \n}" }, { "title": "Cascade-DETR: Delving into High-Quality Universal Object Detection", @@ -8386,7 +8672,7 @@ "author": "Mingqiao Ye; Lei Ke; Siyuan Li; Yu-Wing Tai; Chi-Keung Tang; Martin Danelljan; Fisher Yu", "abstract": "Object localization in general environments is a fundamental part of vision systems. While dominating on the COCO benchmark, recent Transformer-based detection methods are not competitive in diverse domains. Moreover, these methods still struggle to very accurately estimate the object bounding boxes in complex environments. \n \n We introduce Cascade-DETR for high-quality universal object detection. We jointly tackle the generalization to diverse domains and localization accuracy by proposing the Cascade Attention layer, which explicitly integrates object-centric information into the detection decoder by limiting the attention to the previous box prediction. To further enhance accuracy, we also revisit the scoring of queries. Instead of relying on classification scores, we predict the expected IoU of the query, leading to substantially more well-calibrated confidences. Lastly, we introduce a universal object detection benchmark, UDB10, that contains 10 datasets from diverse domains. While also advancing the state-of-the-art on COCO, Cascade-DETR substantially improves DETR-based detectors on all datasets in UDB10, even by over 10 mAP in some cases. The improvements under stringent quality requirements are even more pronounced. 
Our code and pretrained models are at https://github.com/SysCV/cascade-detr.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ye_Cascade-DETR_Delving_into_High-Quality_Universal_Object_Detection_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich + HKUST; ETH Z\u00fcrich; Dartmouth College; HKUST; ETH Z\u00fcrich; ETH Z\u00fcrich", + "aff": "ETH Zürich; ETH Zürich + HKUST; ETH Zürich; Dartmouth College; HKUST; ETH Zürich; ETH Zürich", "project": "", "github": "https://github.com/SysCV/cascade-detr", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Ye_Cascade-DETR_Delving_into_ICCV_2023_supplemental.pdf", @@ -8399,14 +8685,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Cascade-DETR_Delving_into_High-Quality_Universal_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;2;1;0;0", - "aff_unique_norm": "ETH Zurich;Hong Kong University of Science and Technology;Dartmouth College", + "aff_unique_norm": "ETH Zürich;Hong Kong University of Science and Technology;Dartmouth College", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.ust.hk;https://www.dartmouth.edu", "aff_unique_abbr": "ETHZ;HKUST;Dartmouth", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+1;0;2;1;0;0", - "aff_country_unique": "Switzerland;China;United States" + "aff_country_unique": "Switzerland;China;United States", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Mingqiao and Ke,\n Lei and Li,\n Siyuan and Tai,\n Yu-Wing and Tang,\n Chi-Keung and Danelljan,\n Martin and Yu,\n Fisher\n},\n title = {\n Cascade-DETR: Delving into High-Quality Universal Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6704-6714\n} \n}" }, { "title": "Category-aware Allocation Transformer for 
Weakly Supervised Object Localization", @@ -8438,7 +8725,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zhiwei and Ding,\n Jinren and Cao,\n Liujuan and Shen,\n Yunhang and Zhang,\n Shengchuan and Jiang,\n Guannan and Ji,\n Rongrong\n},\n title = {\n Category-aware Allocation Transformer for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6643-6652\n} \n}" }, { "title": "CauSSL: Causality-inspired Semi-supervised Learning for Medical Image Segmentation", @@ -8463,14 +8751,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Miao_CauSSL_Causality-inspired_Semi-supervised_Learning_for_Medical_Image_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0;0", - "aff_unique_norm": "Chinese University of Hong Kong;Harvard Medical School;Zhejiang Lab", + "aff_unique_norm": "The Chinese University of Hong Kong;Harvard Medical School;Zhejiang Lab", "aff_unique_dep": "Department of Computer Science and Engineering;Center for Advanced Medical Computing and Analysis;", "aff_unique_url": "https://www.cuhk.edu.hk;https://hms.harvard.edu;http://www.zhejianglab.com", "aff_unique_abbr": "CUHK;HMS;", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Hong Kong SAR;Boston;", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Miao_2023_ICCV,\n \n author = {\n Miao,\n Juzheng and Chen,\n Cheng and Liu,\n Furui and Wei,\n Hao and Heng,\n Pheng-Ann\n},\n title = {\n CauSSL: Causality-inspired Semi-supervised Learning for Medical Image Segmentation\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21426-21437\n} \n}" }, { "title": "Causal-DFQ: Causality Guided Data-Free Network Quantization", @@ -8493,7 +8782,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shang_Causal-DFQ_Causality_Guided_Data-Free_Network_Quantization_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shang_Causal-DFQ_Causality_Guided_Data-Free_Network_Quantization_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Shang_2023_ICCV,\n \n author = {\n Shang,\n Yuzhang and Xu,\n Bingxin and Liu,\n Gaowen and Kompella,\n Ramana Rao and Yan,\n Yan\n},\n title = {\n Causal-DFQ: Causality Guided Data-Free Network Quantization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17437-17446\n} \n}" }, { "title": "Center-Based Decoupled Point-cloud Registration for 6D Object Pose Estimation", @@ -8518,14 +8808,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jiang_Center-Based_Decoupled_Point-cloud_Registration_for_6D_Object_Pose_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;1;0", - "aff_unique_norm": "Nanjing University of Science and Technology;EPFL", + "aff_unique_norm": "Nanjing University of Science and Technology;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": "PCA Lab;CVLab", "aff_unique_url": ";https://cvlab.epfl.ch", "aff_unique_abbr": ";EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Haobo and Dang,\n Zheng and Gu,\n Shuo and Xie,\n Jin and Salzmann,\n Mathieu and Yang,\n 
Jian\n},\n title = {\n Center-Based Decoupled Point-cloud Registration for 6D Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3427-3437\n} \n}" }, { "title": "Chaotic World: A Large and Challenging Benchmark for Human Behavior Understanding in Chaotic Events", @@ -8557,7 +8848,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Ong_2023_ICCV,\n \n author = {\n Ong,\n Kian Eng and Ng,\n Xun Long and Li,\n Yanchao and Ai,\n Wenjie and Zhao,\n Kuangyi and Yeo,\n Si Yong and Liu,\n Jun\n},\n title = {\n Chaotic World: A Large and Challenging Benchmark for Human Behavior Understanding in Chaotic Events\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20213-20223\n} \n}" }, { "title": "ChartReader: A Unified Framework for Chart Derendering and Comprehension without Heuristic Rules", @@ -8582,14 +8874,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cheng_ChartReader_A_Unified_Framework_for_Chart_Derendering_and_Comprehension_without_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Carnegie Mellon University;Microsoft", + "aff_unique_norm": "Carnegie Mellon University;Microsoft Corporation", "aff_unique_dep": "Language Technologies Institute;Microsoft Research", "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CMU;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n 
author = {\n Cheng,\n Zhi-Qi and Dai,\n Qi and Hauptmann,\n Alexander G.\n},\n title = {\n ChartReader: A Unified Framework for Chart Derendering and Comprehension without Heuristic Rules\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22202-22213\n} \n}" }, { "title": "Chasing Clouds: Differentiable Volumetric Rasterisation of Point Clouds as a Highly Efficient and Accurate Loss for Large-Scale Deformable 3D Registration", @@ -8597,11 +8890,11 @@ "status": "Oral", "track": "main", "pid": "10213", - "author_site": "Mattias P. Heinrich, Alexander Bigalke, Christoph Gro\u00dfbr\u00f6hmer, Lasse Hansen", - "author": "Mattias P. Heinrich; Alexander Bigalke; Christoph Gro\u00dfbr\u00f6hmer; Lasse Hansen", + "author_site": "Mattias P. Heinrich, Alexander Bigalke, Christoph Großbröhmer, Lasse Hansen", + "author": "Mattias P. Heinrich; Alexander Bigalke; Christoph Großbröhmer; Lasse Hansen", "abstract": "Learning-based registration for large-scale 3D point clouds has been shown to improve robustness and accuracy compared to classical methods and can be trained without supervision for locally rigid problems. However, for tasks with highly deformable structures, such as alignment of pulmonary vascular trees for medical diagnostics, previous approaches of self-supervision with regularisation and point distance losses have failed to succeed, leading to the need for complex synthetic augmentation strategies to obtain reliably strong supervision. In this work, we introduce a novel Differentiable Volumetric Rasterisation of point Clouds (DiVRoC) that overcomes those limitations and offers a highly efficient and accurate loss for large-scale deformable 3D registration. 
DiVRoC drastically reduces the computational complexity for measuring point cloud distances for high-resolution data with over 100k 3D points and can also be employed to extrapolate and regularise sparse motion fields, as loss in a self-training setting and as objective function in instance optimisation. DiVRoC can be successfully embedded into geometric registration networks, including PointPWC-Net and other graph CNNs. Our approach yields new state-of-the-art accuracy on the challenging PVT dataset in three different settings without training with manual ground truth: 1) unsupervised metric-based learning 2) self-supervised learning with pseudo labels generated by self-training and 3) optimisation based alignment without learning. https://github.com/mattiaspaul/ChasingClouds", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Heinrich_Chasing_Clouds_Differentiable_Volumetric_Rasterisation_of_Point_Clouds_as_a_ICCV_2023_paper.pdf", - "aff": "Institute of Medical Informatics, University of L\u00fcbeck; Institute of Medical Informatics, University of L\u00fcbeck; Institute of Medical Informatics, University of L\u00fcbeck; EchoScout GmbH Germany", + "aff": "Institute of Medical Informatics, University of Lübeck; Institute of Medical Informatics, University of Lübeck; Institute of Medical Informatics, University of Lübeck; EchoScout GmbH Germany", "project": "", "github": "https://github.com/mattiaspaul/ChasingClouds", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Heinrich_Chasing_Clouds_Differentiable_ICCV_2023_supplemental.pdf", @@ -8614,14 +8907,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Heinrich_Chasing_Clouds_Differentiable_Volumetric_Rasterisation_of_Point_Clouds_as_a_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "University of L\u00fcbeck;EchoScout GmbH", + "aff_unique_norm": "University of Lübeck;EchoScout GmbH", "aff_unique_dep": "Institute of Medical 
Informatics;", "aff_unique_url": "https://www.uni-luebeck.de;", "aff_unique_abbr": ";", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Heinrich_2023_ICCV,\n \n author = {\n Heinrich,\n Mattias P. and Bigalke,\n Alexander and Gro{\\ss\n}br\\\"ohmer,\n Christoph and Hansen,\n Lasse\n},\n title = {\n Chasing Clouds: Differentiable Volumetric Rasterisation of Point Clouds as a Highly Efficient and Accurate Loss for Large-Scale Deformable 3D Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8026-8036\n} \n}" }, { "title": "CheckerPose: Progressive Dense Keypoint Localization for Object Pose Estimation with Graph Neural Network", @@ -8653,7 +8947,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lian_2023_ICCV,\n \n author = {\n Lian,\n Ruyi and Ling,\n Haibin\n},\n title = {\n CheckerPose: Progressive Dense Keypoint Localization for Object Pose Estimation with Graph Neural Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14022-14033\n} \n}" }, { "title": "ChildPlay: A New Benchmark for Understanding Children's Gaze Behaviour", @@ -8685,7 +8980,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Martigny", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Tafasca_2023_ICCV,\n \n author = {\n Tafasca,\n Samy and Gupta,\n Anshul and Odobez,\n Jean-Marc\n},\n title = {\n ChildPlay: A New Benchmark for Understanding 
Children's Gaze Behaviour\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20935-20946\n} \n}" }, { "title": "Chinese Text Recognition with A Pre-Trained CLIP-Like Model Through Image-IDS Aligning", @@ -8717,7 +9013,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Haiyang and Wang,\n Xiaocong and Li,\n Bin and Xue,\n Xiangyang\n},\n title = {\n Chinese Text Recognition with A Pre-Trained CLIP-Like Model Through Image-IDS Aligning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11943-11952\n} \n}" }, { "title": "Chop & Learn: Recognizing and Generating Object-State Compositions", @@ -8740,7 +9037,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Saini_Chop__Learn_Recognizing_and_Generating_Object-State_Compositions_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Saini_Chop__Learn_Recognizing_and_Generating_Object-State_Compositions_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Saini_2023_ICCV,\n \n author = {\n Saini,\n Nirat and Wang,\n Hanyu and Swaminathan,\n Archana and Jayasundara,\n Vinoj and He,\n Bo and Gupta,\n Kamal and Shrivastava,\n Abhinav\n},\n title = {\n Chop \\& Learn: Recognizing and Generating Object-State Compositions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20247-20258\n} \n}" }, { "title": "Chordal Averaging on Flag Manifolds and Its Applications", @@ -8772,7 +9070,8 @@ "aff_campus_unique_index": 
"", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Mankovich_2023_ICCV,\n \n author = {\n Mankovich,\n Nathan and Birdal,\n Tolga\n},\n title = {\n Chordal Averaging on Flag Manifolds and Its Applications\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3881-3890\n} \n}" }, { "title": "Chupa: Carving 3D Clothed Humans from Skinned Shape Priors using 2D Diffusion Probabilistic Models", @@ -8795,7 +9094,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Chupa_Carving_3D_Clothed_Humans_from_Skinned_Shape_Priors_using_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Chupa_Carving_3D_Clothed_Humans_from_Skinned_Shape_Priors_using_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Byungjun and Kwon,\n Patrick and Lee,\n Kwangho and Lee,\n Myunggi and Han,\n Sookwan and Kim,\n Daesik and Joo,\n Hanbyul\n},\n title = {\n Chupa: Carving 3D Clothed Humans from Skinned Shape Priors using 2D Diffusion Probabilistic Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15965-15976\n} \n}" }, { "title": "CiT: Curation in Training for Effective Vision-Language Data", @@ -8818,7 +9118,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_CiT_Curation_in_Training_for_Effective_Vision-Language_Data_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_CiT_Curation_in_Training_for_Effective_Vision-Language_Data_ICCV_2023_paper.html", + 
"bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Hu and Xie,\n Saining and Huang,\n Po-Yao and Yu,\n Licheng and Howes,\n Russell and Ghosh,\n Gargi and Zettlemoyer,\n Luke and Feichtenhofer,\n Christoph\n},\n title = {\n CiT: Curation in Training for Effective Vision-Language Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15180-15189\n} \n}" }, { "title": "CiteTracker: Correlating Image and Text for Visual Tracking", @@ -8843,14 +9144,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_CiteTracker_Correlating_Image_and_Text_for_Visual_Tracking_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;1;0+1;2;3+4", - "aff_unique_norm": "Pengcheng Laboratory;Harbin Institute of Technology;Dalian University of Technology;University of California, Merced;Yonsei University", - "aff_unique_dep": "Peng Cheng Laboratory;;;;", + "aff_unique_norm": "Peng Cheng Laboratory;Harbin Institute of Technology;Dalian University of Technology;University of California, Merced;Yonsei University", + "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.pcl.ac.cn;http://en.hhit.edu.cn/;http://www.dlut.edu.cn/;https://www.ucmerced.edu;https://www.yonsei.ac.kr", "aff_unique_abbr": "PCL;HIT;DUT;UCM;Yonsei", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Shenzhen;Merced", "aff_country_unique_index": "0;0+0;0;0+0;0;1+2", - "aff_country_unique": "China;United States;South Korea" + "aff_country_unique": "China;United States;South Korea", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xin and Huang,\n Yuqing and He,\n Zhenyu and Wang,\n Yaowei and Lu,\n Huchuan and Yang,\n Ming-Hsuan\n},\n title = {\n CiteTracker: Correlating Image and Text for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2023\n},\n pages = {\n 9974-9983\n} \n}" }, { "title": "Class Prior-Free Positive-Unlabeled Learning with Taylor Variational Loss for Hyperspectral Remote Sensing Imagery", @@ -8882,7 +9184,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Hengwei and Wang,\n Xinyu and Li,\n Jingtao and Zhong,\n Yanfei\n},\n title = {\n Class Prior-Free Positive-Unlabeled Learning with Taylor Variational Loss for Hyperspectral Remote Sensing Imagery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16827-16836\n} \n}" }, { "title": "Class-Aware Patch Embedding Adaptation for Few-Shot Image Classification", @@ -8907,14 +9210,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hao_Class-Aware_Patch_Embedding_Adaptation_for_Few-Shot_Image_Classification_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;0+1;3;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;Chinese University of Hong Kong;University of Edinburgh;University of Sydney", + "aff_unique_norm": "Chinese Academy of Sciences;The Chinese University of Hong Kong;University of Edinburgh;The University of Sydney", "aff_unique_dep": "Guangdong-Hong Kong-Macao Joint Laboratory of Human-Machine Intelligence-Synergy Systems;;School of Informatics;School of Computer Science", "aff_unique_url": "http://www.cas.cn;https://www.cuhk.edu.hk;https://www.ed.ac.uk;https://www.sydney.edu.au", "aff_unique_abbr": "CAS;CUHK;Edinburgh;USYD", "aff_campus_unique_index": "0+1;2;0+1;0+1", "aff_campus_unique": "Shenzhen;Hong Kong;Edinburgh;", "aff_country_unique_index": "0+0;1;2;0+0;2;0+0", - "aff_country_unique": "China;United Kingdom;Australia" + "aff_country_unique": "China;United Kingdom;Australia", + "bibtex": 
"@InProceedings{Hao_2023_ICCV,\n \n author = {\n Hao,\n Fusheng and He,\n Fengxiang and Liu,\n Liu and Wu,\n Fuxiang and Tao,\n Dacheng and Cheng,\n Jun\n},\n title = {\n Class-Aware Patch Embedding Adaptation for Few-Shot Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18905-18915\n} \n}" }, { "title": "Class-Incremental Grouping Network for Continual Audio-Visual Learning", @@ -8926,7 +9230,7 @@ "author": "Shentong Mo; Weiguo Pian; Yapeng Tian", "abstract": "Continual learning is a challenging problem in which models need to be trained on non-stationary data across sequential tasks for class-incremental learning. While previous methods have focused on using either regularization or rehearsal-based frameworks to alleviate catastrophic forgetting in image classification, they are limited to a single modality and cannot learn compact class-aware cross-modal representations for continual audio-visual learning. To address this gap, we propose a novel class-incremental grouping network (CIGN) that can learn category-wise semantic features to achieve continual audio-visual learning. Our CIGN leverages learnable audio-visual class tokens and audio-visual grouping to continually aggregate class-aware features. Additionally, it utilizes class tokens distillation and continual grouping to prevent forgetting parameters learned from previous tasks, thereby improving the model's ability to capture discriminative audio-visual categories. We conduct extensive experiments on VGGSound-Instruments, VGGSound-100, and VGG-Sound Sources benchmarks. 
Our experimental results demonstrate that the CIGN achieves state-of-the-art audio-visual class-incremental learning performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Mo_Class-Incremental_Grouping_Network_for_Continual_Audio-Visual_Learning_ICCV_2023_paper.pdf", - "aff": "Carnegie Mellon University\u2020; University of Texas at Dallas\u2020; University of Texas at Dallas*", + "aff": "Carnegie Mellon University†; University of Texas at Dallas†; University of Texas at Dallas*", "project": "", "github": "https://github.com/stoneMo/CIGN", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Mo_Class-Incremental_Grouping_Network_ICCV_2023_supplemental.pdf", @@ -8946,7 +9250,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Dallas", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mo_2023_ICCV,\n \n author = {\n Mo,\n Shentong and Pian,\n Weiguo and Tian,\n Yapeng\n},\n title = {\n Class-Incremental Grouping Network for Continual Audio-Visual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7788-7798\n} \n}" }, { "title": "Class-incremental Continual Learning for Instance Segmentation with Image-level Weak Supervision", @@ -8978,7 +9283,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hsieh_2023_ICCV,\n \n author = {\n Hsieh,\n Yu-Hsing and Chen,\n Guan-Sheng and Cai,\n Shun-Xian and Wei,\n Ting-Yun and Yang,\n Huei-Fang and Chen,\n Chu-Song\n},\n title = {\n Class-incremental Continual Learning for Instance Segmentation with Image-level Weak Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1250-1261\n} \n}" }, { "title": "Class-relation Knowledge Distillation for Novel Class Discovery", @@ -9010,7 +9316,8 @@ "aff_campus_unique_index": "0;0+0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gu_2023_ICCV,\n \n author = {\n Gu,\n Peiyan and Zhang,\n Chuyu and Xu,\n Ruijie and He,\n Xuming\n},\n title = {\n Class-relation Knowledge Distillation for Novel Class Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16474-16483\n} \n}" }, { "title": "CleanCLIP: Mitigating Data Poisoning Attacks in Multimodal Contrastive Learning", @@ -9022,7 +9329,7 @@ "author": "Hritik Bansal; Nishad Singhi; Yu Yang; Fan Yin; Aditya Grover; Kai-Wei Chang", "abstract": "Multimodal contrastive pretraining has been used to train multimodal representation models, such as CLIP, on large amounts of paired image-text data. However, previous studies have revealed that such models are vulnerable to backdoor attacks. Specifically, when trained on backdoored examples, CLIP learns spurious correlations between the embedded backdoor trigger and the target label, aligning their representations in the joint embedding space. Injecting even a small number of poisoned examples, such as 75 examples in 3 million pretraining data, can significantly manipulate the model's behavior, making it difficult to detect or unlearn such correlations. To address this issue, we propose CleanCLIP, a finetuning framework that weakens the learned spurious associations introduced by backdoor attacks by independently re-aligning the representations for individual modalities. 
We demonstrate that unsupervised finetuning using a combination of multimodal contrastive and unimodal self-supervised objectives for individual modalities can significantly reduce the impact of the backdoor attack. Additionally, we show that supervised finetuning on task-specific labeled image data removes the backdoor trigger from the CLIP vision encoder. We show empirically that CleanCLIP maintains model performance on benign examples while erasing a range of backdoor attacks on multimodal contrastive learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bansal_CleanCLIP_Mitigating_Data_Poisoning_Attacks_in_Multimodal_Contrastive_Learning_ICCV_2023_paper.pdf", - "aff": "UCLA; University of T\u00a8ubingen; UCLA; UCLA; UCLA; UCLA", + "aff": "UCLA; University of Tübingen; UCLA; UCLA; UCLA; UCLA", "project": "", "github": "https://github.com/nishadsinghi/CleanCLIP", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Bansal_CleanCLIP_Mitigating_Data_ICCV_2023_supplemental.pdf", @@ -9035,14 +9342,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bansal_CleanCLIP_Mitigating_Data_Poisoning_Attacks_in_Multimodal_Contrastive_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0", - "aff_unique_norm": "University of California, Los Angeles;University of T\u00fcbingen", + "aff_unique_norm": "University of California, Los Angeles;University of Tübingen", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucla.edu;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "UCLA;Uni T\u00fcbingen", + "aff_unique_abbr": "UCLA;Uni Tübingen", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Bansal_2023_ICCV,\n \n author = {\n Bansal,\n Hritik and Singhi,\n Nishad and Yang,\n Yu and Yin,\n Fan and Grover,\n 
Aditya and Chang,\n Kai-Wei\n},\n title = {\n CleanCLIP: Mitigating Data Poisoning Attacks in Multimodal Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 112-123\n} \n}" }, { "title": "ClimateNeRF: Extreme Weather Synthesis in Neural Radiance Field", @@ -9067,14 +9375,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_ClimateNeRF_Extreme_Weather_Synthesis_in_Neural_Radiance_Field_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;2;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Zhejiang University;University of Maryland", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Zhejiang University;University of Maryland", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.zju.edu.cn;https://www/umd.edu", "aff_unique_abbr": "UIUC;ZJU;UMD", "aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "Urbana-Champaign;;College Park", "aff_country_unique_index": "0+1;0;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yuan and Lin,\n Zhi-Hao and Forsyth,\n David and Huang,\n Jia-Bin and Wang,\n Shenlong\n},\n title = {\n ClimateNeRF: Extreme Weather Synthesis in Neural Radiance Field\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3227-3238\n} \n}" }, { "title": "Cloth2Body: Generating 3D Human Body Mesh from 2D Clothing", @@ -9099,14 +9408,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dai_Cloth2Body_Generating_3D_Human_Body_Mesh_from_2D_Clothing_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;2;0;3;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;ZMO AI 
Inc.;Technical University of Munich;Nanyang Technological University", + "aff_unique_norm": "The Hong Kong University of Science and Technology;ZMO AI Inc.;Technical University of Munich;Nanyang Technological University", "aff_unique_dep": ";;;S-Lab", "aff_unique_url": "https://www.ust.hk;;https://www.tum.de;https://www.ntu.edu.sg", "aff_unique_abbr": "HKUST;;TUM;NTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Guangzhou;", "aff_country_unique_index": "0+1;1;2;0;3;0", - "aff_country_unique": "China;United States;Germany;Singapore" + "aff_country_unique": "China;United States;Germany;Singapore", + "bibtex": "@InProceedings{Dai_2023_ICCV,\n \n author = {\n Dai,\n Lu and Ma,\n Liqian and Qian,\n Shenhan and Liu,\n Hao and Liu,\n Ziwei and Xiong,\n Hui\n},\n title = {\n Cloth2Body: Generating 3D Human Body Mesh from 2D Clothing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15007-15017\n} \n}" }, { "title": "ClothPose: A Real-world Benchmark for Visual Analysis of Garment Pose via An Indirect Recording Solution", @@ -9138,7 +9448,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;1;0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Wenqiang and Du,\n Wenxin and Xue,\n Han and Li,\n Yutong and Ye,\n Ruolin and Wang,\n Yan-Feng and Lu,\n Cewu\n},\n title = {\n ClothPose: A Real-world Benchmark for Visual Analysis of Garment Pose via An Indirect Recording Solution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 58-68\n} \n}" }, { "title": "ClothesNet: An Information-Rich 3D Garment Model Repository with Simulated Clothes Environment", @@ -9150,7 +9461,7 @@ 
"author": "Bingyang Zhou; Haoyu Zhou; Tianhai Liang; Qiaojun Yu; Siheng Zhao; Yuwei Zeng; Jun Lv; Siyuan Luo; Qiancai Wang; Xinyuan Yu; Haonan Chen; Cewu Lu; Lin Shao", "abstract": "We present ClothesNet: a large-scale dataset of 3D clothes objects with information-rich annotations. Our dataset consists of around 4000 models covering 11 categories annotated with clothes features, boundary lines, and keypoints. ClothesNet can be used to facilitate a variety of computer vision and robot interaction tasks. Using our dataset, we establish benchmark tasks for clothes perception, including classification, boundary line segmentation, and keypoint detection, and develop simulated clothes environments for robotic interaction tasks, including rearranging, folding, hanging, and dressing. \n We also demonstrate the efficacy of our ClothesNet in real-world experiments.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhou_ClothesNet_An_Information-Rich_3D_Garment_Model_Repository_with_Simulated_Clothes_ICCV_2023_paper.pdf", - "aff": "Harbin Institute of Technology, Shenzhen; Beihang University; Harbin Institute of Technology, Shenzhen; Shanghai Jiao Tong University; Nanjing University; National University of Singapore; Shanghai Jiao Tong University; Xi\u2019an Jiaotong University; Harbin Institute of Technology, Shenzhen; National University of Singapore; Nanjing University; Shanghai Jiao Tong University; National University of Singapore", + "aff": "Harbin Institute of Technology, Shenzhen; Beihang University; Harbin Institute of Technology, Shenzhen; Shanghai Jiao Tong University; Nanjing University; National University of Singapore; Shanghai Jiao Tong University; Xi’an Jiaotong University; Harbin Institute of Technology, Shenzhen; National University of Singapore; Nanjing University; Shanghai Jiao Tong University; National University of Singapore", "project": "https://sites.google.com/view/clothesnet", "github": "", "supp": "", @@ -9163,14 +9474,15 @@ 
"author_num": 13, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_ClothesNet_An_Information-Rich_3D_Garment_Model_Repository_with_Simulated_Clothes_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;3;4;2;5;0;4;3;2;4", - "aff_unique_norm": "Harbin Institute of Technology;Beihang University;Shanghai Jiao Tong University;Nanjing University;National University of Singapore;Xi'an Jiao Tong University", + "aff_unique_norm": "Harbin Institute of Technology;Beihang University;Shanghai Jiao Tong University;Nanjing University;National University of Singapore;Xi'an Jiaotong University", "aff_unique_dep": ";;;;;", "aff_unique_url": "http://en.hhit.edu.cn/;http://www.buaa.edu.cn/;https://www.sjtu.edu.cn;https://www.nju.edu.cn;https://www.nus.edu.sg;https://www.xjtu.edu.cn", "aff_unique_abbr": "HIT;BUAA;SJTU;Nanjing U;NUS;XJTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;1;0;0;0;1;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Bingyang and Zhou,\n Haoyu and Liang,\n Tianhai and Yu,\n Qiaojun and Zhao,\n Siheng and Zeng,\n Yuwei and Lv,\n Jun and Luo,\n Siyuan and Wang,\n Qiancai and Yu,\n Xinyuan and Chen,\n Haonan and Lu,\n Cewu and Shao,\n Lin\n},\n title = {\n ClothesNet: An Information-Rich 3D Garment Model Repository with Simulated Clothes Environment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20428-20438\n} \n}" }, { "title": "ClusT3: Information Invariant Test-Time Training", @@ -9182,7 +9494,7 @@ "author": "Gustavo A. Vargas Hakim; David Osowiechi; Mehrdad Noori; Milad Cheraghalikhani; Ali Bahri; Ismail Ben Ayed; Christian Desrosiers", "abstract": "Deep Learning models have shown remarkable performance in a broad range of vision tasks. 
However, they are often vulnerable against domain shifts at test-time. Test-time training (TTT) methods have been developed in an attempt to mitigate these vulnerabilities, where a secondary task is solved at training time simultaneously with the main task, to be later used as an self-supervised proxy task at test-time. In this work, we propose a novel unsupervised TTT technique based on the maximization of Mutual Information between multi-scale feature maps and a discrete latent representation, which can be integrated to the standard training as an auxiliary clustering task. Experimental results demonstrate competitive classification performance on different popular test-time adaptation benchmarks. The code can be found at: https://github.com/dosowiechi/ClusT3.git", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hakim_ClusT3_Information_Invariant_Test-Time_Training_ICCV_2023_paper.pdf", - "aff": "\u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada; \u00b4ETS Montreal, Canada", + "aff": "ÉTS Montreal, Canada; ÉTS Montreal, Canada; ÉTS Montreal, Canada; ÉTS Montreal, Canada; ÉTS Montreal, Canada; ÉTS Montreal, Canada; ÉTS Montreal, Canada", "project": "", "github": "https://github.com/dosowiechi/ClusT3.git", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Hakim_ClusT3_Information_Invariant_ICCV_2023_supplemental.pdf", @@ -9195,14 +9507,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hakim_ClusT3_Information_Invariant_Test-Time_Training_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", - "aff_unique_norm": "\u00c9cole de technologie sup\u00e9rieure", + "aff_unique_norm": "École de technologie supérieure", "aff_unique_dep": "", "aff_unique_url": "https://www.etsmtl.ca", "aff_unique_abbr": "ETS", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": 
"Montreal", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Hakim_2023_ICCV,\n \n author = {\n Hakim,\n Gustavo A. Vargas and Osowiechi,\n David and Noori,\n Mehrdad and Cheraghalikhani,\n Milad and Bahri,\n Ali and Ben Ayed,\n Ismail and Desrosiers,\n Christian\n},\n title = {\n ClusT3: Information Invariant Test-Time Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6136-6145\n} \n}" }, { "title": "Clusterformer: Cluster-based Transformer for 3D Object Detection in Point Clouds", @@ -9227,14 +9540,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pei_Clusterformer_Cluster-based_Transformer_for_3D_Object_Detection_in_Point_Clouds_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Hikvision", + "aff_unique_norm": "HikVision", "aff_unique_dep": "Research Institute", "aff_unique_url": "https://www.hikvision.com/cn/", "aff_unique_abbr": "HikVision", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pei_2023_ICCV,\n \n author = {\n Pei,\n Yu and Zhao,\n Xian and Li,\n Hao and Ma,\n Jingyuan and Zhang,\n Jingwei and Pu,\n Shiliang\n},\n title = {\n Clusterformer: Cluster-based Transformer for 3D Object Detection in Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6664-6673\n} \n}" }, { "title": "Clustering based Point Cloud Representation Learning for 3D Analysis", @@ -9246,7 +9560,7 @@ "author": "Tuo Feng; Wenguan Wang; Xiaohan Wang; Yi Yang; Qinghua Zheng", "abstract": "Point cloud analysis (such as 3D segmentation and 
detection) is a challenging task, because of not only the irregular geometries of many millions of unordered points, but also the great variations caused by depth, viewpoint, occlusion, etc. Current studies put much focus on the adaption of neural networks to the complex geometries of point clouds, but are blind to a fundamental question: how to learn an appropriate point embedding space that is aware of both discriminative semantics and challenging variations? As a response, we propose a clustering based supervised learning scheme for point cloud analysis. Unlike current de-facto, scene-wise training paradigm, our algorithm conducts within-class clustering on the point embedding space for automatically discovering subclass patterns which are latent yet representative across scenes. The mined patterns are, in turn, used to repaint the embedding space, so as to respect the underlying distribution of the entire training dataset and improve the robustness to the variations. Our algorithm is principled and readily pluggable to modern point cloud segmentation networks during training, without extra overhead during testing. With various 3D network architectures (i.e., voxel-based, point-based, Transformer-based, automatically searched), our algorithm shows notable improvements on famous point cloud segmentation datasets (i.e., 2.0-2.6% on single-scan and 2.0-2.2% multi-scan of SemanticKITTI, 1.8-1.9% on S3DIS, in terms of mIoU). Our algorithm also demonstrates utility in 3D detection, showing 2.0-3.4% mAP gains on KITTI. 
Our code is released at: https://github.com/FengZicai/Cluster3Dseg/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Feng_Clustering_based_Point_Cloud_Representation_Learning_for_3D_Analysis_ICCV_2023_paper.pdf", - "aff": "ReLER, AAII, University of Technology Sydney; ReLER, CCAI, Zhejiang University; ReLER, CCAI, Zhejiang University; ReLER, CCAI, Zhejiang University; Xi\u2019an Jiaotong University", + "aff": "ReLER, AAII, University of Technology Sydney; ReLER, CCAI, Zhejiang University; ReLER, CCAI, Zhejiang University; ReLER, CCAI, Zhejiang University; Xi’an Jiaotong University", "project": "", "github": "https://github.com/FengZicai/Cluster3Dseg/", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Feng_Clustering_based_Point_ICCV_2023_supplemental.pdf", @@ -9259,14 +9573,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Feng_Clustering_based_Point_Cloud_Representation_Learning_for_3D_Analysis_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;2", - "aff_unique_norm": "University of Technology Sydney;Zhejiang University;Xi'an Jiao Tong University", + "aff_unique_norm": "University of Technology Sydney;Zhejiang University;Xi'an Jiaotong University", "aff_unique_dep": ";ReLER, CCAI;", "aff_unique_url": "https://www.uts.edu.au;http://www.zju.edu.cn;https://www.xjtu.edu.cn", "aff_unique_abbr": "UTS;ZJU;XJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Tuo and Wang,\n Wenguan and Wang,\n Xiaohan and Yang,\n Yi and Zheng,\n Qinghua\n},\n title = {\n Clustering based Point Cloud Representation Learning for 3D Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8283-8294\n} 
\n}" }, { "title": "Clutter Detection and Removal in 3D Scenes with View-Consistent Inpainting", @@ -9289,7 +9604,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Clutter_Detection_and_Removal_in_3D_Scenes_with_View-Consistent_Inpainting_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Clutter_Detection_and_Removal_in_3D_Scenes_with_View-Consistent_Inpainting_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Fangyin and Funkhouser,\n Thomas and Rusinkiewicz,\n Szymon\n},\n title = {\n Clutter Detection and Removal in 3D Scenes with View-Consistent Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18131-18141\n} \n}" }, { "title": "Co-Evolution of Pose and Mesh for 3D Human Body Estimation from Video", @@ -9301,7 +9617,7 @@ "author": "Yingxuan You; Hong Liu; Ti Wang; Wenhao Li; Runwei Ding; Xia Li", "abstract": "Despite significant progress in single image-based 3D human mesh recovery, accurately and smoothly recovering 3D human motion from a video remains challenging. Existing video-based methods generally recover human mesh by estimating the complex pose and shape parameters from coupled image features, whose high complexity and low representation ability often result in inconsistent pose motion and limited shape patterns. To alleviate this issue, we introduce 3D pose as the intermediary and propose a Pose and Mesh Co-Evolution network (PMCE) that decouples this task into two parts: 1) video-based 3D human pose estimation and 2) mesh vertices regression from the estimated 3D pose and temporal image feature. Specifically, we propose a two-stream encoder that estimates mid-frame 3D pose and extracts a temporal image feature from the input image sequence. 
In addition, we design a co-evolution decoder that performs pose and mesh interactions with the image-guided Adaptive Layer Normalization (AdaLN) to make pose and mesh fit the human body shape. Extensive experiments demonstrate that the proposed PMCE outperforms previous state-of-the-art methods in terms of both per-frame accuracy and temporal consistency on three benchmark datasets: 3DPW, Human3.6M, and MPI-INF-3DHP. Our code is available at https://github.com/kasvii/PMCE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/You_Co-Evolution_of_Pose_and_Mesh_for_3D_Human_Body_Estimation_ICCV_2023_paper.pdf", - "aff": "Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Department of Computer Science, ETH Z\u00fcrich", + "aff": "Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University; Department of Computer Science, ETH Zürich", "project": "", "github": "https://github.com/kasvii/PMCE", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/You_Co-Evolution_of_Pose_ICCV_2023_supplemental.pdf", @@ -9314,14 +9630,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/You_Co-Evolution_of_Pose_and_Mesh_for_3D_Human_Body_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;1", - "aff_unique_norm": 
"Peking University;ETH Zurich", + "aff_unique_norm": "Peking University;ETH Zürich", "aff_unique_dep": "Key Laboratory of Machine Perception;Department of Computer Science", "aff_unique_url": "http://www.pku.edu.cn;https://www.ethz.ch", "aff_unique_abbr": "PKU;ETHZ", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{You_2023_ICCV,\n \n author = {\n You,\n Yingxuan and Liu,\n Hong and Wang,\n Ti and Li,\n Wenhao and Ding,\n Runwei and Li,\n Xia\n},\n title = {\n Co-Evolution of Pose and Mesh for 3D Human Body Estimation from Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14963-14973\n} \n}" }, { "title": "CoIn: Contrastive Instance Feature Mining for Outdoor 3D Object Detection with Very Limited Annotations", @@ -9346,14 +9663,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xia_CoIn_Contrastive_Instance_Feature_Mining_for_Outdoor_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;2;0", - "aff_unique_norm": "Xiamen University;Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.;Texas A&M University", + "aff_unique_norm": "Xiamen University;Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V.;Texas A&M University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xmu.edu.cn;https://www.mpg.de;https://www.tamu.edu", "aff_unique_abbr": "XMU;MPG;TAMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;2;0", - "aff_country_unique": "China;Germany;United States" + "aff_country_unique": "China;Germany;United States", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Qiming and Deng,\n Jinhao and Wen,\n Chenglu and Wu,\n Hai 
and Shi,\n Shaoshuai and Li,\n Xin and Wang,\n Cheng\n},\n title = {\n CoIn: Contrastive Instance Feature Mining for Outdoor 3D Object Detection with Very Limited Annotations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6254-6263\n} \n}" }, { "title": "CoSign: Exploring Co-occurrence Signals in Skeleton-based Continuous Sign Language Recognition", @@ -9385,7 +9703,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiao_2023_ICCV,\n \n author = {\n Jiao,\n Peiqi and Min,\n Yuecong and Li,\n Yanan and Wang,\n Xiaotao and Lei,\n Lei and Chen,\n Xilin\n},\n title = {\n CoSign: Exploring Co-occurrence Signals in Skeleton-based Continuous Sign Language Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20676-20686\n} \n}" }, { "title": "CoTDet: Affordance Knowledge Prompting for Task Driven Object Detection", @@ -9417,7 +9736,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Jiajin and Zheng,\n Ge and Yu,\n Jingyi and Yang,\n Sibei\n},\n title = {\n CoTDet: Affordance Knowledge Prompting for Task Driven Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3068-3078\n} \n}" }, { "title": "Coarse-to-Fine Amodal Segmentation with Shape Prior", @@ -9442,14 +9762,15 @@ "author_num": 7, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Gao_Coarse-to-Fine_Amodal_Segmentation_with_Shape_Prior_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;1;1;1;0", - "aff_unique_norm": "Fudan University;Amazon", - "aff_unique_dep": ";Amazon Web Services", + "aff_unique_norm": "Fudan University;Amazon Web Services", + "aff_unique_dep": ";", "aff_unique_url": "https://www.fudan.edu.cn;https://aws.amazon.com", "aff_unique_abbr": "Fudan;AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;1;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Jianxiong and Qian,\n Xuelin and Wang,\n Yikai and Xiao,\n Tianjun and He,\n Tong and Zhang,\n Zheng and Fu,\n Yanwei\n},\n title = {\n Coarse-to-Fine Amodal Segmentation with Shape Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1262-1271\n} \n}" }, { "title": "Coarse-to-Fine: Learning Compact Discriminative Representation for Single-Stage Image Retrieval", @@ -9481,7 +9802,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Yunquan and Gao,\n Xinkai and Ke,\n Bo and Qiao,\n Ruizhi and Sun,\n Xing\n},\n title = {\n Coarse-to-Fine: Learning Compact Discriminative Representation for Single-Stage Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11260-11269\n} \n}" }, { "title": "Coherent Event Guided Low-Light Video Enhancement", @@ -9513,7 +9835,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", 
"aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Jinxiu and Yang,\n Yixin and Li,\n Boyu and Duan,\n Peiqi and Xu,\n Yong and Shi,\n Boxin\n},\n title = {\n Coherent Event Guided Low-Light Video Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10615-10625\n} \n}" }, { "title": "CoinSeg: Contrast Inter- and Intra- Class Representations for Incremental Segmentation", @@ -9536,7 +9859,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_CoinSeg_Contrast_Inter-_and_Intra-_Class_Representations_for_Incremental_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_CoinSeg_Contrast_Inter-_and_Intra-_Class_Representations_for_Incremental_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Zekang and Gao,\n Guangyu and Jiao,\n Jianbo and Liu,\n Chi Harold and Wei,\n Yunchao\n},\n title = {\n CoinSeg: Contrast Inter- and Intra- Class Representations for Incremental Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 843-853\n} \n}" }, { "title": "Collaborative Propagation on Multiple Instance Graphs for 3D Instance Segmentation with Single-point Supervision", @@ -9568,7 +9892,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Singapore", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Shichao and Li,\n Ruibo and Wei,\n Jiacheng and Liu,\n Fayao and 
Lin,\n Guosheng\n},\n title = {\n Collaborative Propagation on Multiple Instance Graphs for 3D Instance Segmentation with Single-point Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16665-16674\n} \n}" }, { "title": "Collaborative Tracking Learning for Frame-Rate-Insensitive Multi-Object Tracking", @@ -9591,7 +9916,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Collaborative_Tracking_Learning_for_Frame-Rate-Insensitive_Multi-Object_Tracking_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Collaborative_Tracking_Learning_for_Frame-Rate-Insensitive_Multi-Object_Tracking_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yiheng and Wu,\n Junta and Fu,\n Yi\n},\n title = {\n Collaborative Tracking Learning for Frame-Rate-Insensitive Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9964-9973\n} \n}" }, { "title": "Collecting The Puzzle Pieces: Disentangled Self-Driven Human Pose Transfer by Permuting Textures", @@ -9616,14 +9942,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Collecting_The_Puzzle_Pieces_Disentangled_Self-Driven_Human_Pose_Transfer_by_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Boston University;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "Boston University;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://www.nvidia.com", "aff_unique_abbr": "BU;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + 
"aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Nannan and Shih,\n Kevin J and Plummer,\n Bryan A.\n},\n title = {\n Collecting The Puzzle Pieces: Disentangled Self-Driven Human Pose Transfer by Permuting Textures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7126-7137\n} \n}" }, { "title": "Combating Noisy Labels with Sample Selection by Mining High-Discrepancy Examples", @@ -9648,14 +9975,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xia_Combating_Noisy_Labels_with_Sample_Selection_by_Mining_High-Discrepancy_Examples_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;5;0", - "aff_unique_norm": "University of Sydney;Hong Kong Baptist University;JD;University of Science and Technology of China;University of Melbourne;Nanjing University of Science and Technology", - "aff_unique_dep": ";;JD Explore Academy;;;", + "aff_unique_norm": "University of Sydney;Hong Kong Baptist University;JD Explore Academy;University of Science and Technology of China;University of Melbourne;Nanjing University of Science and Technology", + "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.hkbu.edu.hk;;http://www.ustc.edu.cn;https://www.unimelb.edu.au;http://www.nust.edu.cn/", "aff_unique_abbr": "USYD;HKBU;;USTC;UniMelb;NUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1;0;1;0", - "aff_country_unique": "Australia;China;" + "aff_country_unique": "Australia;China;", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Xiaobo and Han,\n Bo and Zhan,\n Yibing and Yu,\n Jun and Gong,\n Mingming and Gong,\n Chen and Liu,\n Tongliang\n},\n title = {\n Combating Noisy Labels with Sample Selection by Mining High-Discrepancy Examples\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1833-1843\n} \n}" }, { "title": "Communication-Efficient Vertical Federated Learning with Limited Overlapping Samples", @@ -9680,14 +10008,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_Communication-Efficient_Vertical_Federated_Learning_with_Limited_Overlapping_Samples_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1;1;0;1", - "aff_unique_norm": "Duke University;NVIDIA", - "aff_unique_dep": "Department of Electrical and Computer Engineering;NVIDIA Corporation", + "aff_unique_norm": "Duke University;NVIDIA Corporation", + "aff_unique_dep": "Department of Electrical and Computer Engineering;", "aff_unique_url": "https://www.duke.edu;https://www.nvidia.com", "aff_unique_abbr": "Duke;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Jingwei and Xu,\n Ziyue and Yang,\n Dong and Nath,\n Vishwesh and Li,\n Wenqi and Zhao,\n Can and Xu,\n Daguang and Chen,\n Yiran and Roth,\n Holger R.\n},\n title = {\n Communication-Efficient Vertical Federated Learning with Limited Overlapping Samples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5203-5212\n} \n}" }, { "title": "Communication-efficient Federated Learning with Single-Step Synthetic Features Compressor for Faster Convergence", @@ -9712,14 +10041,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_Communication-efficient_Federated_Learning_with_Single-Step_Synthetic_Features_Compressor_for_Faster_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2;0+1;0+1;0+1", - "aff_unique_norm": "Sichuan 
University;Engineering Research Center of Machine Learning and Industry Intelligence;University of Illinois Urbana-Champaign", - "aff_unique_dep": ";Engineering;", + "aff_unique_norm": "Sichuan University;Engineering Research Center of Machine Learning and Industry Intelligence;University of Illinois at Urbana-Champaign", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.scu.edu.cn;;https://illinois.edu", "aff_unique_abbr": "SCU;;UIUC", "aff_campus_unique_index": ";;1;;;", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;2;0;0;0", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Yuhao and Shi,\n Mingjia and Li,\n Yuanxi and Sun,\n Yanan and Ye,\n Qing and Lv,\n Jiancheng\n},\n title = {\n Communication-efficient Federated Learning with Single-Step Synthetic Features Compressor for Faster Convergence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5031-5040\n} \n}" }, { "title": "Compatibility of Fundamental Matrices for Complete Viewing Graphs", @@ -9727,8 +10057,8 @@ "status": "Poster", "track": "main", "pid": "12114", - "author_site": "Martin Br\u00e5telund, Felix Rydell", - "author": "Martin Br\u00e5telund; Felix Rydell", + "author_site": "Martin Bråtelund, Felix Rydell", + "author": "Martin Bråtelund; Felix Rydell", "abstract": "This paper studies the problem of recovering cameras from a set of fundamental matrices. A set of fundamental matrices is said to be compatible if a set of cameras exists for which they are the fundamental matrices. We focus on the complete graph, where fundamental matrices for each pair of cameras are given. 
Previous work has established necessary and sufficient conditions for compatibility as rank and eigenvalue conditions on the n-view fundamental matrix obtained by concatenating the individual fundamental matrices. In this work, we show that the eigenvalue condition is redundant in the generic and collinear cases. We provide explicit homogeneous polynomials that describe necessary and sufficient conditions for compatibility in terms of the fundamental matrices and their epipoles. In this direction, we find that quadruple-wise compatibility is enough to ensure global compatibility for any number of cameras. We demonstrate that for four cameras, compatibility is generically described by triple-wise conditions and one additional equation involving all fundamental matrices.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bratelund_Compatibility_of_Fundamental_Matrices_for_Complete_Viewing_Graphs_ICCV_2023_paper.pdf", "aff": "University of Oslo; KTH Royal Institute of Technology", @@ -9751,7 +10081,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Norway;Sweden" + "aff_country_unique": "Norway;Sweden", + "bibtex": "@InProceedings{Bratelund_2023_ICCV,\n \n author = {\n Br\\r{a\n}telund,\n Martin and Rydell,\n Felix\n},\n title = {\n Compatibility of Fundamental Matrices for Complete Viewing Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3328-3336\n} \n}" }, { "title": "Complementary Domain Adaptation and Generalization for Unsupervised Continual Domain Shift Learning", @@ -9783,7 +10114,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Cho_2023_ICCV,\n \n author = {\n Cho,\n Wonguk and Park,\n Jinha and Kim,\n Taesup\n},\n title = {\n 
Complementary Domain Adaptation and Generalization for Unsupervised Continual Domain Shift Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11442-11452\n} \n}" }, { "title": "Compositional Feature Augmentation for Unbiased Scene Graph Generation", @@ -9815,7 +10147,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Lin and Chen,\n Guikun and Xiao,\n Jun and Yang,\n Yi and Wang,\n Chunping and Chen,\n Long\n},\n title = {\n Compositional Feature Augmentation for Unbiased Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21685-21695\n} \n}" }, { "title": "Computation and Data Efficient Backdoor Attacks", @@ -9847,7 +10180,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Yutong and Han,\n Xingshuo and Qiu,\n Han and Zhang,\n Tianwei\n},\n title = {\n Computation and Data Efficient Backdoor Attacks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4805-4814\n} \n}" }, { "title": "Computational 3D Imaging with Position Sensors", @@ -9870,7 +10204,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Klotz_Computational_3D_Imaging_with_Position_Sensors_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Klotz_Computational_3D_Imaging_with_Position_Sensors_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Klotz_2023_ICCV,\n \n author = {\n Klotz,\n Jeremy and Gupta,\n Mohit and Sankaranarayanan,\n Aswin C.\n},\n title = {\n Computational 3D Imaging with Position Sensors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8125-8134\n} \n}" }, { "title": "Computationally-Efficient Neural Image Compression with Shallow Decoders", @@ -9902,7 +10237,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yibo and Mandt,\n Stephan\n},\n title = {\n Computationally-Efficient Neural Image Compression with Shallow Decoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 530-540\n} \n}" }, { "title": "ConSlide: Asynchronous Hierarchical Interaction Transformer with Breakup-Reorganize Rehearsal for Continual Whole Slide Image Analysis", @@ -9927,14 +10263,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_ConSlide_Asynchronous_Hierarchical_Interaction_Transformer_with_Breakup-Reorganize_Rehearsal_for_Continual_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2;1;3;0", - "aff_unique_norm": "University of Hong Kong;Zhejiang University;Hong Kong Polytechnic University;Stanford University", + "aff_unique_norm": "The University of Hong Kong;Zhejiang University;The Hong Kong Polytechnic University;Stanford University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hku.hk;https://www.zju.edu.cn;https://www.polyu.edu.hk;https://www.stanford.edu", 
"aff_unique_abbr": "HKU;ZJU;PolyU;Stanford", "aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "Hong Kong SAR;;Stanford", "aff_country_unique_index": "0+0;0+0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Yanyan and Zhao,\n Weiqin and Wang,\n Shujun and Fu,\n Yu and Jiang,\n Yuming and Yu,\n Lequan\n},\n title = {\n ConSlide: Asynchronous Hierarchical Interaction Transformer with Breakup-Reorganize Rehearsal for Continual Whole Slide Image Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21349-21360\n} \n}" }, { "title": "Concept-wise Fine-tuning Matters in Preventing Negative Transfer", @@ -9966,7 +10303,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yunqiao and Huang,\n Long-Kai and Wei,\n Ying\n},\n title = {\n Concept-wise Fine-tuning Matters in Preventing Negative Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18753-18763\n} \n}" }, { "title": "Conceptual and Hierarchical Latent Space Decomposition for Face Editing", @@ -9991,14 +10329,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ozkan_Conceptual_and_Hierarchical_Latent_Space_Decomposition_for_Face_Editing_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Samsung", - "aff_unique_dep": "Samsung Research UK", + "aff_unique_norm": "Samsung Research UK", + "aff_unique_dep": "", "aff_unique_url": "https://www.samsung.com/uk/research/", "aff_unique_abbr": "SRUK", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Ozkan_2023_ICCV,\n \n author = {\n Ozkan,\n Savas and Ozay,\n Mete and Robinson,\n Tom\n},\n title = {\n Conceptual and Hierarchical Latent Space Decomposition for Face Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7248-7257\n} \n}" }, { "title": "Conditional 360-degree Image Synthesis for Immersive Indoor Scene Decoration", @@ -10030,7 +10369,8 @@ "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1+2;3;0", - "aff_country_unique": "China;Ireland;Vietnam;Australia" + "aff_country_unique": "China;Ireland;Vietnam;Australia", + "bibtex": "@InProceedings{Shum_2023_ICCV,\n \n author = {\n Shum,\n Ka Chun and Pang,\n Hong-Wing and Hua,\n Binh-Son and Nguyen,\n Duc Thanh and Yeung,\n Sai-Kit\n},\n title = {\n Conditional 360-degree Image Synthesis for Immersive Indoor Scene Decoration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4478-4488\n} \n}" }, { "title": "Conditional Cross Attention Network for Multi-Space Embedding without Entanglement in Only a SINGLE Network", @@ -10062,7 +10402,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Chull Hwan and Hwang,\n Taebaek and Yoon,\n Jooyoung and Choi,\n Shunghyun and Gu,\n Yeong Hyeon\n},\n title = {\n Conditional Cross Attention Network for Multi-Space Embedding without Entanglement in Only a 
SINGLE Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11112-11121\n} \n}" }, { "title": "Confidence-aware Pseudo-label Learning for Weakly Supervised Visual Grounding", @@ -10094,7 +10435,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yang and Zhang,\n Jiahua and Chen,\n Qingchao and Peng,\n Yuxin\n},\n title = {\n Confidence-aware Pseudo-label Learning for Weakly Supervised Visual Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2828-2838\n} \n}" }, { "title": "Confidence-based Visual Dispersal for Few-shot Unsupervised Domain Adaptation", @@ -10126,7 +10468,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiong_2023_ICCV,\n \n author = {\n Xiong,\n Yizhe and Chen,\n Hui and Lin,\n Zijia and Zhao,\n Sicheng and Ding,\n Guiguang\n},\n title = {\n Confidence-based Visual Dispersal for Few-shot Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11621-11631\n} \n}" }, { "title": "Consistent Depth Prediction for Transparent Object Reconstruction from RGB-D Camera", @@ -10158,7 +10501,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Yuxiang and Zhu,\n Yifan 
and Zhang,\n Haiwei and Ren,\n Bo\n},\n title = {\n Consistent Depth Prediction for Transparent Object Reconstruction from RGB-D Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3459-3468\n} \n}" }, { "title": "Constraining Depth Map Geometry for Multi-View Stereo: A Dual-Depth Approach with Saddle-shaped Depth Cells", @@ -10181,7 +10525,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Constraining_Depth_Map_Geometry_for_Multi-View_Stereo_A_Dual-Depth_Approach_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Constraining_Depth_Map_Geometry_for_Multi-View_Stereo_A_Dual-Depth_Approach_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Xinyi and Zhao,\n Weiyue and Liu,\n Tianqi and Huang,\n Zihao and Cao,\n Zhiguo and Li,\n Xin\n},\n title = {\n Constraining Depth Map Geometry for Multi-View Stereo: A Dual-Depth Approach with Saddle-shaped Depth Cells\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17661-17670\n} \n}" }, { "title": "ContactGen: Generative Contact Modeling for Grasp Generation", @@ -10206,14 +10551,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_ContactGen_Generative_Contact_Modeling_for_Grasp_Generation_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Adobe", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Adobe", "aff_unique_dep": ";Adobe Research", "aff_unique_url": "https://illinois.edu;https://research.adobe.com", "aff_unique_abbr": "UIUC;Adobe", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", 
"aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Shaowei and Zhou,\n Yang and Yang,\n Jimei and Gupta,\n Saurabh and Wang,\n Shenlong\n},\n title = {\n ContactGen: Generative Contact Modeling for Grasp Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20609-20620\n} \n}" }, { "title": "Contactless Pulse Estimation Leveraging Pseudo Labels and Self-Supervision", @@ -10245,7 +10591,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zhihua and Yin,\n Lijun\n},\n title = {\n Contactless Pulse Estimation Leveraging Pseudo Labels and Self-Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20588-20597\n} \n}" }, { "title": "Content-Aware Local GAN for Photo-Realistic Super-Resolution", @@ -10277,7 +10624,8 @@ "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n JoonKyu and Son,\n Sanghyun and Lee,\n Kyoung Mu\n},\n title = {\n Content-Aware Local GAN for Photo-Realistic Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10585-10594\n} \n}" }, { "title": "Context-Aware Planning and Environment-Aware Memory for Instruction Following Embodied Agents", @@ -10309,7 
+10657,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Gwangju", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Byeonghwi and Kim,\n Jinyeon and Kim,\n Yuyeong and Min,\n Cheolhong and Choi,\n Jonghyun\n},\n title = {\n Context-Aware Planning and Environment-Aware Memory for Instruction Following Embodied Agents\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10936-10946\n} \n}" }, { "title": "Continual Learning for Personalized Co-speech Gesture Generation", @@ -10341,7 +10690,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Ahuja_2023_ICCV,\n \n author = {\n Ahuja,\n Chaitanya and Joshi,\n Pratik and Ishii,\n Ryo and Morency,\n Louis-Philippe\n},\n title = {\n Continual Learning for Personalized Co-speech Gesture Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20893-20903\n} \n}" }, { "title": "Continual Segment: Towards a Single, Unified and Non-forgetting Continual Segmentation Model of 143 Whole-body Organs in CT Scans", @@ -10373,7 +10723,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0+0;0;0+0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ji_2023_ICCV,\n \n author = {\n Ji,\n Zhanghexuan and Guo,\n Dazhou and Wang,\n Puyang and Yan,\n Ke and Lu,\n Le and Xu,\n Minfeng and Wang,\n Qifeng and Ge,\n Jia and Gao,\n Mingchen and Ye,\n Xianghua and Jin,\n 
Dakai\n},\n title = {\n Continual Segment: Towards a Single,\n Unified and Non-forgetting Continual Segmentation Model of 143 Whole-body Organs in CT Scans\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21140-21151\n} \n}" }, { "title": "Continual Zero-Shot Learning through Semantically Guided Generative Random Walks", @@ -10405,7 +10756,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;0", - "aff_country_unique": "Saudi Arabia;Sri Lanka" + "aff_country_unique": "Saudi Arabia;Sri Lanka", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Wenxuan and Janson,\n Paul and Yi,\n Kai and Skorokhodov,\n Ivan and Elhoseiny,\n Mohamed\n},\n title = {\n Continual Zero-Shot Learning through Semantically Guided Generative Random Walks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11574-11585\n} \n}" }, { "title": "Continuously Masked Transformer for Image Inpainting", @@ -10430,14 +10782,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ko_Continuously_Masked_Transformer_for_Image_Inpainting_ICCV_2023_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Catholic University of Korea;Korea University", + "aff_unique_norm": "The Catholic University of Korea;Korea University", "aff_unique_dep": ";", "aff_unique_url": "http://www.cuk.edu.ko;https://www.korea.ac.kr", "aff_unique_abbr": "CUK;KU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ko_2023_ICCV,\n \n author = {\n Ko,\n Keunsoo and Kim,\n Chang-Su\n},\n title = {\n Continuously Masked Transformer for Image Inpainting\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13169-13178\n} \n}" }, { "title": "Contrastive Continuity on Augmentation Stability Rehearsal for Continual Self-Supervised Learning", @@ -10469,7 +10822,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Chengdu", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Haoyang and Wen,\n Haitao and Zhang,\n Xiaoliang and Qiu,\n Heqian and Wang,\n Lanxiao and Li,\n Hongliang\n},\n title = {\n Contrastive Continuity on Augmentation Stability Rehearsal for Continual Self-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5707-5717\n} \n}" }, { "title": "Contrastive Feature Masking Open-Vocabulary Vision Transformer", @@ -10492,7 +10846,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Contrastive_Feature_Masking_Open-Vocabulary_Vision_Transformer_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Contrastive_Feature_Masking_Open-Vocabulary_Vision_Transformer_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Dahun and Angelova,\n Anelia and Kuo,\n Weicheng\n},\n title = {\n Contrastive Feature Masking Open-Vocabulary Vision Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15602-15612\n} \n}" }, { "title": "Contrastive Learning Relies More on Spatial Inductive Bias Than Supervised Learning: An Empirical Study", @@ -10517,14 +10872,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Zhong_Contrastive_Learning_Relies_More_on_Spatial_Inductive_Bias_Than_Supervised_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Pennsylvania", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.upenn.edu", "aff_unique_abbr": "UIUC;UPenn", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhong_2023_ICCV,\n \n author = {\n Zhong,\n Yuanyi and Tang,\n Haoran and Chen,\n Jun-Kun and Wang,\n Yu-Xiong\n},\n title = {\n Contrastive Learning Relies More on Spatial Inductive Bias Than Supervised Learning: An Empirical Study\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16327-16336\n} \n}" }, { "title": "Contrastive Model Adaptation for Cross-Condition Robustness in Semantic Segmentation", @@ -10532,8 +10888,8 @@ "status": "Poster", "track": "main", "pid": "3271", - "author_site": "David Br\u00fcggemann, Christos Sakaridis, Tim Broedermann, Luc Van Gool", - "author": "David Br\u00fcggemann; Christos Sakaridis; Tim Broedermann; Luc Van Gool", + "author_site": "David Brüggemann, Christos Sakaridis, Tim Broedermann, Luc Van Gool", + "author": "David Brüggemann; Christos Sakaridis; Tim Broedermann; Luc Van Gool", "abstract": "Standard unsupervised domain adaptation methods adapt models from a source to a target domain using labeled source data and unlabeled target data jointly. 
In model adaptation, on the other hand, access to the labeled source data is prohibited, i.e., only the source-trained model and unlabeled target data are available. We investigate normal-to-adverse condition model adaptation for semantic segmentation, whereby image-level correspondences are available in the target domain. The target set consists of unlabeled pairs of adverse- and normal-condition street images taken at GPS-matched locations. Our method--CMA--leverages such image pairs to learn condition-invariant features via contrastive learning. In particular, CMA encourages features in the embedding space to be grouped according to their condition-invariant semantic content and not according to the condition under which respective inputs are captured. To obtain accurate cross-domain semantic correspondences, we warp the normal image to the viewpoint of the adverse image and leverage warp-confidence scores to create robust, aggregated features. With this approach, we achieve state-of-the-art semantic segmentation performance for model adaptation on several normal-to-adverse adaptation benchmarks, such as ACDC and Dark Zurich. We also evaluate CMA on a newly procured adverse-condition generalization benchmark and report favorable results compared to standard unsupervised domain adaptation methods, despite the comparative handicap of CMA due to source data inaccessibility. 
Code is available at https://github.com/brdav/cma.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bruggemann_Contrastive_Model_Adaptation_for_Cross-Condition_Robustness_in_Semantic_Segmentation_ICCV_2023_paper.pdf", "aff": "ETH Zurich, Switzerland; ETH Zurich, Switzerland; ETH Zurich, Switzerland; ETH Zurich, Switzerland", @@ -10556,7 +10912,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Bruggemann_2023_ICCV,\n \n author = {\n Br\\\"uggemann,\n David and Sakaridis,\n Christos and Broedermann,\n Tim and Van Gool,\n Luc\n},\n title = {\n Contrastive Model Adaptation for Cross-Condition Robustness in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11378-11387\n} \n}" }, { "title": "Contrastive Pseudo Learning for Open-World DeepFake Attribution", @@ -10588,7 +10945,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Zhimin and Chen,\n Shen and Yao,\n Taiping and Yin,\n Bangjie and Yi,\n Ran and Ding,\n Shouhong and Ma,\n Lizhuang\n},\n title = {\n Contrastive Pseudo Learning for Open-World DeepFake Attribution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20882-20892\n} \n}" }, { "title": "Controllable Guide-Space for Generalizable Face Forgery Detection", @@ -10620,7 +10978,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Ying and Zhen,\n Cheng and Yan,\n Pengfei\n},\n title = {\n Controllable Guide-Space for Generalizable Face Forgery Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20818-20827\n} \n}" }, { "title": "Controllable Person Image Synthesis with Pose-Constrained Latent Diffusion", @@ -10652,7 +11011,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Xiao and Zhu,\n Xiatian and Deng,\n Jiankang and Song,\n Yi-Zhe and Xiang,\n Tao\n},\n title = {\n Controllable Person Image Synthesis with Pose-Constrained Latent Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22768-22777\n} \n}" }, { "title": "Controllable Visual-Tactile Synthesis", @@ -10675,7 +11035,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_Controllable_Visual-Tactile_Synthesis_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_Controllable_Visual-Tactile_Synthesis_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Ruihan and Yuan,\n Wenzhen and Zhu,\n Jun-Yan\n},\n title = {\n Controllable Visual-Tactile Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7040-7052\n} \n}" }, { "title": "Convex Decomposition of Indoor Scenes", @@ -10700,14 +11061,15 @@ "author_num": 2, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Vavilala_Convex_Decomposition_of_Indoor_Scenes_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://www illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Vavilala_2023_ICCV,\n \n author = {\n Vavilala,\n Vaibhav and Forsyth,\n David\n},\n title = {\n Convex Decomposition of Indoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9176-9186\n} \n}" }, { "title": "Convolutional Networks with Oriented 1D Kernels", @@ -10739,7 +11101,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kirchmeyer_2023_ICCV,\n \n author = {\n Kirchmeyer,\n Alexandre and Deng,\n Jia\n},\n title = {\n Convolutional Networks with Oriented 1D Kernels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6222-6232\n} \n}" }, { "title": "Coordinate Quantized Neural Implicit Representations for Multi-view Reconstruction", @@ -10771,7 +11134,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Detroit", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Sijia and Hua,\n Jing and Han,\n Zhizhong\n},\n title = {\n Coordinate Quantized 
Neural Implicit Representations for Multi-view Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18358-18369\n} \n}" }, { "title": "Coordinate Transformer: Achieving Single-stage Multi-person Mesh Recovery from Videos", @@ -10794,7 +11158,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Coordinate_Transformer_Achieving_Single-stage_Multi-person_Mesh_Recovery_from_Videos_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Coordinate_Transformer_Achieving_Single-stage_Multi-person_Mesh_Recovery_from_Videos_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Haoyuan and Dong,\n Haoye and Jia,\n Hanchao and Huang,\n Dong and Kampffmeyer,\n Michael C. and Lin,\n Liang and Liang,\n Xiaodan\n},\n title = {\n Coordinate Transformer: Achieving Single-stage Multi-person Mesh Recovery from Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8744-8753\n} \n}" }, { "title": "CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields", @@ -10826,7 +11191,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+1;2;1+0;1;0", - "aff_country_unique": "China;United States;Singapore" + "aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Ziyuan and Guo,\n Qing and Cheung,\n Ka Chun and See,\n Simon and Wan,\n Renjie\n},\n title = {\n CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
22401-22411\n} \n}" }, { "title": "Corrupting Neuron Explanations of Deep Visual Features", @@ -10858,7 +11224,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Srivastava_2023_ICCV,\n \n author = {\n Srivastava,\n Divyansh and Oikarinen,\n Tuomas and Weng,\n Tsui-Wei\n},\n title = {\n Corrupting Neuron Explanations of Deep Visual Features\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1877-1886\n} \n}" }, { "title": "Counterfactual-based Saliency Map: Towards Visual Contrastive Explanations for Neural Networks", @@ -10881,7 +11248,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Counterfactual-based_Saliency_Map_Towards_Visual_Contrastive_Explanations_for_Neural_Networks_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Counterfactual-based_Saliency_Map_Towards_Visual_Contrastive_Explanations_for_Neural_Networks_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xue and Wang,\n Zhibo and Weng,\n Haiqin and Guo,\n Hengchang and Zhang,\n Zhifei and Jin,\n Lu and Wei,\n Tao and Ren,\n Kui\n},\n title = {\n Counterfactual-based Saliency Map: Towards Visual Contrastive Explanations for Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2042-2051\n} \n}" }, { "title": "Counting Crowds in Bad Weather", @@ -10913,7 +11281,8 @@ "aff_campus_unique_index": "0;0+1;0;0;2+3", "aff_campus_unique": "Taiwan;Stanford;Merced;Mountain View;", "aff_country_unique_index": "0;0+1;0;0;1+1+2", - 
"aff_country_unique": "China;United States;South Korea" + "aff_country_unique": "China;United States;South Korea", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zhi-Kai and Chen,\n Wei-Ting and Chiang,\n Yuan-Chun and Kuo,\n Sy-Yen and Yang,\n Ming-Hsuan\n},\n title = {\n Counting Crowds in Bad Weather\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23308-23319\n} \n}" }, { "title": "Creative Birds: Self-Supervised Single-View 3D Style Transfer", @@ -10945,7 +11314,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Renke and Que,\n Guimin and Chen,\n Shuo and Li,\n Xiang and Li,\n Jun and Yang,\n Jian\n},\n title = {\n Creative Birds: Self-Supervised Single-View 3D Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8775-8784\n} \n}" }, { "title": "CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow", @@ -10953,8 +11323,8 @@ "status": "Poster", "track": "main", "pid": "6008", - "author_site": "Philippe Weinzaepfel, Thomas Lucas, Vincent Leroy, Yohann Cabon, Vaibhav Arora, Romain Br\u00e9gier, Gabriela Csurka, Leonid Antsfeld, Boris Chidlovskii, Jerome Revaud", - "author": "Philippe Weinzaepfel; Thomas Lucas; Vincent Leroy; Yohann Cabon; Vaibhav Arora; Romain Br\u00e9gier; Gabriela Csurka; Leonid Antsfeld; Boris Chidlovskii; Jerome Revaud", + "author_site": "Philippe Weinzaepfel, Thomas Lucas, Vincent Leroy, Yohann Cabon, Vaibhav Arora, Romain Brégier, Gabriela Csurka, Leonid Antsfeld, Boris Chidlovskii, Jerome Revaud", + "author": "Philippe Weinzaepfel; Thomas 
Lucas; Vincent Leroy; Yohann Cabon; Vaibhav Arora; Romain Brégier; Gabriela Csurka; Leonid Antsfeld; Boris Chidlovskii; Jerome Revaud", "abstract": "Despite impressive performance for high-level downstream tasks, self-supervised pre-training methods have not yet fully delivered on dense geometric vision tasks such as stereo matching or optical flow. The application of self-supervised concepts, such as instance discrimination or masked image modeling, to geometric tasks is an active area of research. In this work, we build on the recent cross-view completion framework, a variation of masked image modeling that leverages a second view from the same scene which makes it well suited for binocular downstream tasks. The applicability of this concept has so far been limited in at least two ways: (a) by the difficulty of collecting real-world image pairs -- in practice only synthetic data have been used -- and (b) by the lack of generalization of vanilla transformers to dense downstream tasks for which relative position is more meaningful than absolute position. We explore three avenues of improvement. First, we introduce a method to collect suitable real-world image pairs at large scale. Second, we experiment with relative positional embeddings and show that they enable vision transformers to perform substantially better. Third, we scale up vision transformer based cross-completion architectures, which is made possible by the use of large amounts of data. 
With these improvements, we show for the first time that state-of-the-art results on stereo matching and optical flow can be reached without using any classical task-specific techniques like correlation volume, iterative estimation, image warping or multi-scale reasoning, thus paving the way towards universal vision models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.pdf", "aff": ";;;;;;;;;", @@ -10968,7 +11338,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Weinzaepfel_2023_ICCV,\n \n author = {\n Weinzaepfel,\n Philippe and Lucas,\n Thomas and Leroy,\n Vincent and Cabon,\n Yohann and Arora,\n Vaibhav and Br\\'egier,\n Romain and Csurka,\n Gabriela and Antsfeld,\n Leonid and Chidlovskii,\n Boris and Revaud,\n Jerome\n},\n title = {\n CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17969-17980\n} \n}" }, { "title": "Cross Contrasting Feature Perturbation for Domain Generalization", @@ -10993,14 +11364,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Cross_Contrasting_Feature_Perturbation_for_Domain_Generalization_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0+1", - "aff_unique_norm": "Southern University of Science and Technology;Pengcheng Laboratory", - "aff_unique_dep": "Department of Computer Science 
and Engineering;Peng Cheng Laboratory", + "aff_unique_norm": "Southern University of Science and Technology;Peng Cheng Laboratory", + "aff_unique_dep": "Department of Computer Science and Engineering;", "aff_unique_url": "https://www.sustech.edu.cn;", "aff_unique_abbr": "SUSTech;", "aff_campus_unique_index": "0+0;0;0;0+0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Chenming and Zhang,\n Daoan and Huang,\n Wenjian and Zhang,\n Jianguo\n},\n title = {\n Cross Contrasting Feature Perturbation for Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1327-1337\n} \n}" }, { "title": "Cross Modal Transformer: Towards Fast and Robust 3D Object Detection", @@ -11025,14 +11397,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yan_Cross_Modal_Transformer_Towards_Fast_and_Robust_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", - "aff_unique_norm": "Megvii Technology", + "aff_unique_norm": "MEGVII Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.megvii.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Junjie and Liu,\n Yingfei and Sun,\n Jianjian and Jia,\n Fan and Li,\n Shuailin and Wang,\n Tiancai and Zhang,\n Xiangyu\n},\n title = {\n Cross Modal Transformer: Towards Fast and Robust 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18268-18278\n} \n}" }, { 
"title": "Cross-Domain Product Representation Learning for Rich-Content E-Commerce", @@ -11064,7 +11437,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Bai_2023_ICCV,\n \n author = {\n Bai,\n Xuehan and Li,\n Yan and Cheng,\n Yanhua and Yang,\n Wenjie and Chen,\n Quan and Li,\n Han\n},\n title = {\n Cross-Domain Product Representation Learning for Rich-Content E-Commerce\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5697-5706\n} \n}" }, { "title": "Cross-Modal Learning with 3D Deformable Attention for Action Recognition", @@ -11096,7 +11470,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Sangwon and Ahn,\n Dasom and Ko,\n Byoung Chul\n},\n title = {\n Cross-Modal Learning with 3D Deformable Attention for Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10265-10275\n} \n}" }, { "title": "Cross-Modal Orthogonal High-Rank Augmentation for RGB-Event Transformer-Trackers", @@ -11128,7 +11503,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Zhiyu and Hou,\n Junhui and Wu,\n Dapeng Oliver\n},\n title = {\n Cross-Modal Orthogonal High-Rank Augmentation for RGB-Event Transformer-Trackers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22045-22055\n} \n}" }, { "title": "Cross-Modal Translation and Alignment for Survival Analysis", @@ -11160,7 +11536,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Fengtao and Chen,\n Hao\n},\n title = {\n Cross-Modal Translation and Alignment for Survival Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21485-21494\n} \n}" }, { "title": "Cross-Ray Neural Radiance Fields for Novel-View Synthesis from Unconstrained Image Collections", @@ -11168,6 +11545,7 @@ "status": "Oral", "track": "main", "pid": "2468", + "author_site": "Yifan Yang, Shuhai Zhang, Zixiong Huang, Yubing Zhang, Mingkui Tan", "author": "Yifan Yang, Shuhai Zhang, Zixiong Huang, Yubing Zhang, Mingkui Tan", "abstract": "Neural Radiance Fields (NeRF) is a revolutionary approach for rendering scenes by sampling a single ray per pixel and it has demonstrated impressive capabilities in novel-view synthesis from static scene images. However, in practice, we usually need to recover NeRF from unconstrained image collections, which poses two challenges: 1) the images often have dynamic changes in appearance because of different capturing time and camera settings; 2) the images may contain transient objects such as humans and cars, leading to occlusion and ghosting artifacts. Conventional approaches seek to address these challenges by locally utilizing a single ray to synthesize a color of a pixel. In contrast, humans typically perceive appearance and objects by globally utilizing information across multiple pixels. 
To mimic the perception process of humans, in this paper, we propose Cross-Ray NeRF (CR-NeRF) that leverages interactive information across multiple rays to synthesize occlusion-free novel views with the same appearances as the images. Specifically, to model varying appearances, we first propose to represent multiple rays with a novel cross-ray feature and then recover the appearance by fusing global statistics, i.e., feature covariance of the rays and the image appearance. Moreover, to avoid occlusion introduced by transient objects, we propose a transient objects handler and introduce a grid sampling strategy for masking out the transient objects. We theoretically find that leveraging correlation across multiple rays promotes capturing more global information. Moreover, extensive experimental results on large real-world datasets verify the effectiveness of CR-NeRF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yang_Cross-Ray_Neural_Radiance_Fields_for_Novel-View_Synthesis_from_Unconstrained_Image_ICCV_2023_paper.pdf", @@ -11179,7 +11557,8 @@ "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1587516035934714469&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Cross-Ray_Neural_Radiance_Fields_for_Novel-View_Synthesis_from_Unconstrained_Image_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Cross-Ray_Neural_Radiance_Fields_for_Novel-View_Synthesis_from_Unconstrained_Image_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yifan and Zhang,\n Shuhai and Huang,\n Zixiong and Zhang,\n Yubing and Tan,\n Mingkui\n},\n title = {\n Cross-Ray Neural Radiance Fields for Novel-View Synthesis from Unconstrained Image Collections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 15901-15911\n} \n}" }, { "title": "Cross-modal Latent Space Alignment for Image to Avatar Translation", @@ -11211,7 +11590,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{de_Guevara_2023_ICCV,\n \n author = {\n de Guevara,\n Manuel Ladron and Echevarria,\n Jose and Li,\n Yijun and Hold-Geoffroy,\n Yannick and Smith,\n Cameron and Ito,\n Daichi\n},\n title = {\n Cross-modal Latent Space Alignment for Image to Avatar Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 520-529\n} \n}" }, { "title": "Cross-modal Scalable Hyperbolic Hierarchical Clustering", @@ -11234,7 +11614,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Long_Cross-modal_Scalable_Hierarchical_Clustering_in_Hyperbolic_space_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Long_Cross-modal_Scalable_Hierarchical_Clustering_in_Hyperbolic_space_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Long_2023_ICCV,\n \n author = {\n Long,\n Teng and van Noord,\n Nanne\n},\n title = {\n Cross-modal Scalable Hyperbolic Hierarchical Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16655-16664\n} \n}" }, { "title": "Cross-view Semantic Alignment for Livestreaming Product Recognition", @@ -11266,7 +11647,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Wenjie and Chen,\n Yiyi and Li,\n Yan 
and Cheng,\n Yanhua and Liu,\n Xudong and Chen,\n Quan and Li,\n Han\n},\n title = {\n Cross-view Semantic Alignment for Livestreaming Product Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13404-13413\n} \n}" }, { "title": "Cross-view Topology Based Consistent and Complementary Information for Deep Multi-view Clustering", @@ -11298,7 +11680,8 @@ "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Changsha;Beijing", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Zhibin and Wang,\n Siwei and Jin,\n Jiaqi and Liu,\n Xinwang and Zhu,\n En\n},\n title = {\n Cross-view Topology Based Consistent and Complementary Information for Deep Multi-view Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19440-19451\n} \n}" }, { "title": "CrossLoc3D: Aerial-Ground Cross-Source 3D Place Recognition", @@ -11321,7 +11704,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guan_CrossLoc3D_Aerial-Ground_Cross-Source_3D_Place_Recognition_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guan_CrossLoc3D_Aerial-Ground_Cross-Source_3D_Place_Recognition_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Guan_2023_ICCV,\n \n author = {\n Guan,\n Tianrui and Muthuselvam,\n Aswath and Hoover,\n Montana and Wang,\n Xijun and Liang,\n Jing and Sathyamoorthy,\n Adarsh Jagan and Conover,\n Damon and Manocha,\n Dinesh\n},\n title = {\n CrossLoc3D: Aerial-Ground Cross-Source 3D Place Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11335-11344\n} \n}" }, { "title": "CrossMatch: Source-Free Domain Adaptive Semantic Segmentation via Cross-Modal Consistency Training", @@ -11346,14 +11730,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yin_CrossMatch_Source-Free_Domain_Adaptive_Semantic_Segmentation_via_Cross-Modal_Consistency_Training_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;3;2;0;1", - "aff_unique_norm": "Institute for Infocomm Research;National University of Singapore;Grabtaxi Holdings Pte. Ltd.;Zhejiang Gongshang University", + "aff_unique_norm": "Institute for Infocomm Research;National University of Singapore;Grabtaxi Holdings;Zhejiang Gongshang University", "aff_unique_dep": ";;;", - "aff_unique_url": "https://www.i2r.a-star.edu.sg;https://www.nus.edu.sg;https://www.grab.com;http://www.hzic.edu.cn", - "aff_unique_abbr": "I2R;NUS;Grab;ZJGSU", + "aff_unique_url": "https://www.i2r.a-star.edu.sg;https://www.nus.edu.sg;https://www.grab.com;http://www.hgh.edu.cn", + "aff_unique_abbr": "I2R;NUS;Grab;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;1;0;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Yin_2023_ICCV,\n \n author = {\n Yin,\n Yifang and Hu,\n Wenmiao and Liu,\n Zhenguang and Wang,\n Guanfeng and Xiang,\n Shili and Zimmermann,\n Roger\n},\n title = {\n CrossMatch: Source-Free Domain Adaptive Semantic Segmentation via Cross-Modal Consistency Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21786-21796\n} \n}" }, { "title": "CuNeRF: Cube-Based Neural Radiance Field for Zero-Shot Medical Image Arbitrary-Scale Super Resolution", @@ -11385,7 +11770,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": "Guangzhou;", 
"aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zixuan and Yang,\n Lingxiao and Lai,\n Jian-Huang and Xie,\n Xiaohua\n},\n title = {\n CuNeRF: Cube-Based Neural Radiance Field for Zero-Shot Medical Image Arbitrary-Scale Super Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21185-21195\n} \n}" }, { "title": "Cumulative Spatial Knowledge Distillation for Vision Transformers", @@ -11410,14 +11796,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Cumulative_Spatial_Knowledge_Distillation_for_Vision_Transformers_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Megvii Technology", + "aff_unique_norm": "MEGVII Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.megvii.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Borui and Song,\n Renjie and Liang,\n Jiajun\n},\n title = {\n Cumulative Spatial Knowledge Distillation for Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6146-6155\n} \n}" }, { "title": "Curvature-Aware Training for Coordinate Networks", @@ -11443,13 +11830,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Saratchandran_Curvature-Aware_Training_for_Coordinate_Networks_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Adelaide;Amazon", - "aff_unique_dep": "Australian Institute of Machine Learning;Amazon", + 
"aff_unique_dep": "Australian Institute of Machine Learning;", "aff_unique_url": "https://www.adelaide.edu.au;https://www.amazon.com.au", "aff_unique_abbr": "UoA;Amazon", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Adelaide;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Saratchandran_2023_ICCV,\n \n author = {\n Saratchandran,\n Hemanth and Chng,\n Shin-Fang and Ramasinghe,\n Sameera and MacDonald,\n Lachlan and Lucey,\n Simon\n},\n title = {\n Curvature-Aware Training for Coordinate Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13328-13338\n} \n}" }, { "title": "Cyclic Test-Time Adaptation on Monocular Video for 3D Human Mesh Reconstruction", @@ -11472,7 +11860,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nam_Cyclic_Test-Time_Adaptation_on_Monocular_Video_for_3D_Human_Mesh_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nam_Cyclic_Test-Time_Adaptation_on_Monocular_Video_for_3D_Human_Mesh_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Nam_2023_ICCV,\n \n author = {\n Nam,\n Hyeongjin and Jung,\n Daniel Sungho and Oh,\n Yeonguk and Lee,\n Kyoung Mu\n},\n title = {\n Cyclic Test-Time Adaptation on Monocular Video for 3D Human Mesh Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14829-14839\n} \n}" }, { "title": "Cyclic-Bootstrap Labeling for Weakly Supervised Object Detection", @@ -11504,7 +11893,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;1;0+0;0;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": 
"China;Australia", + "bibtex": "@InProceedings{Yin_2023_ICCV,\n \n author = {\n Yin,\n Yufei and Deng,\n Jiajun and Zhou,\n Wengang and Li,\n Li and Li,\n Houqiang\n},\n title = {\n Cyclic-Bootstrap Labeling for Weakly Supervised Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7008-7018\n} \n}" }, { "title": "D-IF: Uncertainty-aware Human Digitization via Implicit Distribution Field", @@ -11536,7 +11926,8 @@ "aff_campus_unique_index": ";1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+1;2;0;0+3;0+3", - "aff_country_unique": "United States;United Kingdom;Germany;China" + "aff_country_unique": "United States;United Kingdom;Germany;China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Xueting and Luo,\n Yihao and Xiu,\n Yuliang and Wang,\n Wei and Xu,\n Hao and Fan,\n Zhaoxin\n},\n title = {\n D-IF: Uncertainty-aware Human Digitization via Implicit Distribution Field\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9122-9132\n} \n}" }, { "title": "D3G: Exploring Gaussian Prior for Temporal Sentence Grounding with Glance Annotation", @@ -11568,7 +11959,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Hanjun and Shu,\n Xiujun and He,\n Sunan and Qiao,\n Ruizhi and Wen,\n Wei and Guo,\n Taian and Gan,\n Bei and Sun,\n Xing\n},\n title = {\n D3G: Exploring Gaussian Prior for Temporal Sentence Grounding with Glance Annotation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 13734-13746\n} \n}" }, { "title": "DALL-Eval: Probing the Reasoning Skills and Social Biases of Text-to-Image Generation Models", @@ -11600,7 +11992,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cho_2023_ICCV,\n \n author = {\n Cho,\n Jaemin and Zala,\n Abhay and Bansal,\n Mohit\n},\n title = {\n DALL-Eval: Probing the Reasoning Skills and Social Biases of Text-to-Image Generation Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3043-3054\n} \n}" }, { "title": "DARTH: Holistic Test-time Adaptation for Multiple Object Tracking", @@ -11632,7 +12025,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0", - "aff_country_unique": "Switzerland;Germany" + "aff_country_unique": "Switzerland;Germany", + "bibtex": "@InProceedings{Segu_2023_ICCV,\n \n author = {\n Segu,\n Mattia and Schiele,\n Bernt and Yu,\n Fisher\n},\n title = {\n DARTH: Holistic Test-time Adaptation for Multiple Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9717-9727\n} \n}" }, { "title": "DCPB: Deformable Convolution Based on the Poincare Ball for Top-view Fisheye Cameras", @@ -11664,7 +12058,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Xuan and Ran,\n Zhidan and Lu,\n Xiaobo\n},\n title = {\n DCPB: Deformable Convolution Based on the Poincare Ball for Top-view Fisheye Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13308-13317\n} \n}" }, { "title": "DDColor: Towards Photo-Realistic Image Colorization via Dual Decoders", @@ -11696,7 +12091,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Kang_2023_ICCV,\n \n author = {\n Kang,\n Xiaoyang and Yang,\n Tao and Ouyang,\n Wenqi and Ren,\n Peiran and Li,\n Lingzhi and Xie,\n Xuansong\n},\n title = {\n DDColor: Towards Photo-Realistic Image Colorization via Dual Decoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 328-338\n} \n}" }, { "title": "DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion", @@ -11708,7 +12104,7 @@ "author": "Zixiang Zhao; Haowen Bai; Yuanzhi Zhu; Jiangshe Zhang; Shuang Xu; Yulun Zhang; Kai Zhang; Deyu Meng; Radu Timofte; Luc Van Gool", "abstract": "Multi-modality image fusion aims to combine different modalities to produce fused images that retain the complementary features of each modality, such as functional highlights and texture details. To leverage strong generative priors and address challenges such as unstable training and lack of interpretability for GAN-based generative methods, we propose a novel fusion algorithm based on the denoising diffusion probabilistic model (DDPM). The fusion task is formulated as a conditional generation problem under the DDPM sampling framework, which is further divided into an unconditional generation subproblem and a maximum likelihood subproblem. The latter is modeled in a hierarchical Bayesian manner with latent variables and inferred by the expectation-maximization (EM) algorithm. 
By integrating the inference solution into the diffusion sampling iteration, our method can generate high-quality fused images with natural image generative priors and cross-modality information from source images. Note that all we required is an unconditional pre-trained generative model, and no fine-tuning is needed. Our extensive experiments indicate that our approach yields promising fusion results in infrared-visible image fusion and medical image fusion. The code is available at https://github.com/Zhaozixiang1228/MMIF-DDFM.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhao_DDFM_Denoising_Diffusion_Model_for_Multi-Modality_Image_Fusion_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University+Computer Vision Lab, ETH Z\u00fcrich; Xi\u2019an Jiaotong University; Computer Vision Lab, ETH Z\u00fcrich; Xi\u2019an Jiaotong University; Northwestern Polytechnical University; Computer Vision Lab, ETH Z\u00fcrich; Computer Vision Lab, ETH Z\u00fcrich; Xi\u2019an Jiaotong University+Macau University of Science and Technology; Computer Vision Lab, ETH Z\u00fcrich+University of W\u00fcrzburg; Computer Vision Lab, ETH Z\u00fcrich", + "aff": "Xi’an Jiaotong University+Computer Vision Lab, ETH Zürich; Xi’an Jiaotong University; Computer Vision Lab, ETH Zürich; Xi’an Jiaotong University; Northwestern Polytechnical University; Computer Vision Lab, ETH Zürich; Computer Vision Lab, ETH Zürich; Xi’an Jiaotong University+Macau University of Science and Technology; Computer Vision Lab, ETH Zürich+University of Würzburg; Computer Vision Lab, ETH Zürich", "project": "", "github": "https://github.com/Zhaozixiang1228/MMIF-DDFM", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhao_DDFM_Denoising_Diffusion_ICCV_2023_supplemental.pdf", @@ -11721,14 +12117,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_DDFM_Denoising_Diffusion_Model_for_Multi-Modality_Image_Fusion_ICCV_2023_paper.html", 
"aff_unique_index": "0+1;0;1;0;2;1;1;0+3;1+4;1", - "aff_unique_norm": "Xi'an Jiao Tong University;ETH Zurich;Northwestern Polytechnical University;Macau University of Science and Technology;University of W\u00fcrzburg", + "aff_unique_norm": "Xi'an Jiaotong University;ETH Zürich;Northwestern Polytechnical University;Macau University of Science and Technology;University of Würzburg", "aff_unique_dep": ";Computer Vision Lab;;;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.ethz.ch;https://www.nwpu.edu.cn;https://www.must.edu.mo;https://www.uni-wuerzburg.de", "aff_unique_abbr": "XJTU;ETHZ;NWPU;MUST;UWue", "aff_campus_unique_index": "1;1;1;1;2;1;1", - "aff_campus_unique": ";Z\u00fcrich;Macau SAR", + "aff_campus_unique": ";Zürich;Macau SAR", "aff_country_unique_index": "0+1;0;1;0;0;1;1;0+0;1+2;1", - "aff_country_unique": "China;Switzerland;Germany" + "aff_country_unique": "China;Switzerland;Germany", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Zixiang and Bai,\n Haowen and Zhu,\n Yuanzhi and Zhang,\n Jiangshe and Xu,\n Shuang and Zhang,\n Yulun and Zhang,\n Kai and Meng,\n Deyu and Timofte,\n Radu and Van Gool,\n Luc\n},\n title = {\n DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8082-8093\n} \n}" }, { "title": "DDG-Net: Discriminability-Driven Graph Network for Weakly-supervised Temporal Action Localization", @@ -11751,7 +12148,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tang_DDG-Net_Discriminability-Driven_Graph_Network_for_Weakly-supervised_Temporal_Action_Localization_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tang_DDG-Net_Discriminability-Driven_Graph_Network_for_Weakly-supervised_Temporal_Action_Localization_ICCV_2023_paper.html", 
+ "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Xiaojun and Fan,\n Junsong and Luo,\n Chuanchen and Zhang,\n Zhaoxiang and Zhang,\n Man and Yang,\n Zongyuan\n},\n title = {\n DDG-Net: Discriminability-Driven Graph Network for Weakly-supervised Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6622-6632\n} \n}" }, { "title": "DDIT: Semantic Scene Completion via Deformable Deep Implicit Templates", @@ -11776,14 +12174,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DDIT_Semantic_Scene_Completion_via_Deformable_Deep_Implicit_Templates_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;0+1;0+1+3;2;2;0+1+3", - "aff_unique_norm": "Technical University of Munich;Munich Center for Machine Learning;Chinese University of Hong Kong;University of Oxford", + "aff_unique_norm": "Technical University of Munich;Munich Center for Machine Learning;The Chinese University of Hong Kong;University of Oxford", "aff_unique_dep": ";Center for Machine Learning;;", "aff_unique_url": "https://www.tum.de;https://www.munich-center-for-machine-learning.de;https://www.cuhk.edu.hk;https://www.ox.ac.uk", "aff_unique_abbr": "TUM;MCML;CUHK;Oxford", "aff_campus_unique_index": "1;2;1;1;2;2;1", "aff_campus_unique": ";Munich;Hong Kong SAR", "aff_country_unique_index": "0+0;1;0+0;0+0+2;1;1;0+0+2", - "aff_country_unique": "Germany;China;United Kingdom" + "aff_country_unique": "Germany;China;United Kingdom", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Haoang and Dong,\n Jinhu and Wen,\n Binghui and Gao,\n Ming and Huang,\n Tianyu and Liu,\n Yun-Hui and Cremers,\n Daniel\n},\n title = {\n DDIT: Semantic Scene Completion via Deformable Deep Implicit Templates\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 21894-21904\n} \n}" }, { "title": "DDP: Diffusion Model for Dense Visual Prediction", @@ -11806,7 +12205,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ji_DDP_Diffusion_Model_for_Dense_Visual_Prediction_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ji_DDP_Diffusion_Model_for_Dense_Visual_Prediction_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ji_2023_ICCV,\n \n author = {\n Ji,\n Yuanfeng and Chen,\n Zhe and Xie,\n Enze and Hong,\n Lanqing and Liu,\n Xihui and Liu,\n Zhaoqiang and Lu,\n Tong and Li,\n Zhenguo and Luo,\n Ping\n},\n title = {\n DDP: Diffusion Model for Dense Visual Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21741-21752\n} \n}" }, { "title": "DDS2M: Self-Supervised Denoising Diffusion Spatio-Spectral Model for Hyperspectral Image Restoration", @@ -11831,14 +12231,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Miao_DDS2M_Self-Supervised_Denoising_Diffusion_Spatio-Spectral_Model_for_Hyperspectral_Image_Restoration_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;2", - "aff_unique_norm": "Wuhan University;Hubei Luojia Laboratory;University of Sydney", + "aff_unique_norm": "Wuhan University;Hubei Luojia Laboratory;The University of Sydney", "aff_unique_dep": "School of Computer Science;;School of Computer Science", "aff_unique_url": "http://www.whu.edu.cn/;;https://www.sydney.edu.au", "aff_unique_abbr": "WHU;;USYD", "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Wuhan;Sydney", "aff_country_unique_index": "0;0+0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Miao_2023_ICCV,\n \n author = {\n Miao,\n Yuchun and Zhang,\n Lefei and 
Zhang,\n Liangpei and Tao,\n Dacheng\n},\n title = {\n DDS2M: Self-Supervised Denoising Diffusion Spatio-Spectral Model for Hyperspectral Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12086-12096\n} \n}" }, { "title": "DECO: Dense Estimation of 3D Human-Scene Contact In The Wild", @@ -11850,7 +12251,7 @@ "author": "Shashank Tripathi; Agniv Chatterjee; Jean-Claude Passy; Hongwei Yi; Dimitrios Tzionas; Michael J. Black", "abstract": "Understanding how humans use physical contact to interact with the world is key to enabling human-centric artificial intelligence. While inferring 3D contact is crucial for modeling realistic and physically-plausible human-object interactions, existing methods either focus on 2D, consider body joints rather than the surface, use coarse 3D body regions, or do not generalize to in-the-wild images. In contrast, we focus on inferring dense, 3D contact between the full body surface and objects in arbitrary images. To achieve this, we first collect DAMON, a new dataset containing dense vertex-level contact annotations paired with RGB images containing complex human-object and human-scene contact. Second, we train DECO, a novel 3D contact detector that uses both body-part-driven and scene-context-driven attention to estimate vertex-level contact on the SMPL body. DECO builds on the insight that human observers recognize contact by reasoning about the contacting body parts, their proximity to scene objects, and the surrounding scene context. \n We perform extensive evaluations of our detector on DAMON as well as on the RICH and BEHAVE datasets. We significantly outperform existing SOTA methods across all benchmarks. \n We also show qualitatively that DECO generalizes well to diverse and challenging real-world human interactions in natural images. 
The code, data, and models are available at https://deco.is.tue.mpg.de.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tripathi_DECO_Dense_Estimation_of_3D_Human-Scene_Contact_In_The_Wild_ICCV_2023_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; University of Amsterdam, the Netherlands; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany; University of Amsterdam, the Netherlands; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "https://deco.is.tue.mpg.de", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Tripathi_DECO_Dense_Estimation_ICCV_2023_supplemental.pdf", @@ -11868,9 +12269,10 @@ "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uva.nl", "aff_unique_abbr": "MPI-IS;UvA", "aff_campus_unique_index": "0;0;0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "Germany;Netherlands" + "aff_country_unique": "Germany;Netherlands", + "bibtex": "@InProceedings{Tripathi_2023_ICCV,\n \n author = {\n Tripathi,\n Shashank and Chatterjee,\n Agniv and Passy,\n Jean-Claude and Yi,\n Hongwei and Tzionas,\n Dimitrios and Black,\n Michael J.\n},\n title = {\n DECO: Dense Estimation of 3D Human-Scene Contact In The Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 8001-8013\n} \n}" }, { "title": "DEDRIFT: Robust Similarity Search under Content Drift", @@ -11895,14 +12297,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Baranchuk_DEDRIFT_Robust_Similarity_Search_under_Content_Drift_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1", - "aff_unique_norm": "Yandex;Meta", + "aff_unique_norm": "Yandex;Meta Platforms, Inc.", "aff_unique_dep": "Yandex Research;Meta AI", "aff_unique_url": "https://research.yandex.com;https://meta.com", "aff_unique_abbr": "Yandex;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Russian Federation;United States" + "aff_country_unique": "Russia;United States", + "bibtex": "@InProceedings{Baranchuk_2023_ICCV,\n \n author = {\n Baranchuk,\n Dmitry and Douze,\n Matthijs and Upadhyay,\n Yash and Yalniz,\n I. Zeki\n},\n title = {\n DEDRIFT: Robust Similarity Search under Content Drift\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11026-11035\n} \n}" }, { "title": "DELFlow: Dense Efficient Learning of Scene Flow for Large-Scale Point Clouds", @@ -11934,7 +12337,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;1;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Chensheng and Wang,\n Guangming and Lo,\n Xian Wan and Wu,\n Xinrui and Xu,\n Chenfeng and Tomizuka,\n Masayoshi and Zhan,\n Wei and Wang,\n Hesheng\n},\n title = {\n DELFlow: Dense Efficient Learning of Scene Flow for Large-Scale Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
16901-16910\n} \n}" }, { "title": "DETA: Denoised Task Adaptation for Few-Shot Learning", @@ -11966,7 +12370,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Ji and Gao,\n Lianli and Luo,\n Xu and Shen,\n Hengtao and Song,\n Jingkuan\n},\n title = {\n DETA: Denoised Task Adaptation for Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11541-11551\n} \n}" }, { "title": "DETR Does Not Need Multi-Scale or Locality Design", @@ -11978,7 +12383,7 @@ "author": "Yutong Lin; Yuhui Yuan; Zheng Zhang; Chen Li; Nanning Zheng; Han Hu", "abstract": "This paper presents an improved DETR detector that maintains a \"plain\" nature: using a single-scale feature map and global cross-attention calculations without specific locality constraints, in contrast to previous leading DETR-based detectors that reintroduce architectural inductive biases of multi-scale and locality into the decoder. We show that two simple technologies are surprisingly effective within a plain design to compensate for the lack of multi-scale feature maps and locality constraints. The first is a box-to-pixel relative position bias (BoxRPB) term added to the cross-attention formulation, which well guides each query to attend to the corresponding object region while also providing encoding flexibility. 
The second is masked image modeling (MIM)-based backbone pre-training which helps learn representation with fine-grained localization ability and proves crucial for remedying dependencies on the multi-scale feature maps.\n By incorporating these technologies and recent advancements in training and problem formation, the improved \"plain\" DETR showed exceptional improvements over the original DETR detector. By leveraging the Object365 dataset for pre-training, it achieved 63.9 mAP accuracy using a Swin-L backbone, which is highly competitive with state-of-the-art detectors which all heavily rely on multi-scale feature maps and region-based feature extraction. Code will be available at https://github.com/impiga/Plain-DETR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lin_DETR_Does_Not_Need_Multi-Scale_or_Locality_Design_ICCV_2023_paper.pdf", - "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Microsoft Research Asia", + "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; National Key Laboratory of 
Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Microsoft Research Asia", "project": "", "github": "https://github.com/impiga/Plain-DETR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Lin_DETR_Does_Not_ICCV_2023_supplemental.pdf", @@ -11991,14 +12396,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_DETR_Does_Not_Need_Multi-Scale_or_Locality_Design_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Microsoft", + "aff_unique_norm": "Xi'an Jiaotong University;Microsoft Research", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;Research", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "XJTU;MSR Asia", "aff_campus_unique_index": "0;1;1;0;0;1", "aff_campus_unique": "Xi'an;Asia", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Yutong and Yuan,\n Yuhui and Zhang,\n Zheng and Li,\n Chen and Zheng,\n Nanning and Hu,\n Han\n},\n title = {\n DETR Does Not Need Multi-Scale or Locality Design\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6545-6554\n} \n}" }, { "title": "DETRDistill: A Universal Knowledge Distillation Framework for DETR-families", @@ -12030,7 +12436,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;1;0;2;0", - "aff_country_unique": "China;Australia;United Kingdom" + "aff_country_unique": "China;Australia;United Kingdom", + "bibtex": "@InProceedings{Chang_2023_ICCV,\n \n author = {\n Chang,\n Jiahao and Wang,\n Shuo and Xu,\n Hai-Ming and Chen,\n Zehui and Yang,\n Chenhongyi and Zhao,\n Feng\n},\n title = {\n DETRDistill: A Universal Knowledge Distillation Framework for DETR-families\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6898-6908\n} \n}" }, { "title": "DETRs with Collaborative Hybrid Assignments Training", @@ -12062,7 +12469,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zong_2023_ICCV,\n \n author = {\n Zong,\n Zhuofan and Song,\n Guanglu and Liu,\n Yu\n},\n title = {\n DETRs with Collaborative Hybrid Assignments Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6748-6758\n} \n}" }, { "title": "DFA3D: 3D Deformable Attention For 2D-to-3D Feature Lifting", @@ -12085,7 +12493,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DFA3D_3D_Deformable_Attention_For_2D-to-3D_Feature_Lifting_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DFA3D_3D_Deformable_Attention_For_2D-to-3D_Feature_Lifting_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Hongyang and Zhang,\n Hao and Zeng,\n Zhaoyang and Liu,\n Shilong and Li,\n Feng and Ren,\n Tianhe and Zhang,\n Lei\n},\n title = {\n DFA3D: 3D Deformable Attention For 2D-to-3D Feature Lifting\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6684-6693\n} \n}" }, { "title": "DG-Recon: Depth-Guided Neural 3D Scene Reconstruction", @@ -12117,7 +12526,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ju_2023_ICCV,\n \n author = {\n Ju,\n Jihong and Tseng,\n Ching Wei and Bailo,\n Oleksandr and Dikov,\n Georgi and Ghafoorian,\n Mohsen\n},\n title = {\n DG-Recon: Depth-Guided Neural 3D Scene Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18184-18194\n} \n}" }, { "title": "DG3D: Generating High Quality 3D Textured Shapes by Learning to Discriminate Multi-Modal Diffusion-Renderings", @@ -12149,7 +12559,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zuo_2023_ICCV,\n \n author = {\n Zuo,\n Qi and Song,\n Yafei and Li,\n Jianfang and Liu,\n Lin and Bo,\n Liefeng\n},\n title = {\n DG3D: Generating High Quality 3D Textured Shapes by Learning to Discriminate Multi-Modal Diffusion-Renderings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14575-14584\n} \n}" }, { "title": "DIFFGUARD: Semantic Mismatch-Guided Out-of-Distribution Detection Using Pre-Trained Diffusion Models", @@ -12161,7 +12572,7 @@ "author": "Ruiyuan Gao; Chenchen Zhao; Lanqing Hong; Qiang Xu", "abstract": "Given a classifier, the inherent property of semantic Out-of-Distribution (OOD) samples is that their contents differ from all legal classes in terms of semantics, namely semantic 
mismatch. There is a recent work that directly applies it to OOD detection, which employs a conditional Generative Adversarial Network (cGAN) to enlarge semantic mismatch in the image space. While achieving remarkable OOD detection performance on small datasets, it is not applicable to ImageNet-scale datasets due to the difficulty in training cGANs with both input images and labels as conditions.\n As diffusion models are much easier to train and amenable to various conditions compared to cGANs, in this work, we propose to directly use pre-trained diffusion models for semantic mismatch-guided OOD detection, named DiffGuard. Specifically, given an OOD input image and the predicted label from the classifier, we try to enlarge the semantic difference between the reconstructed OOD image under these conditions and the original input image. We also present several test-time techniques to further strengthen such differences. Experimental results show that DiffGuard is effective on both Cifar-10 and hard cases of the large-scale ImageNet, and it can be easily combined with existing OOD detection techniques to achieve state-of-the-art OOD detection results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Gao_DIFFGUARD_Semantic_Mismatch-Guided_Out-of-Distribution_Detection_Using_Pre-Trained_Diffusion_Models_ICCV_2023_paper.pdf", - "aff": "The Chinese University of Hong Kong; The Chinese University of Hong Kong; Huawei Noah\u2019s Ark Lab; The Chinese University of Hong Kong", + "aff": "The Chinese University of Hong Kong; The Chinese University of Hong Kong; Huawei Noah’s Ark Lab; The Chinese University of Hong Kong", "project": "", "github": "https://github.com/cure-lab/DiffGuard", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Gao_DIFFGUARD_Semantic_Mismatch-Guided_ICCV_2023_supplemental.pdf", @@ -12174,14 +12585,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Gao_DIFFGUARD_Semantic_Mismatch-Guided_Out-of-Distribution_Detection_Using_Pre-Trained_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_norm": "The Chinese University of Hong Kong;Huawei", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.huawei.com", "aff_unique_abbr": "CUHK;Huawei", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Ruiyuan and Zhao,\n Chenchen and Hong,\n Lanqing and Xu,\n Qiang\n},\n title = {\n DIFFGUARD: Semantic Mismatch-Guided Out-of-Distribution Detection Using Pre-Trained Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1579-1589\n} \n}" }, { "title": "DIME-FM : DIstilling Multimodal and Efficient Foundation Models", @@ -12206,14 +12618,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_DIME-FM__DIstilling_Multimodal_and_Efficient_Foundation_Models_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;0+1;1", - "aff_unique_norm": "Boston University;Meta", + "aff_unique_norm": "Boston University;Meta Platforms, Inc.", "aff_unique_dep": ";Meta AI", "aff_unique_url": "https://www.bu.edu;https://meta.com", "aff_unique_abbr": "BU;Meta", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Ximeng and Zhang,\n Pengchuan and Zhang,\n Peizhao and Shah,\n 
Hardik and Saenko,\n Kate and Xia,\n Xide\n},\n title = {\n DIME-FM : DIstilling Multimodal and Efficient Foundation Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15521-15533\n} \n}" }, { "title": "DINAR: Diffusion Inpainting of Neural Textures for One-Shot Human Avatars", @@ -12238,14 +12651,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Svitov_DINAR_Diffusion_Inpainting_of_Neural_Textures_for_One-Shot_Human_Avatars_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Samsung;Cinemersive Labs", + "aff_unique_norm": "Samsung AI Center;Cinemersive Labs", "aff_unique_dep": "AI Center;", "aff_unique_url": "https://www.samsung.com/global/careers/ai-center/;", "aff_unique_abbr": "Samsung AI;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea;" + "aff_country_unique": "South Korea;", + "bibtex": "@InProceedings{Svitov_2023_ICCV,\n \n author = {\n Svitov,\n David and Gudkov,\n Dmitrii and Bashirov,\n Renat and Lempitsky,\n Victor\n},\n title = {\n DINAR: Diffusion Inpainting of Neural Textures for One-Shot Human Avatars\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7062-7072\n} \n}" }, { "title": "DIRE for Diffusion-Generated Image Detection", @@ -12270,14 +12684,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_DIRE_for_Diffusion-Generated_Image_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;0+2;0;0;3;0+2", - "aff_unique_norm": "University of Science and Technology of China;Microsoft;Hefei Comprehensive National Science Center;Merchants Union Consumer Finance Company", + "aff_unique_norm": "University of Science and Technology of 
China;Microsoft Research;Hefei Comprehensive National Science Center;Merchants Union Consumer Finance Company", "aff_unique_dep": "EEIS Department;Research;Institute of Artificial Intelligence;", - "aff_unique_url": "http://www.ustc.edu.cn/;https://www.microsoft.com/en-us/research/group/asia;http://www.hfcn.edu.cn;", + "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia;http://www.hfcn.edu.cn;", "aff_unique_abbr": "USTC;MSR Asia;;", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Asia;Hefei", "aff_country_unique_index": "0;0;0+0;0;0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Zhendong and Bao,\n Jianmin and Zhou,\n Wengang and Wang,\n Weilun and Hu,\n Hezhen and Chen,\n Hong and Li,\n Houqiang\n},\n title = {\n DIRE for Diffusion-Generated Image Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22445-22455\n} \n}" }, { "title": "DISeR: Designing Imaging Systems with Reinforcement Learning", @@ -12309,7 +12724,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Klinghoffer_2023_ICCV,\n \n author = {\n Klinghoffer,\n Tzofi and Tiwary,\n Kushagra and Behari,\n Nikhil and Agrawalla,\n Bhavya and Raskar,\n Ramesh\n},\n title = {\n DISeR: Designing Imaging Systems with Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23632-23642\n} \n}" }, { "title": "DLGSANet: Lightweight Dynamic Local and Global Self-Attention Networks for Image Super-Resolution", @@ -12332,7 +12748,8 @@ 
"aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DLGSANet_Lightweight_Dynamic_Local_and_Global_Self-Attention_Networks_for_Image_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DLGSANet_Lightweight_Dynamic_Local_and_Global_Self-Attention_Networks_for_Image_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiang and Dong,\n Jiangxin and Tang,\n Jinhui and Pan,\n Jinshan\n},\n title = {\n DLGSANet: Lightweight Dynamic Local and Global Self-Attention Networks for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12792-12801\n} \n}" }, { "title": "DLT: Conditioned layout generation with Joint Discrete-Continuous Diffusion Layout Transformer", @@ -12355,7 +12772,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Levi_DLT_Conditioned_layout_generation_with_Joint_Discrete-Continuous_Diffusion_Layout_Transformer_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Levi_DLT_Conditioned_layout_generation_with_Joint_Discrete-Continuous_Diffusion_Layout_Transformer_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Levi_2023_ICCV,\n \n author = {\n Levi,\n Elad and Brosh,\n Eli and Mykhailych,\n Mykola and Perez,\n Meir\n},\n title = {\n DLT: Conditioned layout generation with Joint Discrete-Continuous Diffusion Layout Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2106-2115\n} \n}" }, { "title": "DMNet: Delaunay Meshing Network for 3D Shape Representation", @@ -12387,7 +12805,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0+1;0", - "aff_country_unique": "China;Estonia" + "aff_country_unique": "China;Estonia", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Chen and Yuan,\n Ganzhangqin and Tao,\n Wenbing\n},\n title = {\n DMNet: Delaunay Meshing Network for 3D Shape Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14418-14428\n} \n}" }, { "title": "DNA-Rendering: A Diverse Neural Actor Repository for High-Fidelity Human-Centric Rendering", @@ -12412,14 +12831,15 @@ "author_num": 19, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cheng_DNA-Rendering_A_Diverse_Neural_Actor_Repository_for_High-Fidelity_Human-Centric_Rendering_ICCV_2023_paper.html", "aff_unique_index": "0;1;0+1;1;0;2;3;1;0;1;2;0+1;2;2;0;0;0+3;0;0+3", - "aff_unique_norm": "Shanghai AI Laboratory;SenseTime;Nanyang Technological University;Chinese University of Hong Kong", + "aff_unique_norm": "Shanghai AI Laboratory;SenseTime;Nanyang Technological University;The Chinese University of Hong Kong", "aff_unique_dep": ";SenseTime Research;S-Lab;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.sensetime.com;https://www.ntu.edu.sg;https://www.cuhk.edu.hk", "aff_unique_abbr": "SAIL;SenseTime;NTU;CUHK", "aff_campus_unique_index": ";1;;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0;0;1;0;0;0;0;1;0+0;1;1;0;0;0+0;0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Wei and Chen,\n Ruixiang and Fan,\n Siming and Yin,\n Wanqi and Chen,\n Keyu and Cai,\n Zhongang and Wang,\n Jingbo and Gao,\n Yang and Yu,\n Zhengming and Lin,\n Zhengyu and Ren,\n Daxuan and Yang,\n Lei and Liu,\n Ziwei and Loy,\n Chen Change and Qian,\n Chen and Wu,\n Wayne and Lin,\n Dahua and Dai,\n Bo and Lin,\n Kwan-Yee\n},\n title = 
{\n DNA-Rendering: A Diverse Neural Actor Repository for High-Fidelity Human-Centric Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19982-19993\n} \n}" }, { "title": "DOLCE: A Model-Based Probabilistic Diffusion Framework for Limited-Angle CT Reconstruction", @@ -12451,7 +12871,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "St. Louis;", "aff_country_unique_index": "0;0;0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jiaming and Anirudh,\n Rushil and Thiagarajan,\n Jayaraman J. and He,\n Stewart and Mohan,\n K Aditya and Kamilov,\n Ulugbek S. and Kim,\n Hyojin\n},\n title = {\n DOLCE: A Model-Based Probabilistic Diffusion Framework for Limited-Angle CT Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10498-10508\n} \n}" }, { "title": "DOT: A Distillation-Oriented Trainer", @@ -12476,14 +12897,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_DOT_A_Distillation-Oriented_Trainer_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "Megvii Technology;Waseda University", + "aff_unique_norm": "MEGVII Technology;Waseda University", "aff_unique_dep": ";", "aff_unique_url": "https://www.megvii.com;https://www.waseda.jp/top", "aff_unique_abbr": ";Waseda", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Borui and Cui,\n Quan and Song,\n Renjie and Liang,\n Jiajun\n},\n title = {\n DOT: A Distillation-Oriented Trainer\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6189-6198\n} \n}" }, { "title": "DPF-Net: Combining Explicit Shape Priors in Deformable Primitive Field for Unsupervised Structural Reconstruction of 3D Objects", @@ -12515,7 +12937,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shuai_2023_ICCV,\n \n author = {\n Shuai,\n Qingyao and Zhang,\n Chi and Yang,\n Kaizhi and Chen,\n Xuejin\n},\n title = {\n DPF-Net: Combining Explicit Shape Priors in Deformable Primitive Field for Unsupervised Structural Reconstruction of 3D Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14321-14329\n} \n}" }, { "title": "DPM-OT: A New Diffusion Probabilistic Model Based on Optimal Transport", @@ -12547,7 +12970,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Beijing;Stony Brook", "aff_country_unique_index": "0+0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zezeng and Li,\n Shenghao and Wang,\n Zhanpeng and Lei,\n Na and Luo,\n Zhongxuan and Gu,\n David Xianfeng\n},\n title = {\n DPM-OT: A New Diffusion Probabilistic Model Based on Optimal Transport\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22624-22633\n} \n}" }, { "title": "DPS-Net: Deep Polarimetric Stereo Depth Estimation", @@ -12555,6 +12979,7 @@ "status": "Poster", "track": "main", "pid": "9109", + "author_site": "Chaoran Tian, Weihong Pan, Zimo Wang, Mao Mao, Guofeng Zhang, Hujun Bao, Ping Tan, Zhaopeng Cui", 
"author": "Chaoran Tian, Weihong Pan, Zimo Wang, Mao Mao, Guofeng Zhang, Hujun Bao, Ping Tan, Zhaopeng Cui", "abstract": "Stereo depth estimation usually struggles to deal with textureless scenes for both traditional and learning-based methods due to the inherent dependence on image correspondence matching. In this paper, we propose a novel neural network, i.e., DPS-Net, to exploit both the prior geometric knowledge and polarimetric information for depth estimation with two polarimetric stereo images. Specifically, we construct both RGB and polarization correlation volumes to fully leverage the multi-domain similarity between polarimetric stereo images. Since inherent ambiguities exist in the polarization images, we introduce the iso-depth cost explicitly into the network to solve these ambiguities. Moreover, we design a cascaded dual-GRU architecture to recurrently update the disparity and effectively fuse both the multi-domain correlation features and the iso-depth cost. Besides, we present new synthetic and real polarimetric stereo datasets for evaluation. 
Experimental results demonstrate that our method outperforms the state-of-the-art stereo depth estimation methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tian_DPS-Net_Deep_Polarimetric_Stereo_Depth_Estimation_ICCV_2023_paper.pdf", @@ -12566,7 +12991,8 @@ "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17563179505999671632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tian_DPS-Net_Deep_Polarimetric_Stereo_Depth_Estimation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tian_DPS-Net_Deep_Polarimetric_Stereo_Depth_Estimation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Tian_2023_ICCV,\n \n author = {\n Tian,\n Chaoran and Pan,\n Weihong and Wang,\n Zimo and Mao,\n Mao and Zhang,\n Guofeng and Bao,\n Hujun and Tan,\n Ping and Cui,\n Zhaopeng\n},\n title = {\n DPS-Net: Deep Polarimetric Stereo Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3569-3579\n} \n}" }, { "title": "DQS3D: Densely-matched Quantization-aware Semi-supervised 3D Detection", @@ -12598,7 +13024,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Huan-ang and Tian,\n Beiwen and Li,\n Pengfei and Zhao,\n Hao and Zhou,\n Guyue\n},\n title = {\n DQS3D: Densely-matched Quantization-aware Semi-supervised 3D Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21905-21915\n} \n}" }, { "title": "DR-Tune: Improving Fine-tuning of Pretrained Visual Models by Distribution Regularization with Semantic 
Calibration", @@ -12630,7 +13057,8 @@ "aff_campus_unique_index": "0+0;0;0+0+1", "aff_campus_unique": "Beijing;Hangzhou", "aff_country_unique_index": "0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Nan and Chen,\n Jiaxin and Huang,\n Di\n},\n title = {\n DR-Tune: Improving Fine-tuning of Pretrained Visual Models by Distribution Regularization with Semantic Calibration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1547-1556\n} \n}" }, { "title": "DRAW: Defending Camera-shooted RAW Against Image Manipulation", @@ -12662,7 +13090,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Xiaoxiao and Ying,\n Qichao and Qian,\n Zhenxing and Li,\n Sheng and Zhang,\n Xinpeng\n},\n title = {\n DRAW: Defending Camera-shooted RAW Against Image Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22434-22444\n} \n}" }, { "title": "DREAM: Efficient Dataset Distillation by Representative Matching", @@ -12694,7 +13123,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yanqing and Gu,\n Jianyang and Wang,\n Kai and Zhu,\n Zheng and Jiang,\n Wei and You,\n Yang\n},\n title = {\n DREAM: Efficient Dataset Distillation by Representative Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17314-17324\n} \n}" }, { "title": "DREAMWALKER: Mental Planning for Continuous Vision-Language Navigation", @@ -12726,7 +13156,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Zurich;Jiaxing", "aff_country_unique_index": "0;1;0;0+0", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Hanqing and Liang,\n Wei and Van Gool,\n Luc and Wang,\n Wenguan\n},\n title = {\n DREAMWALKER: Mental Planning for Continuous Vision-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10873-10883\n} \n}" }, { "title": "DReg-NeRF: Deep Registration for Neural Radiance Fields", @@ -12758,7 +13189,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yu and Lee,\n Gim Hee\n},\n title = {\n DReg-NeRF: Deep Registration for Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22703-22713\n} \n}" }, { "title": "DS-Fusion: Artistic Typography via Discriminated and Stylized Diffusion", @@ -12790,7 +13222,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Tanveer_2023_ICCV,\n \n author = {\n Tanveer,\n Maham and Wang,\n Yizhi and Mahdavi-Amiri,\n Ali and Zhang,\n Hao\n},\n title = {\n DS-Fusion: Artistic Typography via Discriminated and Stylized Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 374-384\n} \n}" }, { "title": "DVGaze: Dual-View Gaze Estimation", @@ -12822,7 +13255,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Yihua and Lu,\n Feng\n},\n title = {\n DVGaze: Dual-View Gaze Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20632-20641\n} \n}" }, { "title": "DVIS: Decoupled Video Instance Segmentation Framework", @@ -12854,7 +13288,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Tao and Tian,\n Xingye and Wu,\n Yu and Ji,\n Shunping and Wang,\n Xuebo and Zhang,\n Yuan and Wan,\n Pengfei\n},\n title = {\n DVIS: Decoupled Video Instance Segmentation Framework\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1282-1291\n} \n}" }, { "title": "Dancing in the Dark: A Benchmark towards General Low-light Video Enhancement", @@ -12886,7 +13321,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Huiyuan and Zheng,\n Wenkai and Wang,\n Xicong and Wang,\n Jiaxuan and Zhang,\n Heng and Ma,\n Huadong\n},\n title = {\n Dancing in the Dark: A Benchmark towards General Low-light Video Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12877-12886\n} \n}" }, { "title": "DandelionNet: Domain Composition with Instance Adaptive Classification for Domain Generalization", @@ -12911,14 +13347,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hu_DandelionNet_Domain_Composition_with_Instance_Adaptive_Classification_for_Domain_Generalization_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1+2;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Computing Technology;;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.ict.ac.cn;http://www.ucas.ac.cn;", "aff_unique_abbr": "CAS;UCAS;", "aff_campus_unique_index": "0+0;0+0;0+0+1;0+0", "aff_campus_unique": "Beijing;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Lanqing and Kan,\n Meina and Shan,\n Shiguang and Chen,\n Xilin\n},\n title = {\n DandelionNet: Domain Composition with Instance Adaptive Classification for Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19050-19059\n} \n}" }, { "title": "DarSwin: Distortion Aware Radial Swin Transformer", @@ -12926,11 +13363,11 @@ "status": "Poster", "track": "main", "pid": "6488", - "author_site": "Akshaya Athwale, Arman Afrasiyabi, Justin Lag\u00fce, Ichrak Shili, Ola Ahmad, Jean-Fran\u00e7ois Lalonde", - "author": "Akshaya Athwale; Arman Afrasiyabi; Justin Lag\u00fce; Ichrak Shili; Ola Ahmad; Jean-Fran\u00e7ois 
Lalonde", + "author_site": "Akshaya Athwale, Arman Afrasiyabi, Justin Lagüe, Ichrak Shili, Ola Ahmad, Jean-François Lalonde", + "author": "Akshaya Athwale; Arman Afrasiyabi; Justin Lagüe; Ichrak Shili; Ola Ahmad; Jean-François Lalonde", "abstract": "Wide-angle lenses are commonly used in perception tasks requiring a large field of view. Unfortunately, these lenses produce significant distortions making conventional models that ignore the distortion effects unable to adapt to wide-angle images. In this paper, we present a novel transformer-based model that automatically adapts to the distortion produced by wide-angle lenses. We leverage the physical characteristics of such lenses, which are analytically defined by the radial distortion profile (assumed to be known), to develop a distortion aware radial swin transformer (DarSwin). In contrast to conventional transformer-based architectures, DarSwin comprises a radial patch partitioning, a distortion-based sampling technique for creating token embeddings, and an angular position encoding for radial patch merging. We validate our method on classification tasks using synthetically distorted ImageNet data and show through extensive experiments that DarSwin can perform zero-shot adaptation to unseen distortions of different wide-angle lenses. Compared to other baselines, DarSwin achieves the best results (in terms of Top-1 accuracy) with significant gains when trained on bounded levels of distortions (very-low, low, medium, and high) and tested on all including out-of-distribution distortions. 
The code and models are publicly available at https://lvsn.github.io/darswin/", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Athwale_DarSwin_Distortion_Aware_Radial_Swin_Transformer_ICCV_2023_paper.pdf", - "aff": "Universit\u00e9 Laval; Yale University; Universit\u00e9 Laval; Universit\u00e9 Laval; Universit\u00e9 Laval+Thales Digital Solutions; Universit\u00e9 Laval", + "aff": "Université Laval; Yale University; Université Laval; Université Laval; Université Laval+Thales Digital Solutions; Université Laval", "project": "", "github": "https://lvsn.github.io/darswin/", "supp": "", @@ -12943,14 +13380,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Athwale_DarSwin_Distortion_Aware_Radial_Swin_Transformer_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0+2;0", - "aff_unique_norm": "Universit\u00e9 Laval;Yale University;Thales", + "aff_unique_norm": "Université Laval;Yale University;Thales", "aff_unique_dep": ";;Digital Solutions", "aff_unique_url": "https://www.ulaval.ca;https://www.yale.edu;https://www.thalesgroup.com", "aff_unique_abbr": "ULaval;Yale;Thales", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0+2;0", - "aff_country_unique": "Canada;United States;France" + "aff_country_unique": "Canada;United States;France", + "bibtex": "@InProceedings{Athwale_2023_ICCV,\n \n author = {\n Athwale,\n Akshaya and Afrasiyabi,\n Arman and Lag\\"ue,\n Justin and Shili,\n Ichrak and Ahmad,\n Ola and Lalonde,\n Jean-Fran\\c{c\n}ois\n},\n title = {\n DarSwin: Distortion Aware Radial Swin Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5929-5938\n} \n}" }, { "title": "Dark Side Augmentation: Generating Diverse Night Examples for Metric Learning", @@ -12959,7 +13397,7 @@ "track": "main", "pid": "12539", "author_site": "Albert Mohwald, Tomas Jenicek, 
Ond?ej Chum", - "author": "Albert Mohwald; Tomas Jenicek; Ond\u0159ej Chum", + "author": "Albert Mohwald; Tomas Jenicek; Ondřej Chum", "abstract": "Image retrieval methods based on CNN descriptors rely on metric learning from a large number of diverse examples of positive and negative image pairs. Domains, such as night-time images, with limited availability and variability of training data suffer from poor retrieval performance even with methods performing well on standard benchmarks. We propose to train a GAN-based synthetic-image generator, translating available day-time image examples into night images. Such a generator is used in metric learning as a form of augmentation, supplying training data to the scarce domain. Various types of generators are evaluated and analyzed. We contribute with a novel light-weight GAN architecture that enforces the consistency between the original and translated image through edge consistency. The proposed architecture also allows a simultaneous training of an edge detector that operates on both night and day images. To further increase the variability in the training examples and to maximize the generalization of the trained model, we propose a novel method of diverse anchor mining. \n \n The proposed method improves over the state-of-the-art results on a standard Tokyo 24/7 day-night retrieval benchmark while preserving the performance on Oxford and Paris datasets. This is achieved without the need of training image pairs of matching day and night images. 
The source code is available at https://github.com/mohwald/gandtr .", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Mohwald_Dark_Side_Augmentation_Generating_Diverse_Night_Examples_for_Metric_Learning_ICCV_2023_paper.pdf", "aff": "VRG, Faculty of Electrical Engineering, Czech Technical University in Prague; VRG, Faculty of Electrical Engineering, Czech Technical University in Prague; VRG, Faculty of Electrical Engineering, Czech Technical University in Prague", @@ -12982,7 +13420,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Mohwald_2023_ICCV,\n \n author = {\n Mohwald,\n Albert and Jenicek,\n Tomas and Chum,\n Ond\\v{r\n}ej\n},\n title = {\n Dark Side Augmentation: Generating Diverse Night Examples for Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11153-11163\n} \n}" }, { "title": "Data Augmented Flatness-aware Gradient Projection for Continual Learning", @@ -13007,14 +13446,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Data_Augmented_Flatness-aware_Gradient_Projection_for_Continual_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0;0", - "aff_unique_norm": "Northeastern University;JD;University of Maryland;University of Texas at Austin", - "aff_unique_dep": ";JD Explore Academy;;", + "aff_unique_norm": "Northeastern University;JD Explore Academy;University of Maryland;The University of Texas at Austin", + "aff_unique_dep": ";;;", "aff_unique_url": "http://www.neu.edu.cn/;;https://www/umd.edu;https://www.utexas.edu", "aff_unique_abbr": "NEU;;UMD;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;1;1;0;0", - "aff_country_unique": 
"China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Enneng and Shen,\n Li and Wang,\n Zhenyi and Liu,\n Shiwei and Guo,\n Guibing and Wang,\n Xingwei\n},\n title = {\n Data Augmented Flatness-aware Gradient Projection for Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5630-5639\n} \n}" }, { "title": "Data-Free Class-Incremental Hand Gesture Recognition", @@ -13039,14 +13479,15 @@ "author_num": 12, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Aich_Data-Free_Class-Incremental_Hand_Gesture_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0;1;2;2;2;2;0", - "aff_unique_norm": "Carnegie Mellon University;Indian Institute of Technology Hyderabad;Meta", - "aff_unique_dep": ";;Meta Reality Labs", + "aff_unique_norm": "Carnegie Mellon University;Indian Institute of Technology Hyderabad;Meta Reality Labs", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.iith.ac.in;https://www.meta.com", "aff_unique_abbr": "CMU;IIT Hyderabad;MRL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hyderabad", "aff_country_unique_index": "0;0;0;0;1;0;1;0;0;0;0;0", - "aff_country_unique": "United States;India" + "aff_country_unique": "United States;India", + "bibtex": "@InProceedings{Aich_2023_ICCV,\n \n author = {\n Aich,\n Shubhra and Ruiz-Santaquiteria,\n Jesus and Lu,\n Zhenyu and Garg,\n Prachi and Joseph,\n K J and Garcia,\n Alvaro Fernandez and Balasubramanian,\n Vineeth N and Kin,\n Kenrick and Wan,\n Chengde and Camgoz,\n Necati Cihan and Ma,\n Shugao and De la Torre,\n Fernando\n},\n title = {\n Data-Free Class-Incremental Hand Gesture Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n 
pages = {\n 20958-20967\n} \n}" }, { "title": "Data-free Knowledge Distillation for Fine-grained Visual Categorization", @@ -13074,11 +13515,12 @@ "aff_unique_norm": "East China Normal University;Shandong University", "aff_unique_dep": "School of Computer Science and Technology;School of Computer Science and Technology", "aff_unique_url": "http://www.ecnu.edu.cn;http://www.sdu.edu.cn", - "aff_unique_abbr": "ECNU;", + "aff_unique_abbr": "ECNU;SDU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Renrong and Zhang,\n Wei and Yin,\n Jianhua and Wang,\n Jun\n},\n title = {\n Data-free Knowledge Distillation for Fine-grained Visual Categorization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1515-1525\n} \n}" }, { "title": "DataDAM: Efficient Dataset Distillation with Attention Matching", @@ -13110,7 +13552,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Sajedi_2023_ICCV,\n \n author = {\n Sajedi,\n Ahmad and Khaki,\n Samir and Amjadian,\n Ehsan and Liu,\n Lucy Z. and Lawryshyn,\n Yuri A. 
and Plataniotis,\n Konstantinos N.\n},\n title = {\n DataDAM: Efficient Dataset Distillation with Attention Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17097-17107\n} \n}" }, { "title": "Dataset Quantization", @@ -13142,7 +13585,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;1;1;1;0+1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Daquan and Wang,\n Kai and Gu,\n Jianyang and Peng,\n Xiangyu and Lian,\n Dongze and Zhang,\n Yifan and You,\n Yang and Feng,\n Jiashi\n},\n title = {\n Dataset Quantization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17205-17216\n} \n}" }, { "title": "DeFormer: Integrating Transformers with Deformable Models for 3D Shape Abstraction from a Single Image", @@ -13174,7 +13618,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Di and Yu,\n Xiang and Ye,\n Meng and Zhangli,\n Qilong and Li,\n Zhuowei and Zhang,\n Zhixing and Metaxas,\n Dimitris N.\n},\n title = {\n DeFormer: Integrating Transformers with Deformable Models for 3D Shape Abstraction from a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14236-14246\n} \n}" }, { "title": "DeLiRa: Self-Supervised Depth, Light, and Radiance Fields", @@ -13197,7 +13642,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Guizilini_DeLiRa_Self-Supervised_Depth_Light_and_Radiance_Fields_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guizilini_DeLiRa_Self-Supervised_Depth_Light_and_Radiance_Fields_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Guizilini_2023_ICCV,\n \n author = {\n Guizilini,\n Vitor and Vasiljevic,\n Igor and Fang,\n Jiading and Ambrus,\n Rares and Zakharov,\n Sergey and Sitzmann,\n Vincent and Gaidon,\n Adrien\n},\n title = {\n DeLiRa: Self-Supervised Depth,\n Light,\n and Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17935-17945\n} \n}" }, { "title": "Dec-Adapter: Exploring Efficient Decoder-Side Adapter for Bridging Screen Content and Natural Image Compression", @@ -13229,7 +13675,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shen_2023_ICCV,\n \n author = {\n Shen,\n Sheng and Yue,\n Huanjing and Yang,\n Jingyu\n},\n title = {\n Dec-Adapter: Exploring Efficient Decoder-Side Adapter for Bridging Screen Content and Natural Image Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12887-12896\n} \n}" }, { "title": "Decomposition-Based Variational Network for Multi-Contrast MRI Super-Resolution and Reconstruction", @@ -13261,7 +13708,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+1;0;0", - "aff_country_unique": "China;Unknown" + "aff_country_unique": "China;Unknown", + "bibtex": "@InProceedings{Lei_2023_ICCV,\n \n author = {\n Lei,\n Pengcheng and Fang,\n Faming and Zhang,\n Guixu and Zeng,\n Tieyong\n},\n title = 
{\n Decomposition-Based Variational Network for Multi-Contrast MRI Super-Resolution and Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21296-21306\n} \n}" }, { "title": "Decouple Before Interact: Multi-Modal Prompt Learning for Continual Visual Question Answering", @@ -13289,11 +13737,12 @@ "aff_unique_norm": "Tsinghua University;Alibaba Group", "aff_unique_dep": "Department of Computer Science and Technology;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.alibaba.com", - "aff_unique_abbr": "THU;Alibaba", + "aff_unique_abbr": "Tsinghua;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qian_2023_ICCV,\n \n author = {\n Qian,\n Zi and Wang,\n Xin and Duan,\n Xuguang and Qin,\n Pengda and Li,\n Yuhong and Zhu,\n Wenwu\n},\n title = {\n Decouple Before Interact: Multi-Modal Prompt Learning for Continual Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2953-2962\n} \n}" }, { "title": "Decoupled DETR: Spatially Disentangling Localization and Classification for Improved End-to-End Object Detection", @@ -13318,14 +13767,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Decoupled_DETR_Spatially_Disentangling_Localization_and_Classification_for_Improved_End-to-End_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;3;3;0+3+1+2", - "aff_unique_norm": "Chinese University of Hong Kong;Centre for Perceptual and Interactive Intelligence;Shanghai AI Laboratory;SenseTime", + "aff_unique_norm": "The Chinese University of Hong Kong;Centre for Perceptual and Interactive Intelligence;Shanghai AI 
Laboratory;SenseTime", "aff_unique_dep": "Multimedia Laboratory;;;SenseTime Research", "aff_unique_url": "https://www.cuhk.edu.hk;;https://www.shanghai-ai-lab.com;https://www.sensetime.com", "aff_unique_abbr": "CUHK;;SAIL;SenseTime", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+1+0;0;0;0+0+1+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Manyuan and Song,\n Guanglu and Liu,\n Yu and Li,\n Hongsheng\n},\n title = {\n Decoupled DETR: Spatially Disentangling Localization and Classification for Improved End-to-End Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6601-6610\n} \n}" }, { "title": "Decoupled Iterative Refinement Framework for Interacting Hands Reconstruction from a Single RGB Image", @@ -13352,12 +13802,13 @@ "aff_unique_index": "0+1;1;0+1;1;0;0;0;0", "aff_unique_norm": "Beijing University of Posts and Telecommunications;ByteDance", "aff_unique_dep": "State Key Laboratory of Networking and Switching Technology;PICO IDL", - "aff_unique_url": "http://www.bupt.edu.cn;https://www.bytedance.com", + "aff_unique_url": "http://www.bupt.edu.cn/;https://www.bytedance.com", "aff_unique_abbr": "BUPT;ByteDance", - "aff_campus_unique_index": "0+0;0;0+0;0;0;0;0;0", - "aff_campus_unique": "Beijing", + "aff_campus_unique_index": "1;1;1;1", + "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ren_2023_ICCV,\n \n author = {\n Ren,\n Pengfei and Wen,\n Chao and Zheng,\n Xiaozheng and Xue,\n Zhou and Sun,\n Haifeng and Qi,\n Qi and Wang,\n Jingyu and Liao,\n Jianxin\n},\n title = {\n Decoupled Iterative Refinement Framework for Interacting 
Hands Reconstruction from a Single RGB Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8014-8025\n} \n}" }, { "title": "DeePoint: Visual Pointing Recognition and Direction Estimation", @@ -13380,7 +13831,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nakamura_DeePoint_Visual_Pointing_Recognition_and_Direction_Estimation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nakamura_DeePoint_Visual_Pointing_Recognition_and_Direction_Estimation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Nakamura_2023_ICCV,\n \n author = {\n Nakamura,\n Shu and Kawanishi,\n Yasutomo and Nobuhara,\n Shohei and Nishino,\n Ko\n},\n title = {\n DeePoint: Visual Pointing Recognition and Direction Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20577-20587\n} \n}" }, { "title": "Deep Active Contours for Real-time 6-DoF Object Tracking", @@ -13412,7 +13864,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Long and Yan,\n Shen and Zhen,\n Jianan and Liu,\n Yu and Zhang,\n Maojun and Zhang,\n Guofeng and Zhou,\n Xiaowei\n},\n title = {\n Deep Active Contours for Real-time 6-DoF Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14034-14044\n} \n}" }, { "title": "Deep Directly-Trained Spiking Neural Networks for Object Detection", @@ -13424,7 +13877,7 @@ "author": "Qiaoyi Su; Yuhong Chou; Yifan Hu; Jianing Li; Shijie 
Mei; Ziyang Zhang; Guoqi Li", "abstract": "Spiking neural networks (SNNs) are brain-inspired energy-efficient models that encode information in spatiotemporal dynamics. Recently, deep SNNs trained directly have shown great success in achieving high performance on classification tasks with very few time steps. However, how to design a directly-trained SNN for the regression task of object detection still remains a challenging problem. To address this problem, we propose EMS-YOLO, a novel directly-trained SNN framework for object detection, which is the first trial to train a deep SNN with surrogate gradients for object detection rather than ANN-SNN conversion strategies. Specifically, we design a full-spike residual block, EMS-ResNet, which can effectively extend the depth of the directly-trained SNN with low power consumption. Furthermore, we theoretically analyze and prove the EMS-ResNet could avoid gradient vanishing or exploding. The results demonstrate that our approach outperforms the state-of-the-art ANN-SNN conversion methods (at least 500 time steps) in extremely fewer time steps (only 4 time steps). It is shown that our model could achieve comparable performance to the ANN with the same architecture while consuming 5.83x less energy on the frame-based COCO Dataset and the event-based Gen1 Dataset. Our code is available in https://github.com/BICLab/EMS-YOLO.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Su_Deep_Directly-Trained_Spiking_Neural_Networks_for_Object_Detection_ICCV_2023_paper.pdf", - "aff": "School of Artificial Intelligence, University of Chinese Academy of Sciences; College of Artificial Intelligence, Xi\u2019an Jiaotong University; Department of Precision Instrument, Tsinghua University; School of Computer Science, Peking University; School of Vehicle and Mobility, Tsinghua University + Institute of Automation, Chinese Academy of Sciences; Advanced Computing and Storage Lab, Huawei Technologies Co. Ltd. 
+ Institute of Automation, Chinese Academy of Sciences; School of Artificial Intelligence, University of Chinese Academy of Sciences + Institute of Automation, Chinese Academy of Sciences", + "aff": "School of Artificial Intelligence, University of Chinese Academy of Sciences; College of Artificial Intelligence, Xi’an Jiaotong University; Department of Precision Instrument, Tsinghua University; School of Computer Science, Peking University; School of Vehicle and Mobility, Tsinghua University + Institute of Automation, Chinese Academy of Sciences; Advanced Computing and Storage Lab, Huawei Technologies Co. Ltd. + Institute of Automation, Chinese Academy of Sciences; School of Artificial Intelligence, University of Chinese Academy of Sciences + Institute of Automation, Chinese Academy of Sciences", "project": "", "github": "https://github.com/BICLab/EMS-YOLO", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Su_Deep_Directly-Trained_Spiking_ICCV_2023_supplemental.pdf", @@ -13437,14 +13890,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Su_Deep_Directly-Trained_Spiking_Neural_Networks_for_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;2+4;5+4;0+4", - "aff_unique_norm": "University of Chinese Academy of Sciences;Xi'an Jiao Tong University;Tsinghua University;Peking University;Chinese Academy of Sciences;Huawei", + "aff_unique_norm": "University of Chinese Academy of Sciences;Xi'an Jiaotong University;Tsinghua University;Peking University;Chinese Academy of Sciences;Huawei Technologies Co. 
Ltd.", "aff_unique_dep": "School of Artificial Intelligence;College of Artificial Intelligence;Department of Precision Instrument;School of Computer Science;Institute of Automation;Advanced Computing and Storage Lab", "aff_unique_url": "http://www.ucas.ac.cn;http://www.xjtu.edu.cn;https://www.tsinghua.edu.cn;http://www.pku.edu.cn;http://www.ia.cas.cn;https://www.huawei.com", "aff_unique_abbr": "UCAS;XJTU;THU;PKU;CAS;Huawei", "aff_campus_unique_index": "1;2;;;", "aff_campus_unique": ";Xi'an;Beijing", "aff_country_unique_index": "0;0;0;0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Su_2023_ICCV,\n \n author = {\n Su,\n Qiaoyi and Chou,\n Yuhong and Hu,\n Yifan and Li,\n Jianing and Mei,\n Shijie and Zhang,\n Ziyang and Li,\n Guoqi\n},\n title = {\n Deep Directly-Trained Spiking Neural Networks for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6555-6565\n} \n}" }, { "title": "Deep Equilibrium Object Detection", @@ -13467,7 +13921,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Deep_Equilibrium_Object_Detection_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Deep_Equilibrium_Object_Detection_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Shuai and Teng,\n Yao and Wang,\n Limin\n},\n title = {\n Deep Equilibrium Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6296-6306\n} \n}" }, { "title": "Deep Feature Deblurring Diffusion for Detecting Out-of-Distribution Objects", @@ -13479,7 +13934,7 @@ "author": "Aming Wu; Da Chen; Cheng Deng", "abstract": "To promote the safe application of 
detectors, a task of unsupervised out-of-distribution object detection (OOD-OD) is recently proposed, whose goal is to detect unseen OOD objects without accessing any auxiliary OOD data. For this task, the challenge mainly lies in how to only leverage the known in-distribution (ID) data to detect OOD objects accurately without affecting the detection of ID objects, which can be framed as the diffusion problem for deep feature synthesis. Accordingly, such challenge could be addressed by the forward and reverse processes in the diffusion model. In this paper, we propose a new approach of Deep Feature Deblurring Diffusion (DFDD), consisting of forward blurring and reverse deblurring processes. Specifically, the forward process gradually performs Gaussian Blur on the extracted features, which is instrumental in retaining sufficient input-relevant information. By this way, the forward process could synthesize virtual OOD features that are close to the classification boundary between ID and OOD objects, which improves the performance of detecting OOD objects. During the reverse process, based on the blurred features, a dedicated deblurring model is designed to continually recover the lost details in the forward process. Both the deblurred features and original features are taken as the input for training, strengthening the discrimination ability. In the experiments, our method is evaluated on OOD-OD, open-set object detection, and incremental object detection. The significant performance gains over baselines demonstrate the superiorities of our method. 
The source code will be made available at: https://github.com/AmingWu/DFDD-OOD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wu_Deep_Feature_Deblurring_Diffusion_for_Detecting_Out-of-Distribution_Objects_ICCV_2023_paper.pdf", - "aff": "School of Electronic Engineering, Xidian University, Xi\u2019an, China; University of Bath; School of Electronic Engineering, Xidian University, Xi\u2019an, China", + "aff": "School of Electronic Engineering, Xidian University, Xi’an, China; University of Bath; School of Electronic Engineering, Xidian University, Xi’an, China", "project": "", "github": "https://github.com/AmingWu/DFDD-OOD", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Wu_Deep_Feature_Deblurring_ICCV_2023_supplemental.pdf", @@ -13499,7 +13954,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Aming and Chen,\n Da and Deng,\n Cheng\n},\n title = {\n Deep Feature Deblurring Diffusion for Detecting Out-of-Distribution Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13381-13391\n} \n}" }, { "title": "Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints Voting for Robust 6D Object Pose Estimation", @@ -13524,14 +13980,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_Deep_Fusion_Transformer_Network_with_Weighted_Vector-Wise_Keypoints_Voting_for_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;0", - "aff_unique_norm": "Hong Kong Polytechnic University;Chinese University of Hong Kong;University of Waterloo", + "aff_unique_norm": "The Hong Kong Polytechnic University;The Chinese University of Hong Kong;University of Waterloo", 
"aff_unique_dep": ";;", "aff_unique_url": "https://www.polyu.edu.hk;https://www.cuhk.edu.hk;https://uwaterloo.ca", "aff_unique_abbr": "PolyU;CUHK;UW", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Jun and Chen,\n Kai and Xu,\n Linlin and Dou,\n Qi and Qin,\n Jing\n},\n title = {\n Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints Voting for Robust 6D Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13967-13977\n} \n}" }, { "title": "Deep Geometrized Cartoon Line Inbetweening", @@ -13563,7 +14020,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0;0;0", - "aff_country_unique": "Singapore;;China" + "aff_country_unique": "Singapore;;China", + "bibtex": "@InProceedings{Siyao_2023_ICCV,\n \n author = {\n Siyao,\n Li and Gu,\n Tianpei and Xiao,\n Weiye and Ding,\n Henghui and Liu,\n Ziwei and Loy,\n Chen Change\n},\n title = {\n Deep Geometrized Cartoon Line Inbetweening\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7291-7300\n} \n}" }, { "title": "Deep Geometry-Aware Camera Self-Calibration from Video", @@ -13595,7 +14053,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Hagemann_2023_ICCV,\n \n author = {\n Hagemann,\n Annika and Knorr,\n Moritz and Stiller,\n Christoph\n},\n title = {\n Deep Geometry-Aware Camera Self-Calibration from Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3438-3448\n} \n}" }, { "title": "Deep Homography Mixture for Single Image Rolling Shutter Correction", @@ -13627,7 +14086,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Weilong and Tan,\n Robby T. and Zeng,\n Bing and Liu,\n Shuaicheng\n},\n title = {\n Deep Homography Mixture for Single Image Rolling Shutter Correction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9868-9877\n} \n}" }, { "title": "Deep Image Harmonization with Globally Guided Feature Transformation and Relation Distillation", @@ -13659,7 +14119,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Niu_2023_ICCV,\n \n author = {\n Niu,\n Li and Tan,\n Linfeng and Tao,\n Xinhao and Cao,\n Junyan and Guo,\n Fengjun and Long,\n Teng and Zhang,\n Liqing\n},\n title = {\n Deep Image Harmonization with Globally Guided Feature Transformation and Relation Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7723-7732\n} \n}" }, { "title": "Deep Image Harmonization with Learnable Augmentation", @@ -13691,7 +14152,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Niu_2023_ICCV,\n \n author = {\n Niu,\n Li and Cao,\n Junyan and Cong,\n 
Wenyan and Zhang,\n Liqing\n},\n title = {\n Deep Image Harmonization with Learnable Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7482-7491\n} \n}" }, { "title": "Deep Incubation: Training Large Models by Divide-and-Conquering", @@ -13723,7 +14185,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0;0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ni_2023_ICCV,\n \n author = {\n Ni,\n Zanlin and Wang,\n Yulin and Yu,\n Jiangwei and Jiang,\n Haojun and Cao,\n Yue and Huang,\n Gao\n},\n title = {\n Deep Incubation: Training Large Models by Divide-and-Conquering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17335-17345\n} \n}" }, { "title": "Deep Multitask Learning with Progressive Parameter Sharing", @@ -13748,14 +14211,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shi_Deep_Multitask_Learning_with_Progressive_Parameter_Sharing_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;Nanyang Technological University;Continental Automotive", + "aff_unique_norm": "The Chinese University of Hong Kong;Nanyang Technological University;Continental Automotive", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.ntu.edu.sg;https://www.continental-automotive.com", "aff_unique_abbr": "CUHK;NTU;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+1;1;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Haosen and Ren,\n Shen and 
Zhang,\n Tianwei and Pan,\n Sinno Jialin\n},\n title = {\n Deep Multitask Learning with Progressive Parameter Sharing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19924-19935\n} \n}" }, { "title": "Deep Multiview Clustering by Contrasting Cluster Assignments", @@ -13787,7 +14251,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Jie and Mao,\n Hua and Woo,\n Wai Lok and Peng,\n Xi\n},\n title = {\n Deep Multiview Clustering by Contrasting Cluster Assignments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16752-16761\n} \n}" }, { "title": "Deep Optics for Video Snapshot Compressive Imaging", @@ -13819,7 +14284,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ping and Wang,\n Lishun and Yuan,\n Xin\n},\n title = {\n Deep Optics for Video Snapshot Compressive Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10646-10656\n} \n}" }, { "title": "Deep Video Demoireing via Compact Invertible Dyadic Decomposition", @@ -13842,7 +14308,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Quan_Deep_Video_Demoireing_via_Compact_Invertible_Dyadic_Decomposition_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Quan_Deep_Video_Demoireing_via_Compact_Invertible_Dyadic_Decomposition_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Quan_2023_ICCV,\n \n author = {\n Quan,\n Yuhui and Huang,\n Haoran and He,\n Shengfeng and Xu,\n Ruotao\n},\n title = {\n Deep Video Demoireing via Compact Invertible Dyadic Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12677-12686\n} \n}" }, { "title": "DeepChange: A Long-Term Person Re-Identification Benchmark with Clothes Change", @@ -13874,7 +14341,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Peng and Zhu,\n Xiatian\n},\n title = {\n DeepChange: A Long-Term Person Re-Identification Benchmark with Clothes Change\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11196-11205\n} \n}" }, { "title": "DeformToon3D: Deformable Neural Radiance Fields for 3D Toonification", @@ -13906,7 +14374,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;0;1;0;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Junzhe and Lan,\n Yushi and Yang,\n Shuai and Hong,\n Fangzhou and Wang,\n Quan and Yeo,\n Chai Kiat and Liu,\n Ziwei and Loy,\n Chen Change\n},\n title = {\n DeformToon3D: Deformable Neural Radiance Fields for 3D Toonification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = 
{\n 9144-9154\n} \n}" }, { "title": "Deformable Model-Driven Neural Rendering for High-Fidelity 3D Reconstruction of Human Heads Under Low-View Settings", @@ -13929,7 +14398,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Deformable_Model-Driven_Neural_Rendering_for_High-Fidelity_3D_Reconstruction_of_Human_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Deformable_Model-Driven_Neural_Rendering_for_High-Fidelity_3D_Reconstruction_of_Human_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Baixin and Zhang,\n Jiarui and Lin,\n Kwan-Yee and Qian,\n Chen and He,\n Ying\n},\n title = {\n Deformable Model-Driven Neural Rendering for High-Fidelity 3D Reconstruction of Human Heads Under Low-View Settings\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17924-17934\n} \n}" }, { "title": "Deformable Neural Radiance Fields using RGB and Event Cameras", @@ -13952,7 +14422,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_Deformable_Neural_Radiance_Fields_using_RGB_and_Event_Cameras_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_Deformable_Neural_Radiance_Fields_using_RGB_and_Event_Cameras_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Qi and Paudel,\n Danda Pani and Chhatkuli,\n Ajad and Van Gool,\n Luc\n},\n title = {\n Deformable Neural Radiance Fields using RGB and Event Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3590-3600\n} \n}" }, { "title": "Deformer: Dynamic Fusion Transformer for Robust Hand Pose Estimation", @@ -13984,7 
+14455,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Qichen and Liu,\n Xingyu and Xu,\n Ran and Niebles,\n Juan Carlos and Kitani,\n Kris M.\n},\n title = {\n Deformer: Dynamic Fusion Transformer for Robust Hand Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23600-23611\n} \n}" }, { "title": "Degradation-Resistant Unfolding Network for Heterogeneous Image Fusion", @@ -13996,7 +14468,7 @@ "author": "Chunming He; Kai Li; Guoxia Xu; Yulun Zhang; Runze Hu; Zhenhua Guo; Xiu Li", "abstract": "Heterogeneous image fusion (HIF) techniques aim to enhance image quality by merging complementary information from images captured by different sensors. Among these algorithms, deep unfolding network (DUN)-based methods achieve promising performance but still suffer from two issues: they lack a degradation-resistant-oriented fusion model and struggle to adequately consider the structural properties of DUNs, making them vulnerable to degradation scenarios. In this paper, we propose a Degradation-Resistant Unfolding Network (DeRUN) for the HIF task to generate high-quality fused images even in degradation scenarios. Specifically, we introduce a novel HIF model for degradation resistance and derive its optimization procedures. Then, we incorporate the optimization unfolding process into the proposed DeRUN for end-to-end training. To ensure the robustness and efficiency of DeRUN, we employ a joint constraint strategy and a lightweight partial weight sharing module. To train DeRUN, we further propose a gradient direction-based entropy loss with powerful texture representation capacity. 
Extensive experiments show that DeRUN significantly outperforms existing methods on four HIF tasks, as well as downstream applications, with cheaper computational and memory costs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/He_Degradation-Resistant_Unfolding_Network_for_Heterogeneous_Image_Fusion_ICCV_2023_paper.pdf", - "aff": "Shenzhen International Graduate School, Tsinghua University+Smart Vision; NEC Laboratories America; Smart Vision+Nanjing University of Posts and Telecommunications; ETH Z\u00fcrich; Beijing Institute of Technology; Tianyi Traffic Technology; Shenzhen International Graduate School, Tsinghua University", + "aff": "Shenzhen International Graduate School, Tsinghua University+Smart Vision; NEC Laboratories America; Smart Vision+Nanjing University of Posts and Telecommunications; ETH Zürich; Beijing Institute of Technology; Tianyi Traffic Technology; Shenzhen International Graduate School, Tsinghua University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/He_Degradation-Resistant_Unfolding_Network_ICCV_2023_supplemental.pdf", @@ -14009,14 +14481,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/He_Degradation-Resistant_Unfolding_Network_for_Heterogeneous_Image_Fusion_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;1+3;4;5;6;0", - "aff_unique_norm": "Tsinghua University;Smart Vision;NEC Laboratories America;Nanjing University of Posts and Telecommunications;ETH Zurich;Beijing Institute of Technology;Tianyi Traffic Technology", + "aff_unique_norm": "Tsinghua University;Smart Vision;NEC Laboratories America;Nanjing University of Posts and Telecommunications;ETH Zürich;Beijing Institute of Technology;Tianyi Traffic Technology", "aff_unique_dep": "Shenzhen International Graduate School;;;;;;", "aff_unique_url": "https://www.tsinghua.edu.cn;;https://www.nec-labs.com;http://www.njupt.edu.cn;https://www.ethz.ch;http://www.bit.edu.cn/;", 
"aff_unique_abbr": "THU;;NEC Labs America;NJUPT;ETHZ;BIT;", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Shenzhen;;Nanjing", "aff_country_unique_index": "0;2;0;3;0;0", - "aff_country_unique": "China;;United States;Switzerland" + "aff_country_unique": "China;;United States;Switzerland", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Chunming and Li,\n Kai and Xu,\n Guoxia and Zhang,\n Yulun and Hu,\n Runze and Guo,\n Zhenhua and Li,\n Xiu\n},\n title = {\n Degradation-Resistant Unfolding Network for Heterogeneous Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12611-12621\n} \n}" }, { "title": "Delicate Textured Mesh Recovery from NeRF via Adaptive Surface Refinement", @@ -14041,14 +14514,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tang_Delicate_Textured_Mesh_Recovery_from_NeRF_via_Adaptive_Surface_Refinement_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;0+1;1;1;1;0", - "aff_unique_norm": "Peking University;Baidu", - "aff_unique_dep": "School of IST;Baidu Inc.", + "aff_unique_norm": "Peking University;Baidu Inc.", + "aff_unique_dep": "School of IST;", "aff_unique_url": "http://www.pku.edu.cn;https://www.baidu.com", "aff_unique_abbr": "PKU;Baidu", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Jiaxiang and Zhou,\n Hang and Chen,\n Xiaokang and Hu,\n Tianshu and Ding,\n Errui and Wang,\n Jingdong and Zeng,\n Gang\n},\n title = {\n Delicate Textured Mesh Recovery from NeRF via Adaptive Surface Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
17739-17749\n} \n}" }, { "title": "Delta Denoising Score", @@ -14071,7 +14545,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hertz_Delta_Denoising_Score_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hertz_Delta_Denoising_Score_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Hertz_2023_ICCV,\n \n author = {\n Hertz,\n Amir and Aberman,\n Kfir and Cohen-Or,\n Daniel\n},\n title = {\n Delta Denoising Score\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2328-2337\n} \n}" }, { "title": "Delving into Motion-Aware Matching for Monocular 3D Object Tracking", @@ -14097,13 +14572,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Delving_into_Motion-Aware_Matching_for_Monocular_3D_Object_Tracking_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;1", "aff_unique_norm": "University of California, Merced;Google;Yonsei University", - "aff_unique_dep": ";Google;", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucmerced.edu;https://www.google.com;https://www.yonsei.ac.kr", "aff_unique_abbr": "UC Merced;Google;Yonsei", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Merced;Mountain View;", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Kuan-Chih and Yang,\n Ming-Hsuan and Tsai,\n Yi-Hsuan\n},\n title = {\n Delving into Motion-Aware Matching for Monocular 3D Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6909-6918\n} \n}" }, { "title": "Democratising 2D Sketch to 3D Shape Retrieval Through Pivoting", @@ -14135,7 
+14611,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Chowdhury_2023_ICCV,\n \n author = {\n Chowdhury,\n Pinaki Nath and Bhunia,\n Ayan Kumar and Sain,\n Aneeshan and Koley,\n Subhadeep and Xiang,\n Tao and Song,\n Yi-Zhe\n},\n title = {\n Democratising 2D Sketch to 3D Shape Retrieval Through Pivoting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23275-23286\n} \n}" }, { "title": "Denoising Diffusion Autoencoders are Unified Self-supervised Learners", @@ -14167,7 +14644,8 @@ "aff_campus_unique_index": "0+0+0;0+0+0;0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0+0;0+0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiang_2023_ICCV,\n \n author = {\n Xiang,\n Weilai and Yang,\n Hongyu and Huang,\n Di and Wang,\n Yunhong\n},\n title = {\n Denoising Diffusion Autoencoders are Unified Self-supervised Learners\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15802-15812\n} \n}" }, { "title": "Dense 2D-3D Indoor Prediction with Sound via Aligned Cross-Modal Distillation", @@ -14199,7 +14677,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yun_2023_ICCV,\n \n author = {\n Yun,\n Heeseung and Na,\n Joonil and Kim,\n Gunhee\n},\n title = {\n Dense 2D-3D Indoor Prediction with Sound via Aligned Cross-Modal Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 7863-7872\n} \n}" }, { "title": "Dense Text-to-Image Generation with Attention Modulation", @@ -14231,7 +14710,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Yunji and Lee,\n Jiyoung and Kim,\n Jin-Hwa and Ha,\n Jung-Woo and Zhu,\n Jun-Yan\n},\n title = {\n Dense Text-to-Image Generation with Attention Modulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7701-7711\n} \n}" }, { "title": "DenseShift: Towards Accurate and Efficient Low-Bit Power-of-Two Quantization", @@ -14254,7 +14734,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DenseShift_Towards_Accurate_and_Efficient_Low-Bit_Power-of-Two_Quantization_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DenseShift_Towards_Accurate_and_Efficient_Low-Bit_Power-of-Two_Quantization_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xinlin and Liu,\n Bang and Yang,\n Rui Heng and Courville,\n Vanessa and Xing,\n Chao and Nia,\n Vahid Partovi\n},\n title = {\n DenseShift: Towards Accurate and Efficient Low-Bit Power-of-Two Quantization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17010-17020\n} \n}" }, { "title": "Density-invariant Features for Distant Point Cloud Registration", @@ -14286,7 +14767,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": 
"China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Quan and Zhu,\n Hongzi and Zhou,\n Yunsong and Li,\n Hongyang and Chang,\n Shan and Guo,\n Minyi\n},\n title = {\n Density-invariant Features for Distant Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18215-18225\n} \n}" }, { "title": "Designing Phase Masks for Under-Display Cameras", @@ -14309,7 +14791,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Designing_Phase_Masks_for_Under-Display_Cameras_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Designing_Phase_Masks_for_Under-Display_Cameras_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Anqi and Kang,\n Eunhee and Lee,\n Hyong-Euk and Sankaranarayanan,\n Aswin C.\n},\n title = {\n Designing Phase Masks for Under-Display Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10637-10645\n} \n}" }, { "title": "DetZero: Rethinking Offboard 3D Object Detection with Long-term Sequential Point Clouds", @@ -14334,14 +14817,15 @@ "author_num": 12, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_DetZero_Rethinking_Offboard_3D_Object_Detection_with_Long-term_Sequential_Point_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;1;3;4;5;2;1;1;0+1+6", - "aff_unique_norm": "Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory;East China Normal University;South China University of Technology;Fudan University;ETH Zurich;Center for Process Innovation and Integration", + "aff_unique_norm": "The Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory;East China Normal 
University;South China University of Technology;Fudan University;ETH Zurich;Center for Process Innovation and Integration", "aff_unique_dep": "Multimedia Laboratory;;;;;;", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.shailab.org/;http://www.ecnu.edu.cn;https://www.scut.edu.cn;https://www.fudan.edu.cn;https://www.ethz.ch;", "aff_unique_abbr": "CUHK;Shanghai AI Lab;ECNU;SCUT;Fudan;ETHZ;CPII", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0;0;0;0+0", - "aff_country_unique": "China;Switzerland;" + "aff_country_unique": "China;Switzerland;", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Tao and Yang,\n Xuemeng and Zhou,\n Hongbin and Li,\n Xin and Shi,\n Botian and Liu,\n Junjie and Yang,\n Yuchen and Liu,\n Zhizheng and He,\n Liang and Qiao,\n Yu and Li,\n Yikang and Li,\n Hongsheng\n},\n title = {\n DetZero: Rethinking Offboard 3D Object Detection with Long-term Sequential Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6736-6747\n} \n}" }, { "title": "Detecting Objects with Context-Likelihood Graphs and Graph Refinement", @@ -14373,7 +14857,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Bhowmik_2023_ICCV,\n \n author = {\n Bhowmik,\n Aritra and Wang,\n Yu and Baka,\n Nora and Oswald,\n Martin R. and Snoek,\n Cees G. 
M.\n},\n title = {\n Detecting Objects with Context-Likelihood Graphs and Graph Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6524-6533\n} \n}" }, { "title": "Detection Transformer with Stable Matching", @@ -14396,7 +14881,8 @@ "aff_domain": ";;;;;;;;;;", "email": ";;;;;;;;;;", "author_num": 11, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Detection_Transformer_with_Stable_Matching_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Detection_Transformer_with_Stable_Matching_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Shilong and Ren,\n Tianhe and Chen,\n Jiayu and Zeng,\n Zhaoyang and Zhang,\n Hao and Li,\n Feng and Li,\n Hongyang and Huang,\n Jun and Su,\n Hang and Zhu,\n Jun and Zhang,\n Lei\n},\n title = {\n Detection Transformer with Stable Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6491-6500\n} \n}" }, { "title": "DetermiNet: A Large-Scale Diagnostic Dataset for Complex Visually-Grounded Referencing using Determiners", @@ -14428,7 +14914,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Clarence and Kumar,\n M Ganesh and Tan,\n Cheston\n},\n title = {\n DetermiNet: A Large-Scale Diagnostic Dataset for Complex Visually-Grounded Referencing using Determiners\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20019-20028\n} \n}" }, { "title": "DiFaReli: Diffusion Face Relighting", @@ -14451,7 +14938,8 
@@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ponglertnapakorn_DiFaReli_Diffusion_Face_Relighting_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ponglertnapakorn_DiFaReli_Diffusion_Face_Relighting_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ponglertnapakorn_2023_ICCV,\n \n author = {\n Ponglertnapakorn,\n Puntawat and Tritrong,\n Nontawat and Suwajanakorn,\n Supasorn\n},\n title = {\n DiFaReli: Diffusion Face Relighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22646-22657\n} \n}" }, { "title": "DiLiGenT-Pi: Photometric Stereo for Planar Surfaces with Rich Details - Benchmark Dataset and Beyond", @@ -14483,7 +14971,8 @@ "aff_campus_unique_index": ";1;2;1;", "aff_campus_unique": ";Shanghai;Beijing", "aff_country_unique_index": "0+0+0;0;0+1;0;0+0+0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Feishi and Ren,\n Jieji and Guo,\n Heng and Ren,\n Mingjun and Shi,\n Boxin\n},\n title = {\n DiLiGenT-Pi: Photometric Stereo for Planar Surfaces with Rich Details - Benchmark Dataset and Beyond\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9477-9487\n} \n}" }, { "title": "Diff-Retinex: Rethinking Low-light Image Enhancement with A Generative Diffusion Model", @@ -14515,7 +15004,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yi_2023_ICCV,\n \n author = {\n Yi,\n Xunpeng and Xu,\n Han and Zhang,\n Hao and Tang,\n Linfeng and Ma,\n Jiayi\n},\n title = {\n Diff-Retinex: Rethinking 
Low-light Image Enhancement with A Generative Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12302-12311\n} \n}" }, { "title": "DiffCloth: Diffusion Based Garment Synthesis and Manipulation via Structural Cross-modal Semantic Alignment", @@ -14538,7 +15028,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_DiffCloth_Diffusion_Based_Garment_Synthesis_and_Manipulation_via_Structural_Cross-modal_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_DiffCloth_Diffusion_Based_Garment_Synthesis_and_Manipulation_via_Structural_Cross-modal_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xujie and Yang,\n Binbin and Kampffmeyer,\n Michael C. and Zhang,\n Wenqing and Zhang,\n Shiyue and Lu,\n Guansong and Lin,\n Liang and Xu,\n Hang and Liang,\n Xiaodan\n},\n title = {\n DiffCloth: Diffusion Based Garment Synthesis and Manipulation via Structural Cross-modal Semantic Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23154-23163\n} \n}" }, { "title": "DiffDis: Empowering Generative Diffusion Model with Cross-Modal Discrimination Capability", @@ -14550,7 +15041,7 @@ "author": "Runhui Huang; Jianhua Han; Guansong Lu; Xiaodan Liang; Yihan Zeng; Wei Zhang; Hang Xu", "abstract": "Recently, large-scale diffusion models, e.g., Stable diffusion and DallE2, have shown remarkable results on image synthesis. On the other hand, large-scale cross-modal pre-trained models (e.g., CLIP, ALIGN, and FILIP) are competent for various downstream tasks by learning to align vision and language embeddings. 
In this paper, we explore the possibility of jointly modeling generation and discrimination. Specifically, we propose DiffDis to unify the cross-modal generative and discriminative pretraining into one single framework under the diffusion process. DiffDis first formulates the image-text discriminative problem as a generative diffusion process of the text embedding from the text encoder conditioned on the image. Then, we propose a novel dual-stream network architecture, which fuses the noisy text embedding with the knowledge of latent images from different scales for image-text discriminative learning. Moreover, the generative and discriminative tasks can efficiently share the image-branch network structure in the multi-modality model. Benefiting from diffusion-based unified training, DiffDis achieves both better generation ability and cross-modal semantic alignment in one architecture. Experimental results show that DiffDis outperforms single-task models on both the image generation and the image-text discriminative tasks, e.g., 1.65% improvement on average accuracy of zero-shot classification over 12 datasets and 2.42 improvement on FID of zero-shot image synthesis.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Huang_DiffDis_Empowering_Generative_Diffusion_Model_with_Cross-Modal_Discrimination_Capability_ICCV_2023_paper.pdf", - "aff": "Shenzhen Campus of Sun Yat-sen University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Shenzhen Campus of Sun Yat-sen University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "Shenzhen Campus of Sun Yat-sen University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Shenzhen Campus of Sun Yat-sen University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Huang_DiffDis_Empowering_Generative_ICCV_2023_supplemental.pdf", @@ -14564,13 
+15055,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_DiffDis_Empowering_Generative_Diffusion_Model_with_Cross-Modal_Discrimination_Capability_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;1;1;1", "aff_unique_norm": "Sun Yat-sen University;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.huawei.com", "aff_unique_abbr": "SYSU;Huawei", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Runhui and Han,\n Jianhua and Lu,\n Guansong and Liang,\n Xiaodan and Zeng,\n Yihan and Zhang,\n Wei and Xu,\n Hang\n},\n title = {\n DiffDis: Empowering Generative Diffusion Model with Cross-Modal Discrimination Capability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15713-15723\n} \n}" }, { "title": "DiffDreamer: Towards Consistent Unsupervised Single-view Scene Extrapolation with Conditional Diffusion Models", @@ -14582,7 +15074,7 @@ "author": "Shengqu Cai; Eric Ryan Chan; Songyou Peng; Mohamad Shahbazi; Anton Obukhov; Luc Van Gool; Gordon Wetzstein", "abstract": "Scene extrapolation---the idea of generating novel views by flying into a given image---is a promising, yet challenging task. For each predicted frame, a joint inpainting and 3D refinement problem has to be solved, which is ill posed and includes a high level of ambiguity. Moreover, training data for long-range scenes is difficult to obtain and usually lacks sufficient views to infer accurate camera poses. 
We introduce DiffDreamer, an unsupervised framework capable of synthesizing novel views depicting a long camera trajectory while training solely on internet-collected images of nature scenes. Utilizing the stochastic nature of the guided denoising steps, we train the diffusion models to refine projected RGBD images but condition the denoising steps on multiple past and future frames for inference. We demonstrate that image-conditioned diffusion models can effectively perform long-range scene extrapolation while preserving consistency significantly better than prior GAN-based methods. DiffDreamer is a powerful and efficient solution for scene extrapolation, producing impressive results despite limited supervision. Project page: https://primecai.github.io/diffdreamer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Cai_DiffDreamer_Towards_Consistent_Unsupervised_Single-view_Scene_Extrapolation_with_Conditional_Diffusion_ICCV_2023_paper.pdf", - "aff": "Stanford University; Stanford University; ETH Z\u00fcrich + MPI for Intelligent Systems, T\u00fcbingen; ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich + KU Leuven; Stanford University", + "aff": "Stanford University; Stanford University; ETH Zürich + MPI for Intelligent Systems, Tübingen; ETH Zürich; ETH Zürich; ETH Zürich + KU Leuven; Stanford University", "project": "https://primecai.github.io/diffdreamer", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Cai_DiffDreamer_Towards_Consistent_ICCV_2023_supplemental.zip", @@ -14595,14 +15087,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_DiffDreamer_Towards_Consistent_Unsupervised_Single-view_Scene_Extrapolation_with_Conditional_Diffusion_ICCV_2023_paper.html", "aff_unique_index": "0;0;1+2;1;1;1+3;0", - "aff_unique_norm": "Stanford University;ETH Zurich;Max Planck Institute for Intelligent Systems;Katholieke Universiteit Leuven", + "aff_unique_norm": "Stanford University;ETH 
Zürich;Max Planck Institute for Intelligent Systems;Katholieke Universiteit Leuven", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://www.ethz.ch;https://www.mpi-is.mpg.de;https://www.kuleuven.be", "aff_unique_abbr": "Stanford;ETHZ;MPI-IS;KU Leuven", "aff_campus_unique_index": "0;0;2;;0", - "aff_campus_unique": "Stanford;;T\u00fcbingen", + "aff_campus_unique": "Stanford;;Tübingen", "aff_country_unique_index": "0;0;1+2;1;1;1+3;0", - "aff_country_unique": "United States;Switzerland;Germany;Belgium" + "aff_country_unique": "United States;Switzerland;Germany;Belgium", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Shengqu and Chan,\n Eric Ryan and Peng,\n Songyou and Shahbazi,\n Mohamad and Obukhov,\n Anton and Van Gool,\n Luc and Wetzstein,\n Gordon\n},\n title = {\n DiffDreamer: Towards Consistent Unsupervised Single-view Scene Extrapolation with Conditional Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2139-2150\n} \n}" }, { "title": "DiffFacto: Controllable Part-Based 3D Point Cloud Generation with Cross Diffusion", @@ -14634,7 +15127,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;1;1;2;0", - "aff_country_unique": "United States;China;Canada" + "aff_country_unique": "United States;China;Canada", + "bibtex": "@InProceedings{Nakayama_2023_ICCV,\n \n author = {\n Nakayama,\n George Kiyohiro and Uy,\n Mikaela Angelina and Huang,\n Jiahui and Hu,\n Shi-Min and Li,\n Ke and Guibas,\n Leonidas\n},\n title = {\n DiffFacto: Controllable Part-Based 3D Point Cloud Generation with Cross Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14257-14267\n} \n}" }, { "title": "DiffFit: Unlocking Transferability of 
Large Diffusion Models via Simple Parameter-efficient Fine-Tuning", @@ -14646,7 +15140,7 @@ "author": "Enze Xie; Lewei Yao; Han Shi; Zhili Liu; Daquan Zhou; Zhaoqiang Liu; Jiawei Li; Zhenguo Li", "abstract": "Diffusion models have proven to be highly effective in generating high-quality images. However, adapting large pre-trained diffusion models to new domains remains an open challenge, which is critical for real-world applications. This paper proposes DiffFit, a parameter-efficient strategy to fine-tune large pre-trained diffusion models that enable fast adaptation to new domains. DiffFit is embarrassingly simple that only fine-tunes the bias term and newly-added scaling factors in specific layers, yet resulting in significant training speed-up and reduced model storage costs. Compared with full fine-tuning, DiffFit achieves 2x training speed-up and only needs to store approximately 0.12% of the total model parameters. Intuitive theoretical analysis has been provided to justify the efficacy of scaling factors on fast adaptation. On 8 downstream datasets, DiffFit achieves superior or competitive performances compared to the full fine-tuning while being more efficient. Remarkably, we show that DiffFit can adapt a pre-trained low-resolution generative model to a high-resolution one by adding minimal cost. 
Among diffusion-based methods, DiffFit sets a new state-of-the-art FID of 3.02 on ImageNet 512x512 benchmark by fine-tuning only 25 epochs from a public pre-trained ImageNet 256x256 checkpoint while being 30x more training efficient than the closest competitor.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Xie_DiffFit_Unlocking_Transferability_of_Large_Diffusion_Models_via_Simple_Parameter-efficient_ICCV_2023_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; National University of Singapore; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; National University of Singapore; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Xie_DiffFit_Unlocking_Transferability_ICCV_2023_supplemental.pdf", @@ -14660,13 +15154,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xie_DiffFit_Unlocking_Transferability_of_Large_Diffusion_Models_via_Simple_Parameter-efficient_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0;0;0", "aff_unique_norm": "Huawei;National University of Singapore", - "aff_unique_dep": "Noah\u2019s Ark Lab;", + "aff_unique_dep": "Noah’s Ark Lab;", "aff_unique_url": "https://www.huawei.com;https://www.nus.edu.sg", "aff_unique_abbr": "Huawei;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Enze and Yao,\n Lewei and Shi,\n Han and Liu,\n Zhili and Zhou,\n Daquan and Liu,\n Zhaoqiang and Li,\n Jiawei and Li,\n Zhenguo\n},\n title = {\n DiffFit: Unlocking Transferability of Large Diffusion 
Models via Simple Parameter-efficient Fine-Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4230-4239\n} \n}" }, { "title": "DiffIR: Efficient Diffusion Model for Image Restoration", @@ -14678,7 +15173,7 @@ "author": "Bin Xia; Yulun Zhang; Shiyin Wang; Yitong Wang; Xinglong Wu; Yapeng Tian; Wenming Yang; Luc Van Gool", "abstract": "Diffusion model (DM) has achieved SOTA performance by modeling the image synthesis process into a sequential application of a denoising network. However, different from image synthesis generating each pixel from scratch, most pixels of image restoration (IR) are given. Thus, for IR, traditional DMs running massive iterations on a large model to estimate whole images or feature maps is inefficient. To address this issue, we propose an efficient DM for IR (DiffIR), which consists of a compact IR prior extraction network (CPEN), dynamic IR transformer (DIRformer), and denoising network. Specifically, DiffIR has two training stages: pretraining and training DM. In pretraining, we input ground-truth images into CPEN-S1 to capture a compact IR prior representation (IPR) to guide DIRformer. In the second stage, we train the DM to directly estimate the same IRP as pretrained CPEN-S1 only using LQ images. We observe that since the IPR is only a compact vector, DiffIR can use fewer iterations than traditional DM to obtain accurate estimations and generate more stable and realistic results. Since the iterations are few, our DiffIR can adopt a joint optimization of CPEN-S2, DIRformer, and denoising network, which can further reduce the estimation error influence. We conduct extensive experiments on several IR tasks and achieve SOTA performance while consuming less computational costs. 
Codes and models will be released.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Xia_DiffIR_Efficient_Diffusion_Model_for_Image_Restoration_ICCV_2023_paper.pdf", - "aff": "Tsinghua University; ETH Z\u00fcrich; ByteDance Inc; University of Texas at Dallas; ByteDance Inc; University of Texas at Dallas; Tsinghua University; ETH Z\u00fcrich", + "aff": "Tsinghua University; ETH Zürich; ByteDance Inc; University of Texas at Dallas; ByteDance Inc; University of Texas at Dallas; Tsinghua University; ETH Zürich", "project": "", "github": "https://github.com/Zj-BinXia/DiffIR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Xia_DiffIR_Efficient_Diffusion_ICCV_2023_supplemental.pdf", @@ -14691,14 +15186,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xia_DiffIR_Efficient_Diffusion_Model_for_Image_Restoration_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;2;3;0;1", - "aff_unique_norm": "Tsinghua University;ETH Zurich;ByteDance;University of Texas at Dallas", + "aff_unique_norm": "Tsinghua University;ETH Zürich;ByteDance;University of Texas at Dallas", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ethz.ch;https://www.bytedance.com;https://www.utdallas.edu", "aff_unique_abbr": "THU;ETHZ;ByteDance;UT Dallas", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Dallas", "aff_country_unique_index": "0;1;0;2;0;2;0;1", - "aff_country_unique": "China;Switzerland;United States" + "aff_country_unique": "China;Switzerland;United States", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Bin and Zhang,\n Yulun and Wang,\n Shiyin and Wang,\n Yitong and Wu,\n Xinglong and Tian,\n Yapeng and Yang,\n Wenming and Van Gool,\n Luc\n},\n title = {\n DiffIR: Efficient Diffusion Model for Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n 
pages = {\n 13095-13105\n} \n}" }, { "title": "DiffPose: Multi-hypothesis Human Pose Estimation using Diffusion Models", @@ -14710,7 +15206,7 @@ "author": "Karl Holmquist; Bastian Wandt", "abstract": "Traditionally, monocular 3D human pose estimation employs a machine learning model to predict the most likely 3D pose for a given input image. However, a single image can be highly ambiguous and induces multiple plausible solutions for the 2D-3D lifting step, which results in overly confident 3D pose predictors. To this end, we propose DiffPose, a conditional diffusion model that predicts multiple hypotheses for a given input image. Compared to similar approaches, our diffusion model is straightforward and avoids intensive hyperparameter tuning, complex network structures, mode collapse, and unstable training.\n Moreover, we tackle the problem of over-simplification of the intermediate representation of the common two-step approaches which first estimate a distribution of 2D joint locations via joint-wise heatmaps and consecutively use their maximum argument for the 3D pose estimation step. Since such a simplification of the heatmaps removes valid information about possibly correct, though labeled unlikely, joint locations, we propose to represent the heatmaps as a set of 2D joint candidate samples. To extract information about the original distribution from these samples, we introduce our embedding transformer which conditions the diffusion model. 
Experimentally, we show that DiffPose improves upon the state of the art for multi-hypothesis pose estimation by 3-5% for simple poses and outperforms it by a large\n margin for highly ambiguous poses.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Holmquist_DiffPose_Multi-hypothesis_Human_Pose_Estimation_using_Diffusion_Models_ICCV_2023_paper.pdf", - "aff": "Link \u00a8oping University; Link \u00a8oping University", + "aff": "Linköping University; Linköping University", "project": "", "github": "https://github.com/bastianwandt/DiffPose/", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Holmquist_DiffPose_Multi-hypothesis_Human_ICCV_2023_supplemental.pdf", @@ -14723,14 +15219,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Holmquist_DiffPose_Multi-hypothesis_Human_Pose_Estimation_using_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Link\u00f6ping University", + "aff_unique_norm": "Linköping University", "aff_unique_dep": "", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Holmquist_2023_ICCV,\n \n author = {\n Holmquist,\n Karl and Wandt,\n Bastian\n},\n title = {\n DiffPose: Multi-hypothesis Human Pose Estimation using Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15977-15987\n} \n}" }, { "title": "DiffPose: SpatioTemporal Diffusion Model for Video-Based Human Pose Estimation", @@ -14762,7 +15259,8 @@ "aff_campus_unique_index": ";;1;;1", "aff_campus_unique": ";Birmingham", "aff_country_unique_index": "0+0;0+0;1;0+0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", 
+ "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Runyang and Gao,\n Yixing and Tse,\n Tze Ho Elden and Ma,\n Xueqing and Chang,\n Hyung Jin\n},\n title = {\n DiffPose: SpatioTemporal Diffusion Model for Video-Based Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14861-14872\n} \n}" }, { "title": "DiffRate : Differentiable Compression Rate for Efficient Vision Transformers", @@ -14785,7 +15283,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_DiffRate__Differentiable_Compression_Rate_for_Efficient_Vision_Transformers_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_DiffRate__Differentiable_Compression_Rate_for_Efficient_Vision_Transformers_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Mengzhao and Shao,\n Wenqi and Xu,\n Peng and Lin,\n Mingbao and Zhang,\n Kaipeng and Chao,\n Fei and Ji,\n Rongrong and Qiao,\n Yu and Luo,\n Ping\n},\n title = {\n DiffRate : Differentiable Compression Rate for Efficient Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17164-17174\n} \n}" }, { "title": "DiffTAD: Temporal Action Detection with Proposal Denoising Diffusion", @@ -14817,7 +15316,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Nag_2023_ICCV,\n \n author = {\n Nag,\n Sauradip and Zhu,\n Xiatian and Deng,\n Jiankang and Song,\n Yi-Zhe and Xiang,\n Tao\n},\n title = {\n DiffTAD: Temporal Action Detection with Proposal Denoising 
Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10362-10374\n} \n}" }, { "title": "DiffV2S: Diffusion-Based Video-to-Speech Synthesis with Vision-Guided Speaker Embedding", @@ -14849,7 +15349,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choi_2023_ICCV,\n \n author = {\n Choi,\n Jeongsoo and Hong,\n Joanna and Ro,\n Yong Man\n},\n title = {\n DiffV2S: Diffusion-Based Video-to-Speech Synthesis with Vision-Guided Speaker Embedding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7812-7821\n} \n}" }, { "title": "Differentiable Transportation Pruning", @@ -14872,7 +15373,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Differentiable_Transportation_Pruning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Differentiable_Transportation_Pruning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yunqiang and van Gemert,\n Jan C. 
and Hoefler,\n Torsten and Moons,\n Bert and Eleftheriou,\n Evangelos and Verhoef,\n Bram-Ernst\n},\n title = {\n Differentiable Transportation Pruning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16957-16967\n} \n}" }, { "title": "DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic Segmentation Using Diffusion Models", @@ -14904,7 +15406,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+1;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Weijia and Zhao,\n Yuzhong and Shou,\n Mike Zheng and Zhou,\n Hong and Shen,\n Chunhua\n},\n title = {\n DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic Segmentation Using Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1206-1217\n} \n}" }, { "title": "Diffuse3D: Wide-Angle 3D Photography via Bilateral Diffusion", @@ -14936,7 +15439,8 @@ "aff_campus_unique_index": ";;;1", "aff_campus_unique": ";Birmingham", "aff_country_unique_index": "0+1;0+1;0+1;0;2;0;1", - "aff_country_unique": "China;Singapore;United Kingdom" + "aff_country_unique": "China;Singapore;United Kingdom", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Yutao and Zhou,\n Yang and Liang,\n Yuan and Liu,\n Wenxi and Jiao,\n Jianbo and Quan,\n Yuhui and He,\n Shengfeng\n},\n title = {\n Diffuse3D: Wide-Angle 3D Photography via Bilateral Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8998-9008\n} \n}" }, { "title": "Diffusion Action Segmentation", @@ -14961,14 
+15465,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Diffusion_Action_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;0;2;3;0", - "aff_unique_norm": "University of Sydney;NERCVT;Peking University;University of Central Florida", + "aff_unique_norm": "The University of Sydney;NERCVT;Peking University;University of Central Florida", "aff_unique_dep": "School of Computer Science;School of Computer Science;School of Mathematical Sciences;Center for Research in Computer Vision", "aff_unique_url": "https://www.sydney.edu.au;;http://www.pku.edu.cn;https://www.ucf.edu", "aff_unique_abbr": "USYD;;PKU;UCF", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Beijing;Orlando", "aff_country_unique_index": "0;2;0;2;3;0", - "aff_country_unique": "Australia;;China;United States" + "aff_country_unique": "Australia;;China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Daochang and Li,\n Qiyue and Dinh,\n Anh-Dung and Jiang,\n Tingting and Shah,\n Mubarak and Xu,\n Chang\n},\n title = {\n Diffusion Action Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10139-10149\n} \n}" }, { "title": "Diffusion Model as Representation Learner", @@ -15000,7 +15505,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Xingyi and Wang,\n Xinchao\n},\n title = {\n Diffusion Model as Representation Learner\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18938-18949\n} \n}" }, { "title": "Diffusion Models as Masked Autoencoders", @@ -15025,14 +15531,15 @@ "author_num": 10, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Diffusion_Models_as_Masked_Autoencoders_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;0;0;0;2;1;0", - "aff_unique_norm": "Meta;Johns Hopkins University;University of California, Santa Cruz", + "aff_unique_norm": "Meta AI;Johns Hopkins University;University of California, Santa Cruz", "aff_unique_dep": "Meta AI;;", "aff_unique_url": "https://meta.ai;https://www.jhu.edu;https://www.ucsc.edu", "aff_unique_abbr": "Meta AI;JHU;UCSC", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Chen and Mangalam,\n Karttikeya and Huang,\n Po-Yao and Li,\n Yanghao and Fan,\n Haoqi and Xu,\n Hu and Wang,\n Huiyu and Xie,\n Cihang and Yuille,\n Alan and Feichtenhofer,\n Christoph\n},\n title = {\n Diffusion Models as Masked Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16284-16294\n} \n}" }, { "title": "Diffusion in Style", @@ -15040,8 +15547,8 @@ "status": "Poster", "track": "main", "pid": "6203", - "author_site": "Martin Nicolas Everaert, Marco Bocchio, Sami Arpa, Sabine S\u00fcsstrunk, Radhakrishna Achanta", - "author": "Martin Nicolas Everaert; Marco Bocchio; Sami Arpa; Sabine S\u00fcsstrunk; Radhakrishna Achanta", + "author_site": "Martin Nicolas Everaert, Marco Bocchio, Sami Arpa, Sabine Süsstrunk, Radhakrishna Achanta", + "author": "Martin Nicolas Everaert; Marco Bocchio; Sami Arpa; Sabine Süsstrunk; Radhakrishna Achanta", "abstract": "We present Diffusion in Style, a simple method to adapt Stable Diffusion to any desired style, using only a small set of target images. 
It is based on the key observation that the style of the images generated by Stable Diffusion is tied to the initial latent tensor. Not adapting this initial latent tensor to the style makes fine-tuning slow, expensive, and impractical, especially when only a few target style images are available. In contrast, fine-tuning is much easier if this initial latent tensor is also adapted. Our Diffusion in Style is orders of magnitude more sample-efficient and faster. It also generates more pleasing images than existing approaches, as shown qualitatively and with quantitative comparisons.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Everaert_Diffusion_in_Style_ICCV_2023_paper.pdf", "aff": "School of Computer and Communication Sciences, EPFL, Switzerland; Largo.ai, Lausanne, Switzerland; Largo.ai, Lausanne, Switzerland; School of Computer and Communication Sciences, EPFL, Switzerland; School of Computer and Communication Sciences, EPFL, Switzerland", @@ -15057,14 +15564,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Everaert_Diffusion_in_Style_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "EPFL;Largo.ai", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;Largo.ai", "aff_unique_dep": "School of Computer and Communication Sciences;", "aff_unique_url": "https://www.epfl.ch;", "aff_unique_abbr": "EPFL;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Everaert_2023_ICCV,\n \n author = {\n Everaert,\n Martin Nicolas and Bocchio,\n Marco and Arpa,\n Sami and S\\\"usstrunk,\n Sabine and Achanta,\n Radhakrishna\n},\n title = {\n Diffusion in Style\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
2251-2261\n} \n}" }, { "title": "Diffusion-Based 3D Human Pose Estimation with Multi-Hypothesis Aggregation", @@ -15076,7 +15584,7 @@ "author": "Wenkang Shan; Zhenhua Liu; Xinfeng Zhang; Zhao Wang; Kai Han; Shanshe Wang; Siwei Ma; Wen Gao", "abstract": "In this paper, a novel Diffusion-based 3D Pose estimation (D3DP) method with Joint-wise reProjection-based Multi-hypothesis Aggregation (JPMA) is proposed for probabilistic 3D human pose estimation. On the one hand, D3DP generates multiple possible 3D pose hypotheses for a single 2D observation. It gradually diffuses the ground truth 3D poses to a random distribution, and learns a denoiser conditioned on 2D keypoints to recover the uncontaminated 3D poses. The proposed D3DP is compatible with existing 3D pose estimators and supports users to balance efficiency and accuracy during inference through two customizable parameters. On the other hand, JPMA is proposed to assemble multiple hypotheses generated by D3DP into a single 3D pose for practical use. It reprojects 3D pose hypotheses to the 2D camera plane, selects the best hypothesis joint-by-joint based on the reprojection errors, and combines the selected joints into the final pose. The proposed JPMA conducts aggregation at the joint level and makes use of the 2D prior information, both of which have been overlooked by previous approaches. Extensive experiments on Human3.6M and MPI-INF-3DHP datasets show that our method outperforms the state-of-the-art deterministic and probabilistic approaches by 1.5% and 8.9%, respectively. 
Code is available at https://github.com/paTRICK-swk/D3DP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shan_Diffusion-Based_3D_Human_Pose_Estimation_with_Multi-Hypothesis_Aggregation_ICCV_2023_paper.pdf", - "aff": "National Engineering Research Center of Visual Technology, Peking University; Huawei Noah\u2019s Ark Lab; University of Chinese Academy of Sciences; Huawei Noah\u2019s Ark Lab; National Engineering Research Center of Visual Technology, Peking University; National Engineering Research Center of Visual Technology, Peking University + Peng Cheng Laboratory; National Engineering Research Center of Visual Technology, Peking University + Peng Cheng Laboratory; National Engineering Research Center of Visual Technology, Peking University + Peng Cheng Laboratory", + "aff": "National Engineering Research Center of Visual Technology, Peking University; Huawei Noah’s Ark Lab; University of Chinese Academy of Sciences; Huawei Noah’s Ark Lab; National Engineering Research Center of Visual Technology, Peking University; National Engineering Research Center of Visual Technology, Peking University + Peng Cheng Laboratory; National Engineering Research Center of Visual Technology, Peking University + Peng Cheng Laboratory; National Engineering Research Center of Visual Technology, Peking University + Peng Cheng Laboratory", "project": "", "github": "https://github.com/paTRICK-swk/D3DP", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Shan_Diffusion-Based_3D_Human_Pose_Estimation_with_Multi-Hypothesis_Aggregation_ICCV_2023_supplemental.pdf", @@ -15089,14 +15597,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shan_Diffusion-Based_3D_Human_Pose_Estimation_with_Multi-Hypothesis_Aggregation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;0;0+3;0+3;0+3", - "aff_unique_norm": "Peking University;Huawei;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "National 
Engineering Research Center of Visual Technology;Noah\u2019s Ark Lab;;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Huawei;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "National Engineering Research Center of Visual Technology;Noah’s Ark Lab;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.huawei.com;http://www.ucas.ac.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "PKU;Huawei;UCAS;PCL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shan_2023_ICCV,\n \n author = {\n Shan,\n Wenkang and Liu,\n Zhenhua and Zhang,\n Xinfeng and Wang,\n Zhao and Han,\n Kai and Wang,\n Shanshe and Ma,\n Siwei and Gao,\n Wen\n},\n title = {\n Diffusion-Based 3D Human Pose Estimation with Multi-Hypothesis Aggregation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14761-14771\n} \n}" }, { "title": "Diffusion-Guided Reconstruction of Everyday Hand-Object Interaction Clips", @@ -15128,7 +15637,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Yufei and Hebbar,\n Poorvi and Gupta,\n Abhinav and Tulsiani,\n Shubham\n},\n title = {\n Diffusion-Guided Reconstruction of Everyday Hand-Object Interaction Clips\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19717-19728\n} \n}" }, { "title": "Diffusion-SDF: Conditional Generative Modeling of Signed Distance Functions", @@ -15151,7 +15661,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Chou_Diffusion-SDF_Conditional_Generative_Modeling_of_Signed_Distance_Functions_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chou_Diffusion-SDF_Conditional_Generative_Modeling_of_Signed_Distance_Functions_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chou_2023_ICCV,\n \n author = {\n Chou,\n Gene and Bahat,\n Yuval and Heide,\n Felix\n},\n title = {\n Diffusion-SDF: Conditional Generative Modeling of Signed Distance Functions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2262-2272\n} \n}" }, { "title": "Diffusion-based Image Translation with Label Guidance for Domain Adaptive Semantic Segmentation", @@ -15183,7 +15694,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", - "aff_country_unique": "Singapore;United States;Australia" + "aff_country_unique": "Singapore;United States;Australia", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Duo and Hu,\n Ping and Ke,\n Qiuhong and Liu,\n Jun\n},\n title = {\n Diffusion-based Image Translation with Label Guidance for Domain Adaptive Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 808-820\n} \n}" }, { "title": "DiffusionDet: Diffusion Model for Object Detection", @@ -15208,14 +15720,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_DiffusionDet_Diffusion_Model_for_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;1+2;0+3", - "aff_unique_norm": "University of Hong Kong;Tencent;Fudan University;Shanghai AI Laboratory", + "aff_unique_norm": "The University of Hong Kong;Tencent;Fudan University;Shanghai AI Laboratory", "aff_unique_dep": ";Tencent AI 
Lab;AI3Institute;", "aff_unique_url": "https://www.hku.hk;https://ai.tencent.com;https://www.fudan.edu.cn;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "HKU;Tencent AI Lab;;SAIL", "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Shoufa and Sun,\n Peize and Song,\n Yibing and Luo,\n Ping\n},\n title = {\n DiffusionDet: Diffusion Model for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19830-19843\n} \n}" }, { "title": "DiffusionRet: Generative Text-Video Retrieval with Diffusion Model", @@ -15238,7 +15751,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jin_DiffusionRet_Generative_Text-Video_Retrieval_with_Diffusion_Model_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jin_DiffusionRet_Generative_Text-Video_Retrieval_with_Diffusion_Model_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Jin_2023_ICCV,\n \n author = {\n Jin,\n Peng and Li,\n Hao and Cheng,\n Zesen and Li,\n Kehan and Ji,\n Xiangyang and Liu,\n Chang and Yuan,\n Li and Chen,\n Jie\n},\n title = {\n DiffusionRet: Generative Text-Video Retrieval with Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2470-2481\n} \n}" }, { "title": "Discovering Spatio-Temporal Rationales for Video Question Answering", @@ -15270,7 +15784,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": 
"@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yicong and Xiao,\n Junbin and Feng,\n Chun and Wang,\n Xiang and Chua,\n Tat-Seng\n},\n title = {\n Discovering Spatio-Temporal Rationales for Video Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13869-13878\n} \n}" }, { "title": "Discrepant and Multi-Instance Proxies for Unsupervised Person Re-Identification", @@ -15282,7 +15797,7 @@ "author": "Chang Zou; Zeqi Chen; Zhichao Cui; Yuehu Liu; Chi Zhang", "abstract": "Most recent unsupervised person re-identification methods maintain a cluster uni-proxy for contrastive learning. However, due to the intra-class variance and inter-class similarity, the cluster uni-proxy is prone to be biased and confused with similar classes, resulting in the learned features lacking intra-class compactness and inter-class separation in the embedding space. To completely and accurately represent the information contained in a cluster and learn discriminative features, we propose to maintain discrepant cluster proxies and multi-instance proxies for a cluster. Each cluster proxy focuses on representing a part of the information, and several discrepant proxies collaborate to represent the entire cluster completely. As a complement to the overall representation, multi-instance proxies are used to accurately represent the fine-grained information contained in the instances of the cluster. Based on the proposed discrepant cluster proxies, we construct cluster contrastive loss to use the proxies as hard positive samples to pull instances of a cluster closer and reduce intra-class variance. Meanwhile, instance contrastive loss is constructed by global hard negative sample mining in multi-instance proxies to push away the truly indistinguishable classes and decrease inter-class similarity. 
Extensive experiments on Market-1501 and MSMT17 demonstrate that the proposed method outperforms state-of-the-art approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zou_Discrepant_and_Multi-Instance_Proxies_for_Unsupervised_Person_Re-Identification_ICCV_2023_paper.pdf", - "aff": "School of Software Engineering, Xi\u2019an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Chang\u2019an University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University", + "aff": "School of Software Engineering, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Chang’an University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zou_Discrepant_and_Multi-Instance_ICCV_2023_supplemental.pdf", @@ -15295,14 +15810,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zou_Discrepant_and_Multi-Instance_Proxies_for_Unsupervised_Person_Re-Identification_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Xi'an Jiao Tong University;Chang'an University", + "aff_unique_norm": "Xi'an Jiaotong University;Chang'an University", "aff_unique_dep": "School of Software Engineering;", "aff_unique_url": "http://www.xjtu.edu.cn;http://www.chang'an.edu.cn", "aff_unique_abbr": "XJTU;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zou_2023_ICCV,\n \n author = {\n Zou,\n Chang and Chen,\n Zeqi and Cui,\n Zhichao and Liu,\n Yuehu and Zhang,\n Chi\n},\n title = 
{\n Discrepant and Multi-Instance Proxies for Unsupervised Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11058-11068\n} \n}" }, { "title": "Discriminative Class Tokens for Text-to-Image Diffusion Models", @@ -15310,8 +15826,8 @@ "status": "Poster", "track": "main", "pid": "11828", - "author_site": "Idan Schwartz, V\u00e9steinn Sn\u00e6bjarnarson, Hila Chefer, Serge Belongie, Lior Wolf, Sagie Benaim", - "author": "Idan Schwartz; V\u00e9steinn Sn\u00e6bjarnarson; Hila Chefer; Serge Belongie; Lior Wolf; Sagie Benaim", + "author_site": "Idan Schwartz, Vésteinn Snæbjarnarson, Hila Chefer, Serge Belongie, Lior Wolf, Sagie Benaim", + "author": "Idan Schwartz; Vésteinn Snæbjarnarson; Hila Chefer; Serge Belongie; Lior Wolf; Sagie Benaim", "abstract": "Recent advances in text-to-image diffusion models have enabled the generation of diverse and high-quality images. While impressive, the images often fall short of depicting subtle details and are susceptible to errors due to ambiguity in the input text. One way of alleviating these issues is to train diffusion models on class-labeled datasets. This approach has two disadvantages: (i) supervised datasets are generally small compared to large-scale scraped text-image datasets on which text-to-image models are trained, affecting the quality and diversity of the generated images, or (ii) the input is a hard-coded label, as opposed to free-form text, limiting the control over the generated images.\n \n In this work, we propose a non-invasive fine-tuning technique that capitalizes on the expressive potential of free-form text while achieving high accuracy through discriminative signals from a pretrained classifier. 
This is done by iteratively modifying the embedding of an added input token of a text-to-image diffusion model, by steering generated images toward a given target class according to a classifier. Our method is fast compared to prior fine-tuning methods and does not require a collection of in-class images or retraining of a noise-tolerant classifier. We evaluate our method extensively, showing that the generated images are: (i) more accurate and of higher quality than standard diffusion models, (ii) can be used to augment training data in a low-resource setting, and (iii) reveal information about the data used to train the guiding classifier. The code is available at https://github.com/idansc/discriminative_class_tokens.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Schwartz_Discriminative_Class_Tokens_for_Text-to-Image_Diffusion_Models_ICCV_2023_paper.pdf", "aff": "Tel Aviv University; University of Copenhagen; Tel Aviv University; University of Copenhagen; Tel Aviv University; University of Copenhagen", @@ -15334,7 +15850,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;1", - "aff_country_unique": "Israel;Denmark" + "aff_country_unique": "Israel;Denmark", + "bibtex": "@InProceedings{Schwartz_2023_ICCV,\n \n author = {\n Schwartz,\n Idan and Sn{\\ae\n}bjarnarson,\n V\\'esteinn and Chefer,\n Hila and Belongie,\n Serge and Wolf,\n Lior and Benaim,\n Sagie\n},\n title = {\n Discriminative Class Tokens for Text-to-Image Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22725-22735\n} \n}" }, { "title": "Disentangle then Parse: Night-time Semantic Segmentation with Illumination Disentanglement", @@ -15366,7 +15883,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Zhixiang and Chen,\n Lin and Tu,\n Tao and Ling,\n Pengyang and Chen,\n Huaian and Jin,\n Yi\n},\n title = {\n Disentangle then Parse: Night-time Semantic Segmentation with Illumination Disentanglement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21593-21603\n} \n}" }, { "title": "Disentangling Spatial and Temporal Learning for Efficient Image-to-Video Transfer Learning", @@ -15398,7 +15916,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Qing_2023_ICCV,\n \n author = {\n Qing,\n Zhiwu and Zhang,\n Shiwei and Huang,\n Ziyuan and Zhang,\n Yingya and Gao,\n Changxin and Zhao,\n Deli and Sang,\n Nong\n},\n title = {\n Disentangling Spatial and Temporal Learning for Efficient Image-to-Video Transfer Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13934-13944\n} \n}" }, { "title": "Disposable Transfer Learning for Selective Source Task Unlearning", @@ -15423,14 +15942,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Koh_Disposable_Transfer_Learning_for_Selective_Source_Task_Unlearning_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Korea Advanced Institute of Science and Technology;LG", + "aff_unique_norm": "Korea Advanced Institute of Science and Technology;LG AI Research", "aff_unique_dep": ";AI Research", "aff_unique_url": "https://www.kaist.ac.kr;https://www.lgaires.com", "aff_unique_abbr": "KAIST;LG AI Research", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Koh_2023_ICCV,\n \n author = {\n Koh,\n Seunghee and Shon,\n Hyounguk and Lee,\n Janghyeon and Hong,\n Hyeong Gwon and Kim,\n Junmo\n},\n title = {\n Disposable Transfer Learning for Selective Source Task Unlearning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11752-11760\n} \n}" }, { "title": "DistillBEV: Boosting Multi-Camera 3D Object Detection with Cross-Modal Knowledge Distillation", @@ -15462,7 +15982,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "1;1;1", - "aff_country_unique": ";United States" + "aff_country_unique": ";United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Zeyu and Li,\n Dingwen and Luo,\n Chenxu and Xie,\n Cihang and Yang,\n Xiaodong\n},\n title = {\n DistillBEV: Boosting Multi-Camera 3D Object Detection with Cross-Modal Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8637-8646\n} \n}" }, { "title": "Distilled Reverse Attention Network for Open-world Compositional Zero-Shot Learning", @@ -15494,7 +16015,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yun and Liu,\n Zhe and Jha,\n Saurav and Yao,\n Lina\n},\n title = {\n Distilled Reverse Attention Network for Open-world Compositional Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages 
= {\n 1782-1791\n} \n}" }, { "title": "Distilling Coarse-to-Fine Semantic Matching Knowledge for Weakly Supervised 3D Visual Grounding", @@ -15526,7 +16048,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Zehan and Huang,\n Haifeng and Zhao,\n Yang and Li,\n Linjun and Cheng,\n Xize and Zhu,\n Yichen and Yin,\n Aoxiong and Zhao,\n Zhou\n},\n title = {\n Distilling Coarse-to-Fine Semantic Matching Knowledge for Weakly Supervised 3D Visual Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2662-2671\n} \n}" }, { "title": "Distilling DETR with Visual-Linguistic Knowledge for Open-Vocabulary Object Detection", @@ -15558,7 +16081,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Liangqi and Miao,\n Jiaxu and Shi,\n Dahu and Tan,\n Wenming and Ren,\n Ye and Yang,\n Yi and Pu,\n Shiliang\n},\n title = {\n Distilling DETR with Visual-Linguistic Knowledge for Open-Vocabulary Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6501-6510\n} \n}" }, { "title": "Distilling Large Vision-Language Model with Out-of-Distribution Generalizability", @@ -15590,7 +16114,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xuanlin and Fang,\n 
Yunhao and Liu,\n Minghua and Ling,\n Zhan and Tu,\n Zhuowen and Su,\n Hao\n},\n title = {\n Distilling Large Vision-Language Model with Out-of-Distribution Generalizability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2492-2503\n} \n}" }, { "title": "Distilling from Similar Tasks for Transfer Learning on a Budget", @@ -15622,7 +16147,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "Denmark;United States" + "aff_country_unique": "Denmark;United States", + "bibtex": "@InProceedings{Borup_2023_ICCV,\n \n author = {\n Borup,\n Kenneth and Phoo,\n Cheng Perng and Hariharan,\n Bharath\n},\n title = {\n Distilling from Similar Tasks for Transfer Learning on a Budget\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11431-11441\n} \n}" }, { "title": "Distracting Downpour: Adversarial Weather Attacks for Motion Estimation", @@ -15630,8 +16156,8 @@ "status": "Poster", "track": "main", "pid": "2332", - "author_site": "Jenny Schmalfuss, Lukas Mehl, Andr\u00e9s Bruhn", - "author": "Jenny Schmalfuss; Lukas Mehl; Andr\u00e9s Bruhn", + "author_site": "Jenny Schmalfuss, Lukas Mehl, Andrés Bruhn", + "author": "Jenny Schmalfuss; Lukas Mehl; Andrés Bruhn", "abstract": "Current adversarial attacks on motion estimation, or optical flow, optimize small per-pixel perturbations, which are unlikely to appear in the real world. In contrast, adverse weather conditions constitute a much more realistic threat scenario. Hence, in this work, we present a novel attack on motion estimation that exploits adversarially optimized particles to mimic weather effects like snowflakes, rain streaks or fog clouds. 
At the core of our attack framework is a differentiable particle rendering system that integrates particles (i) consistently over multiple time steps (ii) into the 3D space (iii) with a photo-realistic appearance. Through optimization, we obtain adversarial weather that significantly impacts the motion estimation. Surprisingly, methods that previously showed good robustness towards small per-pixel perturbations are particularly vulnerable to adversarial weather. At the same time, augmenting the training with non-optimized weather increases a method's robustness towards weather effects and improves generalizability at almost no additional cost. Our code is available at https://github.com/cv-stuttgart/DistractingDownpour.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Schmalfuss_Distracting_Downpour_Adversarial_Weather_Attacks_for_Motion_Estimation_ICCV_2023_paper.pdf", "aff": ";;", @@ -15645,7 +16171,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Schmalfuss_Distracting_Downpour_Adversarial_Weather_Attacks_for_Motion_Estimation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Schmalfuss_Distracting_Downpour_Adversarial_Weather_Attacks_for_Motion_Estimation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Schmalfuss_2023_ICCV,\n \n author = {\n Schmalfuss,\n Jenny and Mehl,\n Lukas and Bruhn,\n Andr\\'es\n},\n title = {\n Distracting Downpour: Adversarial Weather Attacks for Motion Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10106-10116\n} \n}" }, { "title": "Distributed Bundle Adjustment with Block-Based Sparse Matrix Compression for Super Large Scale Datasets", @@ -15677,7 +16204,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Wuhan;", "aff_country_unique_index": "0;0;0;0;2", - 
"aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Maoteng and Chen,\n Nengcheng and Zhu,\n Junfeng and Zeng,\n Xiaoru and Qiu,\n Huanbin and Jiang,\n Yuyao and Lu,\n Xingyue and Qu,\n Hao\n},\n title = {\n Distributed Bundle Adjustment with Block-Based Sparse Matrix Compression for Super Large Scale Datasets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18152-18162\n} \n}" }, { "title": "Distribution Shift Matters for Knowledge Distillation with Webly Collected Images", @@ -15702,14 +16230,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tang_Distribution_Shift_Matters_for_Knowledge_Distillation_with_Webly_Collected_Images_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;3;3;4;0+1+2", - "aff_unique_norm": "Nanjing University of Science and Technology;Ministry of Education, China;Jiangsu Key Laboratory of Image and Video Understanding for Social Security;RIKEN;University of Tokyo", + "aff_unique_norm": "Nanjing University of Science and Technology;Ministry of Education, China;Jiangsu Key Laboratory of Image and Video Understanding for Social Security;RIKEN;The University of Tokyo", "aff_unique_dep": "School of Computer Science and Engineering;Key Laboratory of Intelligent Perception and Systems for High-Dimensional Information;Image and Video Understanding for Social Security;Center for Advanced Intelligence Project;Graduate School of Frontier Sciences", "aff_unique_url": ";;;https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": ";;;RIKEN;UTokyo", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0+0+0;1;1;1;0+0+0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n 
author = {\n Tang,\n Jialiang and Chen,\n Shuo and Niu,\n Gang and Sugiyama,\n Masashi and Gong,\n Chen\n},\n title = {\n Distribution Shift Matters for Knowledge Distillation with Webly Collected Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17470-17480\n} \n}" }, { "title": "Distribution-Aligned Diffusion for Human Mesh Recovery", @@ -15741,7 +16270,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Singapore;United Kingdom" + "aff_country_unique": "Singapore;United Kingdom", + "bibtex": "@InProceedings{Foo_2023_ICCV,\n \n author = {\n Foo,\n Lin Geng and Gong,\n Jia and Rahmani,\n Hossein and Liu,\n Jun\n},\n title = {\n Distribution-Aligned Diffusion for Human Mesh Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9221-9232\n} \n}" }, { "title": "Distribution-Aware Prompt Tuning for Vision-Language Models", @@ -15773,7 +16303,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cho_2023_ICCV,\n \n author = {\n Cho,\n Eulrang and Kim,\n Jooyeon and Kim,\n Hyunwoo J\n},\n title = {\n Distribution-Aware Prompt Tuning for Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22004-22013\n} \n}" }, { "title": "Distribution-Consistent Modal Recovering for Incomplete Multimodal Learning", @@ -15805,7 +16336,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yuanzhi and Cui,\n Zhen and Li,\n Yong\n},\n title = {\n Distribution-Consistent Modal Recovering for Incomplete Multimodal Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22025-22034\n} \n}" }, { "title": "Diverse Cotraining Makes Strong Semi-Supervised Segmentor", @@ -15828,7 +16360,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Diverse_Cotraining_Makes_Strong_Semi-Supervised_Segmentor_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Diverse_Cotraining_Makes_Strong_Semi-Supervised_Segmentor_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yijiang and Wang,\n Xinjiang and Yang,\n Lihe and Feng,\n Litong and Zhang,\n Wayne and Gao,\n Ying\n},\n title = {\n Diverse Cotraining Makes Strong Semi-Supervised Segmentor\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16055-16067\n} \n}" }, { "title": "Diverse Data Augmentation with Diffusions for Effective Test-time Prompt Tuning", @@ -15860,7 +16393,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Canberra;Harbin", "aff_country_unique_index": "0;0;0;1+2;3", - "aff_country_unique": "Singapore;United Arab Emirates;Australia;China" + "aff_country_unique": "Singapore;United Arab Emirates;Australia;China", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Chun-Mei and Yu,\n Kai and Liu,\n Yong and Khan,\n Salman and Zuo,\n Wangmeng\n},\n title = {\n Diverse Data Augmentation with Diffusions for Effective Test-time Prompt Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2704-2714\n} \n}" }, { "title": "Diverse Inpainting and Editing with GAN Inversion", @@ -15892,7 +16426,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "T\u00fcrkiye" + "aff_country_unique": "Turkey", + "bibtex": "@InProceedings{Yildirim_2023_ICCV,\n \n author = {\n Yildirim,\n Ahmet Burak and Pehlivan,\n Hamza and Bilecen,\n Bahri Batuhan and Dundar,\n Aysegul\n},\n title = {\n Diverse Inpainting and Editing with GAN Inversion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23120-23130\n} \n}" }, { "title": "Divide and Conquer: 3D Point Cloud Instance Segmentation With Point-Wise Binarization", @@ -15904,7 +16439,7 @@ "author": "Weiguang Zhao; Yuyao Yan; Chaolong Yang; Jianan Ye; Xi Yang; Kaizhu Huang", "abstract": "Instance segmentation on point clouds is crucially important for 3D scene understanding. Most SOTAs adopt distance clustering, which is typically effective but does not perform well in segmenting adjacent objects with the same semantic label (especially when they share neighboring points). Due to the uneven distribution of offset points, these existing methods can hardly cluster all instance points. To this end, we design a novel divide-and-conquer strategy named PBNet that binarizes each point and clusters them separately to segment instances. Our binary clustering divides offset instance points into two categories: high and low density points (HPs vs. LPs). Adjacent objects can be clearly separated by removing LPs, and then be completed and refined by assigning LPs via a neighbor voting method. To suppress potential over-segmentation, we propose to construct local scenes with the weight mask for each instance. 
As a plug-in, the proposed binary clustering can replace the traditional distance clustering and lead to consistent performance gains on many mainstream baselines. A series of experiments on ScanNetV2 and S3DIS datasets indicate the superiority of our model. In particular, PBNet ranks first on the ScanNetV2 official benchmark challenge, achieving the highest mAP. Code will be available publicly at https://github.com/weiguangzhao/PBNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhao_Divide_and_Conquer_3D_Point_Cloud_Instance_Segmentation_With_Point-Wise_ICCV_2023_paper.pdf", - "aff": "Duke Kunshan University; Xi\u2019an Jiaotong-Liverpool University; Duke Kunshan University; Xi\u2019an Jiaotong-Liverpool University; Xi\u2019an Jiaotong-Liverpool University; Duke Kunshan University", + "aff": "Duke Kunshan University; Xi’an Jiaotong-Liverpool University; Duke Kunshan University; Xi’an Jiaotong-Liverpool University; Xi’an Jiaotong-Liverpool University; Duke Kunshan University", "project": "", "github": "https://github.com/weiguangzhao/PBNet", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhao_Divide_and_Conquer_ICCV_2023_supplemental.pdf", @@ -15917,14 +16452,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Divide_and_Conquer_3D_Point_Cloud_Instance_Segmentation_With_Point-Wise_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1;1;0", - "aff_unique_norm": "Duke Kunshan University;Xi'an Jiao Tong-Liverpool University", + "aff_unique_norm": "Duke Kunshan University;Xi'an Jiaotong-Liverpool University", "aff_unique_dep": ";", - "aff_unique_url": "https://www.duk/Dk.edu;https://www.xjtu.edu.cn", + "aff_unique_url": "https://www.dukekunshan.edu.cn;https://www.xjtu.edu.cn", "aff_unique_abbr": "DKU;XJTLU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Kunshan;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Weiguang and Yan,\n Yuyao and Yang,\n Chaolong and Ye,\n Jianan and Yang,\n Xi and Huang,\n Kaizhu\n},\n title = {\n Divide and Conquer: 3D Point Cloud Instance Segmentation With Point-Wise Binarization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 562-571\n} \n}" }, { "title": "Divide and Conquer: a Two-Step Method for High Quality Face De-identification with Model Explainability", @@ -15947,7 +16483,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wen_Divide_and_Conquer_a_Two-Step_Method_for_High_Quality_Face_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wen_Divide_and_Conquer_a_Two-Step_Method_for_High_Quality_Face_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wen_2023_ICCV,\n \n author = {\n Wen,\n Yunqian and Liu,\n Bo and Cao,\n Jingyi and Xie,\n Rong and Song,\n Li\n},\n title = {\n Divide and Conquer: a Two-Step Method for High Quality Face De-identification with Model Explainability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5148-5157\n} \n}" }, { "title": "Divide&Classify: Fine-Grained Classification for City-Wide Visual Geo-Localization", @@ -15979,7 +16516,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Trivigno_2023_ICCV,\n \n author = {\n Trivigno,\n Gabriele and Berton,\n Gabriele and Aragon,\n Juan and Caputo,\n Barbara and Masone,\n Carlo\n},\n title = {\n Divide\\&Classify: Fine-Grained Classification for City-Wide Visual Geo-Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11142-11152\n} \n}" }, { "title": "Do DALL-E and Flamingo Understand Each Other?", @@ -16011,7 +16549,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Munich;", "aff_country_unique_index": "0+0;1;0;0;0+0", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Hang and Gu,\n Jindong and Koner,\n Rajat and Sharifzadeh,\n Sahand and Tresp,\n Volker\n},\n title = {\n Do DALL-E and Flamingo Understand Each Other?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1999-2010\n} \n}" }, { "title": "DocTr: Document Transformer for Structured Information Extraction in Documents", @@ -16035,15 +16574,16 @@ "email": "amazon.com; ; ; ; ; ; ; ; ", "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liao_DocTr_Document_Transformer_for_Structured_Information_Extraction_in_Documents_ICCV_2023_paper.html", - "aff_unique_index": "0;1;0;0;0;0;0;0;0", - "aff_unique_norm": "Amazon;MathWorks", - "aff_unique_dep": "AWS AI Labs;", - "aff_unique_url": "https://aws.amazon.com;https://www.mathworks.com", - "aff_unique_abbr": "AWS;MathWorks", + "aff_unique_index": "0;1;2;0;0;0;0;0;0", + "aff_unique_norm": "Amazon Web Services;MathWorks;Amazon", + "aff_unique_dep": "AWS AI Labs;;Physical Stores", + "aff_unique_url": "https://aws.amazon.com;https://www.mathworks.com;https://www.amazon.com", + "aff_unique_abbr": "AWS;MathWorks;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liao_2023_ICCV,\n \n author = {\n Liao,\n Haofu and RoyChowdhury,\n Aruni 
and Li,\n Weijian and Bansal,\n Ankan and Zhang,\n Yuting and Tu,\n Zhuowen and Satzoda,\n Ravi Kumar and Manmatha,\n R. and Mahadevan,\n Vijay\n},\n title = {\n DocTr: Document Transformer for Structured Information Extraction in Documents\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19584-19594\n} \n}" }, { "title": "Document Understanding Dataset and Evaluation (DUDE)", @@ -16051,11 +16591,11 @@ "status": "Poster", "track": "main", "pid": "12222", - "author_site": "Jordy Van Landeghem, Rub\u00e8n Tito, ?ukasz Borchmann, Micha? Pietruszka, Pawel Joziak, Rafal Powalski, Dawid Jurkiewicz, Mickael Coustaty, Bertrand Anckaert, Ernest Valveny, Matthew Blaschko, Sien Moens, Tomasz Stanislawek", - "author": "Jordy Van Landeghem; Rub\u00e8n Tito; \u0141ukasz Borchmann; Micha\u0142 Pietruszka; Pawel Joziak; Rafal Powalski; Dawid Jurkiewicz; Mickael Coustaty; Bertrand Anckaert; Ernest Valveny; Matthew Blaschko; Sien Moens; Tomasz Stanislawek", + "author_site": "Jordy Van Landeghem, Rubèn Tito, ?ukasz Borchmann, Micha? Pietruszka, Pawel Joziak, Rafal Powalski, Dawid Jurkiewicz, Mickael Coustaty, Bertrand Anckaert, Ernest Valveny, Matthew Blaschko, Sien Moens, Tomasz Stanislawek", + "author": "Jordy Van Landeghem; Rubèn Tito; Łukasz Borchmann; Michał Pietruszka; Pawel Joziak; Rafal Powalski; Dawid Jurkiewicz; Mickael Coustaty; Bertrand Anckaert; Ernest Valveny; Matthew Blaschko; Sien Moens; Tomasz Stanislawek", "abstract": "We call on the Document AI (DocAI) community to re-evaluate current methodologies and embrace the challenge of creating more practically-oriented benchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to remediate the halted research progress in understanding visually-rich documents (VRDs). 
We present a new dataset with novelties related to types of questions, answers, and document layouts based on multi-industry, multi-domain, and multi-page VRDs of various origins and dates. Moreover, we are pushing the boundaries of current methods by creating multi-task and multi-domain evaluation setups that more accurately simulate real-world situations where powerful generalization and adaptation under low-resource settings are desired. DUDE aims to set a new standard as a more practical, long-standing benchmark for the community, and we hope that it will lead to future extensions and contributions that address real-world challenges. Finally, our work illustrates the importance of finding more efficient ways to model language, images, and layout in DocAI.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Van_Landeghem_Document_Understanding_Dataset_and_Evaluation_DUDE_ICCV_2023_paper.pdf", - "aff": "KU Leuven+Contract.fit; Computer Vision Center, Universitat Aut\u00f2noma de Barcelona; Snowflake; Warsaw University of Technology+Snowflake; Snowflake+Warsaw University of Technology; Instabase; Snowflake+Adam Mickiewicz University; University of La Rochelle; Contract.fit; Computer Vision Center, Universitat Aut\u00f2noma de Barcelona; KU Leuven; KU Leuven; Snowflake", + "aff": "KU Leuven+Contract.fit; Computer Vision Center, Universitat Autònoma de Barcelona; Snowflake; Warsaw University of Technology+Snowflake; Snowflake+Warsaw University of Technology; Instabase; Snowflake+Adam Mickiewicz University; University of La Rochelle; Contract.fit; Computer Vision Center, Universitat Autònoma de Barcelona; KU Leuven; KU Leuven; Snowflake", "project": "huggingface.co/datasets/jordyvl/DUDE_loader", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Van_Landeghem_Document_Understanding_Dataset_ICCV_2023_supplemental.pdf", @@ -16068,14 +16608,15 @@ "author_num": 13, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Van_Landeghem_Document_Understanding_Dataset_and_Evaluation_DUDE_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;4+3;3+4;5;3+6;7;1;2;0;0;3", - "aff_unique_norm": "Katholieke Universiteit Leuven;Contract.fit;Universitat Aut\u00f2noma de Barcelona;Snowflake Inc.;Warsaw University of Technology;Instabase;Adam Mickiewicz University;University of La Rochelle", + "aff_unique_norm": "Katholieke Universiteit Leuven;Contract.fit;Universitat Autònoma de Barcelona;Snowflake Inc.;Warsaw University of Technology;Instabase;Adam Mickiewicz University;University of La Rochelle", "aff_unique_dep": ";;Computer Vision Center;;;;;", "aff_unique_url": "https://www.kuleuven.be;;https://www.uab.cat;https://www.snowflake.com;https://www.pw.edu.pl;https://www.instabase.com;https://www.amu.edu.pl;https://www.univ-larochelle.fr", "aff_unique_abbr": "KU Leuven;;UAB;Snowflake;WUT;;AMU;ULR", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0;2;3;4+3;3+4;3;3+4;5;2;0;0;3", - "aff_country_unique": "Belgium;;Spain;United States;Poland;France" + "aff_country_unique": "Belgium;;Spain;United States;Poland;France", + "bibtex": "@InProceedings{Van_Landeghem_2023_ICCV,\n \n author = {\n Van Landeghem,\n Jordy and Tito,\n Rub\\`en and Borchmann,\n {\\L\n}ukasz and Pietruszka,\n Micha{\\l\n} and Joziak,\n Pawel and Powalski,\n Rafal and Jurkiewicz,\n Dawid and Coustaty,\n Mickael and Anckaert,\n Bertrand and Valveny,\n Ernest and Blaschko,\n Matthew and Moens,\n Sien and Stanislawek,\n Tomasz\n},\n title = {\n Document Understanding Dataset and Evaluation (DUDE)\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19528-19540\n} \n}" }, { "title": "Does Physical Adversarial Example Really Matter to Autonomous Driving? 
Towards System-Level Effect of Adversarial Object Evasion Attack", @@ -16107,7 +16648,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Irvine;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ningfei and Luo,\n Yunpeng and Sato,\n Takami and Xu,\n Kaidi and Chen,\n Qi Alfred\n},\n title = {\n Does Physical Adversarial Example Really Matter to Autonomous Driving? Towards System-Level Effect of Adversarial Object Evasion Attack\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4412-4423\n} \n}" }, { "title": "Domain Adaptive Few-Shot Open-Set Learning", @@ -16139,7 +16681,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Bombay;", "aff_country_unique_index": "0;0;0;1;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Pal_2023_ICCV,\n \n author = {\n Pal,\n Debabrata and More,\n Deeptej and Bhargav,\n Sai and Tamboli,\n Dipesh and Aggarwal,\n Vaneet and Banerjee,\n Biplab\n},\n title = {\n Domain Adaptive Few-Shot Open-Set Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18831-18840\n} \n}" }, { "title": "Domain Generalization Guided by Gradient Signal to Noise Ratio of Parameters", @@ -16164,14 +16707,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Michalkiewicz_Domain_Generalization_Guided_by_Gradient_Signal_to_Noise_Ratio_of_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+1;1+3;0", - "aff_unique_norm": "University of Queensland;NEC Labs America;Amazon;University of California, San Diego", - "aff_unique_dep": ";;Amazon.com, Inc.;", + 
"aff_unique_norm": "University of Queensland;NEC Labs America;Amazon.com, Inc.;University of California, San Diego", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uq.edu.au;https://www.nec-labs.com;https://www.amazon.com;https://www.ucsd.edu", "aff_unique_abbr": "UQ;NEC LA;Amazon;UCSD", "aff_campus_unique_index": ";1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;1+1;1+1;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Michalkiewicz_2023_ICCV,\n \n author = {\n Michalkiewicz,\n Mateusz and Faraki,\n Masoud and Yu,\n Xiang and Chandraker,\n Manmohan and Baktashmotlagh,\n Mahsa\n},\n title = {\n Domain Generalization Guided by Gradient Signal to Noise Ratio of Parameters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6177-6188\n} \n}" }, { "title": "Domain Generalization of 3D Semantic Segmentation in Autonomous Driving", @@ -16179,8 +16723,8 @@ "status": "Poster", "track": "main", "pid": "10913", - "author_site": "Jules Sanchez, Jean-Emmanuel Deschaud, Fran\u00e7ois Goulette", - "author": "Jules Sanchez; Jean-Emmanuel Deschaud; Fran\u00e7ois Goulette", + "author_site": "Jules Sanchez, Jean-Emmanuel Deschaud, François Goulette", + "author": "Jules Sanchez; Jean-Emmanuel Deschaud; François Goulette", "abstract": "Using deep learning, 3D autonomous driving semantic segmentation has become a well-studied subject, with methods that can reach very high performance. Nonetheless, because of the limited size of the training datasets, these models cannot see every type of object and scene found in real-world applications. The ability to be reliable in these various unknown environments is called domain generalization. 
Despite its importance, domain generalization is relatively unexplored in the case of 3D autonomous driving semantic segmentation. To fill this gap, this paper presents the first benchmark for this application by testing state-of-the-art methods and discussing the difficulty of tackling Laser Imaging Detection and Ranging (LiDAR) domain shifts. We also propose the first method designed to address this domain generalization, which we call 3DLabelProp. This method relies on leveraging the geometry and sequentiality of the LiDAR data to enhance its generalization performances by working on partially accumulated point clouds. It reaches a mean Intersection over Union (mIoU) of 50.4% on SemanticPOSS and of 55.2% on PandaSet solid-state LiDAR while being trained only on SemanticKITTI, making it the state-of-the-art method for generalization (+5% and +33% better, respectively, than the second best method).", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sanchez_Domain_Generalization_of_3D_Semantic_Segmentation_in_Autonomous_Driving_ICCV_2023_paper.pdf", "aff": ";;", @@ -16194,7 +16738,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sanchez_Domain_Generalization_of_3D_Semantic_Segmentation_in_Autonomous_Driving_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sanchez_Domain_Generalization_of_3D_Semantic_Segmentation_in_Autonomous_Driving_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Sanchez_2023_ICCV,\n \n author = {\n Sanchez,\n Jules and Deschaud,\n Jean-Emmanuel and Goulette,\n Fran\\c{c\n}ois\n},\n title = {\n Domain Generalization of 3D Semantic Segmentation in Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18077-18087\n} \n}" }, { "title": "Domain Generalization via Balancing Training Difficulty and Model 
Capability", @@ -16226,7 +16771,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Xueying and Huang,\n Jiaxing and Jin,\n Sheng and Lu,\n Shijian\n},\n title = {\n Domain Generalization via Balancing Training Difficulty and Model Capability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18993-19003\n} \n}" }, { "title": "Domain Generalization via Rationale Invariance", @@ -16234,6 +16780,7 @@ "status": "Poster", "track": "main", "pid": "6096", + "author_site": "Liang Chen, Yong Zhang, Yibing Song, Anton van den Hengel, Lingqiao Liu", "author": "Liang Chen, Yong Zhang, Yibing Song, Anton van den Hengel, Lingqiao Liu", "abstract": "This paper offers a new perspective to ease the challenge of domain generalization, which involves maintaining robust results even in unseen environments. Our design focuses on the decision-making process in the final classifier layer. Specifically, we propose treating the element-wise contributions to the final results as the rationale for making a decision and representing the rationale for each sample as a matrix. For a well-generalized model, we suggest the rationale matrices for samples belonging to the same category should be similar, indicating the model relies on domain-invariant clues to make decisions, thereby ensuring robust results. To implement this idea, we introduce a rationale invariance loss as a simple regularization technique, requiring only a few lines of code. Our experiments demonstrate that the proposed approach achieves competitive results across various datasets, despite its simplicity. 
Code is available at https://github.com/liangchen527/RIDG.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_Domain_Generalization_via_Rationale_Invariance_ICCV_2023_paper.pdf", @@ -16245,7 +16792,8 @@ "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12388549440240009703&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Domain_Generalization_via_Rationale_Invariance_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Domain_Generalization_via_Rationale_Invariance_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Liang and Zhang,\n Yong and Song,\n Yibing and van den Hengel,\n Anton and Liu,\n Lingqiao\n},\n title = {\n Domain Generalization via Rationale Invariance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1751-1760\n} \n}" }, { "title": "Domain Specified Optimization for Deployment Authorization", @@ -16277,7 +16825,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Haotian and Chi,\n Haoang and Yang,\n Wenjing and Lin,\n Zhipeng and Geng,\n Mingyang and Lan,\n Long and Zhang,\n Jing and Tao,\n Dacheng\n},\n title = {\n Domain Specified Optimization for Deployment Authorization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5095-5105\n} \n}" }, { "title": "Domain-Specificity Inducing Transformers for Source-Free Domain Adaptation", @@ -16300,7 +16849,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - 
"oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sanyal_Domain-Specificity_Inducing_Transformers_for_Source-Free_Domain_Adaptation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sanyal_Domain-Specificity_Inducing_Transformers_for_Source-Free_Domain_Adaptation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Sanyal_2023_ICCV,\n \n author = {\n Sanyal,\n Sunandini and Asokan,\n Ashish Ramayee and Bhambri,\n Suvaansh and Kulkarni,\n Akshay and Kundu,\n Jogendra Nath and Babu,\n R Venkatesh\n},\n title = {\n Domain-Specificity Inducing Transformers for Source-Free Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18928-18937\n} \n}" }, { "title": "DomainAdaptor: A Novel Approach to Test-time Adaptation", @@ -16332,7 +16882,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jian and Qi,\n Lei and Shi,\n Yinghuan and Gao,\n Yang\n},\n title = {\n DomainAdaptor: A Novel Approach to Test-time Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18971-18981\n} \n}" }, { "title": "DomainDrop: Suppressing Domain-Sensitive Channels for Domain Generalization", @@ -16364,7 +16915,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Jintao and Qi,\n Lei and Shi,\n Yinghuan\n},\n title = {\n DomainDrop: Suppressing Domain-Sensitive Channels for Domain Generalization\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19114-19124\n} \n}" }, { "title": "Doppelgangers: Learning to Disambiguate Images of Similar Structures", @@ -16387,7 +16939,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_Doppelgangers_Learning_to_Disambiguate_Images_of_Similar_Structures_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_Doppelgangers_Learning_to_Disambiguate_Images_of_Similar_Structures_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Ruojin and Tung,\n Joseph and Wang,\n Qianqian and Averbuch-Elor,\n Hadar and Hariharan,\n Bharath and Snavely,\n Noah\n},\n title = {\n Doppelgangers: Learning to Disambiguate Images of Similar Structures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 34-44\n} \n}" }, { "title": "Downscaled Representation Matters: Improving Image Rescaling with Collaborative Downscaled Images", @@ -16419,7 +16972,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Bingna and Guo,\n Yong and Jiang,\n Luoqian and Yu,\n Mianjie and Chen,\n Jian\n},\n title = {\n Downscaled Representation Matters: Improving Image Rescaling with Collaborative Downscaled Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12237-12247\n} \n}" }, { "title": "Downstream-agnostic Adversarial Examples", @@ -16445,13 +16999,14 @@ "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_Downstream-agnostic_Adversarial_Examples_ICCV_2023_paper.html", "aff_unique_index": "0+1+2+3+4;0+1+2+3+4;0+1+2+3+4;5;6;7;0+8", "aff_unique_norm": "Huazhong University of Science and Technology;National Engineering Research Center for Big Data Technology and System;Services Computing Technology and System Lab;Hubei Key Laboratory of Distributed System Security;Hubei Engineering Research Center on Big Data Security;Wuhan University;Griffith University;City University of Hong Kong;Cluster and Grid Computing Lab", - "aff_unique_dep": "School of Cyber Science and Engineering;;Services Computing Technology and System;Distributed System Security;Engineering Research Center on Big Data Security;School of Cyber Science and Engineering;School of Information and Communication Technology;Department of Computer Science;Computer Science", + "aff_unique_dep": "School of Cyber Science and Engineering;;Services Computing Technology and System;Distributed System Security;Engineering Research Center on Big Data Security;School of Cyber Science and Engineering;School of Information and Communication Technology;Department of Computer Science;", "aff_unique_url": "http://www.hust.edu.cn;;;;;http://www.whu.edu.cn/;https://www.griffith.edu.au;https://www.cityu.edu.hk;", "aff_unique_abbr": "HUST;;;;;WHU;;CityU;", - "aff_campus_unique_index": ";;;1;", - "aff_campus_unique": ";Hong Kong SAR", + "aff_campus_unique_index": ";;;1;2;", + "aff_campus_unique": ";Wuhan;Hong Kong SAR", "aff_country_unique_index": "0+0+0+0;0+0+0+0;0+0+0+0;0;2;0;0", - "aff_country_unique": "China;;Australia" + "aff_country_unique": "China;;Australia", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Ziqi and Hu,\n Shengshan and Zhao,\n Ruizhi and Wang,\n Qian and Zhang,\n Leo Yu and Hou,\n Junhui and Jin,\n Hai\n},\n title = {\n Downstream-agnostic Adversarial Examples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4345-4355\n} \n}" }, { "title": "DreamBooth3D: Subject-Driven Text-to-3D Generation", @@ -16474,7 +17029,8 @@ "aff_domain": ";;;;;;;;;;;", "email": ";;;;;;;;;;;", "author_num": 12, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Raj_DreamBooth3D_Subject-Driven_Text-to-3D_Generation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Raj_DreamBooth3D_Subject-Driven_Text-to-3D_Generation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Raj_2023_ICCV,\n \n author = {\n Raj,\n Amit and Kaza,\n Srinivas and Poole,\n Ben and Niemeyer,\n Michael and Ruiz,\n Nataniel and Mildenhall,\n Ben and Zada,\n Shiran and Aberman,\n Kfir and Rubinstein,\n Michael and Barron,\n Jonathan and Li,\n Yuanzhen and Jampani,\n Varun\n},\n title = {\n DreamBooth3D: Subject-Driven Text-to-3D Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2349-2359\n} \n}" }, { "title": "DreamPose: Fashion Video Synthesis with Stable Diffusion", @@ -16497,7 +17053,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Karras_DreamPose_Fashion_Video_Synthesis_with_Stable_Diffusion_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Karras_DreamPose_Fashion_Video_Synthesis_with_Stable_Diffusion_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Karras_2023_ICCV,\n \n author = {\n Karras,\n Johanna and Holynski,\n Aleksander and Wang,\n Ting-Chun and Kemelmacher-Shlizerman,\n Ira\n},\n title = {\n DreamPose: Fashion Video Synthesis with Stable Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22680-22690\n} \n}" }, { 
"title": "DreamTeacher: Pretraining Image Backbones with Deep Generative Models", @@ -16522,14 +17079,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_DreamTeacher_Pretraining_Image_Backbones_with_Deep_Generative_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0+1+2;0+1+2;0+1+2;0+1+2;0;3;0+1+2", - "aff_unique_norm": "NVIDIA;University of Toronto;Vector Institute;Massachusetts Institute of Technology", - "aff_unique_dep": "NVIDIA Corporation;;;", + "aff_unique_norm": "NVIDIA Corporation;University of Toronto;Vector Institute;Massachusetts Institute of Technology", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nvidia.com;https://www.utoronto.ca;https://vectorinstitute.ai/;https://web.mit.edu", "aff_unique_abbr": "NVIDIA;U of T;Vector Institute;MIT", "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1+1;0+1+1;0+1+1;0+1+1;0;0;0+1+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Daiqing and Ling,\n Huan and Kar,\n Amlan and Acuna,\n David and Kim,\n Seung Wook and Kreis,\n Karsten and Torralba,\n Antonio and Fidler,\n Sanja\n},\n title = {\n DreamTeacher: Pretraining Image Backbones with Deep Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16698-16708\n} \n}" }, { "title": "DriveAdapter: Breaking the Coupling Barrier of Perception and Planning in End-to-End Autonomous Driving", @@ -16552,7 +17110,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jia_DriveAdapter_Breaking_the_Coupling_Barrier_of_Perception_and_Planning_in_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Jia_DriveAdapter_Breaking_the_Coupling_Barrier_of_Perception_and_Planning_in_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Jia_2023_ICCV,\n \n author = {\n Jia,\n Xiaosong and Gao,\n Yulu and Chen,\n Li and Yan,\n Junchi and Liu,\n Patrick Langechuan and Li,\n Hongyang\n},\n title = {\n DriveAdapter: Breaking the Coupling Barrier of Perception and Planning in End-to-End Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7953-7963\n} \n}" }, { "title": "Dual Aggregation Transformer for Image Super-Resolution", @@ -16564,7 +17123,7 @@ "author": "Zheng Chen; Yulun Zhang; Jinjin Gu; Linghe Kong; Xiaokang Yang; Fisher Yu", "abstract": "Transformer has recently gained considerable popularity in low-level vision tasks, including image super-resolution (SR). These networks utilize self-attention along different dimensions, spatial or channel, and achieve impressive performance. This inspires us to combine the two dimensions in Transformer for a more powerful representation capability. Based on the above idea, we propose a novel Transformer model, Dual Aggregation Transformer (DAT), for image SR. Our DAT aggregates features across spatial and channel dimensions, in the inter-block and intra-block dual manner. Specifically, we alternately apply spatial and channel self-attention in consecutive Transformer blocks. The alternate strategy enables DAT to capture the global context and realize inter-block feature aggregation. Furthermore, we propose the adaptive interaction module (AIM) and the spatial-gate feed-forward network (SGFN) to achieve intra-block feature aggregation. AIM complements two self-attention mechanisms from corresponding dimensions. Meanwhile, SGFN introduces additional non-linear spatial information in the feed-forward network. 
Extensive experiments show that our DAT surpasses current methods. Code and models are obtainable at https://github.com/zhengchen1999/DAT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_Dual_Aggregation_Transformer_for_Image_Super-Resolution_ICCV_2023_paper.pdf", - "aff": "Shanghai Jiao Tong University; ETH Z\u00fcrich; The University of Sydney+Shanghai AI Laboratory; Shanghai Jiao Tong University; Shanghai Jiao Tong University; ETH Z\u00fcrich", + "aff": "Shanghai Jiao Tong University; ETH Zürich; The University of Sydney+Shanghai AI Laboratory; Shanghai Jiao Tong University; Shanghai Jiao Tong University; ETH Zürich", "project": "", "github": "https://github.com/zhengchen1999/DAT", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Chen_Dual_Aggregation_Transformer_ICCV_2023_supplemental.pdf", @@ -16577,14 +17136,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Dual_Aggregation_Transformer_for_Image_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+3;0;0;1", - "aff_unique_norm": "Shanghai Jiao Tong University;ETH Zurich;University of Sydney;Shanghai AI Laboratory", + "aff_unique_norm": "Shanghai Jiao Tong University;ETH Zürich;University of Sydney;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ethz.ch;https://www.sydney.edu.au;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "SJTU;ETHZ;USYD;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2+0;0;0;1", - "aff_country_unique": "China;Switzerland;Australia" + "aff_country_unique": "China;Switzerland;Australia", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zheng and Zhang,\n Yulun and Gu,\n Jinjin and Kong,\n Linghe and Yang,\n Xiaokang and Yu,\n Fisher\n},\n title = {\n Dual Aggregation Transformer for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12312-12321\n} \n}" }, { "title": "Dual Learning with Dynamic Knowledge Distillation for Partially Relevant Video Retrieval", @@ -16611,12 +17171,13 @@ "aff_unique_index": "0+1;0+1;0+1;0;2;3;0+1;0+1", "aff_unique_norm": "Zhejiang Gongshang University;Zhejiang University;Peking University;Huazhong University of Science and Technology", "aff_unique_dep": ";Key Lab of E-Commerce;;", - "aff_unique_url": "http://www.hzic.edu.cn;http://www.zju.edu.cn;http://www.pku.edu.cn;http://www.hust.edu.cn", - "aff_unique_abbr": "ZJGSU;;Peking U;HUST", + "aff_unique_url": "http://www.hgh.edu.cn;http://www.zju.edu.cn;http://www.pku.edu.cn;http://www.hust.edu.cn", + "aff_unique_abbr": ";;Peking U;HUST", "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Jianfeng and Zhang,\n Minsong and Zhang,\n Zheng and Chen,\n Xianke and Liu,\n Daizong and Qu,\n Xiaoye and Wang,\n Xun and Liu,\n Baolong\n},\n title = {\n Dual Learning with Dynamic Knowledge Distillation for Partially Relevant Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11302-11312\n} \n}" }, { "title": "Dual Meta-Learning with Longitudinally Consistent Regularization for One-Shot Brain Tissue Segmentation Across the Human Lifespan", @@ -16628,7 +17189,7 @@ "author": "Yongheng Sun; Fan Wang; Jun Shu; Haifeng Wang; Li Wang; Deyu Meng; Chunfeng Lian", "abstract": "Brain tissue segmentation is essential for neuroscience and clinical studies. However, segmentation on longitudinal data is challenging due to dynamic brain changes across the lifespan. 
Previous researches mainly focus on self-supervision with regularizations and will lose longitudinal generalization when fine-tuning on a specific age group. In this paper, we propose a dual meta-learning paradigm to learn longitudinally consistent representations and persist when fine-tuning. Specifically, we learn a plug-and-play feature extractor to extract longitudinal-consistent anatomical representations by meta-feature learning and a well-initialized task head for fine-tuning by meta-initialization learning. Besides, two class-aware regularizations are proposed to encourage longitudinal consistency. Experimental results on the iSeg2019 and ADNI datasets demonstrate the effectiveness of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sun_Dual_Meta-Learning_with_Longitudinally_Consistent_Regularization_for_One-Shot_Brain_Tissue_ICCV_2023_paper.pdf", - "aff": "School of Mathematics and Statistics, Xi\u2019an Jiaotong University; The Key Laboratory of Biomedical Information Engineering of Ministry of Education, School of Life Science and Technology, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; UNC Chapel Hill; School of Mathematics and Statistics, Xi\u2019an Jiaotong University; School of Mathematics and Statistics, Xi\u2019an Jiaotong University", + "aff": "School of Mathematics and Statistics, Xi’an Jiaotong University; The Key Laboratory of Biomedical Information Engineering of Ministry of Education, School of Life Science and Technology, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University; UNC Chapel Hill; School of Mathematics and Statistics, Xi’an Jiaotong University; School of Mathematics and Statistics, Xi’an Jiaotong University", "project": "", "github": "https://github.com/ladderlab-xjtu/DuMeta", 
"supp": "", @@ -16641,14 +17202,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_Dual_Meta-Learning_with_Longitudinally_Consistent_Regularization_for_One-Shot_Brain_Tissue_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0;0", - "aff_unique_norm": "Xi'an Jiao Tong University;University of North Carolina at Chapel Hill", + "aff_unique_norm": "Xi'an Jiaotong University;University of North Carolina at Chapel Hill", "aff_unique_dep": "School of Mathematics and Statistics;", "aff_unique_url": "http://en.xjtu.edu.cn/;https://www.unc.edu", "aff_unique_abbr": "XJTU;UNC", "aff_campus_unique_index": "0;0;0;0;1;0;0", "aff_campus_unique": "Xi'an;Chapel Hill", "aff_country_unique_index": "0;0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Yongheng and Wang,\n Fan and Shu,\n Jun and Wang,\n Haifeng and Wang,\n Li and Meng,\n Deyu and Lian,\n Chunfeng\n},\n title = {\n Dual Meta-Learning with Longitudinally Consistent Regularization for One-Shot Brain Tissue Segmentation Across the Human Lifespan\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21118-21128\n} \n}" }, { "title": "Dual Pseudo-Labels Interactive Self-Training for Semi-Supervised Visible-Infrared Person Re-Identification", @@ -16680,7 +17242,8 @@ "aff_campus_unique_index": "1;2;2;", "aff_campus_unique": ";Shenzhen;Chongqing", "aff_country_unique_index": "1;1;1+1;1+1;1;1;1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Jiangming and Zhang,\n Yachao and Yin,\n Xiangbo and Xie,\n Yuan and Zhang,\n Zhizhong and Fan,\n Jianping and Shi,\n Zhongchao and Qu,\n Yanyun\n},\n title = {\n Dual Pseudo-Labels Interactive Self-Training for Semi-Supervised 
Visible-Infrared Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11218-11228\n} \n}" }, { "title": "DyGait: Exploiting Dynamic Representations for High-performance Gait Recognition", @@ -16703,7 +17266,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_DyGait_Exploiting_Dynamic_Representations_for_High-performance_Gait_Recognition_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_DyGait_Exploiting_Dynamic_Representations_for_High-performance_Gait_Recognition_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ming and Guo,\n Xianda and Lin,\n Beibei and Yang,\n Tian and Zhu,\n Zheng and Li,\n Lincheng and Zhang,\n Shunli and Yu,\n Xin\n},\n title = {\n DyGait: Exploiting Dynamic Representations for High-performance Gait Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13424-13433\n} \n}" }, { "title": "DynaMITe: Dynamic Query Bootstrapping for Multi-object Interactive Segmentation Transformer", @@ -16735,7 +17299,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Rana_2023_ICCV,\n \n author = {\n Rana,\n Amit Kumar and Mahadevan,\n Sabarinath and Hermans,\n Alexander and Leibe,\n Bastian\n},\n title = {\n DynaMITe: Dynamic Query Bootstrapping for Multi-object Interactive Segmentation Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1043-1052\n} \n}" }, { "title": 
"Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction", @@ -16743,6 +17308,7 @@ "status": "Poster", "track": "main", "pid": "1853", + "author_site": "Zhiying Leng, Shun-Cheng Wu, Mahdi Saleh, Antonio Montanaro, Hao Yu, Yin Wang, Nassir Navab, Xiaohui Liang, Federico Tombari", "author": "Zhiying Leng, Shun-Cheng Wu, Mahdi Saleh, Antonio Montanaro, Hao Yu, Yin Wang, Nassir Navab, Xiaohui Liang, Federico Tombari", "abstract": "Reconstructing both objects and hands in 3D from a single RGB image is complex. Existing methods rely on manually defined hand-object constraints in Euclidean space, leading to suboptimal feature learning. Compared with Euclidean space, hyperbolic space better preserves the geometric properties of meshes thanks to its exponentially-growing space distance, which amplifies the differences between the features based on similarity. In this work, we propose the first precise hand-object reconstruction method in hyperbolic space, namely Dynamic Hyperbolic Attention Network (DHANet), which leverages intrinsic properties of hyperbolic space to learn representative features. Our method that projects mesh and image features into a unified hyperbolic space includes two modules, i.e. dynamic hyperbolic graph convolution and image-attention hyperbolic graph convolution. With these two modules, our method learns mesh features with rich geometry-image multi-modal information and models better hand-object interaction. Our method provides a promising alternative for fine hand-object reconstruction in hyperbolic space. 
Extensive experiments on three public datasets demonstrate that our method outperforms most state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Leng_Dynamic_Hyperbolic_Attention_Network_for_Fine_Hand-object_Reconstruction_ICCV_2023_paper.pdf", @@ -16754,7 +17320,8 @@ "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13376922569284570451&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Leng_Dynamic_Hyperbolic_Attention_Network_for_Fine_Hand-object_Reconstruction_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Leng_Dynamic_Hyperbolic_Attention_Network_for_Fine_Hand-object_Reconstruction_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Leng_2023_ICCV,\n \n author = {\n Leng,\n Zhiying and Wu,\n Shun-Cheng and Saleh,\n Mahdi and Montanaro,\n Antonio and Yu,\n Hao and Wang,\n Yin and Navab,\n Nassir and Liang,\n Xiaohui and Tombari,\n Federico\n},\n title = {\n Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14894-14904\n} \n}" }, { "title": "Dynamic Mesh Recovery from Partial Point Cloud Sequence", @@ -16786,7 +17353,8 @@ "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jang_2023_ICCV,\n \n author = {\n Jang,\n Hojun and Kim,\n Minkwan and Bae,\n Jinseok and Kim,\n Young Min\n},\n title = {\n Dynamic Mesh Recovery from Partial Point Cloud Sequence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15074-15084\n} \n}" }, { "title": 
"Dynamic Mesh-Aware Radiance Fields", @@ -16818,7 +17386,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Qiao_2023_ICCV,\n \n author = {\n Qiao,\n Yi-Ling and Gao,\n Alexander and Xu,\n Yiran and Feng,\n Yue and Huang,\n Jia-Bin and Lin,\n Ming C.\n},\n title = {\n Dynamic Mesh-Aware Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 385-396\n} \n}" }, { "title": "Dynamic Perceiver for Efficient Visual Recognition", @@ -16850,7 +17419,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Yizeng and Han,\n Dongchen and Liu,\n Zeyu and Wang,\n Yulin and Pan,\n Xuran and Pu,\n Yifan and Deng,\n Chao and Feng,\n Junlan and Song,\n Shiji and Huang,\n Gao\n},\n title = {\n Dynamic Perceiver for Efficient Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5992-6002\n} \n}" }, { "title": "Dynamic PlenOctree for Adaptive Sampling Refinement in Explicit NeRF", @@ -16882,7 +17452,8 @@ "aff_campus_unique_index": "0+1;0+1;0+1;0+1", "aff_campus_unique": "Guangzhou;Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Bai_2023_ICCV,\n \n author = {\n Bai,\n Haotian and Lin,\n Yiqi and Chen,\n Yize and Wang,\n Lin\n},\n title = {\n Dynamic PlenOctree for Adaptive Sampling Refinement in Explicit NeRF\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8785-8795\n} \n}" }, { "title": "Dynamic Point Fields", @@ -16894,7 +17465,7 @@ "author": "Sergey Prokudin; Qianli Ma; Maxime Raafat; Julien Valentin; Siyu Tang", "abstract": "Recent years have witnessed significant progress in the field of neural surface reconstruction. While extensive focus was put on volumetric and implicit approaches, a number of works have shown that explicit graphics primitives, such as point clouds, can significantly reduce computational complexity without sacrificing the reconstructed surface quality. However, less emphasis has been put on modeling dynamic surfaces with point primitives. In this work, we present a dynamic point field model that combines the representational benefits of explicit point-based graphics with implicit deformation networks to allow efficient modeling of non-rigid 3D surfaces. Using explicit surface primitives also allows us to easily incorporate well-established constraints such as isometric-as-possible regularization. While learning this deformation model is prone to local optima when trained in a fully unsupervised manner, we propose to also leverage semantic information, such as keypoint correspondence, to guide the deformation learning. We demonstrate how this approach can be used for creating an expressive animatable human avatar from a collection of 3D scans. Here, previous methods mostly rely on variants of the linear blend skinning paradigm, which fundamentally limits the expressivity of such models when dealing with complex cloth appearances, such as long skirts. We show the advantages of our dynamic point field framework in terms of its representational power, learning efficiency, and robustness to out-of-distribution novel poses. 
The code for the project is publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Prokudin_Dynamic_Point_Fields_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich + Max Planck Institute for Intelligent Systems; ETH Z\u00fcrich; Microsoft; ETH Z\u00fcrich", + "aff": "ETH Zürich; ETH Zürich + Max Planck Institute for Intelligent Systems; ETH Zürich; Microsoft; ETH Zürich", "project": "", "github": "sergeyprokudin.github.io/dpf", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Prokudin_Dynamic_Point_Fields_ICCV_2023_supplemental.pdf", @@ -16907,14 +17478,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Prokudin_Dynamic_Point_Fields_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;2;0", - "aff_unique_norm": "ETH Zurich;Max Planck Institute for Intelligent Systems;Microsoft", - "aff_unique_dep": ";Intelligent Systems;Microsoft Corporation", + "aff_unique_norm": "ETH Zürich;Max Planck Institute for Intelligent Systems;Microsoft Corporation", + "aff_unique_dep": ";Intelligent Systems;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de;https://www.microsoft.com", "aff_unique_abbr": "ETHZ;MPI-IS;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;2;0", - "aff_country_unique": "Switzerland;Germany;United States" + "aff_country_unique": "Switzerland;Germany;United States", + "bibtex": "@InProceedings{Prokudin_2023_ICCV,\n \n author = {\n Prokudin,\n Sergey and Ma,\n Qianli and Raafat,\n Maxime and Valentin,\n Julien and Tang,\n Siyu\n},\n title = {\n Dynamic Point Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7964-7976\n} \n}" }, { "title": "Dynamic Residual Classifier for Class Incremental Learning", @@ -16926,7 +17498,7 @@ "author": "Xiuwei Chen; Xiaobin Chang", "abstract": "The 
rehearsal strategy is widely used to alleviate the catastrophic forgetting problem in class incremental learning (CIL) by preserving limited exemplars from previous tasks. With imbalanced sample numbers between old and new classes, the classifier learning can be biased. Existing CIL methods exploit the long-tailed (LT) recognition techniques, e.g., the adjusted losses and the data re-sampling methods, to handle the data imbalance issue within each increment task. In this work, the dynamic nature of data imbalance in CIL is shown and a novel Dynamic Residual Classifier (DRC) is proposed to handle this challenging scenario. Specifically, DRC is built upon a recent advance residual classifier with the branch layer merging to handle the model-growing problem. Moreover, DRC is compatible with different CIL pipelines and substantially improves them. Combining DRC with the model adaptation and fusion (MAF) pipeline, this method achieves state-of-the-art results on both the conventional CIL and the LT-CIL benchmarks. Extensive experiments are also conducted for a detailed analysis. 
The code is publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_Dynamic_Residual_Classifier_for_Class_Incremental_Learning_ICCV_2023_paper.pdf", - "aff": "School of Arti\ufb01cial Intelligence, Sun Yat-sen University, China+Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou 510006, P.R.China+Key Laboratory of Machine Intelligence and Advanced Computing, Ministry of Education, China; School of Arti\ufb01cial Intelligence, Sun Yat-sen University, China+Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou 510006, P.R.China+Key Laboratory of Machine Intelligence and Advanced Computing, Ministry of Education, China", + "aff": "School of Artificial Intelligence, Sun Yat-sen University, China+Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou 510006, P.R.China+Key Laboratory of Machine Intelligence and Advanced Computing, Ministry of Education, China; School of Artificial Intelligence, Sun Yat-sen University, China+Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou 510006, P.R.China+Key Laboratory of Machine Intelligence and Advanced Computing, Ministry of Education, China", "project": "", "github": "https://github.com/chen-xw/DRC-CIL", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Chen_Dynamic_Residual_Classifier_ICCV_2023_supplemental.pdf", @@ -16940,13 +17512,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Dynamic_Residual_Classifier_for_Class_Incremental_Learning_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0+1+2", "aff_unique_norm": "Sun Yat-sen University;Guangdong Key Laboratory of Big Data Analysis and Processing;Key Laboratory of Machine Intelligence and Advanced Computing", - "aff_unique_dep": "School of Arti\ufb01cial Intelligence;;Ministry of Education", + "aff_unique_dep": "School of Artificial Intelligence;;Ministry of Education", "aff_unique_url": "http://www.sysu.edu.cn/;;", 
"aff_unique_abbr": "SYSU;;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xiuwei and Chang,\n Xiaobin\n},\n title = {\n Dynamic Residual Classifier for Class Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18743-18752\n} \n}" }, { "title": "Dynamic Snake Convolution Based on Topological Geometric Constraints for Tubular Structure Segmentation", @@ -16958,7 +17531,7 @@ "author": "Yaolei Qi; Yuting He; Xiaoming Qi; Yuan Zhang; Guanyu Yang", "abstract": "Accurate segmentation of topological tubular structures, such as blood vessels and roads, is crucial in various fields, ensuring accuracy and efficiency in downstream tasks. However, many factors complicate the task, including thin local structures and variable global morphologies. In this work, we note the specificity of tubular structures and use this knowledge to guide our DSCNet to simultaneously enhance perception in three stages: feature extraction, feature fusion, and loss constraint. First, we propose a dynamic snake convolution to accurately capture the features of tubular structures by adaptively focusing on slender and tortuous local structures. Subsequently, we propose a multi-view feature fusion strategy to complement the attention to features from multiple perspectives during feature fusion, ensuring the retention of important information from different global morphologies. Finally, a continuity constraint loss function, based on persistent homology, is proposed to constrain the topological continuity of the segmentation better. 
Experiments on 2D and 3D datasets show that our DSCNet provides better accuracy and continuity on the tubular structure segmentation task compared with several methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Qi_Dynamic_Snake_Convolution_Based_on_Topological_Geometric_Constraints_for_Tubular_ICCV_2023_paper.pdf", - "aff": "Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China+Jiangsu Province Joint International Research Laboratory of Medical Information Processing, Southeast University, Nanjing, China+Centre de Recherche en Information Biom\u00e9dicale Sino-Fran\u00e7ais (CRIBs), Strasbourg, France", + "aff": "Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, 
China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China; Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications (Southeast University), Ministry of Education, Nanjing 210096, China+Jiangsu Province Joint International Research Laboratory of Medical Information Processing, Southeast University, Nanjing, China+Centre de Recherche en Information Biomédicale Sino-Français (CRIBs), Strasbourg, France", "project": "", "github": "https://github.com/YaoleiQi/DSCNet", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Qi_Dynamic_Snake_Convolution_ICCV_2023_supplemental.pdf", @@ -16971,14 +17544,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qi_Dynamic_Snake_Convolution_Based_on_Topological_Geometric_Constraints_for_Tubular_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0+0+1", - "aff_unique_norm": "Southeast University;Centre de Recherche en Information Biom\u00e9dicale Sino-Fran\u00e7ais", - "aff_unique_dep": "Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications;Information Biom\u00e9dicale", + "aff_unique_norm": "Southeast University;Centre de Recherche en Information Biomédicale Sino-Français", + "aff_unique_dep": "Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications;Information Biomédicale", "aff_unique_url": "https://www.seu.edu.cn/;", "aff_unique_abbr": "SEU;CRIBs", "aff_campus_unique_index": "0;0;0;0;0+0+1", "aff_campus_unique": "Nanjing;Strasbourg", "aff_country_unique_index": "0;0;0;0;0+0+1", - "aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Qi_2023_ICCV,\n \n author = {\n Qi,\n Yaolei and He,\n Yuting and Qi,\n Xiaoming and Zhang,\n Yuan and Yang,\n Guanyu\n},\n title = 
{\n Dynamic Snake Convolution Based on Topological Geometric Constraints for Tubular Structure Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6070-6079\n} \n}" }, { "title": "Dynamic Token Pruning in Plain Vision Transformers for Semantic Segmentation", @@ -17010,7 +17584,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Quan and Zhang,\n Bowen and Liu,\n Jiajun and Liu,\n Fagui and Liu,\n Yifan\n},\n title = {\n Dynamic Token Pruning in Plain Vision Transformers for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 777-786\n} \n}" }, { "title": "DynamicISP: Dynamically Controlled Image Signal Processor for Image Recognition", @@ -17042,7 +17617,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Yoshimura_2023_ICCV,\n \n author = {\n Yoshimura,\n Masakazu and Otsuka,\n Junji and Irie,\n Atsushi and Ohashi,\n Takeshi\n},\n title = {\n DynamicISP: Dynamically Controlled Image Signal Processor for Image Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12866-12876\n} \n}" }, { "title": "E2E-LOAD: End-to-End Long-form Online Action Detection", @@ -17074,7 +17650,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + 
"bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Shuqiang and Luo,\n Weixin and Wang,\n Bairui and Zhang,\n Wei and Ma,\n Lin\n},\n title = {\n E2E-LOAD: End-to-End Long-form Online Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10422-10432\n} \n}" }, { "title": "E2NeRF: Event Enhanced Neural Radiance Fields from Blurry Images", @@ -17099,14 +17676,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qi_E2NeRF_Event_Enhanced_Neural_Radiance_Fields_from_Blurry_Images_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3", - "aff_unique_norm": "Beihang University;Beijing Institute of Technology;SenseTime;Pengcheng Laboratory", - "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;;;Peng Cheng Laboratory", + "aff_unique_norm": "Beihang University;Beijing Institute of Technology;SenseTime;Peng Cheng Laboratory", + "aff_unique_dep": "State Key Laboratory of Virtual Reality Technology and Systems, SCSE;;;", "aff_unique_url": "http://www.buaa.edu.cn;http://www.bit.edu.cn/;https://www.sensetime.com;http://www.pcl.ac.cn", "aff_unique_abbr": "Beihang;BIT;SenseTime;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qi_2023_ICCV,\n \n author = {\n Qi,\n Yunshan and Zhu,\n Lin and Zhang,\n Yu and Li,\n Jia\n},\n title = {\n E2NeRF: Event Enhanced Neural Radiance Fields from Blurry Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13254-13264\n} \n}" }, { "title": "E3Sym: Leveraging E(3) Invariance for Unsupervised 3D Planar Reflective Symmetry Detection", @@ -17138,7 +17716,8 @@ 
"aff_campus_unique_index": "0;0;0;2;0", "aff_campus_unique": "Beijing;;Cardiff", "aff_country_unique_index": "0+0;0;0;1;0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ren-Wu and Zhang,\n Ling-Xiao and Li,\n Chunpeng and Lai,\n Yu-Kun and Gao,\n Lin\n},\n title = {\n E3Sym: Leveraging E(3) Invariance for Unsupervised 3D Planar Reflective Symmetry Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14543-14553\n} \n}" }, { "title": "EDAPS: Enhanced Domain-Adaptive Panoptic Segmentation", @@ -17170,7 +17749,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;0;0+0;0+1", - "aff_country_unique": "Switzerland;Belgium" + "aff_country_unique": "Switzerland;Belgium", + "bibtex": "@InProceedings{Saha_2023_ICCV,\n \n author = {\n Saha,\n Suman and Hoyer,\n Lukas and Obukhov,\n Anton and Dai,\n Dengxin and Van Gool,\n Luc\n},\n title = {\n EDAPS: Enhanced Domain-Adaptive Panoptic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19234-19245\n} \n}" }, { "title": "EGC: Image Generation and Classification via a Diffusion Energy-Based Model", @@ -17195,14 +17775,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_EGC_Image_Generation_and_Classification_via_a_Diffusion_Energy-Based_Model_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;1;0+2;0+2", - "aff_unique_norm": "University of Hong Kong;ByteDance;Shanghai AI Laboratory", + "aff_unique_norm": "The University of Hong Kong;ByteDance;Shanghai AI Laboratory", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.hku.hk;https://www.bytedance.com;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "HKU;ByteDance;SAIL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Qiushan and Ma,\n Chuofan and Jiang,\n Yi and Yuan,\n Zehuan and Yu,\n Yizhou and Luo,\n Ping\n},\n title = {\n EGC: Image Generation and Classification via a Diffusion Energy-Based Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22952-22962\n} \n}" }, { "title": "EGformer: Equirectangular Geometry-biased Transformer for 360 Depth Estimation", @@ -17234,7 +17815,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yun_2023_ICCV,\n \n author = {\n Yun,\n Ilwi and Shin,\n Chanyong and Lee,\n Hyunku and Lee,\n Hyuk-Jae and Rhee,\n Chae Eun\n},\n title = {\n EGformer: Equirectangular Geometry-biased Transformer for 360 Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6101-6112\n} \n}" }, { "title": "ELFNet: Evidential Local-global Fusion for Stereo Matching", @@ -17266,7 +17848,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Lou_2023_ICCV,\n \n author = {\n Lou,\n Jieming and Liu,\n Weide and Chen,\n Zhuo and Liu,\n Fayao and Cheng,\n Jun\n},\n title = {\n ELFNet: Evidential Local-global Fusion for Stereo Matching\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17784-17793\n} \n}" }, { "title": "ELITE: Encoding Visual Concepts into Textual Embeddings for Customized Text-to-Image Generation", @@ -17289,7 +17872,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_ELITE_Encoding_Visual_Concepts_into_Textual_Embeddings_for_Customized_Text-to-Image_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_ELITE_Encoding_Visual_Concepts_into_Textual_Embeddings_for_Customized_Text-to-Image_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Yuxiang and Zhang,\n Yabo and Ji,\n Zhilong and Bai,\n Jinfeng and Zhang,\n Lei and Zuo,\n Wangmeng\n},\n title = {\n ELITE: Encoding Visual Concepts into Textual Embeddings for Customized Text-to-Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15943-15953\n} \n}" }, { "title": "EMDB: The Electromagnetic Database of Global 3D Human Pose and Shape in the Wild", @@ -17297,8 +17881,8 @@ "status": "Poster", "track": "main", "pid": "2785", - "author_site": "Manuel Kaufmann, Jie Song, Chen Guo, Kaiyue Shen, Tianjian Jiang, Chengcheng Tang, Juan Jos\u00e9 Z\u00e1rate, Otmar Hilliges", - "author": "Manuel Kaufmann; Jie Song; Chen Guo; Kaiyue Shen; Tianjian Jiang; Chengcheng Tang; Juan Jos\u00e9 Z\u00e1rate; Otmar Hilliges", + "author_site": "Manuel Kaufmann, Jie Song, Chen Guo, Kaiyue Shen, Tianjian Jiang, Chengcheng Tang, Juan José Zárate, Otmar Hilliges", + "author": "Manuel Kaufmann; Jie Song; Chen Guo; Kaiyue Shen; Tianjian Jiang; Chengcheng Tang; Juan José Zárate; Otmar Hilliges", "abstract": "We present EMDB, the Electromagnetic Database of Global 3D Human Pose and Shape in the 
Wild. EMDB is a novel dataset that contains high-quality 3D SMPL pose and shape parameters with global body and camera trajectories for in-the-wild videos. We use body-worn, wireless electromagnetic (EM) sensors and a hand-held iPhone to record a total of 58 minutes of motion data, distributed over 81 indoor and outdoor sequences and 10 participants. Together with accurate body poses and shapes, we also provide global camera poses and body root trajectories. To construct EMDB, we propose a multi-stage optimization procedure, which first fits SMPL to the 6-DoF EM measurements and then refines the poses via image observations. To achieve high-quality results, we leverage a neural implicit avatar model to reconstruct detailed human surface geometry and appearance, which allows for improved alignment and smoothness via a dense pixel-level objective. Our evaluations, conducted with a multi-view volumetric capture system, indicate that EMDB has an expected accuracy of 2.3 cm positional and 10.6 degrees angular error, surpassing the accuracy of previous in-the-wild datasets. We evaluate existing state-of-the-art monocular RGB methods for camera-relative and global pose estimation on EMDB. 
EMDB is publicly available under https://ait.ethz.ch/emdb.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Kaufmann_EMDB_The_Electromagnetic_Database_of_Global_3D_Human_Pose_and_ICCV_2023_paper.pdf", "aff": ";;;;;;;", @@ -17312,7 +17896,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kaufmann_EMDB_The_Electromagnetic_Database_of_Global_3D_Human_Pose_and_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kaufmann_EMDB_The_Electromagnetic_Database_of_Global_3D_Human_Pose_and_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kaufmann_2023_ICCV,\n \n author = {\n Kaufmann,\n Manuel and Song,\n Jie and Guo,\n Chen and Shen,\n Kaiyue and Jiang,\n Tianjian and Tang,\n Chengcheng and Z\\'arate,\n Juan Jos\\'e and Hilliges,\n Otmar\n},\n title = {\n EMDB: The Electromagnetic Database of Global 3D Human Pose and Shape in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14632-14643\n} \n}" }, { "title": "EMMN: Emotional Motion Memory Network for Audio-driven Emotional Talking Face Generation", @@ -17344,7 +17929,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tan_2023_ICCV,\n \n author = {\n Tan,\n Shuai and Ji,\n Bin and Pan,\n Ye\n},\n title = {\n EMMN: Emotional Motion Memory Network for Audio-driven Emotional Talking Face Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22146-22156\n} \n}" }, { "title": "EMQ: Evolving Training-free Proxies for Automated Mixed Precision Quantization", @@ -17376,7 +17962,8 @@ "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Peijie and Li,\n Lujun and Wei,\n Zimian and Niu,\n Xin and Tian,\n Zhiliang and Pan,\n Hengyue\n},\n title = {\n EMQ: Evolving Training-free Proxies for Automated Mixed Precision Quantization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17076-17086\n} \n}" }, { "title": "EMR-MSF: Self-Supervised Recurrent Monocular Scene Flow Exploiting Ego-Motion Rigidity", @@ -17384,6 +17971,7 @@ "status": "Oral", "track": "main", "pid": "6318", + "author_site": "Zijie Jiang, Masatoshi Okutomi", "author": "Zijie Jiang, Masatoshi Okutomi", "abstract": "Self-supervised monocular scene flow estimation, aiming to understand both 3D structures and 3D motions from two temporally consecutive monocular images, has received increasing attention for its simple and economical sensor setup. However, the accuracy of current methods suffers from the bottleneck of less-efficient network architecture and lack of motion rigidity for regularization. In this paper, we propose a superior model named EMR-MSF by borrowing the advantages of network architecture design under the scope of supervised learning. We further impose explicit and robust geometric constraints with an elaborately constructed ego-motion aggregation module where a rigidity soft mask is proposed to filter out dynamic regions for stable ego-motion estimation using static regions. Moreover, we propose a motion consistency loss along with a mask regularization loss to fully exploit static regions. Several efficient training strategies are integrated including a gradient detachment technique and an enhanced view synthesis process for better performance. 
Our proposed method outperforms the previous self-supervised works by a large margin and catches up to the performance of supervised methods. On the KITTI scene flow benchmark, our approach improves the SF-all metric of the state-of-the-art self-supervised monocular method by 44% and demonstrates superior performance across sub-tasks including depth and visual odometry, amongst other self-supervised single-task or multi-task methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Jiang_EMR-MSF_Self-Supervised_Recurrent_Monocular_Scene_Flow_Exploiting_Ego-Motion_Rigidity_ICCV_2023_paper.pdf", @@ -17395,7 +17983,8 @@ "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5589922010572365504&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jiang_EMR-MSF_Self-Supervised_Recurrent_Monocular_Scene_Flow_Exploiting_Ego-Motion_Rigidity_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jiang_EMR-MSF_Self-Supervised_Recurrent_Monocular_Scene_Flow_Exploiting_Ego-Motion_Rigidity_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Zijie and Okutomi,\n Masatoshi\n},\n title = {\n EMR-MSF: Self-Supervised Recurrent Monocular Scene Flow Exploiting Ego-Motion Rigidity\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 69-78\n} \n}" }, { "title": "ENTL: Embodied Navigation Trajectory Learner", @@ -17420,14 +18009,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kotar_ENTL_Embodied_Navigation_Trajectory_Learner_ICCV_2023_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "Stanford University;University of Washington;Meta", + "aff_unique_norm": "Stanford University;University of Washington;Meta Platforms, Inc.", "aff_unique_dep": ";;Meta AI", 
"aff_unique_url": "https://www.stanford.edu;https://www.washington.edu;https://meta.com", "aff_unique_abbr": "Stanford;UW;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kotar_2023_ICCV,\n \n author = {\n Kotar,\n Klemen and Walsman,\n Aaron and Mottaghi,\n Roozbeh\n},\n title = {\n ENTL: Embodied Navigation Trajectory Learner\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10863-10872\n} \n}" }, { "title": "ENVIDR: Implicit Differentiable Renderer with Neural Environment Lighting", @@ -17450,7 +18040,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_ENVIDR_Implicit_Differentiable_Renderer_with_Neural_Environment_Lighting_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_ENVIDR_Implicit_Differentiable_Renderer_with_Neural_Environment_Lighting_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Ruofan and Chen,\n Huiting and Li,\n Chunlin and Chen,\n Fan and Panneer,\n Selvakumar and Vijaykumar,\n Nandita\n},\n title = {\n ENVIDR: Implicit Differentiable Renderer with Neural Environment Lighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 79-89\n} \n}" }, { "title": "EP2P-Loc: End-to-End 3D Point to 2D Pixel Localization for Large-Scale Visual Localization", @@ -17482,7 +18073,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author 
= {\n Kim,\n Minjung and Koo,\n Junseo and Kim,\n Gunhee\n},\n title = {\n EP2P-Loc: End-to-End 3D Point to 2D Pixel Localization for Large-Scale Visual Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21527-21537\n} \n}" }, { "title": "EPiC: Ensemble of Partial Point Clouds for Robust Classification", @@ -17514,7 +18106,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Levi_2023_ICCV,\n \n author = {\n Levi,\n Meir Yossef and Gilboa,\n Guy\n},\n title = {\n EPiC: Ensemble of Partial Point Clouds for Robust Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14475-14484\n} \n}" }, { "title": "EQ-Net: Elastic Quantization Neural Networks", @@ -17546,7 +18139,8 @@ "aff_campus_unique_index": "0+0+0;0;0+0+0;0+0+0;0+0+0", "aff_campus_unique": "Hefei", "aff_country_unique_index": "0+0+0;0;0+0+0;0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Ke and Han,\n Lei and Tian,\n Ye and Yang,\n Shangshang and Zhang,\n Xingyi\n},\n title = {\n EQ-Net: Elastic Quantization Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1505-1514\n} \n}" }, { "title": "ESSAformer: Efficient Transformer for Hyperspectral Image Super-resolution", @@ -17578,7 +18172,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": 
"@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Mingjin and Zhang,\n Chi and Zhang,\n Qiming and Guo,\n Jie and Gao,\n Xinbo and Zhang,\n Jing\n},\n title = {\n ESSAformer: Efficient Transformer for Hyperspectral Image Super-resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23073-23084\n} \n}" }, { "title": "ESTextSpotter: Towards Better Scene Text Spotting with Explicit Synergy in Transformer", @@ -17610,7 +18205,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Mingxin and Zhang,\n Jiaxin and Peng,\n Dezhi and Lu,\n Hao and Huang,\n Can and Liu,\n Yuliang and Bai,\n Xiang and Jin,\n Lianwen\n},\n title = {\n ESTextSpotter: Towards Better Scene Text Spotting with Explicit Synergy in Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19495-19505\n} \n}" }, { "title": "ETran: Energy-Based Transferability Estimation", @@ -17635,14 +18231,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gholami_ETran_Energy-Based_Transferability_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;0", - "aff_unique_norm": "Huawei;University of British Columbia", - "aff_unique_dep": "Huawei Technologies;", + "aff_unique_norm": "Huawei Technologies;University of British Columbia", + "aff_unique_dep": ";", "aff_unique_url": "https://www.huawei.com/ca-en/;https://www.ubc.ca", "aff_unique_abbr": "Huawei;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": 
"@InProceedings{Gholami_2023_ICCV,\n \n author = {\n Gholami,\n Mohsen and Akbari,\n Mohammad and Wang,\n Xinglu and Kamranian,\n Behnam and Zhang,\n Yong\n},\n title = {\n ETran: Energy-Based Transferability Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18613-18622\n} \n}" }, { "title": "E^2VPT: An Effective and Efficient Approach for Visual Prompt Tuning", @@ -17665,7 +18262,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_E2VPT_An_Effective_and_Efficient_Approach_for_Visual_Prompt_Tuning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_E2VPT_An_Effective_and_Efficient_Approach_for_Visual_Prompt_Tuning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Cheng and Wang,\n Qifan and Cui,\n Yiming and Cao,\n Zhiwen and Wang,\n Wenguan and Qi,\n Siyuan and Liu,\n Dongfang\n},\n title = {\n E{\\textasciicircum\n}2VPT: An Effective and Efficient Approach for Visual Prompt Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17491-17502\n} \n}" }, { "title": "EdaDet: Open-Vocabulary Object Detection Using Early Dense Alignment", @@ -17697,7 +18295,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Cheng and Yang,\n Sibei\n},\n title = {\n EdaDet: Open-Vocabulary Object Detection Using Early Dense Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
15724-15734\n} \n}" }, { "title": "Editable Image Geometric Abstraction via Neural Primitive Assembly", @@ -17729,7 +18328,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ye and Ni,\n Bingbing and Chen,\n Xuanhong and Hu,\n Zhangli\n},\n title = {\n Editable Image Geometric Abstraction via Neural Primitive Assembly\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23514-23523\n} \n}" }, { "title": "Editing Implicit Assumptions in Text-to-Image Diffusion Models", @@ -17761,7 +18361,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Orgad_2023_ICCV,\n \n author = {\n Orgad,\n Hadas and Kawar,\n Bahjat and Belinkov,\n Yonatan\n},\n title = {\n Editing Implicit Assumptions in Text-to-Image Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7053-7061\n} \n}" }, { "title": "Effective Real Image Editing with Accelerated Iterative Diffusion Inversion", @@ -17784,7 +18385,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Effective_Real_Image_Editing_with_Accelerated_Iterative_Diffusion_Inversion_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Effective_Real_Image_Editing_with_Accelerated_Iterative_Diffusion_Inversion_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Zhihong and Gherardi,\n Riccardo and Xie,\n Xiufeng and Huang,\n Stephen\n},\n title 
= {\n Effective Real Image Editing with Accelerated Iterative Diffusion Inversion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15912-15921\n} \n}" }, { "title": "Efficient 3D Semantic Segmentation with Superpoint Transformer", @@ -17816,7 +18418,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Robert_2023_ICCV,\n \n author = {\n Robert,\n Damien and Raguet,\n Hugo and Landrieu,\n Loic\n},\n title = {\n Efficient 3D Semantic Segmentation with Superpoint Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17195-17204\n} \n}" }, { "title": "Efficient Adaptive Human-Object Interaction Detection with Concept-guided Memory", @@ -17839,7 +18442,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lei_Efficient_Adaptive_Human-Object_Interaction_Detection_with_Concept-guided_Memory_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lei_Efficient_Adaptive_Human-Object_Interaction_Detection_with_Concept-guided_Memory_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lei_2023_ICCV,\n \n author = {\n Lei,\n Ting and Caba,\n Fabian and Chen,\n Qingchao and Jin,\n Hailin and Peng,\n Yuxin and Liu,\n Yang\n},\n title = {\n Efficient Adaptive Human-Object Interaction Detection with Concept-guided Memory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6480-6490\n} \n}" }, { "title": "Efficient Computation Sharing for Multi-Task Visual Scene Understanding", @@ -17871,7 
+18475,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shoouri_2023_ICCV,\n \n author = {\n Shoouri,\n Sara and Yang,\n Mingyu and Fan,\n Zichen and Kim,\n Hun-Seok\n},\n title = {\n Efficient Computation Sharing for Multi-Task Visual Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17130-17141\n} \n}" }, { "title": "Efficient Controllable Multi-Task Architectures", @@ -17903,7 +18508,8 @@ "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Riverside;;San Diego", "aff_country_unique_index": "0+0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aich_2023_ICCV,\n \n author = {\n Aich,\n Abhishek and Schulter,\n Samuel and Roy-Chowdhury,\n Amit K. 
and Chandraker,\n Manmohan and Suh,\n Yumin\n},\n title = {\n Efficient Controllable Multi-Task Architectures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5740-5751\n} \n}" }, { "title": "Efficient Converted Spiking Neural Network for 3D and 2D Classification", @@ -17935,7 +18541,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Lan_2023_ICCV,\n \n author = {\n Lan,\n Yuxiang and Zhang,\n Yachao and Ma,\n Xu and Qu,\n Yanyun and Fu,\n Yun\n},\n title = {\n Efficient Converted Spiking Neural Network for 3D and 2D Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9211-9220\n} \n}" }, { "title": "Efficient Decision-based Black-box Patch Attacks on Video Recognition", @@ -17967,7 +18574,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Kaixun and Chen,\n Zhaoyu and Huang,\n Hao and Wang,\n Jiafeng and Yang,\n Dingkang and Li,\n Bo and Wang,\n Yan and Zhang,\n Wenqiang\n},\n title = {\n Efficient Decision-based Black-box Patch Attacks on Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4379-4389\n} \n}" }, { "title": "Efficient Deep Space Filling Curve", @@ -17992,14 +18600,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Efficient_Deep_Space_Filling_Curve_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Wanli and Yao,\n Xufeng and Zhang,\n Xinyun and Yu,\n Bei\n},\n title = {\n Efficient Deep Space Filling Curve\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17525-17534\n} \n}" }, { "title": "Efficient Diffusion Training via Min-SNR Weighting Strategy", @@ -18011,7 +18620,7 @@ "author": "Tiankai Hang; Shuyang Gu; Chen Li; Jianmin Bao; Dong Chen; Han Hu; Xin Geng; Baining Guo", "abstract": "Denoising diffusion models have been a mainstream approach for image generation, however, training these models often suffers from slow convergence. In this paper, we discovered that the slow convergence is partly due to conflicting optimization directions between timesteps. To address this issue, we treat the diffusion training as a multi-task learning problem, and introduce a simple yet effective approach referred to as Min-SNR-g. This method adapts loss weights of timesteps based on clamped signal-to-noise ratios, which effectively balances the conflicts among timesteps. Our results demonstrate a significant improvement in converging speed, 3.4x faster than previous weighting strategies. 
It is also more effective, achieving a new record FID score of 2.06 on the ImageNet 256x256 benchmark using smaller architectures than that employed in previous state-of-the-art.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hang_Efficient_Diffusion_Training_via_Min-SNR_Weighting_Strategy_ICCV_2023_paper.pdf", - "aff": "Southeast University; Microsoft Research Asia; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Arti\ufb01cial Intelligence and Robotics, Xi\u2019an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; Microsoft Research Asia; Southeast University; Southeast University", + "aff": "Southeast University; Microsoft Research Asia; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; Microsoft Research Asia; Southeast University; Southeast University", "project": "", "github": "https://github.com/TiankaiHang/Min-SNR-Diffusion-Training", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Hang_Efficient_Diffusion_Training_ICCV_2023_supplemental.pdf", @@ -18024,14 +18633,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hang_Efficient_Diffusion_Training_via_Min-SNR_Weighting_Strategy_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;1;1;0;0", - "aff_unique_norm": "Southeast University;Microsoft;Xi\u2019an Jiao Tong University", - "aff_unique_dep": ";Research;Institute of Arti\ufb01cial Intelligence and Robotics", + "aff_unique_norm": "Southeast University;Microsoft Research;Xi’an Jiaotong University", + "aff_unique_dep": ";Research;Institute of Artificial Intelligence and Robotics", "aff_unique_url": 
"https://www.seu.edu.cn/;https://www.microsoft.com/en-us/research/group/asia;http://www.xjtu.edu.cn", "aff_unique_abbr": "SEU;MSR Asia;XJTU", "aff_campus_unique_index": "1;2;1;1;1", "aff_campus_unique": ";Asia;Xi'an", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hang_2023_ICCV,\n \n author = {\n Hang,\n Tiankai and Gu,\n Shuyang and Li,\n Chen and Bao,\n Jianmin and Chen,\n Dong and Hu,\n Han and Geng,\n Xin and Guo,\n Baining\n},\n title = {\n Efficient Diffusion Training via Min-SNR Weighting Strategy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7441-7451\n} \n}" }, { "title": "Efficient Discovery and Effective Evaluation of Visual Perceptual Similarity: A Benchmark and Beyond", @@ -18056,14 +18666,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Barkan_Efficient_Discovery_and_Effective_Evaluation_of_Visual_Perceptual_Similarity_A_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;2;2;2", - "aff_unique_norm": "Open University;Hebrew University of Jerusalem;Tel Aviv University;Technion - Israel Institute of Technology", + "aff_unique_norm": "The Open University;The Hebrew University of Jerusalem;Tel Aviv University;Technion - Israel Institute of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.open.ac.uk;https://www.huji.ac.il;https://www.tau.ac.il;https://www.technion.ac.il/en/", "aff_unique_abbr": "OU;HUJI;TAU;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1;1", - "aff_country_unique": "United Kingdom;Israel" + "aff_country_unique": "United Kingdom;Israel", + "bibtex": "@InProceedings{Barkan_2023_ICCV,\n \n author = {\n Barkan,\n Oren and Reiss,\n Tal and Weill,\n Jonathan and Katz,\n Ori and Hirsch,\n Roy and Malkiel,\n Itzik and 
Koenigstein,\n Noam\n},\n title = {\n Efficient Discovery and Effective Evaluation of Visual Perceptual Similarity: A Benchmark and Beyond\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20007-20018\n} \n}" }, { "title": "Efficient Emotional Adaptation for Audio-Driven Talking-Head Generation", @@ -18095,7 +18706,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gan_2023_ICCV,\n \n author = {\n Gan,\n Yuan and Yang,\n Zongxin and Yue,\n Xihang and Sun,\n Lingyun and Yang,\n Yi\n},\n title = {\n Efficient Emotional Adaptation for Audio-Driven Talking-Head Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22634-22645\n} \n}" }, { "title": "Efficient Joint Optimization of Layer-Adaptive Weight Pruning in Deep Neural Networks", @@ -18119,15 +18731,16 @@ "email": "i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;i2r.a-star.edu.sg;ntu.edu.sg", "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Efficient_Joint_Optimization_of_Layer-Adaptive_Weight_Pruning_in_Deep_Neural_ICCV_2023_paper.html", - "aff_unique_index": "1;1", - "aff_unique_norm": ";Nanyang Technological University", - "aff_unique_dep": ";", - "aff_unique_url": ";https://www.ntu.edu.sg", - "aff_unique_abbr": ";NTU", + "aff_unique_index": "0;0;0;0;0+1;1", + "aff_unique_norm": "Agency for Science, Technology and Research;Nanyang Technological University", + "aff_unique_dep": "Institute for Infocomm Research;", + "aff_unique_url": "https://www.a-star.edu.sg;https://www.ntu.edu.sg", + "aff_unique_abbr": "A*STAR;NTU", "aff_campus_unique_index": "", "aff_campus_unique": 
"", - "aff_country_unique_index": "1;1", - "aff_country_unique": ";Singapore" + "aff_country_unique_index": "0;0;0;0;0+0;0", + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Kaixin and Wang,\n Zhe and Geng,\n Xue and Wu,\n Min and Li,\n Xiaoli and Lin,\n Weisi\n},\n title = {\n Efficient Joint Optimization of Layer-Adaptive Weight Pruning in Deep Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17447-17457\n} \n}" }, { "title": "Efficient LiDAR Point Cloud Oversegmentation Network", @@ -18159,7 +18772,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hui_2023_ICCV,\n \n author = {\n Hui,\n Le and Tang,\n Linghua and Dai,\n Yuchao and Xie,\n Jin and Yang,\n Jian\n},\n title = {\n Efficient LiDAR Point Cloud Oversegmentation Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18003-18012\n} \n}" }, { "title": "Efficient Model Personalization in Federated Learning via Client-Specific Prompt Generation", @@ -18184,14 +18798,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Efficient_Model_Personalization_in_Federated_Learning_via_Client-Specific_Prompt_Generation_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;0+1", - "aff_unique_norm": "National Taiwan University;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "National Taiwan University;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.tw;https://www.nvidia.com", "aff_unique_abbr": "NTU;NVIDIA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Taiwan;", 
"aff_country_unique_index": "0+1;1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Fu-En and Wang,\n Chien-Yi and Wang,\n Yu-Chiang Frank\n},\n title = {\n Efficient Model Personalization in Federated Learning via Client-Specific Prompt Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19159-19168\n} \n}" }, { "title": "Efficient Neural Supersampling on a Novel Gaming Dataset", @@ -18214,7 +18829,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mercier_Efficient_Neural_Supersampling_on_a_Novel_Gaming_Dataset_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mercier_Efficient_Neural_Supersampling_on_a_Novel_Gaming_Dataset_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Mercier_2023_ICCV,\n \n author = {\n Mercier,\n Antoine and Erasmus,\n Ruan and Savani,\n Yashesh and Dhingra,\n Manik and Porikli,\n Fatih and Berger,\n Guillaume\n},\n title = {\n Efficient Neural Supersampling on a Novel Gaming Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 296-306\n} \n}" }, { "title": "Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis", @@ -18246,7 +18862,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2+2", - "aff_country_unique": "China;Australia;Japan" + "aff_country_unique": "China;Australia;Japan", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Jiahe and Zhang,\n Jiawei and Bai,\n Xiao and Zhou,\n Jun and Gu,\n Lin\n},\n title = {\n Efficient Region-Aware Neural Radiance Fields for 
High-Fidelity Talking Portrait Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7568-7578\n} \n}" }, { "title": "Efficient Transformer-based 3D Object Detection with Dynamic Token Halting", @@ -18278,7 +18895,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Mao and Meyer,\n Gregory P. and Chai,\n Yuning and Liu,\n Qiang\n},\n title = {\n Efficient Transformer-based 3D Object Detection with Dynamic Token Halting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8438-8450\n} \n}" }, { "title": "Efficient Unified Demosaicing for Bayer and Non-Bayer Patterned Image Sensors", @@ -18310,7 +18928,8 @@ "aff_campus_unique_index": "0;0;0;0+0+0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0+0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Haechang and Park,\n Dongwon and Jeong,\n Wongi and Kim,\n Kijeong and Je,\n Hyunwoo and Ryu,\n Dongil and Chun,\n Se Young\n},\n title = {\n Efficient Unified Demosaicing for Bayer and Non-Bayer Patterned Image Sensors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12750-12759\n} \n}" }, { "title": "Efficient Video Action Detection with Token Dropout and Context Refinement", @@ -18333,7 +18952,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Efficient_Video_Action_Detection_with_Token_Dropout_and_Context_Refinement_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Efficient_Video_Action_Detection_with_Token_Dropout_and_Context_Refinement_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Lei and Tong,\n Zhan and Song,\n Yibing and Wu,\n Gangshan and Wang,\n Limin\n},\n title = {\n Efficient Video Action Detection with Token Dropout and Context Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10388-10399\n} \n}" }, { "title": "Efficient Video Prediction via Sparsely Conditioned Flow Matching", @@ -18361,11 +18981,12 @@ "aff_unique_norm": "University of Bern", "aff_unique_dep": "Institute of Computer Science", "aff_unique_url": "https://www.unibe.ch", - "aff_unique_abbr": "", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_unique_abbr": "UniBE", + "aff_campus_unique_index": "0;0;0", + "aff_campus_unique": "Bern", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Davtyan_2023_ICCV,\n \n author = {\n Davtyan,\n Aram and Sameni,\n Sepehr and Favaro,\n Paolo\n},\n title = {\n Efficient Video Prediction via Sparsely Conditioned Flow Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23263-23274\n} \n}" }, { "title": "Efficient View Synthesis with Neural Radiance Distribution Field", @@ -18390,14 +19011,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Efficient_View_Synthesis_with_Neural_Radiance_Distribution_Field_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;2;0+1;0+1;2", - 
"aff_unique_norm": "Fudan University;Shenzhen University, College of Software Engineering;Microsoft", + "aff_unique_norm": "Fudan University;Shenzhen University, College of Software Engineering;Microsoft Research", "aff_unique_dep": ";College of Software Engineering;Research", - "aff_unique_url": "https://www.fudan.edu.cn/en/;http://sse.cuhkcz.edu.cn/;https://www.microsoft.com/en-us/research/group/asia", + "aff_unique_url": "https://www.fudan.edu.cn/en/;http://sse.cuhksz.edu.cn/;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Fudan;SSE;MSR Asia", "aff_campus_unique_index": "0+0;1;1;0+0;0+0;1", "aff_campus_unique": "Shenzhen;Asia", "aff_country_unique_index": "0+0;0;0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Yushuang and Li,\n Xiao and Wang,\n Jinglu and Han,\n Xiaoguang and Cui,\n Shuguang and Lu,\n Yan\n},\n title = {\n Efficient View Synthesis with Neural Radiance Distribution Field\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18506-18515\n} \n}" }, { "title": "Efficient-VQGAN: Towards High-Resolution Image Generation with Efficient Vision Transformers", @@ -18429,7 +19051,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0+0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Shiyue and Yin,\n Yueqin and Huang,\n Lianghua and Liu,\n Yu and Zhao,\n Xin and Zhao,\n Deli and Huang,\n Kaiqi\n},\n title = {\n Efficient-VQGAN: Towards High-Resolution Image Generation with Efficient Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7368-7377\n} 
\n}" }, { "title": "EfficientTrain: Exploring Generalized Curriculum Learning for Training Visual Backbones", @@ -18454,14 +19077,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_EfficientTrain_Exploring_Generalized_Curriculum_Learning_for_Training_Visual_Backbones_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;1;0;0+2", - "aff_unique_norm": "Tsinghua University;Huawei;Beijing Academy of Artificial Intelligence", - "aff_unique_dep": "Department of Automation;Huawei Technologies;", + "aff_unique_norm": "Tsinghua University;Huawei Technologies;Beijing Academy of Artificial Intelligence", + "aff_unique_dep": "Department of Automation;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com;https://www.baaic.cn", "aff_unique_abbr": "Tsinghua;Huawei;BAAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yulin and Yue,\n Yang and Lu,\n Rui and Liu,\n Tianjiao and Zhong,\n Zhao and Song,\n Shiji and Huang,\n Gao\n},\n title = {\n EfficientTrain: Exploring Generalized Curriculum Learning for Training Visual Backbones\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5852-5864\n} \n}" }, { "title": "EfficientViT: Lightweight Multi-Scale Attention for High-Resolution Dense Prediction", @@ -18493,7 +19117,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Han and Li,\n Junyan and Hu,\n Muyan and Gan,\n Chuang and Han,\n Song\n},\n title = {\n EfficientViT: Lightweight Multi-Scale Attention for 
High-Resolution Dense Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17302-17313\n} \n}" }, { "title": "Efficiently Robustify Pre-Trained Models", @@ -18518,14 +19143,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jain_Efficiently_Robustify_Pre-Trained_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1", - "aff_unique_norm": "Indian Institute of Technology Roorkee;Microsoft;University of Central Florida", + "aff_unique_norm": "Indian Institute of Technology Roorkee;Microsoft Corporation;University of Central Florida", "aff_unique_dep": ";Microsoft Research;Center for Research in Computer Vision", "aff_unique_url": "https://www.iitr.ac.in;https://www.microsoft.com/en-us/research;https://www.crcv.ucf.edu", "aff_unique_abbr": "IITR;MSR;UCF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Jain_2023_ICCV,\n \n author = {\n Jain,\n Nishant and Behl,\n Harkirat and Rawat,\n Yogesh Singh and Vineet,\n Vibhav\n},\n title = {\n Efficiently Robustify Pre-Trained Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5505-5515\n} \n}" }, { "title": "Ego-Humans: An Ego-Centric 3D Multi-Human Benchmark", @@ -18548,7 +19174,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Khirodkar_Ego-Humans_An_Ego-Centric_3D_Multi-Human_Benchmark_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Khirodkar_Ego-Humans_An_Ego-Centric_3D_Multi-Human_Benchmark_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Khirodkar_2023_ICCV,\n \n 
author = {\n Khirodkar,\n Rawal and Bansal,\n Aayush and Ma,\n Lingni and Newcombe,\n Richard and Vo,\n Minh and Kitani,\n Kris\n},\n title = {\n Ego-Humans: An Ego-Centric 3D Multi-Human Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19807-19819\n} \n}" }, { "title": "Ego-Only: Egocentric Action Detection without Exocentric Transferring", @@ -18571,7 +19198,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Ego-Only_Egocentric_Action_Detection_without_Exocentric_Transferring_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Ego-Only_Egocentric_Action_Detection_without_Exocentric_Transferring_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Huiyu and Singh,\n Mitesh Kumar and Torresani,\n Lorenzo\n},\n title = {\n Ego-Only: Egocentric Action Detection without Exocentric Transferring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5250-5261\n} \n}" }, { "title": "EgoLoc: Revisiting 3D Object Localization from Egocentric Videos with Visual Queries", @@ -18603,7 +19231,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0+1;0;0;0;0", - "aff_country_unique": "Saudi Arabia;United Kingdom" + "aff_country_unique": "Saudi Arabia;United Kingdom", + "bibtex": "@InProceedings{Mai_2023_ICCV,\n \n author = {\n Mai,\n Jinjie and Hamdi,\n Abdullah and Giancola,\n Silvio and Zhao,\n Chen and Ghanem,\n Bernard\n},\n title = {\n EgoLoc: Revisiting 3D Object Localization from Egocentric Videos with Visual Queries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2023\n},\n pages = {\n 45-57\n} \n}" }, { "title": "EgoObjects: A Large-Scale Egocentric Dataset for Fine-Grained Object Understanding", @@ -18628,14 +19257,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_EgoObjects_A_Large-Scale_Egocentric_Dataset_for_Fine-Grained_Object_Understanding_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Meta Platforms, Inc.", "aff_unique_dep": "Meta AI", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Chenchen and Xiao,\n Fanyi and Alvarado,\n Andres and Babaei,\n Yasmine and Hu,\n Jiabo and El-Mohri,\n Hichem and Culatana,\n Sean and Sumbaly,\n Roshan and Yan,\n Zhicheng\n},\n title = {\n EgoObjects: A Large-Scale Egocentric Dataset for Fine-Grained Object Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20110-20120\n} \n}" }, { "title": "EgoPCA: A New Framework for Egocentric Hand-Object Interaction Understanding", @@ -18667,7 +19297,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+0;0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yue and Li,\n Yong-Lu and Huang,\n Zhemin and Liu,\n Michael Xu and Lu,\n Cewu and Tai,\n Yu-Wing and Tang,\n Chi-Keung\n},\n title = {\n EgoPCA: A New Framework for Egocentric Hand-Object Interaction Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5273-5284\n} \n}" }, { "title": "EgoTV: Egocentric Task Verification from Natural Language Task Descriptions", @@ -18692,14 +19323,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hazra_EgoTV_Egocentric_Task_Verification_from_Natural_Language_Task_Descriptions_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1", - "aff_unique_norm": "Orebro University;Meta", - "aff_unique_dep": ";Meta Platforms, Inc.", + "aff_unique_norm": "Orebro University;Meta Platforms, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.oru.se;https://meta.com", "aff_unique_abbr": "ORU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", - "aff_country_unique": "Sweden;United States" + "aff_country_unique": "Sweden;United States", + "bibtex": "@InProceedings{Hazra_2023_ICCV,\n \n author = {\n Hazra,\n Rishi and Chen,\n Brian and Rai,\n Akshara and Kamra,\n Nitin and Desai,\n Ruta\n},\n title = {\n EgoTV: Egocentric Task Verification from Natural Language Task Descriptions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15417-15429\n} \n}" }, { "title": "EgoVLPv2: Egocentric Video-Language Pre-training with Fusion in the Backbone", @@ -18724,14 +19356,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pramanick_EgoVLPv2_Egocentric_Video-Language_Pre-training_with_Fusion_in_the_Backbone_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;1;3;0;1", - "aff_unique_norm": "Johns Hopkins University;Meta;University of Toronto;National University of Singapore", + "aff_unique_norm": "Johns Hopkins University;Meta Platforms, Inc.;University of Toronto;National University of Singapore", "aff_unique_dep": ";Meta AI;;", "aff_unique_url": 
"https://www.jhu.edu;https://meta.com;https://www.utoronto.ca;https://www.nus.edu.sg", "aff_unique_abbr": "JHU;Meta;U of T;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0;2;0;0", - "aff_country_unique": "United States;Canada;Singapore" + "aff_country_unique": "United States;Canada;Singapore", + "bibtex": "@InProceedings{Pramanick_2023_ICCV,\n \n author = {\n Pramanick,\n Shraman and Song,\n Yale and Nag,\n Sayan and Lin,\n Kevin Qinghong and Shah,\n Hardik and Shou,\n Mike Zheng and Chellappa,\n Rama and Zhang,\n Pengchuan\n},\n title = {\n EgoVLPv2: Egocentric Video-Language Pre-training with Fusion in the Backbone\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5285-5297\n} \n}" }, { "title": "EigenPlaces: Training Viewpoint Robust Models for Visual Place Recognition", @@ -18763,7 +19396,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Berton_2023_ICCV,\n \n author = {\n Berton,\n Gabriele and Trivigno,\n Gabriele and Caputo,\n Barbara and Masone,\n Carlo\n},\n title = {\n EigenPlaces: Training Viewpoint Robust Models for Visual Place Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11080-11090\n} \n}" }, { "title": "EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory Forecasting", @@ -18795,7 +19429,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Bae_2023_ICCV,\n \n author = {\n Bae,\n Inhwan and Oh,\n Jean and Jeon,\n Hae-Gon\n},\n 
title = {\n EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10017-10029\n} \n}" }, { "title": "ElasticViT: Conflict-aware Supernet Training for Deploying Fast Vision Transformer on Diverse Mobile Devices", @@ -18820,14 +19455,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tang_ElasticViT_Conflict-aware_Supernet_Training_for_Deploying_Fast_Vision_Transformer_on_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;1;1;1;0;1", - "aff_unique_norm": "Tsinghua University;Microsoft", + "aff_unique_norm": "Tsinghua University;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "THU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;1;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Chen and Zhang,\n Li Lyna and Jiang,\n Huiqiang and Xu,\n Jiahang and Cao,\n Ting and Zhang,\n Quanlu and Yang,\n Yuqing and Wang,\n Zhi and Yang,\n Mao\n},\n title = {\n ElasticViT: Conflict-aware Supernet Training for Deploying Fast Vision Transformer on Diverse Mobile Devices\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5829-5840\n} \n}" }, { "title": "EmoSet: A Large-scale Visual Emotion Dataset with Rich Attributes", @@ -18852,14 +19488,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_EmoSet_A_Large-scale_Visual_Emotion_Dataset_with_Rich_Attributes_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0+2;0", - 
"aff_unique_norm": "Shenzhen University;Hebrew University of Jerusalem;Tel Aviv University", + "aff_unique_norm": "Shenzhen University;The Hebrew University of Jerusalem;Tel Aviv University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.szu.edu.cn;https://www.huji.ac.il;https://www.tau.ac.il", "aff_unique_abbr": "SZU;HUJI;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0+1;0", - "aff_country_unique": "China;Israel" + "aff_country_unique": "China;Israel", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Jingyuan and Huang,\n Qirui and Ding,\n Tingting and Lischinski,\n Dani and Cohen-Or,\n Danny and Huang,\n Hui\n},\n title = {\n EmoSet: A Large-scale Visual Emotion Dataset with Rich Attributes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20383-20394\n} \n}" }, { "title": "EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation", @@ -18891,7 +19528,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Ziqiao and Wu,\n Haoyu and Song,\n Zhenbo and Xu,\n Hao and Zhu,\n Xiangyu and He,\n Jun and Liu,\n Hongyan and Fan,\n Zhaoxin\n},\n title = {\n EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20687-20697\n} \n}" }, { "title": "Emotional Listener Portrait: Neural Listener Head Generation with Emotion", @@ -18923,7 +19561,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1;1;0", - 
"aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Luchuan and Yin,\n Guojun and Jin,\n Zhenchao and Dong,\n Xiaoyi and Xu,\n Chenliang\n},\n title = {\n Emotional Listener Portrait: Neural Listener Head Generation with Emotion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20839-20849\n} \n}" }, { "title": "Empowering Low-Light Image Enhancer through Customized Learnable Priors", @@ -18935,7 +19574,7 @@ "author": "Naishan Zheng; Man Zhou; Yanmeng Dong; Xiangyu Rui; Jie Huang; Chongyi Li; Feng Zhao", "abstract": "Deep neural networks have achieved remarkable progress in enhancing low-light images by improving their brightness and eliminating noise. However, most existing methods construct end-to-end mapping networks heuristically, neglecting the intrinsic prior of image enhancement task and lacking transparency and interpretability. Although some unfolding solutions have been proposed to relieve these issues, they rely on proximal operator networks that deliver ambiguous and implicit priors. In this work, we propose a paradigm for low-light image enhancement that explores the potential of customized learnable priors to improve the transparency of the deep unfolding paradigm.Motivated by the powerful feature representation capability of Masked Autoencoder (MAE), we customize MAE-based illumination and noise priors and redevelop them from two perspectives: 1) structure flow: we train the MAE from a normal-light image to its illumination properties and then embed it into the proximal operator design of the unfolding architecture; and\n 2) optimization flow: we train MAE from a normal-light image to its gradient representation and then employ it as a regularization term to constrain noise in the model output. 
These designs improve the interpretability and representation capability of the model. Extensive experiments on multiple low-light image enhancement datasets demonstrate the superiority of our proposed paradigm over state-of-the-art methods. Code is available at https://github.com/zheng980629/CUE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zheng_Empowering_Low-Light_Image_Enhancer_through_Customized_Learnable_Priors_ICCV_2023_paper.pdf", - "aff": "University of Science and Technology of China; University of Science and Technology of China; ; Xi\u2019an Jiaotong University; University of Science and Technology of China; Nankai University; University of Science and Technology of China", + "aff": "University of Science and Technology of China; University of Science and Technology of China; ; Xi’an Jiaotong University; University of Science and Technology of China; Nankai University; University of Science and Technology of China", "project": "", "github": "https://github.com/zheng980629/CUE", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zheng_Empowering_Low-Light_Image_ICCV_2023_supplemental.pdf", @@ -18948,14 +19587,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_Empowering_Low-Light_Image_Enhancer_through_Customized_Learnable_Priors_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;2;0", - "aff_unique_norm": "University of Science and Technology of China;Xi'an Jiao Tong University;Nankai University", + "aff_unique_norm": "University of Science and Technology of China;Xi'an Jiaotong University;Nankai University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.xjtu.edu.cn;http://www.nankai.edu.cn", "aff_unique_abbr": "USTC;XJTU;NKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = 
{\n Zheng,\n Naishan and Zhou,\n Man and Dong,\n Yanmeng and Rui,\n Xiangyu and Huang,\n Jie and Li,\n Chongyi and Zhao,\n Feng\n},\n title = {\n Empowering Low-Light Image Enhancer through Customized Learnable Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12559-12569\n} \n}" }, { "title": "Encyclopedic VQA: Visual Questions About Detailed Properties of Fine-Grained Categories", @@ -18963,11 +19603,11 @@ "status": "Poster", "track": "main", "pid": "6900", - "author_site": "Thomas Mensink, Jasper Uijlings, Lluis Castrejon, Arushi Goel, Felipe Cadar, Howard Zhou, Fei Sha, Andr\u00e9 Araujo, Vittorio Ferrari", - "author": "Thomas Mensink; Jasper Uijlings; Lluis Castrejon; Arushi Goel; Felipe Cadar; Howard Zhou; Fei Sha; Andr\u00e9 Araujo; Vittorio Ferrari", + "author_site": "Thomas Mensink, Jasper Uijlings, Lluis Castrejon, Arushi Goel, Felipe Cadar, Howard Zhou, Fei Sha, André Araujo, Vittorio Ferrari", + "author": "Thomas Mensink; Jasper Uijlings; Lluis Castrejon; Arushi Goel; Felipe Cadar; Howard Zhou; Fei Sha; André Araujo; Vittorio Ferrari", "abstract": "We propose Encyclopedic-VQA, a large scale visual question answering (VQA) dataset featuring visual questions about detailed properties of fine-grained categories and instances. It contains 221k unique question+answer pairs each matched with (up to) 5 images, resulting in a total of 1M VQA samples. Moreover, our dataset comes with a controlled knowledge base derived from Wikipedia, marking the evidence to support each answer. Empirically, we show that our dataset poses a hard challenge for large vision+language models as they perform poorly on our dataset: PaLI [9] is state-of-the-art on OK-VQA [29], yet it only achieves 13.0% accuracy on our dataset. 
Moreover, we experimentally show that progress on answering our encyclopedic questions can be achieved by augmenting large models with a mechanism that retrieves relevant information for the knowledge base. An oracle experiment with perfect retrieval achieves 87.0% accuracy on the single-hop portion of our dataset, and an automatic retrieval-\n augmented prototype yields 48.8%. We believe that our dataset enables future research on retrieval-augmented vision+language models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Mensink_Encyclopedic_VQA_Visual_Questions_About_Detailed_Properties_of_Fine-Grained_Categories_ICCV_2023_paper.pdf", - "aff": "Google Research\u2020\u2217; Google Research\u2020\u2217; Google Research\u2217; Google Research\u2021; Google Research\u2021; Google Research\u2217; Google Research\u2217; Google Research\u2217; Google Research\u2217", + "aff": "Google Research†∗; Google Research†∗; Google Research∗; Google Research‡; Google Research‡; Google Research∗; Google Research∗; Google Research∗; Google Research∗", "project": "", "github": "https://github.com/google-research/google-research/tree/master/encyclopedic_vqa", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Mensink_Encyclopedic_VQA_Visual_ICCV_2023_supplemental.pdf", @@ -18987,7 +19627,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mensink_2023_ICCV,\n \n author = {\n Mensink,\n Thomas and Uijlings,\n Jasper and Castrejon,\n Lluis and Goel,\n Arushi and Cadar,\n Felipe and Zhou,\n Howard and Sha,\n Fei and Araujo,\n Andr\\'e and Ferrari,\n Vittorio\n},\n title = {\n Encyclopedic VQA: Visual Questions About Detailed Properties of Fine-Grained Categories\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3113-3124\n} \n}" }, { "title": "End-to-End Diffusion Latent Optimization Improves Classifier Guidance", @@ -19019,7 +19660,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wallace_2023_ICCV,\n \n author = {\n Wallace,\n Bram and Gokul,\n Akash and Ermon,\n Stefano and Naik,\n Nikhil\n},\n title = {\n End-to-End Diffusion Latent Optimization Improves Classifier Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7280-7290\n} \n}" }, { "title": "End-to-end 3D Tracking with Decoupled Queries", @@ -19042,7 +19684,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_End-to-end_3D_Tracking_with_Decoupled_Queries_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_End-to-end_3D_Tracking_with_Decoupled_Queries_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yanwei and Yu,\n Zhiding and Philion,\n Jonah and Anandkumar,\n Anima and Fidler,\n Sanja and Jia,\n Jiaya and Alvarez,\n Jose\n},\n title = {\n End-to-end 3D Tracking with Decoupled Queries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18302-18311\n} \n}" }, { "title": "End2End Multi-View Feature Matching with Differentiable Pose Optimization", @@ -19050,8 +19693,8 @@ "status": "Poster", "track": "main", "pid": "6069", - "author_site": "Barbara Roessle, Matthias Nie\u00dfner", - "author": "Barbara Roessle; Matthias Nie\u00dfner", + "author_site": "Barbara Roessle, Matthias Nießner", + 
"author": "Barbara Roessle; Matthias Nießner", "abstract": "Erroneous feature matches have severe impact on subsequent camera pose estimation and often require additional, time-costly measures, like RANSAC, for outlier rejection. Our method tackles this challenge by addressing feature matching and pose optimization jointly. To this end, we propose a graph attention network to predict image correspondences along with confidence weights. The resulting matches serve as weighted constraints in a differentiable pose estimation. Training feature matching with gradients from pose optimization naturally learns to down-weight outliers and boosts pose estimation on image pairs compared to SuperGlue by 6.7% on ScanNet. At the same time, it reduces the pose estimation time by over 50% and renders RANSAC iterations unnecessary. Moreover, we integrate information from multiple views by spanning the graph across multiple frames to predict the matches all at once. Multi-view matching combined with end-to-end training improves the pose estimation metrics on Matterport3D by 18.5% compared to SuperGlue.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Roessle_End2End_Multi-View_Feature_Matching_with_Differentiable_Pose_Optimization_ICCV_2023_paper.pdf", "aff": "Technical University of Munich; Technical University of Munich", @@ -19074,7 +19717,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Roessle_2023_ICCV,\n \n author = {\n Roessle,\n Barbara and Nie{\\ss\n}ner,\n Matthias\n},\n title = {\n End2End Multi-View Feature Matching with Differentiable Pose Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 477-487\n} \n}" }, { "title": "Energy-based Self-Training and Normalization for Unsupervised 
Domain Adaptation", @@ -19099,14 +19743,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Herath_Energy-based_Self-Training_and_Normalization_for_Unsupervised_Domain_Adaptation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0;0;0;0", - "aff_unique_norm": "Monash University;Agency for Science, Technology and Research;University of Adelaide;eBay Inc.", + "aff_unique_norm": "Monash University;Agency for Science, Technology and Research;The University of Adelaide;eBay Inc.", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.monash.edu;https://www.a-star.edu.sg;https://www.adelaide.edu.au;https://www.ebayinc.com", "aff_unique_abbr": "Monash;A*STAR;Adelaide;eBay", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0;0;0;0", - "aff_country_unique": "Australia;Singapore;United States" + "aff_country_unique": "Australia;Singapore;United States", + "bibtex": "@InProceedings{Herath_2023_ICCV,\n \n author = {\n Herath,\n Samitha and Fernando,\n Basura and Abbasnejad,\n Ehsan and Hayat,\n Munawar and Khadivi,\n Shahram and Harandi,\n Mehrtash and Rezatofighi,\n Hamid and Haffari,\n Gholamreza\n},\n title = {\n Energy-based Self-Training and Normalization for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11653-11662\n} \n}" }, { "title": "Enhanced Meta Label Correction for Coping with Label Corruption", @@ -19118,7 +19763,7 @@ "author": "Mitchell Keren Taraday; Chaim Baskin", "abstract": "Deep Neural Networks (DNNs) have revolutionized visual classification tasks over the last decade.\n The training phase of deep-learning-based algorithms, however, often requires a vast amount of reliable annotated data.\n While reliability collecting such amount of labeled data usually yields to an exhaustive, expensive process,\n for many applications, 
acquiring massive datasets with imperfect annotations is straightforward.\n For instance, crawling search engines and online websites can generate a boatload amount of noisy labeled data. Hence, solving the problem of learning with noisy labels (LNL) is of paramount importance.\n Traditional LNL methods have successfully handled datasets with artificially injected noise, but they still fall short of adequately handling real-world noise. With the increasing use of meta-learning in the diverse fields of machine learning, researchers have tried to leverage auxiliary small clean datasets to meta-correct the training labels. Nonetheless, existing meta-label correction approaches are not fully exploiting their potential. In this study, we propose EMLC, an enhanced meta-label correction approach for the LNL problem.\n We re-examine the meta-learning process and introduce faster and more accurate meta-gradient derivations. We propose a novel teacher architecture tailored explicitly for the LNL problem, equipped with novel training objectives.\n EMLC outperforms prior approaches and achieves state-of-the-art results in all standard benchmarks.\n Notably, EMLC enhances the previous art on the noisy real-world dataset Clothing1M by 0.87%. 
\n Our publicly available code can be found at the following link: https://github.com/iccv23anonymous/Enhanced-Meta-Label-Correction", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Taraday_Enhanced_Meta_Label_Correction_for_Coping_with_Label_Corruption_ICCV_2023_paper.pdf", - "aff": "Technion \u2013 Israel Institute of Technology; Technion \u2013 Israel Institute of Technology", + "aff": "Technion – Israel Institute of Technology; Technion – Israel Institute of Technology", "project": "https://sites.google.com/view/emlc-paper", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Taraday_Enhanced_Meta_Label_ICCV_2023_supplemental.pdf", @@ -19131,14 +19776,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Taraday_Enhanced_Meta_Label_Correction_for_Coping_with_Label_Corruption_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Technion \u2013 Israel Institute of Technology", + "aff_unique_norm": "Technion – Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Taraday_2023_ICCV,\n \n author = {\n Taraday,\n Mitchell Keren and Baskin,\n Chaim\n},\n title = {\n Enhanced Meta Label Correction for Coping with Label Corruption\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16295-16304\n} \n}" }, { "title": "Enhanced Soft Label for Semi-Supervised Semantic Segmentation", @@ -19170,7 +19816,8 @@ "aff_campus_unique_index": "0;0;0;0+1;0+1", "aff_campus_unique": "Guangzhou;Shenzhen", "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + 
"bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Jie and Wang,\n Chuan and Liu,\n Yang and Lin,\n Liang and Li,\n Guanbin\n},\n title = {\n Enhanced Soft Label for Semi-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1185-1195\n} \n}" }, { "title": "Enhancing Adversarial Robustness in Low-Label Regime via Adaptively Weighted Regularization and Knowledge Distillation", @@ -19202,7 +19849,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Dongyoon and Kong,\n Insung and Kim,\n Yongdai\n},\n title = {\n Enhancing Adversarial Robustness in Low-Label Regime via Adaptively Weighted Regularization and Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4552-4561\n} \n}" }, { "title": "Enhancing Fine-Tuning Based Backdoor Defense with Sharpness-Aware Minimization", @@ -19227,14 +19875,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_Enhancing_Fine-Tuning_Based_Backdoor_Defense_with_Sharpness-Aware_Minimization_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "Chinese University of Hong Kong, Shenzhen;JD;Tencent", - "aff_unique_dep": "School of Data Science;JD Explore Academy;Tencent AI Lab", + "aff_unique_norm": "The Chinese University of Hong Kong, Shenzhen;JD Explore Academy;Tencent", + "aff_unique_dep": "School of Data Science;;Tencent AI Lab", "aff_unique_url": "https://www.cuhk.edu.cn/en/shenzhen;;https://ai.tencent.com", "aff_unique_abbr": "CUHK-Shenzhen;;Tencent AI Lab", "aff_campus_unique_index": "0;0;0", 
"aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Mingli and Wei,\n Shaokui and Shen,\n Li and Fan,\n Yanbo and Wu,\n Baoyuan\n},\n title = {\n Enhancing Fine-Tuning Based Backdoor Defense with Sharpness-Aware Minimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4466-4477\n} \n}" }, { "title": "Enhancing Generalization of Universal Adversarial Perturbation through Gradient Aggregation", @@ -19266,7 +19915,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xuannan and Zhong,\n Yaoyao and Zhang,\n Yuhang and Qin,\n Lixiong and Deng,\n Weihong\n},\n title = {\n Enhancing Generalization of Universal Adversarial Perturbation through Gradient Aggregation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4435-4444\n} \n}" }, { "title": "Enhancing Modality-Agnostic Representations via Meta-Learning for Brain Tumor Segmentation", @@ -19298,7 +19948,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Konwer_2023_ICCV,\n \n author = {\n Konwer,\n Aishik and Hu,\n Xiaoling and Bae,\n Joseph and Xu,\n Xuan and Chen,\n Chao and Prasanna,\n Prateek\n},\n title = {\n Enhancing Modality-Agnostic Representations via Meta-Learning for Brain Tumor Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21415-21425\n} \n}" }, { "title": "Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer with Mixture-of-View-Experts", @@ -19330,7 +19981,8 @@ "aff_campus_unique_index": "0;0+1;0;0;0;0+2;0;0", "aff_campus_unique": "Austin;Cambridge;Madras", "aff_country_unique_index": "0;0+1;0;0;0;0+2;0;0", - "aff_country_unique": "United States;United Kingdom;India" + "aff_country_unique": "United States;United Kingdom;India", + "bibtex": "@InProceedings{Cong_2023_ICCV,\n \n author = {\n Cong,\n Wenyan and Liang,\n Hanxue and Wang,\n Peihao and Fan,\n Zhiwen and Chen,\n Tianlong and Varma,\n Mukund and Wang,\n Yi and Wang,\n Zhangyang\n},\n title = {\n Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer with Mixture-of-View-Experts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3193-3204\n} \n}" }, { "title": "Enhancing Non-line-of-sight Imaging via Learnable Inverse Kernel and Attention Mechanisms", @@ -19353,7 +20005,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_Enhancing_Non-line-of-sight_Imaging_via_Learnable_Inverse_Kernel_and_Attention_Mechanisms_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_Enhancing_Non-line-of-sight_Imaging_via_Learnable_Inverse_Kernel_and_Attention_Mechanisms_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Yanhua and Shen,\n Siyuan and Wang,\n Zi and Huang,\n Binbin and Wang,\n Yuehan and Peng,\n Xingyue and Xia,\n Suan and Liu,\n Ping and Li,\n Ruiqian and Li,\n Shiying\n},\n title = {\n Enhancing Non-line-of-sight Imaging via Learnable Inverse Kernel and Attention Mechanisms\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10563-10573\n} \n}" }, { "title": "Enhancing Privacy Preservation in Federated Learning via Learning Rate Perturbation", @@ -19385,7 +20038,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wan_2023_ICCV,\n \n author = {\n Wan,\n Guangnian and Du,\n Haitao and Yuan,\n Xuejing and Yang,\n Jun and Chen,\n Meiling and Xu,\n Jie\n},\n title = {\n Enhancing Privacy Preservation in Federated Learning via Learning Rate Perturbation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4772-4781\n} \n}" }, { "title": "Enhancing Sample Utilization through Sample Adaptive Augmentation in Semi-Supervised Learning", @@ -19417,7 +20071,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Gui_2023_ICCV,\n \n author = {\n Gui,\n Guan and Zhao,\n Zhen and Qi,\n Lei and Zhou,\n Luping and Wang,\n Lei and Shi,\n Yinghuan\n},\n title = {\n Enhancing Sample Utilization through Sample Adaptive Augmentation in Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15880-15889\n} \n}" }, { "title": "Environment Agnostic Representation for Visual Reinforcement Learning", @@ -19449,7 +20104,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": 
"@InProceedings{Choi_2023_ICCV,\n \n author = {\n Choi,\n Hyesong and Lee,\n Hunsang and Jeong,\n Seongwon and Min,\n Dongbo\n},\n title = {\n Environment Agnostic Representation for Visual Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 263-273\n} \n}" }, { "title": "Environment-Invariant Curriculum Relation Learning for Fine-Grained Scene Graph Generation", @@ -19481,7 +20137,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Min_2023_ICCV,\n \n author = {\n Min,\n Yukuan and Wu,\n Aming and Deng,\n Cheng\n},\n title = {\n Environment-Invariant Curriculum Relation Learning for Fine-Grained Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13296-13307\n} \n}" }, { "title": "Equivariant Similarity for Vision-Language Foundation Models", @@ -19506,14 +20163,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Equivariant_Similarity_for_Vision-Language_Foundation_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;0;1;1", - "aff_unique_norm": "Nanyang Technological University;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "Nanyang Technological University;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.sg;https://www.microsoft.com", "aff_unique_abbr": "NTU;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0;1;1", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Tan 
and Lin,\n Kevin and Li,\n Linjie and Lin,\n Chung-Ching and Yang,\n Zhengyuan and Zhang,\n Hanwang and Liu,\n Zicheng and Wang,\n Lijuan\n},\n title = {\n Equivariant Similarity for Vision-Language Foundation Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11998-12008\n} \n}" }, { "title": "Erasing Concepts from Diffusion Models", @@ -19545,7 +20203,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gandikota_2023_ICCV,\n \n author = {\n Gandikota,\n Rohit and Materzynska,\n Joanna and Fiotto-Kaufman,\n Jaden and Bau,\n David\n},\n title = {\n Erasing Concepts from Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2426-2436\n} \n}" }, { "title": "Essential Matrix Estimation using Convex Relaxations in Orthogonal Space", @@ -19577,7 +20236,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Karimian_2023_ICCV,\n \n author = {\n Karimian,\n Arman and Tron,\n Roberto\n},\n title = {\n Essential Matrix Estimation using Convex Relaxations in Orthogonal Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17142-17152\n} \n}" }, { "title": "Estimator Meets Equilibrium Perspective: A Rectified Straight Through Estimator for Binary Neural Networks Training", @@ -19609,7 +20269,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0+0;0;0;0+0+0+0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Xiao-Ming and Zheng,\n Dian and Liu,\n Zuhao and Zheng,\n Wei-Shi\n},\n title = {\n Estimator Meets Equilibrium Perspective: A Rectified Straight Through Estimator for Binary Neural Networks Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17055-17064\n} \n}" }, { "title": "Eulerian Single-Photon Vision", @@ -19641,7 +20302,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gupta_2023_ICCV,\n \n author = {\n Gupta,\n Shantanu and Gupta,\n Mohit\n},\n title = {\n Eulerian Single-Photon Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10465-10476\n} \n}" }, { "title": "Evaluating Data Attribution for Text-to-Image Models", @@ -19664,7 +20326,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Evaluating_Data_Attribution_for_Text-to-Image_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Evaluating_Data_Attribution_for_Text-to-Image_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Sheng-Yu and Efros,\n Alexei A. 
and Zhu,\n Jun-Yan and Zhang,\n Richard\n},\n title = {\n Evaluating Data Attribution for Text-to-Image Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7192-7203\n} \n}" }, { "title": "Evaluation and Improvement of Interpretability for Self-Explainable Part-Prototype Networks", @@ -19696,7 +20359,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Qihan and Xue,\n Mengqi and Huang,\n Wenqi and Zhang,\n Haofei and Song,\n Jie and Jing,\n Yongcheng and Song,\n Mingli\n},\n title = {\n Evaluation and Improvement of Interpretability for Self-Explainable Part-Prototype Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2011-2020\n} \n}" }, { "title": "Event Camera Data Pre-training", @@ -19728,7 +20392,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";BIT", "aff_country_unique_index": "0;1;2", - "aff_country_unique": "Australia;India;China" + "aff_country_unique": "Australia;India;China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yan and Pan,\n Liyuan and Liu,\n Liu\n},\n title = {\n Event Camera Data Pre-training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10699-10709\n} \n}" }, { "title": "Event-Guided Procedure Planning from Instructional Videos with Text Supervision", @@ -19760,7 +20425,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", 
+ "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n An-Lan and Lin,\n Kun-Yu and Du,\n Jia-Run and Meng,\n Jingke and Zheng,\n Wei-Shi\n},\n title = {\n Event-Guided Procedure Planning from Instructional Videos with Text Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13565-13575\n} \n}" }, { "title": "Event-based Temporally Dense Optical Flow Estimation with Sequential Learning", @@ -19783,7 +20449,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ponghiran_Event-based_Temporally_Dense_Optical_Flow_Estimation_with_Sequential_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ponghiran_Event-based_Temporally_Dense_Optical_Flow_Estimation_with_Sequential_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ponghiran_2023_ICCV,\n \n author = {\n Ponghiran,\n Wachirawit and Liyanagedera,\n Chamika Mihiranga and Roy,\n Kaushik\n},\n title = {\n Event-based Temporally Dense Optical Flow Estimation with Sequential Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9827-9836\n} \n}" }, { "title": "Eventful Transformers: Leveraging Temporal Redundancy in Vision Transformers", @@ -19795,7 +20462,7 @@ "author": "Matthew Dutson; Yin Li; Mohit Gupta", "abstract": "Vision Transformers achieve impressive accuracy across a range of visual recognition tasks. Unfortunately, their accuracy frequently comes with high computational costs. This is a particular issue in video recognition, where models are often applied repeatedly across frames or temporal chunks. In this work, we exploit temporal redundancy between subsequent inputs to reduce the cost of Transformers for video processing. 
We describe a method for identifying and re-processing only those tokens that have changed significantly over time. Our proposed family of models, Eventful Transformers, can be converted from existing Transformers (often without any re-training) and give adaptive control over the compute cost at runtime. We evaluate our method on large-scale datasets for video object detection (ImageNet VID) and action recognition (EPIC-Kitchens 100). Our approach leads to significant computational savings (on the order of 2-4x) with only minor reductions in accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Dutson_Eventful_Transformers_Leveraging_Temporal_Redundancy_in_Vision_Transformers_ICCV_2023_paper.pdf", - "aff": "University of Wisconsin\u2013Madison; University of Wisconsin\u2013Madison; University of Wisconsin\u2013Madison", + "aff": "University of Wisconsin–Madison; University of Wisconsin–Madison; University of Wisconsin–Madison", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Dutson_Eventful_Transformers_Leveraging_ICCV_2023_supplemental.pdf", @@ -19808,14 +20475,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dutson_Eventful_Transformers_Leveraging_Temporal_Redundancy_in_Vision_Transformers_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Wisconsin\u2013Madison", + "aff_unique_norm": "University of Wisconsin–Madison", "aff_unique_dep": "", "aff_unique_url": "https://www.wisc.edu", - "aff_unique_abbr": "UW\u2013Madison", + "aff_unique_abbr": "UW–Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dutson_2023_ICCV,\n \n author = {\n Dutson,\n Matthew and Li,\n Yin and Gupta,\n Mohit\n},\n title = {\n Eventful Transformers: Leveraging Temporal Redundancy in 
Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16911-16923\n} \n}" }, { "title": "EverLight: Indoor-Outdoor Editable HDR Lighting Estimation", @@ -19823,11 +20491,11 @@ "status": "Poster", "track": "main", "pid": "7070", - "author_site": "Mohammad Reza Karimi Dastjerdi, Jonathan Eisenmann, Yannick Hold-Geoffroy, Jean-Fran\u00e7ois Lalonde", - "author": "Mohammad Reza Karimi Dastjerdi; Jonathan Eisenmann; Yannick Hold-Geoffroy; Jean-Fran\u00e7ois Lalonde", + "author_site": "Mohammad Reza Karimi Dastjerdi, Jonathan Eisenmann, Yannick Hold-Geoffroy, Jean-François Lalonde", + "author": "Mohammad Reza Karimi Dastjerdi; Jonathan Eisenmann; Yannick Hold-Geoffroy; Jean-François Lalonde", "abstract": "Because of the diversity in lighting environments, existing illumination estimation techniques have been designed explicitly on indoor or outdoor environments. Methods have focused specifically on capturing accurate energy (e.g., through parametric lighting models), which emphasizes shading and strong cast shadows; or producing plausible texture (e.g., with GANs), which prioritizes plausible reflections. Approaches which provide editable lighting capabilities have been proposed, but these tend to be with simplified lighting models, offering limited realism. In this work, we propose to bridge the gap between these recent trends in the literature, and propose a method which combines a parametric light model with 360deg panoramas, ready to use as HDRI in rendering engines. We leverage recent advances in GAN-based LDR panorama extrapolation from a regular image, which we extend to HDR using parametric spherical gaussians. 
To achieve this, we introduce a novel lighting co-modulation method that injects lighting-related features throughout the generator, tightly coupling the original or edited scene illumination within the panorama generation process. In our representation, users can easily edit light direction, intensity, number, etc. to impact shading while providing rich, complex reflections while seamlessly blending with the edits. Furthermore, our method encompasses indoor and outdoor environments, demonstrating state-of-the-art results even when compared to domain-specific methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Dastjerdi_EverLight_Indoor-Outdoor_Editable_HDR_Lighting_Estimation_ICCV_2023_paper.pdf", - "aff": "Universit\u00b8e Laval+Adobe; Adobe; Adobe; Universit\u00b8e Laval", + "aff": "Universit¸e Laval+Adobe; Adobe; Adobe; Universit¸e Laval", "project": "https://lvsn.github.io/everlight/", "github": "", "supp": "", @@ -19840,14 +20508,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dastjerdi_EverLight_Indoor-Outdoor_Editable_HDR_Lighting_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;0", - "aff_unique_norm": "Universit\u00e9 Laval;Adobe", - "aff_unique_dep": ";Adobe Inc.", + "aff_unique_norm": "Université Laval;Adobe Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ulaval.ca;https://www.adobe.com", "aff_unique_abbr": "UL;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;0", - "aff_country_unique": "Canada;United States" + "aff_country_unique": "Canada;United States", + "bibtex": "@InProceedings{Dastjerdi_2023_ICCV,\n \n author = {\n Dastjerdi,\n Mohammad Reza Karimi and Eisenmann,\n Jonathan and Hold-Geoffroy,\n Yannick and Lalonde,\n Jean-Fran\\c{c\n}ois\n},\n title = {\n EverLight: Indoor-Outdoor Editable HDR Lighting Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7420-7429\n} \n}" }, { "title": "ExBluRF: Efficient Radiance Fields for Extreme Motion Blurred Images", @@ -19879,7 +20548,8 @@ "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Dongwoo and Oh,\n Jeongtaek and Rim,\n Jaesung and Cho,\n Sunghyun and Lee,\n Kyoung Mu\n},\n title = {\n ExBluRF: Efficient Radiance Fields for Extreme Motion Blurred Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17639-17648\n} \n}" }, { "title": "Examining Autoexposure for Challenging Scenes", @@ -19911,7 +20581,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Tedla_2023_ICCV,\n \n author = {\n Tedla,\n SaiKiran and Yang,\n Beixuan and Brown,\n Michael S.\n},\n title = {\n Examining Autoexposure for Challenging Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13076-13085\n} \n}" }, { "title": "Exemplar-Free Continual Transformer with Convolutions", @@ -19936,14 +20607,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Roy_Exemplar-Free_Continual_Transformer_with_Convolutions_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;0;0", - "aff_unique_norm": "Indian Institute of Technology Kharagpur;Amazon;Indian Institute of Science Education and Research", + "aff_unique_norm": "Indian Institute of Technology Kharagpur;Amazon India;Indian Institute of Science Education and Research", 
"aff_unique_dep": ";IML;", "aff_unique_url": "https://www.iitkgp.ac.in;https://www.amazon.in;https://www.iiserkol.ac.in", "aff_unique_abbr": "IIT KGP;Amazon India;IISER", "aff_campus_unique_index": "0;0;2;0;0", "aff_campus_unique": "Kharagpur;;Kolkata", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Roy_2023_ICCV,\n \n author = {\n Roy,\n Anurag and Verma,\n Vinay K. and Voonna,\n Sravan and Ghosh,\n Kripabandhu and Ghosh,\n Saptarshi and Das,\n Abir\n},\n title = {\n Exemplar-Free Continual Transformer with Convolutions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5897-5907\n} \n}" }, { "title": "Explaining Adversarial Robustness of Neural Networks from Clustering Effect Perspective", @@ -19975,7 +20647,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hangzhou", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jin_2023_ICCV,\n \n author = {\n Jin,\n Yulin and Zhang,\n Xiaoyu and Lou,\n Jian and Ma,\n Xu and Wang,\n Zilong and Chen,\n Xiaofeng\n},\n title = {\n Explaining Adversarial Robustness of Neural Networks from Clustering Effect Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4522-4531\n} \n}" }, { "title": "Explicit Motion Disentangling for Efficient Optical Flow Estimation", @@ -20007,7 +20680,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Macau SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n Changxing and Luo,\n Ao and Huang,\n Haibin and Ma,\n Shaodan and Liu,\n Jiangyu and 
Liu,\n Shuaicheng\n},\n title = {\n Explicit Motion Disentangling for Efficient Optical Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9521-9530\n} \n}" }, { "title": "Exploiting Proximity-Aware Tasks for Embodied Social Navigation", @@ -20039,7 +20713,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1+1+1;0", - "aff_country_unique": "Italy;Canada" + "aff_country_unique": "Italy;Canada", + "bibtex": "@InProceedings{Cancelli_2023_ICCV,\n \n author = {\n Cancelli,\n Enrico and Campari,\n Tommaso and Serafini,\n Luciano and Chang,\n Angel X. and Ballan,\n Lamberto\n},\n title = {\n Exploiting Proximity-Aware Tasks for Embodied Social Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10957-10967\n} \n}" }, { "title": "Explore and Tell: Embodied Visual Captioning in 3D Environments", @@ -20071,7 +20746,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Anwen and Chen,\n Shizhe and Zhang,\n Liang and Jin,\n Qin\n},\n title = {\n Explore and Tell: Embodied Visual Captioning in 3D Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2482-2491\n} \n}" }, { "title": "Exploring Group Video Captioning with Efficient Relational Approximation", @@ -20103,7 +20779,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Wang and Jin,\n Tao and Wang,\n Ye and Pan,\n Wenwen and Li,\n Linjun and Cheng,\n Xize and Zhao,\n Zhou\n},\n title = {\n Exploring Group Video Captioning with Efficient Relational Approximation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15281-15290\n} \n}" }, { "title": "Exploring Lightweight Hierarchical Vision Transformers for Efficient Visual Tracking", @@ -20128,14 +20805,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kang_Exploring_Lightweight_Hierarchical_Vision_Transformers_for_Efficient_Visual_Tracking_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1;1;0", - "aff_unique_norm": "Dalian University of Technology;Microsoft", + "aff_unique_norm": "Dalian University of Technology;Microsoft Corporation", "aff_unique_dep": "School of Information and Communication Engineering;Microsoft Research", "aff_unique_url": "http://en.dlut.edu.cn/;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "DUT;MSR", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Dalian;", "aff_country_unique_index": "0;0;0+1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Kang_2023_ICCV,\n \n author = {\n Kang,\n Ben and Chen,\n Xin and Wang,\n Dong and Peng,\n Houwen and Lu,\n Huchuan\n},\n title = {\n Exploring Lightweight Hierarchical Vision Transformers for Efficient Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9612-9621\n} \n}" }, { "title": "Exploring Model Transferability through the Lens of Potential Energy", @@ -20160,14 +20838,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Li_Exploring_Model_Transferability_through_the_Lens_of_Potential_Energy_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;2;2;0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Tencent", - "aff_unique_dep": "School of Computer Science;Peng Cheng Laboratory;ARC Lab", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Tencent PCG", + "aff_unique_dep": "School of Computer Science;;ARC Lab", "aff_unique_url": "http://www.pku.edu.cn;;https://www.tencent.com", "aff_unique_abbr": "PKU;;Tencent", "aff_campus_unique_index": "0+1;0;0;0;0+1", "aff_campus_unique": "Beijing;Shenzhen", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiaotong and Hu,\n Zixuan and Ge,\n Yixiao and Shan,\n Ying and Duan,\n Ling-Yu\n},\n title = {\n Exploring Model Transferability through the Lens of Potential Energy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5429-5438\n} \n}" }, { "title": "Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection", @@ -20192,14 +20871,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Exploring_Object-Centric_Temporal_Modeling_for_Efficient_Multi-View_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;1", - "aff_unique_norm": "Beijing Institute of Technology;Megvii Technology", + "aff_unique_norm": "Beijing Institute of Technology;MEGVII Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.bit.edu.cn/;https://www.megvii.com", "aff_unique_abbr": "BIT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Shihao and Liu,\n Yingfei and Wang,\n Tiancai and Li,\n Ying and Zhang,\n Xiangyu\n},\n title = {\n Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3621-3631\n} \n}" }, { "title": "Exploring Open-Vocabulary Semantic Segmentation from CLIP Vision Encoder Distillation Only", @@ -20224,14 +20904,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Exploring_Open-Vocabulary_Semantic_Segmentation_from_CLIP_Vision_Encoder_Distillation_Only_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;1;1;1;1;0+1", - "aff_unique_norm": "King Abdullah University of Science and Technology;Meta", + "aff_unique_norm": "King Abdullah University of Science and Technology;Meta Platforms, Inc.", "aff_unique_dep": ";Meta AI Research", "aff_unique_url": "https://www.kaust.edu.sa;https://meta.com", "aff_unique_abbr": "KAUST;Meta AI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;1;1;1;1;0+1", - "aff_country_unique": "Saudi Arabia;United States" + "aff_country_unique": "Saudi Arabia;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Jun and Zhu,\n Deyao and Qian,\n Guocheng and Ghanem,\n Bernard and Yan,\n Zhicheng and Zhu,\n Chenchen and Xiao,\n Fanyi and Culatana,\n Sean Chang and Elhoseiny,\n Mohamed\n},\n title = {\n Exploring Open-Vocabulary Semantic Segmentation from CLIP Vision Encoder Distillation Only\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 699-710\n} \n}" }, { "title": "Exploring Positional Characteristics of Dual-Pixel Data for Camera Autofocus", @@ -20256,14 +20937,15 @@ 
"author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Choi_Exploring_Positional_Characteristics_of_Dual-Pixel_Data_for_Camera_Autofocus_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Samsung", - "aff_unique_dep": "Samsung Advanced Institute of Technology", + "aff_unique_norm": "Samsung Advanced Institute of Technology", + "aff_unique_dep": "", "aff_unique_url": "https://www.sait.samsung.com", "aff_unique_abbr": "SAIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choi_2023_ICCV,\n \n author = {\n Choi,\n Myungsub and Lee,\n Hana and Lee,\n Hyong-euk\n},\n title = {\n Exploring Positional Characteristics of Dual-Pixel Data for Camera Autofocus\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13158-13168\n} \n}" }, { "title": "Exploring Predicate Visual Context in Detecting of Human-Object Interactions", @@ -20275,7 +20957,7 @@ "author": "Frederic Z Zhang; Yuhui Yuan; Dylan Campbell; Zhuoyao Zhong; Stephen Gould", "abstract": "Recently, the DETR framework has emerged as the dominant approach for human--object interaction (HOI) research. In particular, two-stage transformer-based HOI detectors are amongst the most performant and training-efficient approaches. However, these often condition HOI classification on object features that lack fine-grained contextual information, eschewing pose and orientation information in favour of visual cues about object identity and box extremities. This naturally hinders the recognition of complex or ambiguous interactions. In this work, we study these issues through visualisations and carefully designed experiments. Accordingly, we investigate how best to re-introduce image features via cross-attention. 
With an improved query design, extensive exploration of keys and values, and box pair positional embeddings as spatial guidance, our model with enhanced predicate visual context (PViC) outperforms state-of-the-art methods on the HICO-DET and V-COCO benchmarks, while maintaining low training cost.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_Exploring_Predicate_Visual_Context_in_Detecting_of_Human-Object_Interactions_ICCV_2023_paper.pdf", - "aff": "The Australian National University\u2020; Microsoft Research Asia\u2020; The Australian National University; Microsoft Research Asia; The Australian National University\u2020", + "aff": "The Australian National University†; Microsoft Research Asia†; The Australian National University; Microsoft Research Asia; The Australian National University†", "project": "", "github": "https://github.com/fredzzhang/pvic", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhang_Exploring_Predicate_Visual_ICCV_2023_supplemental.pdf", @@ -20287,15 +20969,16 @@ "email": "anu.edu.au;microsoft.com; ; ; ", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Exploring_Predicate_Visual_Context_in_Detecting_of_Human-Object_Interactions_ICCV_2023_paper.html", - "aff_unique_index": "0;1;0;1;0", - "aff_unique_norm": "Australian National University;Microsoft", - "aff_unique_dep": ";Microsoft Research Asia", - "aff_unique_url": "https://www.anu.edu.au;https://www.microsoft.com/en-us/research/group/asia", - "aff_unique_abbr": "ANU;MSRA", + "aff_unique_index": "0;1;0;2;0", + "aff_unique_norm": "Australian National University;Microsoft Research Asia;Microsoft Research", + "aff_unique_dep": ";Microsoft Research;Research", + "aff_unique_url": "https://www.anu.edu.au;https://www.microsoft.com/en-us/research/group/asia;https://www.microsoft.com/en-us/research/group/asia", + "aff_unique_abbr": "ANU;MSRA;MSR Asia", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", 
"aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Frederic Z and Yuan,\n Yuhui and Campbell,\n Dylan and Zhong,\n Zhuoyao and Gould,\n Stephen\n},\n title = {\n Exploring Predicate Visual Context in Detecting of Human-Object Interactions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10411-10421\n} \n}" }, { "title": "Exploring Temporal Concurrency for Video-Language Representation Learning", @@ -20327,7 +21010,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0;0+0;0+0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Heng and Liu,\n Daqing and Lv,\n Zezhong and Su,\n Bing and Tao,\n Dacheng\n},\n title = {\n Exploring Temporal Concurrency for Video-Language Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15568-15578\n} \n}" }, { "title": "Exploring Temporal Frequency Spectrum in Deep Video Deblurring", @@ -20359,7 +21043,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Qi and Zhou,\n Man and Zheng,\n Naishan and Li,\n Chongyi and Huang,\n Jie and Zhao,\n Feng\n},\n title = {\n Exploring Temporal Frequency Spectrum in Deep Video Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
12428-12437\n} \n}" }, { "title": "Exploring Transformers for Open-world Instance Segmentation", @@ -20384,14 +21069,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Exploring_Transformers_for_Open-world_Instance_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;1;0+3", - "aff_unique_norm": "University of Hong Kong;ByteDance;Dalian University of Technology;Shanghai AI Laboratory", + "aff_unique_norm": "The University of Hong Kong;ByteDance;Dalian University of Technology;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hku.hk;https://www.bytedance.com;http://www.dlut.edu.cn/;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "HKU;ByteDance;DUT;SAIL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Jiannan and Jiang,\n Yi and Yan,\n Bin and Lu,\n Huchuan and Yuan,\n Zehuan and Luo,\n Ping\n},\n title = {\n Exploring Transformers for Open-world Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6611-6621\n} \n}" }, { "title": "Exploring Video Quality Assessment on User Generated Contents from Aesthetic and Technical Perspectives", @@ -20416,14 +21102,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Exploring_Video_Quality_Assessment_on_User_Generated_Contents_from_Aesthetic_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;1;1;0", - "aff_unique_norm": "Nanyang Technological University;SenseTime Research", + "aff_unique_norm": "Nanyang Technological University;Sensetime Research", "aff_unique_dep": "S-Lab;Research", "aff_unique_url": "https://www.ntu.edu.sg;https://www.sensetime.com/", "aff_unique_abbr": 
"NTU;SenseTime", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Haoning and Zhang,\n Erli and Liao,\n Liang and Chen,\n Chaofeng and Hou,\n Jingwen and Wang,\n Annan and Sun,\n Wenxiu and Yan,\n Qiong and Lin,\n Weisi\n},\n title = {\n Exploring Video Quality Assessment on User Generated Contents from Aesthetic and Technical Perspectives\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20144-20154\n} \n}" }, { "title": "Exploring the Benefits of Visual Prompting in Differential Privacy", @@ -20435,7 +21122,7 @@ "author": "Yizhe Li; Yu-Lin Tsai; Chia-Mu Yu; Pin-Yu Chen; Xuebin Ren", "abstract": "Visual Prompting (VP) is an emerging and powerful technique that allows sample-efficient adaptation to downstream tasks by engineering a well-trained frozen source model. In this work, we explore the benefits of VP in constructing compelling neural network classifiers with differential privacy (DP). We explore and integrate VP into canonical DP training methods and demonstrate its simplicity and efficiency. In particular, we discover that VP in tandem with PATE, a state-of-the-art DP training method that leverages the knowledge transfer from an ensemble of teachers, achieves the state-of-the-art privacy-utility trade-off with minimum expenditure of privacy budget. Moreover, we conduct additional experiments on cross-domain image classification with a sufficient domain gap to further unveil the advantage of VP in DP. 
Lastly, we also conduct extensive ablation studies to validate the effectiveness and contribution of VP under DP consideration.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_Exploring_the_Benefits_of_Visual_Prompting_in_Differential_Privacy_ICCV_2023_paper.pdf", - "aff": "School of Computer Science and Technology, Xi\u2019an Jiaotong University; National Yang Ming Chiao Tung University; National Yang Ming Chiao Tung University; IBM Research; School of Computer Science and Technology, Xi\u2019an Jiaotong University", + "aff": "School of Computer Science and Technology, Xi’an Jiaotong University; National Yang Ming Chiao Tung University; National Yang Ming Chiao Tung University; IBM Research; School of Computer Science and Technology, Xi’an Jiaotong University", "project": "", "github": "https://github.com/EzzzLi/Prompt-PATE", "supp": "", @@ -20448,14 +21135,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Exploring_the_Benefits_of_Visual_Prompting_in_Differential_Privacy_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;0", - "aff_unique_norm": "Xi'an Jiao Tong University;National Yang Ming Chiao Tung University;IBM", + "aff_unique_norm": "Xi'an Jiaotong University;National Yang Ming Chiao Tung University;IBM", "aff_unique_dep": "School of Computer Science and Technology;;IBM Research", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.nycu.edu.tw;https://www.ibm.com/research", "aff_unique_abbr": "XJTU;NYCU;IBM", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Xi'an;Taiwan;", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yizhe and Tsai,\n Yu-Lin and Yu,\n Chia-Mu and Chen,\n Pin-Yu and Ren,\n Xuebin\n},\n title = {\n Exploring the Benefits of Visual Prompting in Differential Privacy\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5158-5167\n} \n}" }, { "title": "Exploring the Sim2Real Gap Using Digital Twins", @@ -20480,14 +21168,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sudhakar_Exploring_the_Sim2Real_Gap_Using_Digital_Twins_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1", - "aff_unique_norm": "Columbia University;Microsoft", + "aff_unique_norm": "Columbia University;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.columbia.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Columbia;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sudhakar_2023_ICCV,\n \n author = {\n Sudhakar,\n Sruthi and Hanzelka,\n Jon and Bobillot,\n Josh and Randhavane,\n Tanmay and Joshi,\n Neel and Vineet,\n Vibhav\n},\n title = {\n Exploring the Sim2Real Gap Using Digital Twins\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20418-20427\n} \n}" }, { "title": "ExposureDiffusion: Learning to Expose for Low-light Image Enhancement", @@ -20512,14 +21201,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_ExposureDiffusion_Learning_to_Expose_for_Low-light_Image_Enhancement_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;2;0;0", - "aff_unique_norm": "Nanyang Technological University;Pengcheng Laboratory;Hong Kong Polytechnic University", - "aff_unique_dep": ";Peng Cheng Laboratory;", + "aff_unique_norm": "Nanyang Technological University;Peng Cheng Laboratory;The Hong Kong Polytechnic University", + "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.ntu.edu.sg;http://www.pcl.ac.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "NTU;PCL;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yufei and Yu,\n Yi and Yang,\n Wenhan and Guo,\n Lanqing and Chau,\n Lap-Pui and Kot,\n Alex C. and Wen,\n Bihan\n},\n title = {\n ExposureDiffusion: Learning to Expose for Low-light Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12438-12448\n} \n}" }, { "title": "Expressive Text-to-Image Generation with Rich Text", @@ -20542,7 +21232,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ge_Expressive_Text-to-Image_Generation_with_Rich_Text_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ge_Expressive_Text-to-Image_Generation_with_Rich_Text_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ge_2023_ICCV,\n \n author = {\n Ge,\n Songwei and Park,\n Taesung and Zhu,\n Jun-Yan and Huang,\n Jia-Bin\n},\n title = {\n Expressive Text-to-Image Generation with Rich Text\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7545-7556\n} \n}" }, { "title": "Extensible and Efficient Proxy for Neural Architecture Search", @@ -20567,14 +21258,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Extensible_and_Efficient_Proxy_for_Neural_Architecture_Search_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;University at Buffalo;Georgia Institute of Technology", + 
"aff_unique_norm": "University of Illinois at Urbana-Champaign;University at Buffalo;Georgia Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.buffalo.edu;https://www.gatech.edu", "aff_unique_abbr": "UIUC;UB;Georgia Tech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yuhong and Li,\n Jiajie and Hao,\n Cong and Li,\n Pan and Xiong,\n Jinjun and Chen,\n Deming\n},\n title = {\n Extensible and Efficient Proxy for Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6199-6210\n} \n}" }, { "title": "F&F Attack: Adversarial Attack against Multiple Object Trackers by Inducing False Negatives and False Positives", @@ -20606,7 +21298,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Tao and Ye,\n Qi and Luo,\n Wenhan and Zhang,\n Kaihao and Shi,\n Zhiguo and Chen,\n Jiming\n},\n title = {\n F\\&F Attack: Adversarial Attack against Multiple Object Trackers by Inducing False Negatives and False Positives\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4573-4583\n} \n}" }, { "title": "FACET: Fairness in Computer Vision Evaluation Benchmark", @@ -20631,14 +21324,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gustafson_FACET_Fairness_in_Computer_Vision_Evaluation_Benchmark_ICCV_2023_paper.html", "aff_unique_index": 
"0;0;0;0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Meta AI Research", "aff_unique_dep": "FAIR", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "Meta AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gustafson_2023_ICCV,\n \n author = {\n Gustafson,\n Laura and Rolland,\n Chloe and Ravi,\n Nikhila and Duval,\n Quentin and Adcock,\n Aaron and Fu,\n Cheng-Yang and Hall,\n Melissa and Ross,\n Candace\n},\n title = {\n FACET: Fairness in Computer Vision Evaluation Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20370-20382\n} \n}" }, { "title": "FACTS: First Amplify Correlations and Then Slice to Discover Bias", @@ -20670,7 +21364,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yenamandra_2023_ICCV,\n \n author = {\n Yenamandra,\n Sriram and Ramesh,\n Pratik and Prabhu,\n Viraj and Hoffman,\n Judy\n},\n title = {\n FACTS: First Amplify Correlations and Then Slice to Discover Bias\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4794-4804\n} \n}" }, { "title": "FB-BEV: BEV Representation from Forward-Backward View Transformations", @@ -20695,14 +21390,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_FB-BEV_BEV_Representation_from_Forward-Backward_View_Transformations_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;2;1+3;0;1", - "aff_unique_norm": "Nanjing University;NVIDIA;Chinese University of Hong Kong;California Institute of 
Technology", - "aff_unique_dep": "National Key Lab for Novel Software Technology;NVIDIA Corporation;;", + "aff_unique_norm": "Nanjing University;NVIDIA Corporation;The Chinese University of Hong Kong;California Institute of Technology", + "aff_unique_dep": "National Key Lab for Novel Software Technology;;;", "aff_unique_url": "http://www.nju.edu.cn;https://www.nvidia.com;https://www.cuhk.edu.hk;https://www.caltech.edu", "aff_unique_abbr": "Nanjing U;NVIDIA;CUHK;Caltech", "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Hong Kong SAR;Pasadena", "aff_country_unique_index": "0+1;1;0;1+1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zhiqi and Yu,\n Zhiding and Wang,\n Wenhai and Anandkumar,\n Anima and Lu,\n Tong and Alvarez,\n Jose M.\n},\n title = {\n FB-BEV: BEV Representation from Forward-Backward View Transformations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6919-6928\n} \n}" }, { "title": "FBLNet: FeedBack Loop Network for Driver Attention Prediction", @@ -20734,7 +21430,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chongqing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yilong and Nan,\n Zhixiong and Xiang,\n Tao\n},\n title = {\n FBLNet: FeedBack Loop Network for Driver Attention Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13371-13380\n} \n}" }, { "title": "FCCNs: Fully Complex-valued Convolutional Networks using Complex-valued Color Model and Loss Function", @@ -20766,7 +21463,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": 
"Delhi", "aff_country_unique_index": "0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Yadav_2023_ICCV,\n \n author = {\n Yadav,\n Saurabh and Jerripothula,\n Koteswar Rao\n},\n title = {\n FCCNs: Fully Complex-valued Convolutional Networks using Complex-valued Color Model and Loss Function\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10689-10698\n} \n}" }, { "title": "FDViT: Improve the Hierarchical Architecture of Vision Transformer", @@ -20798,7 +21496,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yixing and Li,\n Chao and Li,\n Dong and Sheng,\n Xiao and Jiang,\n Fan and Tian,\n Lu and Sirasao,\n Ashish\n},\n title = {\n FDViT: Improve the Hierarchical Architecture of Vision Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5950-5960\n} \n}" }, { "title": "FLIP: Cross-domain Face Anti-spoofing with Language Guidance", @@ -20821,7 +21520,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Srivatsan_FLIP_Cross-domain_Face_Anti-spoofing_with_Language_Guidance_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Srivatsan_FLIP_Cross-domain_Face_Anti-spoofing_with_Language_Guidance_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Srivatsan_2023_ICCV,\n \n author = {\n Srivatsan,\n Koushik and Naseer,\n Muzammal and Nandakumar,\n Karthik\n},\n title = {\n FLIP: Cross-domain Face Anti-spoofing with Language Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference 
on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19685-19696\n} \n}" }, { "title": "FLatten Transformer: Vision Transformer using Focused Linear Attention", @@ -20853,7 +21553,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Dongchen and Pan,\n Xuran and Han,\n Yizeng and Song,\n Shiji and Huang,\n Gao\n},\n title = {\n FLatten Transformer: Vision Transformer using Focused Linear Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5961-5971\n} \n}" }, { "title": "FPR: False Positive Rectification for Weakly Supervised Semantic Segmentation", @@ -20865,7 +21566,7 @@ "author": "Liyi Chen; Chenyang Lei; Ruihuang Li; Shuai Li; Zhaoxiang Zhang; Lei Zhang", "abstract": "Many weakly supervised semantic segmentation (WSSS) methods employ the class activation map (CAM) to generate the initial segmentation results. However, CAM often fails to distinguish the foreground from its co-occurred background (e.g., train and railroad), resulting in inaccurate activation from the background. Previous endeavors address this co-occurrence issue by introducing external supervision and human priors. In this paper, we present a False Positive Rectification (FPR) approach to tackle the co-occurrence problem by leveraging the false positives of CAM. Based on the observation that the CAM-activated regions of absent classes contain class-specific co-occurred background cues, we collect these false positives and utilize them to guide the training of CAM network by proposing a region-level contrast loss and a pixel-level rectification loss. 
Without introducing any external supervision and human priors, the proposed FPR effectively suppresses wrong activations from the background objects. Extensive experiments on the PASCAL VOC 2012 and MS COCO 2014 demonstrate that FPR brings significant improvements for off-the-shelf methods and achieves state-of-the-art performance. Code is available at https://github.com/mt-cly/FPR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_FPR_False_Positive_Rectification_for_Weakly_Supervised_Semantic_Segmentation_ICCV_2023_paper.pdf", - "aff": "The Hong Kong Polytechnic University; Center for Arti\ufb01cial Intelligence and Robotics, HKISI, CAS; The Hong Kong Polytechnic University; The Hong Kong Polytechnic University; The Hong Kong Polytechnic University+State Key Laboratory of Multimodal Arti\ufb01cial Intelligence Systems, CASIA; The Hong Kong Polytechnic University", + "aff": "The Hong Kong Polytechnic University; Center for Artificial Intelligence and Robotics, HKISI, CAS; The Hong Kong Polytechnic University; The Hong Kong Polytechnic University; The Hong Kong Polytechnic University+State Key Laboratory of Multimodal Artificial Intelligence Systems, CASIA; The Hong Kong Polytechnic University", "project": "", "github": "https://github.com/mt-cly/FPR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Chen_FPR_False_Positive_Rectification_for_Weakly_Supervised_Semantic_Segmentation_ICCV_2023_supplemental.pdf", @@ -20878,14 +21579,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_FPR_False_Positive_Rectification_for_Weakly_Supervised_Semantic_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0+2;0", - "aff_unique_norm": "Hong Kong Polytechnic University;Chinese Academy of Sciences;Chinese Academy of Sciences Institute of Automation", - "aff_unique_dep": ";Center for Arti\ufb01cial Intelligence and Robotics;State Key Laboratory of Multimodal Arti\ufb01cial Intelligence Systems", 
+ "aff_unique_norm": "The Hong Kong Polytechnic University;Chinese Academy of Sciences;Chinese Academy of Sciences Institute of Automation", + "aff_unique_dep": ";Center for Artificial Intelligence and Robotics;State Key Laboratory of Multimodal Artificial Intelligence Systems", "aff_unique_url": "https://www.polyu.edu.hk;http://www.cas.cn/;http://www.ia.cas.cn", "aff_unique_abbr": "PolyU;CAS;CASIA", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Liyi and Lei,\n Chenyang and Li,\n Ruihuang and Li,\n Shuai and Zhang,\n Zhaoxiang and Zhang,\n Lei\n},\n title = {\n FPR: False Positive Rectification for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1108-1118\n} \n}" }, { "title": "FRAug: Tackling Federated Learning with Non-IID Features via Representation Augmentation", @@ -20917,7 +21619,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0;1;0+0", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Haokun and Frikha,\n Ahmed and Krompass,\n Denis and Gu,\n Jindong and Tresp,\n Volker\n},\n title = {\n FRAug: Tackling Federated Learning with Non-IID Features via Representation Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4849-4859\n} \n}" }, { "title": "FS-DETR: Few-Shot DEtection TRansformer with Prompting and without Re-Training", @@ -20942,14 +21645,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Bulat_FS-DETR_Few-Shot_DEtection_TRansformer_with_Prompting_and_without_Re-Training_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0+2", - "aff_unique_norm": "Samsung;Technical University of Iasi;Queen Mary University of London", + "aff_unique_norm": "Samsung AI;Technical University of Iasi;Queen Mary University of London", "aff_unique_dep": "Samsung AI;;", "aff_unique_url": "https://www.samsung.com/uk;https://www.tuiasi.ro;https://www.qmul.ac.uk", "aff_unique_abbr": "Samsung AI;TUIASI;QMUL", "aff_campus_unique_index": "0;0;0;0+2", "aff_campus_unique": "Cambridge;;London", "aff_country_unique_index": "0+1;0;0;0+0", - "aff_country_unique": "United Kingdom;Romania" + "aff_country_unique": "United Kingdom;Romania", + "bibtex": "@InProceedings{Bulat_2023_ICCV,\n \n author = {\n Bulat,\n Adrian and Guerrero,\n Ricardo and Martinez,\n Brais and Tzimiropoulos,\n Georgios\n},\n title = {\n FS-DETR: Few-Shot DEtection TRansformer with Prompting and without Re-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11793-11802\n} \n}" }, { "title": "FSAR: Federated Skeleton-based Action Recognition with Adaptive Topology Structure and Knowledge Distillation", @@ -20981,7 +21685,8 @@ "aff_campus_unique_index": "0;0;1;0;0;2", "aff_campus_unique": "Shenzhen;London;Singapore", "aff_country_unique_index": "0;0;1;0;0;2", - "aff_country_unique": "China;United Kingdom;Singapore" + "aff_country_unique": "China;United Kingdom;Singapore", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Jingwen and Liu,\n Hong and Sun,\n Shitong and Guo,\n Tianyu and Zhang,\n Min and Si,\n Chenyang\n},\n title = {\n FSAR: Federated Skeleton-based Action Recognition with Adaptive Topology Structure and Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10400-10410\n} \n}" }, { "title": "FSI: Frequency and Spatial Interactive Learning for Image Restoration in Under-Display Cameras", @@ -20993,7 +21698,7 @@ "author": "Chengxu Liu; Xuan Wang; Shuai Li; Yuzhi Wang; Xueming Qian", "abstract": "Under-display camera (UDC) systems remove the screen notch for bezel-free displays and provide a better interactive experience. The main challenge is that the pixel array of light-emitting diodes used for display diffracts and attenuates the incident light, leading to complex degradation. Existing models eliminate spatial diffraction by maximizing model capacity through complex design and ignore the periodic distribution of diffraction in the frequency domain, which prevents these approaches from satisfactory results. In this paper, we introduce a new perspective to handle various diffraction in UDC images by jointly exploring the feature restoration in the frequency and spatial domains, and present a Frequency and Spatial Interactive Learning Network (FSI). It consists of a series of well-designed Frequency-Spatial Joint (FSJ) modules for feature learning and a color transform module for color enhancement. In particular, in the FSJ module, a frequency learning block uses the Fourier transform to eliminate spectral bias, a spatial learning block uses a multi-distillation structure to supplement the absence of local details, and a dual transfer unit to facilitate the interactive learning between features of different domains. 
Experimental results demonstrate the superiority of the proposed FSI over state-of-the-art models, through extensive quantitative and qualitative evaluations in three widely-used UDC benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_FSI_Frequency_and_Spatial_Interactive_Learning_for_Image_Restoration_in_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University+Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd; MEGVII Technology; MEGVII Technology; MEGVII Technology; Xi\u2019an Jiaotong University+Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", + "aff": "Xi’an Jiaotong University+Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd; MEGVII Technology; MEGVII Technology; MEGVII Technology; Xi’an Jiaotong University+Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Liu_FSI_Frequency_and_Spatial_Interactive_Learning_for_Image_Restoration_in_ICCV_2023_supplemental.pdf", @@ -21006,14 +21711,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_FSI_Frequency_and_Spatial_Interactive_Learning_for_Image_Restoration_in_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;2;2;0+1", - "aff_unique_norm": "Xi'an Jiao Tong University;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd;Megvii Technology", + "aff_unique_norm": "Xi'an Jiaotong University;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd;MEGVII Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.xjtu.edu.cn;;https://www.megvii.com", "aff_unique_abbr": "XJTU;;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Chengxu and Wang,\n Xuan and Li,\n 
Shuai and Wang,\n Yuzhi and Qian,\n Xueming\n},\n title = {\n FSI: Frequency and Spatial Interactive Learning for Image Restoration in Under-Display Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12537-12546\n} \n}" }, { "title": "FULLER: Unified Multi-modality Multi-task 3D Perception via Multi-level Gradient Calibration", @@ -21036,7 +21742,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_FULLER_Unified_Multi-modality_Multi-task_3D_Perception_via_Multi-level_Gradient_Calibration_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_FULLER_Unified_Multi-modality_Multi-task_3D_Perception_via_Multi-level_Gradient_Calibration_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zhijian and Lin,\n Sihao and Liu,\n Guiyu and Luo,\n Mukun and Ye,\n Chaoqiang and Xu,\n Hang and Chang,\n Xiaojun and Liang,\n Xiaodan\n},\n title = {\n FULLER: Unified Multi-modality Multi-task 3D Perception via Multi-level Gradient Calibration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3502-3511\n} \n}" }, { "title": "Face Clustering via Graph Convolutional Networks with Confidence Edges", @@ -21068,7 +21775,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Yang and Ge,\n Zhiwei and Luo,\n Yuhao and Liu,\n Lin and Xu,\n Sulong\n},\n title = {\n Face Clustering via Graph Convolutional Networks with Confidence Edges\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20990-20999\n} \n}" }, { "title": "FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural Radiance Fields", @@ -21100,7 +21808,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea;" + "aff_country_unique": "South Korea;", + "bibtex": "@InProceedings{Hwang_2023_ICCV,\n \n author = {\n Hwang,\n Sungwon and Hyung,\n Junha and Kim,\n Daejin and Kim,\n Min-Jung and Choo,\n Jaegul\n},\n title = {\n FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3469-3479\n} \n}" }, { "title": "Factorized Inverse Path Tracing for Efficient and Accurate Material-Lighting Estimation", @@ -21132,7 +21841,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Liwen and Zhu,\n Rui and Yaldiz,\n Mustafa B. 
and Zhu,\n Yinhao and Cai,\n Hong and Matai,\n Janarbek and Porikli,\n Fatih and Li,\n Tzu-Mao and Chandraker,\n Manmohan and Ramamoorthi,\n Ravi\n},\n title = {\n Factorized Inverse Path Tracing for Efficient and Accurate Material-Lighting Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3848-3858\n} \n}" }, { "title": "Fan-Beam Binarization Difference Projection (FB-BDP): A Novel Local Object Descriptor for Fine-Grained Leaf Image Retrieval", @@ -21164,7 +21874,8 @@ "aff_campus_unique_index": "0;1+0;0", "aff_campus_unique": "Brisbane;Nanjing", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xin and Wang,\n Bin and Gao,\n Yongsheng\n},\n title = {\n Fan-Beam Binarization Difference Projection (FB-BDP): A Novel Local Object Descriptor for Fine-Grained Leaf Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11102-11111\n} \n}" }, { "title": "Fantasia3D: Disentangling Geometry and Appearance for High-quality Text-to-3D Content Creation", @@ -21172,6 +21883,7 @@ "status": "Poster", "track": "main", "pid": "1574", + "author_site": "Rui Chen, Yongwei Chen, Ningxin Jiao, Kui Jia", "author": "Rui Chen, Yongwei Chen, Ningxin Jiao, Kui Jia", "abstract": "Automatic 3D content creation has achieved rapid progress recently due to the availability of pre-trained, large language models and image diffusion models, forming the emerging topic of text-to-3D content creation. 
Existing text-to-3D methods commonly use implicit scene representations, which couple the geometry and appearance via volume rendering and are suboptimal in terms of recovering finer geometries and achieving photorealistic rendering; consequently, they are less effective for generating high-quality 3D assets. In this work, we propose a new method of Fantasia3D for high-quality text-to-3D content creation. Key to Fantasia3D is the disentangled modeling and learning of geometry and appearance. For geometry learning, we rely on a hybrid scene representation, and propose to encode surface normal extracted from the representation as the input of the image diffusion model. For appearance modeling, we introduce the spatially varying bidirectional reflectance distribution function (BRDF) into the text-to-3D task, and learn the surface material for photorealistic rendering of the generated surface. Our disentangled framework is more compatible with popular graphics engines, supporting relighting, editing, and physical simulation of the generated 3D assets. We conduct thorough experiments that show the advantages of our method over existing ones under different text-to-3D task settings. 
Project page and source codes: https://fantasia3d.github.io/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_Fantasia3D_Disentangling_Geometry_and_Appearance_for_High-quality_Text-to-3D_Content_Creation_ICCV_2023_paper.pdf", @@ -21183,7 +21895,8 @@ "gs_citation": 565, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11807539982708184053&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Fantasia3D_Disentangling_Geometry_and_Appearance_for_High-quality_Text-to-3D_Content_Creation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Fantasia3D_Disentangling_Geometry_and_Appearance_for_High-quality_Text-to-3D_Content_Creation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Rui and Chen,\n Yongwei and Jiao,\n Ningxin and Jia,\n Kui\n},\n title = {\n Fantasia3D: Disentangling Geometry and Appearance for High-quality Text-to-3D Content Creation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22246-22256\n} \n}" }, { "title": "FashionNTM: Multi-turn Fashion Image Retrieval via Cascaded Memory", @@ -21215,7 +21928,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pal_2023_ICCV,\n \n author = {\n Pal,\n Anwesan and Wadhwa,\n Sahil and Jaiswal,\n Ayush and Zhang,\n Xu and Wu,\n Yue and Chada,\n Rakesh and Natarajan,\n Pradeep and Christensen,\n Henrik I.\n},\n title = {\n FashionNTM: Multi-turn Fashion Image Retrieval via Cascaded Memory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
11323-11334\n} \n}" }, { "title": "Fast Adversarial Training with Smooth Convergence", @@ -21247,7 +21961,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Mengnan and Zhang,\n Lihe and Kong,\n Yuqiu and Yin,\n Baocai\n},\n title = {\n Fast Adversarial Training with Smooth Convergence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4720-4729\n} \n}" }, { "title": "Fast Full-frame Video Stabilization with Iterative Optimization", @@ -21270,7 +21985,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Fast_Full-frame_Video_Stabilization_with_Iterative_Optimization_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Fast_Full-frame_Video_Stabilization_with_Iterative_Optimization_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Weiyue and Li,\n Xin and Peng,\n Zhan and Luo,\n Xianrui and Ye,\n Xinyi and Lu,\n Hao and Cao,\n Zhiguo\n},\n title = {\n Fast Full-frame Video Stabilization with Iterative Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23534-23544\n} \n}" }, { "title": "Fast Globally Optimal Surface Normal Estimation from an Affine Correspondence", @@ -21278,11 +21994,11 @@ "status": "Poster", "track": "main", "pid": "10102", - "author_site": "Levente Hajder, Lajos L\u00f3czi, Daniel Barath", - "author": "Levente Hajder; Lajos L\u00f3czi; Daniel Barath", + "author_site": "Levente Hajder, Lajos Lóczi, Daniel Barath", + "author": "Levente Hajder; Lajos Lóczi; Daniel Barath", 
"abstract": "We present a new solver for estimating a surface normal from a single affine correspondence in two calibrated views. The proposed approach provides a new globally optimal solution for this over-determined problem and proves that it reduces to a linear system that can be solved extremely efficiently. This allows for performing significantly faster than other recent methods, solving the same problem and obtaining the same globally optimal solution. We demonstrate on 15k image pairs from standard benchmarks that the proposed approach leads to the same results as other optimal algorithms while being, on average, five times faster than the fastest alternative. Besides its theoretical value, we demonstrate that such an approach has clear benefits, e.g., in image-based visual localization, due to not requiring a dense point cloud to recover the surface normal. We show on the Cambridge Landmarks dataset that leveraging the proposed surface normal estimation further improves localization accuracy. 
Matlab and C++ implementations are also published in the supplementary material.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hajder_Fast_Globally_Optimal_Surface_Normal_Estimation_from_an_Affine_Correspondence_ICCV_2023_paper.pdf", - "aff": "Geometric Computer Vision Group, E\u00a8otv\u00a8os Lor \u00b4and University, Budapest, Hungary; Geometric Computer Vision Group, E\u00a8otv\u00a8os Lor \u00b4and University, Budapest, Hungary; Computer Vision and Geometry Group, ETH Zurich, Switzerland", + "aff": "Geometric Computer Vision Group, Eötvös Loránd University, Budapest, Hungary; Geometric Computer Vision Group, Eötvös Loránd University, Budapest, Hungary; Computer Vision and Geometry Group, ETH Zurich, Switzerland", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Hajder_Fast_Globally_Optimal_ICCV_2023_supplemental.zip", @@ -21295,14 +22011,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hajder_Fast_Globally_Optimal_Surface_Normal_Estimation_from_an_Affine_Correspondence_ICCV_2023_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "E\u00f6tv\u00f6s Lor\u00e1nd University;ETH Zurich", + "aff_unique_norm": "Eötvös Loránd University;ETH Zurich", "aff_unique_dep": "Geometric Computer Vision Group;Computer Vision and Geometry Group", "aff_unique_url": "https://www.elte.hu;https://www.ethz.ch", "aff_unique_abbr": "ELTE;ETHZ", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Budapest;", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Hungary;Switzerland" + "aff_country_unique": "Hungary;Switzerland", + "bibtex": "@InProceedings{Hajder_2023_ICCV,\n \n author = {\n Hajder,\n Levente and L\\'oczi,\n Lajos and Barath,\n Daniel\n},\n title = {\n Fast Globally Optimal Surface Normal Estimation from an Affine Correspondence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 3390-3401\n} \n}" }, { "title": "Fast Inference and Update of Probabilistic Density Estimation on Trajectory Prediction", @@ -21334,7 +22051,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Maeda_2023_ICCV,\n \n author = {\n Maeda,\n Takahiro and Ukita,\n Norimichi\n},\n title = {\n Fast Inference and Update of Probabilistic Density Estimation on Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9795-9805\n} \n}" }, { "title": "Fast Neural Scene Flow", @@ -21359,14 +22077,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Fast_Neural_Scene_Flow_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "University of Adelaide;NVIDIA;Latitude AI", - "aff_unique_dep": ";NVIDIA Corporation;", + "aff_unique_norm": "University of Adelaide;NVIDIA Corporation;Latitude AI", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.adelaide.edu.au;https://www.nvidia.com;https://www.latitude.ai", "aff_unique_abbr": "Adelaide;NVIDIA;Latitude AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xueqian and Zheng,\n Jianqiao and Ferroni,\n Francesco and Pontes,\n Jhony Kaesemodel and Lucey,\n Simon\n},\n title = {\n Fast Neural Scene Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9878-9890\n} \n}" }, { "title": "Fast and Accurate Transferability Measurement by 
Evaluating Intra-class Feature Variance", @@ -21398,7 +22117,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Huiwen and Kang,\n U\n},\n title = {\n Fast and Accurate Transferability Measurement by Evaluating Intra-class Feature Variance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11474-11482\n} \n}" }, { "title": "FastRecon: Few-shot Industrial Anomaly Detection via Fast Feature Reconstruction", @@ -21423,14 +22143,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fang_FastRecon_Few-shot_Industrial_Anomaly_Detection_via_Fast_Feature_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2;0+2;2;0", - "aff_unique_norm": "Xi'an Jiao Tong-Liverpool University;Meta;Dinnar Automation Technology", - "aff_unique_dep": ";Metavision Technology Co., Ltd.;", + "aff_unique_norm": "Xi'an Jiaotong-Liverpool University;Metavision Technology;Dinnar Automation Technology", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.xjtlu.edu.cn;https://www.metavisioncn.com;", "aff_unique_abbr": "XJTLU;Metavisioncn;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Zheng and Wang,\n Xiaoyang and Li,\n Haocheng and Liu,\n Jiejie and Hu,\n Qiugui and Xiao,\n Jimin\n},\n title = {\n FastRecon: Few-shot Industrial Anomaly Detection via Fast Feature Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17481-17490\n} \n}" }, { 
"title": "FastViT: A Fast Hybrid Vision Transformer Using Structural Reparameterization", @@ -21455,14 +22176,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Vasu_FastViT_A_Fast_Hybrid_Vision_Transformer_Using_Structural_Reparameterization_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Apple", - "aff_unique_dep": "Apple Inc.", + "aff_unique_norm": "Apple Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Vasu_2023_ICCV,\n \n author = {\n Vasu,\n Pavan Kumar Anasosalu and Gabriel,\n James and Zhu,\n Jeff and Tuzel,\n Oncel and Ranjan,\n Anurag\n},\n title = {\n FastViT: A Fast Hybrid Vision Transformer Using Structural Reparameterization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5785-5795\n} \n}" }, { "title": "FateZero: Fusing Attentions for Zero-shot Text-based Video Editing", @@ -21494,7 +22216,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{QI_2023_ICCV,\n \n author = {\n QI,\n Chenyang and Cun,\n Xiaodong and Zhang,\n Yong and Lei,\n Chenyang and Wang,\n Xintao and Shan,\n Ying and Chen,\n Qifeng\n},\n title = {\n FateZero: Fusing Attentions for Zero-shot Text-based Video Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15932-15942\n} \n}" }, { "title": "Fcaformer: Forward Cross Attention in Hybrid Vision Transformer", @@ -21506,7 
+22229,7 @@ "author": "Haokui Zhang; Wenze Hu; Xiaoyu Wang", "abstract": "Currently, one main research line in designing more efficient vision transformer is reducing computational cost of self attention modules by adopting sparse attention or using local attention windows. In contrast, we propose a different approach that aims to improve the performance of transformer-based architectures by densifying the attention pattern. Specifically, we proposed forward cross attention for hybrid vision transformer (FcaFormer), where tokens from previous blocks in the same stage are secondary used. To achieve this, the FcaFormer leverages two innovative components: learnable scale factors (LSFs) and a token merge and enhancement module (TME). The LSFs enable efficient processing of cross tokens, while the TME generates representative cross tokens. By integrating these components, the proposed FcaFormer enhances the interactions of tokens across blocks with potentially different semantics, and encourages more information flows to the lower levels. Based on the forward cross attention (Fca), we have designed a series of FcaFormer models that achieve the best trade-off between model size, computational cost, memory cost, and accuracy. For example, without the need for knowledge distillation to strengthen training, our FcaFormer achieves 83.1% top-1 accuracy on Imagenet with only 16.3 million parameters and about 3.6 billion MACs. 
This saves almost half of the parameters and a few computational cost while achieving 0.7% higher accuracy compared with distilled EfficientFormer", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_Fcaformer_Forward_Cross_Attention_in_Hybrid_Vision_Transformer_ICCV_2023_paper.pdf", - "aff": "Intellifusion+Yan\u2018an University; Intellifusion+Yan\u2018an University; The Hong Kong University of Science and Technology (Guangzhou)", + "aff": "Intellifusion+Yan‘an University; Intellifusion+Yan‘an University; The Hong Kong University of Science and Technology (Guangzhou)", "project": "", "github": "https://github.com/hkzhang-git/FcaFormer", "supp": "", @@ -21519,14 +22242,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Fcaformer_Forward_Cross_Attention_in_Hybrid_Vision_Transformer_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2", - "aff_unique_norm": "Intellifusion;Yan'an University;Hong Kong University of Science and Technology", + "aff_unique_norm": "Intellifusion;Yan'an University;The Hong Kong University of Science and Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.intellifusion.com/;http://www.yau.edu.cn;https://www.ust.hk", "aff_unique_abbr": ";YAU;HKUST", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Guangzhou", "aff_country_unique_index": "0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Haokui and Hu,\n Wenze and Wang,\n Xiaoyu\n},\n title = {\n Fcaformer: Forward Cross Attention in Hybrid Vision Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6060-6069\n} \n}" }, { "title": "FeatEnHancer: Enhancing Hierarchical Features for Object Detection and Beyond Under Low-Light Vision", @@ -21558,7 +22282,8 @@ "aff_campus_unique_index": ";;", 
"aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Hashmi_2023_ICCV,\n \n author = {\n Hashmi,\n Khurram Azeem and Kallempudi,\n Goutham and Stricker,\n Didier and Afzal,\n Muhammad Zeshan\n},\n title = {\n FeatEnHancer: Enhancing Hierarchical Features for Object Detection and Beyond Under Low-Light Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6725-6735\n} \n}" }, { "title": "Feature Modulation Transformer: Cross-Refinement of Global Representation via High-Frequency Prior for Image Super-Resolution", @@ -21590,7 +22315,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ao and Zhang,\n Le and Liu,\n Yun and Zhu,\n Ce\n},\n title = {\n Feature Modulation Transformer: Cross-Refinement of Global Representation via High-Frequency Prior for Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12514-12524\n} \n}" }, { "title": "Feature Prediction Diffusion Model for Video Anomaly Detection", @@ -21622,7 +22348,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Cheng and Zhang,\n Shiyu and Liu,\n Yang and Pang,\n Guansong and Wang,\n Wenjun\n},\n title = {\n Feature Prediction Diffusion Model for Video Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5527-5537\n} \n}" }, { "title": "Feature Proliferation -- the \"Cancer\" in StyleGAN and its Treatments", @@ -21654,7 +22381,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Shuang and Liang,\n Yuanbang and Wu,\n Jing and Lai,\n Yu-Kun and Qin,\n Yipeng\n},\n title = {\n Feature Proliferation -- the ''Cancer'' in StyleGAN and its Treatments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2360-2370\n} \n}" }, { "title": "FeatureNeRF: Learning Generalizable NeRFs by Distilling Foundation Models", @@ -21677,7 +22405,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_FeatureNeRF_Learning_Generalizable_NeRFs_by_Distilling_Foundation_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_FeatureNeRF_Learning_Generalizable_NeRFs_by_Distilling_Foundation_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Jianglong and Wang,\n Naiyan and Wang,\n Xiaolong\n},\n title = {\n FeatureNeRF: Learning Generalizable NeRFs by Distilling Foundation Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8962-8973\n} \n}" }, { "title": "FedPD: Federated Open Set Recognition with Parameter Disentanglement", @@ -21702,14 +22431,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_FedPD_Federated_Open_Set_Recognition_with_Parameter_Disentanglement_ICCV_2023_paper.html", 
"aff_unique_index": "0;0;1;0+1", - "aff_unique_norm": "City University of Hong Kong;Chinese University of Hong Kong", + "aff_unique_norm": "City University of Hong Kong;The Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.cityu.edu.hk;https://www.cuhk.edu.hk", "aff_unique_abbr": "CityU;CUHK", "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Chen and Zhu,\n Meilu and Liu,\n Yifan and Yuan,\n Yixuan\n},\n title = {\n FedPD: Federated Open Set Recognition with Parameter Disentanglement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4882-4891\n} \n}" }, { "title": "FedPerfix: Towards Partial Model Personalization of Vision Transformers in Federated Learning", @@ -21738,10 +22468,11 @@ "aff_unique_dep": "Center for Research in Computer Vision;Intelligent Systems Program", "aff_unique_url": "https://www.ucf.edu;https://www.pitt.edu", "aff_unique_abbr": "UCF;Pitt", - "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Central Florida;", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Guangyu and Mendieta,\n Matias and Luo,\n Jun and Wu,\n Shandong and Chen,\n Chen\n},\n title = {\n FedPerfix: Towards Partial Model Personalization of Vision Transformers in Federated Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4988-4998\n} \n}" }, { "title": "Federated Learning Over Images: 
Vertical Decompositions and Pre-Trained Backbones Are Difficult to Beat", @@ -21773,7 +22504,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Erdong and Tang,\n Yuxin and Kyrillidis,\n Anastasios and Jermaine,\n Chris\n},\n title = {\n Federated Learning Over Images: Vertical Decompositions and Pre-Trained Backbones Are Difficult to Beat\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19385-19396\n} \n}" }, { "title": "FemtoDet: An Object Detection Baseline for Energy Versus Performance Tradeoffs", @@ -21805,7 +22537,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tu_2023_ICCV,\n \n author = {\n Tu,\n Peng and Xie,\n Xu and Ai,\n Guo and Li,\n Yuexiang and Huang,\n Yawen and Zheng,\n Yefeng\n},\n title = {\n FemtoDet: An Object Detection Baseline for Energy Versus Performance Tradeoffs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13318-13327\n} \n}" }, { "title": "FerKD: Surgical Label Adaptation for Efficient Distillation", @@ -21813,6 +22546,7 @@ "status": "Poster", "track": "main", "pid": "4222", + "author_site": "Zhiqiang Shen", "author": "Zhiqiang Shen", "abstract": "We present FerKD, a novel efficient knowledge distillation framework that incorporates partial soft-hard label adaptation coupled with a region-calibration mechanism. 
Our approach stems from the observation and intuition that standard data augmentations, such as RandomResizedCrop, tend to transform inputs into diverse conditions: easy positives, hard positives, or hard negatives. In traditional distillation frameworks, these transformed samples are utilized equally through their predictive probabilities derived from pretrained teacher models. However, merely relying on prediction values from a pretrained teacher, a common practice in prior studies, neglects the reliability of these soft label predictions. To address this, we propose a new scheme that calibrates the less-confident regions to be the context using softened hard groundtruth labels. Our approach involves the processes of hard regions mining + calibration. We demonstrate empirically that this method can dramatically improve the convergence speed and final accuracy. Additionally, we find that a consistent mixing strategy can stabilize the distributions of soft supervision, taking advantage of the soft labels. As a result, we introduce a stabilized SelfMix augmentation that weakens the variation of the mixed images and corresponding soft labels through mixing similar regions within the same image. FerKD is an intuitive and well-designed learning system that eliminates several heuristics and hyperparameters in former FKD solution. More importantly, it achieves remarkable improvement on ImageNet-1K and downstream tasks. For instance, FerKD achieves 81.2% on ImageNet-1K with ResNet-50, outperforming FKD and FunMatch by remarkable margins. Leveraging better pre-trained weights and larger architectures, our finetuned ViT-G14 even achieves 89.9%. 
Our code is available at https://github.com/szq0214/FKD/tree/main/FerKD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shen_FerKD_Surgical_Label_Adaptation_for_Efficient_Distillation_ICCV_2023_paper.pdf", @@ -21834,7 +22568,8 @@ "aff_unique_url": "https://mbzuai.ac.ae", "aff_unique_abbr": "MBZUAI", "aff_country_unique_index": "0", - "aff_country_unique": "United Arab Emirates" + "aff_country_unique": "United Arab Emirates", + "bibtex": "@InProceedings{Shen_2023_ICCV,\n \n author = {\n Shen,\n Zhiqiang\n},\n title = {\n FerKD: Surgical Label Adaptation for Efficient Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1666-1675\n} \n}" }, { "title": "Few Shot Font Generation Via Transferring Similarity Guided Global Style and Quantization Local Style", @@ -21866,7 +22601,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Wuhan;", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Wei and Zhu,\n Anna and Zhou,\n Xinyu and Iwana,\n Brian Kenji and Li,\n Shilin\n},\n title = {\n Few Shot Font Generation Via Transferring Similarity Guided Global Style and Quantization Local Style\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19506-19516\n} \n}" }, { "title": "Few-Shot Common Action Localization via Cross-Attentional Fusion of Context and Temporal Dynamics", @@ -21878,7 +22614,7 @@ "author": "Juntae Lee; Mihir Jain; Sungrack Yun", "abstract": "The goal of this paper is to localize action instances in a long untrimmed query video using just meager trimmed support videos representing a common action whose class information is not given. 
In this task, it is crucial to mine reliable temporal cues representing a common action from handful support videos. In our work, we develop an attention mechanism using cross-correlation. Based on this cross-attention, we first transform the support videos into query video's context to emphasize query-relevant important frames, and suppress less relevant ones. Next, we summarize sub-sequences of support video frames to represent temporal dynamics in coarse temporal granularity, which is then propagated to the fine-grained support video features through the cross-attention. In each case, the cross-attentions are applied to each support video in the individual-to-all strategy to balance heterogeneity and compatibility of the support videos. In contrast, the candidate instances in the query video are lastly attended by the resulting support video features, at once. In addition, we also develop a relational classifier head based on the query and support video representations. We show the effectiveness of our work with the state-of-the-art (SOTA) performance in benchmark datasets (ActivityNet1.3 and THUMOS14), and analyze each component extensively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lee_Few-Shot_Common_Action_Localization_via_Cross-Attentional_Fusion_of_Context_and_ICCV_2023_paper.pdf", - "aff": "Qualcomm AI Research\u2020; Qualcomm Technologies, Inc.*; Qualcomm AI Research\u2020", + "aff": "Qualcomm AI Research†; Qualcomm Technologies, Inc.*; Qualcomm AI Research†", "project": "", "github": "", "supp": "", @@ -21891,14 +22627,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lee_Few-Shot_Common_Action_Localization_via_Cross-Attentional_Fusion_of_Context_and_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Qualcomm AI Research;Qualcomm Technologies", - "aff_unique_dep": "AI Research;Inc.", + "aff_unique_norm": "Qualcomm;Qualcomm Technologies", + "aff_unique_dep": "Qualcomm AI 
Research;Inc.", "aff_unique_url": "https://www.qualcomm.com/research;https://www.qualcomm.com", - "aff_unique_abbr": "QAI;QTI", + "aff_unique_abbr": "Qualcomm AI Research;QTI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Juntae and Jain,\n Mihir and Yun,\n Sungrack\n},\n title = {\n Few-Shot Common Action Localization via Cross-Attentional Fusion of Context and Temporal Dynamics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10214-10223\n} \n}" }, { "title": "Few-Shot Dataset Distillation via Translative Pre-Training", @@ -21930,7 +22667,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Songhua and Wang,\n Xinchao\n},\n title = {\n Few-Shot Dataset Distillation via Translative Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18654-18664\n} \n}" }, { "title": "Few-Shot Physically-Aware Articulated Mesh Generation via Hierarchical Deformation", @@ -21953,7 +22691,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Few-Shot_Physically-Aware_Articulated_Mesh_Generation_via_Hierarchical_Deformation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Few-Shot_Physically-Aware_Articulated_Mesh_Generation_via_Hierarchical_Deformation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xueyi and Wang,\n Bin 
and Wang,\n He and Yi,\n Li\n},\n title = {\n Few-Shot Physically-Aware Articulated Mesh Generation via Hierarchical Deformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 854-864\n} \n}" }, { "title": "Few-Shot Video Classification via Representation Fusion and Promotion Learning", @@ -21985,7 +22724,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Haifeng and Li,\n Kai and Min,\n Martin Renqiang and Ding,\n Zhengming\n},\n title = {\n Few-Shot Video Classification via Representation Fusion and Promotion Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19311-19320\n} \n}" }, { "title": "Few-shot Continual Infomax Learning", @@ -22017,7 +22757,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gu_2023_ICCV,\n \n author = {\n Gu,\n Ziqi and Xu,\n Chunyan and Yang,\n Jian and Cui,\n Zhen\n},\n title = {\n Few-shot Continual Infomax Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19224-19233\n} \n}" }, { "title": "Fg-T2M: Fine-Grained Text-Driven Human Motion Generation via Diffusion Model", @@ -22049,7 +22790,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0+1;2;1;0+0", - "aff_country_unique": "China;Germany;United Kingdom" + "aff_country_unique": "China;Germany;United Kingdom", + "bibtex": 
"@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yin and Leng,\n Zhiying and Li,\n Frederick W. B. and Wu,\n Shun-Cheng and Liang,\n Xiaohui\n},\n title = {\n Fg-T2M: Fine-Grained Text-Driven Human Motion Generation via Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22035-22044\n} \n}" }, { "title": "Fine-grained Unsupervised Domain Adaptation for Gait Recognition", @@ -22081,7 +22823,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;1+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Kang and Fu,\n Ying and Zheng,\n Dezhi and Peng,\n Yunjie and Cao,\n Chunshui and Huang,\n Yongzhen\n},\n title = {\n Fine-grained Unsupervised Domain Adaptation for Gait Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11313-11322\n} \n}" }, { "title": "Fine-grained Visible Watermark Removal", @@ -22113,7 +22856,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Niu_2023_ICCV,\n \n author = {\n Niu,\n Li and Zhao,\n Xing and Zhang,\n Bo and Zhang,\n Liqing\n},\n title = {\n Fine-grained Visible Watermark Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12770-12779\n} \n}" }, { "title": "FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance Generation", @@ -22145,7 +22889,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Shenzhen;", 
"aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ronghui and Zhao,\n Junfan and Zhang,\n Yachao and Su,\n Mingyang and Ren,\n Zeping and Zhang,\n Han and Tang,\n Yansong and Li,\n Xiu\n},\n title = {\n FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10234-10243\n} \n}" }, { "title": "FineRecon: Depth-aware Feed-forward Network for Detailed 3D Reconstruction", @@ -22170,14 +22915,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Stier_FineRecon_Depth-aware_Feed-forward_Network_for_Detailed_3D_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0+1", - "aff_unique_norm": "Apple;University of California, Santa Barbara", - "aff_unique_dep": "Apple Inc.;", + "aff_unique_norm": "Apple Inc.;University of California, Santa Barbara", + "aff_unique_dep": ";", "aff_unique_url": "https://www.apple.com;https://www.ucsb.edu", "aff_unique_abbr": "Apple;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Stier_2023_ICCV,\n \n author = {\n Stier,\n Noah and Ranjan,\n Anurag and Colburn,\n Alex and Yan,\n Yajie and Yang,\n Liang and Ma,\n Fangchang and Angles,\n Baptiste\n},\n title = {\n FineRecon: Depth-aware Feed-forward Network for Detailed 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18423-18432\n} \n}" }, { "title": "Fingerprinting Deep Image Restoration Models", @@ -22209,7 +22955,8 @@ 
"aff_campus_unique_index": "0+0;0+1;0+0;1", "aff_campus_unique": "Guangzhou;Hangzhou;", "aff_country_unique_index": "0+0;0+0;0+0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Quan_2023_ICCV,\n \n author = {\n Quan,\n Yuhui and Teng,\n Huan and Xu,\n Ruotao and Huang,\n Jun and Ji,\n Hui\n},\n title = {\n Fingerprinting Deep Image Restoration Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13285-13295\n} \n}" }, { "title": "First Session Adaptation: A Strong Replay-Free Baseline for Class-Incremental Learning", @@ -22241,7 +22988,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "United Kingdom;Unknown" + "aff_country_unique": "United Kingdom;Unknown", + "bibtex": "@InProceedings{Panos_2023_ICCV,\n \n author = {\n Panos,\n Aristeidis and Kobe,\n Yuriko and Reino,\n Daniel Olmeda and Aljundi,\n Rahaf and Turner,\n Richard E.\n},\n title = {\n First Session Adaptation: A Strong Replay-Free Baseline for Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18820-18830\n} \n}" }, { "title": "FishNet: A Large-scale Dataset and Benchmark for Fish Recognition, Detection, and Functional Trait Prediction", @@ -22273,7 +23021,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Saudi Arabia" + "aff_country_unique": "Saudi Arabia", + "bibtex": "@InProceedings{Khan_2023_ICCV,\n \n author = {\n Khan,\n Faizan Farooq and Li,\n Xiang and Temple,\n Andrew J. 
and Elhoseiny,\n Mohamed\n},\n title = {\n FishNet: A Large-scale Dataset and Benchmark for Fish Recognition,\n Detection,\n and Functional Trait Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20496-20506\n} \n}" }, { "title": "Flatness-Aware Minimization for Domain Generalization", @@ -22305,7 +23054,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xingxuan and Xu,\n Renzhe and Yu,\n Han and Dong,\n Yancheng and Tian,\n Pengfei and Cui,\n Peng\n},\n title = {\n Flatness-Aware Minimization for Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5189-5202\n} \n}" }, { "title": "Flexible Visual Recognition by Evidential Modeling of Confusion and Ignorance", @@ -22337,7 +23087,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Lei and Liu,\n Bo and Li,\n Haoxiang and Wu,\n Ying and Hua,\n Gang\n},\n title = {\n Flexible Visual Recognition by Evidential Modeling of Confusion and Ignorance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1338-1347\n} \n}" }, { "title": "FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis", @@ -22369,7 +23120,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + 
"aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Seo_2023_ICCV,\n \n author = {\n Seo,\n Seunghyeon and Chang,\n Yeonjin and Kwak,\n Nojun\n},\n title = {\n FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22883-22893\n} \n}" }, { "title": "Focal Network for Image Restoration", @@ -22401,7 +23153,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "Germany;China" + "aff_country_unique": "Germany;China", + "bibtex": "@InProceedings{Cui_2023_ICCV,\n \n author = {\n Cui,\n Yuning and Ren,\n Wenqi and Cao,\n Xiaochun and Knoll,\n Alois\n},\n title = {\n Focal Network for Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13001-13011\n} \n}" }, { "title": "FocalFormer3D: Focusing on Hard Instance for 3D Object Detection", @@ -22426,14 +23179,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_FocalFormer3D_Focusing_on_Hard_Instance_for_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1;2+1;0;1", - "aff_unique_norm": "Chinese University of Hong Kong;NVIDIA;California Institute of Technology", - "aff_unique_dep": ";NVIDIA Corporation;", + "aff_unique_norm": "The Chinese University of Hong Kong;NVIDIA Corporation;California Institute of Technology", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.nvidia.com;https://www.caltech.edu", "aff_unique_abbr": "CUHK;NVIDIA;Caltech", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Hong Kong SAR;;Pasadena", "aff_country_unique_index": "0;1;0;1;1+1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": 
"China;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yilun and Yu,\n Zhiding and Chen,\n Yukang and Lan,\n Shiyi and Anandkumar,\n Anima and Jia,\n Jiaya and Alvarez,\n Jose M.\n},\n title = {\n FocalFormer3D: Focusing on Hard Instance for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8394-8405\n} \n}" }, { "title": "Focus on Your Target: A Dual Teacher-Student Framework for Domain-Adaptive Semantic Segmentation", @@ -22459,13 +23213,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huo_Focus_on_Your_Target_A_Dual_Teacher-Student_Framework_for_Domain-Adaptive_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "University of Science and Technology of China;Huawei", - "aff_unique_dep": ";Huawei", + "aff_unique_dep": ";", "aff_unique_url": "http://www.ustc.edu.cn;https://www.huawei.com", "aff_unique_abbr": "USTC;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huo_2023_ICCV,\n \n author = {\n Huo,\n Xinyue and Xie,\n Lingxi and Zhou,\n Wengang and Li,\n Houqiang and Tian,\n Qi\n},\n title = {\n Focus on Your Target: A Dual Teacher-Student Framework for Domain-Adaptive Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19027-19038\n} \n}" }, { "title": "Focus the Discrepancy: Intra- and Inter-Correlation Learning for Image Anomaly Detection", @@ -22477,7 +23232,7 @@ "author": "Xincheng Yao; Ruoqi Li; Zefeng Qian; Yan Luo; Chongyang Zhang", "abstract": "Humans recognize anomalies through two aspects: larger patch-wise representation discrepancies and weaker 
patch-to-normal-patch correlations. However, the previous AD methods didn't sufficiently combine the two complementary aspects to design AD models. To this end, we find that Transformer can ideally satisfy the two aspects as its great power in the unified modeling of patchwise representations and patch-to-patch correlations. In this paper, we propose a novel AD framework: FOcus-the- Discrepancy (FOD), which can simultaneously spot the patch-wise, intra- and inter-discrepancies of anomalies. The major characteristic of our method is that we renovate the self attention maps in transformers to Intra-Inter-Correlation (I2Correlation). The I2Correlation contains a two-branch structure to first explicitly establish intraand inter-image correlations, and then fuses the features of two-branch to spotlight the abnormal patterns. To learn the intra- and inter-correlations adaptively, we propose the RBF-kernel-based target-correlations as learning targets for self-supervised learning. Besides, we introduce an entropy constraint strategy to solve the mode collapse issue in optimization and further amplify the normal abnormal distinguishability. Extensive experiments on three unsupervised real-world AD benchmarks show the superior performance of our approach. 
Code will be available at https://github.com/xcyao00/FOD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yao_Focus_the_Discrepancy_Intra-_and_Inter-Correlation_Learning_for_Image_Anomaly_ICCV_2023_paper.pdf", - "aff": "School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University + MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, Shanghai Jiao Tong University", + "aff": "School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University; School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University + MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University", "project": "", "github": "https://github.com/xcyao00/FOD", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Yao_Focus_the_Discrepancy_ICCV_2023_supplemental.pdf", @@ -22494,10 +23249,11 @@ "aff_unique_dep": "School of Electronic Information and Electrical Engineering", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", - "aff_campus_unique_index": "0;0;0;0;0+0", - "aff_campus_unique": "Shanghai", + "aff_campus_unique_index": "0;0;0;0;0", + "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Yao_2023_ICCV,\n \n author = {\n Yao,\n Xincheng and Li,\n Ruoqi and Qian,\n Zefeng and Luo,\n Yan and Zhang,\n Chongyang\n},\n title = {\n Focus the Discrepancy: Intra- and Inter-Correlation Learning for Image Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6803-6813\n} \n}" }, { "title": "Forecast-MAE: Self-supervised Pre-training for Motion Forecasting with Masked Autoencoders", @@ -22529,7 +23285,8 @@ "aff_campus_unique_index": "0;0;0+1", "aff_campus_unique": "Hong Kong SAR;Guangzhou", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Jie and Mei,\n Xiaodong and Liu,\n Ming\n},\n title = {\n Forecast-MAE: Self-supervised Pre-training for Motion Forecasting with Masked Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8679-8689\n} \n}" }, { "title": "Foreground Object Search by Distilling Composite Image Feature", @@ -22554,14 +23311,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Foreground_Object_Search_by_Distilling_Composite_Image_Feature_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Shanghai Jiao Tong University;Xi'an Jiao Tong University", + "aff_unique_norm": "Shanghai Jiao Tong University;Xi'an Jiaotong University", "aff_unique_dep": "Center for Machine Cognitive Computing;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.xjtu.edu.cn", "aff_unique_abbr": "SJTU;XJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n 
author = {\n Zhang,\n Bo and Sui,\n Jiacheng and Niu,\n Li\n},\n title = {\n Foreground Object Search by Distilling Composite Image Feature\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22986-22995\n} \n}" }, { "title": "Foreground and Text-lines Aware Document Image Rectification", @@ -22586,14 +23344,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Foreground_and_Text-lines_Aware_Document_Image_Rectification_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0+2;0", - "aff_unique_norm": "Harbin Institute of Technology;Hong Kong Polytechnic University;Pengcheng Laboratory", + "aff_unique_norm": "Harbin Institute of Technology;The Hong Kong Polytechnic University;PengCheng Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "http://en.hust.edu.cn/;https://www.polyu.edu.hk;", "aff_unique_abbr": "HIT;PolyU;", "aff_campus_unique_index": "0;0+1;0+0;0", "aff_campus_unique": "Shenzhen;Hong Kong SAR", "aff_country_unique_index": "0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Heng and Wu,\n Xiangping and Chen,\n Qingcai and Xiang,\n Qianjin\n},\n title = {\n Foreground and Text-lines Aware Document Image Rectification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19574-19583\n} \n}" }, { "title": "Foreground-Background Distribution Modeling Transformer for Visual Object Tracking", @@ -22625,7 +23384,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Dawei and He,\n Jianfeng and Ma,\n Yinchao and Yu,\n Qianjin and 
Zhang,\n Tianzhu\n},\n title = {\n Foreground-Background Distribution Modeling Transformer for Visual Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10117-10127\n} \n}" }, { "title": "Foreground-Background Separation through Concept Distillation from Generative Image Foundation Models", @@ -22637,7 +23397,7 @@ "author": "Mischa Dombrowski; Hadrien Reynaud; Matthew Baugh; Bernhard Kainz", "abstract": "Curating datasets for object segmentation is a difficult task. With the advent of large-scale pre-trained generative models, conditional image generation has been given a significant boost in result quality and ease of use. In this paper, we present a novel method that enables the generation of general foreground-background segmentation models from simple textual descriptions, without requiring segmentation labels. We leverage and explore pre-trained latent diffusion models, to automatically generate weak segmentation masks for concepts and objects. The masks are then used to fine-tune the diffusion model on an inpainting task, which enables fine-grained removal of the object, while at the same time providing a synthetic foreground and background dataset. We demonstrate that using this method beats previous methods in both discriminative and generative performance and closes the gap with fully supervised training while requiring no pixel-wise object labels. We show results on the task of segmenting four different objects (humans, dogs, cars, birds) and a use case scenario in medical image analysis. 
The code is available at https://github.com/MischaD/fobadiffusion.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Dombrowski_Foreground-Background_Separation_through_Concept_Distillation_from_Generative_Image_Foundation_Models_ICCV_2023_paper.pdf", - "aff": "Friedrich\u2013Alexander\u2013Universit \u00a8at Erlangen\u2013N \u00a8urnberg; Imperial College London; Imperial College London; Friedrich\u2013Alexander\u2013Universit \u00a8at Erlangen\u2013N \u00a8urnberg+Imperial College London", + "aff": "Friedrich–Alexander–Universit ¨at Erlangen–N ¨urnberg; Imperial College London; Imperial College London; Friedrich–Alexander–Universit ¨at Erlangen–N ¨urnberg+Imperial College London", "project": "", "github": "https://github.com/MischaD/fobadiffusion", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Dombrowski_Foreground-Background_Separation_through_ICCV_2023_supplemental.zip", @@ -22650,14 +23410,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dombrowski_Foreground-Background_Separation_through_Concept_Distillation_from_Generative_Image_Foundation_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0+1", - "aff_unique_norm": "Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg;Imperial College London", + "aff_unique_norm": "Friedrich-Alexander-Universität Erlangen-Nürnberg;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www fau.de;https://www.imperial.ac.uk", "aff_unique_abbr": "FAU;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0+1", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Dombrowski_2023_ICCV,\n \n author = {\n Dombrowski,\n Mischa and Reynaud,\n Hadrien and Baugh,\n Matthew and Kainz,\n Bernhard\n},\n title = {\n Foreground-Background Separation through Concept Distillation from Generative Image Foundation Models\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 988-998\n} \n}" }, { "title": "Forward Flow for Novel View Synthesis of Dynamic Scenes", @@ -22682,14 +23443,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Forward_Flow_for_Novel_View_Synthesis_of_Dynamic_Scenes_ICCV_2023_paper.html", "aff_unique_index": "0;0+1+2;0+1;3;2;2;2;2;2", - "aff_unique_norm": "Northwestern Polytechnical University;Shaanxi Key Laboratory of Information Acquisition and Processing;Baidu;Chinese University of Hong Kong, Shenzhen", - "aff_unique_dep": ";Information Acquisition and Processing;Baidu Inc.;Faculty of Nursing and School of Science and Engineering", + "aff_unique_norm": "Northwestern Polytechnical University;Shaanxi Key Laboratory of Information Acquisition and Processing;Baidu Inc.;Chinese University of Hong Kong, Shenzhen", + "aff_unique_dep": ";Information Acquisition and Processing;;Faculty of Nursing and School of Science and Engineering", "aff_unique_url": "https://www.nwpu.edu.cn;;https://www.baidu.com;https://www.cuhk.edu.cn/shenzhen", "aff_unique_abbr": "NWPU;;Baidu;CUHK-Shenzhen", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0+0+0;0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Xiang and Sun,\n Jiadai and Dai,\n Yuchao and Chen,\n Guanying and Ye,\n Xiaoqing and Tan,\n Xiao and Ding,\n Errui and Zhang,\n Yumeng and Wang,\n Jingdong\n},\n title = {\n Forward Flow for Novel View Synthesis of Dynamic Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16022-16033\n} \n}" }, { "title": "FreeCOS: Self-Supervised Learning from Fractals and Unlabeled Images for 
Curvilinear Object Segmentation", @@ -22721,7 +23483,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Tianyi and Ding,\n Xiaohuan and Zhang,\n Liang and Yang,\n Xin\n},\n title = {\n FreeCOS: Self-Supervised Learning from Fractals and Unlabeled Images for Curvilinear Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 876-886\n} \n}" }, { "title": "FreeDoM: Training-Free Energy-Guided Conditional Diffusion Model", @@ -22753,7 +23516,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "China;Saudi Arabia" + "aff_country_unique": "China;Saudi Arabia", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Jiwen and Wang,\n Yinhuai and Zhao,\n Chen and Ghanem,\n Bernard and Zhang,\n Jian\n},\n title = {\n FreeDoM: Training-Free Energy-Guided Conditional Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23174-23184\n} \n}" }, { "title": "Frequency Guidance Matters in Few-Shot Learning", @@ -22785,7 +23549,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Hao and Yang,\n Siyuan and Zhou,\n Joey Tianyi and Guo,\n Lanqing and Wen,\n Bihan\n},\n title = {\n Frequency Guidance Matters in Few-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2023\n},\n pages = {\n 11814-11824\n} \n}" }, { "title": "Frequency-aware GAN for Adversarial Manipulation Generation", @@ -22817,7 +23582,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Peifei and Osada,\n Genki and Kataoka,\n Hirokatsu and Takahashi,\n Tsubasa\n},\n title = {\n Frequency-aware GAN for Adversarial Manipulation Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4315-4324\n} \n}" }, { "title": "From Chaos Comes Order: Ordering Event Representations for Object Recognition and Detection", @@ -22826,7 +23592,7 @@ "track": "main", "pid": "10936", "author_site": "Nikola Zubi?, Daniel Gehrig, Mathias Gehrig, Davide Scaramuzza", - "author": "Nikola Zubi\u0107; Daniel Gehrig; Mathias Gehrig; Davide Scaramuzza", + "author": "Nikola Zubić; Daniel Gehrig; Mathias Gehrig; Davide Scaramuzza", "abstract": "Today, state-of-the-art deep neural networks that process events first convert them into dense, grid-like input representations before using an off-the-shelf network. However, selecting the appropriate representation for the task traditionally requires training a neural network for each representation and selecting the best one based on the validation score, which is very time-consuming. This work eliminates this bottleneck by selecting representations based on the Gromov-Wasserstein Discrepancy (GWD) between raw events and their representation. It is about 200 times faster to compute than training a neural network and preserves the task performance ranking of event representations across multiple representations, network backbones, datasets, and tasks. 
Thus finding representations with high task scores is equivalent to finding representations with a low GWD. We use this insight to, for the first time, perform a hyperparameter search on a large family of event representations, revealing new and powerful representations that exceed the state-of-the-art. Our optimized representations outperform existing representations by 1.7 mAP on the 1 Mpx dataset and 0.3 mAP on the Gen1 dataset, two established object detection benchmarks, and reach a 3.8% higher classification score on the mini N-ImageNet benchmark. Moreover, we outperform state-of-the-art by 2.1 mAP on Gen1 and state-of-the-art feed-forward methods by 6.0 mAP on the 1 Mpx datasets. This work opens a new unexplored field of explicit representation optimization for event-based learning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zubic_From_Chaos_Comes_Order_Ordering_Event_Representations_for_Object_Recognition_ICCV_2023_paper.pdf", "aff": ";;;", @@ -22840,7 +23606,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zubic_From_Chaos_Comes_Order_Ordering_Event_Representations_for_Object_Recognition_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zubic_From_Chaos_Comes_Order_Ordering_Event_Representations_for_Object_Recognition_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zubic_2023_ICCV,\n \n author = {\n Zubi\\'c,\n Nikola and Gehrig,\n Daniel and Gehrig,\n Mathias and Scaramuzza,\n Davide\n},\n title = {\n From Chaos Comes Order: Ordering Event Representations for Object Recognition and Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12846-12856\n} \n}" }, { "title": "From Knowledge Distillation to Self-Knowledge Distillation: A Unified Approach with Normalized Loss and Customized Soft Labels", @@ -22872,7 
+23639,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Zhendong and Zeng,\n Ailing and Li,\n Zhe and Zhang,\n Tianke and Yuan,\n Chun and Li,\n Yu\n},\n title = {\n From Knowledge Distillation to Self-Knowledge Distillation: A Unified Approach with Normalized Loss and Customized Soft Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17185-17194\n} \n}" }, { "title": "From Sky to the Ground: A Large-scale Benchmark and Simple Baseline Towards Real Rain Removal", @@ -22904,7 +23672,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Yun and Xiao,\n Xueyao and Chang,\n Yi and Deng,\n Shumin and Yan,\n Luxin\n},\n title = {\n From Sky to the Ground: A Large-scale Benchmark and Simple Baseline Towards Real Rain Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12097-12107\n} \n}" }, { "title": "FrozenRecon: Pose-free 3D Scene Reconstruction with Frozen Depth Models", @@ -22927,7 +23696,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_FrozenRecon_Pose-free_3D_Scene_Reconstruction_with_Frozen_Depth_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_FrozenRecon_Pose-free_3D_Scene_Reconstruction_with_Frozen_Depth_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Guangkai and Yin,\n Wei 
and Chen,\n Hao and Shen,\n Chunhua and Cheng,\n Kai and Zhao,\n Feng\n},\n title = {\n FrozenRecon: Pose-free 3D Scene Reconstruction with Frozen Depth Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9310-9320\n} \n}" }, { "title": "Full-Body Articulated Human-Object Interaction", @@ -22959,7 +23729,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Nan and Liu,\n Tengyu and Cao,\n Zhexuan and Cui,\n Jieming and Zhang,\n Zhiyuan and Chen,\n Yixin and Wang,\n He and Zhu,\n Yixin and Huang,\n Siyuan\n},\n title = {\n Full-Body Articulated Human-Object Interaction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9365-9376\n} \n}" }, { "title": "Fully Attentional Networks with Self-emerging Token Labeling", @@ -22984,14 +23755,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Fully_Attentional_Networks_with_Self-emerging_Token_Labeling_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;2;0+3;1;0", - "aff_unique_norm": "NVIDIA;Clemson University;Fudan University;California Institute of Technology", - "aff_unique_dep": "NVIDIA Corporation;;;", + "aff_unique_norm": "NVIDIA Corporation;Clemson University;Fudan University;California Institute of Technology", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nvidia.com;https://www.clemson.edu;https://www.fudan.edu.cn;https://www.caltech.edu", "aff_unique_abbr": "NVIDIA;Clemson;Fudan;Caltech", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0+0;0;0;1;0+0;0;0", - "aff_country_unique": "United 
States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Bingyin and Yu,\n Zhiding and Lan,\n Shiyi and Cheng,\n Yutao and Anandkumar,\n Anima and Lao,\n Yingjie and Alvarez,\n Jose M.\n},\n title = {\n Fully Attentional Networks with Self-emerging Token Labeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5585-5595\n} \n}" }, { "title": "FunnyBirds: A Synthetic Vision Dataset for a Part-Based Analysis of Explainable AI Methods", @@ -23014,7 +23786,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hesse_FunnyBirds_A_Synthetic_Vision_Dataset_for_a_Part-Based_Analysis_of_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hesse_FunnyBirds_A_Synthetic_Vision_Dataset_for_a_Part-Based_Analysis_of_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Hesse_2023_ICCV,\n \n author = {\n Hesse,\n Robin and Schaub-Meyer,\n Simone and Roth,\n Stefan\n},\n title = {\n FunnyBirds: A Synthetic Vision Dataset for a Part-Based Analysis of Explainable AI Methods\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3981-3991\n} \n}" }, { "title": "G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game Theory", @@ -23046,7 +23819,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Hongxiang and Cao,\n Meng and Cheng,\n Xuxin and Li,\n Yaowei and Zhu,\n Zhihong and Zou,\n Yuexian\n},\n title = {\n G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game 
Theory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12032-12042\n} \n}" }, { "title": "GACE: Geometry Aware Confidence Enhancement for Black-Box 3D Object Detectors on LiDAR-Data", @@ -23078,7 +23852,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Schinagl_2023_ICCV,\n \n author = {\n Schinagl,\n David and Krispel,\n Georg and Fruhwirth-Reisinger,\n Christian and Possegger,\n Horst and Bischof,\n Horst\n},\n title = {\n GACE: Geometry Aware Confidence Enhancement for Black-Box 3D Object Detectors on LiDAR-Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6566-6576\n} \n}" }, { "title": "GAFlow: Incorporating Gaussian Attention into Optical Flow", @@ -23103,14 +23878,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Luo_GAFlow_Incorporating_Gaussian_Attention_into_Optical_Flow_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;2;0;3+0", - "aff_unique_norm": "Megvii Technology;Group 423;Beijing Jiao Tong University;University of Electronic Science and Technology of China", + "aff_unique_norm": "Megvii Technology;Group 423;Beijing Jiaotong University;University of Electronic Science and Technology of China", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.megvii.com;;http://www.njtu.edu.cn/en;https://www.uestc.edu.cn", "aff_unique_abbr": "Megvii;;BJTU;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Ao and Yang,\n Fan and Li,\n Xin and 
Nie,\n Lang and Lin,\n Chunyu and Fan,\n Haoqiang and Liu,\n Shuaicheng\n},\n title = {\n GAFlow: Incorporating Gaussian Attention into Optical Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9642-9651\n} \n}" }, { "title": "GAIT: Generating Aesthetic Indoor Tours with Deep Reinforcement Learning", @@ -23142,7 +23918,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Desai and Hu,\n Ping and Sun,\n Xin and Pirk,\n Soren and Zhang,\n Jianming and Mech,\n Radomir and Kaufman,\n Arie E.\n},\n title = {\n GAIT: Generating Aesthetic Indoor Tours with Deep Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7409-7419\n} \n}" }, { "title": "GECCO: Geometrically-Conditioned Point Diffusion Models", @@ -23151,10 +23928,10 @@ "track": "main", "pid": "2739", "author_site": "Micha? J Tyszkiewicz, Pascal Fua, Eduard Trulls", - "author": "Micha\u0142 J Tyszkiewicz; Pascal Fua; Eduard Trulls", + "author": "Michał J Tyszkiewicz; Pascal Fua; Eduard Trulls", "abstract": "Diffusion models generating images conditionally on text, such as Dall-E 2 and Stable Diffusion, have recently made a splash far beyond the computer vision community. Here, we tackle the related problem of generating point clouds, both unconditionally, and conditionally with images. For the latter, we introduce a novel geometrically-motivated conditioning scheme based on projecting sparse image features into the point cloud and attaching them to each individual point, at every step in the denoising process. 
This approach improves geometric consistency and yields greater fidelity than current methods relying on unstructured, global latent codes. Additionally, we show how to apply recent continuous-time diffusion schemes. Our method performs on par or above the state of art on conditional and unconditional experiments on synthetic data, while being faster, lighter, and delivering tractable likelihoods. We show it can also scale to diverse indoors scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tyszkiewicz_GECCO_Geometrically-Conditioned_Point_Diffusion_Models_ICCV_2023_paper.pdf", - "aff": "\u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL); \u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL); Google Research, Zurich", + "aff": "École Polytechnique Fédérale de Lausanne (EPFL); École Polytechnique Fédérale de Lausanne (EPFL); Google Research, Zurich", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Tyszkiewicz_GECCO_Geometrically-Conditioned_Point_ICCV_2023_supplemental.zip", @@ -23167,14 +23944,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tyszkiewicz_GECCO_Geometrically-Conditioned_Point_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "EPFL;Google", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.epfl.ch;https://research.google", "aff_unique_abbr": "EPFL;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Lausanne;Zurich", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Tyszkiewicz_2023_ICCV,\n \n author = {\n Tyszkiewicz,\n Micha{\\l} J and Fua,\n Pascal and Trulls,\n Eduard\n},\n title = {\n GECCO: Geometrically-Conditioned Point Diffusion Models\n},\n booktitle = {\n Proceedings of 
the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2128-2138\n} \n}" }, { "title": "GEDepth: Ground Embedding for Monocular Depth Estimation", @@ -23197,7 +23975,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_GEDepth_Ground_Embedding_for_Monocular_Depth_Estimation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_GEDepth_Ground_Embedding_for_Monocular_Depth_Estimation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Xiaodong and Ma,\n Zhuang and Ji,\n Zhiyu and Ren,\n Zhe\n},\n title = {\n GEDepth: Ground Embedding for Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12719-12727\n} \n}" }, { "title": "GET: Group Event Transformer for Event-Based Vision", @@ -23229,7 +24008,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Yansong and Zhang,\n Yueyi and Xiong,\n Zhiwei and Sun,\n Xiaoyan and Wu,\n Feng\n},\n title = {\n GET: Group Event Transformer for Event-Based Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6038-6048\n} \n}" }, { "title": "GETAvatar: Generative Textured Meshes for Animatable Human Avatars", @@ -23252,7 +24032,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_GETAvatar_Generative_Textured_Meshes_for_Animatable_Human_Avatars_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_GETAvatar_Generative_Textured_Meshes_for_Animatable_Human_Avatars_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xuanmeng and Zhang,\n Jianfeng and Chacko,\n Rohan and Xu,\n Hongyi and Song,\n Guoxian and Yang,\n Yi and Feng,\n Jiashi\n},\n title = {\n GETAvatar: Generative Textured Meshes for Animatable Human Avatars\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2273-2282\n} \n}" }, { "title": "GIFD: A Generative Gradient Inversion Method with Feature Domain Optimization", @@ -23277,14 +24058,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fang_GIFD_A_Generative_Gradient_Inversion_Method_with_Feature_Domain_Optimization_ICCV_2023_paper.html", "aff_unique_index": "0+1+2+3;0+1+2+3;0+1+2;1+2;1", - "aff_unique_norm": "Harbin Institute of Technology;Tsinghua University;Pengcheng Laboratory;Guangdong Provincial Key Laboratory of Novel Security Intelligence Technologies", - "aff_unique_dep": ";International Graduate School;Peng Cheng Laboratory;Provincial Key Laboratory of Novel Security Intelligence Technologies", + "aff_unique_norm": "Harbin Institute of Technology;Tsinghua University;Peng Cheng Laboratory;Guangdong Provincial Key Laboratory of Novel Security Intelligence Technologies", + "aff_unique_dep": ";International Graduate School;;Provincial Key Laboratory of Novel Security Intelligence Technologies", "aff_unique_url": "http://en.hhit.edu.cn/;https://www.tsinghua.edu.cn;http://www.pcl.ac.cn;", "aff_unique_abbr": "HIT;THU;PCL;", "aff_campus_unique_index": "0+0;0+0;0+0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0+0+0;0+0+0+0;0+0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Hao and 
Chen,\n Bin and Wang,\n Xuan and Wang,\n Zhi and Xia,\n Shu-Tao\n},\n title = {\n GIFD: A Generative Gradient Inversion Method with Feature Domain Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4967-4976\n} \n}" }, { "title": "GLA-GCN: Global-local Adaptive Graph Convolutional Network for 3D Human Pose Estimation from Monocular Video", @@ -23309,14 +24091,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_GLA-GCN_Global-local_Adaptive_Graph_Convolutional_Network_for_3D_Human_Pose_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "Hong Kong Polytechnic University;Shenzhen University", + "aff_unique_norm": "The Hong Kong Polytechnic University;Shenzhen University", "aff_unique_dep": ";", "aff_unique_url": "https://www.polyu.edu.hk;https://www.szu.edu.cn", "aff_unique_abbr": "PolyU;SZU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Bruce X.B. 
and Zhang,\n Zhi and Liu,\n Yongxu and Zhong,\n Sheng-hua and Liu,\n Yan and Chen,\n Chang Wen\n},\n title = {\n GLA-GCN: Global-local Adaptive Graph Convolutional Network for 3D Human Pose Estimation from Monocular Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8818-8829\n} \n}" }, { "title": "GO-SLAM: Global Optimization for Consistent 3D Instant Reconstruction", @@ -23348,7 +24131,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Youmin and Tosi,\n Fabio and Mattoccia,\n Stefano and Poggi,\n Matteo\n},\n title = {\n GO-SLAM: Global Optimization for Consistent 3D Instant Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3727-3737\n} \n}" }, { "title": "GPA-3D: Geometry-aware Prototype Alignment for Unsupervised Domain Adaptive 3D Object Detection from Point Clouds", @@ -23360,7 +24144,7 @@ "author": "Ziyu Li; Jingming Guo; Tongtong Cao; Liu Bingbing; Wankou Yang", "abstract": "LiDAR-based 3D detection has made great progress in recent years. However, the performance of 3D detectors is considerably limited when deployed in unseen environments, owing to the severe domain gap problem. Existing domain adaptive 3D detection methods do not adequately consider the problem of the distributional discrepancy in feature space, thereby hindering the generalization of detectors across domains. 
In this work, we propose a novel unsupervised domain adaptive 3D detection framework, namely Geometry-aware Prototype Alignment (GPA-3D), which explicitly leverages the intrinsic geometric relationship from point cloud objects to reduce the feature discrepancy, thus facilitating cross-domain transferring. Specifically, GPA-3D assigns a series of tailored and learnable prototypes to point cloud objects with distinct geometric structures. Each prototype aligns BEV (bird's-eye-view) features derived from corresponding point cloud objects on source and target domains, reducing the distributional discrepancy and achieving better adaptation. The evaluation results obtained on various benchmarks, including Waymo, nuScenes and KITTI, demonstrate the superiority of our GPA-3D over the state-of-the-art approaches for different adaptation scenarios. The MindSpore version code will be publicly available at https://github.com/Liz66666/GPA3D.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_GPA-3D_Geometry-aware_Prototype_Alignment_for_Unsupervised_Domain_Adaptive_3D_Object_ICCV_2023_paper.pdf", - "aff": "School of Automation, Southeast University+Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; School of Automation, Southeast University", + "aff": "School of Automation, Southeast University+Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; School of Automation, Southeast University", "project": "", "github": "https://github.com/Liz66666/GPA3D", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_GPA-3D_Geometry-aware_Prototype_ICCV_2023_supplemental.pdf", @@ -23374,13 +24158,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_GPA-3D_Geometry-aware_Prototype_Alignment_for_Unsupervised_Domain_Adaptive_3D_Object_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;0", "aff_unique_norm": "Southeast University;Huawei", - 
"aff_unique_dep": "School of Automation;Noah\u2019s Ark Lab", + "aff_unique_dep": "School of Automation;Noah’s Ark Lab", "aff_unique_url": "https://www.seu.edu.cn/;https://www.huawei.com", "aff_unique_abbr": "SEU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ziyu and Guo,\n Jingming and Cao,\n Tongtong and Bingbing,\n Liu and Yang,\n Wankou\n},\n title = {\n GPA-3D: Geometry-aware Prototype Alignment for Unsupervised Domain Adaptive 3D Object Detection from Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6394-6403\n} \n}" }, { "title": "GPFL: Simultaneously Learning Global and Personalized Feature Information for Personalized Federated Learning", @@ -23392,7 +24177,7 @@ "author": "Jianqing Zhang; Yang Hua; Hao Wang; Tao Song; Zhengui Xue; Ruhui Ma; Jian Cao; Haibing Guan", "abstract": "Federated Learning (FL) is popular for its privacy-preserving and collaborative learning capabilities. Recently, personalized FL (pFL) has received attention for its ability to address statistical heterogeneity and achieve personalization in FL. However, from the perspective of feature extraction, most existing pFL methods only focus on extracting global or personalized feature information during local training, which fails to meet the collaborative learning and personalization goals of pFL. To address this, we propose a new pFL method, named GPFL, to simultaneously learn global and personalized feature information on each client. We conduct extensive experiments on six datasets in three statistically heterogeneous settings and show the superiority of GPFL over ten state-of-the-art methods regarding effectiveness, scalability, fairness, stability, and privacy. 
Besides, GPFL mitigates overfitting and outperforms the baselines by up to 8.99% in accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_GPFL_Simultaneously_Learning_Global_and_Personalized_Feature_Information_for_Personalized_ICCV_2023_paper.pdf", - "aff": "Shanghai Jiao Tong University+Queen\u2019s University Belfast+Louisiana State University; Queen\u2019s University Belfast; Louisiana State University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University", + "aff": "Shanghai Jiao Tong University+Queen’s University Belfast+Louisiana State University; Queen’s University Belfast; Louisiana State University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Shanghai Jiao Tong University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhang_GPFL_Simultaneously_Learning_ICCV_2023_supplemental.zip", @@ -23412,7 +24197,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1+2;1;2;0;0;0;0;0", - "aff_country_unique": "China;United Kingdom;United States" + "aff_country_unique": "China;United Kingdom;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jianqing and Hua,\n Yang and Wang,\n Hao and Song,\n Tao and Xue,\n Zhengui and Ma,\n Ruhui and Cao,\n Jian and Guan,\n Haibing\n},\n title = {\n GPFL: Simultaneously Learning Global and Personalized Feature Information for Personalized Federated Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5041-5051\n} \n}" }, { "title": "GPGait: Generalized Pose-based Gait Recognition", @@ -23444,7 +24230,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing;", 
"aff_country_unique_index": "0+1;0+1;0;0;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Yang and Meng,\n Shibei and Hou,\n Saihui and Hu,\n Xuecai and Huang,\n Yongzhen\n},\n title = {\n GPGait: Generalized Pose-based Gait Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19595-19604\n} \n}" }, { "title": "GRAM-HD: 3D-Consistent Image Generation at High Resolution with Generative Radiance Manifolds", @@ -23469,20 +24256,25 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiang_GRAM-HD_3D-Consistent_Image_Generation_at_High_Resolution_with_Generative_Radiance_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;0+1;1", - "aff_unique_norm": "Tsinghua University;Microsoft", + "aff_unique_norm": "Tsinghua University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "THU;MSR Asia", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xiang_2023_ICCV,\n \n author = {\n Xiang,\n Jianfeng and Yang,\n Jiaolong and Deng,\n Yu and Tong,\n Xin\n},\n title = {\n GRAM-HD: 3D-Consistent Image Generation at High Resolution with Generative Radiance Manifolds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2195-2205\n} \n}" }, { "title": "GaFET: Learning Geometry-aware Facial Expression Translation from In-The-Wild Images", "session": "Image and video synthesis 1", + "pdf": "https://arxiv.org/pdf/2308.03413", + "bibtex": 
"@misc{ma2023gafetlearninggeometryawarefacial, title={GaFET: Learning Geometry-aware Facial Expression Translation from In-The-Wild Images}, author={Tianxiang Ma and Bingchuan Li and Qian He and Jing Dong and Tieniu Tan}, year={2023}, eprint={2308.03413}, archivePrefix={arXiv}, primaryClass={cs.CV}, url={https://arxiv.org/abs/2308.03413}, }", "author": "Tianxiang Ma, Bingchuan Li, Qian He, Jing Dong, Tieniu Tan", "status": "Poster", + "gs_citation": 3, + "abstract": "While current face animation methods can manipulate expressions individually, they suffer from several limitations. The expressions manipulated by some motion-based facial reenactment models are crude. Other ideas modeled with facial action units cannot generalize to arbitrary expressions not covered by annotations. In this paper, we introduce a novel Geometry-aware Facial Expression Translation (GaFET) framework, which is based on parametric 3D facial representations and can stably decoupled expression. Among them, a Multi-level Feature Aligned Transformer is proposed to complement non-geometric facial detail features while addressing the alignment challenge of spatial features. Further, we design a De-expression model based on StyleGAN, in order to reduce the learning difficulty of GaFET in unpaired \"in-the-wild\" images. Extensive qualitative and quantitative experiments demonstrate that we achieve higher-quality and more accurate facial expression transfer results compared to state-of-the-art methods, and demonstrate applicability of various poses and complex textures. 
Besides, videos or annotated training data are omitted, making our method easier to use and generalize.", "track": "main", "pid": "1998" }, @@ -23516,7 +24308,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hanoi", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Vietnam" + "aff_country_unique": "Vietnam", + "bibtex": "@InProceedings{Ngo_2023_ICCV,\n \n author = {\n Ngo,\n Tuan Duc and Hua,\n Binh-Son and Nguyen,\n Khoi\n},\n title = {\n GaPro: Box-Supervised 3D Point Cloud Instance Segmentation Using Gaussian Processes as Pseudo Labelers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17794-17803\n} \n}" }, { "title": "GameFormer: Game-theoretic Modeling and Learning of Transformer-based Interactive Prediction and Planning for Autonomous Driving", @@ -23548,7 +24341,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zhiyu and Liu,\n Haochen and Lv,\n Chen\n},\n title = {\n GameFormer: Game-theoretic Modeling and Learning of Transformer-based Interactive Prediction and Planning for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3903-3913\n} \n}" }, { "title": "GasMono: Geometry-Aided Self-Supervised Monocular Depth Estimation for Indoor Scenes", @@ -23580,7 +24374,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;0;1", - "aff_country_unique": "China;Italy" + "aff_country_unique": "China;Italy", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Chaoqiang and Poggi,\n Matteo and Tosi,\n Fabio and Zhou,\n Lei and Sun,\n Qiyu and 
Tang,\n Yang and Mattoccia,\n Stefano\n},\n title = {\n GasMono: Geometry-Aided Self-Supervised Monocular Depth Estimation for Indoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16209-16220\n} \n}" }, { "title": "GePSAn: Generative Procedure Step Anticipation in Cooking Videos", @@ -23605,14 +24400,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Abdelsalam_GePSAn_Generative_Procedure_Step_Anticipation_in_Cooking_Videos_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0+2;0", - "aff_unique_norm": "Samsung;Waabi;York University", + "aff_unique_norm": "Samsung AI Centre;Waabi;York University", "aff_unique_dep": "AI Centre;;", "aff_unique_url": "https://www.samsung.com;;https://www.yorku.ca", "aff_unique_abbr": "Samsung AI;;York U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+2;0", - "aff_country_unique": "South Korea;;Canada" + "aff_country_unique": "South Korea;;Canada", + "bibtex": "@InProceedings{Abdelsalam_2023_ICCV,\n \n author = {\n Abdelsalam,\n Mohamed A. and Rangrej,\n Samrudhdhi B. and Hadji,\n Isma and Dvornik,\n Nikita and Derpanis,\n Konstantinos G. 
and Fazly,\n Afsaneh\n},\n title = {\n GePSAn: Generative Procedure Step Anticipation in Cooking Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2988-2997\n} \n}" }, { "title": "GeT: Generative Target Structure Debiasing for Domain Adaptation", @@ -23644,7 +24440,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Can and Lee,\n Gim Hee\n},\n title = {\n GeT: Generative Target Structure Debiasing for Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23577-23588\n} \n}" }, { "title": "Gender Artifacts in Visual Datasets", @@ -23676,7 +24473,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Meister_2023_ICCV,\n \n author = {\n Meister,\n Nicole and Zhao,\n Dora and Wang,\n Angelina and Ramaswamy,\n Vikram V. 
and Fong,\n Ruth and Russakovsky,\n Olga\n},\n title = {\n Gender Artifacts in Visual Datasets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4837-4848\n} \n}" }, { "title": "General Image-to-Image Translation with One-Shot Image Guidance", @@ -23708,7 +24506,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Bin and Liu,\n Zuhao and Peng,\n Yunbo and Lin,\n Yue\n},\n title = {\n General Image-to-Image Translation with One-Shot Image Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22736-22746\n} \n}" }, { "title": "General Planar Motion from a Pair of 3D Correspondences", @@ -23740,7 +24539,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dibene_2023_ICCV,\n \n author = {\n Dibene,\n Juan Carlos and Min,\n Zhixiang and Dunn,\n Enrique\n},\n title = {\n General Planar Motion from a Pair of 3D Correspondences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8060-8070\n} \n}" }, { "title": "Generalizable Decision Boundaries: Dualistic Meta-Learning for Open Set Domain Generalization", @@ -23772,7 +24572,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xiran and Zhang,\n Jian and Qi,\n Lei 
and Shi,\n Yinghuan\n},\n title = {\n Generalizable Decision Boundaries: Dualistic Meta-Learning for Open Set Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11564-11573\n} \n}" }, { "title": "Generalizable Neural Fields as Partially Observed Neural Processes", @@ -23804,7 +24605,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gu_2023_ICCV,\n \n author = {\n Gu,\n Jeffrey and Wang,\n Kuan-Chieh and Yeung,\n Serena\n},\n title = {\n Generalizable Neural Fields as Partially Observed Neural Processes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5330-5339\n} \n}" }, { "title": "Generalized Differentiable RANSAC", @@ -23836,7 +24638,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Prague;", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "Czech Republic;Switzerland" + "aff_country_unique": "Czech Republic;Switzerland", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Tong and Patel,\n Yash and Shekhovtsov,\n Alexander and Matas,\n Jiri and Barath,\n Daniel\n},\n title = {\n Generalized Differentiable RANSAC\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17649-17660\n} \n}" }, { "title": "Generalized Few-Shot Point Cloud Segmentation via Geometric Words", @@ -23868,7 +24671,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": 
"@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yating and Hu,\n Conghui and Zhao,\n Na and Lee,\n Gim Hee\n},\n title = {\n Generalized Few-Shot Point Cloud Segmentation via Geometric Words\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21506-21515\n} \n}" }, { "title": "Generalized Lightness Adaptation with Channel Selective Normalization", @@ -23900,7 +24704,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;2;0", - "aff_country_unique": "China;New Zealand;Singapore" + "aff_country_unique": "China;New Zealand;Singapore", + "bibtex": "@InProceedings{Yao_2023_ICCV,\n \n author = {\n Yao,\n Mingde and Huang,\n Jie and Jin,\n Xin and Xu,\n Ruikang and Zhou,\n Shenglong and Zhou,\n Man and Xiong,\n Zhiwei\n},\n title = {\n Generalized Lightness Adaptation with Channel Selective Normalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10668-10679\n} \n}" }, { "title": "Generalized Sum Pooling for Metric Learning", @@ -23908,8 +24713,8 @@ "status": "Poster", "track": "main", "pid": "11935", - "author_site": "Yeti Z. G\u00fcrb\u00fcz, Ozan Sener, A. Aydin Alatan", - "author": "Yeti Z. G\u00fcrb\u00fcz; Ozan Sener; A. Aydin Alatan", + "author_site": "Yeti Z. Gürbüz, Ozan Sener, A. Aydin Alatan", + "author": "Yeti Z. Gürbüz; Ozan Sener; A. Aydin Alatan", "abstract": "A common architectural choice for deep metric learning is a convolutional neural network followed by global average pooling (GAP). Albeit simple, GAP is a highly effective way to aggregate information. One possible explanation for the effectiveness of GAP is considering each feature vector as representing a different semantic entity and GAP as a convex combination of them. 
Following this perspective, we generalize GAP and propose a learnable generalized sum pooling method (GSP). GSP improves GAP with two distinct abilities: i) the ability to choose a subset of semantic entities, effectively learning to ignore nuisance information, and ii) learning the weights corresponding to the importance of each entity. Formally, we propose an entropy-smoothed optimal transport problem and show that it is a strict generalization of GAP, i.e., a specific realization of the problem gives back GAP. We show that this optimization problem enjoys analytical gradients enabling us to use it as a direct learnable replacement for GAP. We further propose a zero-shot loss to ease the learning of GSP. We show the effectiveness of our method with extensive evaluations on 4 popular metric learning benchmarks. Code is available at: GSP-DML Framework", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Gurbuz_Generalized_Sum_Pooling_for_Metric_Learning_ICCV_2023_paper.pdf", "aff": "RSiM, TU Berlin; Intel Labs; OGAM and METU", @@ -23925,14 +24730,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gurbuz_Generalized_Sum_Pooling_for_Metric_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1", - "aff_unique_norm": "Technische Universit\u00e4t Berlin;Intel;", + "aff_unique_norm": "Technische Universität Berlin;Intel Corporation;", "aff_unique_dep": "RSiM;Intel Labs;", "aff_unique_url": "https://www.tu-berlin.de;https://www.intel.com;", "aff_unique_abbr": "TU Berlin;Intel;", "aff_campus_unique_index": "0", "aff_campus_unique": "Berlin;", "aff_country_unique_index": "0;1", - "aff_country_unique": "Germany;United States;" + "aff_country_unique": "Germany;United States;", + "bibtex": "@InProceedings{Gurbuz_2023_ICCV,\n \n author = {\n G\\"urb\\"uz,\n Yeti Z. and Sener,\n Ozan and Alatan,\n A. 
Aydin\n},\n title = {\n Generalized Sum Pooling for Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5462-5473\n} \n}" }, { "title": "Generalizing Event-Based Motion Deblurring in Real-World Scenarios", @@ -23960,11 +24766,12 @@ "aff_unique_norm": "Wuhan University;Shenzhen Institute of Advanced Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.whu.edu.cn/;http://www.siat.ac.cn", - "aff_unique_abbr": "WHU;", + "aff_unique_abbr": "WHU;SIAT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xiang and Yu,\n Lei and Yang,\n Wen and Liu,\n Jianzhuang and Xia,\n Gui-Song\n},\n title = {\n Generalizing Event-Based Motion Deblurring in Real-World Scenarios\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10734-10744\n} \n}" }, { "title": "Generalizing Neural Human Fitting to Unseen Poses With Articulated SE(3) Equivariance", @@ -23976,7 +24783,7 @@ "author": "Haiwen Feng; Peter Kulits; Shichen Liu; Michael J. Black; Victoria Fernandez Abrevaya", "abstract": "We address the problem of fitting a parametric human body model (SMPL) to point cloud data. Optimization based methods require careful initialization and are prone to becoming trapped in local optima. Learning-based methods address this but do not generalize well when the input pose is far from those seen during training. For rigid point clouds, remarkable generalization has been achieved by leveraging SE(3)-equivariant networks, but these methods do not work on articulated objects. 
In this work we extend this idea to human bodies and propose ArtEq, a novel part-based SE(3)-equivariant neural architecture for SMPL model estimation from point clouds. Specifically, we learn a part detection network by leveraging local SO(3) invariance, and regress shape and pose using articulated SE(3) shape-invariant and pose-equivariant networks, all trained end-to-end. Our novel pose regression module leverages the permutation-equivariant property of self-attention layers to preserve rotational equivariance. Experimental results show that ArtEq generalizes to poses not seen during training, outperforming state-of-the-art methods by 44%in terms of body reconstruction accuracy, without requiring an optimization refinement step. Furthermore, ArtEq is three orders of magnitude faster during inference than prior work and has 97.3% fewer parameters. The code and model are available for research purposes at https://arteq.is.tue.mpg.de.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Feng_Generalizing_Neural_Human_Fitting_to_Unseen_Poses_With_Articulated_SE3_ICCV_2023_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; University of Southern California; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", + "aff": "Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany; University of Southern California; Max Planck Institute for Intelligent Systems, Tübingen, Germany; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "https://arteq.is.tue.mpg.de", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Feng_Generalizing_Neural_Human_ICCV_2023_supplemental.pdf", @@ -23994,9 +24801,10 @@ "aff_unique_url": 
"https://www.mpi-is.mpg.de;https://www.usc.edu", "aff_unique_abbr": "MPI-IS;USC", "aff_campus_unique_index": "0;0;1;0;0", - "aff_campus_unique": "T\u00fcbingen;Los Angeles", + "aff_campus_unique": "Tübingen;Los Angeles", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Haiwen and Kulits,\n Peter and Liu,\n Shichen and Black,\n Michael J. and Abrevaya,\n Victoria Fernandez\n},\n title = {\n Generalizing Neural Human Fitting to Unseen Poses With Articulated SE(3) Equivariance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7977-7988\n} \n}" }, { "title": "Generating Dynamic Kernels via Transformers for Lane Detection", @@ -24028,7 +24836,8 @@ "aff_campus_unique_index": "0;0;2;0;0", "aff_campus_unique": "Melbourne;;Wuhan", "aff_country_unique_index": "0;1;0;1;0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ziye and Liu,\n Yu and Gong,\n Mingming and Du,\n Bo and Qian,\n Guoqi and Smith-Miles,\n Kate\n},\n title = {\n Generating Dynamic Kernels via Transformers for Lane Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6835-6844\n} \n}" }, { "title": "Generating Instance-level Prompts for Rehearsal-free Continual Learning", @@ -24053,14 +24862,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jung_Generating_Instance-level_Prompts_for_Rehearsal-free_Continual_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3", - "aff_unique_norm": "Seoul National University;NAVER Corporation;NAVER Cloud;Amazon", + "aff_unique_norm": 
"Seoul National University;NAVER Corporation;NAVER Cloud;Amazon Web Services", "aff_unique_dep": "Department of Electrical and Computer Engineering;AI Lab;;AWS AI Labs", "aff_unique_url": "https://www.snu.ac.kr;https://www.naver.com;https://www.naver.com;https://aws.amazon.com", "aff_unique_abbr": "SNU;NAVER;NAVER;AWS", "aff_campus_unique_index": "0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Jung_2023_ICCV,\n \n author = {\n Jung,\n Dahuin and Han,\n Dongyoon and Bang,\n Jihwan and Song,\n Hwanjun\n},\n title = {\n Generating Instance-level Prompts for Rehearsal-free Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11847-11857\n} \n}" }, { "title": "Generating Realistic Images from In-the-wild Sounds", @@ -24092,7 +24902,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Taegyeong and Kang,\n Jeonghun and Kim,\n Hyeonyu and Kim,\n Taehwan\n},\n title = {\n Generating Realistic Images from In-the-wild Sounds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7160-7170\n} \n}" }, { "title": "Generating Visual Scenes from Touch", @@ -24115,7 +24926,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Generating_Visual_Scenes_from_Touch_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Generating_Visual_Scenes_from_Touch_ICCV_2023_paper.html", + "bibtex": 
"@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Fengyu and Zhang,\n Jiacheng and Owens,\n Andrew\n},\n title = {\n Generating Visual Scenes from Touch\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22070-22080\n} \n}" }, { "title": "Generative Action Description Prompts for Skeleton-based Action Recognition", @@ -24140,14 +24952,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiang_Generative_Action_Description_Prompts_for_Skeleton-based_Action_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0+1;1+2;1+2;1;0", - "aff_unique_norm": "Hong Kong Polytechnic University;Alibaba Group;Mannheim University", + "aff_unique_norm": "The Hong Kong Polytechnic University;Alibaba Group;Mannheim University", "aff_unique_dep": ";DAMO Academy;", "aff_unique_url": "https://www.polyu.edu.hk;https://www.alibaba-group.com;https://www.uni-mannheim.de", "aff_unique_abbr": "PolyU;Alibaba;UM", "aff_campus_unique_index": "0;;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0+1;0+1;0;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Xiang_2023_ICCV,\n \n author = {\n Xiang,\n Wangmeng and Li,\n Chao and Zhou,\n Yuxuan and Wang,\n Biao and Zhang,\n Lei\n},\n title = {\n Generative Action Description Prompts for Skeleton-based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10276-10285\n} \n}" }, { "title": "Generative Gradient Inversion via Over-Parameterized Networks in Federated Learning", @@ -24179,7 +24992,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": 
"@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Chi and Xiaoman,\n Zhang and Sotthiwat,\n Ekanut and Xu,\n Yanyu and Liu,\n Ping and Zhen,\n Liangli and Liu,\n Yong\n},\n title = {\n Generative Gradient Inversion via Over-Parameterized Networks in Federated Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5126-5135\n} \n}" }, { "title": "Generative Multiplane Neural Radiance for 3D-Aware Image Generation", @@ -24191,7 +25005,7 @@ "author": "Amandeep Kumar; Ankan Kumar Bhunia; Sanath Narayan; Hisham Cholakkal; Rao Muhammad Anwer; Salman Khan; Ming-Hsuan Yang; Fahad Shahbaz Khan", "abstract": "We present a method to efficiently generate 3D-aware high-resolution images that are view-consistent across multiple target views. The proposed multiplane neural radiance model, named GMNR, consists of a novel a-guided view-dependent representation (a-VdR) module for learning view-dependent information. The a-VdR module, faciliated by an a-guided pixel sampling technique, computes the view-dependent representation efficiently by learning viewing direction and position coefficients. Moreover, we propose a view-consistency loss to enforce photometric similarity across multiple views. The GMNR model can generate 3D-aware high-resolution images that are view-consistent across multiple camera poses, while maintaining the computational efficiency in terms of both training and inference time. Experiments on three datasets demonstrate the effectiveness of the proposed modules, leading to favorable results in terms of both generation quality and inference time, compared to existing approaches. Our GMNR model generates 3D-aware images of 1024 x 1024 pixels with 17.6 FPS on a single V100. 
Code : https://github.com/VIROBO-15/GMNR", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Kumar_Generative_Multiplane_Neural_Radiance_for_3D-Aware_Image_Generation_ICCV_2023_paper.pdf", - "aff": "Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI; Technology Innovation Institute; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI + Aalto University; Mohamed bin Zayed University of AI; University of California, Merced + Yonsei University + Google Research; Mohamed bin Zayed University of AI + Link\u00f6ping University", + "aff": "Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI; Technology Innovation Institute; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI + Aalto University; Mohamed bin Zayed University of AI; University of California, Merced + Yonsei University + Google Research; Mohamed bin Zayed University of AI + Linköping University", "project": "", "github": "https://github.com/VIROBO-15/GMNR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Kumar_Generative_Multiplane_Neural_ICCV_2023_supplemental.zip", @@ -24204,14 +25018,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kumar_Generative_Multiplane_Neural_Radiance_for_3D-Aware_Image_Generation_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0+2;0;3+4+5;0+6", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Technology Innovation Institute;Aalto University;University of California, Merced;Yonsei University;Google;Link\u00f6ping University", + "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Technology Innovation Institute;Aalto University;University of California, Merced;Yonsei University;Google;Linköping University", "aff_unique_dep": ";;;;;Google Research;", "aff_unique_url": 
"https://mbzuai.ac.ae;;https://www.aalto.fi;https://www.ucmerced.edu;https://www.yonsei.ac.kr;https://research.google;https://www.liu.se", "aff_unique_abbr": "MBZUAI;;Aalto;UC Merced;Yonsei;Google Research;LiU", "aff_campus_unique_index": ";1+2;", "aff_campus_unique": ";Merced;Mountain View", "aff_country_unique_index": "0;0;0;0+2;0;3+4+3;0+5", - "aff_country_unique": "United Arab Emirates;;Finland;United States;South Korea;Sweden" + "aff_country_unique": "United Arab Emirates;;Finland;United States;South Korea;Sweden", + "bibtex": "@InProceedings{Kumar_2023_ICCV,\n \n author = {\n Kumar,\n Amandeep and Bhunia,\n Ankan Kumar and Narayan,\n Sanath and Cholakkal,\n Hisham and Anwer,\n Rao Muhammad and Khan,\n Salman and Yang,\n Ming-Hsuan and Khan,\n Fahad Shahbaz\n},\n title = {\n Generative Multiplane Neural Radiance for 3D-Aware Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7388-7398\n} \n}" }, { "title": "Generative Novel View Synthesis with 3D-Aware Diffusion Models", @@ -24236,14 +25051,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chan_Generative_Novel_View_Synthesis_with_3D-Aware_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;0+1;0;0;1;1;1;0", - "aff_unique_norm": "Stanford University;NVIDIA", - "aff_unique_dep": ";NVIDIA Corporation", + "aff_unique_norm": "Stanford University;NVIDIA Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com", "aff_unique_abbr": "Stanford;NVIDIA", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0+0;0;0;0+0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chan_2023_ICCV,\n \n author = {\n Chan,\n Eric R. and Nagano,\n Koki and Chan,\n Matthew A. 
and Bergman,\n Alexander W. and Park,\n Jeong Joon and Levy,\n Axel and Aittala,\n Miika and De Mello,\n Shalini and Karras,\n Tero and Wetzstein,\n Gordon\n},\n title = {\n Generative Novel View Synthesis with 3D-Aware Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4217-4229\n} \n}" }, { "title": "Generative Prompt Model for Weakly Supervised Object Localization", @@ -24275,7 +25091,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Yuzhong and Ye,\n Qixiang and Wu,\n Weijia and Shen,\n Chunhua and Wan,\n Fang\n},\n title = {\n Generative Prompt Model for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6351-6361\n} \n}" }, { "title": "GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling for Multi-view 3D Understanding", @@ -24307,7 +25124,8 @@ "aff_campus_unique_index": "0+0;0+0;0;0+0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0+0;0+0+0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jihao and Wang,\n Tai and Liu,\n Boxiao and Zhang,\n Qihang and Liu,\n Yu and Li,\n Hongsheng\n},\n title = {\n GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling for Multi-view 3D Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17839-17849\n} \n}" }, { "title": "GeoUDF: Surface Reconstruction from 3D Point Clouds via 
Geometry-guided Distance Representation", @@ -24339,7 +25157,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;1;2", - "aff_country_unique": "China;Singapore;United States" + "aff_country_unique": "China;Singapore;United States", + "bibtex": "@InProceedings{Ren_2023_ICCV,\n \n author = {\n Ren,\n Siyu and Hou,\n Junhui and Chen,\n Xiaodong and He,\n Ying and Wang,\n Wenping\n},\n title = {\n GeoUDF: Surface Reconstruction from 3D Point Clouds via Geometry-guided Distance Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14214-14224\n} \n}" }, { "title": "Geometric Viewpoint Learning with Hyper-Rays and Harmonics Encoding", @@ -24369,7 +25188,8 @@ "aff_unique_url": "https://www.stevens.edu", "aff_unique_abbr": "SIT", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Min_2023_ICCV,\n \n author = {\n Min,\n Zhixiang and Dibene,\n Juan Carlos and Dunn,\n Enrique\n},\n title = {\n Geometric Viewpoint Learning with Hyper-Rays and Harmonics Encoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22520-22530\n} \n}" }, { "title": "Geometrized Transformer for Self-Supervised Homography Estimation", @@ -24401,7 +25221,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jiazhen and Li,\n Xirong\n},\n title = {\n Geometrized Transformer for Self-Supervised Homography Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 9556-9565\n} \n}" }, { "title": "Geometry-guided Feature Learning and Fusion for Indoor Scene Reconstruction", @@ -24433,7 +25254,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Netherlands;" + "aff_country_unique": "Netherlands;", + "bibtex": "@InProceedings{Yin_2023_ICCV,\n \n author = {\n Yin,\n Ruihong and Karaoglu,\n Sezer and Gevers,\n Theo\n},\n title = {\n Geometry-guided Feature Learning and Fusion for Indoor Scene Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3652-3661\n} \n}" }, { "title": "Get the Best of Both Worlds: Improving Accuracy and Transferability by Grassmann Class Representation", @@ -24458,14 +25280,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Get_the_Best_of_Both_Worlds_Improving_Accuracy_and_Transferability_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;1+2", - "aff_unique_norm": "EPFL;SenseTime;Guangdong Provincial Key Laboratory of Digital Grid Technology", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne;SenseTime;Guangdong Provincial Key Laboratory of Digital Grid Technology", "aff_unique_dep": ";SenseTime Research;Digital Grid Technology", "aff_unique_url": "https://www.epfl.ch;https://www.sensetime.com;", "aff_unique_abbr": "EPFL;SenseTime;", "aff_campus_unique_index": "0;;", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;1+1;1+1", - "aff_country_unique": "Switzerland;China" + "aff_country_unique": "Switzerland;China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Haoqi and Li,\n Zhizhong and Zhang,\n Wayne\n},\n title = {\n Get the Best of Both Worlds: Improving Accuracy and Transferability by Grassmann Class Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22478-22487\n} \n}" }, { "title": "Get3DHuman: Lifting StyleGAN-Human into a 3D Generative Model Using Pixel-Aligned Reconstruction Priors", @@ -24489,15 +25312,16 @@ "email": "cuhk.edu.cn; ; ; ; ; ;cuhk.edu.cn", "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiong_Get3DHuman_Lifting_StyleGAN-Human_into_a_3D_Generative_Model_Using_Pixel-Aligned_ICCV_2023_paper.html", - "aff_unique_index": "0;1;2;1;1;0;2", - "aff_unique_norm": "Fudan University;Tencent;Shenzhen University, College of Software Engineering", - "aff_unique_dep": ";Tencent AI Lab;College of Software Engineering", - "aff_unique_url": "https://www.fudan.edu.cn/en/;https://ai.tencent.com;http://sse.cuhkcz.edu.cn/", - "aff_unique_abbr": "Fudan;Tencent AI Lab;SSE", + "aff_unique_index": "0;1;2;3;1;0;2", + "aff_unique_norm": "Fudan University;Tencent;Shenzhen University, College of Software Engineering;Tencent America", + "aff_unique_dep": ";Tencent AI Lab;College of Software Engineering;", + "aff_unique_url": "https://www.fudan.edu.cn/en/;https://ai.tencent.com;http://sse.cuhksz.edu.cn/;https://www.tencent.com/en-us", + "aff_unique_abbr": "Fudan;Tencent AI Lab;SSE;Tencent America", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xiong_2023_ICCV,\n \n author = {\n Xiong,\n Zhangyang and Kang,\n Di and Jin,\n Derong and Chen,\n Weikai and Bao,\n Linchao and Cui,\n Shuguang and Han,\n Xiaoguang\n},\n title = {\n Get3DHuman: Lifting StyleGAN-Human into a 3D Generative Model Using Pixel-Aligned Reconstruction Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
9287-9297\n} \n}" }, { "title": "Global Adaptation Meets Local Generalization: Unsupervised Domain Adaptation for 3D Human Pose Estimation", @@ -24529,7 +25353,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chai_2023_ICCV,\n \n author = {\n Chai,\n Wenhao and Jiang,\n Zhongyu and Hwang,\n Jenq-Neng and Wang,\n Gaoang\n},\n title = {\n Global Adaptation Meets Local Generalization: Unsupervised Domain Adaptation for 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14655-14665\n} \n}" }, { "title": "Global Balanced Experts for Federated Long-Tailed Learning", @@ -24554,14 +25379,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zeng_Global_Balanced_Experts_for_Federated_Long-Tailed_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;3;0", - "aff_unique_norm": "Chinese University of Hong Kong, Shenzhen;Hong Kong University of Science and Technology;JD;Alibaba Group", - "aff_unique_dep": "School of Data Science;;JD Explore Academy;", + "aff_unique_norm": "The Chinese University of Hong Kong, Shenzhen;Hong Kong University of Science and Technology;JD Explore Academy;Alibaba Group", + "aff_unique_dep": "School of Data Science;;;", "aff_unique_url": "https://www.cuhk.edu.cn/en/shenzhen;https://www.ust.hk;;https://www.alibaba.com", "aff_unique_abbr": "CUHK-Shenzhen;HKUST;;Alibaba", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Shenzhen;Guangzhou;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zeng_2023_ICCV,\n \n author = {\n Zeng,\n Yaopei and Liu,\n Lei and Liu,\n Li and Shen,\n Li and Liu,\n Shaoguo and Wu,\n 
Baoyuan\n},\n title = {\n Global Balanced Experts for Federated Long-Tailed Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4815-4825\n} \n}" }, { "title": "Global Features are All You Need for Image Retrieval and Reranking", @@ -24569,8 +25395,8 @@ "status": "Poster", "track": "main", "pid": "9433", - "author_site": "Shihao Shao, Kaifeng Chen, Arjun Karpur, Qinghua Cui, Andr\u00e9 Araujo, Bingyi Cao", - "author": "Shihao Shao; Kaifeng Chen; Arjun Karpur; Qinghua Cui; Andr\u00e9 Araujo; Bingyi Cao", + "author_site": "Shihao Shao, Kaifeng Chen, Arjun Karpur, Qinghua Cui, André Araujo, Bingyi Cao", + "author": "Shihao Shao; Kaifeng Chen; Arjun Karpur; Qinghua Cui; André Araujo; Bingyi Cao", "abstract": "Image retrieval systems conventionally use a two-stage paradigm, leveraging global features for initial retrieval and local features for reranking. However, the scalability of this method is often limited due to the significant storage and computation cost incurred by local feature matching in the reranking stage. In this paper, we present SuperGlobal, a novel approach that exclusively employs global features for both stages, improving efficiency without sacrificing accuracy. SuperGlobal introduces key enhancements to the retrieval system, specifically focusing on the global feature extraction and reranking processes. For extraction, we identify sub-optimal performance when the widely-used ArcFace loss and Generalized Mean (GeM) pooling methods are combined and propose several new modules to improve GeM pooling. In the reranking stage, we introduce a novel method to update the global features of the query and top-ranked images by only considering feature refinement with a small set of images, thus being very compute and memory efficient. 
Our experiments demonstrate substantial improvements compared to the state of the art in standard benchmarks. Notably, on the Revisited Oxford+1M Hard dataset, our single-stage results improve by 7.1%, while our two-stage gain reaches 3.7% with a strong 64,865x speedup. Our two-stage system surpasses the current single-stage state-of-the-art by 16.3%, offering a scalable, accurate alternative for high-performing image retrieval systems with minimal time overhead.\n Code: https://github.com/ShihaoShao-GH/SuperGlobal.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shao_Global_Features_are_All_You_Need_for_Image_Retrieval_and_ICCV_2023_paper.pdf", "aff": "Peking University; Google Research; Google Research; Peking University; Google Research; Google Research", @@ -24593,7 +25419,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Shihao and Chen,\n Kaifeng and Karpur,\n Arjun and Cui,\n Qinghua and Araujo,\n Andr\\'e and Cao,\n Bingyi\n},\n title = {\n Global Features are All You Need for Image Retrieval and Reranking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11036-11046\n} \n}" }, { "title": "Global Knowledge Calibration for Fast Open-Vocabulary Segmentation", @@ -24618,14 +25445,15 @@ "author_num": 11, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_Global_Knowledge_Calibration_for_Fast_Open-Vocabulary_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;4;3;2;2;2;3;0+1;0+1", - "aff_unique_norm": "Beijing Jiao Tong University;Beijing Key Laboratory of Advanced Information Science and Network Technology;Tsinghua University;ByteDance;Nanyang Technological University", + 
"aff_unique_norm": "Beijing Jiaotong University;Beijing Key Laboratory of Advanced Information Science and Network Technology;Tsinghua University;ByteDance;Nanyang Technological University", "aff_unique_dep": "Institute of Information Science;Advanced Information Science and Network Technology;International Graduate School;;", "aff_unique_url": "http://www.bjtu.edu.cn;;https://www.tsinghua.edu.cn;https://www.bytedance.com;https://www.ntu.edu.sg", "aff_unique_abbr": "BJTU;;THU;ByteDance;NTU", - "aff_campus_unique_index": "0;2;2;2;2;0;0", - "aff_campus_unique": "Beijing;;Shenzhen", + "aff_campus_unique_index": ";1;1;1;1;;", + "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0;0;1;0;0;0;0;0;0+0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Kunyang and Liu,\n Yong and Liew,\n Jun Hao and Ding,\n Henghui and Liu,\n Jiajun and Wang,\n Yitong and Tang,\n Yansong and Yang,\n Yujiu and Feng,\n Jiashi and Zhao,\n Yao and Wei,\n Yunchao\n},\n title = {\n Global Knowledge Calibration for Fast Open-Vocabulary Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 797-807\n} \n}" }, { "title": "Global Perception Based Autoregressive Neural Processes", @@ -24633,6 +25461,7 @@ "status": "Poster", "track": "main", "pid": "12421", + "author_site": "Jinyang Tai", "author": "Jinyang Tai", "abstract": "Increasingly, autoregressive approaches are being used to serialize observed variables based on specific criteria. The Neural Processes (NPs) model variable distribution as a continuous function and provide quick solutions for different tasks using a meta-learning framework. This paper proposes an autoregressive-based framework for NPs, based on their autoregressive properties. 
This framework leverages the autoregressive stacking effects of various variables to enhance the representation of the latent distribution, concurrently refining local and global relationships within the positional representation through the use of a sliding window\n mechanism. Autoregression improves function approximations in a stacked fashion, thereby raising the upper bound of the optimization. We have designated this framework as Autoregressive Neural Processes (AENPs) or Conditional Autoregressive Neural Processes (CAENPs). Traditional NP models and their variants aim to capture relationships between the context sample points, without addressing either local or global considerations. Specifically, we capture contextual relationships in the deterministic path and introduce sliding window attention and global attention to reconcile local and global relationships in the context sample points. Autoregressive constraints exist between multiple latent variables in the latent paths, thus building a complex global structure that allows our model to learn complex\n distributions. 
Finally, we demonstrate the effectiveness of the NPs or CFANPs models for 1D data, Bayesian optimization, and 2D data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tai_Global_Perception_Based_Autoregressive_Neural_Processes_ICCV_2023_paper.pdf", @@ -24656,7 +25485,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tai_2023_ICCV,\n \n author = {\n Tai,\n Jinyang\n},\n title = {\n Global Perception Based Autoregressive Neural Processes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10487-10497\n} \n}" }, { "title": "GlobalMapper: Arbitrary-Shaped Urban Layout Generation", @@ -24688,7 +25518,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Liu and Aliaga,\n Daniel\n},\n title = {\n GlobalMapper: Arbitrary-Shaped Urban Layout Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 454-464\n} \n}" }, { "title": "Gloss-Free Sign Language Translation: Improving from Visual-Language Pretraining", @@ -24696,8 +25527,8 @@ "status": "Poster", "track": "main", "pid": "2492", - "author_site": "Benjia Zhou, Zhigang Chen, Albert Clap\u00e9s, Jun Wan, Yanyan Liang, Sergio Escalera, Zhen Lei, Du Zhang", - "author": "Benjia Zhou; Zhigang Chen; Albert Clap\u00e9s; Jun Wan; Yanyan Liang; Sergio Escalera; Zhen Lei; Du Zhang", + "author_site": "Benjia Zhou, Zhigang Chen, Albert Clapés, Jun Wan, Yanyan Liang, Sergio Escalera, Zhen Lei, Du Zhang", + "author": "Benjia Zhou; Zhigang Chen; Albert 
Clapés; Jun Wan; Yanyan Liang; Sergio Escalera; Zhen Lei; Du Zhang", "abstract": "Sign Language Translation (SLT) is a challenging task due to its cross-domain nature, involving the translation of visual-gestural language to text. Many previous methods employ an intermediate representation,i.e., gloss sequences, to facilitate SLT, thus transforming it into a two-stage task of sign language recognition (SLR) followed by sign language translation (SLT). However, the scarcity of gloss-annotated sign language data, combined with the information bottleneck in the mid-level gloss representation, has hindered the further development of the SLT task. To address this challenge, we propose a novel Gloss-Free SLT base on Visual-Language Pretraining (GFSLT-VLP), which improves SLT by inheriting language-oriented prior knowledge from pre-trained models, without any gloss annotation assistance. Our approach involves two stages: (i) integrating Contrastive Language-Image Pre-training (CLIP) with masked self-supervised learning to create pre-tasks that bridge the semantic gap between visual and textual representations and restore masked sentences, and (ii) constructing an end-to-end architecture with an encoder-decoder-like structure that inherits the parameters of the pre-trained Visual Encoder and Text Decoder from the first stage. The seamless combination of these novel designs forms a robust sign language representation and significantly improves gloss-free sign language translation. In particular, we have achieved unprecedented improvements in terms of BLEU-4 score on the PHOENIX14T dataset (>=+5) and the CSL-Daily dataset (>=+3) compared to state-of-the-art gloss-free SLT methods. 
Furthermore, our approach also achieves competitive results on the PHOENIX14T dataset when compared with most of the gloss-based methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhou_Gloss-Free_Sign_Language_Translation_Improving_from_Visual-Language_Pretraining_ICCV_2023_paper.pdf", "aff": "MUST, Macau, China; UCAS, China+MAIS, CASIA, China; Universitat de Barcelona, Spain+Computer Vision Center, Spain; MUST, Macau, China+UCAS, China+MAIS, CASIA, China; MUST, Macau, China; Universitat de Barcelona, Spain+Computer Vision Center, Spain+AAU, Aalborg, Denmark; UCAS, China+MAIS, CASIA, China+CAIR, HKISI, CAS, Hong Kong, China; MUST, Macau, China", @@ -24720,7 +25551,8 @@ "aff_campus_unique_index": ";;;1;", "aff_campus_unique": ";Aalborg", "aff_country_unique_index": "0;0+0;1+1;0+0+0;0;1+1+2;0+0+0;0", - "aff_country_unique": "China;Spain;Denmark" + "aff_country_unique": "China;Spain;Denmark", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Benjia and Chen,\n Zhigang and Clap\\'es,\n Albert and Wan,\n Jun and Liang,\n Yanyan and Escalera,\n Sergio and Lei,\n Zhen and Zhang,\n Du\n},\n title = {\n Gloss-Free Sign Language Translation: Improving from Visual-Language Pretraining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20871-20881\n} \n}" }, { "title": "GlowGAN: Unsupervised Learning of HDR Images from LDR Images in the Wild", @@ -24728,8 +25560,8 @@ "status": "Poster", "track": "main", "pid": "10033", - "author_site": "Chao Wang, Ana Serrano, Xingang Pan, Bin Chen, Karol Myszkowski, Hans-Peter Seidel, Christian Theobalt, Thomas Leimk\u00fchler", - "author": "Chao Wang; Ana Serrano; Xingang Pan; Bin Chen; Karol Myszkowski; Hans-Peter Seidel; Christian Theobalt; Thomas Leimk\u00fchler", + "author_site": "Chao Wang, Ana Serrano, Xingang Pan, Bin Chen, Karol Myszkowski, Hans-Peter Seidel, Christian Theobalt, 
Thomas Leimkühler", + "author": "Chao Wang; Ana Serrano; Xingang Pan; Bin Chen; Karol Myszkowski; Hans-Peter Seidel; Christian Theobalt; Thomas Leimkühler", "abstract": "Most in-the-wild images are stored in Low Dynamic Range (LDR) form, serving as a partial observation of the High Dynamic Range (HDR) visual world. Despite limited dynamic range, these LDR images are often captured with different exposures, implicitly containing information about the underlying HDR image distribution. Inspired by this intuition, in this work we present, to the best of our knowledge, the first method for learning a generative model of HDR images from in-the-wild LDR image collections in a fully unsupervised manner. The key idea is to train a generative adversarial network (GAN) to generate HDR images which, when projected to LDR under various exposures, are indistinguishable from real LDR images. Experiments show that our method GlowGAN can synthesize photorealistic HDR images in many challenging cases such as landscapes, lightning, or windows, where previous supervised generative models produce overexposed images. With the assistance of GlowGAN, we showcase the innovative application of unsupervised inverse tone mapping (GlowGAN-ITM) that sets a new paradigm in this field. Unlike previous methods that gradually complete information from LDR input, GlowGAN-ITM searches the entire HDR image manifold modeled by GlowGAN for the HDR images which can be mapped back to the LDR input. 
GlowGAN-ITM method achieves more realistic reconstruction of overexposed regions compared to state-of-the-art supervised learning models, despite not requiring HDR images or paired multi-exposure images for training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_GlowGAN_Unsupervised_Learning_of_HDR_Images_from_LDR_Images_in_ICCV_2023_paper.pdf", "aff": "MPI Informatik; Universidad de Zaragoza; MPI Informatik+Nanyang Technological University; MPI Informatik; MPI Informatik; MPI Informatik; MPI Informatik; MPI Informatik", @@ -24752,7 +25584,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0+2;0;0;0;0;0", - "aff_country_unique": "Germany;Spain;Singapore" + "aff_country_unique": "Germany;Spain;Singapore", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Chao and Serrano,\n Ana and Pan,\n Xingang and Chen,\n Bin and Myszkowski,\n Karol and Seidel,\n Hans-Peter and Theobalt,\n Christian and Leimk\\"uhler,\n Thomas\n},\n title = {\n GlowGAN: Unsupervised Learning of HDR Images from LDR Images in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10509-10519\n} \n}" }, { "title": "GlueGen: Plug and Play Multi-modal Encoders for X-to-image Generation", @@ -24784,7 +25617,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Qin_2023_ICCV,\n \n author = {\n Qin,\n Can and Yu,\n Ning and Xing,\n Chen and Zhang,\n Shu and Chen,\n Zeyuan and Ermon,\n Stefano and Fu,\n Yun and Xiong,\n Caiming and Xu,\n Ran\n},\n title = {\n GlueGen: Plug and Play Multi-modal Encoders for X-to-image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23085-23096\n} \n}" }, { "title": "GlueStick: Robust Image Matching by Sticking Points and Lines Together", @@ -24792,8 +25626,8 @@ "status": "Poster", "track": "main", "pid": "10776", - "author_site": "R\u00e9mi Pautrat, Iago Su\u00e1rez, Yifan Yu, Marc Pollefeys, Viktor Larsson", - "author": "R\u00e9mi Pautrat; Iago Su\u00e1rez; Yifan Yu; Marc Pollefeys; Viktor Larsson", + "author_site": "Rémi Pautrat, Iago Suárez, Yifan Yu, Marc Pollefeys, Viktor Larsson", + "author": "Rémi Pautrat; Iago Suárez; Yifan Yu; Marc Pollefeys; Viktor Larsson", "abstract": "Line segments are powerful features complementary to points. They offer structural cues, robust to drastic viewpoint and illumination changes, and can be present even in texture-less areas. However, describing and matching them is more challenging compared to points due to partial occlusions, lack of texture, or repetitiveness. This paper introduces a new matching paradigm, where points, lines, and their descriptors are unified into a single wireframe structure. We propose GlueStick, a deep matching Graph Neural Network (GNN) that takes two wireframes from different images and leverages the connectivity information between nodes to better glue them together. In addition to the increased efficiency brought by the joint matching, we also demonstrate a large boost of performance when leveraging the complementary nature of these two features in a single architecture. We show that our matching strategy outperforms the state-of-the-art approaches independently matching line segments and points for a wide variety of datasets and tasks. 
Code is available at https://github.com/cvg/GlueStick.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Pautrat_GlueStick_Robust_Image_Matching_by_Sticking_Points_and_Lines_Together_ICCV_2023_paper.pdf", "aff": ";;;;", @@ -24807,7 +25641,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pautrat_GlueStick_Robust_Image_Matching_by_Sticking_Points_and_Lines_Together_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pautrat_GlueStick_Robust_Image_Matching_by_Sticking_Points_and_Lines_Together_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Pautrat_2023_ICCV,\n \n author = {\n Pautrat,\n R\\'emi and Su\\'arez,\n Iago and Yu,\n Yifan and Pollefeys,\n Marc and Larsson,\n Viktor\n},\n title = {\n GlueStick: Robust Image Matching by Sticking Points and Lines Together\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9706-9716\n} \n}" }, { "title": "Going Beyond Nouns With Vision & Language Models Using Synthetic Data", @@ -24819,7 +25654,7 @@ "author": "Paola Cascante-Bonilla; Khaled Shehada; James Seale Smith; Sivan Doveh; Donghyun Kim; Rameswar Panda; Gul Varol; Aude Oliva; Vicente Ordonez; Rogerio Feris; Leonid Karlinsky", "abstract": "Large-scale pre-trained Vision & Language (VL) models have shown remarkable performance in many applications, enabling replacing a fixed set of supported classes with zero-shot open vocabulary reasoning over (almost arbitrary) natural language prompts. However, recent works have uncovered a fundamental weakness of these models. 
For example, their difficulty to understand Visual Language Concepts (VLC) that go 'beyond nouns' such as the meaning of non-object words (e.g., attributes, actions, relations, states, etc.), or difficulty in performing compositional reasoning such as understanding the significance of the order of the words in a sentence. In this work, we investigate to which extent purely synthetic data could be leveraged to teach these models to overcome such shortcomings without compromising their zero-shot capabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale synthetic dataset and data generation codebase allowing to generate additional suitable data to improve VLC understanding and compositional reasoning of VL models. Additionally, we propose a general VL finetuning strategy for effectively leveraging SyViC towards achieving these improvements. Our extensive experiments and ablations on VL-Checklist, Winoground, and ARO benchmarks demonstrate that it is possible to adapt strong pre-trained VL models with synthetic data significantly enhancing their VLC understanding (e.g. 
by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their zero-shot accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Cascante-Bonilla_Going_Beyond_Nouns_With_Vision__Language_Models_Using_Synthetic_ICCV_2023_paper.pdf", - "aff": "Rice University+MIT-IBM Watson AI Lab; MIT-IBM Watson AI Lab+MIT; MIT-IBM Watson AI Lab+Georgia Institute of Technology; Weizmann Institute of Science+IBM Research; MIT-IBM Watson AI Lab+IBM Research; MIT-IBM Watson AI Lab+IBM Research; LIGM, \u00b4Ecole des Ponts; MIT-IBM Watson AI Lab+MIT; Rice University; MIT-IBM Watson AI Lab+IBM Research; MIT-IBM Watson AI Lab+IBM Research", + "aff": "Rice University+MIT-IBM Watson AI Lab; MIT-IBM Watson AI Lab+MIT; MIT-IBM Watson AI Lab+Georgia Institute of Technology; Weizmann Institute of Science+IBM Research; MIT-IBM Watson AI Lab+IBM Research; MIT-IBM Watson AI Lab+IBM Research; LIGM, ´Ecole des Ponts; MIT-IBM Watson AI Lab+MIT; Rice University; MIT-IBM Watson AI Lab+IBM Research; MIT-IBM Watson AI Lab+IBM Research", "project": "https://synthetic-vic.github.io/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Cascante-Bonilla_Going_Beyond_Nouns_ICCV_2023_supplemental.pdf", @@ -24839,7 +25674,8 @@ "aff_campus_unique_index": ";;;;;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;1+0;0+0;0+0;2;0+0;0;0+0;0+0", - "aff_country_unique": "United States;Israel;France" + "aff_country_unique": "United States;Israel;France", + "bibtex": "@InProceedings{Cascante-Bonilla_2023_ICCV,\n \n author = {\n Cascante-Bonilla,\n Paola and Shehada,\n Khaled and Smith,\n James Seale and Doveh,\n Sivan and Kim,\n Donghyun and Panda,\n Rameswar and Varol,\n Gul and Oliva,\n Aude and Ordonez,\n Vicente and Feris,\n Rogerio and Karlinsky,\n Leonid\n},\n title = {\n Going Beyond Nouns With Vision \\& Language Models Using Synthetic Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20155-20165\n} \n}" }, { "title": "Going Denser with Open-Vocabulary Part Segmentation", @@ -24862,7 +25698,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_Going_Denser_with_Open-Vocabulary_Part_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_Going_Denser_with_Open-Vocabulary_Part_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Peize and Chen,\n Shoufa and Zhu,\n Chenchen and Xiao,\n Fanyi and Luo,\n Ping and Xie,\n Saining and Yan,\n Zhicheng\n},\n title = {\n Going Denser with Open-Vocabulary Part Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15453-15465\n} \n}" }, { "title": "Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models", @@ -24894,7 +25731,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;1;0;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Juncheng and Gao,\n Minghe and Wei,\n Longhui and Tang,\n Siliang and Zhang,\n Wenqiao and Li,\n Mengze and Ji,\n Wei and Tian,\n Qi and Chua,\n Tat-Seng and Zhuang,\n Yueting\n},\n title = {\n Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2551-2562\n} \n}" }, { "title": "Gradient-based Sampling for Class Imbalanced Semi-supervised Object Detection", @@ -24919,14 +25757,15 @@ "author_num": 9, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Li_Gradient-based_Sampling_for_Class_Imbalanced_Semi-supervised_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0+0;1;1;1;1;1;1;1;0+0", - "aff_unique_norm": "Sun Yat-sen University;Baidu", + "aff_unique_norm": "Sun Yat-sen University;Baidu Inc.", "aff_unique_dep": "School of Computer Science and Engineering;Department of Computer Vision Technology (VIS)", "aff_unique_url": "http://www.sysu.edu.cn;https://www.baidu.com", "aff_unique_abbr": "SYSU;Baidu", "aff_campus_unique_index": "0+1;0+1", "aff_campus_unique": "Guangzhou;Shenzhen;", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Jiaming and Lin,\n Xiangru and Zhang,\n Wei and Tan,\n Xiao and Li,\n Yingying and Han,\n Junyu and Ding,\n Errui and Wang,\n Jingdong and Li,\n Guanbin\n},\n title = {\n Gradient-based Sampling for Class Imbalanced Semi-supervised Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16390-16400\n} \n}" }, { "title": "Gram-based Attentive Neural Ordinary Differential Equations Network for Video Nystagmography Classification", @@ -24958,7 +25797,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qiu_2023_ICCV,\n \n author = {\n Qiu,\n Xihe and Shi,\n Shaojie and Tan,\n Xiaoyu and Qu,\n Chao and Fang,\n Zhijun and Wang,\n Hailing and Gao,\n Yongbin and Wu,\n Peixia and Li,\n Huawei\n},\n title = {\n Gram-based Attentive Neural Ordinary Differential Equations Network for Video Nystagmography Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21339-21348\n} \n}" }, { "title": "Gramian Attention Heads are Strong yet Efficient Vision Learners", @@ -24990,7 +25830,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ryu_2023_ICCV,\n \n author = {\n Ryu,\n Jongbin and Han,\n Dongyoon and Lim,\n Jongwoo\n},\n title = {\n Gramian Attention Heads are Strong yet Efficient Vision Learners\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5841-5851\n} \n}" }, { "title": "Graph Matching with Bi-level Noisy Correspondence", @@ -25022,7 +25863,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Yijie and Yang,\n Mouxing and Yu,\n Jun and Hu,\n Peng and Zhang,\n Changqing and Peng,\n Xi\n},\n title = {\n Graph Matching with Bi-level Noisy Correspondence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23362-23371\n} \n}" }, { "title": "GraphAlign: Enhancing Accurate Feature Alignment by Graph matching for Multi-Modal 3D Object Detection", @@ -25047,14 +25889,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Song_GraphAlign_Enhancing_Accurate_Feature_Alignment_by_Graph_matching_for_Multi-Modal_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;0", - "aff_unique_norm": "Beijing Jiao Tong University;Hebei University of Science and Technology;Tsinghua University", + "aff_unique_norm": "Beijing Jiaotong University;Hebei University of Science and Technology;Tsinghua 
University", "aff_unique_dep": "School of Computer and Information Technology;School of Information Science and Engineering;State Key Laboratory of Automotive Safety and Energy", "aff_unique_url": "http://www.bjtu.edu.cn;;https://www.tsinghua.edu.cn", "aff_unique_abbr": "BJTU;;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Ziying and Wei,\n Haiyue and Bai,\n Lin and Yang,\n Lei and Jia,\n Caiyan\n},\n title = {\n GraphAlign: Enhancing Accurate Feature Alignment by Graph matching for Multi-Modal 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3358-3369\n} \n}" }, { "title": "GraphEcho: Graph-Driven Unsupervised Domain Adaptation for Echocardiogram Video Segmentation", @@ -25066,7 +25909,7 @@ "author": "Jiewen Yang; Xinpeng Ding; Ziyang Zheng; Xiaowei Xu; Xiaomeng Li", "abstract": "Echocardiogram video segmentation plays an important role in cardiac disease diagnosis. This paper studies the unsupervised domain adaption (UDA) for echocardiogram video segmentation, where the goal is to generalize the model trained on the source domain to other unlabeled target domains. Existing UDA segmentation methods are not suitable for this task because they do not model local information and the cyclical consistency of heartbeat. In this paper, we introduce a newly collected CardiacUDA dataset and a novel GraphEcho method for cardiac structure segmentation. 
Our GraphEcho comprises two innovative modules, the Spatial-wise Cross-domain Graph Matching (SCGM) and the Temporal Cycle Consistency (TCC) module, which utilize prior knowledge of echocardiogram videos, i.e., consistent cardiac structure across patients and centers and the heartbeat cyclical consistency, respectively. These two modules can better align global and local features from source and target domains, leading to improved UDA segmentation results. Experimental results showed that our GraphEcho outperforms existing state-of-the-art UDA segmentation methods. Our collected dataset and code will be publicly released upon acceptance. This work will lay a new and solid cornerstone for cardiac structure segmentation from echocardiogram videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yang_GraphEcho_Graph-Driven_Unsupervised_Domain_Adaptation_for_Echocardiogram_Video_Segmentation_ICCV_2023_paper.pdf", - "aff": "Hong Kong University of Science and Technology1; Hong Kong University of Science and Technology1; Hong Kong University of Science and Technology1+Guangdong Provincial People\u2019s Hospital, Institute of Cardiovascular Diseases, GuangZhou, China2; Guangdong Provincial People\u2019s Hospital, Institute of Cardiovascular Diseases, GuangZhou, China2; Hong Kong University of Science and Technology1", + "aff": "Hong Kong University of Science and Technology1; Hong Kong University of Science and Technology1; Hong Kong University of Science and Technology1+Guangdong Provincial People’s Hospital, Institute of Cardiovascular Diseases, GuangZhou, China2; Guangdong Provincial People’s Hospital, Institute of Cardiovascular Diseases, GuangZhou, China2; Hong Kong University of Science and Technology1", "project": "", "github": "https://github.com/xmed-lab/GraphEcho", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Yang_GraphEcho_Graph-Driven_Unsupervised_ICCV_2023_supplemental.zip", @@ -25079,14 +25922,15 @@ "author_num": 5, 
"oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_GraphEcho_Graph-Driven_Unsupervised_Domain_Adaptation_for_Echocardiogram_Video_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1;1;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Guangdong Provincial People\u2019s Hospital", + "aff_unique_norm": "Hong Kong University of Science and Technology;Guangdong Provincial People’s Hospital", "aff_unique_dep": ";Institute of Cardiovascular Diseases", "aff_unique_url": "https://www.ust.hk;", "aff_unique_abbr": "HKUST;", "aff_campus_unique_index": "0;0;0+1;1;0", "aff_campus_unique": "Hong Kong SAR;GuangZhou", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Jiewen and Ding,\n Xinpeng and Zheng,\n Ziyang and Xu,\n Xiaowei and Li,\n Xiaomeng\n},\n title = {\n GraphEcho: Graph-Driven Unsupervised Domain Adaptation for Echocardiogram Video Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11878-11887\n} \n}" }, { "title": "Graphics2RAW: Mapping Computer Graphics Images to Sensor RAW Images", @@ -25110,15 +25954,16 @@ "email": "; ; ; ; ; ; ; ", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Seo_Graphics2RAW_Mapping_Computer_Graphics_Images_to_Sensor_RAW_Images_ICCV_2023_paper.html", - "aff_unique_index": "0;0;0;1;0+2;0;0;0", - "aff_unique_norm": "Samsung;Google;York University", - "aff_unique_dep": "Samsung Electronics;Google Research;", - "aff_unique_url": "https://www.samsung.com;https://research.google;https://yorku.ca", - "aff_unique_abbr": "Samsung;Google Research;York U", + "aff_unique_index": "0;1;1;2;1+3;0;0;1", + "aff_unique_norm": "Samsung Electronics;Samsung AI Center;Google;York University", + "aff_unique_dep": ";AI Center;Google 
Research;", + "aff_unique_url": "https://www.samsung.com;https://www.samsung.com/global/innovation/ai-research/;https://research.google;https://yorku.ca", + "aff_unique_abbr": "Samsung;Samsung AI;Google Research;York U", "aff_campus_unique_index": "1;1;2;1+1;1", "aff_campus_unique": ";Toronto;Mountain View", "aff_country_unique_index": "0;1;1;2;1+1;0;0;1", - "aff_country_unique": "South Korea;Canada;United States" + "aff_country_unique": "South Korea;Canada;United States", + "bibtex": "@InProceedings{Seo_2023_ICCV,\n \n author = {\n Seo,\n Donghwan and Punnappurath,\n Abhijith and Zhao,\n Luxi and Abdelhamed,\n Abdelrahman and Tedla,\n Sai Kiran and Park,\n Sanguk and Choe,\n Jihwan and Brown,\n Michael S.\n},\n title = {\n Graphics2RAW: Mapping Computer Graphics Images to Sensor RAW Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12622-12631\n} \n}" }, { "title": "GridMM: Grid Memory Map for Vision-and-Language Navigation", @@ -25150,7 +25995,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Zihan and Li,\n Xiangyang and Yang,\n Jiahao and Liu,\n Yeqi and Jiang,\n Shuqiang\n},\n title = {\n GridMM: Grid Memory Map for Vision-and-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15625-15636\n} \n}" }, { "title": "GridPull: Towards Scalability in Learning Implicit Representations from 3D Point Clouds", @@ -25182,7 +26028,8 @@ "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Beijing;Detroit", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;United States" + 
"aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Chao and Liu,\n Yu-Shen and Han,\n Zhizhong\n},\n title = {\n GridPull: Towards Scalability in Learning Implicit Representations from 3D Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18322-18334\n} \n}" }, { "title": "Grounded Entity-Landmark Adaptive Pre-Training for Vision-and-Language Navigation", @@ -25214,7 +26061,8 @@ "aff_campus_unique_index": ";;;1;;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cui_2023_ICCV,\n \n author = {\n Cui,\n Yibo and Xie,\n Liang and Zhang,\n Yakun and Zhang,\n Meishan and Yan,\n Ye and Yin,\n Erwei\n},\n title = {\n Grounded Entity-Landmark Adaptive Pre-Training for Vision-and-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12043-12053\n} \n}" }, { "title": "Grounded Image Text Matching with Mismatched Relation Reasoning", @@ -25237,7 +26085,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Grounded_Image_Text_Matching_with_Mismatched_Relation_Reasoning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Grounded_Image_Text_Matching_with_Mismatched_Relation_Reasoning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Yu and Wei,\n Yana and Wang,\n Haozhe and Liu,\n Yongfei and Yang,\n Sibei and He,\n Xuming\n},\n title = {\n Grounded Image Text Matching with Mismatched Relation Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2976-2987\n} \n}" }, { "title": "Grounding 3D Object Affordance from 2D Interactions in Images", @@ -25269,7 +26118,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;0;0;0+0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yuhang and Zhai,\n Wei and Luo,\n Hongchen and Cao,\n Yang and Luo,\n Jiebo and Zha,\n Zheng-Jun\n},\n title = {\n Grounding 3D Object Affordance from 2D Interactions in Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10905-10915\n} \n}" }, { "title": "Group DETR: Fast DETR Training with Group-Wise One-to-Many Assignment", @@ -25301,7 +26151,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Qiang and Chen,\n Xiaokang and Wang,\n Jian and Zhang,\n Shan and Yao,\n Kun and Feng,\n Haocheng and Han,\n Junyu and Ding,\n Errui and Zeng,\n Gang and Wang,\n Jingdong\n},\n title = {\n Group DETR: Fast DETR Training with Group-Wise One-to-Many Assignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6633-6642\n} \n}" }, { "title": "Group Pose: A Simple Baseline for End-to-End Multi-Person Pose Estimation", @@ -25326,14 +26177,15 @@ "author_num": 12, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Group_Pose_A_Simple_Baseline_for_End-to-End_Multi-Person_Pose_Estimation_ICCV_2023_paper.html", "aff_unique_index": 
"0+1+2;1;1;1;1;1;0+1+2;1;1;1;0+1+2;1", - "aff_unique_norm": "Beijing Jiao Tong University;Baidu;Beijing Key Laboratory of Advanced Information Science and Network Technology", + "aff_unique_norm": "Beijing Jiaotong University;Baidu;Beijing Key Laboratory of Advanced Information Science and Network Technology", "aff_unique_dep": "Institute of Information Science;Baidu Visualization;Advanced Information Science and Network Technology", "aff_unique_url": "http://www.bjtu.edu.cn;https://www.baidu.com;", "aff_unique_abbr": "BJTU;Baidu;", - "aff_campus_unique_index": "0+0;0+0;0+0", - "aff_campus_unique": "Beijing;", + "aff_campus_unique_index": "1;1;1", + "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0+0;0;0;0;0;0;0+0+0;0;0;0;0+0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Huan and Chen,\n Qiang and Tan,\n Zichang and Liu,\n Jiang-Jiang and Wang,\n Jian and Su,\n Xiangbo and Li,\n Xiaolong and Yao,\n Kun and Han,\n Junyu and Ding,\n Errui and Zhao,\n Yao and Wang,\n Jingdong\n},\n title = {\n Group Pose: A Simple Baseline for End-to-End Multi-Person Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15029-15038\n} \n}" }, { "title": "GrowCLIP: Data-Aware Automatic Model Growing for Large-scale Contrastive Language-Image Pre-Training", @@ -25345,7 +26197,7 @@ "author": "Xinchi Deng; Han Shi; Runhui Huang; Changlin Li; Hang Xu; Jianhua Han; James Kwok; Shen Zhao; Wei Zhang; Xiaodan Liang", "abstract": "Cross-modal pre-training has shown impressive performance on a wide range of downstream tasks, benefiting from massive image-text pairs collected from the Internet. In practice, online data are growing constantly, highlighting the importance of the ability of pre-trained model to learn from data that is continuously growing. 
Existing works on cross-modal pre-training mainly focus on training a network with fixed architecture. However, it is impractical to limit the model capacity when considering the continuously growing nature of pre-training data in real-world applications. On the other hand, it is important to utilize the knowledge in current model to obtain efficient training and better performance. To address the above issues, in this paper, we propose GrowCLIP, a data-driven automatic model growing algorithm for contrastive language-image pre-training with continuous image-text pairs as input. Specially, we adopt a dynamic growth space and seek out the optimal architecture at each growth step to adapt to online learning scenarios. And the shared encoder is proposed in our growth space to enhance the degree of cross-modal fusion. Besides, we explore the effect of growth in different dimensions, which could provide future references for the design of cross-modal model architecture. Finally, we employ parameter inheriting with momentum (PIM) to maintain the previous knowledge and address the issue of local minimum dilemma. Compared with the existing methods, GrowCLIP improve 2.3% average top-1 accuracy on zero-shot image classification of 9 downstream tasks. 
As for zero-shot image retrieval, GrowCLIP can improve 1.2% for top-1 image-to-text recall on Flickr30K dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Deng_GrowCLIP_Data-Aware_Automatic_Model_Growing_for_Large-scale_Contrastive_Language-Image_Pre-Training_ICCV_2023_paper.pdf", - "aff": "Sun Yat-sen University; Huawei Noah\u2019s Ark Lab; University of Technology Sydney; The Hong Kong University of Science and Technology; Sun Yat-sen University; Huawei Noah\u2019s Ark Lab; The Hong Kong University of Science and Technology; Sun Yat-sen University; Huawei Noah\u2019s Ark Lab; Sun Yat-sen University", + "aff": "Sun Yat-sen University; Huawei Noah’s Ark Lab; University of Technology Sydney; The Hong Kong University of Science and Technology; Sun Yat-sen University; Huawei Noah’s Ark Lab; The Hong Kong University of Science and Technology; Sun Yat-sen University; Huawei Noah’s Ark Lab; Sun Yat-sen University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Deng_GrowCLIP_Data-Aware_Automatic_ICCV_2023_supplemental.pdf", @@ -25359,13 +26211,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Deng_GrowCLIP_Data-Aware_Automatic_Model_Growing_for_Large-scale_Contrastive_Language-Image_Pre-Training_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0;1;3;0;1;0", "aff_unique_norm": "Sun Yat-sen University;Huawei;University of Technology Sydney;Hong Kong University of Science and Technology", - "aff_unique_dep": ";Noah\u2019s Ark Lab;;", + "aff_unique_dep": ";Noah’s Ark Lab;;", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.huawei.com;https://www.uts.edu.au;https://www.ust.hk", "aff_unique_abbr": "SYSU;Huawei;UTS;HKUST", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0;0;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author 
= {\n Deng,\n Xinchi and Shi,\n Han and Huang,\n Runhui and Li,\n Changlin and Xu,\n Hang and Han,\n Jianhua and Kwok,\n James and Zhao,\n Shen and Zhang,\n Wei and Liang,\n Xiaodan\n},\n title = {\n GrowCLIP: Data-Aware Automatic Model Growing for Large-scale Contrastive Language-Image Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22178-22189\n} \n}" }, { "title": "Growing a Brain with Sparsity-Inducing Generation for Continual Learning", @@ -25390,14 +26243,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jin_Growing_a_Brain_with_Sparsity-Inducing_Generation_for_Continual_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Chung-Ang University;Samsung", - "aff_unique_dep": "School of Computer Science and Engineering;Samsung Advanced Institute of Technology", + "aff_unique_norm": "Chung-Ang University;Samsung Advanced Institute of Technology", + "aff_unique_dep": "School of Computer Science and Engineering;", "aff_unique_url": "http://www.cau.ac.kr;https://www.sait.samsung.com", "aff_unique_abbr": "CAU;SAIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jin_2023_ICCV,\n \n author = {\n Jin,\n Hyundong and Kim,\n Gyeong-hyeon and Ahn,\n Chanho and Kim,\n Eunwoo\n},\n title = {\n Growing a Brain with Sparsity-Inducing Generation for Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18961-18970\n} \n}" }, { "title": "Guided Motion Diffusion for Controllable Human Motion Synthesis", @@ -25420,7 +26274,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Karunratanakul_Guided_Motion_Diffusion_for_Controllable_Human_Motion_Synthesis_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Karunratanakul_Guided_Motion_Diffusion_for_Controllable_Human_Motion_Synthesis_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Karunratanakul_2023_ICCV,\n \n author = {\n Karunratanakul,\n Korrawe and Preechakul,\n Konpat and Suwajanakorn,\n Supasorn and Tang,\n Siyu\n},\n title = {\n Guided Motion Diffusion for Controllable Human Motion Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2151-2162\n} \n}" }, { "title": "Guiding Image Captioning Models Toward More Specific Captions", @@ -25445,14 +26300,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kornblith_Guiding_Image_Captioning_Models_Toward_More_Specific_Captions_ICCV_2023_paper.html", "aff_unique_index": "0;1;1+2;2", - "aff_unique_norm": "Google;Apple;University of Washington", - "aff_unique_dep": "Google DeepMind;AI/ML;", + "aff_unique_norm": "Google;Apple Inc.;University of Washington", + "aff_unique_dep": "DeepMind;AI/ML;", "aff_unique_url": "https://deepmind.com;https://www.apple.com;https://www.washington.edu", "aff_unique_abbr": "DeepMind;Apple;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1+1;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Kornblith_2023_ICCV,\n \n author = {\n Kornblith,\n Simon and Li,\n Lala and Wang,\n Zirui and Nguyen,\n Thao\n},\n title = {\n Guiding Image Captioning Models Toward More Specific Captions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
15259-15269\n} \n}" }, { "title": "Guiding Local Feature Matching with Surface Curvature", @@ -25477,14 +26333,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Guiding_Local_Feature_Matching_with_Surface_Curvature_ICCV_2023_paper.html", "aff_unique_index": "0;0;1+2;1", - "aff_unique_norm": "Aalto University;ETH Zurich;Microsoft", - "aff_unique_dep": ";;Microsoft Corporation", + "aff_unique_norm": "Aalto University;ETH Zurich;Microsoft Corporation", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.aalto.fi;https://www.ethz.ch;https://www.microsoft.com", "aff_unique_abbr": "Aalto;ETHZ;Microsoft", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;1+2;1", - "aff_country_unique": "Finland;Switzerland;United States" + "aff_country_unique": "Finland;Switzerland;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Shuzhe and Kannala,\n Juho and Pollefeys,\n Marc and Barath,\n Daniel\n},\n title = {\n Guiding Local Feature Matching with Surface Curvature\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17981-17991\n} \n}" }, { "title": "H3WB: Human3.6M 3D WholeBody Dataset and Benchmark", @@ -25496,7 +26353,7 @@ "author": "Yue Zhu; Nermin Samet; David Picard", "abstract": "We present a benchmark for 3D human whole-body pose estimation, which involves identifying accurate 3D keypoints on the entire human body, including face, hands, body, and feet. Currently, the lack of a fully annotated and accurate 3D whole-body dataset results in deep networks being trained separately on specific body parts, which are combined during inference. Or they rely on pseudo-groundtruth provided by parametric body models which are not as accurate as detection based methods. 
To overcome these issues, we introduce the Human3.6M 3D WholeBody (H3WB) dataset, which provides whole-body annotations for the Human3.6M dataset using the COCO Wholebody layout. H3WB comprises 133 whole-body keypoint annotations on 100K images, made possible by our new multi-view pipeline. We also propose three tasks: i) 3D whole-body pose lifting from 2D complete whole-body pose, ii) 3D whole-body pose lifting from 2D incomplete whole-body pose, and iii) 3D whole-body pose estimation from a single RGB image. Additionally, we report several baselines from popular methods for these tasks. Furthermore, we also provide automated 3D whole-body annotations of TotalCapture and experimentally show that when used with H3WB it helps to improve the performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhu_H3WB_Human3.6M_3D_WholeBody_Dataset_and_Benchmark_ICCV_2023_paper.pdf", - "aff": "LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vall\u00e9e, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vall\u00e9e, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vall\u00e9e, France", + "aff": "LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vallée, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vallée, France; LIGM, Ecole des Ponts, Univ Gustave Eiffel, CNRS, Marne-la-Vallée, France", "project": "", "github": "https://github.com/wholebody3d/wholebody3d", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhu_H3WB_Human3.6M_3D_ICCV_2023_supplemental.pdf", @@ -25514,9 +26371,10 @@ "aff_unique_url": "https://www.ponts.fr", "aff_unique_abbr": "ENPC", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "Marne-la-Vall\u00e9e", + "aff_campus_unique": "Marne-la-Vallée", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Yue and Samet,\n Nermin and 
Picard,\n David\n},\n title = {\n H3WB: Human3.6M 3D WholeBody Dataset and Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20166-20177\n} \n}" }, { "title": "HAL3D: Hierarchical Active Learning for Fine-Grained 3D Part Labeling", @@ -25541,14 +26399,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_HAL3D_Hierarchical_Active_Learning_for_Fine-Grained_3D_Part_Labeling_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;0;0+1", - "aff_unique_norm": "Amazon;Simon Fraser University", - "aff_unique_dep": "Amazon.com, Inc.;", + "aff_unique_norm": "Amazon.com, Inc.;Simon Fraser University", + "aff_unique_dep": ";", "aff_unique_url": "https://www.amazon.com;https://www.sfu.ca", "aff_unique_abbr": "Amazon;SFU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;0;0+1", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Fenggen and Qian,\n Yiming and Gil-Ureta,\n Francisca and Jackson,\n Brian and Bennett,\n Eric and Zhang,\n Hao\n},\n title = {\n HAL3D: Hierarchical Active Learning for Fine-Grained 3D Part Labeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 865-875\n} \n}" }, { "title": "HDG-ODE: A Hierarchical Continuous-Time Model for Human Pose Forecasting", @@ -25580,7 +26439,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stony Brook", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xing_2023_ICCV,\n \n author = {\n Xing,\n Yucheng and Wang,\n Xin\n},\n title = {\n HDG-ODE: A Hierarchical Continuous-Time Model for Human Pose 
Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14700-14712\n} \n}" }, { "title": "HM-ViT: Hetero-Modal Vehicle-to-Vehicle Cooperative Perception with Vision Transformer", @@ -25612,7 +26472,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiang_2023_ICCV,\n \n author = {\n Xiang,\n Hao and Xu,\n Runsheng and Ma,\n Jiaqi\n},\n title = {\n HM-ViT: Hetero-Modal Vehicle-to-Vehicle Cooperative Perception with Vision Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 284-295\n} \n}" }, { "title": "HMD-NeMo: Online 3D Avatar Motion Generation From Sparse Observations", @@ -25637,14 +26498,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Aliakbarian_HMD-NeMo_Online_3D_Avatar_Motion_Generation_From_Sparse_Observations_ICCV_2023_paper.html", "aff_unique_index": "0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Mixed Reality & AI Lab", "aff_unique_dep": "Mixed Reality & AI", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Aliakbarian_2023_ICCV,\n \n author = {\n Aliakbarian,\n Sadegh and Saleh,\n Fatemeh and Collier,\n David and Cameron,\n Pashmina and Cosker,\n Darren\n},\n title = {\n HMD-NeMo: Online 3D Avatar Motion Generation From Sparse Observations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9622-9631\n} \n}" }, { "title": "HOSNeRF: Dynamic Human-Object-Scene Neural Radiance Fields from a Single Video", @@ -25676,7 +26538,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;1;0;0;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jia-Wei and Cao,\n Yan-Pei and Yang,\n Tianyuan and Xu,\n Zhongcong and Keppo,\n Jussi and Shan,\n Ying and Qie,\n Xiaohu and Shou,\n Mike Zheng\n},\n title = {\n HOSNeRF: Dynamic Human-Object-Scene Neural Radiance Fields from a Single Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18483-18494\n} \n}" }, { "title": "HRS-Bench: Holistic, Reliable and Scalable Benchmark for Text-to-Image Models", @@ -25708,7 +26571,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2;0+1;0+1;1;0+1", - "aff_country_unique": "Saudi Arabia;United States;Singapore" + "aff_country_unique": "Saudi Arabia;United States;Singapore", + "bibtex": "@InProceedings{Bakr_2023_ICCV,\n \n author = {\n Bakr,\n Eslam Mohamed and Sun,\n Pengzhan and Shen,\n Xiaoqian and Khan,\n Faizan Farooq and Li,\n Li Erran and Elhoseiny,\n Mohamed\n},\n title = {\n HRS-Bench: Holistic,\n Reliable and Scalable Benchmark for Text-to-Image Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20041-20053\n} \n}" }, { "title": "HSE: Hybrid Species Embedding for Deep Metric Learning", @@ -25731,7 +26595,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Yang_HSE_Hybrid_Species_Embedding_for_Deep_Metric_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_HSE_Hybrid_Species_Embedding_for_Deep_Metric_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Bailin and Sun,\n Haoqiang and Li,\n Frederick W. B. and Chen,\n Zheng and Cai,\n Jianlu and Song,\n Chao\n},\n title = {\n HSE: Hybrid Species Embedding for Deep Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11047-11057\n} \n}" }, { "title": "HSR-Diff: Hyperspectral Image Super-Resolution via Conditional Diffusion Models", @@ -25763,7 +26628,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Chanyue and Wang,\n Dong and Bai,\n Yunpeng and Mao,\n Hanyu and Li,\n Ying and Shen,\n Qiang\n},\n title = {\n HSR-Diff: Hyperspectral Image Super-Resolution via Conditional Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7083-7093\n} \n}" }, { "title": "HTML: Hybrid Temporal-scale Multimodal Learning Framework for Referring Video Object Segmentation", @@ -25788,14 +26654,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_HTML_Hybrid_Temporal-scale_Multimodal_Learning_Framework_for_Referring_Video_Object_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;3+2;4;1;5+0;2", - "aff_unique_norm": "ReLER;CSIRO;Shanghai AI Laboratory;Shenzhen Institute of Advanced Technology;Qilu University of Technology;Mohamed bin Zayed University 
of Artificial Intelligence", + "aff_unique_norm": "ReLER;CSIRO;Shanghai AI Laboratory;Shenzhen Institutes of Advanced Technology;Qilu University of Technology;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";Data61;;Provincial Key Laboratory of Computer Vision and Virtual Reality Technology;Shandong Artificial Intelligence;Department of Computer Vision", - "aff_unique_url": ";https://www.csiro.au;https://www.shanghaiailab.com;http://www.siat.ac.cn;https://www.qlu.edu.cn;https://www.mbru.ac.ae", - "aff_unique_abbr": ";CSIRO;SAIL;SIAT;QLU;MBZUAI", + "aff_unique_url": ";https://www.csiro.au;;http://www.siat.ac.cn;https://www.qlu.edu.cn;https://www.mbru.ac.ae", + "aff_unique_abbr": ";CSIRO;;SIAT;QLU;MBZUAI", "aff_campus_unique_index": "1;2+1;;1", "aff_campus_unique": ";Shanghai;Shenzhen", "aff_country_unique_index": "0+1+0;0+0;0;1;2+0;0", - "aff_country_unique": "China;Australia;United Arab Emirates" + "aff_country_unique": "China;Australia;United Arab Emirates", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Mingfei and Wang,\n Yali and Li,\n Zhihui and Yao,\n Lina and Chang,\n Xiaojun and Qiao,\n Yu\n},\n title = {\n HTML: Hybrid Temporal-scale Multimodal Learning Framework for Referring Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13414-13423\n} \n}" }, { "title": "HaMuCo: Hand Pose Estimation via Multiview Collaborative Self-Supervised Learning", @@ -25822,12 +26689,13 @@ "aff_unique_index": "0+1;1;1;0+1;0", "aff_unique_norm": "Beijing University of Posts and Telecommunications;ByteDance", "aff_unique_dep": "State Key Laboratory of Networking and Switching Technology;PICO IDL", - "aff_unique_url": "http://www.bupt.edu.cn;https://www.bytedance.com", + "aff_unique_url": "http://www.bupt.edu.cn/;https://www.bytedance.com", "aff_unique_abbr": "BUPT;ByteDance", - 
"aff_campus_unique_index": "0+0;0;0;0+0;0", - "aff_campus_unique": "Beijing", + "aff_campus_unique_index": "1;1;1;1", + "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Xiaozheng and Wen,\n Chao and Xue,\n Zhou and Ren,\n Pengfei and Wang,\n Jingyu\n},\n title = {\n HaMuCo: Hand Pose Estimation via Multiview Collaborative Self-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20763-20773\n} \n}" }, { "title": "HairCLIPv2: Unifying Hair Editing via Proxy Feature Blending", @@ -25839,7 +26707,7 @@ "author": "Tianyi Wei; Dongdong Chen; Wenbo Zhou; Jing Liao; Weiming Zhang; Gang Hua; Nenghai Yu", "abstract": "Hair editing has made tremendous progress in recent years. Early hair editing methods use well-drawn sketches or masks to specify the editing conditions. Even though they can enable very fine-grained local control, such interaction modes are inefficient for the editing conditions that can be easily specified by language descriptions or reference images. Thanks to the recent breakthrough of cross-modal models (e.g., CLIP), HairCLIP is the first work that enables hair editing based on text descriptions or reference images. However, such text-driven and reference-driven interaction modes make HairCLIP unable to support fine-grained controls specified by sketch or mask. In this paper, we propose HairCLIPv2, aiming to support all the aforementioned interactions with one unified framework. Simultaneously, it improves upon HairCLIP with better irrelevant attributes (e.g., identity, background) preservation and unseen text descriptions support. 
The key idea is to convert all the hair editing tasks into hair transfer tasks, with editing conditions converted into different proxies accordingly. The editing effects are added upon the input image by blending the corresponding proxy features within the hairstyle or hair color feature spaces. Besides the unprecedented user interaction mode support, quantitative and qualitative experiments demonstrate the superiority of HairCLIPv2 in terms of editing effects, irrelevant attribute preservation and visual naturalness. Our code is available at https://github.com/wty-ustc/HairCLIPv2.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wei_HairCLIPv2_Unifying_Hair_Editing_via_Proxy_Feature_Blending_ICCV_2023_paper.pdf", - "aff": "University of Science and Technology of China; Microsoft Cloud AI; University of Science and Technology of China; City University of Hong Kong; University of Science and Technology of China; Xi\u2019an Jiaotong University; University of Science and Technology of China", + "aff": "University of Science and Technology of China; Microsoft Cloud AI; University of Science and Technology of China; City University of Hong Kong; University of Science and Technology of China; Xi’an Jiaotong University; University of Science and Technology of China", "project": "", "github": "https://github.com/wty-ustc/HairCLIPv2", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Wei_HairCLIPv2_Unifying_Hair_ICCV_2023_supplemental.pdf", @@ -25852,14 +26720,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_HairCLIPv2_Unifying_Hair_Editing_via_Proxy_Feature_Blending_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;0;3;0", - "aff_unique_norm": "University of Science and Technology of China;Microsoft;City University of Hong Kong;Xi'an Jiao Tong University", + "aff_unique_norm": "University of Science and Technology of China;Microsoft;City University of Hong Kong;Xi'an Jiaotong University", 
"aff_unique_dep": ";Cloud AI;;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/cloud-ai;https://www.cityu.edu.hk;https://www.xjtu.edu.cn", "aff_unique_abbr": "USTC;Microsoft Cloud AI;CityU;XJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Tianyi and Chen,\n Dongdong and Zhou,\n Wenbo and Liao,\n Jing and Zhang,\n Weiming and Hua,\n Gang and Yu,\n Nenghai\n},\n title = {\n HairCLIPv2: Unifying Hair Editing via Proxy Feature Blending\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23589-23599\n} \n}" }, { "title": "HairNeRF: Geometry-Aware Image Synthesis for Hairstyle Transfer", @@ -25871,7 +26740,7 @@ "author": "Seunggyu Chang; Gihoon Kim; Hayeon Kim", "abstract": "We propose a novel hairstyle transferred image synthesis method considering the underlying head geometry of two input images. In traditional GAN-based methods, transferring hairstyle from one image to the other often makes the synthesized result awkward due to differences in pose, shape, and size of heads. To resolve this, we utilize neural rendering by registering two input heads in the volumetric space to make a transferred hairstyle fit on the head of a target image. Because of the geometric nature of neural rendering, our method can render view varying images of synthesized results from a single transfer process without causing distortion from which extant hairstyle transfer methods built upon traditional GAN-based generators suffer. 
We verify that our method surpasses other baselines in view of preserving the identity and hairstyle of two input images when synthesizing a hairstyle transferred image rendered at any point of view.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chang_HairNeRF_Geometry-Aware_Image_Synthesis_for_Hairstyle_Transfer_ICCV_2023_paper.pdf", - "aff": "NA VER Cloud*; KAIST\u2020; UNIST\u2020", + "aff": "NA VER Cloud*; KAIST†; UNIST†", "project": "", "github": "", "supp": "", @@ -25891,7 +26760,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Chang_2023_ICCV,\n \n author = {\n Chang,\n Seunggyu and Kim,\n Gihoon and Kim,\n Hayeon\n},\n title = {\n HairNeRF: Geometry-Aware Image Synthesis for Hairstyle Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2448-2458\n} \n}" }, { "title": "Hallucination Improves the Performance of Unsupervised Visual Representation Learning", @@ -25914,7 +26784,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Hallucination_Improves_the_Performance_of_Unsupervised_Visual_Representation_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Hallucination_Improves_the_Performance_of_Unsupervised_Visual_Representation_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Jing and Hobbs,\n Jennifer and Hovakimyan,\n Naira\n},\n title = {\n Hallucination Improves the Performance of Unsupervised Visual Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16132-16143\n} \n}" 
}, { "title": "HandR2N2: Iterative 3D Hand Pose Estimation Using a Residual Recurrent Neural Network", @@ -25946,7 +26817,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Wencan and Ko,\n Jong Hwan\n},\n title = {\n HandR2N2: Iterative 3D Hand Pose Estimation Using a Residual Recurrent Neural Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20904-20913\n} \n}" }, { "title": "Handwritten and Printed Text Segmentation: A Signature Case Study", @@ -25978,7 +26850,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Gholamian_2023_ICCV,\n \n author = {\n Gholamian,\n Sina and Vahdat,\n Ali\n},\n title = {\n Handwritten and Printed Text Segmentation: A Signature Case Study\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 582-592\n} \n}" }, { "title": "Hard No-Box Adversarial Attack on Skeleton-Based Human Action Recognition with Skeleton-Motion-Informed Gradient", @@ -25990,7 +26863,7 @@ "author": "Zhengzhi Lu; He Wang; Ziyi Chang; Guoan Yang; Hubert P. H. Shum", "abstract": "Recently, methods for skeleton-based human activity recognition have been shown to be vulnerable to adversarial attacks. However, these attack methods require either the full knowledge of the victim (i.e. white-box attacks), access to training data (i.e. transfer-based attacks) or frequent model queries (i.e. black-box attacks). 
All their requirements are highly restrictive, raising the question of how detrimental the vulnerability is. In this paper, we show that the vulnerability indeed exists. To this end, we consider a new attack task: the attacker has no access to the victim model or the training data or labels, where we coin the term hard no-box attack. Specifically, we first learn a motion manifold where we define an adversarial loss to compute a new gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our gradient contains information of the motion dynamics, which is different from existing gradient-based attack methods that compute the loss gradient assuming each dimension in the data is independent. The SMI gradient can augment many gradient-based attack methods, leading to a new family of no-box attack methods. Extensive evaluation and comparison show that our method imposes a real threat to existing classifiers. They also show that the SMI gradient improves the transferability and imperceptibility of adversarial samples in both no-box and transfer-based black-box settings.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lu_Hard_No-Box_Adversarial_Attack_on_Skeleton-Based_Human_Action_Recognition_with_ICCV_2023_paper.pdf", - "aff": "Durham University, UK+Xi\u2019an Jiaotong University, China; University College London, UK; Durham University, UK; Xi\u2019an Jiaotong University, China; Durham University, UK", + "aff": "Durham University, UK+Xi’an Jiaotong University, China; University College London, UK; Durham University, UK; Xi’an Jiaotong University, China; Durham University, UK", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Lu_Hard_No-Box_Adversarial_ICCV_2023_supplemental.zip", @@ -26003,14 +26876,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lu_Hard_No-Box_Adversarial_Attack_on_Skeleton-Based_Human_Action_Recognition_with_ICCV_2023_paper.html", 
"aff_unique_index": "0+1;2;0;1;0", - "aff_unique_norm": "Durham University;Xi'an Jiao Tong University;University College London", + "aff_unique_norm": "Durham University;Xi'an Jiaotong University;University College London", "aff_unique_dep": ";;", "aff_unique_url": "https://www.dur.ac.uk;http://en.xjtu.edu.cn/;https://www.ucl.ac.uk", "aff_unique_abbr": "Durham;XJTU;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;1;0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Zhengzhi and Wang,\n He and Chang,\n Ziyi and Yang,\n Guoan and Shum,\n Hubert P. H.\n},\n title = {\n Hard No-Box Adversarial Attack on Skeleton-Based Human Action Recognition with Skeleton-Motion-Informed Gradient\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4597-4606\n} \n}" }, { "title": "Harnessing the Spatial-Temporal Attention of Diffusion Models for High-Fidelity Text-to-Image Synthesis", @@ -26042,7 +26916,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Qiucheng and Liu,\n Yujian and Zhao,\n Handong and Bui,\n Trung and Lin,\n Zhe and Zhang,\n Yang and Chang,\n Shiyu\n},\n title = {\n Harnessing the Spatial-Temporal Attention of Diffusion Models for High-Fidelity Text-to-Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7766-7776\n} \n}" }, { "title": "Harvard Glaucoma Detection and Progression: A Multimodal Multitask Dataset and Generalization-Reinforced 
Semi-Supervised Learning", @@ -26074,7 +26949,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Yan and Shi,\n Min and Tian,\n Yu and Elze,\n Tobias and Wang,\n Mengyu\n},\n title = {\n Harvard Glaucoma Detection and Progression: A Multimodal Multitask Dataset and Generalization-Reinforced Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20471-20482\n} \n}" }, { "title": "Hashing Neural Video Decomposition with Multiplicative Residuals in Space-Time", @@ -26097,7 +26973,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chan_Hashing_Neural_Video_Decomposition_with_Multiplicative_Residuals_in_Space-Time_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chan_Hashing_Neural_Video_Decomposition_with_Multiplicative_Residuals_in_Space-Time_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chan_2023_ICCV,\n \n author = {\n Chan,\n Cheng-Hung and Yuan,\n Cheng-Yang and Sun,\n Cheng and Chen,\n Hwann-Tzong\n},\n title = {\n Hashing Neural Video Decomposition with Multiplicative Residuals in Space-Time\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7743-7753\n} \n}" }, { "title": "Helping Hands: An Object-Aware Ego-Centric Video Recognition Model", @@ -26123,13 +27000,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Helping_Hands_An_Object-Aware_Ego-Centric_Video_Recognition_Model_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Oxford;Google", - 
"aff_unique_dep": "VGG;Google DeepMind", + "aff_unique_dep": "VGG;DeepMind", "aff_unique_url": "https://www.ox.ac.uk;https://deepmind.com", "aff_unique_abbr": "Oxford;DeepMind", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Oxford;London", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Chuhan and Gupta,\n Ankush and Zisserman,\n Andrew\n},\n title = {\n Helping Hands: An Object-Aware Ego-Centric Video Recognition Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13901-13912\n} \n}" }, { "title": "Heterogeneous Diversity Driven Active Learning for Multi-Object Tracking", @@ -26154,14 +27032,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Heterogeneous_Diversity_Driven_Active_Learning_for_Multi-Object_Tracking_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;1;0;2+3+4;0", - "aff_unique_norm": "Beijing Jiao Tong University;Singapore University of Technology and Design;Institute of North Electronic Equipment;Pengcheng Laboratory;Intelligent Game and Decision Laboratory", - "aff_unique_dep": ";;;Peng Cheng Laboratory;Intelligent Game and Decision Laboratory", + "aff_unique_norm": "Beijing Jiaotong University;Singapore University of Technology and Design;Institute of North Electronic Equipment;Peng Cheng Laboratory;Intelligent Game and Decision Laboratory", + "aff_unique_dep": ";;;;Intelligent Game and Decision Laboratory", "aff_unique_url": "http://www.njtu.edu.cn/en;https://www.sutd.edu.sg;;http://www.pcl.ac.cn;", "aff_unique_abbr": "BJTU;SUTD;;PCL;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1;0;0+0;0", - "aff_country_unique": "China;Singapore;" + "aff_country_unique": "China;Singapore;", + "bibtex": 
"@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Rui and Zhang,\n Baopeng and Liu,\n Jun and Liu,\n Wei and Zhao,\n Jian and Teng,\n Zhu\n},\n title = {\n Heterogeneous Diversity Driven Active Learning for Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9932-9941\n} \n}" }, { "title": "Heterogeneous Forgetting Compensation for Class-Incremental Learning", @@ -26193,7 +27072,8 @@ "aff_campus_unique_index": "0+0+1;0+0+1;2;0+0+1", "aff_campus_unique": "Shenyang;Beijing;Guangzhou", "aff_country_unique_index": "0+0+0;0+0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Jiahua and Liang,\n Wenqi and Cong,\n Yang and Sun,\n Gan\n},\n title = {\n Heterogeneous Forgetting Compensation for Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11742-11751\n} \n}" }, { "title": "HiFace: High-Fidelity 3D Face Reconstruction by Learning Static and Dynamic Details", @@ -26217,15 +27097,16 @@ "email": "gmail.com;mails.tsinghua.edu.cn;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;microsoft.com;sz.tsinghua.edu.cn;microsoft.com", "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chai_HiFace_High-Fidelity_3D_Face_Reconstruction_by_Learning_Static_and_Dynamic_ICCV_2023_paper.html", - "aff_unique_index": "0+1;1;2;2;2;2+3;2+3;2+3;1;2", - "aff_unique_norm": "National University of Singapore;Tsinghua University;Microsoft;AI", - "aff_unique_dep": ";;Research;", - "aff_unique_url": "https://www.nus.edu.sg;https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia;", - "aff_unique_abbr": "NUS;THU;MSR Asia;", + "aff_unique_index": 
"0+1;1;2;2;3;3+4;3+4;3+4;1;2", + "aff_unique_norm": "National University of Singapore;Tsinghua University;Microsoft Research;Microsoft Corporation;AI", + "aff_unique_dep": ";;Research;Mixed Reality & AI Lab;", + "aff_unique_url": "https://www.nus.edu.sg;https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.microsoft.com;", + "aff_unique_abbr": "NUS;THU;MSR Asia;Microsoft;", "aff_campus_unique_index": ";1;1;;;;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+1;1;1;1;2;2;2;2;1;1", - "aff_country_unique": "Singapore;China;United States;" + "aff_country_unique": "Singapore;China;United States;", + "bibtex": "@InProceedings{Chai_2023_ICCV,\n \n author = {\n Chai,\n Zenghao and Zhang,\n Tianke and He,\n Tianyu and Tan,\n Xu and Baltrusaitis,\n Tadas and Wu,\n HsiangTao and Li,\n Runnan and Zhao,\n Sheng and Yuan,\n Chun and Bian,\n Jiang\n},\n title = {\n HiFace: High-Fidelity 3D Face Reconstruction by Learning Static and Dynamic Details\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9087-9098\n} \n}" }, { "title": "HiLo: Exploiting High Low Frequency Relations for Unbiased Panoptic Scene Graph Generation", @@ -26237,7 +27118,7 @@ "author": "Zijian Zhou; Miaojing Shi; Holger Caesar", "abstract": "Panoptic Scene Graph generation (PSG) is a recently proposed task in image scene understanding that aims to segment the image and extract triplets of subjects, objects and their relations to build a scene graph. This task is particularly challenging for two reasons. First, it suffers from a long-tail problem in its relation categories, making naive biased methods more inclined to high-frequency relations. Existing unbiased methods tackle the long-tail problem by data/loss rebalancing to favor low-frequency relations. Second, a subject-object pair can have two or more semantically overlapping relations. 
While existing methods favor one over the other, our proposed HiLo framework lets different network branches specialize on low and high frequency relations, enforce their consistency and fuse the results. To the best of our knowledge we are the first to propose an explicitly unbiased PSG method. In extensive experiments we show that our HiLo framework achieves state-of-the-art results on the PSG task. We also apply our method to the Scene Graph Generation task that predicts boxes instead of masks and see improvements over all baseline methods. Code is available at https://github.com/franciszzj/HiLo.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhou_HiLo_Exploiting_High_Low_Frequency_Relations_for_Unbiased_Panoptic_Scene_ICCV_2023_paper.pdf", - "aff": "Department of Informatics, King\u2019s College London; College of Electronic and Information Engineering, Tongji University; Intelligent Vehicles Lab, Delft University of Technology", + "aff": "Department of Informatics, King’s College London; College of Electronic and Information Engineering, Tongji University; Intelligent Vehicles Lab, Delft University of Technology", "project": "", "github": "https://github.com/franciszzj/HiLo", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhou_HiLo_Exploiting_High_ICCV_2023_supplemental.pdf", @@ -26250,14 +27131,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_HiLo_Exploiting_High_Low_Frequency_Relations_for_Unbiased_Panoptic_Scene_ICCV_2023_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "King\u2019s College London;Tongji University;Delft University of Technology", + "aff_unique_norm": "King’s College London;Tongji University;Delft University of Technology", "aff_unique_dep": "Department of Informatics;College of Electronic and Information Engineering;Intelligent Vehicles Lab", "aff_unique_url": "https://www.kcl.ac.uk;https://www.tongji.edu.cn;https://www.tudelft.nl", 
"aff_unique_abbr": "KCL;Tongji;TUDelft", "aff_campus_unique_index": "0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1;2", - "aff_country_unique": "United Kingdom;China;Netherlands" + "aff_country_unique": "United Kingdom;China;Netherlands", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Zijian and Shi,\n Miaojing and Caesar,\n Holger\n},\n title = {\n HiLo: Exploiting High Low Frequency Relations for Unbiased Panoptic Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21637-21648\n} \n}" }, { "title": "HiTeA: Hierarchical Temporal-Aware Video-Language Pre-training", @@ -26289,7 +27171,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Qinghao and Xu,\n Guohai and Yan,\n Ming and Xu,\n Haiyang and Qian,\n Qi and Zhang,\n Ji and Huang,\n Fei\n},\n title = {\n HiTeA: Hierarchical Temporal-Aware Video-Language Pre-training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15405-15416\n} \n}" }, { "title": "HiVLP: Hierarchical Interactive Video-Language Pre-Training", @@ -26301,7 +27184,7 @@ "author": "Bin Shao; Jianzhuang Liu; Renjing Pei; Songcen Xu; Peng Dai; Juwei Lu; Weimian Li; Youliang Yan", "abstract": "Video-Language Pre-training (VLP) has become one of the most popular research topics in deep learning. However, compared to image-language pre-training, VLP has lagged far behind due to the lack of large amounts of video-text pairs. 
In this work, we train a VLP model with a hybrid of image-text and video-text pairs, which significantly outperforms pre-training with only the video-text pairs. Besides, existing methods usually model the cross-modal interaction using cross-attention between single-scale visual tokens and textual tokens. These visual features are either of low resolutions lacking fine-grained information, or of high resolutions without high-level semantics. To address the issue, we propose Hierarchical interactive Video-Language Pre-training (HiVLP) that efficiently uses a hierarchical visual feature group for multi-modal cross-attention during pre-training. In the hierarchical framework, low-resolution features are learned with focus on more global high-level semantic information, while high-resolution features carry fine-grained details. As a result, HiVLP has the ability to effectively learn both the global and fine-grained representations to achieve better alignment between video and text inputs. Furthermore, we design a hierarchical multi-scale vision contrastive loss for self-supervised learning to boost the interaction between them. 
Experimental results show that HiVLP establishes new state-of-the-art results in three downstream tasks, text-video retrieval, video-text retrieval, and video captioning.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shao_HiVLP_Hierarchical_Interactive_Video-Language_Pre-Training_ICCV_2023_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Shao_HiVLP_Hierarchical_Interactive_ICCV_2023_supplemental.pdf", @@ -26315,13 +27198,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shao_HiVLP_Hierarchical_Interactive_Video-Language_Pre-Training_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Huawei", - "aff_unique_dep": "Noah\u2019s Ark Lab", + "aff_unique_dep": "Noah’s Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Bin and Liu,\n Jianzhuang and Pei,\n Renjing and Xu,\n Songcen and Dai,\n Peng and Lu,\n Juwei and Li,\n Weimian and Yan,\n Youliang\n},\n title = {\n HiVLP: Hierarchical Interactive Video-Language Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13756-13766\n} \n}" }, { "title": "Hidden Biases of End-to-End Driving 
Models", @@ -26344,7 +27228,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jaeger_Hidden_Biases_of_End-to-End_Driving_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jaeger_Hidden_Biases_of_End-to-End_Driving_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Jaeger_2023_ICCV,\n \n author = {\n Jaeger,\n Bernhard and Chitta,\n Kashyap and Geiger,\n Andreas\n},\n title = {\n Hidden Biases of End-to-End Driving Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8240-8249\n} \n}" }, { "title": "Hiding Visual Information via Obfuscating Adversarial Perturbations", @@ -26376,7 +27261,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Su_2023_ICCV,\n \n author = {\n Su,\n Zhigang and Zhou,\n Dawei and Wang,\n Nannan and Liu,\n Decheng and Wang,\n Zhen and Gao,\n Xinbo\n},\n title = {\n Hiding Visual Information via Obfuscating Adversarial Perturbations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4356-4366\n} \n}" }, { "id": "9133e30ee1", @@ -26404,7 +27290,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Xin and Xu,\n Yifeng and Lu,\n Guangming and Pei,\n Wenjie\n},\n title = {\n Hierarchical Contrastive Learning for Pattern-Generalizable Image Corruption Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12076-12085\n} \n}" }, { "title": "Hierarchical Generation of Human-Object Interactions with Diffusion Probabilistic Models", @@ -26436,7 +27323,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pi_2023_ICCV,\n \n author = {\n Pi,\n Huaijin and Peng,\n Sida and Yang,\n Minghui and Zhou,\n Xiaowei and Bao,\n Hujun\n},\n title = {\n Hierarchical Generation of Human-Object Interactions with Diffusion Probabilistic Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15061-15073\n} \n}" }, { "title": "Hierarchical Point-based Active Learning for Semi-supervised Point Cloud Semantic Segmentation", @@ -26444,6 +27332,7 @@ "status": "Poster", "track": "main", "pid": "11416", + "author_site": "Zongyi Xu, Bo Yuan, Shanshan Zhao, Qianni Zhang, Xinbo Gao", "author": "Zongyi Xu, Bo Yuan, Shanshan Zhao, Qianni Zhang, Xinbo Gao", "abstract": "Impressive performance on point cloud semantic segmentation has been achieved by fully-supervised methods with large amounts of labelled data. As it is labour-intensive to acquire large-scale point cloud data with point-wise labels, many attempts have been made to explore learning 3D point cloud segmentation with limited annotations. Active learning is one of the effective strategies to achieve this purpose but is still under-explored. The most recent methods of this kind measure the uncertainty of each pre-divided region for manual labelling but they suffer from redundant information and require additional efforts for region division. This paper aims at addressing this issue by developing a hierarchical point-based active learning strategy. 
Specifically, we measure the uncertainty for each point by a hierarchical minimum margin uncertainty module which considers the contextual information at multiple levels. Then, a feature-distance suppression strategy is designed to select important and representative points for manual labelling. Besides, to better exploit the unlabelled data, we build a semi-supervised segmentation framework based on our active strategy. Extensive experiments on the S3DIS and ScanNetV2 datasets demonstrate that the proposed framework achieves 96.5% and 100% performance of fully-supervised baseline with only 0.07% and 0.1% training data, respectively, outperforming the state-of-the-art weakly-supervised and active learning methods. The code will be available at https://github.com/SmiletoE/HPAL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Xu_Hierarchical_Point-based_Active_Learning_for_Semi-supervised_Point_Cloud_Semantic_Segmentation_ICCV_2023_paper.pdf", @@ -26455,7 +27344,8 @@ "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=948937000548041306&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Hierarchical_Point-based_Active_Learning_for_Semi-supervised_Point_Cloud_Semantic_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Hierarchical_Point-based_Active_Learning_for_Semi-supervised_Point_Cloud_Semantic_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Zongyi and Yuan,\n Bo and Zhao,\n Shanshan and Zhang,\n Qianni and Gao,\n Xinbo\n},\n title = {\n Hierarchical Point-based Active Learning for Semi-supervised Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18098-18108\n} \n}" }, { "title": "Hierarchical Prior Mining 
for Non-local Multi-View Stereo", @@ -26487,7 +27377,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Ren_2023_ICCV,\n \n author = {\n Ren,\n Chunlin and Xu,\n Qingshan and Zhang,\n Shikun and Yang,\n Jiaqi\n},\n title = {\n Hierarchical Prior Mining for Non-local Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3611-3620\n} \n}" }, { "title": "Hierarchical Spatio-Temporal Representation Learning for Gait Recognition", @@ -26510,7 +27401,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Hierarchical_Spatio-Temporal_Representation_Learning_for_Gait_Recognition_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Hierarchical_Spatio-Temporal_Representation_Learning_for_Gait_Recognition_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Lei and Liu,\n Bo and Liang,\n Fangfang and Wang,\n Bincheng\n},\n title = {\n Hierarchical Spatio-Temporal Representation Learning for Gait Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19639-19649\n} \n}" }, { "title": "Hierarchical Visual Categories Modeling: A Joint Representation Learning and Density Estimation Framework for Out-of-Distribution Detection", @@ -26542,7 +27434,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Jinglun and Zhou,\n Xinyu and Guo,\n Pinxue and 
Sun,\n Yixuan and Huang,\n Yiwen and Ge,\n Weifeng and Zhang,\n Wenqiang\n},\n title = {\n Hierarchical Visual Categories Modeling: A Joint Representation Learning and Density Estimation Framework for Out-of-Distribution Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23425-23435\n} \n}" }, { "title": "Hierarchical Visual Primitive Experts for Compositional Zero-Shot Learning", @@ -26565,7 +27458,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Hierarchical_Visual_Primitive_Experts_for_Compositional_Zero-Shot_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Hierarchical_Visual_Primitive_Experts_for_Compositional_Zero-Shot_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Hanjae and Lee,\n Jiyoung and Park,\n Seongheon and Sohn,\n Kwanghoon\n},\n title = {\n Hierarchical Visual Primitive Experts for Compositional Zero-Shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5675-5685\n} \n}" }, { "title": "Hierarchically Decomposed Graph Convolutional Networks for Skeleton-Based Action Recognition", @@ -26597,7 +27491,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea;" + "aff_country_unique": "South Korea;", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jungho and Lee,\n Minhyeok and Lee,\n Dogyoon and Lee,\n Sangyoun\n},\n title = {\n Hierarchically Decomposed Graph Convolutional Networks for Skeleton-Based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 10444-10453\n} \n}" }, { "title": "High Quality Entity Segmentation", @@ -26620,7 +27515,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qi_High_Quality_Entity_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qi_High_Quality_Entity_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Qi_2023_ICCV,\n \n author = {\n Qi,\n Lu and Kuen,\n Jason and Shen,\n Tiancheng and Gu,\n Jiuxiang and Li,\n Wenbo and Guo,\n Weidong and Jia,\n Jiaya and Lin,\n Zhe and Yang,\n Ming-Hsuan\n},\n title = {\n High Quality Entity Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4047-4056\n} \n}" }, { "title": "High-Resolution Document Shadow Removal via A Large-Scale Real-World Dataset and A Frequency-Aware Shadow Erasing Net", @@ -26652,7 +27548,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Macau SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zinuo and Chen,\n Xuhang and Pun,\n Chi-Man and Cun,\n Xiaodong\n},\n title = {\n High-Resolution Document Shadow Removal via A Large-Scale Real-World Dataset and A Frequency-Aware Shadow Erasing Net\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12449-12458\n} \n}" }, { "title": "Holistic Geometric Feature Learning for Structured Reconstruction", @@ -26684,7 +27581,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Ziqiong and Huan,\n Linxi and Ma,\n Qiyuan and Zheng,\n Xianwei\n},\n title = {\n Holistic Geometric Feature Learning for Structured Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21807-21817\n} \n}" }, { "title": "Holistic Label Correction for Noisy Multi-Label Classification", @@ -26709,14 +27607,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xia_Holistic_Label_Correction_for_Noisy_Multi-Label_Classification_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;3;4+5;0", - "aff_unique_norm": "University of Sydney;Imperial College London;JD;Hong Kong Baptist University;Chinese Academy of Sciences;University of Chinese Academy of Sciences", - "aff_unique_dep": ";;JD Explore Academy;;;", + "aff_unique_norm": "University of Sydney;Imperial College London;JD Explore Academy;Hong Kong Baptist University;Chinese Academy of Sciences;University of Chinese Academy of Sciences", + "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.imperial.ac.uk;;https://www.hkbu.edu.hk;https://www.cas.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "USYD;ICL;;HKBU;CAS;UCAS", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;3;3+3;0", - "aff_country_unique": "Australia;United Kingdom;;China" + "aff_country_unique": "Australia;United Kingdom;;China", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Xiaobo and Deng,\n Jiankang and Bao,\n Wei and Du,\n Yuxuan and Han,\n Bo and Shan,\n Shiguang and Liu,\n Tongliang\n},\n title = {\n Holistic Label Correction for Noisy Multi-Label Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1483-1493\n} 
\n}" }, { "title": "HollowNeRF: Pruning Hashgrid-Based NeRFs with Trainable Collision Mitigation", @@ -26739,7 +27638,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xie_HollowNeRF_Pruning_Hashgrid-Based_NeRFs_with_Trainable_Collision_Mitigation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xie_HollowNeRF_Pruning_Hashgrid-Based_NeRFs_with_Trainable_Collision_Mitigation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Xiufeng and Gherardi,\n Riccardo and Pan,\n Zhihong and Huang,\n Stephen\n},\n title = {\n HollowNeRF: Pruning Hashgrid-Based NeRFs with Trainable Collision Mitigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3480-3490\n} \n}" }, { "title": "HoloAssist: an Egocentric Human Interaction Dataset for Interactive AI Assistants in the Real World", @@ -26762,7 +27662,8 @@ "aff_domain": ";;;;;;;;;;;", "email": ";;;;;;;;;;;", "author_num": 12, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_HoloAssist_an_Egocentric_Human_Interaction_Dataset_for_Interactive_AI_Assistants_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_HoloAssist_an_Egocentric_Human_Interaction_Dataset_for_Interactive_AI_Assistants_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xin and Kwon,\n Taein and Rad,\n Mahdi and Pan,\n Bowen and Chakraborty,\n Ishani and Andrist,\n Sean and Bohus,\n Dan and Feniello,\n Ashley and Tekin,\n Bugra and Frujeri,\n Felipe Vieira and Joshi,\n Neel and Pollefeys,\n Marc\n},\n title = {\n HoloAssist: an Egocentric Human Interaction Dataset for Interactive AI Assistants in the Real World\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20270-20281\n} \n}" }, { "title": "HoloFusion: Towards Photo-realistic 3D Generative Modeling", @@ -26787,14 +27688,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Karnewar_HoloFusion_Towards_Photo-realistic_3D_Generative_Modeling_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;1", - "aff_unique_norm": "University College London;Meta", + "aff_unique_norm": "University College London;Meta Platforms, Inc.", "aff_unique_dep": ";Meta AI", "aff_unique_url": "https://www.ucl.ac.uk;https://meta.com", "aff_unique_abbr": "UCL;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Karnewar_2023_ICCV,\n \n author = {\n Karnewar,\n Animesh and Mitra,\n Niloy J. and Vedaldi,\n Andrea and Novotny,\n David\n},\n title = {\n HoloFusion: Towards Photo-realistic 3D Generative Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22976-22985\n} \n}" }, { "title": "Homeomorphism Alignment for Unsupervised Domain Adaptation", @@ -26826,7 +27728,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Lihua and Ye,\n Mao and Zhu,\n Xiatian and Xiao,\n Siying and Fan,\n Xu-Qian and Neri,\n Ferrante\n},\n title = {\n Homeomorphism Alignment for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18699-18710\n} \n}" }, { "title": 
"Homography Guided Temporal Fusion for Road Line and Marking Segmentation", @@ -26858,7 +27761,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Shan and Nguyen,\n Chuong and Liu,\n Jiawei and Zhang,\n Kaihao and Luo,\n Wenhan and Zhang,\n Yanhao and Muthu,\n Sundaram and Maken,\n Fahira Afzal and Li,\n Hongdong\n},\n title = {\n Homography Guided Temporal Fusion for Road Line and Marking Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1075-1085\n} \n}" }, { "title": "HopFIR: Hop-wise GraphFormer with Intragroup Joint Refinement for 3D Human Pose Estimation", @@ -26885,12 +27789,13 @@ "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "Hefei University of Technology;Tencent;Tsinghua University", "aff_unique_dep": ";Youtu Lab;", - "aff_unique_url": "http://www.hfut.edu.cn/;https://www.tencent.com;https://www.tsinghua.edu.cn", + "aff_unique_url": "http://www.hfut.edu.cn;https://www.tencent.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HUT;Tencent;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Kai and Nie,\n Qiang and Ouyang,\n Bo and Li,\n Xiang and Yang,\n Shanlin\n},\n title = {\n HopFIR: Hop-wise GraphFormer with Intragroup Joint Refinement for 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14985-14995\n} \n}" }, { "title": "Householder Projector for Unsupervised Latent Semantics 
Discovery", @@ -26915,14 +27820,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Song_Householder_Projector_for_Unsupervised_Latent_Semantics_Discovery_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "University of Trento;Beijing Jiao Tong University", + "aff_unique_norm": "University of Trento;Beijing Jiaotong University", "aff_unique_dep": "Department of Information Engineering and Computer Science;", "aff_unique_url": "https://www.unitn.it;http://www.bjtu.edu.cn", "aff_unique_abbr": "UniTN;BJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "Italy;China" + "aff_country_unique": "Italy;China", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Yue and Zhang,\n Jichao and Sebe,\n Nicu and Wang,\n Wei\n},\n title = {\n Householder Projector for Unsupervised Latent Semantics Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7712-7722\n} \n}" }, { "title": "How Far Pre-trained Models Are from Neural Collapse on the Target Dataset Informs their Transferability", @@ -26947,14 +27853,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_How_Far_Pre-trained_Models_Are_from_Neural_Collapse_on_the_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "University of Queensland;Australian National University", + "aff_unique_norm": "The University of Queensland;Australian National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uq.edu.au;https://www.anu.edu.au", "aff_unique_abbr": "UQ;ANU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n 
Zijian and Luo,\n Yadan and Zheng,\n Liang and Huang,\n Zi and Baktashmotlagh,\n Mahsa\n},\n title = {\n How Far Pre-trained Models Are from Neural Collapse on the Target Dataset Informs their Transferability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5549-5558\n} \n}" }, { "title": "How Much Temporal Long-Term Context is Needed for Action Segmentation?", @@ -26977,7 +27884,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bahrami_How_Much_Temporal_Long-Term_Context_is_Needed_for_Action_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bahrami_How_Much_Temporal_Long-Term_Context_is_Needed_for_Action_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Bahrami_2023_ICCV,\n \n author = {\n Bahrami,\n Emad and Francesca,\n Gianpiero and Gall,\n Juergen\n},\n title = {\n How Much Temporal Long-Term Context is Needed for Action Segmentation?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10351-10361\n} \n}" }, { "title": "How to Boost Face Recognition with StyleGAN?", @@ -26985,11 +27893,11 @@ "status": "Poster", "track": "main", "pid": "11018", - "author_site": "Artem Sevastopolskiy, Yury Malkov, Nikita Durasov, Luisa Verdoliva, Matthias Nie\u00dfner", - "author": "Artem Sevastopolskiy; Yury Malkov; Nikita Durasov; Luisa Verdoliva; Matthias Nie\u00dfner", + "author_site": "Artem Sevastopolskiy, Yury Malkov, Nikita Durasov, Luisa Verdoliva, Matthias Nießner", + "author": "Artem Sevastopolskiy; Yury Malkov; Nikita Durasov; Luisa Verdoliva; Matthias Nießner", "abstract": "State-of-the-art face recognition systems require huge amounts of labeled training data. 
Given the priority of privacy in face recognition applications, the data is limited to celebrity web crawls, which have issues such as skewed distributions of ethnicities and limited numbers of identities. On the other hand, the self-supervised revolution in the industry motivates research on adaptation of the related techniques to facial recognition. One of the most popular practical tricks is to augment the dataset by the samples drawn from the high-resolution high-fidelity models (e.g. StyleGAN-like), while preserving the identity. We show that a simple approach based on fine-tuning an encoder for StyleGAN allows to improve upon the state-of-the-art facial recognition and performs better compared to training on synthetic face identities. We also collect large-scale unlabeled datasets with controllable ethnic constitution -- AfricanFaceSet-5M (5 million images of different people) and AsianFaceSet-3M (3 million images of different people) and we show that pretraining on each of them improves recognition of the respective ethnicities (as well as also others), while combining all unlabeled datasets results in the biggest performance increase. Our self-supervised strategy is the most useful with limited amounts of labeled training data, which can be beneficial for more tailored face recognition tasks and when facing privacy concerns. 
Evaluation is provided based on a standard RFW dataset and a new large-scale RB-WebFace benchmark.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sevastopolskiy_How_to_Boost_Face_Recognition_with_StyleGAN_ICCV_2023_paper.pdf", - "aff": "Technical University of Munich, Germany+University Federico II of Naples, Italy; Twitter, US; \u00c9cole polytechnique f\u00e9d\u00e9rale de Lausanne, Switzerland; Technical University of Munich, Germany+University Federico II of Naples, Italy; Technical University of Munich, Germany", + "aff": "Technical University of Munich, Germany+University Federico II of Naples, Italy; Twitter, US; École polytechnique fédérale de Lausanne, Switzerland; Technical University of Munich, Germany+University Federico II of Naples, Italy; Technical University of Munich, Germany", "project": "", "github": "https://github.com/seva100/stylegan-for-facerec", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Sevastopolskiy_How_to_Boost_ICCV_2023_supplemental.zip", @@ -27002,14 +27910,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sevastopolskiy_How_to_Boost_Face_Recognition_with_StyleGAN_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;0+1;0", - "aff_unique_norm": "Technical University of Munich;University Federico II;Twitter;\u00c9cole polytechnique f\u00e9d\u00e9rale de Lausanne", + "aff_unique_norm": "Technical University of Munich;University Federico II;Twitter;École polytechnique fédérale de Lausanne", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tum.de;https://www.unina.it;https://www.twitter.com;https://www.epfl.ch", "aff_unique_abbr": "TUM;UNINA;Twitter;EPFL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Naples", "aff_country_unique_index": "0+1;2;3;0+1;0", - "aff_country_unique": "Germany;Italy;United States;Switzerland" + "aff_country_unique": "Germany;Italy;United States;Switzerland", + "bibtex": "@InProceedings{Sevastopolskiy_2023_ICCV,\n \n author 
= {\n Sevastopolskiy,\n Artem and Malkov,\n Yury and Durasov,\n Nikita and Verdoliva,\n Luisa and Nie{\\ss\n}ner,\n Matthias\n},\n title = {\n How to Boost Face Recognition with StyleGAN?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20924-20934\n} \n}" }, { "title": "How to Choose your Best Allies for a Transferable Attack?", @@ -27041,7 +27950,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Rennes;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "France;United Kingdom" + "aff_country_unique": "France;United Kingdom", + "bibtex": "@InProceedings{Maho_2023_ICCV,\n \n author = {\n Maho,\n Thibault and Moosavi-Dezfooli,\n Seyed-Mohsen and Furon,\n Teddy\n},\n title = {\n How to Choose your Best Allies for a Transferable Attack?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4542-4551\n} \n}" }, { "title": "Human Part-wise 3D Motion Context Learning for Sign Language Recognition", @@ -27073,7 +27983,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Seoul", "aff_country_unique_index": "0;1;0+1", - "aff_country_unique": "China;South Korea" + "aff_country_unique": "China;South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Taeryung and Oh,\n Yeonguk and Lee,\n Kyoung Mu\n},\n title = {\n Human Part-wise 3D Motion Context Learning for Sign Language Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20740-20750\n} \n}" }, { "title": "Human Preference Score: Better Aligning Text-to-Image Models with Human Preference", @@ -27096,7 +28007,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Human_Preference_Score_Better_Aligning_Text-to-Image_Models_with_Human_Preference_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Human_Preference_Score_Better_Aligning_Text-to-Image_Models_with_Human_Preference_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Xiaoshi and Sun,\n Keqiang and Zhu,\n Feng and Zhao,\n Rui and Li,\n Hongsheng\n},\n title = {\n Human Preference Score: Better Aligning Text-to-Image Models with Human Preference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2096-2105\n} \n}" }, { "title": "Human from Blur: Human Pose Tracking from Blurry Images", @@ -27119,7 +28031,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Human_from_Blur_Human_Pose_Tracking_from_Blurry_Images_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Human_from_Blur_Human_Pose_Tracking_from_Blurry_Images_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Yiming and Rozumnyi,\n Denys and Song,\n Jie and Hilliges,\n Otmar and Pollefeys,\n Marc and Oswald,\n Martin R.\n},\n title = {\n Human from Blur: Human Pose Tracking from Blurry Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14905-14915\n} \n}" }, { "title": "Human-Inspired Facial Sketch Synthesis with Dynamic Adaptation", @@ -27151,7 +28064,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Hangzhou;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Fei and 
Zhu,\n Yifan and Jiang,\n Chang and Wang,\n Nannan\n},\n title = {\n Human-Inspired Facial Sketch Synthesis with Dynamic Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7237-7247\n} \n}" }, { "title": "Human-centric Scene Understanding for 3D Large-scale Scenarios", @@ -27176,14 +28090,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Human-centric_Scene_Understanding_for_3D_Large-scale_Scenarios_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;2;3;0;0;0", - "aff_unique_norm": "ShanghaiTech University;University of Hong Kong;Shanghai AI Laboratory;Chinese University of Hong Kong", + "aff_unique_norm": "ShanghaiTech University;The University of Hong Kong;Shanghai AI Laboratory;The Chinese University of Hong Kong", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.hku.hk;https://www.shanghai-ai-lab.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "ShanghaiTech;HKU;SAIL;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yiteng and Cong,\n Peishan and Yao,\n Yichen and Chen,\n Runnan and Hou,\n Yuenan and Zhu,\n Xinge and He,\n Xuming and Yu,\n Jingyi and Ma,\n Yuexin\n},\n title = {\n Human-centric Scene Understanding for 3D Large-scale Scenarios\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20349-20359\n} \n}" }, { "title": "HumanMAC: Masked Motion Completion for Human Motion Prediction", @@ -27215,7 +28130,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2;2", - "aff_country_unique": 
"China;Singapore;Australia" + "aff_country_unique": "China;Singapore;Australia", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ling-Hao and Zhang,\n JiaWei and Li,\n Yewen and Pang,\n Yiren and Xia,\n Xiaobo and Liu,\n Tongliang\n},\n title = {\n HumanMAC: Masked Motion Completion for Human Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9544-9555\n} \n}" }, { "title": "HumanSD: A Native Skeleton-Guided Diffusion Model for Human Image Generation", @@ -27240,14 +28156,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ju_HumanSD_A_Native_Skeleton-Guided_Diffusion_Model_for_Human_Image_Generation_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1;1;0", - "aff_unique_norm": "Chinese University of Hong Kong;International Digital Economy Academy", + "aff_unique_norm": "The Chinese University of Hong Kong;International Digital Economy Academy", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Ju_2023_ICCV,\n \n author = {\n Ju,\n Xuan and Zeng,\n Ailing and Zhao,\n Chenchen and Wang,\n Jianan and Zhang,\n Lei and Xu,\n Qiang\n},\n title = {\n HumanSD: A Native Skeleton-Guided Diffusion Model for Human Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15988-15998\n} \n}" }, { "title": "Humans in 4D: Reconstructing and Tracking Humans with Transformers", @@ -27279,7 +28196,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Goel_2023_ICCV,\n \n author = {\n Goel,\n Shubham and Pavlakos,\n Georgios and Rajasegaran,\n Jathushan and Kanazawa,\n Angjoo and Malik,\n Jitendra\n},\n title = {\n Humans in 4D: Reconstructing and Tracking Humans with Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14783-14794\n} \n}" }, { "title": "Hybrid Spectral Denoising Transformer with Guided Attention", @@ -27291,7 +28209,7 @@ "author": "Zeqiang Lai; Chenggang Yan; Ying Fu", "abstract": "In this paper, we present a Hybrid Spectral Denoising Transformer (HSDT) for hyperspectral image denoising. Challenges in adapting transformer for HSI arise from the capabilities to tackle existing limitations of CNN-based methods in capturing the global and local spatial-spectral correlations while maintaining efficiency and flexibility. To address these issues, we introduce a hybrid approach that combines the advantages of both models with a Spatial-Spectral Separable Convolution (S3Conv), Guided Spectral Self-Attention (GSSA), and Self-Modulated Feed-Forward Network (SM-FFN). Our S3Conv works as a lightweight alternative to 3D convolution, which extracts more spatial-spectral correlated features while keeping the flexibility to tackle HSIs with an arbitrary number of bands. These features are then adaptively processed by GSSA which performs 3D self-attention across the spectral bands, guided by a set of learnable queries that encode the spectral signatures. This not only enriches our model with powerful capabilities for identifying global spectral correlations but also maintains linear complexity. Moreover, our SM-FFN proposes the self-modulation that intensifies the activations of more informative regions, which further strengthens the aggregated features. 
Extensive experiments are conducted on various datasets under both simulated and real-world noise, and it shows that our HSDT significantly outperforms the existing state-of-the-art methods while maintaining low computational overhead. Code is at https://github.com/Zeqiang-Lai/HSDT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lai_Hybrid_Spectral_Denoising_Transformer_with_Guided_Attention_ICCV_2023_paper.pdf", - "aff": "Beijing Institute of Technology; Hangzhou Dianzi University; Beijing Institute of Technology\u2020", + "aff": "Beijing Institute of Technology; Hangzhou Dianzi University; Beijing Institute of Technology†", "project": "", "github": "https://github.com/Zeqiang-Lai/HSDT", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Lai_Hybrid_Spectral_Denoising_ICCV_2023_supplemental.pdf", @@ -27311,7 +28229,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lai_2023_ICCV,\n \n author = {\n Lai,\n Zeqiang and Yan,\n Chenggang and Fu,\n Ying\n},\n title = {\n Hybrid Spectral Denoising Transformer with Guided Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13065-13075\n} \n}" }, { "title": "HybridAugment++: Unified Frequency Spectra Perturbations for Model Robustness", @@ -27343,7 +28262,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "T\u00fcrkiye" + "aff_country_unique": "Turkey", + "bibtex": "@InProceedings{Yucel_2023_ICCV,\n \n author = {\n Yucel,\n Mehmet Kerim and Cinbis,\n Ramazan Gokberk and Duygulu,\n Pinar\n},\n title = {\n HybridAugment++: Unified Frequency Spectra Perturbations for Model Robustness\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5718-5728\n} \n}" }, { "title": "HyperDiffusion: Generating Implicit Neural Fields with Weight-Space Diffusion", @@ -27351,8 +28271,8 @@ "status": "Poster", "track": "main", "pid": "9487", - "author_site": "Ziya Erko\u00e7, Fangchang Ma, Qi Shan, Matthias Nie\u00dfner, Angela Dai", - "author": "Ziya Erko\u00e7; Fangchang Ma; Qi Shan; Matthias Nie\u00dfner; Angela Dai", + "author_site": "Ziya Erkoç, Fangchang Ma, Qi Shan, Matthias Nießner, Angela Dai", + "author": "Ziya Erkoç; Fangchang Ma; Qi Shan; Matthias Nießner; Angela Dai", "abstract": "Implicit neural fields, typically encoded by a multilayer perceptron (MLP) that maps from coordinates (e.g., xyz) to signals (e.g., signed distances), have shown remarkable promise as a high-fidelity and compact representation. However, the lack of a regular and explicit grid structure also makes it challenging to apply generative modeling directly on implicit neural fields in order to synthesize new data. To this end, we propose HyperDiffusion, a novel approach for unconditional generative modeling of implicit neural fields. HyperDiffusion operates directly on MLP weights and generates new neural implicit fields encoded by synthesized MLP parameters. Specifically, a collection of MLPs is first optimized to faithfully represent individual data samples. Subsequently, a diffusion process is trained in this MLP weight space to model the underlying distribution of neural implicit fields. 
HyperDiffusion enables diffusion modeling over a implicit, compact, and yet high-fidelity representation of complex signals across various dimensionalities within one single unified framework.\n Experiments on both 3D shapes and 4D mesh animations demonstrate the effectiveness of our approach with significant improvement over prior work in high-fidelity synthesis.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Erkoc_HyperDiffusion_Generating_Implicit_Neural_Fields_with_Weight-Space_Diffusion_ICCV_2023_paper.pdf", "aff": "Technical University of Munich; Apple; Apple; Technical University of Munich; Technical University of Munich", @@ -27368,14 +28288,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Erkoc_HyperDiffusion_Generating_Implicit_Neural_Fields_with_Weight-Space_Diffusion_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "Technical University of Munich;Apple", - "aff_unique_dep": ";Apple Inc.", + "aff_unique_norm": "Technical University of Munich;Apple Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.apple.com", "aff_unique_abbr": "TUM;Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Erkoc_2023_ICCV,\n \n author = {\n Erko\\c{c\n},\n Ziya and Ma,\n Fangchang and Shan,\n Qi and Nie{\\ss\n}ner,\n Matthias and Dai,\n Angela\n},\n title = {\n HyperDiffusion: Generating Implicit Neural Fields with Weight-Space Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14300-14310\n} \n}" }, { "title": "HyperReenact: One-Shot Reenactment via Jointly Learning to Refine and Retarget Faces", @@ -27407,7 +28328,8 @@ "aff_campus_unique_index": "0;0;0;0;0", 
"aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Bounareli_2023_ICCV,\n \n author = {\n Bounareli,\n Stella and Tzelepis,\n Christos and Argyriou,\n Vasileios and Patras,\n Ioannis and Tzimiropoulos,\n Georgios\n},\n title = {\n HyperReenact: One-Shot Reenactment via Jointly Learning to Refine and Retarget Faces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7149-7159\n} \n}" }, { "title": "Hyperbolic Audio-visual Zero-shot Learning", @@ -27439,7 +28361,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;1;0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Jie and Hayder,\n Zeeshan and Han,\n Junlin and Fang,\n Pengfei and Harandi,\n Mehrtash and Petersson,\n Lars\n},\n title = {\n Hyperbolic Audio-visual Zero-shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7873-7883\n} \n}" }, { "title": "Hyperbolic Chamfer Distance for Point Cloud Completion", @@ -27462,7 +28385,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_Hyperbolic_Chamfer_Distance_for_Point_Cloud_Completion_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_Hyperbolic_Chamfer_Distance_for_Point_Cloud_Completion_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Fangzhou and Yue,\n Yun and Hou,\n Songlin and Yu,\n Xuechu and Xu,\n Yajun and Yamada,\n Kazunori D and Zhang,\n Ziming\n},\n title = {\n Hyperbolic Chamfer 
Distance for Point Cloud Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14595-14606\n} \n}" }, { "title": "I Can't Believe There's No Images! Learning Visual Tasks Using only Language Supervision", @@ -27494,7 +28418,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gu_2023_ICCV,\n \n author = {\n Gu,\n Sophia and Clark,\n Christopher and Kembhavi,\n Aniruddha\n},\n title = {\n I Can't Believe There's No Images! Learning Visual Tasks Using only Language Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2672-2683\n} \n}" }, { "title": "I-ViT: Integer-only Quantization for Efficient Vision Transformer Inference", @@ -27526,7 +28451,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zhikai and Gu,\n Qingyi\n},\n title = {\n I-ViT: Integer-only Quantization for Efficient Vision Transformer Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17065-17075\n} \n}" }, { "title": "ICD-Face: Intra-class Compactness Distillation for Face Recognition", @@ -27558,7 +28484,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Zhipeng and Liu,\n Jiaheng and Qin,\n Haoyu and Wu,\n Yichao and 
Hu,\n Kun and Tian,\n Jiayi and Liang,\n Ding\n},\n title = {\n ICD-Face: Intra-class Compactness Distillation for Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21042-21052\n} \n}" }, { "title": "ICE-NeRF: Interactive Color Editing of NeRFs via Decomposition-Aware Weight Optimization", @@ -27590,7 +28517,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jae-Hyeok and Kim,\n Dae-Shik\n},\n title = {\n ICE-NeRF: Interactive Color Editing of NeRFs via Decomposition-Aware Weight Optimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3491-3501\n} \n}" }, { "title": "ICICLE: Interpretable Class Incremental Continual Learning", @@ -27599,7 +28527,7 @@ "track": "main", "pid": "6303", "author_site": "Dawid Rymarczyk, Joost van de Weijer, Bartosz Zieli?ski, Bartlomiej Twardowski", - "author": "Dawid Rymarczyk; Joost van de Weijer; Bartosz Zieli\u0144ski; Bartlomiej Twardowski", + "author": "Dawid Rymarczyk; Joost van de Weijer; Bartosz Zieliński; Bartlomiej Twardowski", "abstract": "Continual learning enables incremental learning of new tasks without forgetting those previously learned, resulting in positive knowledge transfer that can enhance performance on both new and old tasks. However, continual learning poses new challenges for interpretability, as the rationale behind model predictions may change over time, leading to interpretability concept drift.\n We address this problem by proposing Interpretable Class-InCremental LEarning (ICICLE), an exemplar-free approach that adopts a prototypical part-based approach. 
It consists of three crucial novelties: interpretability regularization that distills previously learned concepts while preserving user-friendly positive reasoning; proximity-based prototype initialization strategy dedicated to the fine-grained setting; and task-recency bias compensation devoted to prototypical parts.\n Our experimental results demonstrate that ICICLE reduces the interpretability concept drift and outperforms the existing exemplar-free methods of common class-incremental learning when applied to concept-based models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Rymarczyk_ICICLE_Interpretable_Class_Incremental_Continual_Learning_ICCV_2023_paper.pdf", "aff": "Faculty of Mathematics and Computer Science, Jagiellonian University + Doctoral School of Exact and Life Sciences, Jagiellonian University + Ardigen SA; Autonomous University of Barcelona + Computer Vision Center; Faculty of Mathematics and Computer Science, Jagiellonian University + Ardigen SA + IDEAS NCBR; Autonomous University of Barcelona + Computer Vision Center + IDEAS NCBR", @@ -27622,7 +28550,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;1+1;0+0+0;1+1+0", - "aff_country_unique": "Poland;Spain" + "aff_country_unique": "Poland;Spain", + "bibtex": "@InProceedings{Rymarczyk_2023_ICCV,\n \n author = {\n Rymarczyk,\n Dawid and van de Weijer,\n Joost and Zieli\\'nski,\n Bartosz and Twardowski,\n Bartlomiej\n},\n title = {\n ICICLE: Interpretable Class Incremental Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1887-1898\n} \n}" }, { "title": "ICL-D3IE: In-Context Learning with Diverse Demonstrations Updating for Document Information Extraction", @@ -27654,7 +28583,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0", - 
"aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Jiabang and Wang,\n Lei and Hu,\n Yi and Liu,\n Ning and Liu,\n Hui and Xu,\n Xing and Shen,\n Heng Tao\n},\n title = {\n ICL-D3IE: In-Context Learning with Diverse Demonstrations Updating for Document Information Extraction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19485-19494\n} \n}" }, { "title": "IDiff-Face: Synthetic-based Face Recognition through Fizzy Identity-Conditioned Diffusion Model", @@ -27686,7 +28616,8 @@ "aff_campus_unique_index": "0;0;0+0;0+0", "aff_campus_unique": "Darmstadt", "aff_country_unique_index": "0;0;0+0;0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Boutros_2023_ICCV,\n \n author = {\n Boutros,\n Fadi and Grebe,\n Jonas Henry and Kuijper,\n Arjan and Damer,\n Naser\n},\n title = {\n IDiff-Face: Synthetic-based Face Recognition through Fizzy Identity-Conditioned Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19650-19661\n} \n}" }, { "title": "IHNet: Iterative Hierarchical Network Guided by High-Resolution Estimated Information for Scene Flow Estimation", @@ -27718,7 +28649,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yun and Chi,\n Cheng and Lin,\n Min and Yang,\n Xin\n},\n title = {\n IHNet: Iterative Hierarchical Network Guided by High-Resolution Estimated Information for Scene Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10073-10082\n} \n}" }, { "title": "IIEU: Rethinking Neural Feature Activation from Decision-Making", @@ -27726,6 +28658,7 @@ "status": "Poster", "track": "main", "pid": "8934", + "author_site": "Sudong Cai", "author": "Sudong Cai", "abstract": "Nonlinear Activation (Act) models which help fit the underlying mappings are critical for neural representation learning. Neuronal behaviors inspire basic Act functions, e.g., Softplus and ReLU. We instead seek improved explainable Act models by re-interpreting neural feature Act from a new philosophical perspective of Multi-Criteria Decision-Making (MCDM). By treating activation models as selective feature re-calibrators that suppress/emphasize features according to their importance scores measured by feature-filter similarities, we propose a set of specific properties of effective Act models with new intuitions. This helps us identify the unexcavated yet critical problem of mismatched feature scoring led by the differentiated norms of the features and filters. We present the Instantaneous Importance Estimation Units (IIEUs), a novel class of interpretable Act models that address the problem by re-calibrating the feature with the Instantaneous Importance (II) score (which we refer to as) estimated with the adaptive norm-decoupled feature-filter similarities, capable of modeling the cross-layer and -channel cues at a low cost. The extensive experiments on various vision benchmarks demonstrate the significant improvements of our IIEUs over the SOTA Act models and validate our interpretation of feature Act. 
By replacing the popular/SOTA Act models with IIEUs, the small ResNet-26s outperform/match the large ResNet-101s on ImageNet with far fewer parameters and computations.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Cai_IIEU_Rethinking_Neural_Feature_Activation_from_Decision-Making_ICCV_2023_paper.pdf", @@ -27749,7 +28682,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Kyoto", "aff_country_unique_index": "0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Sudong\n},\n title = {\n IIEU: Rethinking Neural Feature Activation from Decision-Making\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5796-5806\n} \n}" }, { "title": "INSTA-BNN: Binary Neural Network with INSTAnce-aware Threshold", @@ -27772,7 +28706,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lee_INSTA-BNN_Binary_Neural_Network_with_INSTAnce-aware_Threshold_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lee_INSTA-BNN_Binary_Neural_Network_with_INSTAnce-aware_Threshold_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Changhun and Kim,\n Hyungjun and Park,\n Eunhyeok and Kim,\n Jae-Joon\n},\n title = {\n INSTA-BNN: Binary Neural Network with INSTAnce-aware Threshold\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17325-17334\n} \n}" }, { "title": "INT2: Interactive Trajectory Prediction at Intersections", @@ -27795,7 +28730,8 @@ "aff_domain": ";;;;;;;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;;;;;;", "author_num": 22, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Yan_INT2_Interactive_Trajectory_Prediction_at_Intersections_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yan_INT2_Interactive_Trajectory_Prediction_at_Intersections_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Zhijie and Li,\n Pengfei and Fu,\n Zheng and Xu,\n Shaocong and Shi,\n Yongliang and Chen,\n Xiaoxue and Zheng,\n Yuhang and Li,\n Yang and Liu,\n Tianyu and Li,\n Chuxuan and Luo,\n Nairui and Gao,\n Xu and Chen,\n Yilun and Wang,\n Zuoxu and Shi,\n Yifeng and Huang,\n Pengfei and Han,\n Zhengxiao and Yuan,\n Jirui and Gong,\n Jiangtao and Zhou,\n Guyue and Zhao,\n Hang and Zhao,\n Hao\n},\n title = {\n INT2: Interactive Trajectory Prediction at Intersections\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8536-8547\n} \n}" }, { "title": "IOMatch: Simplifying Open-Set Semi-Supervised Learning with Joint Inliers and Outliers Utilization", @@ -27827,7 +28763,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zekun and Qi,\n Lei and Shi,\n Yinghuan and Gao,\n Yang\n},\n title = {\n IOMatch: Simplifying Open-Set Semi-Supervised Learning with Joint Inliers and Outliers Utilization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15870-15879\n} \n}" }, { "title": "IST-Net: Prior-Free Category-Level Pose Estimation with Implicit Space Transformation", @@ -27852,14 +28789,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Liu_IST-Net_Prior-Free_Category-Level_Pose_Estimation_with_Implicit_Space_Transformation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "University of Hong Kong;Chinese University of Hong Kong;Baidu", - "aff_unique_dep": ";;Baidu, Inc.", + "aff_unique_norm": "The University of Hong Kong;The Chinese University of Hong Kong;Baidu, Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.hku.hk;https://www.cuhk.edu.hk;https://www.baidu.com", "aff_unique_abbr": "HKU;CUHK;Baidu", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jianhui and Chen,\n Yukang and Ye,\n Xiaoqing and Qi,\n Xiaojuan\n},\n title = {\n IST-Net: Prior-Free Category-Level Pose Estimation with Implicit Space Transformation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13978-13988\n} \n}" }, { "title": "ITI-GEN: Inclusive Text-to-Image Generation", @@ -27885,13 +28823,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_ITI-GEN_Inclusive_Text-to-Image_Generation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "CMU;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Cheng and Chen,\n Xuanbai and Chai,\n Siqi and Wu,\n Chen Henry and Lagun,\n Dmitry and 
Beeler,\n Thabo and De la Torre,\n Fernando\n},\n title = {\n ITI-GEN: Inclusive Text-to-Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3969-3980\n} \n}" }, { "title": "Identification of Systematic Errors of Image Classifiers on Rare Subgroups", @@ -27903,7 +28842,7 @@ "author": "Jan Hendrik Metzen; Robin Hutmacher; N. Grace Hua; Valentyn Boreiko; Dan Zhang", "abstract": "Despite excellent average-case performance of many image classifiers, their performance can substantially deteriorate on semantically coherent subgroups of the data that were under-represented in the training data. These systematic errors can impact both fairness for demographic minority groups as well as robustness and safety under domain shift. A major challenge is to identify such subgroups with subpar performance when the subgroups are not annotated and their occurrence is very rare. We leverage recent advances in text-to-image models and search in the space of textual descriptions of subgroups (\"prompts\") for subgroups where the target model has low performance on the prompt-conditioned synthesized data. To tackle the exponentially growing number of subgroups, we employ combinatorial testing. We denote this procedure as PromptAttack as it can be interpreted as an adversarial attack in a prompt space. We study subgroup coverage and identifiability with PromptAttack in a controlled setting and find that it identifies systematic errors with high accuracy. 
Thereupon, we apply PromptAttack to ImageNet classifiers and identify novel systematic errors on rare subgroups.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Metzen_Identification_of_Systematic_Errors_of_Image_Classifiers_on_Rare_Subgroups_ICCV_2023_paper.pdf", - "aff": "Bosch Center for Artificial Intelligence, Robert Bosch GmbH; Bosch Center for Artificial Intelligence, Robert Bosch GmbH; Bosch Center for Artificial Intelligence, Robert Bosch GmbH; Bosch Center for Artificial Intelligence, Robert Bosch GmbH + University of T\u00fcbingen; Bosch Center for Artificial Intelligence, Robert Bosch GmbH", + "aff": "Bosch Center for Artificial Intelligence, Robert Bosch GmbH; Bosch Center for Artificial Intelligence, Robert Bosch GmbH; Bosch Center for Artificial Intelligence, Robert Bosch GmbH; Bosch Center for Artificial Intelligence, Robert Bosch GmbH + University of Tübingen; Bosch Center for Artificial Intelligence, Robert Bosch GmbH", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Metzen_Identification_of_Systematic_ICCV_2023_supplemental.pdf", @@ -27916,14 +28855,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Metzen_Identification_of_Systematic_Errors_of_Image_Classifiers_on_Rare_Subgroups_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1;0", - "aff_unique_norm": "Robert Bosch GmbH;University of T\u00fcbingen", + "aff_unique_norm": "Robert Bosch GmbH;University of Tübingen", "aff_unique_dep": "Bosch Center for Artificial Intelligence;", "aff_unique_url": "https://www.bosch.com;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "Bosch;Uni T\u00fcbingen", + "aff_unique_abbr": "Bosch;Uni Tübingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Metzen_2023_ICCV,\n \n author = {\n Metzen,\n Jan Hendrik and 
Hutmacher,\n Robin and Hua,\n N. Grace and Boreiko,\n Valentyn and Zhang,\n Dan\n},\n title = {\n Identification of Systematic Errors of Image Classifiers on Rare Subgroups\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5064-5073\n} \n}" }, { "title": "Identity-Consistent Aggregation for Video Object Detection", @@ -27955,7 +28895,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Adelaide;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Australia;United Kingdom" + "aff_country_unique": "Australia;United Kingdom", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n Chaorui and Chen,\n Da and Wu,\n Qi\n},\n title = {\n Identity-Consistent Aggregation for Video Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13434-13444\n} \n}" }, { "title": "Identity-Seeking Self-Supervised Representation Learning for Generalizable Person Re-Identification", @@ -27987,7 +28928,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dou_2023_ICCV,\n \n author = {\n Dou,\n Zhaopeng and Wang,\n Zhongdao and Li,\n Yali and Wang,\n Shengjin\n},\n title = {\n Identity-Seeking Self-Supervised Representation Learning for Generalizable Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15847-15858\n} \n}" }, { "title": "ImGeoNet: Image-induced Geometry-aware Voxel Representation for Multi-view 3D Object Detection", @@ -28012,14 +28954,15 @@ "author_num": 8, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Tu_ImGeoNet_Image-induced_Geometry-aware_Voxel_Representation_for_Multi-view_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;3;3;3;0+3", - "aff_unique_norm": "National Tsing Hua University;National Taiwan University;National Yang Ming Chiao Tung University;Amazon", - "aff_unique_dep": ";;;Amazon.com, Inc.", + "aff_unique_norm": "National Tsing Hua University;National Taiwan University;National Yang Ming Chiao Tung University;Amazon.com, Inc.", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nthu.edu.tw;https://www.ntu.edu.tw;https://www.nycu.edu.tw;https://www.amazon.com", "aff_unique_abbr": "NTHU;NTU;NYCU;Amazon", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0;0;1;1;1;1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tu_2023_ICCV,\n \n author = {\n Tu,\n Tao and Chuang,\n Shun-Po and Liu,\n Yu-Lun and Sun,\n Cheng and Zhang,\n Ke and Roy,\n Donna and Kuo,\n Cheng-Hao and Sun,\n Min\n},\n title = {\n ImGeoNet: Image-induced Geometry-aware Voxel Representation for Multi-view 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6996-7007\n} \n}" }, { "title": "Image-Free Classifier Injection for Zero-Shot Classification", @@ -28031,7 +28974,7 @@ "author": "Anders Christensen; Massimiliano Mancini; A. Sophia Koepke; Ole Winther; Zeynep Akata", "abstract": "Zero-shot learning models achieve remarkable results on image classification for samples from classes that were not seen during training. However, such models must be trained from scratch with specialised methods: therefore, access to a training dataset is required when the need for zero-shot classification arises. 
In this paper, we aim to equip pre-trained models with zero-shot classification capabilities without the use of image data. We achieve this with our proposed Image-free Classifier Injection with Semantics (ICIS) that injects classifiers for new, unseen classes into pre-trained classification models in a post-hoc fashion without relying on image data. Instead, the existing classifier weights and simple class-wise descriptors, such as class names or attributes, are used. ICIS has two encoder-decoder networks that learn to reconstruct classifier weights from descriptors (and vice versa), exploiting (cross-)reconstruction and cosine losses to regularise the decoding process. Notably, ICIS can be cheaply trained and applied directly on top of pre-trained classification models. Experiments on benchmark ZSL datasets show that ICIS produces unseen classifier weights that achieve strong (generalised) zero-shot classification performance. Code is available at https://github.com/ExplainableML/ImageFreeZSL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Christensen_Image-Free_Classifier_Injection_for_Zero-Shot_Classification_ICCV_2023_paper.pdf", - "aff": "University of T\u00fcbingen+Technical University of Denmark; University of Trento; University of T\u00fcbingen; Technical University of Denmark+University of Copenhagen+Copenhagen University Hospital+FindZebra; University of T\u00fcbingen+MPI for Intelligent Systems", + "aff": "University of Tübingen+Technical University of Denmark; University of Trento; University of Tübingen; Technical University of Denmark+University of Copenhagen+Copenhagen University Hospital+FindZebra; University of Tübingen+MPI for Intelligent Systems", "project": "", "github": "https://github.com/ExplainableML/ImageFreeZSL", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Christensen_Image-Free_Classifier_Injection_ICCV_2023_supplemental.pdf", @@ -28044,14 +28987,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Christensen_Image-Free_Classifier_Injection_for_Zero-Shot_Classification_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;0;1+3+4+5;0+6", - "aff_unique_norm": "University of T\u00fcbingen;Technical University of Denmark;University of Trento;University of Copenhagen;Copenhagen University Hospital;FindZebra;Max Planck Institute for Intelligent Systems", + "aff_unique_norm": "University of Tübingen;Technical University of Denmark;University of Trento;University of Copenhagen;Copenhagen University Hospital;FindZebra;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";;;;;;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.tek.dk;https://www.unitn.it;https://www.ku.dk;https://www.cuh.dk;;https://www.mpi-is.mpg.de", - "aff_unique_abbr": "Uni T\u00fcbingen;DTU;UniTN;UCPH;;;MPI-IS", + "aff_unique_abbr": "Uni Tübingen;DTU;UniTN;UCPH;;;MPI-IS", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2;0;1+1+1;0+0", - "aff_country_unique": "Germany;Denmark;Italy;" + "aff_country_unique": "Germany;Denmark;Italy;", + "bibtex": "@InProceedings{Christensen_2023_ICCV,\n \n author = {\n Christensen,\n Anders and Mancini,\n Massimiliano and Koepke,\n A. 
Sophia and Winther,\n Ole and Akata,\n Zeynep\n},\n title = {\n Image-Free Classifier Injection for Zero-Shot Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19072-19081\n} \n}" }, { "title": "ImbSAM: A Closer Look at Sharpness-Aware Minimization in Class-Imbalanced Recognition", @@ -28076,14 +29020,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_ImbSAM_A_Closer_Look_at_Sharpness-Aware_Minimization_in_Class-Imbalanced_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;0+1", - "aff_unique_norm": "University of Electronic Science and Technology of China;Pengcheng Laboratory", - "aff_unique_dep": "School of Computer Science and Engineering;Peng Cheng Laboratory", + "aff_unique_norm": "University of Electronic Science and Technology of China;Peng Cheng Laboratory", + "aff_unique_dep": "School of Computer Science and Engineering;", "aff_unique_url": "https://www.uestc.edu.cn;", "aff_unique_abbr": "UESTC;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Yixuan and Qu,\n Yi and Xu,\n Xing and Shen,\n Hengtao\n},\n title = {\n ImbSAM: A Closer Look at Sharpness-Aware Minimization in Class-Imbalanced Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11345-11355\n} \n}" }, { "title": "Imitator: Personalized Speech-driven 3D Facial Animation", @@ -28106,7 +29051,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Thambiraja_Imitator_Personalized_Speech-driven_3D_Facial_Animation_ICCV_2023_paper.html" + 
"oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Thambiraja_Imitator_Personalized_Speech-driven_3D_Facial_Animation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Thambiraja_2023_ICCV,\n \n author = {\n Thambiraja,\n Balamurugan and Habibie,\n Ikhsanul and Aliakbarian,\n Sadegh and Cosker,\n Darren and Theobalt,\n Christian and Thies,\n Justus\n},\n title = {\n Imitator: Personalized Speech-driven 3D Facial Animation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20621-20631\n} \n}" }, { "title": "Implicit Autoencoder for Point-Cloud Self-Supervised Representation Learning", @@ -28138,7 +29084,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Siming and Yang,\n Zhenpei and Li,\n Haoxiang and Song,\n Chen and Guan,\n Li and Kang,\n Hao and Hua,\n Gang and Huang,\n Qixing\n},\n title = {\n Implicit Autoencoder for Point-Cloud Self-Supervised Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14530-14542\n} \n}" }, { "title": "Implicit Identity Representation Conditioned Memory Compensation Network for Talking Head video Generation", @@ -28170,7 +29117,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Fa-Ting and Xu,\n Dan\n},\n title = {\n Implicit Identity Representation Conditioned Memory Compensation Network for Talking Head video Generation\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23062-23072\n} \n}" }, { "title": "Implicit Neural Representation for Cooperative Low-light Image Enhancement", @@ -28195,14 +29143,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Implicit_Neural_Representation_for_Cooperative_Low-light_Image_Enhancement_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;2;0", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;University of Washington", - "aff_unique_dep": ";Peng Cheng Laboratory;", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;University of Washington", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;;https://www.washington.edu", "aff_unique_abbr": "PKU;;UW", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Shuzhou and Ding,\n Moxuan and Wu,\n Yanmin and Li,\n Zihan and Zhang,\n Jian\n},\n title = {\n Implicit Neural Representation for Cooperative Low-light Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12918-12927\n} \n}" }, { "title": "Implicit Temporal Modeling with Learnable Alignment for Video Recognition", @@ -28227,14 +29176,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tu_Implicit_Temporal_Modeling_with_Learnable_Alignment_for_Video_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;0+1;3;2;0+1", - "aff_unique_norm": "Fudan University;Shanghai Collaborative Innovation Center of Intelligent Visual Computing;Microsoft;Carnegie Mellon University", + "aff_unique_norm": "Fudan University;Shanghai 
Collaborative Innovation Center of Intelligent Visual Computing;Microsoft Research;Carnegie Mellon University", "aff_unique_dep": "School of Computer Science;Intelligent Visual Computing;Research;", "aff_unique_url": "https://www.fudan.edu.cn;;https://www.microsoft.com/en-us/research/group/asia;https://www.cmu.edu", "aff_unique_abbr": "Fudan;;MSR Asia;CMU", "aff_campus_unique_index": "0;2;0;2;0", "aff_campus_unique": "Shanghai;;Asia", "aff_country_unique_index": "0+0;0;0+0;1;0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tu_2023_ICCV,\n \n author = {\n Tu,\n Shuyuan and Dai,\n Qi and Wu,\n Zuxuan and Cheng,\n Zhi-Qi and Hu,\n Han and Jiang,\n Yu-Gang\n},\n title = {\n Implicit Temporal Modeling with Learnable Alignment for Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19936-19947\n} \n}" }, { "title": "Improved Knowledge Transfer for Semi-Supervised Domain Adaptation via Trico Training Strategy", @@ -28266,7 +29216,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ngo_2023_ICCV,\n \n author = {\n Ngo,\n Ba Hung and Chae,\n Yeon Jeong and Kwon,\n Jung Eun and Park,\n Jae Hyeon and Cho,\n Sung In\n},\n title = {\n Improved Knowledge Transfer for Semi-Supervised Domain Adaptation via Trico Training Strategy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19214-19223\n} \n}" }, { "title": "Improved Visual Fine-tuning with Natural Language Supervision", @@ -28289,7 +29240,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Improved_Visual_Fine-tuning_with_Natural_Language_Supervision_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Improved_Visual_Fine-tuning_with_Natural_Language_Supervision_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Junyang and Xu,\n Yuanhong and Hu,\n Juhua and Yan,\n Ming and Sang,\n Jitao and Qian,\n Qi\n},\n title = {\n Improved Visual Fine-tuning with Natural Language Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11899-11909\n} \n}" }, { "title": "Improving 3D Imaging with Pre-Trained Perpendicular 2D Diffusion Models", @@ -28321,7 +29273,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Suhyeon and Chung,\n Hyungjin and Park,\n Minyoung and Park,\n Jonghyuk and Ryu,\n Wi-Sun and Ye,\n Jong Chul\n},\n title = {\n Improving 3D Imaging with Pre-Trained Perpendicular 2D Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10710-10720\n} \n}" }, { "title": "Improving Adversarial Robustness of Masked Autoencoders via Test-time Frequency-domain Prompting", @@ -28329,6 +29282,7 @@ "status": "Poster", "track": "main", "pid": "2368", + "author_site": "Qidong Huang, Xiaoyi Dong, Dongdong Chen, Yinpeng Chen, Lu Yuan, Gang Hua, Weiming Zhang, Nenghai Yu", "author": "Qidong Huang, Xiaoyi Dong, Dongdong Chen, Yinpeng Chen, Lu Yuan, Gang Hua, Weiming Zhang, Nenghai Yu", "abstract": "In this paper, we investigate the adversarial robustness of vision 
transformers that are equipped with BERT pretraining (e.g., BEiT, MAE). A surprising observation is that MAE has significantly worse adversarial robustness than other BERT pretraining methods. This observation drives us to rethink the basic differences between these BERT pretraining methods and how these differences affect the robustness against adversarial perturbations. Our empirical analysis reveals that the adversarial robustness of BERT pretraining is highly related to the reconstruction target, i.e., predicting the raw pixels of masked image patches will degrade more adversarial robustness of the model than predicting the semantic context, since it guides the model to concentrate more on medium-/high-frequency components of images. Based on our analysis, we provide a simple yet effective way to boost the adversarial robustness of MAE. The basic idea is using the dataset-extracted domain knowledge to occupy the medium-/high-frequency of images, thus narrowing the optimization space of adversarial perturbations. Specifically, we group the distribution of pretraining data and optimize a set of cluster-specific visual prompts on frequency domain. These prompts are incorporated with input images through prototype-based prompt selection during test period. Extensive evaluation shows that our method clearly boost MAE's adversarial robustness while maintaining its clean performance on ImageNet-1k classification. 
\n Our code is available at: https://github.com/shikiw/RobustMAE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Huang_Improving_Adversarial_Robustness_of_Masked_Autoencoders_via_Test-time_Frequency-domain_Prompting_ICCV_2023_paper.pdf", @@ -28340,7 +29294,8 @@ "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5670224868019731163&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Improving_Adversarial_Robustness_of_Masked_Autoencoders_via_Test-time_Frequency-domain_Prompting_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Improving_Adversarial_Robustness_of_Masked_Autoencoders_via_Test-time_Frequency-domain_Prompting_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Qidong and Dong,\n Xiaoyi and Chen,\n Dongdong and Chen,\n Yinpeng and Yuan,\n Lu and Hua,\n Gang and Zhang,\n Weiming and Yu,\n Nenghai\n},\n title = {\n Improving Adversarial Robustness of Masked Autoencoders via Test-time Frequency-domain Prompting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1600-1610\n} \n}" }, { "title": "Improving CLIP Fine-tuning Performance", @@ -28365,14 +29320,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Improving_CLIP_Fine-tuning_Performance_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;1;1;1;1;1", - "aff_unique_norm": "Tsinghua University;Microsoft;University of Science and Technology of China", + "aff_unique_norm": "Tsinghua University;Microsoft Research;University of Science and Technology of China", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.ustc.edu.cn", "aff_unique_abbr": "THU;MSR Asia;USTC", 
"aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Yixuan and Hu,\n Han and Xie,\n Zhenda and Liu,\n Ze and Zhang,\n Zheng and Cao,\n Yue and Bao,\n Jianmin and Chen,\n Dong and Guo,\n Baining\n},\n title = {\n Improving CLIP Fine-tuning Performance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5439-5449\n} \n}" }, { "title": "Improving Continuous Sign Language Recognition with Cross-Lingual Signs", @@ -28397,14 +29353,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Improving_Continuous_Sign_Language_Recognition_with_Cross-Lingual_Signs_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Microsoft", + "aff_unique_norm": "Microsoft Research", "aff_unique_dep": "Research", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "MSR Asia", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Asia", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Fangyun and Chen,\n Yutong\n},\n title = {\n Improving Continuous Sign Language Recognition with Cross-Lingual Signs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23612-23621\n} \n}" }, { "title": "Improving Diversity in Zero-Shot GAN Adaptation with Semantic Variations", @@ -28427,7 +29384,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Jeon_Improving_Diversity_in_Zero-Shot_GAN_Adaptation_with_Semantic_Variations_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jeon_Improving_Diversity_in_Zero-Shot_GAN_Adaptation_with_Semantic_Variations_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Jeon_2023_ICCV,\n \n author = {\n Jeon,\n Seogkyu and Liu,\n Bei and Lee,\n Pilhyeon and Hong,\n Kibeom and Fu,\n Jianlong and Byun,\n Hyeran\n},\n title = {\n Improving Diversity in Zero-Shot GAN Adaptation with Semantic Variations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7258-7267\n} \n}" }, { "title": "Improving Equivariance in State-of-the-Art Supervised Depth and Normal Predictors", @@ -28452,14 +29410,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhong_Improving_Equivariance_in_State-of-the-Art_Supervised_Depth_and_Normal_Predictors_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhong_2023_ICCV,\n \n author = {\n Zhong,\n Yuanyi and Bhattad,\n Anand and Wang,\n Yu-Xiong and Forsyth,\n David\n},\n title = {\n Improving Equivariance in State-of-the-Art Supervised Depth and Normal Predictors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21775-21785\n} \n}" }, { "title": "Improving Generalization in 
Visual Reinforcement Learning via Conflict-aware Gradient Agreement Augmentation", @@ -28491,7 +29450,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Siao and Chen,\n Zhaoyu and Liu,\n Yang and Wang,\n Yuzheng and Yang,\n Dingkang and Zhao,\n Zhile and Zhou,\n Ziqing and Yi,\n Xie and Li,\n Wei and Zhang,\n Wenqiang and Gan,\n Zhongxue\n},\n title = {\n Improving Generalization in Visual Reinforcement Learning via Conflict-aware Gradient Agreement Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23436-23446\n} \n}" }, { "title": "Improving Generalization of Adversarial Training via Robust Critical Fine-Tuning", @@ -28516,14 +29476,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_Improving_Generalization_of_Adversarial_Training_via_Robust_Critical_Fine-Tuning_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;3;0+1", - "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;City University of Hong Kong;Microsoft", + "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;City University of Hong Kong;Microsoft Corporation", "aff_unique_dep": "School of Artificial Intelligence;Institute of Automation;;Microsoft Research", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ia.cas.cn;https://www.cityu.edu.hk;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UCAS;CAS;CityU;MSR", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;1;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n 
\n author = {\n Zhu,\n Kaijie and Hu,\n Xixu and Wang,\n Jindong and Xie,\n Xing and Yang,\n Ge\n},\n title = {\n Improving Generalization of Adversarial Training via Robust Critical Fine-Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4424-4434\n} \n}" }, { "title": "Improving Lens Flare Removal with General-Purpose Pipeline and Multiple Light Sources Recovery", @@ -28555,7 +29516,8 @@ "aff_campus_unique_index": "0;0;0;0;1;2", "aff_campus_unique": "Nanjing;Shanghai;Tianjin", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Yuyan and Liang,\n Dong and Chen,\n Songcan and Huang,\n Sheng-Jun and Yang,\n Shuo and Li,\n Chongyi\n},\n title = {\n Improving Lens Flare Removal with General-Purpose Pipeline and Multiple Light Sources Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12969-12979\n} \n}" }, { "title": "Improving Online Lane Graph Extraction by Object-Lane Clustering", @@ -28587,7 +29549,8 @@ "aff_campus_unique_index": "0;0;0+1;0+1", "aff_campus_unique": "Zurich;Sofia;", "aff_country_unique_index": "0;0;0+1;0+2+1", - "aff_country_unique": "Switzerland;Bulgaria;Belgium" + "aff_country_unique": "Switzerland;Bulgaria;Belgium", + "bibtex": "@InProceedings{Can_2023_ICCV,\n \n author = {\n Can,\n Yigit Baran and Liniger,\n Alexander and Paudel,\n Danda Pani and Van Gool,\n Luc\n},\n title = {\n Improving Online Lane Graph Extraction by Object-Lane Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8591-8601\n} \n}" }, { "title": "Improving Pixel-based MIM by Reducing 
Wasted Modeling Capability", @@ -28612,14 +29575,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Improving_Pixel-based_MIM_by_Reducing_Wasted_Modeling_Capability_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0;0+2", - "aff_unique_norm": "Shanghai AI Laboratory;Simon Fraser University;Chinese University of Hong Kong", + "aff_unique_norm": "Shanghai AI Laboratory;Simon Fraser University;The Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.sfu.ca;https://www.cuhk.edu.hk", "aff_unique_abbr": "SAIL;SFU;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0;0+0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yuan and Zhang,\n Songyang and Chen,\n Jiacheng and Yu,\n Zhaohui and Chen,\n Kai and Lin,\n Dahua\n},\n title = {\n Improving Pixel-based MIM by Reducing Wasted Modeling Capability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5361-5372\n} \n}" }, { "title": "Improving Representation Learning for Histopathologic Images with Cluster Constraints", @@ -28651,7 +29615,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Weiyi and Gao,\n Chongyang and DiPalma,\n Joseph and Vosoughi,\n Soroush and Hassanpour,\n Saeed\n},\n title = {\n Improving Representation Learning for Histopathologic Images with Cluster Constraints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
21404-21414\n} \n}" }, { "title": "Improving Sample Quality of Diffusion Models Using Self-Attention Guidance", @@ -28683,7 +29648,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Susung and Lee,\n Gyuseong and Jang,\n Wooseok and Kim,\n Seungryong\n},\n title = {\n Improving Sample Quality of Diffusion Models Using Self-Attention Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7462-7471\n} \n}" }, { "title": "Improving Transformer-based Image Matching by Cascaded Capturing Spatially Informative Keypoints", @@ -28715,7 +29681,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Chenjie and Fu,\n Yanwei\n},\n title = {\n Improving Transformer-based Image Matching by Cascaded Capturing Spatially Informative Keypoints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12129-12139\n} \n}" }, { "title": "Improving Unsupervised Visual Program Inference with Code Rewriting Families", @@ -28747,7 +29714,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ganeshan_2023_ICCV,\n \n author = {\n Ganeshan,\n Aditya and Jones,\n R. 
Kenny and Ritchie,\n Daniel\n},\n title = {\n Improving Unsupervised Visual Program Inference with Code Rewriting Families\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15791-15801\n} \n}" }, { "title": "Improving Zero-Shot Generalization for CLIP with Synthesized Prompts", @@ -28755,7 +29723,11 @@ "author": "Zhengbo Wang, Jian Liang, Ran He, Nan Xu, Zilei Wang, Tieniu Tan", "status": "Poster", "track": "main", - "pid": "9372" + "gs_citation": 80, + "pid": "9372", + "bibtex": "@misc{wang2023,\n title={Improving Zero-Shot Generalization for CLIP with Synthesized Prompts},\n author={Zhengbo Wang and Jian Liang and Ran He and Nan Xu and Zilei Wang and Tieniu Tan},\n year={2023},\n eprint={2307.07397v1},\n archivePrefix={arXiv},\n primaryClass={cs.CV},\n url={https://arxiv.org/abs/2307.07397v1}\n}", + "abstract": "With the growing interest in pretrained vision-language models like CLIP,\nrecent research has focused on adapting these models to downstream tasks.\nDespite achieving promising results, most existing methods require labeled data\nfor all classes, which may not hold in real-world applications due to the long\ntail and Zipf's law. For example, some classes may lack labeled data entirely,\nsuch as emerging concepts. To address this problem, we propose a plug-and-play\ngenerative approach called \\textbf{S}ynt\\textbf{H}es\\textbf{I}zed\n\\textbf{P}rompts~(\\textbf{SHIP}) to improve existing fine-tuning methods.\nSpecifically, we follow variational autoencoders to introduce a generator that\nreconstructs the visual features by inputting the synthesized prompts and the\ncorresponding class names to the textual encoder of CLIP. In this manner, we\neasily obtain the synthesized features for the remaining label-only classes.\nThereafter, we fine-tune CLIP with off-the-shelf methods by combining labeled\nand synthesized features. 
Extensive experiments on base-to-new generalization,\ncross-dataset transfer learning, and generalized zero-shot learning demonstrate\nthe superiority of our approach. The code is available at\n\\url{https://github.com/mrflogs/SHIP}.", + "pdf_url": "http://arxiv.org/pdf/2307.07397v1" }, { "title": "In-Style: Bridging Text and Uncurated Videos with Style Transfer for Text-Video Retrieval", @@ -28787,7 +29759,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Frankfurt;", "aff_country_unique_index": "0+0+0;0;0;0+0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Shvetsova_2023_ICCV,\n \n author = {\n Shvetsova,\n Nina and Kukleva,\n Anna and Schiele,\n Bernt and Kuehne,\n Hilde\n},\n title = {\n In-Style: Bridging Text and Uncurated Videos with Style Transfer for Text-Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21981-21992\n} \n}" }, { "title": "Incremental Generalized Category Discovery", @@ -28819,7 +29792,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Bingchen and Mac Aodha,\n Oisin\n},\n title = {\n Incremental Generalized Category Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19137-19147\n} \n}" }, { "title": "Indoor Depth Recovery Based on Deep Unfolding with Non-Local Prior", @@ -28851,7 +29825,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Dai_2023_ICCV,\n \n author = {\n Dai,\n Yuhui and Zhang,\n Junkang and Fang,\n Faming and Zhang,\n Guixu\n},\n title = {\n Indoor Depth Recovery Based on Deep Unfolding with Non-Local Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12355-12364\n} \n}" }, { "title": "Inducing Neural Collapse to a Fixed Hierarchy-Aware Frame for Reducing Mistake Severity", @@ -28874,7 +29849,8 @@ "aff_domain": ";", "email": ";", "author_num": 2, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_Inducing_Neural_Collapse_to_a_Fixed_Hierarchy-Aware_Frame_for_Reducing_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_Inducing_Neural_Collapse_to_a_Fixed_Hierarchy-Aware_Frame_for_Reducing_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Tong and Davis,\n Jim\n},\n title = {\n Inducing Neural Collapse to a Fixed Hierarchy-Aware Frame for Reducing Mistake Severity\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1443-1452\n} \n}" }, { "title": "InfiniCity: Infinite-Scale City Synthesis", @@ -28897,7 +29873,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_InfiniCity_Infinite-Scale_City_Synthesis_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_InfiniCity_Infinite-Scale_City_Synthesis_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Chieh Hubert and Lee,\n Hsin-Ying and Menapace,\n Willi and Chai,\n Menglei and Siarohin,\n Aliaksandr and Yang,\n Ming-Hsuan and Tulyakov,\n Sergey\n},\n title = {\n InfiniCity: Infinite-Scale City Synthesis\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22808-22818\n} \n}" }, { "title": "Informative Data Mining for One-Shot Cross-Domain Semantic Segmentation", @@ -28922,14 +29899,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Informative_Data_Mining_for_One-Shot_Cross-Domain_Semantic_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0+1+2+3;1+2+3;2;4;4;0+1+2+3", - "aff_unique_norm": "Hong Kong Institute of Science and Technology;Chinese Academy of Sciences;University of Chinese Academy of Sciences;State Key Laboratory of Multimodal Artificial Intelligence Systems;Tencent", - "aff_unique_dep": "Centre for Artificial Intelligence and Robotics;Institute of Automation;;;Tencent Holdings Limited", + "aff_unique_norm": "Hong Kong Institute of Science and Technology;Chinese Academy of Sciences;University of Chinese Academy of Sciences;State Key Laboratory of Multimodal Artificial Intelligence Systems;Tencent Holdings Limited", + "aff_unique_dep": "Centre for Artificial Intelligence and Robotics;Institute of Automation;;;", "aff_unique_url": "https://www.hkisi.edu.hk;http://www.ia.cas.cn;http://www.ucas.ac.cn;;https://www.tencent.com", "aff_unique_abbr": "HKISI;CAS;UCAS;;Tencent", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0+0+0;0+0+0;0;0;0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yuxi and Liang,\n Jian and Xiao,\n Jun and Mei,\n Shuqi and Yang,\n Yuran and Zhang,\n Zhaoxiang\n},\n title = {\n Informative Data Mining for One-Shot Cross-Domain Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1064-1074\n} \n}" }, { "title": "Inherent Redundancy in 
Spiking Neural Networks", @@ -28952,7 +29930,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yao_Inherent_Redundancy_in_Spiking_Neural_Networks_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yao_Inherent_Redundancy_in_Spiking_Neural_Networks_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yao_2023_ICCV,\n \n author = {\n Yao,\n Man and Hu,\n Jiakui and Zhao,\n Guangshe and Wang,\n Yaoyuan and Zhang,\n Ziyang and Xu,\n Bo and Li,\n Guoqi\n},\n title = {\n Inherent Redundancy in Spiking Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16924-16934\n} \n}" }, { "title": "Innovating Real Fisheye Image Correction with Dual Diffusion Architecture", @@ -28977,14 +29956,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Innovating_Real_Fisheye_Image_Correction_with_Dual_Diffusion_Architecture_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Beijing Jiao Tong University", + "aff_unique_norm": "Beijing Jiaotong University", "aff_unique_dep": "Institute of Information Science", "aff_unique_url": "http://www.bjtu.edu.cn", "aff_unique_abbr": "BJTU", - "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Beijing", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Shangrong and Lin,\n Chunyu and Liao,\n Kang and Zhao,\n Yao\n},\n title = {\n Innovating Real Fisheye Image Correction with Dual Diffusion Architecture\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
12699-12708\n} \n}" }, { "title": "Inspecting the Geographical Representativeness of Images from Text-to-Image Models", @@ -29016,7 +29996,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Bangalore", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Basu_2023_ICCV,\n \n author = {\n Basu,\n Abhipsa and Babu,\n R. Venkatesh and Pruthi,\n Danish\n},\n title = {\n Inspecting the Geographical Representativeness of Images from Text-to-Image Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5136-5147\n} \n}" }, { "title": "Instance Neural Radiance Field", @@ -29048,7 +30029,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yichen and Hu,\n Benran and Huang,\n Junkai and Tai,\n Yu-Wing and Tang,\n Chi-Keung\n},\n title = {\n Instance Neural Radiance Field\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 787-796\n} \n}" }, { "title": "Instance and Category Supervision are Alternate Learners for Continual Learning", @@ -29075,12 +30057,13 @@ "aff_unique_index": "0+1+0;0+1+0;0+1+0;2;2;3;4;0+0", "aff_unique_norm": "East China Normal University;Shanghai Key Laboratory of Computer Software Testing & Evaluating;Tencent;Xiamen University;Contemporary Amperex Technology Co., Limited", "aff_unique_dep": ";Computer Software Testing & Evaluating;YouTu Lab;;", - "aff_unique_url": "http://www.ecnu.edu.cn;;https://www.tencent.com;https://www.xmu.edu.cn;https://www.catl.com.cn", + "aff_unique_url": 
"http://www.ecnu.edu.cn;;https://www.tencent.com;https://www.xmu.edu.cn;https://www.catlglobal.com", "aff_unique_abbr": "ECNU;;Tencent;XMU;CATL", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Chongqing", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tian_2023_ICCV,\n \n author = {\n Tian,\n Xudong and Zhang,\n Zhizhong and Tan,\n Xin and Liu,\n Jun and Wang,\n Chengjie and Qu,\n Yanyun and Jiang,\n Guannan and Xie,\n Yuan\n},\n title = {\n Instance and Category Supervision are Alternate Learners for Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5596-5605\n} \n}" }, { "title": "Instance-aware Dynamic Prompt Tuning for Pre-trained Point Cloud Models", @@ -29105,14 +30088,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zha_Instance-aware_Dynamic_Prompt_Tuning_for_Pre-trained_Point_Cloud_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0;3", - "aff_unique_norm": "Tsinghua University;Shenzhen University;Harbin Institute of Technology;Pengcheng Laboratory", + "aff_unique_norm": "Tsinghua University;Shenzhen University;Harbin Institute of Technology;Peng Cheng Laboratory", "aff_unique_dep": "International Graduate School;College of Computer Science and Software Engineering;;Research Center of Artificial Intelligence", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.szu.edu.cn;http://en.hhit.edu.cn/;http://www.pcl.ac.cn", "aff_unique_abbr": "THU;SZU;HIT;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zha_2023_ICCV,\n \n author = {\n Zha,\n Yaohua and Wang,\n Jinpeng and Dai,\n Tao and Chen,\n Bin 
and Wang,\n Zhi and Xia,\n Shu-Tao\n},\n title = {\n Instance-aware Dynamic Prompt Tuning for Pre-trained Point Cloud Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14161-14170\n} \n}" }, { "title": "Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions", @@ -29144,7 +30128,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Haque_2023_ICCV,\n \n author = {\n Haque,\n Ayaan and Tancik,\n Matthew and Efros,\n Alexei A. and Holynski,\n Aleksander and Kanazawa,\n Angjoo\n},\n title = {\n Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19740-19750\n} \n}" }, { "title": "Integrally Migrating Pre-trained Transformer Encoder-decoders for Visual Object Detection", @@ -29176,7 +30161,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Feng and Zhang,\n Xiaosong and Peng,\n Zhiliang and Guo,\n Zonghao and Wan,\n Fang and Ji,\n Xiangyang and Ye,\n Qixiang\n},\n title = {\n Integrally Migrating Pre-trained Transformer Encoder-decoders for Visual Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6825-6834\n} \n}" }, { "title": "Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual Tracking and Segmentation", @@ -29201,14 +30187,15 @@ "author_num": 3, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Xu_Integrating_Boxes_and_Masks_A_Multi-Object_Framework_for_Unified_Visual_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0", - "aff_unique_norm": "Zhejiang University;Baidu", - "aff_unique_dep": "ReLER, CCAI;Baidu Research", + "aff_unique_norm": "Zhejiang University;Baidu Research", + "aff_unique_dep": "ReLER, CCAI;", "aff_unique_url": "http://www.zju.edu.cn;https://research.baidu.com", "aff_unique_abbr": "ZJU;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yuanyou and Yang,\n Zongxin and Yang,\n Yi\n},\n title = {\n Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual Tracking and Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9738-9751\n} \n}" }, { "title": "IntentQA: Context-aware Video Intent Reasoning", @@ -29220,7 +30207,7 @@ "author": "Jiapeng Li; Ping Wei; Wenjuan Han; Lifeng Fan", "abstract": "In this paper, we propose a novel task IntentQA, a special VideoQA task focusing on video intent reasoning, which has become increasingly important for AI with its advantages in equipping AI agents with the capability of reasoning beyond mere recognition in daily tasks. We also contribute a large-scale VideoQA dataset for this task. We propose a Context-aware Video Intent Reasoning model (CaVIR) consisting of i) Video Query Language (VQL) for better cross-modal representation of the situational context, ii) Contrastive Learning module for utilizing the contrastive context, and iii) Commonsense Reasoning module for incorporating the commonsense context. 
Comprehensive experiments on this challenging task demonstrate the effectiveness of each model component, the superiority of our full model over other baselines, and the generalizability of our model to a new VideoQA task. The dataset and codes are open-sourced at: https://github.com/JoseponLee/IntentQA.git", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_IntentQA_Context-aware_Video_Intent_Reasoning_ICCV_2023_paper.pdf", - "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi\u2019an Jiaotong University, Xi\u2019an, China+National Key Laboratory of General Artificial Intelligence, Beijing Institute for General Artificial Intelligence (BIGAI), Beijing, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi\u2019an Jiaotong University, Xi\u2019an, China; School of Computer and Information Technology, Beijing Jiaotong University, Beijing, China; National Key Laboratory of General Artificial Intelligence, Beijing Institute for General Artificial Intelligence (BIGAI), Beijing, China", + "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi’an Jiaotong University, Xi’an, China+National Key Laboratory of General Artificial Intelligence, Beijing Institute for General Artificial Intelligence (BIGAI), Beijing, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi’an Jiaotong University, Xi’an, China; School of Computer and Information Technology, Beijing Jiaotong University, Beijing, China; National Key Laboratory of General Artificial Intelligence, Beijing Institute for General Artificial Intelligence (BIGAI), Beijing, China", "project": "", "github": "https://github.com/JoseponLee/IntentQA.git", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_IntentQA_Context-aware_Video_ICCV_2023_supplemental.pdf", @@ -29233,14 +30220,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Li_IntentQA_Context-aware_Video_Intent_Reasoning_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;2;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Beijing Institute for General Artificial Intelligence;Beijing Jiao Tong University", + "aff_unique_norm": "Xi'an Jiaotong University;Beijing Institute for General Artificial Intelligence;Beijing Jiaotong University", "aff_unique_dep": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence;National Key Laboratory of General Artificial Intelligence;School of Computer and Information Technology", "aff_unique_url": "http://www.xjtu.edu.cn;http://www.bigmodel.cn/;http://www.bjtu.edu.cn", "aff_unique_abbr": "XJTU;BIGAI;BJTU", "aff_campus_unique_index": "0+1;0;1;1", "aff_campus_unique": "Xi'an;Beijing", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Jiapeng and Wei,\n Ping and Han,\n Wenjuan and Fan,\n Lifeng\n},\n title = {\n IntentQA: Context-aware Video Intent Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11963-11974\n} \n}" }, { "title": "Inter-Realization Channels: Unsupervised Anomaly Detection Beyond One-Class Classification", @@ -29272,7 +30260,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{McIntosh_2023_ICCV,\n \n author = {\n McIntosh,\n Declan and Albu,\n Alexandra Branzan\n},\n title = {\n Inter-Realization Channels: Unsupervised Anomaly Detection Beyond One-Class Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
6285-6295\n} \n}" }, { "title": "InterDiff: Generating 3D Human-Object Interactions with Physics-Informed Diffusion", @@ -29297,14 +30286,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_InterDiff_Generating_3D_Human-Object_Interactions_with_Physics-Informed_Diffusion_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Sirui and Li,\n Zhengyuan and Wang,\n Yu-Xiong and Gui,\n Liang-Yan\n},\n title = {\n InterDiff: Generating 3D Human-Object Interactions with Physics-Informed Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14928-14940\n} \n}" }, { "title": "InterFormer: Real-time Interactive Image Segmentation", @@ -29332,11 +30322,12 @@ "aff_unique_norm": "Xiamen University;Contemporary Amperex Technology Co. 
Limited", "aff_unique_dep": "Key Laboratory of Multimedia Trusted Perception and Efficient Computing;Intelligent Manufacturing Department", "aff_unique_url": "https://www.xmu.edu.cn;https://www.catl.com.cn", - "aff_unique_abbr": ";CATL", + "aff_unique_abbr": "XMU;CATL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n You and Yang,\n Hao and Sun,\n Ke and Zhang,\n Shengchuan and Cao,\n Liujuan and Jiang,\n Guannan and Ji,\n Rongrong\n},\n title = {\n InterFormer: Real-time Interactive Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22301-22311\n} \n}" }, { "title": "Interaction-aware Joint Attention Estimation Using People Attributes", @@ -29359,7 +30350,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nakatani_Interaction-aware_Joint_Attention_Estimation_Using_People_Attributes_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nakatani_Interaction-aware_Joint_Attention_Estimation_Using_People_Attributes_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Nakatani_2023_ICCV,\n \n author = {\n Nakatani,\n Chihiro and Kawashima,\n Hiroaki and Ukita,\n Norimichi\n},\n title = {\n Interaction-aware Joint Attention Estimation Using People Attributes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10224-10233\n} \n}" }, { "title": "Interactive Class-Agnostic Object Counting", @@ -29384,14 +30376,15 @@ "author_num": 3, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Interactive_Class-Agnostic_Object_Counting_ICCV_2023_paper.html", "aff_unique_index": "0;1+0;0+2", - "aff_unique_norm": "Stony Brook University;Amazon;VinAI Research", - "aff_unique_dep": ";Amazon.com, Inc.;", - "aff_unique_url": "https://www.stonybrook.edu;https://www.amazon.com;https://www.vinai.io/", + "aff_unique_norm": "Stony Brook University;Amazon.com, Inc.;VinAI Research", + "aff_unique_dep": ";;", + "aff_unique_url": "https://www.stonybrook.edu;https://www.amazon.com;https://www.vinai.io", "aff_unique_abbr": "SBU;Amazon;VinAI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+1", - "aff_country_unique": "United States;Vietnam" + "aff_country_unique": "United States;Vietnam", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Yifeng and Ranjan,\n Viresh and Hoai,\n Minh\n},\n title = {\n Interactive Class-Agnostic Object Counting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22312-22322\n} \n}" }, { "title": "IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable Novel View Synthesis", @@ -29416,14 +30409,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_IntrinsicNeRF_Learning_Intrinsic_Neural_Radiance_Fields_for_Editable_Novel_View_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1+2;0;0", - "aff_unique_norm": "Zhejiang University;ETH Zurich;Microsoft", - "aff_unique_dep": "State Key Lab of CAD&CG;;Microsoft Corporation", + "aff_unique_norm": "Zhejiang University;ETH Zurich;Microsoft Corporation", + "aff_unique_dep": "State Key Lab of CAD&CG;;", "aff_unique_url": "http://www.zju.edu.cn;https://www.ethz.ch;https://www.microsoft.com", "aff_unique_abbr": "ZJU;ETHZ;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0;1+2;0;0", - "aff_country_unique": "China;Switzerland;United States" + "aff_country_unique": "China;Switzerland;United States", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Weicai and Chen,\n Shuo and Bao,\n Chong and Bao,\n Hujun and Pollefeys,\n Marc and Cui,\n Zhaopeng and Zhang,\n Guofeng\n},\n title = {\n IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 339-351\n} \n}" }, { "title": "Introducing Language Guidance in Prompt-based Continual Learning", @@ -29446,7 +30440,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Khan_Introducing_Language_Guidance_in_Prompt-based_Continual_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Khan_Introducing_Language_Guidance_in_Prompt-based_Continual_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Khan_2023_ICCV,\n \n author = {\n Khan,\n Muhammad Gul Zain Ali and Naeem,\n Muhammad Ferjad and Van Gool,\n Luc and Stricker,\n Didier and Tombari,\n Federico and Afzal,\n Muhammad Zeshan\n},\n title = {\n Introducing Language Guidance in Prompt-based Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11463-11473\n} \n}" }, { "title": "Invariant Feature Regularization for Fair Face Recognition", @@ -29478,7 +30473,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;0;1;1;0;0;0", - "aff_country_unique": "Singapore;Japan" + "aff_country_unique": "Singapore;Japan", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Jiali and Yue,\n Zhongqi and Tomoyuki,\n Kagaya and 
Tomoki,\n Suzuki and Jayashree,\n Karlekar and Pranata,\n Sugiri and Zhang,\n Hanwang\n},\n title = {\n Invariant Feature Regularization for Fair Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20861-20870\n} \n}" }, { "title": "Invariant Training 2D-3D Joint Hard Samples for Few-Shot Point Cloud Recognition", @@ -29510,7 +30506,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "Singapore;Australia;" + "aff_country_unique": "Singapore;Australia;", + "bibtex": "@InProceedings{Yi_2023_ICCV,\n \n author = {\n Yi,\n Xuanyu and Deng,\n Jiajun and Sun,\n Qianru and Hua,\n Xian-Sheng and Lim,\n Joo-Hwee and Zhang,\n Hanwang\n},\n title = {\n Invariant Training 2D-3D Joint Hard Samples for Few-Shot Point Cloud Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14463-14474\n} \n}" }, { "title": "Inverse Compositional Learning for Weakly-supervised Relation Grounding", @@ -29522,7 +30519,7 @@ "author": "Huan Li; Ping Wei; Zeyu Ma; Nanning Zheng", "abstract": "Video relation grounding (VRG) is a significant and challenging problem in the domains of cross-modal learning and video understanding. In this study, we introduce a novel approach called inverse compositional learning (ICL) for weakly-supervised video relation grounding. Our approach represents relations at both the holistic and partial levels, formulating VRG as a joint optimization problem that encompasses reasoning at both levels.\n For holistic-level reasoning, we propose an inverse attention mechanism and a compositional encoder to generate compositional relevance features. 
Additionally, we introduce an inverse loss to evaluate and learn the relevance between visual features and relation features.\n At the partial-level reasoning, we introduce a grounding by classification scheme. By leveraging the learned holistic-level features and partial-level features, we train the entire model in an end-to-end manner.\n We conduct evaluations on two challenging datasets and demonstrate the substantial superiority of our proposed method over state-of-the-art methods. Extensive ablation studies confirm the effectiveness of our approach.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_Inverse_Compositional_Learning_for_Weakly-supervised_Relation_Grounding_ICCV_2023_paper.pdf", - "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi\u2019an Jiaotong University, Xi\u2019an, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi\u2019an Jiaotong University, Xi\u2019an, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi\u2019an Jiaotong University, Xi\u2019an, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi\u2019an Jiaotong University, Xi\u2019an, China", + "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi’an Jiaotong University, Xi’an, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi’an Jiaotong University, Xi’an, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi’an Jiaotong University, Xi’an, China; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, Xi’an Jiaotong University, Xi’an, China", "project": "", "github": "", "supp": "", @@ -29535,14 +30532,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Inverse_Compositional_Learning_for_Weakly-supervised_Relation_Grounding_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Xi'an Jiao Tong 
University", + "aff_unique_norm": "Xi'an Jiaotong University", "aff_unique_dep": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence", "aff_unique_url": "http://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Huan and Wei,\n Ping and Ma,\n Zeyu and Zheng,\n Nanning\n},\n title = {\n Inverse Compositional Learning for Weakly-supervised Relation Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15477-15487\n} \n}" }, { "title": "Inverse Problem Regularization with Hierarchical Variational Autoencoders", @@ -29550,11 +30548,11 @@ "status": "Poster", "track": "main", "pid": "11126", - "author_site": "Jean Prost, Antoine Houdard, Andr\u00e9s Almansa, Nicolas Papadakis", - "author": "Jean Prost; Antoine Houdard; Andr\u00e9s Almansa; Nicolas Papadakis", + "author_site": "Jean Prost, Antoine Houdard, Andrés Almansa, Nicolas Papadakis", + "author": "Jean Prost; Antoine Houdard; Andrés Almansa; Nicolas Papadakis", "abstract": "In this paper, we propose to regularize ill-posed inverse problems using a deep hierarchical Variational AutoEncoder (HVAE) as an image prior. The proposed method synthesizes the advantages of i) denoiser-based Plug & Play approaches and ii) generative model based approaches to inverse problems. First, we exploit VAE properties to design an efficient algorithm that benefits from convergence guarantees of Plug-and-Play (PnP) methods. Second, our approach is not restricted to specialized datasets and the proposed PnP-HVAE model is able to solve image restoration problems on natural images of any size. 
Our experiments show that the proposed PnP-HVAE method is competitive with both SOTA denoiser-based PnP approaches, and other SOTA restoration methods based on generative models. The code for this project is available at https://github.com/jprost76/PnP-HVAE.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Prost_Inverse_Problem_Regularization_with_Hierarchical_Variational_Autoencoders_ICCV_2023_paper.pdf", - "aff": "Univ. Bordeaux, CNRS, Bordeaux INP, IMB, UMR 5251, F-33400 Talence, France; Ubisoft La Forge, Bordeaux; Universit\u00e9 Paris Cit\u00e9, CNRS, MAP5, UMR 8145; Univ. Bordeaux, CNRS, Bordeaux INP, IMB, UMR 5251, F-33400 Talence, France", + "aff": "Univ. Bordeaux, CNRS, Bordeaux INP, IMB, UMR 5251, F-33400 Talence, France; Ubisoft La Forge, Bordeaux; Université Paris Cité, CNRS, MAP5, UMR 8145; Univ. Bordeaux, CNRS, Bordeaux INP, IMB, UMR 5251, F-33400 Talence, France", "project": "", "github": "https://github.com/jprost76/PnP-HVAE", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Prost_Inverse_Problem_Regularization_ICCV_2023_supplemental.pdf", @@ -29567,14 +30565,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Prost_Inverse_Problem_Regularization_with_Hierarchical_Variational_Autoencoders_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "University of Bordeaux;Ubisoft;Universit\u00e9 Paris Cit\u00e9", - "aff_unique_dep": "Institut de Math\u00e9matiques de Bordeaux (IMB);La Forge;CNRS, MAP5, UMR 8145", + "aff_unique_norm": "University of Bordeaux;Ubisoft;Université Paris Cité", + "aff_unique_dep": "Institut de Mathématiques de Bordeaux (IMB);La Forge;CNRS, MAP5, UMR 8145", "aff_unique_url": "https://www.univ-bordeaux.fr;https://www.ubisoft.com;https://www.universite-paris.fr", "aff_unique_abbr": "Univ. 
Bordeaux;Ubisoft;UPC", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Talence;Bordeaux;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Prost_2023_ICCV,\n \n author = {\n Prost,\n Jean and Houdard,\n Antoine and Almansa,\n Andr\\'es and Papadakis,\n Nicolas\n},\n title = {\n Inverse Problem Regularization with Hierarchical Variational Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22894-22905\n} \n}" }, { "title": "Is Imitation All You Need? Generalized Decision-Making with Dual-Phase Training", @@ -29599,14 +30598,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Is_Imitation_All_You_Need_Generalized_Decision-Making_with_Dual-Phase_Training_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;0;2;0;3;2;0", - "aff_unique_norm": "Microsoft;University of Maryland;Scaled Foundations;Zhejiang University", - "aff_unique_dep": "Microsoft Corporation;;;", + "aff_unique_norm": "Microsoft Corporation;University of Maryland;Scaled Foundations;Zhejiang University", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.microsoft.com;https://www/umd.edu;https://scaledfoundations.com;https://www.zju.edu.cn", "aff_unique_abbr": "Microsoft;UMD;;ZJU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Yao and Sun,\n Yanchao and Zheng,\n Ruijie and Vemprala,\n Sai and Bonatti,\n Rogerio and Chen,\n Shuhang and Madaan,\n Ratnesh and Ba,\n Zhongjie and Kapoor,\n Ashish and Ma,\n Shuang\n},\n title = {\n Is Imitation All You Need? 
Generalized Decision-Making with Dual-Phase Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16221-16231\n} \n}" }, { "title": "Isomer: Isomerous Transformer for Zero-shot Video Object Segmentation", @@ -29631,14 +30631,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yuan_Isomer_Isomerous_Transformer_for_Zero-shot_Video_Object_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;1;1;2", - "aff_unique_norm": "Dalian University of Technology;OPPO Research Institute;Hong Kong Polytechnic University", + "aff_unique_norm": "Dalian University of Technology;OPPO Research Institute;The Hong Kong Polytechnic University", "aff_unique_dep": "School of Information and Communication Engineering;;", "aff_unique_url": "http://en.dlut.edu.cn/;https://www.oppo.com/en;https://www.polyu.edu.hk", "aff_unique_abbr": "DUT;OPPO RI;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Yichen and Wang,\n Yifan and Wang,\n Lijun and Zhao,\n Xiaoqi and Lu,\n Huchuan and Wang,\n Yu and Su,\n Weibo and Zhang,\n Lei\n},\n title = {\n Isomer: Isomerous Transformer for Zero-shot Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 966-976\n} \n}" }, { "title": "Iterative Denoiser and Noise Estimator for Self-Supervised Image Denoising", @@ -29670,7 +30671,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zou_2023_ICCV,\n \n author = {\n Zou,\n 
Yunhao and Yan,\n Chenggang and Fu,\n Ying\n},\n title = {\n Iterative Denoiser and Noise Estimator for Self-Supervised Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13265-13274\n} \n}" }, { "title": "Iterative Prompt Learning for Unsupervised Backlit Image Enhancement", @@ -29702,7 +30704,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Zhexin and Li,\n Chongyi and Zhou,\n Shangchen and Feng,\n Ruicheng and Loy,\n Chen Change\n},\n title = {\n Iterative Prompt Learning for Unsupervised Backlit Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8094-8103\n} \n}" }, { "title": "Iterative Soft Shrinkage Learning for Efficient Image Super-Resolution", @@ -29714,7 +30717,7 @@ "author": "Jiamian Wang; Huan Wang; Yulun Zhang; Yun Fu; Zhiqiang Tao", "abstract": "Image super-resolution (SR) has witnessed extensive neural network designs from CNN to transformer architectures. However, prevailing SR models suffer from prohibitive memory footprint and intensive computations, which limits further deployment on edge devices. This work investigates the potential of network pruning for super-resolution to take advantage of off-the-shelf network designs and reduce the underlying computational overhead. Two main challenges remain in applying pruning methods for SR. First, the widely-used filter pruning technique reflects limited granularity and restricted adaptability to diverse network structures. 
Second, existing pruning methods generally operate upon a pre-trained network for the sparse structure determination, hard to get rid of dense model training in the traditional SR paradigm. To address these challenges, we adopt unstructured pruning with sparse models directly trained from scratch. Specifically, we propose a novel Iterative Soft Shrinkage-Percentage (ISS-P) method by optimizing the sparse structure of a randomly initialized network at each iteration and tweaking unimportant weights with a small amount proportional to the magnitude scale on-the-fly. We observe that the proposed ISS-P can dynamically learn sparse structures adapting to the optimization process and preserve the sparse model's trainability by yielding a more regularized gradient throughput. Experiments on benchmark datasets demonstrate the effectiveness of the proposed ISS-P over diverse network architectures. Code is available at https://github.com/Jiamian-Wang/Iterative-Soft-Shrinkage-SR", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_Iterative_Soft_Shrinkage_Learning_for_Efficient_Image_Super-Resolution_ICCV_2023_paper.pdf", - "aff": "Rochester Institute of Technology; Northeastern University; ETH Z\u00fcrich; Northeastern University; Rochester Institute of Technology", + "aff": "Rochester Institute of Technology; Northeastern University; ETH Zürich; Northeastern University; Rochester Institute of Technology", "project": "", "github": "https://github.com/Jiamian-Wang/Iterative-Soft-Shrinkage-SR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Wang_Iterative_Soft_Shrinkage_Learning_for_Efficient_Image_Super-Resolution_ICCV_2023_supplemental.pdf", @@ -29727,14 +30730,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Iterative_Soft_Shrinkage_Learning_for_Efficient_Image_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;0", - "aff_unique_norm": "Rochester Institute of 
Technology;Northeastern University;ETH Zurich", + "aff_unique_norm": "Rochester Institute of Technology;Northeastern University;ETH Zürich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rit.edu;https://www.northeastern.edu;https://www.ethz.ch", "aff_unique_abbr": "RIT;NEU;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jiamian and Wang,\n Huan and Zhang,\n Yulun and Fu,\n Yun and Tao,\n Zhiqiang\n},\n title = {\n Iterative Soft Shrinkage Learning for Efficient Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12590-12599\n} \n}" }, { "title": "Iterative Superquadric Recomposition of 3D Objects from Multiple Views", @@ -29757,7 +30761,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Alaniz_Iterative_Superquadric_Recomposition_of_3D_Objects_from_Multiple_Views_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Alaniz_Iterative_Superquadric_Recomposition_of_3D_Objects_from_Multiple_Views_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Alaniz_2023_ICCV,\n \n author = {\n Alaniz,\n Stephan and Mancini,\n Massimiliano and Akata,\n Zeynep\n},\n title = {\n Iterative Superquadric Recomposition of 3D Objects from Multiple Views\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18013-18023\n} \n}" }, { "title": "JOTR: 3D Joint Contrastive Learning with Transformers for Occluded Human Mesh Recovery", @@ -29789,7 +30794,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Jiahao and Yang,\n Zongxin and Wang,\n Xiaohan and Ma,\n Jianxin and Zhou,\n Chang and Yang,\n Yi\n},\n title = {\n JOTR: 3D Joint Contrastive Learning with Transformers for Occluded Human Mesh Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9110-9121\n} \n}" }, { "title": "Joint Demosaicing and Deghosting of Time-Varying Exposures for Single-Shot HDR Imaging", @@ -29821,7 +30827,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Jungwoo and Kim,\n Min H.\n},\n title = {\n Joint Demosaicing and Deghosting of Time-Varying Exposures for Single-Shot HDR Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12292-12301\n} \n}" }, { "title": "Joint Implicit Neural Representation for High-fidelity and Compact Vector Fonts", @@ -29853,7 +30860,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Chia-Hao and Liu,\n Ying-Tian and Zhang,\n Zhifei and Guo,\n Yuan-Chen and Zhang,\n Song-Hai\n},\n title = {\n Joint Implicit Neural Representation for High-fidelity and Compact Vector Fonts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5538-5548\n} 
\n}" }, { "title": "Joint Metrics Matter: A Better Standard for Trajectory Forecasting", @@ -29885,7 +30893,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Weng_2023_ICCV,\n \n author = {\n Weng,\n Erica and Hoshino,\n Hana and Ramanan,\n Deva and Kitani,\n Kris\n},\n title = {\n Joint Metrics Matter: A Better Standard for Trajectory Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20315-20326\n} \n}" }, { "title": "Joint-Relation Transformer for Multi-Person Motion Prediction", @@ -29917,7 +30926,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Qingyao and Mao,\n Weibo and Gong,\n Jingze and Xu,\n Chenxin and Chen,\n Siheng and Xie,\n Weidi and Zhang,\n Ya and Wang,\n Yanfeng\n},\n title = {\n Joint-Relation Transformer for Multi-Person Motion Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9816-9826\n} \n}" }, { "title": "Jumping through Local Minima: Quantization in the Loss Landscape of Vision Transformers", @@ -29949,7 +30959,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Frumkin_2023_ICCV,\n \n author = {\n Frumkin,\n Natalia and Gope,\n Dibakar and Marculescu,\n Diana\n},\n title = {\n Jumping through Local Minima: Quantization in the Loss 
Landscape of Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16978-16988\n} \n}" }, { "title": "KECOR: Kernel Coding Rate Maximization for Active 3D Object Detection", @@ -29974,14 +30985,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Luo_KECOR_Kernel_Coding_Rate_Maximization_for_Active_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0;0", - "aff_unique_norm": "University of Queensland;University of Technology Sydney;Harbin Institute of Technology", + "aff_unique_norm": "The University of Queensland;University of Technology Sydney;Harbin Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uq.edu.au;https://www.uts.edu.au;http://en.hhit.edu.cn/", "aff_unique_abbr": "UQ;UTS;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Yadan and Chen,\n Zhuoxiao and Fang,\n Zhen and Zhang,\n Zheng and Baktashmotlagh,\n Mahsa and Huang,\n Zi\n},\n title = {\n KECOR: Kernel Coding Rate Maximization for Active 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18279-18290\n} \n}" }, { "title": "Keep It SimPool: Who Said Supervised Transformers Suffer from Attention Deficit?", @@ -30004,7 +31016,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Psomas_Keep_It_SimPool_Who_Said_Supervised_Transformers_Suffer_from_Attention_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Psomas_Keep_It_SimPool_Who_Said_Supervised_Transformers_Suffer_from_Attention_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Psomas_2023_ICCV,\n \n author = {\n Psomas,\n Bill and Kakogeorgiou,\n Ioannis and Karantzalos,\n Konstantinos and Avrithis,\n Yannis\n},\n title = {\n Keep It SimPool: Who Said Supervised Transformers Suffer from Attention Deficit?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5350-5360\n} \n}" }, { "title": "Kick Back & Relax: Learning to Reconstruct the World by Watching SlowTV", @@ -30036,7 +31049,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Spencer_2023_ICCV,\n \n author = {\n Spencer,\n Jaime and Russell,\n Chris and Hadfield,\n Simon and Bowden,\n Richard\n},\n title = {\n Kick Back \\& Relax: Learning to Reconstruct the World by Watching SlowTV\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15768-15779\n} \n}" }, { "title": "Knowing Where to Focus: Event-aware Transformer for Video Grounding", @@ -30068,7 +31082,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jang_2023_ICCV,\n \n author = {\n Jang,\n Jinhyun and Park,\n Jungin and Kim,\n Jin and Kwon,\n Hyeongjun and Sohn,\n Kwanghoon\n},\n title = {\n Knowing Where to Focus: Event-aware Transformer for Video Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 13846-13856\n} \n}" }, { "title": "Knowledge Proxy Intervention for Deconfounded Video Question Answering", @@ -30080,7 +31095,7 @@ "author": "Jiangtong Li; Li Niu; Liqing Zhang", "abstract": "Recently, Video Question-Answering (VideoQA) has drawn more and more attention from both industry and research community. Despite all the success achieved by recent works, dataset bias always harmfully misleads current methods focusing on spurious correlations in training data. To analyze the effects of dataset bias, we frame the VideoQA pipeline into a causal graph, which shows the causalities among video, question, aligned feature between video and question, answer, and underlying confounder. Through the causal graph, we prove that the confounder and the backdoor path lead to spurious causality. To tackle the challenge that the confounder in VideoQA is unobserved and non-enumerable in general, we propose a model-agnostic framework called Knowledge Proxy Intervention (KPI), which introduces an extra knowledge proxy variable in the causal graph to cut the backdoor path and remove the confounder. Our KPI framework exploits the front-door adjustment, which requires no prior knowledge about the confounder. 
The effectiveness of our KPI framework is corroborated by three baseline methods on five benchmark datasets, including MSVD-QA, MSRVTT-QA, TGIF-QA, NExT-QA, and Causal-VidQA.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_Knowledge_Proxy_Intervention_for_Deconfounded_Video_Question_Answering_ICCV_2023_paper.pdf", - "aff": "Department of Computer Science and Engineering, MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University; Department of Computer Science and Engineering, MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University; Department of Computer Science and Engineering, MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University", + "aff": "Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University; Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University; Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_Knowledge_Proxy_Intervention_ICCV_2023_supplemental.pdf", @@ -30100,7 +31115,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Jiangtong and Niu,\n Li and Zhang,\n Liqing\n},\n title = {\n Knowledge Proxy Intervention for Deconfounded Video Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2782-2793\n} \n}" }, { "title": "Knowledge Restore and Transfer for Multi-Label Class-Incremental Learning", @@ -30123,7 +31139,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - 
"oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_Knowledge_Restore_and_Transfer_for_Multi-Label_Class-Incremental_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_Knowledge_Restore_and_Transfer_for_Multi-Label_Class-Incremental_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Songlin and Luo,\n Haoyu and He,\n Yuhang and Wei,\n Xing and Cheng,\n Jie and Gong,\n Yihong\n},\n title = {\n Knowledge Restore and Transfer for Multi-Label Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18711-18720\n} \n}" }, { "title": "Knowledge-Aware Federated Active Learning with Non-IID Data", @@ -30148,14 +31165,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cao_Knowledge-Aware_Federated_Active_Learning_with_Non-IID_Data_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1;0", - "aff_unique_norm": "University of Sydney;ShanghaiTech University", + "aff_unique_norm": "The University of Sydney;ShanghaiTech University", "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "https://www.sydney.edu.au;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "USYD;ShanghaiTech", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Yu-Tong and Shi,\n Ye and Yu,\n Baosheng and Wang,\n Jingya and Tao,\n Dacheng\n},\n title = {\n Knowledge-Aware Federated Active Learning with Non-IID Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22279-22289\n} \n}" }, { "title": 
"Knowledge-Aware Prompt Tuning for Generalizable Vision-Language Models", @@ -30180,14 +31198,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kan_Knowledge-Aware_Prompt_Tuning_for_Generalizable_Vision-Language_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;0;3;4;1", - "aff_unique_norm": "Qilu University of Technology;Southern University of Science and Technology;University of Hong Kong;United Imaging Healthcare;Monash University", + "aff_unique_norm": "Qilu University of Technology;Southern University of Science and Technology;The University of Hong Kong;United Imaging Healthcare;Monash University", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.qilu.edu.cn/;https://www.sustech.edu.cn;https://www.hku.hk;https://www.united-imaging.com;https://www.monash.edu", "aff_unique_abbr": "QUT;SUSTech;HKU;;Monash", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+0;0;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Kan_2023_ICCV,\n \n author = {\n Kan,\n Baoshuo and Wang,\n Teng and Lu,\n Wenpeng and Zhen,\n Xiantong and Guan,\n Weili and Zheng,\n Feng\n},\n title = {\n Knowledge-Aware Prompt Tuning for Generalizable Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15670-15680\n} \n}" }, { "title": "Knowledge-Spreader: Learning Semi-Supervised Facial Action Dynamics by Consistifying Knowledge Granularity", @@ -30219,7 +31238,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Binghamton", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiaotian and Zhang,\n Xiang and Wang,\n Taoyue and Yin,\n Lijun\n},\n title = 
{\n Knowledge-Spreader: Learning Semi-Supervised Facial Action Dynamics by Consistifying Knowledge Granularity\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20979-20989\n} \n}" }, { "title": "L-DAWA: Layer-wise Divergence Aware Weight Aggregation in Federated Self-Supervised Visual Representation Learning", @@ -30242,7 +31262,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Rehman_L-DAWA_Layer-wise_Divergence_Aware_Weight_Aggregation_in_Federated_Self-Supervised_Visual_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Rehman_L-DAWA_Layer-wise_Divergence_Aware_Weight_Aggregation_in_Federated_Self-Supervised_Visual_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Rehman_2023_ICCV,\n \n author = {\n Rehman,\n Yasar Abbas Ur and Gao,\n Yan and de Gusmao,\n Pedro Porto Buarque and Alibeigi,\n Mina and Shen,\n Jiajun and Lane,\n Nicholas D.\n},\n title = {\n L-DAWA: Layer-wise Divergence Aware Weight Aggregation in Federated Self-Supervised Visual Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16464-16473\n} \n}" }, { "title": "LA-Net: Landmark-Aware Learning for Reliable Facial Expression Recognition under Label Noise", @@ -30274,7 +31295,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Zhiyu and Cui,\n Jinshi\n},\n title = {\n LA-Net: Landmark-Aware Learning for Reliable Facial Expression Recognition under Label Noise\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20698-20707\n} \n}" }, { "title": "LAC - Latent Action Composition for Skeleton-based Action Segmentation", @@ -30286,7 +31308,7 @@ "author": "Di Yang; Yaohui Wang; Antitza Dantcheva; Quan Kong; Lorenzo Garattoni; Gianpiero Francesca; Francois Bremond", "abstract": "Skeleton-based action segmentation requires recognizing composable actions in untrimmed videos. Current approaches decouple this problem by first extracting local visual features from skeleton sequences and then processing them by a temporal model to classify frame-wise actions. However, their performances remain limited as the visual features cannot sufficiently express composable actions. In this context, we propose Latent Action Composition (LAC), a novel self-supervised framework aiming at learning from synthesized composable motions for skeleton-based action segmentation. LAC is composed of a novel generation module towards synthesizing new sequences. Specifically, we design a linear latent space in the generator to represent primitive motion. New composed motions can be synthesized by simply performing arithmetic operations on latent representations of multiple input skeleton sequences. LAC leverages such synthesized sequences, which have large diversity and complexity, for learning visual representations of skeletons in both sequence and frame spaces via contrastive learning. The resulting visual encoder has a high expressive power and can be effectively transferred onto action segmentation tasks by end-to-end fine-tuning without the need for additional temporal models. 
We conduct a study focusing on transfer-learning and we show that representations learned from pre-trained LAC outperform the state-of-the-art by a large margin on TSU, Charades, PKU-MMD datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yang_LAC_-_Latent_Action_Composition_for_Skeleton-based_Action_Segmentation_ICCV_2023_paper.pdf", - "aff": "Inria, Universit\u00e9 C\u00f4te d\u2019Azur; Inria, Universit\u00e9 C\u00f4te d\u2019Azur; Inria, Universit\u00e9 C\u00f4te d\u2019Azur; Woven by Toyota; Toyota Motor Europe; Toyota Motor Europe; Inria, Universit\u00e9 C\u00f4te d\u2019Azur", + "aff": "Inria, Université Côte d’Azur; Inria, Université Côte d’Azur; Inria, Université Côte d’Azur; Woven by Toyota; Toyota Motor Europe; Toyota Motor Europe; Inria, Université Côte d’Azur", "project": "https://walker1126.github.io/LAC/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Yang_LAC_-_Latent_ICCV_2023_supplemental.pdf", @@ -30299,14 +31321,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_LAC_-_Latent_Action_Composition_for_Skeleton-based_Action_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;2;2;0", - "aff_unique_norm": "INRIA;Toyota;Toyota Motor Corporation", + "aff_unique_norm": "Inria;Toyota;Toyota Motor Corporation", "aff_unique_dep": ";Woven;", "aff_unique_url": "https://www.inria.fr;https://www.toyota-global.com;https://www.toyota-europe.com", "aff_unique_abbr": "Inria;Toyota;TME", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2;2;0", - "aff_country_unique": "France;Japan;Unknown" + "aff_country_unique": "France;Japan;Unknown", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Di and Wang,\n Yaohui and Dantcheva,\n Antitza and Kong,\n Quan and Garattoni,\n Lorenzo and Francesca,\n Gianpiero and Bremond,\n Francois\n},\n title = {\n LAC - Latent Action Composition for Skeleton-based Action 
Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13679-13690\n} \n}" }, { "title": "LAN-HDR: Luminance-based Alignment Network for High Dynamic Range Video Reconstruction", @@ -30338,7 +31361,8 @@ "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Chung_2023_ICCV,\n \n author = {\n Chung,\n Haesoo and Cho,\n Nam Ik\n},\n title = {\n LAN-HDR: Luminance-based Alignment Network for High Dynamic Range Video Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12760-12769\n} \n}" }, { "title": "LATR: 3D Lane Detection from Monocular Images with Transformer", @@ -30370,7 +31394,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Yueru and Zheng,\n Chaoda and Yan,\n Xu and Kun,\n Tang and Zheng,\n Chao and Cui,\n Shuguang and Li,\n Zhen\n},\n title = {\n LATR: 3D Lane Detection from Monocular Images with Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7941-7952\n} \n}" }, { "title": "LAW-Diffusion: Complex Scene Generation by Diffusion with Layouts", @@ -30395,14 +31420,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_LAW-Diffusion_Complex_Scene_Generation_by_Diffusion_with_Layouts_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0;0", - "aff_unique_norm": "Sun Yat-sen University;Jinan 
University;University of Oxford", + "aff_unique_norm": "Sun Yat-Sen University;Jinan University;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.jnu.edu.cn;https://www.ox.ac.uk", "aff_unique_abbr": "SYSU;JNU;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Binbin and Luo,\n Yi and Chen,\n Ziliang and Wang,\n Guangrun and Liang,\n Xiaodan and Lin,\n Liang\n},\n title = {\n LAW-Diffusion: Complex Scene Generation by Diffusion with Layouts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22669-22679\n} \n}" }, { "title": "LD-ZNet: A Latent Diffusion Approach for Text-Based Image Segmentation", @@ -30414,7 +31440,7 @@ "author": "Koutilya PNVR; Bharat Singh; Pallabi Ghosh; Behjat Siddiquie; David Jacobs", "abstract": "Large-scale pre-training tasks like image classification, captioning, or self-supervised techniques do not incentivize learning the semantic boundaries of objects. However, recent generative foundation models built using text-based latent diffusion techniques may learn semantic boundaries. This is because they have to synthesize intricate details about all objects in an image based on a text description. Therefore, we present a technique for segmenting real and AI-generated images using latent diffusion models (LDMs) trained on internet-scale datasets. First, we show that the latent space of LDMs (z-space) is a better input representation compared to other feature representations like RGB images or CLIP encodings for text-based image segmentation. 
By training the segmentation models on the latent z-space, which creates a compressed representation across several domains like different forms of art, cartoons, illustrations, and photographs, we are also able to bridge the domain gap between real and AI-generated images. We show that the internal features of LDMs contain rich semantic information and present a technique in the form of LD-ZNet to further boost the performance of text-based segmentation. Overall, we show up to 6% improvement over standard baselines for text-to-image segmentation on natural images. For AI-generated imagery, we show close to 20% improvement compared to state-of-the-art techniques. The project is available at https://koutilya-pnvr.github.io/LD-ZNet/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/PNVR_LD-ZNet_A_Latent_Diffusion_Approach_for_Text-Based_Image_Segmentation_ICCV_2023_paper.pdf", - "aff": "University of Maryland College Park\u2020; Vchar.ai\u2021; Amazon\u00a7; Amazon\u00a7; University of Maryland College Park\u2020", + "aff": "University of Maryland College Park†; Vchar.ai‡; Amazon§; Amazon§; University of Maryland College Park†", "project": "https://koutilya-pnvr.github.io/LD-ZNet/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/PNVR_LD-ZNet_A_Latent_ICCV_2023_supplemental.pdf", @@ -30427,14 +31453,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/PNVR_LD-ZNet_A_Latent_Diffusion_Approach_for_Text-Based_Image_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;0", - "aff_unique_norm": "University of Maryland;Vchar.ai;Amazon", - "aff_unique_dep": ";;Amazon.com, Inc.", + "aff_unique_norm": "University of Maryland;Vchar.ai;Amazon.com, Inc.", + "aff_unique_dep": ";;", "aff_unique_url": "https://www/umd.edu;;https://www.amazon.com", "aff_unique_abbr": "UMD;;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0", - 
"aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{PNVR_2023_ICCV,\n \n author = {\n PNVR,\n Koutilya and Singh,\n Bharat and Ghosh,\n Pallabi and Siddiquie,\n Behjat and Jacobs,\n David\n},\n title = {\n LD-ZNet: A Latent Diffusion Approach for Text-Based Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4157-4168\n} \n}" }, { "title": "LDL: Line Distance Functions for Panoramic Localization", @@ -30466,7 +31493,8 @@ "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Junho and Choi,\n Changwoon and Jang,\n Hojun and Kim,\n Young Min\n},\n title = {\n LDL: Line Distance Functions for Panoramic Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17882-17892\n} \n}" }, { "title": "LDP-Feat: Image Features with Local Differential Privacy", @@ -30498,7 +31526,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Pittaluga_2023_ICCV,\n \n author = {\n Pittaluga,\n Francesco and Zhuang,\n Bingbing\n},\n title = {\n LDP-Feat: Image Features with Local Differential Privacy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17580-17590\n} \n}" }, { "title": "LEA2: A Lightweight Ensemble Adversarial Attack via Non-overlapping Vulnerable Frequency Regions", @@ -30521,7 +31550,8 @@ "aff_domain": 
";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qian_LEA2_A_Lightweight_Ensemble_Adversarial_Attack_via_Non-overlapping_Vulnerable_Frequency_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qian_LEA2_A_Lightweight_Ensemble_Adversarial_Attack_via_Non-overlapping_Vulnerable_Frequency_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Qian_2023_ICCV,\n \n author = {\n Qian,\n Yaguan and He,\n Shuke and Zhao,\n Chenyu and Sha,\n Jiaqiang and Wang,\n Wei and Wang,\n Bin\n},\n title = {\n LEA2: A Lightweight Ensemble Adversarial Attack via Non-overlapping Vulnerable Frequency Regions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4510-4521\n} \n}" }, { "title": "LERF: Language Embedded Radiance Fields", @@ -30553,7 +31583,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kerr_2023_ICCV,\n \n author = {\n Kerr,\n Justin and Kim,\n Chung Min and Goldberg,\n Ken and Kanazawa,\n Angjoo and Tancik,\n Matthew\n},\n title = {\n LERF: Language Embedded Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19729-19739\n} \n}" }, { "title": "LFS-GAN: Lifelong Few-Shot Image Generation", @@ -30585,7 +31616,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Yongin;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Seo_2023_ICCV,\n \n author = {\n Seo,\n Juwon and Kang,\n Ji-Su and Park,\n Gyeong-Moon\n},\n title = {\n LFS-GAN: Lifelong Few-Shot Image Generation\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11356-11366\n} \n}" }, { "title": "LIMITR: Leveraging Local Information for Medical Image-Text Representation", @@ -30597,7 +31629,7 @@ "author": "Gefen Dawidowicz; Elad Hirsch; Ayellet Tal", "abstract": "Medical imaging analysis plays a critical role in the diagnosis and treatment of various medical conditions. This paper focuses on chest X-ray images and their corresponding radiological reports. It presents a new model that learns a joint X-ray image & report representation. The model is based on a novel alignment scheme between the visual data and the text, which takes into account both local and global information. Furthermore, the model integrates domain-specific information of two types -- lateral images and the consistent visual structure of chest images. Our representation is shown to benefit three types of retrieval tasks: text-image retrieval, class-based retrieval, and phrase-grounding.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Dawidowicz_LIMITR_Leveraging_Local_Information_for_Medical_Image-Text_Representation_ICCV_2023_paper.pdf", - "aff": "Technion \u2013 Israel Institute of Technology; Technion \u2013 Israel Institute of Technology; Technion \u2013 Israel Institute of Technology+Cornell Tech", + "aff": "Technion – Israel Institute of Technology; Technion – Israel Institute of Technology; Technion – Israel Institute of Technology+Cornell Tech", "project": "", "github": "https://github.com/gefend/LIMITR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Dawidowicz_LIMITR_Leveraging_Local_ICCV_2023_supplemental.pdf", @@ -30610,14 +31642,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dawidowicz_LIMITR_Leveraging_Local_Information_for_Medical_Image-Text_Representation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1", 
- "aff_unique_norm": "Technion \u2013 Israel Institute of Technology;Cornell University", + "aff_unique_norm": "Technion – Israel Institute of Technology;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://tech.cornell.edu", "aff_unique_abbr": "Technion;Cornell Tech", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York City", "aff_country_unique_index": "0;0;0+1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Dawidowicz_2023_ICCV,\n \n author = {\n Dawidowicz,\n Gefen and Hirsch,\n Elad and Tal,\n Ayellet\n},\n title = {\n LIMITR: Leveraging Local Information for Medical Image-Text Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21165-21173\n} \n}" }, { "title": "LIST: Learning Implicitly from Spatial Transformers for Single-View 3D Reconstruction", @@ -30642,14 +31675,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Arshad_LIST_Learning_Implicitly_from_Spatial_Transformers_for_Single-View_3D_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Texas at Arlington", + "aff_unique_norm": "The University of Texas at Arlington", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.uta.edu", "aff_unique_abbr": "UT Arlington", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Arlington", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Arshad_2023_ICCV,\n \n author = {\n Arshad,\n Mohammad Samiul and Beksi,\n William J.\n},\n title = {\n LIST: Learning Implicitly from Spatial Transformers for Single-View 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9321-9330\n} \n}" }, { "title": "LISTER: Neighbor Decoding for Length-Insensitive Scene Text Recognition", @@ -30681,7 +31715,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Changxu and Wang,\n Peng and Da,\n Cheng and Zheng,\n Qi and Yao,\n Cong\n},\n title = {\n LISTER: Neighbor Decoding for Length-Insensitive Scene Text Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19541-19551\n} \n}" }, { "title": "LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with Large Language Models", @@ -30706,14 +31741,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Song_LLM-Planner_Few-Shot_Grounded_Planning_for_Embodied_Agents_with_Large_Language_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "Ohio State University;U.S. Army Research Laboratory", + "aff_unique_norm": "The Ohio State University;U.S. 
Army Research Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.osu.edu;https://www.arl.army.mil", "aff_unique_abbr": "OSU;ARL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Chan Hee and Wu,\n Jiaman and Washington,\n Clayton and Sadler,\n Brian M and Chao,\n Wei-Lun and Su,\n Yu\n},\n title = {\n LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with Large Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2998-3009\n} \n}" }, { "title": "LMR: A Large-Scale Multi-Reference Dataset for Reference-Based Super-Resolution", @@ -30738,14 +31774,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_LMR_A_Large-Scale_Multi-Reference_Dataset_for_Reference-Based_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0+1+2+0+3;4;4;4;4;0+1+2+3", - "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;State Key Laboratory of Multimodal Artificial Intelligence Systems;Hong Kong Institute of Science and Technology;Baidu", + "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;State Key Laboratory of Multimodal Artificial Intelligence Systems;Hong Kong Institute of Science and Technology;Baidu Inc.", "aff_unique_dep": ";Institute of Automation;;Center for Artificial Intelligence and Robotics;Department of Computer Vision Technology (VIS)", "aff_unique_url": "http://www.ucas.ac.cn;http://www.ia.cas.cn;;https://www.hkisi.edu.hk;https://www.baidu.com", "aff_unique_abbr": "UCAS;CAS;;HKISI;Baidu", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": 
"0+0+0+0+0;0;0;0;0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Lin and Li,\n Xin and He,\n Dongliang and Li,\n Fu and Ding,\n Errui and Zhang,\n Zhaoxiang\n},\n title = {\n LMR: A Large-Scale Multi-Reference Dataset for Reference-Based Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13118-13127\n} \n}" }, { "title": "LNPL-MIL: Learning from Noisy Pseudo Labels for Promoting Multiple Instance Learning in Whole Slide Image", @@ -30777,7 +31814,8 @@ "aff_campus_unique_index": "0;0;1;0;1;0;0", "aff_campus_unique": "Shenzhen;Harbin", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Zhuchen and Wang,\n Yifeng and Chen,\n Yang and Bian,\n Hao and Liu,\n Shaohui and Wang,\n Haoqian and Zhang,\n Yongbing\n},\n title = {\n LNPL-MIL: Learning from Noisy Pseudo Labels for Promoting Multiple Instance Learning in Whole Slide Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21495-21505\n} \n}" }, { "title": "LPFF: A Portrait Dataset for Face Generators Across Large Poses", @@ -30809,7 +31847,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Yiqian and Zhang,\n Jing and Fu,\n Hongbo and Jin,\n Xiaogang\n},\n title = {\n LPFF: A Portrait Dataset for Face Generators Across Large Poses\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 20327-20337\n} \n}" }, { "title": "LRRU: Long-short Range Recurrent Updating Networks for Depth Completion", @@ -30821,7 +31860,7 @@ "author": "Yufei Wang; Bo Li; Ge Zhang; Qi Liu; Tao Gao; Yuchao Dai", "abstract": "Existing deep learning-based depth completion methods generally employ massive stacked layers to predict the dense depth map from sparse input data. Although such approaches greatly advance this task, their accompanied huge computational complexity hinders their practical applications. To accomplish depth completion more efficiently, we propose a novel lightweight deep network framework, the Long-short Range Recurrent Updating (LRRU) network. Without learning complex feature representations, LRRU first roughly fills the sparse input to obtain an initial dense depth map, and then iteratively updates it through learned spatially-variant kernels. Our iterative update process is content-adaptive and highly flexible, where the kernel weights are learned by jointly considering the guidance RGB images and the depth map to be updated, and large-to-small kernel scopes are dynamically adjusted to capture long-to-short range dependencies. Our initial depth map has coarse but complete scene depth information, which helps relieve the burden of directly regressing the dense depth from sparse ones, while our proposed method can effectively refine it to an accurate depth map with less learnable parameters and inference time. Experimental results demonstrate that our proposed LRRU variants achieve state-of-the-art performance across different parameter regimes. In particular, the LRRU-Base model outperforms competing approaches on the NYUv2 dataset, and ranks 1st on the KITTI depth completion benchmark at the time of submission. 
Project page: https://npucvr.github.io/LRRU/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_LRRU_Long-short_Range_Recurrent_Updating_Networks_for_Depth_Completion_ICCV_2023_paper.pdf", - "aff": "Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Chang\u2019an University; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing", + "aff": "Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing; Chang’an University; Northwestern Polytechnical University and Shaanxi Key Laboratory of Information Acquisition and Processing", "project": "https://npucvr.github.io/LRRU/", "github": "", "supp": "", @@ -30841,7 +31880,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yufei and Li,\n Bo and Zhang,\n Ge and Liu,\n Qi and Gao,\n Tao and Dai,\n Yuchao\n},\n title = {\n LRRU: Long-short Range Recurrent Updating Networks for Depth Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2023\n},\n pages = {\n 9422-9432\n} \n}" }, { "title": "LU-NeRF: Scene and Pose Estimation by Synchronizing Local Unposed NeRFs", @@ -30873,7 +31913,8 @@ "aff_campus_unique_index": "0+1;1;1;1;0;1", "aff_campus_unique": "Amherst;Mountain View", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Zezhou and Esteves,\n Carlos and Jampani,\n Varun and Kar,\n Abhishek and Maji,\n Subhransu and Makadia,\n Ameesh\n},\n title = {\n LU-NeRF: Scene and Pose Estimation by Synchronizing Local Unposed NeRFs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18312-18321\n} \n}" }, { "title": "LVOS: A Benchmark for Long-term Video Object Segmentation", @@ -30905,7 +31946,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Lingyi and Chen,\n Wenchao and Liu,\n Zhongying and Zhang,\n Wei and Guo,\n Pinxue and Chen,\n Zhaoyu and Zhang,\n Wenqiang\n},\n title = {\n LVOS: A Benchmark for Long-term Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13480-13492\n} \n}" }, { "title": "LaPE: Layer-adaptive Position Embedding for Vision Transformers with Independent Layer Normalization", @@ -30930,14 +31972,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_LaPE_Layer-adaptive_Position_Embedding_for_Vision_Transformers_with_Independent_Layer_ICCV_2023_paper.html", "aff_unique_index": "0+0;1;0+0;0+0;2;3;2;0+1+0", - "aff_unique_norm": "Peking 
University;Pengcheng Laboratory;Tsinghua University;Zhejiang University", - "aff_unique_dep": "School of Electronic and Computer Engineering;Peng Cheng Laboratory;Department of Automation and BNRist;School of Computer Science and Technology", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Tsinghua University;Zhejiang University", + "aff_unique_dep": "School of Electronic and Computer Engineering;;Department of Automation and BNRist;School of Computer Science and Technology", "aff_unique_url": "http://www.pku.edu.cn;;https://www.tsinghua.edu.cn;http://www.zju.edu.cn", "aff_unique_abbr": "PKU;;THU;ZJU", "aff_campus_unique_index": "0+0;0;0+0;0+0;1;2;1;0+0+0", "aff_campus_unique": "Shenzhen;Beijing;Zhejiang", "aff_country_unique_index": "0+0;0;0+0;0+0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Runyi and Wang,\n Zhennan and Wang,\n Yinhuai and Li,\n Kehan and Liu,\n Chang and Duan,\n Haoyi and Ji,\n Xiangyang and Chen,\n Jie\n},\n title = {\n LaPE: Layer-adaptive Position Embedding for Vision Transformers with Independent Layer Normalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5886-5896\n} \n}" }, { "title": "LaRS: A Diverse Panoptic Maritime Obstacle Detection Dataset and Benchmark", @@ -30945,8 +31988,8 @@ "status": "Poster", "track": "main", "pid": "12536", - "author_site": "Lojze \u008eust, Janez Per\u009a, Matej Kristan", - "author": "Lojze \u017dust; Janez Per\u0161; Matej Kristan", + "author_site": "Lojze Žust, Janez Perš, Matej Kristan", + "author": "Lojze Žust; Janez Perš; Matej Kristan", "abstract": "The progress in maritime obstacle detection is hindered by the lack of a diverse dataset that adequately captures the complexity of general maritime environments. 
We present the first maritime panoptic obstacle detection benchmark LaRS, featuring scenes from Lakes, Rivers and Seas. Our major contribution is the new dataset, which boasts the largest diversity in recording locations, scene types, obstacle classes, and acquisition conditions among the related datasets. LaRS is composed of over 4000 per-pixel labeled key frames with nine preceding frames to allow utilization of the temporal texture, amounting to over 40k frames. Each key frame is annotated with 8 thing, 3 stuff classes and 19 global scene attributes. We report the results of 27 semantic and panoptic segmentation methods, along with several performance insights and future research directions. To enable objective evaluation, we have implemented an online evaluation server. The LaRS dataset, evaluation toolkit and benchmark are publicly available at: https://lojzezust.github.io/lars-dataset", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zust_LaRS_A_Diverse_Panoptic_Maritime_Obstacle_Detection_Dataset_and_Benchmark_ICCV_2023_paper.pdf", "aff": "University of Ljubljana; University of Ljubljana; University of Ljubljana", @@ -30969,7 +32012,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Slovenia" + "aff_country_unique": "Slovenia", + "bibtex": "@InProceedings{Zust_2023_ICCV,\n \n author = {\n \\v{Z\n}ust,\n Lojze and Per\\v{s\n},\n Janez and Kristan,\n Matej\n},\n title = {\n LaRS: A Diverse Panoptic Maritime Obstacle Detection Dataset and Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20304-20314\n} \n}" }, { "title": "Label Shift Adapter for Test-Time Adaptation under Covariate and Label Shifts", @@ -31001,7 +32045,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United 
States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Sunghyun and Yang,\n Seunghan and Choo,\n Jaegul and Yun,\n Sungrack\n},\n title = {\n Label Shift Adapter for Test-Time Adaptation under Covariate and Label Shifts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16421-16431\n} \n}" }, { "title": "Label-Efficient Online Continual Object Detection in Streaming Video", @@ -31024,7 +32069,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Label-Efficient_Online_Continual_Object_Detection_in_Streaming_Video_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Label-Efficient_Online_Continual_Object_Detection_in_Streaming_Video_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Jay Zhangjie and Zhang,\n David Junhao and Hsu,\n Wynne and Zhang,\n Mengmi and Shou,\n Mike Zheng\n},\n title = {\n Label-Efficient Online Continual Object Detection in Streaming Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19246-19255\n} \n}" }, { "title": "Label-Free Event-based Object Recognition via Joint Learning with Image Reconstruction from Events", @@ -31056,7 +32102,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cho_2023_ICCV,\n \n author = {\n Cho,\n Hoonhee and Kim,\n Hyeonseong and Chae,\n Yujeong and Yoon,\n Kuk-Jin\n},\n title = {\n Label-Free Event-based Object Recognition via Joint Learning with Image Reconstruction from Events\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19866-19877\n} \n}" }, { "title": "Label-Guided Knowledge Distillation for Continual Semantic Segmentation on 2D Images and 3D Point Clouds", @@ -31088,7 +32135,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;0;0;0;0;0;0;1;0", - "aff_country_unique": "Singapore;South Korea" + "aff_country_unique": "Singapore;South Korea", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Ze and Li,\n Ruibo and Ling,\n Evan and Zhang,\n Chi and Wang,\n Yiming and Huang,\n Dezhao and Ma,\n Keng Teck and Hur,\n Minhoe and Lin,\n Guosheng\n},\n title = {\n Label-Guided Knowledge Distillation for Continual Semantic Segmentation on 2D Images and 3D Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18601-18612\n} \n}" }, { "title": "Label-Noise Learning with Intrinsically Long-Tailed Data", @@ -31120,7 +32168,8 @@ "aff_campus_unique_index": "0+0;0+0;1;1;0+0", "aff_campus_unique": "Xiamen;Hong Kong", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Yang and Zhang,\n Yiliang and Han,\n Bo and Cheung,\n Yiu-ming and Wang,\n Hanzi\n},\n title = {\n Label-Noise Learning with Intrinsically Long-Tailed Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1369-1378\n} \n}" }, { "title": "Landscape Learning for Neural Network Inversion", @@ -31152,7 +32201,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + 
"aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Ruoshi and Mao,\n Chengzhi and Tendulkar,\n Purva and Wang,\n Hao and Vondrick,\n Carl\n},\n title = {\n Landscape Learning for Neural Network Inversion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2239-2250\n} \n}" }, { "title": "Large Selective Kernel Network for Remote Sensing Object Detection", @@ -31175,7 +32225,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Large_Selective_Kernel_Network_for_Remote_Sensing_Object_Detection_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Large_Selective_Kernel_Network_for_Remote_Sensing_Object_Detection_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yuxuan and Hou,\n Qibin and Zheng,\n Zhaohui and Cheng,\n Ming-Ming and Yang,\n Jian and Li,\n Xiang\n},\n title = {\n Large Selective Kernel Network for Remote Sensing Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16794-16805\n} \n}" }, { "title": "Large-Scale Land Cover Mapping with Fine-Grained Classes via Class-Aware Semi-Supervised Semantic Segmentation", @@ -31187,7 +32238,7 @@ "author": "Runmin Dong; Lichao Mou; Mengxuan Chen; Weijia Li; Xin-Yi Tong; Shuai Yuan; Lixian Zhang; Juepeng Zheng; Xiaoxiang Zhu; Haohuan Fu", "abstract": "Semi-supervised learning has attracted increasing attention in the large-scale land cover mapping task. However, existing methods overlook the potential to alleviate the class imbalance problem by selecting a suitable set of unlabeled data. 
Besides, in class-imbalanced scenarios, existing pseudo-labeling methods mostly only pick confident samples, failing to exploit the hard samples during training. To tackle these issues, we propose a unified Class-Aware Semi-Supervised Semantic Segmentation framework. The proposed framework consists of three key components. To construct a better semi-supervised learning dataset, we propose a class-aware unlabeled data selection method that is more balanced towards the minority classes. Based on the built dataset with improved class balance, we propose a Class-Balanced Cross Entropy loss, jointly considering the annotation bias and the class bias to re-weight the loss in both sample and class levels to alleviate the class imbalance problem. Moreover, we propose the Class Center Contrast method to jointly utilize the labeled and unlabeled data. Specifically, we decompose the feature embedding space using the ground truth and pseudo-labels, and employ the embedding centers for hard and easy samples of each class per image in the contrast loss to exploit the hard samples during training. 
Compared with state-of-the-art class-balanced pseudo-labeling methods, the proposed method improves the mean accuracy and mIoU by 4.28% and 1.70%, respectively, on the large-scale Sentinel-2 dataset with 24 land cover classes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Dong_Large-Scale_Land_Cover_Mapping_with_Fine-Grained_Classes_via_Class-Aware_Semi-Supervised_ICCV_2023_paper.pdf", - "aff": "Tsinghua University; Technical University of Munich; Tsinghua University; Sun Yat-Sen University; Technical University of Munich; Tsinghua University - Xi\u2019an Institute of Surveying and Mapping Joint Research Center; Tsinghua University - Xi\u2019an Institute of Surveying and Mapping Joint Research Center; Sun Yat-Sen University; Technical University of Munich; Tsinghua University - Xi\u2019an Institute of Surveying and Mapping Joint Research Center", + "aff": "Tsinghua University; Technical University of Munich; Tsinghua University; Sun Yat-Sen University; Technical University of Munich; Tsinghua University - Xi’an Institute of Surveying and Mapping Joint Research Center; Tsinghua University - Xi’an Institute of Surveying and Mapping Joint Research Center; Sun Yat-Sen University; Technical University of Munich; Tsinghua University - Xi’an Institute of Surveying and Mapping Joint Research Center", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Dong_Large-Scale_Land_Cover_ICCV_2023_supplemental.pdf", @@ -31200,14 +32251,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_Large-Scale_Land_Cover_Mapping_with_Fine-Grained_Classes_via_Class-Aware_Semi-Supervised_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;1;0;0;2;1;0", - "aff_unique_norm": "Tsinghua University;Technical University of Munich;Sun Yat-sen University", + "aff_unique_norm": "Tsinghua University;Technical University of Munich;Sun Yat-Sen University", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.tsinghua.edu.cn;https://www.tum.de;http://www.sysu.edu.cn/", "aff_unique_abbr": "THU;TUM;SYSU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;1;0;0;1;0;0;0;1;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Runmin and Mou,\n Lichao and Chen,\n Mengxuan and Li,\n Weijia and Tong,\n Xin-Yi and Yuan,\n Shuai and Zhang,\n Lixian and Zheng,\n Juepeng and Zhu,\n Xiaoxiang and Fu,\n Haohuan\n},\n title = {\n Large-Scale Land Cover Mapping with Fine-Grained Classes via Class-Aware Semi-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16783-16793\n} \n}" }, { "title": "Large-Scale Person Detection and Localization Using Overhead Fisheye Cameras", @@ -31230,7 +32282,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Large-Scale_Person_Detection_and_Localization_Using_Overhead_Fisheye_Cameras_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Large-Scale_Person_Detection_and_Localization_Using_Overhead_Fisheye_Cameras_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Lu and Li,\n Liulei and Xin,\n Xueshi and Sun,\n Yifan and Song,\n Qing and Wang,\n Wenguan\n},\n title = {\n Large-Scale Person Detection and Localization Using Overhead Fisheye Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19961-19971\n} \n}" }, { "title": "Late Stopping: Avoiding Confidently Learning from Mislabeled Examples", @@ -31262,7 +32315,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;0", - "aff_country_unique": "Australia;Singapore" + "aff_country_unique": "Australia;Singapore", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Suqin and Feng,\n Lei and Liu,\n Tongliang\n},\n title = {\n Late Stopping: Avoiding Confidently Learning from Mislabeled Examples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16079-16088\n} \n}" }, { "title": "Latent-OFER: Detect, Mask, and Reconstruct with Latent Vectors for Occluded Facial Expression Recognition", @@ -31294,7 +32348,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Gwangju", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Isack and Lee,\n Eungi and Yoo,\n Seok Bong\n},\n title = {\n Latent-OFER: Detect,\n Mask,\n and Reconstruct with Latent Vectors for Occluded Facial Expression Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1536-1546\n} \n}" }, { "title": "LayoutDiffusion: Improving Graphic Layout Generation by Discrete Diffusion Probabilistic Models", @@ -31319,14 +32374,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_LayoutDiffusion_Improving_Graphic_Layout_Generation_by_Discrete_Diffusion_Probabilistic_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1", - "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft", + "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "SJTU;MSR Asia", "aff_campus_unique_index": "1;1;1;1", 
"aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Junyi and Guo,\n Jiaqi and Sun,\n Shizhao and Lou,\n Jian-Guang and Zhang,\n Dongmei\n},\n title = {\n LayoutDiffusion: Improving Graphic Layout Generation by Discrete Diffusion Probabilistic Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7226-7236\n} \n}" }, { "title": "LeaF: Learning Frames for 4D Point Cloud Sequence Understanding", @@ -31351,14 +32407,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_LeaF_Learning_Frames_for_4D_Point_Cloud_Sequence_Understanding_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0+2+3", - "aff_unique_norm": "Tsinghua University;Huawei;Shanghai Artificial Intelligence Laboratory;Shanghai Qi Zhi Institute", - "aff_unique_dep": ";Huawei Technologies Co., Ltd.;;", + "aff_unique_norm": "Tsinghua University;Huawei Technologies Co., Ltd.;Shanghai Artificial Intelligence Laboratory;Shanghai Qi Zhi Institute", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com;http://www.shailab.org/;https://www.qz.io", "aff_unique_abbr": "THU;Huawei;Shanghai AI Lab;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yunze and Chen,\n Junyu and Zhang,\n Zekai and Huang,\n Jingwei and Yi,\n Li\n},\n title = {\n LeaF: Learning Frames for 4D Point Cloud Sequence Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 604-613\n} \n}" }, { 
"title": "Leaping Into Memories: Space-Time Deep Feature Synthesis", @@ -31383,14 +32440,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Stergiou_Leaping_Into_Memories_Space-Time_Deep_Feature_Synthesis_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1", - "aff_unique_norm": "Vrije Universiteit Brussel;IMEC", + "aff_unique_norm": "Vrije Universiteit Brussel;imec", "aff_unique_dep": ";", "aff_unique_url": "https://www.vub.be;https://www.imec-int.com", "aff_unique_abbr": "VUB;imec", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Stergiou_2023_ICCV,\n \n author = {\n Stergiou,\n Alexandros and Deligiannis,\n Nikos\n},\n title = {\n Leaping Into Memories: Space-Time Deep Feature Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1966-1976\n} \n}" }, { "title": "Learn TAROT with MENTOR: A Meta-Learned Self-Supervised Approach for Trajectory Prediction", @@ -31402,7 +32460,7 @@ "author": "Mozhgan Pourkeshavarz; Changhe Chen; Amir Rasouli", "abstract": "Predicting diverse yet admissible trajectories that adhere to the map constraints is challenging. Graph-based scene encoders have been proven effective for preserving local structures of maps by defining lane-level connections. However, such encoders do not capture more complex patterns emerging from long-range heterogeneous connections between nonadjacent interacting lanes. To this end, we shed new light on learning common driving patterns by introducing meTA ROad paTh (TAROT) to formulate combinations of various relations between lanes on the road topology. Intuitively, this can be viewed as finding feasible routes. 
Furthermore, we propose MEta-road NeTwORk (MENTOR) that helps trajectory prediction by providing it with TAROT as navigation tips. More specifically, 1) we define TAROT prediction as a novel self-supervised proxy task to identify the complex heterogeneous structure of the map. 2) For typical driving actions, we establish several TAROTs that result in multiple Heterogeneous Structure Learning (HSL) tasks. These tasks are used in MENTOR, which performs meta-learning by simultaneously predicting trajectories along with proxy tasks, identifying an optimal combination of them, and automatically balancing them to improve the primary task. We show that our model achieves state-of-the-art performance on the Argoverse dataset, especially on diversity and admissibility metrics, achieving up to 20% improvements in challenging scenarios. We further investigate the contribution of proposed modules in ablation studies.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Pourkeshavarz_Learn_TAROT_with_MENTOR_A_Meta-Learned_Self-Supervised_Approach_for_Trajectory_ICCV_2023_paper.pdf", - "aff": "Noah\u2019s Ark Lab, Huawei; Noah\u2019s Ark Lab, Huawei; Noah\u2019s Ark Lab, Huawei", + "aff": "Noah’s Ark Lab, Huawei; Noah’s Ark Lab, Huawei; Noah’s Ark Lab, Huawei", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Pourkeshavarz_Learn_TAROT_with_ICCV_2023_supplemental.pdf", @@ -31416,13 +32474,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pourkeshavarz_Learn_TAROT_with_MENTOR_A_Meta-Learned_Self-Supervised_Approach_for_Trajectory_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Huawei", - "aff_unique_dep": "Noah\u2019s Ark Lab", + "aff_unique_dep": "Noah’s Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": 
"China", + "bibtex": "@InProceedings{Pourkeshavarz_2023_ICCV,\n \n author = {\n Pourkeshavarz,\n Mozhgan and Chen,\n Changhe and Rasouli,\n Amir\n},\n title = {\n Learn TAROT with MENTOR: A Meta-Learned Self-Supervised Approach for Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8384-8393\n} \n}" }, { "title": "Learned Compressive Representations for Single-Photon 3D Imaging", @@ -31454,7 +32513,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;1;0;1;1;0;0;0", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Gutierrez-Barragan_2023_ICCV,\n \n author = {\n Gutierrez-Barragan,\n Felipe and Mu,\n Fangzhou and Ardelean,\n Andrei and Ingle,\n Atul and Bruschini,\n Claudio and Charbon,\n Edoardo and Li,\n Yin and Gupta,\n Mohit and Velten,\n Andreas\n},\n title = {\n Learned Compressive Representations for Single-Photon 3D Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10756-10766\n} \n}" }, { "title": "Learned Image Reasoning Prior Penetrates Deep Unfolding Network for Panchromatic and Multi-spectral Image Fusion", @@ -31486,7 +32546,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Man and Huang,\n Jie and Zheng,\n Naishan and Li,\n Chongyi\n},\n title = {\n Learned Image Reasoning Prior Penetrates Deep Unfolding Network for Panchromatic and Multi-spectral Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12398-12407\n} \n}" }, { "title": "Learning Adaptive Neighborhoods for Graph Neural Networks", @@ -31518,7 +32579,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Guildford;Oxford", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Saha_2023_ICCV,\n \n author = {\n Saha,\n Avishkar and Mendez,\n Oscar and Russell,\n Chris and Bowden,\n Richard\n},\n title = {\n Learning Adaptive Neighborhoods for Graph Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22541-22550\n} \n}" }, { "title": "Learning Clothing and Pose Invariant 3D Shape Representation for Long-Term Person Re-Identification", @@ -31550,7 +32612,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Feng and Kim,\n Minchul and Gu,\n ZiAng and Jain,\n Anil and Liu,\n Xiaoming\n},\n title = {\n Learning Clothing and Pose Invariant 3D Shape Representation for Long-Term Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19617-19626\n} \n}" }, { "title": "Learning Concise and Descriptive Attributes for Visual Recognition", @@ -31582,7 +32645,8 @@ "aff_campus_unique_index": "0;0;1+0;0;0;2;2;0;0", "aff_campus_unique": "San Diego;Madison;Santa Barbara", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n 
Yan,\n An and Wang,\n Yu and Zhong,\n Yiwu and Dong,\n Chengyu and He,\n Zexue and Lu,\n Yujie and Wang,\n William Yang and Shang,\n Jingbo and McAuley,\n Julian\n},\n title = {\n Learning Concise and Descriptive Attributes for Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3090-3100\n} \n}" }, { "title": "Learning Concordant Attention via Target-aware Alignment for Visible-Infrared Person Re-identification", @@ -31594,7 +32658,7 @@ "author": "Jianbing Wu; Hong Liu; Yuxin Su; Wei Shi; Hao Tang", "abstract": "Owing to the large distribution gap between the heterogeneous data in Visible-Infrared Person Re-identification (VI Re-ID), we point out that existing paradigms often suffer from the inter-modal semantic misalignment issue and thus fail to align and compare local details properly. In this paper, we present Concordant Attention Learning (CAL), a novel framework that learns semantic-aligned representations for VI Re-ID. Specifically, we design the Target-aware Concordant Alignment paradigm, which allows target-aware attention adaptation when aligning heterogeneous samples (i.e., adaptive attention adjustment according to the target image being aligned). This is achieved by exploiting the discriminative clues from the modality counterpart and designing effective modality-agnostic correspondence searching strategies. To ensure semantic concordance during the cross-modal retrieval stage, we further propose MatchDistill, which matches the attention patterns across modalities and learns their underlying semantic correlations by bipartite-graph-based similarity modeling and cross-modal knowledge exchange. 
Extensive experiments on VI Re-ID benchmark datasets demonstrate the effectiveness and superiority of the proposed CAL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wu_Learning_Concordant_Attention_via_Target-aware_Alignment_for_Visible-Infrared_Person_Re-identification_ICCV_2023_paper.pdf", - "aff": "Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Computer Vision Lab, ETH Z\u00fcrich, Switzerland", + "aff": "Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Key Laboratory of Machine Perception, Shenzhen Graduate School, Peking University, China; Computer Vision Lab, ETH Zürich, Switzerland", "project": "", "github": "", "supp": "", @@ -31607,14 +32671,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Learning_Concordant_Attention_via_Target-aware_Alignment_for_Visible-Infrared_Person_Re-identification_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1", - "aff_unique_norm": "Peking University;ETH Zurich", + "aff_unique_norm": "Peking University;ETH Zürich", "aff_unique_dep": "Key Laboratory of Machine Perception;Computer Vision Lab", "aff_unique_url": "http://www.pku.edu.cn;https://www.ethz.ch", "aff_unique_abbr": "PKU;ETHZ", - "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Shenzhen Graduate School;", + "aff_campus_unique_index": "0;0;0;0;1", + "aff_campus_unique": "Shenzhen Graduate School;Zürich", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": 
"China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Jianbing and Liu,\n Hong and Su,\n Yuxin and Shi,\n Wei and Tang,\n Hao\n},\n title = {\n Learning Concordant Attention via Target-aware Alignment for Visible-Infrared Person Re-identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11122-11131\n} \n}" }, { "title": "Learning Continuous Exposure Value Representations for Single-Image HDR Reconstruction", @@ -31637,7 +32702,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Learning_Continuous_Exposure_Value_Representations_for_Single-Image_HDR_Reconstruction_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Learning_Continuous_Exposure_Value_Representations_for_Single-Image_HDR_Reconstruction_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Su-Kai and Yen,\n Hung-Lin and Liu,\n Yu-Lun and Chen,\n Min-Hung and Hu,\n Hou-Ning and Peng,\n Wen-Hsiao and Lin,\n Yen-Yu\n},\n title = {\n Learning Continuous Exposure Value Representations for Single-Image HDR Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12990-13000\n} \n}" }, { "title": "Learning Correction Filter via Degradation-Adaptive Regression for Blind Single Image Super-Resolution", @@ -31669,7 +32735,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Hongyang and Zhu,\n Xiaobin and Zhu,\n Jianqing and Han,\n Zheng and Zhang,\n 
Shi-Xue and Qin,\n    Jingyan and Yin,\n    Xu-Cheng\n},\n    title = {\n    Learning Correction Filter via Degradation-Adaptive Regression for Blind Single Image Super-Resolution\n},\n    booktitle = {\n    Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n    month = {\n    October\n},\n    year = {\n    2023\n},\n    pages = {\n    12365-12375\n} \n}" }, { "title": "Learning Cross-Modal Affinity for Referring Video Object Segmentation Targeting Limited Samples", @@ -31696,12 +32763,13 @@ "aff_unique_index": "0;1+2;0;3;1", "aff_unique_norm": "Anhui University of Technology;Southern University of Science and Technology;University of Warwick;United Imaging", "aff_unique_dep": ";;;", - "aff_unique_url": "http://www.ahtu.edu.cn;https://www.sustech.edu.cn;https://www.warwick.ac.uk;", + "aff_unique_url": "http://www.ahut.edu.cn;https://www.sustech.edu.cn;https://www.warwick.ac.uk;https://www.united-imaging.com", "aff_unique_abbr": ";SUSTech;Warwick;", "aff_campus_unique_index": "", "aff_campus_unique": "", - "aff_country_unique_index": "0;0+1;0;0", - "aff_country_unique": "China;United Kingdom;" + "aff_country_unique_index": "0;0+1;0;0;0", + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n    Guanghui and Gao,\n    Mingqi and Liu,\n    Heng and Zhen,\n    Xiantong and Zheng,\n    Feng\n},\n    title = {\n    Learning Cross-Modal Affinity for Referring Video Object Segmentation Targeting Limited Samples\n},\n    booktitle = {\n    Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n    month = {\n    October\n},\n    year = {\n    2023\n},\n    pages = {\n    2684-2693\n} \n}" }, { "title": "Learning Cross-Representation Affinity Consistency for Sparsely Supervised Biomedical Instance Segmentation", @@ -31733,7 +32801,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;0;0+0;0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xiaoyu and Huang,\n Wei and Xiong,\n Zhiwei and Zhou,\n Shenglong and Zhang,\n Yueyi and Chen,\n Xuejin and Zha,\n Zheng-Jun and Wu,\n Feng\n},\n title = {\n Learning Cross-Representation Affinity Consistency for Sparsely Supervised Biomedical Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21107-21117\n} \n}" }, { "title": "Learning Data-Driven Vector-Quantized Degradation Model for Animation Video Super-Resolution", @@ -31745,7 +32814,7 @@ "author": "Zixi Tuo; Huan Yang; Jianlong Fu; Yujie Dun; Xueming Qian", "abstract": "Existing real-world video super-resolution (VSR) methods focus on designing a general degradation pipeline for open-domain videos while ignoring data intrinsic characteristics which strongly limit their performance when applying to some specific domains (e.g., animation videos). In this paper, we thoroughly explore the characteristics of animation videos and leverage the rich priors in real-world animation data for a more practical animation VSR model. In particular, we propose a multi-scale Vector-Quantized Degradation model for animation video Super-Resolution (VQD-SR) to decompose the local details from global structures and transfer the degradation priors in real-world animation videos to a learned vector-quantized codebook for degradation modeling. A rich-content Real Animation Low-quality (RAL) video dataset is collected for extracting the priors. We further propose a data enhancement strategy for high-resolution (HR) training videos based on our observation that existing HR videos are mostly collected from the Web which contains conspicuous compression artifacts. The proposed strategy is valid to lift the upper bound of animation VSR performance, regardless of the specific VSR model. 
Experimental results demonstrate the superiority of the proposed VQD-SR over state-of-the-art methods, through extensive quantitative and qualitative evaluations of the latest animation video super-resolution benchmark. The code and pre-trained models can be downloaded at https://github.com/researchmm/VQD-SR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tuo_Learning_Data-Driven_Vector-Quantized_Degradation_Model_for_Animation_Video_Super-Resolution_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University+Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", + "aff": "Xi’an Jiaotong University; Microsoft Research Asia; Microsoft Research Asia; Xi’an Jiaotong University; Xi’an Jiaotong University+Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", "project": "", "github": "https://github.com/researchmm/VQD-SR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Tuo_Learning_Data-Driven_Vector-Quantized_ICCV_2023_supplemental.pdf", @@ -31758,14 +32827,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tuo_Learning_Data-Driven_Vector-Quantized_Degradation_Model_for_Animation_Video_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;0+2", - "aff_unique_norm": "Xi'an Jiao Tong University;Microsoft;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", + "aff_unique_norm": "Xi'an Jiaotong University;Microsoft Research;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia;", "aff_unique_abbr": "XJTU;MSR Asia;", "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": 
"China", + "bibtex": "@InProceedings{Tuo_2023_ICCV,\n \n author = {\n Tuo,\n Zixi and Yang,\n Huan and Fu,\n Jianlong and Dun,\n Yujie and Qian,\n Xueming\n},\n title = {\n Learning Data-Driven Vector-Quantized Degradation Model for Animation Video Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13179-13189\n} \n}" }, { "title": "Learning Depth Estimation for Transparent and Mirror Surfaces", @@ -31788,7 +32858,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Costanzino_Learning_Depth_Estimation_for_Transparent_and_Mirror_Surfaces_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Costanzino_Learning_Depth_Estimation_for_Transparent_and_Mirror_Surfaces_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Costanzino_2023_ICCV,\n \n author = {\n Costanzino,\n Alex and Ramirez,\n Pierluigi Zama and Poggi,\n Matteo and Tosi,\n Fabio and Mattoccia,\n Stefano and Di Stefano,\n Luigi\n},\n title = {\n Learning Depth Estimation for Transparent and Mirror Surfaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9244-9255\n} \n}" }, { "title": "Learning Fine-Grained Features for Pixel-Wise Video Correspondences", @@ -31820,7 +32891,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hefei", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Rui and Zhou,\n Shenglong and Liu,\n Dong\n},\n title = {\n Learning Fine-Grained Features for Pixel-Wise Video Correspondences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2023\n},\n pages = {\n 9632-9641\n} \n}" }, { "title": "Learning Foresightful Dense Visual Affordance for Deformable Object Manipulation", @@ -31852,7 +32924,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Ruihai and Ning,\n Chuanruo and Dong,\n Hao\n},\n title = {\n Learning Foresightful Dense Visual Affordance for Deformable Object Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10947-10956\n} \n}" }, { "title": "Learning Gabor Texture Features for Fine-Grained Recognition", @@ -31884,7 +32957,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;0", - "aff_country_unique": "Singapore;China;United States" + "aff_country_unique": "Singapore;China;United States", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Lanyun and Chen,\n Tianrun and Yin,\n Jianxiong and See,\n Simon and Liu,\n Jun\n},\n title = {\n Learning Gabor Texture Features for Fine-Grained Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1621-1631\n} \n}" }, { "title": "Learning Global-aware Kernel for Image Harmonization", @@ -31916,7 +32990,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shen_2023_ICCV,\n \n author = {\n Shen,\n Xintian and Zhang,\n Jiangning and Chen,\n Jun and Bai,\n Shipeng and Han,\n Yue and Wang,\n Yabiao and Wang,\n Chengjie and Liu,\n Yong\n},\n title = {\n Learning Global-aware Kernel for Image 
Harmonization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7535-7544\n} \n}" }, { "title": "Learning Hierarchical Features with Joint Latent Space Energy-Based Prior", @@ -31948,7 +33023,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cui_2023_ICCV,\n \n author = {\n Cui,\n Jiali and Wu,\n Ying Nian and Han,\n Tian\n},\n title = {\n Learning Hierarchical Features with Joint Latent Space Energy-Based Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2218-2227\n} \n}" }, { "title": "Learning Human Dynamics in Autonomous Driving Scenarios", @@ -31971,7 +33047,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Learning_Human_Dynamics_in_Autonomous_Driving_Scenarios_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Learning_Human_Dynamics_in_Autonomous_Driving_Scenarios_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jingbo and Yuan,\n Ye and Luo,\n Zhengyi and Xie,\n Kevin and Lin,\n Dahua and Iqbal,\n Umar and Fidler,\n Sanja and Khamis,\n Sameh\n},\n title = {\n Learning Human Dynamics in Autonomous Driving Scenarios\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20796-20806\n} \n}" }, { "title": "Learning Human-Human Interactions in Images from Weak Textual Supervision", @@ -32003,7 +33080,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Alper_2023_ICCV,\n \n author = {\n Alper,\n Morris and Averbuch-Elor,\n Hadar\n},\n title = {\n Learning Human-Human Interactions in Images from Weak Textual Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2887-2899\n} \n}" }, { "title": "Learning Image Harmonization in the Linear Color Space", @@ -32035,7 +33113,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Ke and Hancke,\n Gerhard Petrus and Lau,\n Rynson W.H.\n},\n title = {\n Learning Image Harmonization in the Linear Color Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12570-12579\n} \n}" }, { "title": "Learning Image-Adaptive Codebooks for Class-Agnostic Image Restoration", @@ -32060,14 +33139,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Learning_Image-Adaptive_Codebooks_for_Class-Agnostic_Image_Restoration_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+0;1", - "aff_unique_norm": "University of Washington;Chinese University of Hong Kong;SenseBrain", + "aff_unique_norm": "University of Washington;The Chinese University of Hong Kong;SenseBrain", "aff_unique_dep": ";;", "aff_unique_url": "https://www.washington.edu;https://www.cuhk.edu.hk;", "aff_unique_abbr": "UW;CUHK;", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "United States;China;" + "aff_country_unique": "United States;China;", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n 
\n author = {\n Liu,\n Kechun and Jiang,\n Yitong and Choi,\n Inchang and Gu,\n Jinwei\n},\n title = {\n Learning Image-Adaptive Codebooks for Class-Agnostic Image Restoration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5373-5383\n} \n}" }, { "title": "Learning Long-Range Information with Dual-Scale Transformers for Indoor Scene Completion", @@ -32075,6 +33155,7 @@ "status": "Poster", "track": "main", "pid": "4685", + "author_site": "Ziqi Wang, Fei Luo, Xiaoxiao Long, Wenxiao Zhang, Chunxia Xiao", "author": "Ziqi Wang, Fei Luo, Xiaoxiao Long, Wenxiao Zhang, Chunxia Xiao", "abstract": "Due to the limited resolution of 3D sensors and the inevitable mutual occlusion between objects, 3D scans of real scenes are commonly incomplete. \n Previous scene completion methods struggle to capture long-range spatial feature, resulting in unsatisfactory completion results. \n To alleviate the problem, we propose a novel Dual-Scale Transformer Network (DST-Net) that efficiently utilizes both long-range and short-range spatial context information to improve the quality of 3D scene completion. \n To reduce the heavy computation cost of extracting long-range features via transformers, DST-Net adopts a self-supervised two-stage completion strategy. In the first stage, we split the input scene into blocks, and perform completion on individual blocks. 
In the second stage, the blocks are merged together as a whole and then further refined to improve completeness.\n More importantly, we propose a contrastive attention training strategy to encourage the transformers to learn distinguishable features for better scene completion.\n Experiments on datasets of Matterport3D, ScanNet, and ICL-NUIM demonstrate that our method can generate better completion results, and our method outperforms the state-of-the-art methods quantitatively and qualitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_Learning_Long-Range_Information_with_Dual-Scale_Transformers_for_Indoor_Scene_Completion_ICCV_2023_paper.pdf", @@ -32086,7 +33167,8 @@ "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3845392472293560523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Learning_Long-Range_Information_with_Dual-Scale_Transformers_for_Indoor_Scene_Completion_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Learning_Long-Range_Information_with_Dual-Scale_Transformers_for_Indoor_Scene_Completion_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ziqi and Luo,\n Fei and Long,\n Xiaoxiao and Zhang,\n Wenxiao and Xiao,\n Chunxia\n},\n title = {\n Learning Long-Range Information with Dual-Scale Transformers for Indoor Scene Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18569-18579\n} \n}" }, { "title": "Learning Navigational Visual Representations with Semantic Map Supervision", @@ -32118,7 +33200,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;0;1;0", - "aff_country_unique": "United States;Australia" + "aff_country_unique": "United States;Australia", + "bibtex": 
"@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Yicong and Zhou,\n Yang and Zhang,\n Ruiyi and Dernoncourt,\n Franck and Bui,\n Trung and Gould,\n Stephen and Tan,\n Hao\n},\n title = {\n Learning Navigational Visual Representations with Semantic Map Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3055-3067\n} \n}" }, { "title": "Learning Neural Eigenfunctions for Unsupervised Semantic Segmentation", @@ -32130,7 +33213,7 @@ "author": "Zhijie Deng; Yucen Luo", "abstract": "Unsupervised semantic segmentation is a long-standing challenge in computer vision with great significance. Spectral clustering is a theoretically grounded solution to it where the spectral embeddings for pixels are computed to construct distinct clusters. Despite recent progress in enhancing spectral clustering with powerful pre-trained models, current approaches still suffer from inefficiencies in spectral decomposition and inflexibility in applying them to the test data. This work addresses these issues by casting spectral clustering as a parametric approach that employs neural network-based eigenfunctions to produce spectral embeddings. The outputs of the neural eigenfunctions are further restricted to discrete vectors that indicate clustering assignments directly. As a result, an end-to-end NN-based paradigm of spectral clustering emerges. In practice, the neural eigenfunctions are lightweight and take the features from pre-trained models as inputs, improving training efficiency and unleashing the potential of pre-trained models for dense prediction. We conduct extensive empirical studies to validate the effectiveness of our approach and observe significant performance gains over competitive baselines on Pascal Context, Cityscapes, and ADE20K benchmarks. 
The code is available at https://github.com/thudzj/NeuralEigenfunctionSegmentor.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Deng_Learning_Neural_Eigenfunctions_for_Unsupervised_Semantic_Segmentation_ICCV_2023_paper.pdf", - "aff": "Shanghai Jiao Tong University, Shanghai, China; Max Planck Institute for Intelligent Systems, T\u00a8ubingen, Germany", + "aff": "Shanghai Jiao Tong University, Shanghai, China; Max Planck Institute for Intelligent Systems, Tübingen, Germany", "project": "", "github": "https://github.com/thudzj/NeuralEigenfunctionSegmentor", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Deng_Learning_Neural_Eigenfunctions_ICCV_2023_supplemental.pdf", @@ -32148,9 +33231,10 @@ "aff_unique_url": "https://www.sjtu.edu.cn;https://www.mpi-is.mpg.de", "aff_unique_abbr": "SJTU;MPI-IS", "aff_campus_unique_index": "0;1", - "aff_campus_unique": "Shanghai;T\u00fcbingen", + "aff_campus_unique": "Shanghai;Tübingen", "aff_country_unique_index": "0;1", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n    Zhijie and Luo,\n    Yucen\n},\n    title = {\n    Learning Neural Eigenfunctions for Unsupervised Semantic Segmentation\n},\n    booktitle = {\n    Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n    month = {\n    October\n},\n    year = {\n    2023\n},\n    pages = {\n    551-561\n} \n}" }, { "title": "Learning Neural Implicit Surfaces with Object-Aware Radiance Fields", @@ -32182,7 +33266,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n    Yiheng and Qiu,\n    Zhaofan and Pan,\n    Yingwei and Yao,\n    Ting and Mei,\n    Tao\n},\n    title = {\n    Learning Neural Implicit Surfaces with Object-Aware Radiance Fields\n},\n    
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17893-17902\n} \n}" }, { "title": "Learning Non-Local Spatial-Angular Correlation for Light Field Image Super-Resolution", @@ -32209,12 +33294,13 @@ "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "National University of Defense Technology;Aviation University of Air Force", "aff_unique_dep": ";", - "aff_unique_url": "http://www.nudt.edu.cn/;", - "aff_unique_abbr": "NUDT;", + "aff_unique_url": "http://www.nudt.edu.cn/;http://www.auaf.edu.cn", + "aff_unique_abbr": "NUDT;AUAF", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Zhengyu and Wang,\n Yingqian and Wang,\n Longguang and Yang,\n Jungang and Zhou,\n Shilin and Guo,\n Yulan\n},\n title = {\n Learning Non-Local Spatial-Angular Correlation for Light Field Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12376-12386\n} \n}" }, { "title": "Learning Optical Flow from Event Camera with Rendered Dataset", @@ -32246,7 +33332,8 @@ "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Xinglong and Luo,\n Kunming and Luo,\n Ao and Wang,\n Zhengning and Tan,\n Ping and Liu,\n Shuaicheng\n},\n title = {\n Learning Optical Flow from Event Camera with Rendered Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9847-9857\n} 
\n}" }, { "title": "Learning Point Cloud Completion without Complete Point Clouds: A Pose-Aware Approach", @@ -32269,7 +33356,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Learning_Point_Cloud_Completion_without_Complete_Point_Clouds_A_Pose-Aware_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Learning_Point_Cloud_Completion_without_Complete_Point_Clouds_A_Pose-Aware_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Jihun and Kweon,\n Hyeokjun and Yang,\n Yunseo and Yoon,\n Kuk-Jin\n},\n title = {\n Learning Point Cloud Completion without Complete Point Clouds: A Pose-Aware Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14203-14213\n} \n}" }, { "title": "Learning Pseudo-Relations for Cross-domain Semantic Segmentation", @@ -32301,7 +33389,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Shaanxi", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Dong and Wang,\n Shuang and Zang,\n Qi and Quan,\n Dou and Ye,\n Xiutiao and Yang,\n Rui and Jiao,\n Licheng\n},\n title = {\n Learning Pseudo-Relations for Cross-domain Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19191-19203\n} \n}" }, { "title": "Learning Rain Location Prior for Nighttime Deraining", @@ -32333,7 +33422,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;Netherlands;" + "aff_country_unique": "China;Netherlands;", + "bibtex": 
"@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Fan and You,\n Shaodi and Li,\n Yu and Fu,\n Ying\n},\n title = {\n Learning Rain Location Prior for Nighttime Deraining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13148-13157\n} \n}" }, { "title": "Learning Robust Representations with Information Bottleneck and Memory Network for RGB-D-based Gesture Recognition", @@ -32345,7 +33435,7 @@ "author": "Yunan Li; Huizhou Chen; Guanwen Feng; Qiguang Miao", "abstract": "Although previous RGB-D-based gesture recognition methods have shown promising performance, researchers often overlook the interference of task-irrelevant cues like illumination and background. These unnecessary factors are learned together with the predictive ones by the network and hinder accurate recognition. In this paper, we propose a convenient and analytical framework to learn a robust feature representation that is impervious to gesture-irrelevant factors. Based on the Information Bottleneck theory, two rules of Sufficiency and Compactness are derived to develop a new information-theoretic loss function, which cultivates a more sufficient and compact representation from the feature encoding and mitigates the impact of gesture-irrelevant information. To highlight the predictive information, we further integrate a memory network. Using our proposed content-based and contextual memory addressing scheme, we weaken the nuisances while preserving the task-relevant information, providing guidance for refining the feature representation. 
Experiments conducted on three public datasets demonstrate that our approach leads to a better feature representation and achieves better performance than state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_Learning_Robust_Representations_with_Information_Bottleneck_and_Memory_Network_for_ICCV_2023_paper.pdf", - "aff": "1School of Computer Science and Technology, Xidian University, China+2Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China+3Key Laboratory of Smart Human-Computer Interaction and Wearable Technology of Shaanxi Province, China; 1School of Computer Science and Technology, Xidian University, China+2Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China; 1School of Computer Science and Technology, Xidian University, China+2Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China; 1School of Computer Science and Technology, Xidian University, China+2Xi\u2019an Key Laboratory of Big Data and Intelligent Vision, China+3Key Laboratory of Smart Human-Computer Interaction and Wearable Technology of Shaanxi Province, China", + "aff": "1School of Computer Science and Technology, Xidian University, China+2Xi’an Key Laboratory of Big Data and Intelligent Vision, China+3Key Laboratory of Smart Human-Computer Interaction and Wearable Technology of Shaanxi Province, China; 1School of Computer Science and Technology, Xidian University, China+2Xi’an Key Laboratory of Big Data and Intelligent Vision, China; 1School of Computer Science and Technology, Xidian University, China+2Xi’an Key Laboratory of Big Data and Intelligent Vision, China; 1School of Computer Science and Technology, Xidian University, China+2Xi’an Key Laboratory of Big Data and Intelligent Vision, China+3Key Laboratory of Smart Human-Computer Interaction and Wearable Technology of Shaanxi Province, China", "project": "", "github": "https://github.com/Carpumpkin/InBoMem", "supp": 
"https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_Learning_Robust_Representations_ICCV_2023_supplemental.pdf", @@ -32365,7 +33455,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yunan and Chen,\n Huizhou and Feng,\n Guanwen and Miao,\n Qiguang\n},\n title = {\n Learning Robust Representations with Information Bottleneck and Memory Network for RGB-D-based Gesture Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20968-20978\n} \n}" }, { "title": "Learning Semi-supervised Gaussian Mixture Models for Generalized Category Discovery", @@ -32390,14 +33481,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Learning_Semi-supervised_Gaussian_Mixture_Models_for_Generalized_Category_Discovery_ICCV_2023_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "University of Edinburgh;University of Hong Kong", + "aff_unique_norm": "University of Edinburgh;The University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.ed.ac.uk;https://www.hku.hk", "aff_unique_abbr": "Edinburgh;HKU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Bingchen and Wen,\n Xin and Han,\n Kai\n},\n title = {\n Learning Semi-supervised Gaussian Mixture Models for Generalized Category Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16623-16633\n} \n}" }, { 
"title": "Learning Shape Primitives via Implicit Convexity Regularization", @@ -32429,7 +33521,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Xiaoyang and Zhang,\n Yi and Chen,\n Kai and Li,\n Teng and Zhang,\n Wenjun and Ni,\n Bingbing\n},\n title = {\n Learning Shape Primitives via Implicit Convexity Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3642-3651\n} \n}" }, { "title": "Learning Spatial-context-aware Global Visual Feature Representation for Instance Image Retrieval", @@ -32461,7 +33554,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Zhongyan and Wang,\n Lei and Zhou,\n Luping and Koniusz,\n Piotr\n},\n title = {\n Learning Spatial-context-aware Global Visual Feature Representation for Instance Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11250-11259\n} \n}" }, { "title": "Learning Support and Trivial Prototypes for Interpretable Image Classification", @@ -32484,7 +33578,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Learning_Support_and_Trivial_Prototypes_for_Interpretable_Image_Classification_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Learning_Support_and_Trivial_Prototypes_for_Interpretable_Image_Classification_ICCV_2023_paper.html", + "bibtex": 
"@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Chong and Liu,\n Yuyuan and Chen,\n Yuanhong and Liu,\n Fengbei and Tian,\n Yu and McCarthy,\n Davis and Frazer,\n Helen and Carneiro,\n Gustavo\n},\n title = {\n Learning Support and Trivial Prototypes for Interpretable Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2062-2072\n} \n}" }, { "title": "Learning Symmetry-Aware Geometry Correspondences for 6D Object Pose Estimation", @@ -32496,7 +33591,7 @@ "author": "Heng Zhao; Shenxing Wei; Dahu Shi; Wenming Tan; Zheyang Li; Ye Ren; Xing Wei; Yi Yang; Shiliang Pu", "abstract": "Current 6D pose estimation methods focus on handling objects that are previously trained, which limits their applications in real dynamic world. To this end, we propose a geometry correspondence-based framework, termed GCPose, to estimate 6D pose of arbitrary unseen objects without any re-training. Specifically, the proposed method draws the idea from point cloud registration and resorts to object-agnostic geometry features to establish the 3D-3D correspondences between the object-scene point cloud and object-model point cloud. Then the 6D pose parameters are solved by a least-squares fitting algorithm. Taking the symmetry properties of objects into consideration, we design a symmetry-aware matching loss to facilitate the learning of dense point-wise geometry features and improve the performance considerably. Moreover, we introduce an online training data generation with special data augmentation and normalization to empower the network to learn diverse geometry prior. With training on synthetic objects from ShapeNet, our method outperforms previous approaches for unseen object pose estimation by a large margin on T-LESS, LINEMOD, Occluded-LINEMOD, and TUD-L datasets. 
Code is available at https://github.com/hikvision-research/GCPose.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhao_Learning_Symmetry-Aware_Geometry_Correspondences_for_6D_Object_Pose_Estimation_ICCV_2023_paper.pdf", - "aff": "Hikvision Research Institute; Xi\u2019an Jiaotong University; Hikvision Research Institute + Zhejiang University; Hikvision Research Institute; Hikvision Research Institute; Hikvision Research Institute; Xi\u2019an Jiaotong University; Zhejiang University; Hikvision Research Institute", + "aff": "Hikvision Research Institute; Xi’an Jiaotong University; Hikvision Research Institute + Zhejiang University; Hikvision Research Institute; Hikvision Research Institute; Hikvision Research Institute; Xi’an Jiaotong University; Zhejiang University; Hikvision Research Institute", "project": "", "github": "https://github.com/hikvision-research/GCPose", "supp": "", @@ -32509,14 +33604,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Learning_Symmetry-Aware_Geometry_Correspondences_for_6D_Object_Pose_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;1;0+2;0;0;0;1;2;0", - "aff_unique_norm": "Hikvision Research Institute;Xi'an Jiao Tong University;Zhejiang University", + "aff_unique_norm": "Hikvision Research Institute;Xi'an Jiaotong University;Zhejiang University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hikvision.com/cn/;https://www.xjtu.edu.cn;https://www.zju.edu.cn", "aff_unique_abbr": "Hikvision;XJTU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Heng and Wei,\n Shenxing and Shi,\n Dahu and Tan,\n Wenming and Li,\n Zheyang and Ren,\n Ye and Wei,\n Xing and Yang,\n Yi and Pu,\n Shiliang\n},\n title = {\n Learning Symmetry-Aware Geometry Correspondences for 6D Object Pose 
Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14045-14054\n} \n}" }, { "title": "Learning Trajectory-Word Alignments for Video-Language Tasks", @@ -32548,7 +33644,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;1;0;0;0;0+0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Xu and Li,\n Zhangzikang and Xu,\n Haiyang and Zhang,\n Hanwang and Ye,\n Qinghao and Li,\n Chenliang and Yan,\n Ming and Zhang,\n Yu and Huang,\n Fei and Huang,\n Songfang\n},\n title = {\n Learning Trajectory-Word Alignments for Video-Language Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2504-2514\n} \n}" }, { "title": "Learning Unified Decompositional and Compositional NeRF for Editable Novel View Synthesis", @@ -32580,7 +33677,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yuxin and Wu,\n Wayne and Xu,\n Dan\n},\n title = {\n Learning Unified Decompositional and Compositional NeRF for Editable Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18247-18256\n} \n}" }, { "title": "Learning Versatile 3D Shape Generation with Improved Auto-regressive Models", @@ -32606,13 +33704,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Luo_Learning_Versatile_3D_Shape_Generation_with_Improved_Auto-regressive_Models_ICCV_2023_paper.html", 
"aff_unique_index": "0;0;0+1;2;1;1;1;0", "aff_unique_norm": "Fudan University;Tencent;Google", - "aff_unique_dep": ";Youtu Lab;Google", + "aff_unique_dep": ";Youtu Lab;", "aff_unique_url": "https://www.fudan.edu.cn;https://www.tencent.com;https://www.google.com", "aff_unique_abbr": "Fudan;Tencent;Google", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0+0;1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Simian and Qian,\n Xuelin and Fu,\n Yanwei and Zhang,\n Yinda and Tai,\n Ying and Zhang,\n Zhenyu and Wang,\n Chengjie and Xue,\n Xiangyang\n},\n title = {\n Learning Versatile 3D Shape Generation with Improved Auto-regressive Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14139-14149\n} \n}" }, { "title": "Learning Vision-and-Language Navigation from YouTube Videos", @@ -32644,7 +33743,8 @@ "aff_campus_unique_index": ";1;2;;1", "aff_campus_unique": ";Amherst;Shenzhen", "aff_country_unique_index": "0+0;0;1+1;0;0+0;1+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Kunyang and Chen,\n Peihao and Huang,\n Diwei and Li,\n Thomas H. 
and Tan,\n Mingkui and Gan,\n Chuang\n},\n title = {\n Learning Vision-and-Language Navigation from YouTube Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8317-8326\n} \n}" }, { "title": "Learning a More Continuous Zero Level Set in Unsigned Distance Fields through Level Set Projection", @@ -32676,7 +33776,8 @@ "aff_campus_unique_index": "0;0+0;0;0;1", "aff_campus_unique": "Beijing;Detroit", "aff_country_unique_index": "0;0+0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Junsheng and Ma,\n Baorui and Li,\n Shujuan and Liu,\n Yu-Shen and Han,\n Zhizhong\n},\n title = {\n Learning a More Continuous Zero Level Set in Unsigned Distance Fields through Level Set Projection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3181-3192\n} \n}" }, { "title": "Learning a Room with the Occ-SDF Hybrid: Signed Distance Function Mingled with Occupancy Aids Scene Representation", @@ -32699,7 +33800,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lyu_Learning_a_Room_with_the_Occ-SDF_Hybrid_Signed_Distance_Function_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lyu_Learning_a_Room_with_the_Occ-SDF_Hybrid_Signed_Distance_Function_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lyu_2023_ICCV,\n \n author = {\n Lyu,\n Xiaoyang and Dai,\n Peng and Li,\n Zizhang and Yan,\n Dongyu and Lin,\n Yi and Peng,\n Yifan and Qi,\n Xiaojuan\n},\n title = {\n Learning a Room with the Occ-SDF Hybrid: Signed Distance Function Mingled with Occupancy Aids Scene Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8940-8950\n} \n}" }, { "title": "Learning by Sorting: Self-supervised Learning with Group Ordering Constraints", @@ -32731,7 +33833,8 @@ "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Frankfurt;;Stanford", "aff_country_unique_index": "0+0+1;1;0;0;0+0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Shvetsova_2023_ICCV,\n \n author = {\n Shvetsova,\n Nina and Petersen,\n Felix and Kukleva,\n Anna and Schiele,\n Bernt and Kuehne,\n Hilde\n},\n title = {\n Learning by Sorting: Self-supervised Learning with Group Ordering Constraints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16453-16463\n} \n}" }, { "title": "Learning from Noisy Data for Semi-Supervised 3D Object Detection", @@ -32763,7 +33866,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Harbin", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zehui and Li,\n Zhenyu and Wang,\n Shuo and Fu,\n Dengpan and Zhao,\n Feng\n},\n title = {\n Learning from Noisy Data for Semi-Supervised 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6929-6939\n} \n}" }, { "title": "Learning from Noisy Pseudo Labels for Semi-Supervised Temporal Action Localization", @@ -32775,7 +33879,7 @@ "author": "Kun Xia; Le Wang; Sanping Zhou; Gang Hua; Wei Tang", "abstract": "Semi-Supervised Temporal Action Localization (SS-TAL) aims to improve the generalization ability of action detectors with large-scale unlabeled videos. 
Albeit the recent advancement, one of the major challenges still remains: noisy pseudo labels hinder efficient learning on abundant unlabeled videos, embodied as location biases and category errors. In this paper, we dive deep into such an important but understudied dilemma. To this end, we propose a unified framework, termed Noisy Pseudo-Label Learning, to handle both location biases and category errors. Specifically, our method is featured with (1) Noisy Label Ranking to rank pseudo labels based on the semantic confidence and boundary reliability, (2) Noisy Label Filtering to address the class-imbalance problem of pseudo labels caused by category errors, (3) Noisy Label Learning to penalize inconsistent boundary predictions to achieve noise-tolerant learning for heavy location biases. As a result, our method could effectively handle the label noise problem and improve the utilization of a large amount of unlabeled videos. Extensive experiments on THUMOS14 and ActivityNet v1.3 demonstrate the effectiveness of our method. 
The code is available at github.com/kunnxia/NPL.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Xia_Learning_from_Noisy_Pseudo_Labels_for_Semi-Supervised_Temporal_Action_Localization_ICCV_2023_paper.pdf", - "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Wormpex AI Research; University of Illinois Chicago", + "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Wormpex AI Research; University of Illinois Chicago", "project": "", "github": "github.com/kunnxia/NPL", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Xia_Learning_from_Noisy_Pseudo_Labels_for_Semi-Supervised_Temporal_Action_Localization_ICCV_2023_supplemental.pdf", @@ -32788,14 +33892,15 @@ 
"author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xia_Learning_from_Noisy_Pseudo_Labels_for_Semi-Supervised_Temporal_Action_Localization_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;2", - "aff_unique_norm": "Xi'an Jiao Tong University;Wormpex AI Research;University of Illinois at Chicago", + "aff_unique_norm": "Xi'an Jiaotong University;Wormpex AI Research;University of Illinois at Chicago", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;AI Research;", "aff_unique_url": "http://www.xjtu.edu.cn;;https://www.uic.edu", "aff_unique_abbr": "XJTU;Wormpex AI;UIC", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Xi'an;;Chicago", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Kun and Wang,\n Le and Zhou,\n Sanping and Hua,\n Gang and Tang,\n Wei\n},\n title = {\n Learning from Noisy Pseudo Labels for Semi-Supervised Temporal Action Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10160-10169\n} \n}" }, { "title": "Learning from Semantic Alignment between Unpaired Multiviews for Egocentric Video Recognition", @@ -32827,7 +33932,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Qitong and Zhao,\n Long and Yuan,\n Liangzhe and Liu,\n Ting and Peng,\n Xi\n},\n title = {\n Learning from Semantic Alignment between Unpaired Multiviews for Egocentric Video Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 3307-3317\n} \n}" }, { "title": "Learning in Imperfect Environment: Multi-Label Classification with Long-Tailed Distribution and Partial Labels", @@ -32859,7 +33965,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Wenqiao and Liu,\n Changshuo and Zeng,\n Lingze and Ooi,\n Bengchin and Tang,\n Siliang and Zhuang,\n Yueting\n},\n title = {\n Learning in Imperfect Environment: Multi-Label Classification with Long-Tailed Distribution and Partial Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1423-1432\n} \n}" }, { "title": "Learning to Distill Global Representation for Sparse-View CT", @@ -32884,14 +33991,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Learning_to_Distill_Global_Representation_for_Sparse-View_CT_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0+1", - "aff_unique_norm": "Fudan University;Shanghai Center for Brain Science and Brain-Inspired Technology", + "aff_unique_norm": "Fudan University;Shanghai Center for Brain Science and Brain-inspired Technology", "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "https://www.fudan.edu.cn;", "aff_unique_abbr": "Fudan;", "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zilong and Ma,\n Chenglong and Chen,\n Jie and Zhang,\n Junping and Shan,\n Hongming\n},\n title = {\n Learning to Distill Global Representation for Sparse-View CT\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference 
on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21196-21207\n} \n}" }, { "title": "Learning to Generate Semantic Layouts for Higher Text-Image Correspondence in Text-to-Image Synthesis", @@ -32923,7 +34031,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Minho and Yun,\n Jooyeol and Choi,\n Seunghwan and Choo,\n Jaegul\n},\n title = {\n Learning to Generate Semantic Layouts for Higher Text-Image Correspondence in Text-to-Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7591-7600\n} \n}" }, { "title": "Learning to Ground Instructional Articles in Videos through Narrations", @@ -32948,14 +34057,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mavroudi_Learning_to_Ground_Instructional_Articles_in_Videos_through_Narrations_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Meta Platforms, Inc.", "aff_unique_dep": "Meta AI", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mavroudi_2023_ICCV,\n \n author = {\n Mavroudi,\n Effrosyni and Afouras,\n Triantafyllos and Torresani,\n Lorenzo\n},\n title = {\n Learning to Ground Instructional Articles in Videos through Narrations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15201-15213\n} \n}" }, { "title": "Learning to Identify Critical 
States for Reinforcement Learning from Videos", @@ -32963,8 +34073,8 @@ "status": "Poster", "track": "main", "pid": "1690", - "author_site": "Haozhe Liu, Mingchen Zhuge, Bing Li, Yuhui Wang, Francesco Faccio, Bernard Ghanem, J\u00fcrgen Schmidhuber", - "author": "Haozhe Liu; Mingchen Zhuge; Bing Li; Yuhui Wang; Francesco Faccio; Bernard Ghanem; J\u00fcrgen Schmidhuber", + "author_site": "Haozhe Liu, Mingchen Zhuge, Bing Li, Yuhui Wang, Francesco Faccio, Bernard Ghanem, Jürgen Schmidhuber", + "author": "Haozhe Liu; Mingchen Zhuge; Bing Li; Yuhui Wang; Francesco Faccio; Bernard Ghanem; Jürgen Schmidhuber", "abstract": "Recent work on deep reinforcement learning (DRL) has pointed out that algorithmic information about good policies can be extracted from offline data which lack explicit information about executed actions. For example, videos of humans or robots may convey a lot of implicit information about rewarding action sequences, but a DRL machine that wants to profit from watching such videos must first learn by itself to identify and recognize relevant states/actions/rewards. Without relying on ground-truth annotations, our new method called Deep State Identifier learns to predict returns from episodes encoded as videos. Then it uses a kind of mask-based sensitivity analysis to extract/identify important critical states. Extensive experiments showcase our method's potential for understanding and improving agent behavior. 
The source code and the generated datasets are available at https://github.com/AI-Initiative-KAUST/VideoRLCS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_Learning_to_Identify_Critical_States_for_Reinforcement_Learning_from_Videos_ICCV_2023_paper.pdf", "aff": "AI Initiative, King Abdullah University of Science and Technology+The Swiss AI Lab IDSIA/USI/SUPSI+NNAISENSE; AI Initiative, King Abdullah University of Science and Technology+The Swiss AI Lab IDSIA/USI/SUPSI+NNAISENSE; AI Initiative, King Abdullah University of Science and Technology; AI Initiative, King Abdullah University of Science and Technology; AI Initiative, King Abdullah University of Science and Technology+The Swiss AI Lab IDSIA/USI/SUPSI; AI Initiative, King Abdullah University of Science and Technology; AI Initiative, King Abdullah University of Science and Technology+The Swiss AI Lab IDSIA/USI/SUPSI+NNAISENSE", @@ -32987,7 +34097,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1+2;0+1+2;0;0;0+1;0;0+1+2", - "aff_country_unique": "Saudi Arabia;Switzerland;China" + "aff_country_unique": "Saudi Arabia;Switzerland;China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Haozhe and Zhuge,\n Mingchen and Li,\n Bing and Wang,\n Yuhui and Faccio,\n Francesco and Ghanem,\n Bernard and Schmidhuber,\n J\\"urgen\n},\n title = {\n Learning to Identify Critical States for Reinforcement Learning from Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1955-1965\n} \n}" }, { "title": "Learning to Learn: How to Continuously Teach Humans and Machines", @@ -33010,7 +34121,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Singh_Learning_to_Learn_How_to_Continuously_Teach_Humans_and_Machines_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Singh_Learning_to_Learn_How_to_Continuously_Teach_Humans_and_Machines_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Singh_2023_ICCV,\n \n author = {\n Singh,\n Parantak and Li,\n You and Sikarwar,\n Ankur and Lei,\n Stan Weixian and Gao,\n Difei and Talbot,\n Morgan B. and Sun,\n Ying and Shou,\n Mike Zheng and Kreiman,\n Gabriel and Zhang,\n Mengmi\n},\n title = {\n Learning to Learn: How to Continuously Teach Humans and Machines\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11708-11719\n} \n}" }, { "title": "Learning to Transform for Generalizable Instance-wise Invariance", @@ -33033,7 +34145,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Singhal_Learning_to_Transform_for_Generalizable_Instance-wise_Invariance_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Singhal_Learning_to_Transform_for_Generalizable_Instance-wise_Invariance_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Singhal_2023_ICCV,\n \n author = {\n Singhal,\n Utkarsh and Esteves,\n Carlos and Makadia,\n Ameesh and Yu,\n Stella X.\n},\n title = {\n Learning to Transform for Generalizable Instance-wise Invariance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6211-6221\n} \n}" }, { "title": "Learning to Upsample by Learning to Sample", @@ -33065,7 +34178,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Wenze and Lu,\n Hao and Fu,\n Hongtao and Cao,\n Zhiguo\n},\n title = {\n Learning to Upsample by Learning to 
Sample\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6027-6037\n} \n}" }, { "title": "Learning with Diversity: Self-Expanded Equalization for Better Generalized Deep Metric Learning", @@ -33092,12 +34206,13 @@ "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Xidian University;University of Maryland, College Park", "aff_unique_dep": "School of Computer Science and Technology;Department of Computer Science", - "aff_unique_url": "http://www.xidian.edu.cn;https://www/umd.edu", + "aff_unique_url": "http://www.xidian.edu.cn/;https://www/umd.edu", "aff_unique_abbr": "Xidian;UMD", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Jiexi and Yin,\n Zhihui and Yang,\n Erkun and Yang,\n Yanhua and Huang,\n Heng\n},\n title = {\n Learning with Diversity: Self-Expanded Equalization for Better Generalized Deep Metric Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19365-19374\n} \n}" }, { "title": "Lecture Presentations Multimodal Dataset: Towards Understanding Multimodality in Educational Videos", @@ -33129,7 +34244,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Dong Won and Ahuja,\n Chaitanya and Liang,\n Paul Pu and Natu,\n Sanika and Morency,\n Louis-Philippe\n},\n title = {\n Lecture Presentations Multimodal Dataset: Towards Understanding Multimodality in Educational Videos\n},\n booktitle = {\n Proceedings 
of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20087-20098\n} \n}" }, { "title": "Lens Parameter Estimation for Realistic Depth of Field Modeling", @@ -33137,8 +34253,8 @@ "status": "Poster", "track": "main", "pid": "6503", - "author_site": "Dominique Pich\u00e9-Meunier, Yannick Hold-Geoffroy, Jianming Zhang, Jean-Fran\u00e7ois Lalonde", - "author": "Dominique Pich\u00e9-Meunier; Yannick Hold-Geoffroy; Jianming Zhang; Jean-Fran\u00e7ois Lalonde", + "author_site": "Dominique Piché-Meunier, Yannick Hold-Geoffroy, Jianming Zhang, Jean-François Lalonde", + "author": "Dominique Piché-Meunier; Yannick Hold-Geoffroy; Jianming Zhang; Jean-François Lalonde", "abstract": "We present a method to estimate the depth of field effect from a single image. Most existing methods related to this task provide either a per-pixel estimation of blur and/or depth. Instead, we go further and propose to use a lens-based representation that models the depth of field using two parameters: the blur factor and focus disparity. Those two parameters, along with the signed defocus representation, result in a more intuitive and linear representation which we solve using a novel weighting network. Furthermore, our method explicitly enforces consistency between the estimated defocus blur, the lens parameters, and the depth map. Finally, we train our deep-learning-based model on a mix of real images with synthetic depth of field and fully synthetic images. These improvements result in a more robust and accurate method, as demonstrated by our state-of-the-art results. 
In particular, our lens parametrization enables several applications, such as 3D staging for AR environments and seamless object compositing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Piche-Meunier_Lens_Parameter_Estimation_for_Realistic_Depth_of_Field_Modeling_ICCV_2023_paper.pdf", "aff": ";;;", @@ -33152,7 +34268,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Piche-Meunier_Lens_Parameter_Estimation_for_Realistic_Depth_of_Field_Modeling_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Piche-Meunier_Lens_Parameter_Estimation_for_Realistic_Depth_of_Field_Modeling_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Piche-Meunier_2023_ICCV,\n \n author = {\n Pich\\'e-Meunier,\n Dominique and Hold-Geoffroy,\n Yannick and Zhang,\n Jianming and Lalonde,\n Jean-Fran\\c{c}ois\n},\n title = {\n Lens Parameter Estimation for Realistic Depth of Field Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 499-508\n} \n}" }, { "title": "Less is More: Focus Attention for Efficient DETR", @@ -33164,7 +34281,7 @@ "author": "Dehua Zheng; Wenhui Dong; Hailin Hu; Xinghao Chen; Yunhe Wang", "abstract": "DETR-like models have significantly boosted the performance of detectors and even outperformed classical convolutional models. However, all tokens are treated equally without discrimination brings a redundant computational burden in the traditional encoder structure. The recent sparsification strategies exploit a subset of informative tokens to reduce attention\n complexity maintaining performance through the sparse encoder. But these methods tend to rely on unreliable model statistics. Moreover, simply reducing the token population hinders the detection performance to a large extent, limiting the application of these sparse models. 
We propose Focus-DETR, which focuses attention on more informative tokens for a better trade-off between computation efficiency and model accuracy. Specifically, we reconstruct the encoder with dual attention, which includes a token scoring mechanism that considers both localization and category semantic information of the objects from multi-scale feature maps. We efficiently abandon the background queries and enhance the semantic interaction of the fine-grained object queries based on the scores. Compared with the state-of-the-art sparse DETR-like detectors under the same setting, our Focus-DETR gets comparable complexity while achieving 50.4AP (+2.2) on COCO. The code is available at https://github.com/huawei-noah/noah-research/tree/master/Focus-DETR and https://gitee.com/mindspore/models/tree/master/research/cv/Focus-DETR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zheng_Less_is_More_Focus_Attention_for_Efficient_DETR_ICCV_2023_paper.pdf", - "aff": "Huazhong University of Science and Technology+Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "Huazhong University of Science and Technology+Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "https://gitee.com/mindspore/models/tree/master/research/cv/Focus-DETR", "github": "https://github.com/huawei-noah/noah-research/tree/master/Focus-DETR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zheng_Less_is_More_ICCV_2023_supplemental.pdf", @@ -33178,13 +34295,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_Less_is_More_Focus_Attention_for_Efficient_DETR_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;1", "aff_unique_norm": "Huazhong University of Science and Technology;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", 
"aff_unique_url": "http://www.hust.edu.cn;https://www.huawei.com", "aff_unique_abbr": "HUST;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Dehua and Dong,\n Wenhui and Hu,\n Hailin and Chen,\n Xinghao and Wang,\n Yunhe\n},\n title = {\n Less is More: Focus Attention for Efficient DETR\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6674-6683\n} \n}" }, { "title": "Leveraging Inpainting for Single-Image Shadow Removal", @@ -33207,7 +34325,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Leveraging_Inpainting_for_Single-Image_Shadow_Removal_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Leveraging_Inpainting_for_Single-Image_Shadow_Removal_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiaoguang and Guo,\n Qing and Abdelfattah,\n Rabab and Lin,\n Di and Feng,\n Wei and Tsang,\n Ivor and Wang,\n Song\n},\n title = {\n Leveraging Inpainting for Single-Image Shadow Removal\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13055-13064\n} \n}" }, { "title": "Leveraging Intrinsic Properties for Non-Rigid Garment Alignment", @@ -33239,7 +34358,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Siyou and Zhou,\n Boyao and Zheng,\n Zerong and Zhang,\n Hongwen and Liu,\n Yebin\n},\n title = {\n Leveraging 
Intrinsic Properties for Non-Rigid Garment Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14485-14496\n} \n}" }, { "title": "Leveraging SE(3) Equivariance for Learning 3D Geometric Shape Assembly", @@ -33262,7 +34382,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Leveraging_SE3_Equivariance_for_Learning_3D_Geometric_Shape_Assembly_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Leveraging_SE3_Equivariance_for_Learning_3D_Geometric_Shape_Assembly_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Ruihai and Tie,\n Chenrui and Du,\n Yushi and Zhao,\n Yan and Dong,\n Hao\n},\n title = {\n Leveraging SE(3) Equivariance for Learning 3D Geometric Shape Assembly\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14311-14320\n} \n}" }, { "title": "Leveraging Spatio-Temporal Dependency for Skeleton-Based Action Recognition", @@ -33294,7 +34415,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jungho and Lee,\n Minhyeok and Cho,\n Suhwan and Woo,\n Sungmin and Jang,\n Sungjun and Lee,\n Sangyoun\n},\n title = {\n Leveraging Spatio-Temporal Dependency for Skeleton-Based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10255-10264\n} \n}" }, { "title": "LexLIP: Lexicon-Bottlenecked Language-Image Pre-Training for Large-Scale Image-Text Sparse 
Retrieval", @@ -33319,14 +34441,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Luo_LexLIP_Lexicon-Bottlenecked_Language-Image_Pre-Training_for_Large-Scale_Image-Text_Sparse_Retrieval_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1;0;1;1", - "aff_unique_norm": "Hong Kong Baptist University;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "Hong Kong Baptist University;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.microsoft.com", "aff_unique_abbr": "HKBU;Microsoft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;1;1;1;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Ziyang and Zhao,\n Pu and Xu,\n Can and Geng,\n Xiubo and Shen,\n Tao and Tao,\n Chongyang and Ma,\n Jing and Lin,\n Qingwei and Jiang,\n Daxin\n},\n title = {\n LexLIP: Lexicon-Bottlenecked Language-Image Pre-Training for Large-Scale Image-Text Sparse Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11206-11217\n} \n}" }, { "title": "LiDAR-Camera Panoptic Segmentation via Geometry-Consistent and Semantic-Aware Alignment", @@ -33351,14 +34474,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_LiDAR-Camera_Panoptic_Segmentation_via_Geometry-Consistent_and_Semantic-Aware_Alignment_ICCV_2023_paper.html", "aff_unique_index": "0;1+1;1;0;1+1;0+1", - "aff_unique_norm": "Shanghai Jiao Tong University;East China Normal University", + "aff_unique_norm": "Shanghai Jiaotong University;East China Normal University", "aff_unique_dep": "School of Electronic Information and Electrical Engineering;Department of Computer Science and Engineering", 
"aff_unique_url": "https://www.sjtu.edu.cn;http://www.ecnu.edu.cn", "aff_unique_abbr": "SJTU;ECNU", "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Chongqing", "aff_country_unique_index": "0;0+0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Zhiwei and Zhang,\n Zhizhong and Yu,\n Qian and Yi,\n Ran and Xie,\n Yuan and Ma,\n Lizhuang\n},\n title = {\n LiDAR-Camera Panoptic Segmentation via Geometry-Consistent and Semantic-Aware Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3662-3671\n} \n}" }, { "title": "LiDAR-UDA: Self-ensembling Through Time for Unsupervised LiDAR Domain Adaptation", @@ -33390,7 +34514,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shaban_2023_ICCV,\n \n author = {\n Shaban,\n Amirreza and Lee,\n JoonHo and Jung,\n Sanghun and Meng,\n Xiangyun and Boots,\n Byron\n},\n title = {\n LiDAR-UDA: Self-ensembling Through Time for Unsupervised LiDAR Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19784-19794\n} \n}" }, { "title": "LightDepth: Single-View Depth Self-Supervision from Illumination Decline", @@ -33398,11 +34523,11 @@ "status": "Poster", "track": "main", "pid": "12440", - "author_site": "Javier Rodr\u00edguez-Puigvert, V\u00edctor M. Batlle, J.M.M. Montiel, Ruben Martinez-Cantin, Pascal Fua, Juan D. Tard\u00f3s, Javier Civera", - "author": "Javier Rodr\u00edguez-Puigvert; V\u00edctor M. Batlle; J.M.M. Montiel; Ruben Martinez-Cantin; Pascal Fua; Juan D. 
Tard\u00f3s; Javier Civera", + "author_site": "Javier Rodríguez-Puigvert, Víctor M. Batlle, J.M.M. Montiel, Ruben Martinez-Cantin, Pascal Fua, Juan D. Tardós, Javier Civera", + "author": "Javier Rodríguez-Puigvert; Víctor M. Batlle; J.M.M. Montiel; Ruben Martinez-Cantin; Pascal Fua; Juan D. Tardós; Javier Civera", "abstract": "Single-view depth estimation can be remarkably effective if there is enough ground-truth depth data for supervised training. However, there are scenarios, especially in medicine in the case of endoscopies, where such data cannot be obtained. In such cases, multi-view self-supervision and synthetic-to-real transfer serve as alternative approaches, however, with a considerable performance reduction in comparison to supervised case.\n Instead, we propose a single-view self-supervised method that achieves a performance similar to the supervised case. In some medical devices, such as endoscopes, the camera and light sources are co-located at a small distance from the target surfaces. Thus, we can exploit that, for any given albedo and surface orientation, pixel brightness is inversely proportional to the square of the distance to the surface, providing a strong single-view self-supervisory signal. 
In our experiments, our self-supervised models deliver accuracies comparable to those of fully supervised ones, while being applicable without depth ground-truth data.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Rodriguez-Puigvert_LightDepth_Single-View_Depth_Self-Supervision_from_Illumination_Decline_ICCV_2023_paper.pdf", - "aff": "I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza; \u00b4Ecole Polytechnique F \u00b4ed\u00b4erale de Lausanne; I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza", + "aff": "I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza; ´Ecole Polytechnique F ´ed´erale de Lausanne; I3A - Universidad de Zaragoza; I3A - Universidad de Zaragoza", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Rodriguez-Puigvert_LightDepth_Single-View_Depth_ICCV_2023_supplemental.zip", @@ -33415,14 +34540,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Rodriguez-Puigvert_LightDepth_Single-View_Depth_Self-Supervision_from_Illumination_Decline_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0;0", - "aff_unique_norm": "Universidad de Zaragoza;EPFL", + "aff_unique_norm": "Universidad de Zaragoza;Ecole Polytechnique Fédérale de Lausanne", "aff_unique_dep": "I3A;", "aff_unique_url": "https://www.unizar.es;https://www.epfl.ch", "aff_unique_abbr": ";EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;0;0;0;1;0;0", - "aff_country_unique": "Spain;Switzerland" + "aff_country_unique": "Spain;Switzerland", + "bibtex": "@InProceedings{Rodriguez-Puigvert_2023_ICCV,\n \n author = {\n Rodr{\\'\\i\n}guez-Puigvert,\n Javier and Batlle,\n V{\\'\\i\n}ctor M. and Montiel,\n J.M.M. and Martinez-Cantin,\n Ruben and Fua,\n Pascal and Tard\\'os,\n Juan D. 
and Civera,\n Javier\n},\n title = {\n LightDepth: Single-View Depth Self-Supervision from Illumination Decline\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21273-21283\n} \n}" }, { "title": "LightGlue: Local Feature Matching at Light Speed", @@ -33447,14 +34573,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lindenberger_LightGlue_Local_Feature_Matching_at_Light_Speed_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1", - "aff_unique_norm": "ETH Zurich;Microsoft", + "aff_unique_norm": "ETH Zurich;Microsoft Corporation", "aff_unique_dep": ";Mixed Reality & AI Lab", "aff_unique_url": "https://www.ethz.ch;https://www.microsoft.com", "aff_unique_abbr": "ETHZ;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Lindenberger_2023_ICCV,\n \n author = {\n Lindenberger,\n Philipp and Sarlin,\n Paul-Edouard and Pollefeys,\n Marc\n},\n title = {\n LightGlue: Local Feature Matching at Light Speed\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17627-17638\n} \n}" }, { "title": "Lighting Every Darkness in Two Pairs: A Calibration-Free Pipeline for RAW Denoising", @@ -33486,7 +34613,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Jin_2023_ICCV,\n \n author = {\n Jin,\n Xin and Xiao,\n Jia-Wen and Han,\n Ling-Hao and Guo,\n Chunle and Zhang,\n Ruixun and Liu,\n Xialei and Li,\n Chongyi\n},\n title = {\n Lighting Every Darkness in Two Pairs: A 
Calibration-Free Pipeline for RAW Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13275-13284\n} \n}" }, { "title": "Lighting up NeRF via Unsupervised Decomposition and Enhancement", @@ -33509,7 +34637,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Lighting_up_NeRF_via_Unsupervised_Decomposition_and_Enhancement_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Lighting_up_NeRF_via_Unsupervised_Decomposition_and_Enhancement_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Haoyuan and Xu,\n Xiaogang and Xu,\n Ke and Lau,\n Rynson W.H.\n},\n title = {\n Lighting up NeRF via Unsupervised Decomposition and Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12632-12641\n} \n}" }, { "title": "Lightweight Image Super-Resolution with Superpixel Token Interaction", @@ -33534,14 +34663,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Lightweight_Image_Super-Resolution_with_Superpixel_Token_Interaction_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Sun Yat-sen University;Baidu", - "aff_unique_dep": "School of Cyber Science and Technology;Baidu Inc.", + "aff_unique_norm": "Sun Yat-Sen University;Baidu Inc.", + "aff_unique_dep": "School of Cyber Science and Technology;", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.baidu.com", "aff_unique_abbr": "SYSU;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Aiping 
and Ren,\n Wenqi and Liu,\n Yi and Cao,\n Xiaochun\n},\n title = {\n Lightweight Image Super-Resolution with Superpixel Token Interaction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12728-12737\n} \n}" }, { "title": "Linear Spaces of Meanings: Compositional Structures in Vision-Language Models", @@ -33566,14 +34696,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Trager_Linear_Spaces_of_Meanings_Compositional_Structures_in_Vision-Language_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Amazon", + "aff_unique_norm": "Amazon Web Services", "aff_unique_dep": "AWS AI Labs", "aff_unique_url": "https://aws.amazon.com", "aff_unique_abbr": "AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Trager_2023_ICCV,\n \n author = {\n Trager,\n Matthew and Perera,\n Pramuditha and Zancato,\n Luca and Achille,\n Alessandro and Bhatia,\n Parminder and Soatto,\n Stefano\n},\n title = {\n Linear Spaces of Meanings: Compositional Structures in Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15395-15404\n} \n}" }, { "title": "Linear-Covariance Loss for End-to-End Learning of 6D Pose Estimation", @@ -33598,14 +34729,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Linear-Covariance_Loss_for_End-to-End_Learning_of_6D_Pose_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+3", - "aff_unique_norm": "Beihang University;Magic Leap;EPFL;ClearSpace", + "aff_unique_norm": "Beihang University;Magic Leap;Ecole Polytechnique Fédérale de Lausanne;ClearSpace", 
"aff_unique_dep": ";;;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.magicleap.com;https://www.epfl.ch;", "aff_unique_abbr": "BUAA;Magic Leap;EPFL;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", - "aff_country_unique": "China;United States;Switzerland;" + "aff_country_unique": "China;United States;Switzerland;", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Fulin and Hu,\n Yinlin and Salzmann,\n Mathieu\n},\n title = {\n Linear-Covariance Loss for End-to-End Learning of 6D Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14107-14117\n} \n}" }, { "title": "LinkGAN: Linking GAN Latents to Pixels for Controllable Image Synthesis", @@ -33637,7 +34769,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Jiapeng and Yang,\n Ceyuan and Shen,\n Yujun and Shi,\n Zifan and Dai,\n Bo and Zhao,\n Deli and Chen,\n Qifeng\n},\n title = {\n LinkGAN: Linking GAN Latents to Pixels for Controllable Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7656-7666\n} \n}" }, { "title": "Lip Reading for Low-resource Languages by Learning and Combining General Speech Knowledge and Language-specific Knowledge", @@ -33669,7 +34802,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Minsu and Yeo,\n Jeong Hun and Choi,\n Jeongsoo and Ro,\n Yong Man\n},\n title 
= {\n Lip Reading for Low-resource Languages by Learning and Combining General Speech Knowledge and Language-specific Knowledge\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15359-15371\n} \n}" }, { "title": "Lip2Vec: Efficient and Robust Visual Speech Recognition via Latent-to-Latent Visual to Audio Representation Mapping", @@ -33696,12 +34830,13 @@ "aff_unique_index": "0+1;0;0;0;0", "aff_unique_norm": "Technology Innovation Institute;Dublin City University", "aff_unique_dep": ";", - "aff_unique_url": ";https://www.dcu.ie", + "aff_unique_url": "https://www.tii.ae;https://www.dcu.ie", "aff_unique_abbr": ";DCU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;0", - "aff_country_unique": "United Arab Emirates;Ireland" + "aff_country_unique": "United Arab Emirates;Ireland", + "bibtex": "@InProceedings{Djilali_2023_ICCV,\n \n author = {\n Djilali,\n Yasser Abdelaziz Dahou and Narayan,\n Sanath and Boussaid,\n Haithem and Almazrouei,\n Ebtessam and Debbah,\n Merouane\n},\n title = {\n Lip2Vec: Efficient and Robust Visual Speech Recognition via Latent-to-Latent Visual to Audio Representation Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13790-13801\n} \n}" }, { "title": "LiveHand: Real-time and Photorealistic Neural Hand Rendering", @@ -33733,7 +34868,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Mundra_2023_ICCV,\n \n author = {\n Mundra,\n Akshay and R,\n Mallikarjun B and Wang,\n Jiayi and Habermann,\n Marc and Theobalt,\n Christian and Elgharib,\n Mohamed\n},\n title = {\n LiveHand: Real-time and Photorealistic 
Neural Hand Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18035-18045\n} \n}" }, { "title": "LivePose: Online 3D Reconstruction from Monocular Video with Dynamic Camera Poses", @@ -33758,14 +34894,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Stier_LivePose_Online_3D_Reconstruction_from_Monocular_Video_with_Dynamic_Camera_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;0;0", - "aff_unique_norm": "Apple;University of California, Santa Barbara", - "aff_unique_dep": "Apple Inc.;", + "aff_unique_norm": "Apple Inc.;University of California, Santa Barbara", + "aff_unique_dep": ";", "aff_unique_url": "https://www.apple.com;https://www.ucsb.edu", "aff_unique_abbr": "Apple;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Stier_2023_ICCV,\n \n author = {\n Stier,\n Noah and Angles,\n Baptiste and Yang,\n Liang and Yan,\n Yajie and Colburn,\n Alex and Chuang,\n Ming\n},\n title = {\n LivePose: Online 3D Reconstruction from Monocular Video with Dynamic Camera Poses\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7921-7930\n} \n}" }, { "title": "LivelySpeaker: Towards Semantic-Aware Co-Speech Gesture Generation", @@ -33797,7 +34934,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;2;0;0+0+0", - "aff_country_unique": "China;;France" + "aff_country_unique": "China;;France", + "bibtex": "@InProceedings{Zhi_2023_ICCV,\n \n author = {\n Zhi,\n Yihao and Cun,\n Xiaodong and Chen,\n Xuelin and Shen,\n Xi and Guo,\n Wen and Huang,\n Shaoli and Gao,\n Shenghua\n},\n title = 
{\n LivelySpeaker: Towards Semantic-Aware Co-Speech Gesture Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20807-20817\n} \n}" }, { "title": "LoCUS: Learning Multiscale 3D-consistent Features from Posed Images", @@ -33805,8 +34943,8 @@ "status": "Poster", "track": "main", "pid": "10721", - "author_site": "Dominik A. Kloepfer, Dylan Campbell, Jo\u00e3o F. Henriques", - "author": "Dominik A. Kloepfer; Dylan Campbell; Jo\u00e3o F. Henriques", + "author_site": "Dominik A. Kloepfer, Dylan Campbell, João F. Henriques", + "author": "Dominik A. Kloepfer; Dylan Campbell; João F. Henriques", "abstract": "An important challenge for autonomous agents such as robots is to maintain a spatially and temporally consistent model of the world. It must be maintained through occlusions, previously-unseen views, and long time horizons (e.g., loop closure and re-identification). It is still an open question how to train such a versatile neural representation without supervision.\n We start from the idea that the training objective can be framed as a patch retrieval problem: given an image patch in one view of a scene, we would like to retrieve (with high precision and recall) all patches in other views that map to the same real-world location. One drawback is that this objective does not promote reusability of features: by being unique to a scene (achieving perfect precision/recall), a representation will not be useful in the context of other scenes. We find that it is possible to balance retrieval and reusability by constructing the retrieval set carefully, leaving out patches that map to far-away locations. Similarly, we can easily regulate the scale of the learned features (e.g., points, objects, or rooms) by adjusting the spatial tolerance for considering a retrieval to be positive. 
We optimize for (smooth) Average Precision (AP), in a single unified ranking-based objective. This objective also doubles as a criterion for choosing landmarks or keypoints, as patches with high AP.\n We show results creating sparse, multi-scale, semantic spatial maps composed of highly identifiable landmarks, with applications in landmark retrieval, localization, semantic segmentation and instance segmentation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Kloepfer_LoCUS_Learning_Multiscale_3D-consistent_Features_from_Posed_Images_ICCV_2023_paper.pdf", "aff": "Visual Geometry Group, University of Oxford; Australian National University; Visual Geometry Group, University of Oxford", @@ -33829,7 +34967,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United Kingdom;Australia" + "aff_country_unique": "United Kingdom;Australia", + "bibtex": "@InProceedings{Kloepfer_2023_ICCV,\n \n author = {\n Kloepfer,\n Dominik A. 
and Campbell,\n Dylan and Henriques,\n Jo\\~ao F.\n},\n title = {\n LoCUS: Learning Multiscale 3D-consistent Features from Posed Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16634-16644\n} \n}" }, { "title": "LoGoPrompt: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models", @@ -33861,7 +35000,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Cheng and Yang,\n Sibei\n},\n title = {\n LoGoPrompt: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2932-2941\n} \n}" }, { "title": "LoLep: Single-View View Synthesis with Locally-Learned Planes and Self-Attention Occlusion Inference", @@ -33893,7 +35033,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Cong and Wang,\n Yu-Ping and Manocha,\n Dinesh\n},\n title = {\n LoLep: Single-View View Synthesis with Locally-Learned Planes and Self-Attention Occlusion Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10841-10851\n} \n}" }, { "title": "LoTE-Animal: A Long Time-span Dataset for Endangered Animal Behavior Understanding", @@ -33925,7 +35066,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Dan and Hou,\n Jin and Huang,\n Shaoli and Liu,\n Jing and He,\n Yuxin and Zheng,\n Bochuan and Ning,\n Jifeng and Zhang,\n Jingdong\n},\n title = {\n LoTE-Animal: A Long Time-span Dataset for Endangered Animal Behavior Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20064-20075\n} \n}" }, { "title": "Local Context-Aware Active Domain Adaptation", @@ -33957,7 +35099,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Tao and Lu,\n Cheng and Ling,\n Haibin\n},\n title = {\n Local Context-Aware Active Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18634-18643\n} \n}" }, { "title": "Local and Global Logit Adjustments for Long-Tailed Learning", @@ -33989,7 +35132,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tao_2023_ICCV,\n \n author = {\n Tao,\n Yingfan and Sun,\n Jingna and Yang,\n Hao and Chen,\n Li and Wang,\n Xu and Yang,\n Wenming and Du,\n Daniel and Zheng,\n Min\n},\n title = {\n Local and Global Logit Adjustments for Long-Tailed Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11783-11792\n} \n}" }, { "title": "Local or Global: Selective Knowledge Assimilation for Federated Learning 
with Limited Labels", @@ -34014,14 +35158,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cho_Local_or_Global_Selective_Knowledge_Assimilation_for_Federated_Learning_with_ICCV_2023_paper.html", "aff_unique_index": "0;0;1+2", - "aff_unique_norm": "Carnegie Mellon University;Amazon;Microsoft", - "aff_unique_dep": ";Amazon.com, Inc.;Microsoft Research", + "aff_unique_norm": "Carnegie Mellon University;Amazon.com, Inc.;Microsoft Corporation", + "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.cmu.edu;https://www.amazon.com;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CMU;Amazon;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cho_2023_ICCV,\n \n author = {\n Cho,\n Yae Jee and Joshi,\n Gauri and Dimitriadis,\n Dimitrios\n},\n title = {\n Local or Global: Selective Knowledge Assimilation for Federated Learning with Limited Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17087-17096\n} \n}" }, { "title": "Localizing Moments in Long Video Via Multimodal Guidance", @@ -34053,7 +35198,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "United States;Saudi Arabia" + "aff_country_unique": "United States;Saudi Arabia", + "bibtex": "@InProceedings{Barrios_2023_ICCV,\n \n author = {\n Barrios,\n Wayner and Soldan,\n Mattia and Ceballos-Arroyo,\n Alberto Mario and Heilbron,\n Fabian Caba and Ghanem,\n Bernard\n},\n title = {\n Localizing Moments in Long Video Via Multimodal Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
13667-13678\n} \n}" }, { "title": "Localizing Object-Level Shape Variations with Text-to-Image Diffusion Models", @@ -34085,7 +35231,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Israel;" + "aff_country_unique": "Israel;", + "bibtex": "@InProceedings{Patashnik_2023_ICCV,\n \n author = {\n Patashnik,\n Or and Garibi,\n Daniel and Azuri,\n Idan and Averbuch-Elor,\n Hadar and Cohen-Or,\n Daniel\n},\n title = {\n Localizing Object-Level Shape Variations with Text-to-Image Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23051-23061\n} \n}" }, { "title": "Locally Stylized Neural Radiance Fields", @@ -34108,7 +35255,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pang_Locally_Stylized_Neural_Radiance_Fields_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pang_Locally_Stylized_Neural_Radiance_Fields_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Pang_2023_ICCV,\n \n author = {\n Pang,\n Hong-Wing and Hua,\n Binh-Son and Yeung,\n Sai-Kit\n},\n title = {\n Locally Stylized Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 307-316\n} \n}" }, { "title": "Locating Noise is Halfway Denoising for Semi-Supervised Segmentation", @@ -34133,14 +35281,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fang_Locating_Noise_is_Halfway_Denoising_for_Semi-Supervised_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;3;4;2;0+1;0+1", - "aff_unique_norm": "Beijing Jiao Tong University;Beijing Key Laboratory of Advanced Information Science and Network Technology;Meitu Inc;University of Technology 
Sydney;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Beijing Jiaotong University;Beijing Key Laboratory of Advanced Information Science and Network Technology;Meitu Inc;University of Technology Sydney;University of Illinois at Urbana-Champaign", "aff_unique_dep": "Institute of Information Science;Advanced Information Science and Network Technology;MT Lab;;", "aff_unique_url": "http://www.bjtu.edu.cn;;https://www.meitu.com;https://www.uts.edu.au;https://illinois.edu", "aff_unique_abbr": "BJTU;;Meitu;UTS;UIUC", - "aff_campus_unique_index": "0;2;0;0", - "aff_campus_unique": "Beijing;;Urbana-Champaign", + "aff_campus_unique_index": ";1;;", + "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0+0+0;1;2;0;0+0;0+0", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Yan and Zhu,\n Feng and Cheng,\n Bowen and Liu,\n Luoqi and Zhao,\n Yao and Wei,\n Yunchao\n},\n title = {\n Locating Noise is Halfway Denoising for Semi-Supervised Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16612-16622\n} \n}" }, { "title": "Locomotion-Action-Manipulation: Synthesizing Human-Scene Interactions in Complex 3D Environments", @@ -34172,7 +35321,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jiye and Joo,\n Hanbyul\n},\n title = {\n Locomotion-Action-Manipulation: Synthesizing Human-Scene Interactions in Complex 3D Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9663-9674\n} 
\n}" }, { "title": "Logic-induced Diagnostic Reasoning for Semi-supervised Semantic Segmentation", @@ -34195,7 +35345,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_Logic-induced_Diagnostic_Reasoning_for_Semi-supervised_Semantic_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_Logic-induced_Diagnostic_Reasoning_for_Semi-supervised_Semantic_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Chen and Wang,\n Wenguan and Miao,\n Jiaxu and Yang,\n Yi\n},\n title = {\n Logic-induced Diagnostic Reasoning for Semi-supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16197-16208\n} \n}" }, { "title": "LogicSeg: Parsing Visual Semantics with Neural Logic Learning and Reasoning", @@ -34227,7 +35378,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Liulei and Wang,\n Wenguan and Yang,\n Yi\n},\n title = {\n LogicSeg: Parsing Visual Semantics with Neural Logic Learning and Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4122-4133\n} \n}" }, { "title": "Long-Range Grouping Transformer for Multi-View 3D Reconstruction", @@ -34235,6 +35387,7 @@ "status": "Poster", "track": "main", "pid": "3839", + "author_site": "Liying Yang, Zhenwei Zhu, Xuxin Lin, Jian Nong, Yanyan Liang", "author": "Liying Yang, Zhenwei Zhu, Xuxin Lin, Jian Nong, Yanyan Liang", "abstract": "Nowadays, transformer networks have demonstrated superior 
performance in many computer vision tasks. In a multi-view 3D reconstruction algorithm following this paradigm, self-attention processing has to deal with intricate image tokens including massive information when facing heavy amounts of view input. The curse of information content leads to the extreme difficulty of model learning. To alleviate this problem, recent methods compress the token number representing each view or discard the attention operations between the tokens from different views. Obviously, they give a negative impact on performance. Therefore, we propose long-range grouping attention (LGA) based on the divide-and-conquer principle. Tokens from all views are grouped for separate attention operations. The tokens in each group are sampled from all views and can provide macro representation for the resided view. The richness of feature learning is guaranteed by the diversity among different groups. An effective and efficient encoder can be established which connects inter-view features using LGA and extract intra-view features using the standard self-attention layer. Moreover, a novel progressive upsampling decoder is also designed for voxel generation with relatively high resolution. Hinging on the above, we construct a powerful transformer-based network, called LRGT. Experimental results on ShapeNet verify our method achieves SOTA accuracy in multi-view reconstruction. 
Code is available at https://github.com/LiyingCV/Long-Range-Grouping-Transformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yang_Long-Range_Grouping_Transformer_for_Multi-View_3D_Reconstruction_ICCV_2023_paper.pdf", @@ -34246,7 +35399,8 @@ "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2650218214558283358&as_sdt=800005&sciodt=0,15&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Long-Range_Grouping_Transformer_for_Multi-View_3D_Reconstruction_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Long-Range_Grouping_Transformer_for_Multi-View_3D_Reconstruction_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Liying and Zhu,\n Zhenwei and Lin,\n Xuxin and Nong,\n Jian and Liang,\n Yanyan\n},\n title = {\n Long-Range Grouping Transformer for Multi-View 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18257-18267\n} \n}" }, { "title": "Long-Term Photometric Consistent Novel View Synthesis with Diffusion Models", @@ -34278,7 +35432,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Jason J. and Forghani,\n Fereshteh and Derpanis,\n Konstantinos G. 
and Brubaker,\n Marcus A.\n},\n title = {\n Long-Term Photometric Consistent Novel View Synthesis with Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7094-7104\n} \n}" }, { "title": "Long-range Multimodal Pretraining for Movie Understanding", @@ -34303,14 +35458,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Argaw_Long-range_Multimodal_Pretraining_for_Movie_Understanding_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;1", - "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Adobe", - "aff_unique_dep": ";Adobe Inc.", + "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Adobe Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.adobe.com", "aff_unique_abbr": "KAIST;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Argaw_2023_ICCV,\n \n author = {\n Argaw,\n Dawit Mureja and Lee,\n Joon-Young and Woodson,\n Markus and Kweon,\n In So and Heilbron,\n Fabian Caba\n},\n title = {\n Long-range Multimodal Pretraining for Movie Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13392-13403\n} \n}" }, { "title": "Look at the Neighbor: Distortion-aware Unsupervised Domain Adaptation for Panoramic Semantic Segmentation", @@ -34318,6 +35474,7 @@ "status": "Poster", "track": "main", "pid": "9091", + "author_site": "Xu Zheng, Tianbo Pan, Yunhao Luo, Lin Wang", "author": "Xu Zheng, Tianbo Pan, Yunhao Luo, Lin Wang", "abstract": "Endeavors have been recently made to transfer knowledge from the labeled pinhole image domain to 
the unlabeled panoramic image domain via Unsupervised Domain Adaptation (UDA). The aim is to tackle the domain gaps caused by the style disparities and distortion problem of the non-uniformly distributed pixels of equirectangular projection (ERP). Previous works typically focus on transferring knowledge based on geometric priors with specially designed multi-branch network architectures. As a result, considerable computational costs are induced, and meanwhile, their generalization abilities are profoundly hindered by the variation of distortion among pixels. In this paper, we find that the pixels' neighborhood regions of the ERP indeed introduce less distortion. Intuitively, we propose a novel UDA framework that can effectively address the distortion problems for panoramic semantic segmentation. In comparison, our method is simpler, easier to implement, and more computationally efficient. Specifically, we propose distortion-aware attention (DA) capturing the neighboring pixel distribution without using any geometric constraints. Moreover, we propose a class-wise feature aggregation (CFA) module to iteratively update the feature representations with a memory bank. As such, the feature similarity between two domains can be consistently optimized. 
Extensive experiments show that our method achieves new state-of-the-art performance while remarkably reducing 80% parameters.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zheng_Look_at_the_Neighbor_Distortion-aware_Unsupervised_Domain_Adaptation_for_Panoramic_ICCV_2023_paper.pdf", @@ -34329,7 +35486,8 @@ "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3954966784684808971&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_Look_at_the_Neighbor_Distortion-aware_Unsupervised_Domain_Adaptation_for_Panoramic_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_Look_at_the_Neighbor_Distortion-aware_Unsupervised_Domain_Adaptation_for_Panoramic_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Xu and Pan,\n Tianbo and Luo,\n Yunhao and Wang,\n Lin\n},\n title = {\n Look at the Neighbor: Distortion-aware Unsupervised Domain Adaptation for Panoramic Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18687-18698\n} \n}" }, { "title": "Lossy and Lossless (L2) Post-training Model Size Compression", @@ -34361,7 +35519,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Yumeng and Bai,\n Shihao and Wei,\n Xiuying and Gong,\n Ruihao and Yang,\n Jianlei\n},\n title = {\n Lossy and Lossless (L2) Post-training Model Size Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17546-17556\n} \n}" }, { "title": "Low-Light Image Enhancement 
with Illumination-Aware Gamma Correction and Complete Image Modelling Network", @@ -34373,7 +35532,7 @@ "author": "Yinglong Wang; Zhen Liu; Jianzhuang Liu; Songcen Xu; Shuaicheng Liu", "abstract": "This paper presents a novel network structure with illumination-aware gamma correction and complete image modelling to solve the low-light image enhancement problem. Low-light environments usually lead to less informative large-scale dark areas, directly learning deep representations from low-light images is insensitive to recovering normal illumination. We propose to integrate the effectiveness of gamma correction with the strong modelling capacities of deep networks, which enables the correction factor gamma to be learned in a coarse to elaborate manner via adaptively perceiving the deviated illumination. Because exponential operation introduces high computational complexity, we propose to use Taylor Series to approximate gamma correction, accelerating the training and inference speed. Dark areas usually occupy large scales in low-light images, common local modelling structures, e.g., CNN, SwinIR, are thus insufficient to recover accurate illumination across whole low-light images. We propose a novel Transformer block to completely simulate the dependencies of all pixels across images via a local-to-global hierarchical attention mechanism, so that dark areas could be inferred by borrowing the information from far informative regions in a highly effective manner. 
Extensive experiments on several benchmark datasets demonstrate that our approach outperforms state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_Low-Light_Image_Enhancement_with_Illumination-Aware_Gamma_Correction_and_Complete_Image_ICCV_2023_paper.pdf", - "aff": "Meituan Inc.; Megvii Technology; Shenzhen Institute of Advanced Technology; Huawei Noah\u2019s Ark Lab; University of Electronic Science and Technology of China", + "aff": "Meituan Inc.; Megvii Technology; Shenzhen Institute of Advanced Technology; Huawei Noah’s Ark Lab; University of Electronic Science and Technology of China", "project": "", "github": "", "supp": "", @@ -34387,13 +35546,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Low-Light_Image_Enhancement_with_Illumination-Aware_Gamma_Correction_and_Complete_Image_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Meituan Inc.;Megvii Technology;Shenzhen Institute of Advanced Technology;Huawei;University of Electronic Science and Technology of China", - "aff_unique_dep": ";;;Noah\u2019s Ark Lab;", + "aff_unique_dep": ";;;Noah’s Ark Lab;", "aff_unique_url": "https://www.meituan.com;https://www.megvii.com;http://www.siat.ac.cn;https://www.huawei.com;https://www.uestc.edu.cn", - "aff_unique_abbr": "Meituan;Megvii;;Huawei;UESTC", + "aff_unique_abbr": "Meituan;Megvii;SIAT;Huawei;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yinglong and Liu,\n Zhen and Liu,\n Jianzhuang and Xu,\n Songcen and Liu,\n Shuaicheng\n},\n title = {\n Low-Light Image Enhancement with Illumination-Aware Gamma Correction and Complete Image Modelling Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 13128-13137\n} \n}" }, { "title": "Low-Light Image Enhancement with Multi-Stage Residue Quantization and Brightness-Aware Attention", @@ -34425,7 +35585,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yunlong and Huang,\n Tao and Dong,\n Weisheng and Wu,\n Fangfang and Li,\n Xin and Shi,\n Guangming\n},\n title = {\n Low-Light Image Enhancement with Multi-Stage Residue Quantization and Brightness-Aware Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12140-12149\n} \n}" }, { "title": "Luminance-aware Color Transform for Multiple Exposure Correction", @@ -34457,7 +35618,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Baek_2023_ICCV,\n \n author = {\n Baek,\n Jong-Hyeon and Kim,\n DaeHyun and Choi,\n Su-Min and Lee,\n Hyo-jun and Kim,\n Hanul and Koh,\n Yeong Jun\n},\n title = {\n Luminance-aware Color Transform for Multiple Exposure Correction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6156-6165\n} \n}" }, { "title": "M2T: Masking Transformers Twice for Faster Decoding", @@ -34489,7 +35651,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Mentzer_2023_ICCV,\n \n author = {\n Mentzer,\n Fabian and 
Agustsson,\n Eirikur and Tschannen,\n Michael\n},\n title = {\n M2T: Masking Transformers Twice for Faster Decoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5340-5349\n} \n}" }, { "title": "MAAL: Multimodality-Aware Autoencoder-Based Affordance Learning for 3D Articulated Objects", @@ -34518,10 +35681,11 @@ "aff_unique_dep": "ReLER Lab, AAII;CCAI", "aff_unique_url": "https://www.uts.edu.au;https://www.zju.edu.cn", "aff_unique_abbr": "UTS;", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "0", + "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Yuanzhi and Wang,\n Xiaohan and Zhu,\n Linchao and Yang,\n Yi\n},\n title = {\n MAAL: Multimodality-Aware Autoencoder-Based Affordance Learning for 3D Articulated Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 217-227\n} \n}" }, { "title": "MAGI: Multi-Annotated Explanation-Guided Learning", @@ -34544,7 +35708,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_MAGI_Multi-Annotated_Explanation-Guided_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_MAGI_Multi-Annotated_Explanation-Guided_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yifei and Gu,\n Siyi and Gao,\n Yuyang and Pan,\n Bo and Yang,\n Xiaofeng and Zhao,\n Liang\n},\n title = {\n MAGI: Multi-Annotated Explanation-Guided Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1977-1987\n} \n}" }, { "title": "MAMo: Leveraging Memory and Attention for Monocular Video Depth Estimation", @@ -34567,7 +35732,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yasarla_MAMo_Leveraging_Memory_and_Attention_for_Monocular_Video_Depth_Estimation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yasarla_MAMo_Leveraging_Memory_and_Attention_for_Monocular_Video_Depth_Estimation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yasarla_2023_ICCV,\n \n author = {\n Yasarla,\n Rajeev and Cai,\n Hong and Jeong,\n Jisoo and Shi,\n Yunxiao and Garrepalli,\n Risheek and Porikli,\n Fatih\n},\n title = {\n MAMo: Leveraging Memory and Attention for Monocular Video Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8754-8764\n} \n}" }, { "title": "MAP: Towards Balanced Generalization of IID and OOD through Model-Agnostic Adapters", @@ -34599,7 +35765,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Min and Yuan,\n Junkun and He,\n Yue and Li,\n Wenbin and Chen,\n Zhengyu and Kuang,\n Kun\n},\n title = {\n MAP: Towards Balanced Generalization of IID and OOD through Model-Agnostic Adapters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11921-11931\n} \n}" }, { "title": "MAPConNet: Self-supervised 3D Pose Transfer with Mesh and Point Contrastive Learning", @@ -34631,7 +35798,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;1", - "aff_country_unique": "United Kingdom;South Korea" + "aff_country_unique": "United Kingdom;South Korea", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Jiaze and Chen,\n Zhixiang and Kim,\n Tae-Kyun\n},\n title = {\n MAPConNet: Self-supervised 3D Pose Transfer with Mesh and Point Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14452-14462\n} \n}" }, { "title": "MARS: Model-agnostic Biased Object Removal without Additional Supervision for Weakly-Supervised Semantic Segmentation", @@ -34656,14 +35824,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jo_MARS_Model-agnostic_Biased_Object_Removal_without_Additional_Supervision_for_Weakly-Supervised_ICCV_2023_paper.html", "aff_unique_index": "0;1;2", - "aff_unique_norm": "OGQ;Samsung;Sungkyunkwan University", - "aff_unique_dep": ";Samsung Electronics;Department of Data Convergence and Future Medicine", + "aff_unique_norm": "OGQ;Samsung Electronics;Sungkyunkwan University", + "aff_unique_dep": ";;Department of Data Convergence and Future Medicine", "aff_unique_url": ";https://www.samsung.com;http://www.skku.edu", "aff_unique_abbr": ";Samsung;SKKU", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Seoul;Suwon", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jo_2023_ICCV,\n \n author = {\n Jo,\n Sanghyun and Yu,\n In-Jae and Kim,\n Kyungsu\n},\n title = {\n MARS: Model-agnostic Biased Object Removal without Additional Supervision for Weakly-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 614-623\n} \n}" }, { "title": "MAS: Towards 
Resource-Efficient Federated Multiple-Task Learning", @@ -34695,7 +35864,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2", - "aff_country_unique": "Japan;Singapore;China" + "aff_country_unique": "Japan;Singapore;China", + "bibtex": "@InProceedings{Zhuang_2023_ICCV,\n \n author = {\n Zhuang,\n Weiming and Wen,\n Yonggang and Lyu,\n Lingjuan and Zhang,\n Shuai\n},\n title = {\n MAS: Towards Resource-Efficient Federated Multiple-Task Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23414-23424\n} \n}" }, { "title": "MATE: Masked Autoencoders are Online 3D Test-Time Learners", @@ -34727,7 +35897,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Graz;", "aff_country_unique_index": "0;0;1;2;0;1;0;0;1;1;0+0", - "aff_country_unique": "Austria;South Korea;China" + "aff_country_unique": "Austria;South Korea;China", + "bibtex": "@InProceedings{Mirza_2023_ICCV,\n \n author = {\n Mirza,\n M. 
Jehanzeb and Shin,\n Inkyu and Lin,\n Wei and Schriebl,\n Andreas and Sun,\n Kunyang and Choe,\n Jaesung and Kozinski,\n Mateusz and Possegger,\n Horst and Kweon,\n In So and Yoon,\n Kuk-Jin and Bischof,\n Horst\n},\n title = {\n MATE: Masked Autoencoders are Online 3D Test-Time Learners\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16709-16718\n} \n}" }, { "title": "MAtch, eXpand and Improve: Unsupervised Finetuning for Zero-Shot Action Recognition with Language Knowledge", @@ -34752,14 +35923,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_MAtch_eXpand_and_Improve_Unsupervised_Finetuning_for_Zero-Shot_Action_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0;0;1;1;2+3;0", - "aff_unique_norm": "Graz University of Technology;IBM;Goethe University Frankfurt;University of Bonn", + "aff_unique_norm": "Graz University of Technology;MIT-IBM Watson AI Lab;Goethe University Frankfurt;University of Bonn", "aff_unique_dep": "Institute of Computer Graphics and Vision;AI Lab;;", "aff_unique_url": "https://www.tugraz.at;;https://www.uni-frankfurt.de;https://www.uni-bonn.de", "aff_unique_abbr": "TU Graz;MIT-IBM AI Lab;GU Frankfurt;UBonn", "aff_campus_unique_index": "0;0;0;;0", "aff_campus_unique": "Graz;", "aff_country_unique_index": "0;1;2;0;0;1;1;2+2;0", - "aff_country_unique": "Austria;United States;Germany" + "aff_country_unique": "Austria;United States;Germany", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Wei and Karlinsky,\n Leonid and Shvetsova,\n Nina and Possegger,\n Horst and Kozinski,\n Mateusz and Panda,\n Rameswar and Feris,\n Rogerio and Kuehne,\n Hilde and Bischof,\n Horst\n},\n title = {\n MAtch,\n eXpand and Improve: Unsupervised Finetuning for Zero-Shot Action Recognition with Language Knowledge\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2851-2862\n} \n}" }, { "title": "MB-TaylorFormer: Multi-Branch Efficient Transformer Expanded by Taylor Formula for Image Dehazing", @@ -34791,7 +35963,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Qiu_2023_ICCV,\n \n author = {\n Qiu,\n Yuwei and Zhang,\n Kaihao and Wang,\n Chenxi and Luo,\n Wenhan and Li,\n Hongdong and Jin,\n Zhi\n},\n title = {\n MB-TaylorFormer: Multi-Branch Efficient Transformer Expanded by Taylor Formula for Image Dehazing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12802-12813\n} \n}" }, { "title": "MBPTrack: Improving 3D Point Cloud Tracking with Memory Networks and Box Priors", @@ -34823,7 +35996,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Tian-Xing and Guo,\n Yuan-Chen and Lai,\n Yu-Kun and Zhang,\n Song-Hai\n},\n title = {\n MBPTrack: Improving 3D Point Cloud Tracking with Memory Networks and Box Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9911-9920\n} \n}" }, { "title": "MDCS: More Diverse Experts with Consistency Self-distillation for Long-tailed Recognition", @@ -34855,7 +36029,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0+1;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": 
"@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Qihao and Jiang,\n Chen and Hu,\n Wei and Zhang,\n Fan and Liu,\n Jun\n},\n title = {\n MDCS: More Diverse Experts with Consistency Self-distillation for Long-tailed Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11597-11608\n} \n}" }, { "title": "MEFLUT: Unsupervised 1D Lookup Tables for Multi-exposure Image Fusion", @@ -34887,7 +36062,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Ting and Wang,\n Chuan and Li,\n Xinpeng and Li,\n Ru and Fan,\n Haoqiang and Liu,\n Shuaicheng\n},\n title = {\n MEFLUT: Unsupervised 1D Lookup Tables for Multi-exposure Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10542-10551\n} \n}" }, { "title": "MEGA: Multimodal Alignment Aggregation and Distillation For Cinematic Video Segmentation", @@ -34911,15 +36087,16 @@ "email": "amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sadoughi_MEGA_Multimodal_Alignment_Aggregation_and_Distillation_For_Cinematic_Video_Segmentation_ICCV_2023_paper.html", - "aff_unique_index": "0;0;0;0;0;0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Prime Video", - "aff_unique_url": "https://www.primevideo.com", - "aff_unique_abbr": "Amazon Prime Video", + "aff_unique_index": "0;0;0;0;1;0;0;0", + "aff_unique_norm": "Amazon;Amazon Web Services", + "aff_unique_dep": "Prime Video;AWS AI Labs", + "aff_unique_url": "https://www.primevideo.com;https://aws.amazon.com", + 
"aff_unique_abbr": "Amazon Prime Video;AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sadoughi_2023_ICCV,\n \n author = {\n Sadoughi,\n Najmeh and Li,\n Xinyu and Vajpayee,\n Avijit and Fan,\n David and Shuai,\n Bing and Santos-Villalobos,\n Hector and Bhat,\n Vimal and MV,\n Rohith\n},\n title = {\n MEGA: Multimodal Alignment Aggregation and Distillation For Cinematic Video Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23331-23340\n} \n}" }, { "title": "MGMAE: Motion Guided Masking for Video Masked Autoencoding", @@ -34951,7 +36128,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Bingkun and Zhao,\n Zhiyu and Zhang,\n Guozhen and Qiao,\n Yu and Wang,\n Limin\n},\n title = {\n MGMAE: Motion Guided Masking for Video Masked Autoencoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13493-13504\n} \n}" }, { "title": "MHCN: A Hyperbolic Neural Network Model for Multi-view Hierarchical Clustering", @@ -34976,14 +36154,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_MHCN_A_Hyperbolic_Neural_Network_Model_for_Multi-view_Hierarchical_Clustering_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0;4+5", - "aff_unique_norm": "University of Electronic Science and Technology of China;Tencent;Independent Researcher;University of California, Davis;Pengcheng Laboratory;Harbin Institute of Technology", - "aff_unique_dep": 
";Tencent Security Big Data Lab;;;Peng Cheng Lab;", - "aff_unique_url": "https://www.uestc.edu.cn;https://www.tencent.com;;https://www.ucdavis.edu;;https://www.hit.edu.cn/", + "aff_unique_norm": "University of Electronic Science and Technology of China;Tencent;Independent Researcher;University of California, Davis;Peng Cheng Lab;Harbin Institute of Technology", + "aff_unique_dep": ";Tencent Security Big Data Lab;;;;", + "aff_unique_url": "https://www.uestc.edu.cn;https://www.tencent.com;;https://www.ucdavis.edu;;http://en.hit.edu.cn/", "aff_unique_abbr": "UESTC;Tencent;;UC Davis;;HIT", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Davis;Shenzhen", "aff_country_unique_index": "0;0;2;0;0+0", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Fangfei and Bai,\n Bing and Guo,\n Yiwen and Chen,\n Hao and Ren,\n Yazhou and Xu,\n Zenglin\n},\n title = {\n MHCN: A Hyperbolic Neural Network Model for Multi-view Hierarchical Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16525-16535\n} \n}" }, { "title": "MHEntropy: Entropy Meets Multiple Hypotheses for Pose and Shape Recovery", @@ -35015,7 +36194,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Rongyu and Yang,\n Linlin and Yao,\n Angela\n},\n title = {\n MHEntropy: Entropy Meets Multiple Hypotheses for Pose and Shape Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14840-14849\n} \n}" }, { "title": "MI-GAN: A Simple Baseline for Image Inpainting on Mobile 
Devices", @@ -35038,7 +36218,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sargsyan_MI-GAN_A_Simple_Baseline_for_Image_Inpainting_on_Mobile_Devices_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sargsyan_MI-GAN_A_Simple_Baseline_for_Image_Inpainting_on_Mobile_Devices_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Sargsyan_2023_ICCV,\n \n author = {\n Sargsyan,\n Andranik and Navasardyan,\n Shant and Xu,\n Xingqian and Shi,\n Humphrey\n},\n title = {\n MI-GAN: A Simple Baseline for Image Inpainting on Mobile Devices\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7335-7345\n} \n}" }, { "title": "MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural Radiance Fields", @@ -35046,6 +36227,7 @@ "status": "Poster", "track": "main", "pid": "11569", + "author_site": "Takuhiro Kaneko", "author": "Takuhiro Kaneko", "abstract": "Neural radiance fields (NeRFs) have shown impressive results for novel view synthesis. However, they depend on the repetitive use of a single-input single-output multilayer perceptron (SISO MLP) that maps 3D coordinates and view direction to the color and volume density in a sample-wise manner, which slows the rendering. We propose a multi-input multi-output NeRF (MIMO-NeRF) that reduces the number of MLPs running by replacing the SISO MLP with a MIMO MLP and conducting mappings in a group-wise manner. One notable challenge with this approach is that the color and volume density of each point can differ according to a choice of input coordinates in a group, which can lead to some notable ambiguity. We also propose a self-supervised learning method that regularizes the MIMO MLP with multiple fast reformulated MLPs to alleviate this ambiguity without using pretrained models. 
The results of a comprehensive experimental evaluation including comparative and ablation studies are presented to show that MIMO-NeRF obtains a good trade-off between speed and quality with a reasonable training time. We then demonstrate that MIMO-NeRF is compatible with and complementary to previous advancements in NeRFs by applying it to two representative fast NeRFs, i.e., a NeRF with a sampling network (DONeRF) and a NeRF with alternative representations (TensoRF).", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Kaneko_MIMO-NeRF_Fast_Neural_Rendering_with_Multi-input_Multi-output_Neural_Radiance_Fields_ICCV_2023_paper.pdf", @@ -35067,7 +36249,8 @@ "aff_unique_url": "https://www.ntt.co.jp", "aff_unique_abbr": "NTT", "aff_country_unique_index": "0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Kaneko_2023_ICCV,\n \n author = {\n Kaneko,\n Takuhiro\n},\n title = {\n MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3273-3283\n} \n}" }, { "title": "MMST-ViT: Climate Change-aware Crop Yield Prediction via Multi-Modal Spatial-Temporal Vision Transformer", @@ -35099,7 +36282,8 @@ "aff_campus_unique_index": "1;1;1;1;1;1;1;1", "aff_campus_unique": ";Lafayette", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Fudong and Crawford,\n Summer and Guillot,\n Kaleb and Zhang,\n Yihe and Chen,\n Yan and Yuan,\n Xu and Chen,\n Li and Williams,\n Shelby and Minvielle,\n Robert and Xiao,\n Xiangming and Gholson,\n Drew and Ashwell,\n Nicolas and Setiyono,\n Tri and Tubana,\n Brenda and Peng,\n Lu and Bayoumi,\n Magdy and Tzeng,\n Nian-Feng\n},\n title = 
{\n MMST-ViT: Climate Change-aware Crop Yield Prediction via Multi-Modal Spatial-Temporal Vision Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5774-5784\n} \n}" }, { "title": "MMVP: Motion-Matrix-Based Video Prediction", @@ -35124,14 +36308,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhong_MMVP_Motion-Matrix-Based_Video_Prediction_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;0+1", - "aff_unique_norm": "University of Southern California;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "University of Southern California;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.usc.edu;https://www.microsoft.com", "aff_unique_abbr": "USC;Microsoft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhong_2023_ICCV,\n \n author = {\n Zhong,\n Yiqi and Liang,\n Luming and Zharkov,\n Ilya and Neumann,\n Ulrich\n},\n title = {\n MMVP: Motion-Matrix-Based Video Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4273-4283\n} \n}" }, { "title": "MODA: Mapping-Once Audio-driven Portrait Animation with Dual Attentions", @@ -35163,7 +36348,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;1", - "aff_country_unique": ";United States" + "aff_country_unique": ";United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yunfei and Lin,\n Lijian and Yu,\n Fei and Zhou,\n Changyin and Li,\n Yu\n},\n title = {\n MODA: Mapping-Once Audio-driven Portrait Animation with Dual Attentions\n},\n booktitle = 
{\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23020-23029\n} \n}" }, { "title": "MOSE: A New Dataset for Video Object Segmentation in Complex Scenes", @@ -35195,7 +36381,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;2", - "aff_country_unique": "Singapore;United Kingdom;China" + "aff_country_unique": "Singapore;United Kingdom;China", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n Henghui and Liu,\n Chang and He,\n Shuting and Jiang,\n Xudong and Torr,\n Philip H.S. and Bai,\n Song\n},\n title = {\n MOSE: A New Dataset for Video Object Segmentation in Complex Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20224-20234\n} \n}" }, { "title": "MOST: Multiple Object Localization with Self-Supervised Transformers for Object Discovery", @@ -35220,14 +36407,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Rambhatla_MOST_Multiple_Object_Localization_with_Self-Supervised_Transformers_for_Object_Discovery_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;2+1;1", - "aff_unique_norm": "Meta;University of Maryland;Johns Hopkins University", - "aff_unique_dep": "Meta Platforms, Inc.;;", + "aff_unique_norm": "Meta Platforms, Inc.;University of Maryland;Johns Hopkins University", + "aff_unique_dep": ";;", "aff_unique_url": "https://meta.com;https://www/umd.edu;https://www.jhu.edu", "aff_unique_abbr": "Meta;UMD;JHU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0+0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Rambhatla_2023_ICCV,\n \n author = {\n Rambhatla,\n Sai Saketh and Misra,\n Ishan and Chellappa,\n Rama and 
Shrivastava,\n Abhinav\n},\n title = {\n MOST: Multiple Object Localization with Self-Supervised Transformers for Object Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15823-15834\n} \n}" }, { "title": "MPCViT: Searching for Accurate and Efficient MPC-Friendly Vision Transformer with Heterogeneous Attention", @@ -35259,7 +36447,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zeng_2023_ICCV,\n \n author = {\n Zeng,\n Wenxuan and Li,\n Meng and Xiong,\n Wenjie and Tong,\n Tong and Lu,\n Wen-jie and Tan,\n Jin and Wang,\n Runsheng and Huang,\n Ru\n},\n title = {\n MPCViT: Searching for Accurate and Efficient MPC-Friendly Vision Transformer with Heterogeneous Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5052-5063\n} \n}" }, { "title": "MPI-Flow: Learning Realistic Optical Flow with Multiplane Images", @@ -35267,6 +36456,7 @@ "status": "Poster", "track": "main", "pid": "3193", + "author_site": "Yingping Liang, Jiaming Liu, Debing Zhang, Ying Fu", "author": "Yingping Liang, Jiaming Liu, Debing Zhang, Ying Fu", "abstract": "The accuracy of learning-based optical flow estimation models heavily relies on the realism of the training datasets. Current approaches for generating such datasets either employ synthetic data or generate images with limited realism. However, the domain gap of these data with real-world scenes constrains the generalization of the trained model to real-world applications. To address this issue, we investigate generating realistic optical flow datasets from real-world images. 
Firstly, to generate highly realistic new images, we construct a layered depth representation, known as multiplane images (MPI), from single-view images. This allows us to generate novel view images that are highly realistic. To generate optical flow maps that correspond accurately to the new image, we calculate the optical flows of each plane using the camera matrix and plane depths. We then project these layered optical flows into the output optical flow map with volume rendering. Secondly, to ensure the realism of motion, we present an independent object motion module that can separate the camera and dynamic object motion in MPI. This module addresses the deficiency in MPI-based single-view methods, where optical flow is generated only by camera motion and does not account for any object movement. We additionally devise a depth-aware inpainting module to merge new images with dynamic objects and address unnatural motion occlusions. We show the superior performance of our method through extensive experiments on real-world datasets. Moreover, our approach achieves state-of-the-art performance in both unsupervised and supervised training of learning-based models. 
The code will be made publicly available at: https://github.com/Sharpiless/MPI-Flow.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liang_MPI-Flow_Learning_Realistic_Optical_Flow_with_Multiplane_Images_ICCV_2023_paper.pdf", @@ -35278,7 +36468,8 @@ "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2303052683306804209&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_MPI-Flow_Learning_Realistic_Optical_Flow_with_Multiplane_Images_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liang_MPI-Flow_Learning_Realistic_Optical_Flow_with_Multiplane_Images_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Yingping and Liu,\n Jiaming and Zhang,\n Debing and Fu,\n Ying\n},\n title = {\n MPI-Flow: Learning Realistic Optical Flow with Multiplane Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13857-13868\n} \n}" }, { "title": "MRM: Masked Relation Modeling for Medical Image Pre-Training with Genetics", @@ -35301,7 +36492,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_MRM_Masked_Relation_Modeling_for_Medical_Image_Pre-Training_with_Genetics_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_MRM_Masked_Relation_Modeling_for_Medical_Image_Pre-Training_with_Genetics_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Qiushi and Li,\n Wuyang and Li,\n Baopu and Yuan,\n Yixuan\n},\n title = {\n MRM: Masked Relation Modeling for Medical Image Pre-Training with Genetics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 21452-21462\n} \n}" }, { "title": "MRN: Multiplexed Routing Network for Incremental Multilingual Text Recognition", @@ -35309,6 +36501,7 @@ "status": "Poster", "track": "main", "pid": "7472", + "author_site": "Tianlun Zheng, Zhineng Chen, Bingchen Huang, Wei Zhang, Yu-Gang Jiang", "author": "Tianlun Zheng, Zhineng Chen, Bingchen Huang, Wei Zhang, Yu-Gang Jiang", "abstract": "Multilingual text recognition (MLTR) systems typically focus on a fixed set of languages, which makes it difficult to handle newly added languages or adapt to ever-changing data distribution. In this paper, we propose the Incremental MLTR (IMLTR) task in the context of incremental learning (IL), where different languages are introduced in batches. IMLTR is particularly challenging due to rehearsal-imbalance, which refers to the uneven distribution of sample characters in the rehearsal set, used to retain a small amount of old data as past memories. To address this issue, we propose a Multiplexed Routing Network (MRN). MRN trains a recognizer for each language that is currently seen. Subsequently, a language domain predictor is learned based on the rehearsal set to weigh the recognizers. Since the recognizers are derived from the original data, MRN effectively reduces the reliance on older data and better fights against catastrophic forgetting, the core issue in IL. We extensively evaluate MRN on MLT17 and MLT19 datasets. It outperforms existing general-purpose IL methods by large margins, with average accuracy improvements ranging from 10.3% to 35.8% under different settings. 
Code is available at https://github.com/simplify23/MRN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zheng_MRN_Multiplexed_Routing_Network_for_Incremental_Multilingual_Text_Recognition_ICCV_2023_paper.pdf", @@ -35320,7 +36513,8 @@ "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16615925003560359897&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_MRN_Multiplexed_Routing_Network_for_Incremental_Multilingual_Text_Recognition_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_MRN_Multiplexed_Routing_Network_for_Incremental_Multilingual_Text_Recognition_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Tianlun and Chen,\n Zhineng and Huang,\n Bingchen and Zhang,\n Wei and Jiang,\n Yu-Gang\n},\n title = {\n MRN: Multiplexed Routing Network for Incremental Multilingual Text Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18644-18653\n} \n}" }, { "title": "MSI: Maximize Support-Set Information for Few-Shot Segmentation", @@ -35345,22 +36539,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Moon_MSI_Maximize_Support-Set_Information_for_Few-Shot_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0;3;0", - "aff_unique_norm": "Rutgers University;NEC Laboratories America;College of New Jersey;Mohamed bin Zayed University of Artificial Intelligence", + "aff_unique_norm": "Rutgers University;NEC Laboratories America;The College of New Jersey;Mohamed Bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.rutgers.edu;https://www.nec-labs.com;https://www.tcnj.edu;https://www.mbzuai.ac.ae", "aff_unique_abbr": "Rutgers;NEC Labs America;TCNJ;MBZUAI", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "United States;United Arab Emirates" - }, - { - "title": "MSRA-SR: Image Super-resolution Transformer with Multi-scale Shared Representation Acquisition", - "session": "Low-level and physics-based vision", - "author": "Xiaoqiang Zhou, Huaibo Huang, Ran He, Zilei Wang, Jie Hu, Tieniu Tan", - "status": "Poster", - "track": "main", - "pid": "1531" + "aff_country_unique": "United States;United Arab Emirates", + "bibtex": "@InProceedings{Moon_2023_ICCV,\n \n author = {\n Moon,\n Seonghyeon and Sohn,\n Samuel S. and Zhou,\n Honglu and Yoon,\n Sejong and Pavlovic,\n Vladimir and Khan,\n Muhammad Haris and Kapadia,\n Mubbasir\n},\n title = {\n MSI: Maximize Support-Set Information for Few-Shot Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19266-19276\n} \n}" }, { "title": "MST-compression: Compressing and Accelerating Binary Neural Networks with Minimum Spanning Tree", @@ -35392,7 +36579,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Vo_2023_ICCV,\n \n author = {\n Vo,\n Quang Hieu and Tran,\n Linh-Tam and Bae,\n Sung-Ho and Kim,\n Lok-Won and Hong,\n Choong Seon\n},\n title = {\n MST-compression: Compressing and Accelerating Binary Neural Networks with Minimum Spanning Tree\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6091-6100\n} \n}" }, { "title": "MULLER: Multilayer Laplacian Resizer for Vision", @@ -35424,7 +36612,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Tu_2023_ICCV,\n \n author = {\n Tu,\n Zhengzhong and Milanfar,\n Peyman and Talebi,\n Hossein\n},\n title = {\n MULLER: Multilayer Laplacian Resizer for Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6877-6887\n} \n}" }, { "title": "MUVA: A New Large-Scale Benchmark for Multi-View Amodal Instance Segmentation in the Shopping Scenario", @@ -35456,7 +36645,8 @@ "aff_campus_unique_index": "0;0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;1;1;1;0+0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zhixuan and Ye,\n Weining and Terven,\n Juan and Bennett,\n Zachary and Zheng,\n Ying and Jiang,\n Tingting and Huang,\n Tiejun\n},\n title = {\n MUVA: A New Large-Scale Benchmark for Multi-View Amodal Instance Segmentation in the Shopping Scenario\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23504-23513\n} \n}" }, { "title": "MUter: Machine Unlearning on Adversarially Trained Models", @@ -35468,7 +36658,7 @@ "author": "Junxu Liu; Mingsheng Xue; Jian Lou; Xiaoyu Zhang; Li Xiong; Zhan Qin", "abstract": "Machine unlearning is an emerging task of removing the influence of selected training datapoints from a trained model upon data deletion requests, which echoes the widely enforced data regulations mandating the Right to be Forgotten. Many unlearning methods have been proposed recently, achieving significant efficiency gains over the naive baseline of retraining from scratch. 
However, existing methods focus exclusively on unlearning from standard training models and do not apply to adversarial training models (ATMs) despite their popularity as effective defenses against adversarial examples. During adversarial training, the training data are involved in not only an outer loop for minimizing the training loss, but also an inner loop for generating the adversarial perturbation. Such bi-level optimization greatly complicates the influence measure for the data to be deleted and renders the unlearning more challenging than standard model training with single-level optimization. \n This paper proposes a new approach called MUter for unlearning from ATMs. We derive a closed-form unlearning step underpinned by a total Hessian-related data influence measure, while existing methods can mis-capture the data influence associated with the indirect Hessian part. We further alleviate the computational cost by introducing a series of approximations and conversions to avoid the most computationally demanding parts of Hessian inversions. 
The efficiency and effectiveness of MUter have been validated through experiments on four datasets using both linear and neural network models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_MUter_Machine_Unlearning_on_Adversarially_Trained_Models_ICCV_2023_paper.pdf", - "aff": "Renmin University of China; Guangzhou Institute of Technology, Xidian University; Zhejiang University+ZJU-Hangzhou Global Scienti\ufb01c and Technological Innovation Center; Xidian University; Emory University; Zhejiang University+ZJU-Hangzhou Global Scienti\ufb01c and Technological Innovation Center", + "aff": "Renmin University of China; Guangzhou Institute of Technology, Xidian University; Zhejiang University+ZJU-Hangzhou Global Scientific and Technological Innovation Center; Xidian University; Emory University; Zhejiang University+ZJU-Hangzhou Global Scientific and Technological Innovation Center", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Liu_MUter_Machine_Unlearning_ICCV_2023_supplemental.pdf", @@ -35488,7 +36678,8 @@ "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Guangzhou;Hangzhou", "aff_country_unique_index": "0;0;0+0;0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Junxu and Xue,\n Mingsheng and Lou,\n Jian and Zhang,\n Xiaoyu and Xiong,\n Li and Qin,\n Zhan\n},\n title = {\n MUter: Machine Unlearning on Adversarially Trained Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4892-4902\n} \n}" }, { "title": "MV-DeepSDF: Implicit Modeling with Multi-Sweep Point Clouds for 3D Vehicle Reconstruction in Autonomous Driving", @@ -35500,7 +36691,7 @@ "author": "Yibo Liu; Kelly Zhu; Guile Wu; Yuan Ren; Bingbing Liu; Yang Liu; Jinjun Shan", "abstract": 
"Reconstructing 3D vehicles from noisy and sparse partial point clouds is of great significance to autonomous driving. Most existing 3D reconstruction methods cannot be directly applied to this problem because they are elaborately designed to deal with dense inputs with trivial noise. In this work, we propose a novel framework, dubbed MV-DeepSDF, which estimates the optimal Signed Distance Function (SDF) shape representation from multi-sweep point clouds\n to reconstruct vehicles in the wild. Although there have been some SDF-based implicit modeling methods, they only focus on single-view-based reconstruction, resulting in low fidelity. In contrast, we first analyze multi-sweep consistency and complementarity in the latent feature space and propose to transform the implicit space shape estimation problem into an element-to-set feature extraction problem. Then, we devise a new architecture to extract individual element-level representations and aggregate them to generate a set-level predicted latent code. This set-level latent code is an expression of the optimal 3D shape in the implicit space, and can be subsequently decoded to a continuous SDF of the vehicle. In this way, our approach learns consistent and complementary information among multi-sweeps for 3D vehicle reconstruction. 
We conduct thorough experiments on two real-world autonomous driving datasets (Waymo and KITTI) to demonstrate the superiority of our approach over state-of-the-art alternative methods both qualitatively and quantitatively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_MV-DeepSDF_Implicit_Modeling_with_Multi-Sweep_Point_Clouds_for_3D_Vehicle_ICCV_2023_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab+York University; Huawei Noah\u2019s Ark Lab+University of Toronto; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; York University", + "aff": "Huawei Noah’s Ark Lab+York University; Huawei Noah’s Ark Lab+University of Toronto; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; York University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Liu_MV-DeepSDF_Implicit_Modeling_ICCV_2023_supplemental.pdf", @@ -35514,13 +36705,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_MV-DeepSDF_Implicit_Modeling_with_Multi-Sweep_Point_Clouds_for_3D_Vehicle_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+2;0;0;0;0;1", "aff_unique_norm": "Huawei;York University;University of Toronto", - "aff_unique_dep": "Noah\u2019s Ark Lab;;", + "aff_unique_dep": "Noah’s Ark Lab;;", "aff_unique_url": "https://www.huawei.com;https://www.yorku.ca;https://www.utoronto.ca", "aff_unique_abbr": "Huawei;York U;U of T", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;0;0;0;1", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yibo and Zhu,\n Kelly and Wu,\n Guile and Ren,\n Yuan and Liu,\n Bingbing and Liu,\n Yang and Shan,\n Jinjun\n},\n title = {\n MV-DeepSDF: Implicit Modeling with Multi-Sweep Point Clouds for 3D Vehicle Reconstruction in Autonomous Driving\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8306-8316\n} \n}" }, { "title": "MV-Map: Offboard HD-Map Generation with Multi-view Consistency", @@ -35545,14 +36737,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xie_MV-Map_Offboard_HD-Map_Generation_with_Multi-view_Consistency_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Fudan University", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Fudan University", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.fudan.edu.cn", "aff_unique_abbr": "UIUC;Fudan", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Ziyang and Pang,\n Ziqi and Wang,\n Yu-Xiong\n},\n title = {\n MV-Map: Offboard HD-Map Generation with Multi-view Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8658-8668\n} \n}" }, { "title": "MVPSNet: Fast Generalizable Multi-view Photometric Stereo", @@ -35584,7 +36777,8 @@ "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Chapel Hill;College Park", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Dongxu and Lichy,\n Daniel and Perrin,\n Pierre-Nicolas and Frahm,\n Jan-Michael and Sengupta,\n Soumyadip\n},\n title = {\n MVPSNet: Fast Generalizable Multi-view Photometric Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12525-12536\n} \n}" }, { "title": "MagicFusion: Boosting Text-to-Image Generation Performance by Fusing Diffusion Models", @@ -35609,14 +36803,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_MagicFusion_Boosting_Text-to-Image_Generation_Performance_by_Fusing_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;0", - "aff_unique_norm": "National University of Defense Technology;JD", - "aff_unique_dep": ";JD Explore Academy", + "aff_unique_norm": "National University of Defense Technology;JD Explore Academy", + "aff_unique_dep": ";", "aff_unique_url": "http://www.nudt.edu.cn/;", "aff_unique_abbr": "NUDT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Jing and Zheng,\n Heliang and Wang,\n Chaoyue and Lan,\n Long and Yang,\n Wenjing\n},\n title = {\n MagicFusion: Boosting Text-to-Image Generation Performance by Fusing Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22592-22602\n} \n}" }, { "title": "Make Encoder Great Again in 3D GAN Inversion through Geometry and Occlusion-Aware Encoding", @@ -35648,7 +36843,8 @@ "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Shenzhen;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Ziyang and Zhu,\n Yiming and Li,\n Yu and Liu,\n Hongyu and Yuan,\n Chun\n},\n title = {\n Make Encoder Great Again in 3D GAN Inversion through Geometry and Occlusion-Aware Encoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2437-2447\n} \n}" }, { "title": "Make-An-Animation: Large-Scale Text-conditional 3D Human Motion Generation", @@ -35673,14 +36869,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Azadi_Make-An-Animation_Large-Scale_Text-conditional_3D_Human_Motion_Generation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Meta", - "aff_unique_dep": "Meta Platforms, Inc.", + "aff_unique_norm": "Meta Platforms, Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Azadi_2023_ICCV,\n \n author = {\n Azadi,\n Samaneh and Shah,\n Akbar and Hayes,\n Thomas and Parikh,\n Devi and Gupta,\n Sonal\n},\n title = {\n Make-An-Animation: Large-Scale Text-conditional 3D Human Motion Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15039-15048\n} \n}" }, { "title": "Make-It-3D: High-fidelity 3D Creation from A Single Image with Diffusion Prior", @@ -35705,14 +36902,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tang_Make-It-3D_High-fidelity_3D_Creation_from_A_Single_Image_with_Diffusion_ICCV_2023_paper.html", "aff_unique_index": "0+1;2+1;1;1;0;0+1;1", - "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft;Hong Kong University of Science and Technology", + "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft Corporation;Hong Kong University of Science and Technology", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com/en-us/research;https://www.ust.hk", 
"aff_unique_abbr": "SJTU;MSR;HKUST", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+1;0+1;1;1;0;0+1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Junshu and Wang,\n Tengfei and Zhang,\n Bo and Zhang,\n Ting and Yi,\n Ran and Ma,\n Lizhuang and Chen,\n Dong\n},\n title = {\n Make-It-3D: High-fidelity 3D Creation from A Single Image with Diffusion Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22819-22829\n} \n}" }, { "title": "Manipulate by Seeing: Creating Manipulation Controllers from Pre-Trained Representations", @@ -35744,7 +36942,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jianren and Dasari,\n Sudeep and Srirama,\n Mohan Kumar and Tulsiani,\n Shubham and Gupta,\n Abhinav\n},\n title = {\n Manipulate by Seeing: Creating Manipulation Controllers from Pre-Trained Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3859-3868\n} \n}" }, { "title": "MapFormer: Boosting Change Detection by Using Pre-change Information", @@ -35752,8 +36951,8 @@ "status": "Poster", "track": "main", "pid": "10565", - "author_site": "Maximilian Bernhard, Niklas Strau\u00df, Matthias Schubert", - "author": "Maximilian Bernhard; Niklas Strau\u00df; Matthias Schubert", + "author_site": "Maximilian Bernhard, Niklas Strauß, Matthias Schubert", + "author": "Maximilian Bernhard; Niklas Strauß; Matthias Schubert", "abstract": "Change detection in remote sensing 
imagery is essential for a variety of applications such as urban planning, disaster management, and climate research. However, existing methods for identifying semantically changed areas overlook the availability of semantic information in the form of existing maps describing features of the earth's surface. In this paper, we leverage this information for change detection in bi-temporal images. We show that the simple integration of the additional information via concatenation of latent representations suffices to significantly outperform state-of-the-art change detection methods. Motivated by this observation, we propose the new task of Conditional Change Detection, where pre-change semantic information is used as input next to bi-temporal images. To fully exploit the extra information, we propose MapFormer, a novel architecture based on a multi-modal feature fusion module that allows for feature processing conditioned on the available semantic information. We further employ a supervised, cross-modal contrastive loss to guide the learning of visual representations. Our approach outperforms existing change detection methods by an absolute 11.7% and 18.4% in terms of binary change IoU on DynamicEarthNet and HRSCD, respectively. Furthermore, we demonstrate the robustness of our approach to the quality of the pre-change semantic information and the absence pre-change imagery. 
The code is available at https://github.com/mxbh/mapformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bernhard_MapFormer_Boosting_Change_Detection_by_Using_Pre-change_Information_ICCV_2023_paper.pdf", "aff": "LMU Munich, MCML; LMU Munich, MCML; LMU Munich, MCML", @@ -35776,7 +36975,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Bernhard_2023_ICCV,\n \n author = {\n Bernhard,\n Maximilian and Strau{\\ss\n},\n Niklas and Schubert,\n Matthias\n},\n title = {\n MapFormer: Boosting Change Detection by Using Pre-change Information\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16837-16846\n} \n}" }, { "title": "MapPrior: Bird's-Eye View Map Layout Estimation with Generative Models", @@ -35799,7 +36999,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_MapPrior_Birds-Eye_View_Map_Layout_Estimation_with_Generative_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_MapPrior_Birds-Eye_View_Map_Layout_Estimation_with_Generative_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Xiyue and Zyrianov,\n Vlas and Liu,\n Zhijian and Wang,\n Shenlong\n},\n title = {\n MapPrior: Bird's-Eye View Map Layout Estimation with Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8228-8239\n} \n}" }, { "title": "March in Chat: Interactive Prompting for Remote Embodied Referring Expression", @@ -35811,7 +37012,7 @@ "author": "Yanyuan Qiao; Yuankai Qi; Zheng Yu; Jing Liu; Qi Wu", "abstract": "Many 
Vision-and-Language Navigation (VLN) tasks have been proposed in recent years, from room-based to object-based and indoor to outdoor. The REVERIE (Remote Embodied Referring Expression) is interesting since it only provides high-level instructions to the agent, which are closer to human commands in practice. Nevertheless, this poses more challenges than other VLN tasks since it requires agents to infer a navigation plan only based on a short instruction. Large Language Models (LLMs) show great potential in robot action planning by providing proper prompts. Still, this strategy has not been explored under the REVERIE settings. There are several new challenges. For example, the LLM should be environment-aware so that the navigation plan can be adjusted based on the current visual observation. Moreover, the LLM planned actions should be adaptable to the much larger and more complex REVERIE environment. This paper proposes a March-in-Chat (MiC) model that can talk to the LLM on the fly and plan dynamically based on a newly proposed Room-and-Object Aware Scene Perceiver (ROASP). Our MiC model outperforms the previous state-of-the-art by large margins by SPL and RGSPL metrics on the REVERIE benchmark. 
The source code is available at https://github.com/YanyuanQiao/MiC", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Qiao_March_in_Chat_Interactive_Prompting_for_Remote_Embodied_Referring_Expression_ICCV_2023_paper.pdf", - "aff": "Australian Institute for Machine Learning, The University of Adelaide; Australian Institute for Machine Learning, The University of Adelaide; Australian Institute for Machine Learning, The University of Adelaide; Institute of Automation, Chinese Academy of Sciences + School of Arti\ufb01cial Intelligence, University of Chinese Academy of Sciences; Australian Institute for Machine Learning, The University of Adelaide", + "aff": "Australian Institute for Machine Learning, The University of Adelaide; Australian Institute for Machine Learning, The University of Adelaide; Australian Institute for Machine Learning, The University of Adelaide; Institute of Automation, Chinese Academy of Sciences + School of Artificial Intelligence, University of Chinese Academy of Sciences; Australian Institute for Machine Learning, The University of Adelaide", "project": "", "github": "https://github.com/YanyuanQiao/MiC", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Qiao_March_in_Chat_ICCV_2023_supplemental.pdf", @@ -35824,14 +37025,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qiao_March_in_Chat_Interactive_Prompting_for_Remote_Embodied_Referring_Expression_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1+2;0", - "aff_unique_norm": "University of Adelaide;Chinese Academy of Sciences;University of Chinese Academy of Sciences", - "aff_unique_dep": "Australian Institute for Machine Learning;Institute of Automation;School of Arti\ufb01cial Intelligence", + "aff_unique_norm": "The University of Adelaide;Chinese Academy of Sciences;University of Chinese Academy of Sciences", + "aff_unique_dep": "Australian Institute for Machine Learning;Institute of Automation;School of Artificial 
Intelligence", "aff_unique_url": "https://www.adelaide.edu.au;http://www.ia.cas.cn;http://www.ucas.ac.cn", "aff_unique_abbr": "Adelaide;CAS;UCAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1+1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Qiao_2023_ICCV,\n \n author = {\n Qiao,\n Yanyuan and Qi,\n Yuankai and Yu,\n Zheng and Liu,\n Jing and Wu,\n Qi\n},\n title = {\n March in Chat: Interactive Prompting for Remote Embodied Referring Expression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15758-15767\n} \n}" }, { "title": "Markov Game Video Augmentation for Action Segmentation", @@ -35863,7 +37065,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aziere_2023_ICCV,\n \n author = {\n Aziere,\n Nicolas and Todorovic,\n Sinisa\n},\n title = {\n Markov Game Video Augmentation for Action Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13505-13514\n} \n}" }, { "title": "MasQCLIP for Open-Vocabulary Universal Image Segmentation", @@ -35886,7 +37089,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_MasQCLIP_for_Open-Vocabulary_Universal_Image_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_MasQCLIP_for_Open-Vocabulary_Universal_Image_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Xin and Xiong,\n Tianyi and Ding,\n Zheng and Tu,\n Zhuowen\n},\n title = {\n MasQCLIP for 
Open-Vocabulary Universal Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 887-898\n} \n}" }, { "title": "MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing", @@ -35918,7 +37122,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0+1", - "aff_country_unique": "Japan;China" + "aff_country_unique": "Japan;China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Mingdeng and Wang,\n Xintao and Qi,\n Zhongang and Shan,\n Ying and Qie,\n Xiaohu and Zheng,\n Yinqiang\n},\n title = {\n MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22560-22570\n} \n}" }, { "title": "Mask-Attention-Free Transformer for 3D Instance Segmentation", @@ -35943,14 +37148,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lai_Mask-Attention-Free_Transformer_for_3D_Instance_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;1;0+2", - "aff_unique_norm": "Chinese University of Hong Kong;Microsoft;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;Microsoft Research;SmartMore", "aff_unique_dep": ";Research;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.microsoft.com/en-us/research/group/asia;", "aff_unique_abbr": "CUHK;MSR Asia;", "aff_campus_unique_index": "0;1;0;0;1;0", "aff_campus_unique": "Hong Kong SAR;Asia;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Lai_2023_ICCV,\n \n author = {\n Lai,\n Xin and Yuan,\n Yuhui and Chu,\n Ruihang and Chen,\n Yukang and Hu,\n Han and Jia,\n 
Jiaya\n},\n title = {\n Mask-Attention-Free Transformer for 3D Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3693-3703\n} \n}" }, { "title": "Masked Autoencoders Are Stronger Knowledge Distillers", @@ -35958,6 +37164,7 @@ "status": "Poster", "track": "main", "pid": "8816", + "author_site": "Shanshan Lao, Guanglu Song, Boxiao Liu, Yu Liu, Yujiu Yang", "author": "Shanshan Lao, Guanglu Song, Boxiao Liu, Yu Liu, Yujiu Yang", "abstract": "Knowledge distillation (KD) has shown great success in improving student's performance by mimicking the intermediate output of the high-capacity teacher in fine-grained visual tasks, e.g. object detection. This paper proposes a technique called Masked Knowledge Distillation (MKD) that enhances this process using a masked autoencoding scheme. In MKD, random patches of the input image are masked, and the corresponding missing feature is recovered by forcing it to imitate the output of the teacher. MKD is based on two core designs. First, using the student as the encoder, we develop an adaptive decoder architecture, which includes a spatial alignment module that operates on the multi-scale features in the feature pyramid network (FPN), a simple decoder, and a spatial recovery module that mimics the teacher's output from the latent representation and mask tokens. Second, we introduce the masked convolution in each convolution block to keep the masked patches unaffected by others. By coupling these two designs, we can further improve the completeness and effectiveness of teacher knowledge learning. We conduct extensive experiments on different architectures with object detection and semantic segmentation. The results show that all the students can achieve further improvements compared to the conventional KD. 
Notably, we establish the new state-of-the-art results by boosting RetinaNet ResNet-18, and ResNet-50 from 33.4 to 37.5 mAP, and 37.4 to 41.5 mAP, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lao_Masked_Autoencoders_Are_Stronger_Knowledge_Distillers_ICCV_2023_paper.pdf", @@ -35969,7 +37176,8 @@ "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14788118741692308655&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lao_Masked_Autoencoders_Are_Stronger_Knowledge_Distillers_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lao_Masked_Autoencoders_Are_Stronger_Knowledge_Distillers_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lao_2023_ICCV,\n \n author = {\n Lao,\n Shanshan and Song,\n Guanglu and Liu,\n Boxiao and Liu,\n Yu and Yang,\n Yujiu\n},\n title = {\n Masked Autoencoders Are Stronger Knowledge Distillers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6384-6393\n} \n}" }, { "title": "Masked Autoencoders are Efficient Class Incremental Learners", @@ -36001,7 +37209,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;1;0;0", - "aff_country_unique": "China;Italy" + "aff_country_unique": "China;Italy", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Jiang-Tian and Liu,\n Xialei and Bagdanov,\n Andrew D. 
and Li,\n Ke and Cheng,\n Ming-Ming\n},\n title = {\n Masked Autoencoders are Efficient Class Incremental Learners\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19104-19113\n} \n}" }, { "title": "Masked Diffusion Transformer is a Strong Image Synthesizer", @@ -36033,7 +37242,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Shanghua and Zhou,\n Pan and Cheng,\n Ming-Ming and Yan,\n Shuicheng\n},\n title = {\n Masked Diffusion Transformer is a Strong Image Synthesizer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23164-23173\n} \n}" }, { "title": "Masked Motion Predictors are Strong 3D Action Representation Learners", @@ -36065,7 +37275,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0+0;1;0+0;0;0;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Mao_2023_ICCV,\n \n author = {\n Mao,\n Yunyao and Deng,\n Jiajun and Zhou,\n Wengang and Fang,\n Yao and Ouyang,\n Wanli and Li,\n Houqiang\n},\n title = {\n Masked Motion Predictors are Strong 3D Action Representation Learners\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10181-10191\n} \n}" }, { "title": "Masked Retraining Teacher-Student Framework for Domain Adaptive Object Detection", @@ -36097,7 +37308,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": 
"China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Zijing and Wei,\n Sitong and Chen,\n Qingchao and Li,\n Dehui and Yang,\n Yifan and Peng,\n Yuxin and Liu,\n Yang\n},\n title = {\n Masked Retraining Teacher-Student Framework for Domain Adaptive Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19039-19049\n} \n}" }, { "title": "Masked Spatio-Temporal Structure Prediction for Self-supervised Learning on Point Cloud Videos", @@ -36124,12 +37336,13 @@ "aff_unique_index": "0;0;1;2;3;4;4;0+4", "aff_unique_norm": "Shanghai Jiao Tong University;Zhejiang University;Aviation University of Air Force;Sun Yat-sen University;CloudWalk Technology", "aff_unique_dep": ";;;;", - "aff_unique_url": "https://www.sjtu.edu.cn;https://www.zju.edu.cn;;http://www.sysu.edu.cn/;https://www.cloudwalk.cn", - "aff_unique_abbr": "SJTU;ZJU;;SYSU;CloudWalk", + "aff_unique_url": "https://www.sjtu.edu.cn;https://www.zju.edu.cn;http://www.auaf.edu.cn;http://www.sysu.edu.cn/;https://www.cloudwalk.cn", + "aff_unique_abbr": "SJTU;ZJU;AUAF;SYSU;CloudWalk", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shen_2023_ICCV,\n \n author = {\n Shen,\n Zhiqiang and Sheng,\n Xiaoxiao and Fan,\n Hehe and Wang,\n Longguang and Guo,\n Yulan and Liu,\n Qiong and Wen,\n Hao and Zhou,\n Xi\n},\n title = {\n Masked Spatio-Temporal Structure Prediction for Self-supervised Learning on Point Cloud Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16580-16589\n} \n}" }, { "title": "Masked Spiking Transformer", @@ -36154,14 +37367,15 @@ "author_num": 6, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Masked_Spiking_Transformer_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;2+3;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;North Carolina State University;University of Hong Kong;ACCESS - AI Chip Center for Emerging Smart Systems", + "aff_unique_norm": "The Hong Kong University of Science and Technology;North Carolina State University;The University of Hong Kong;ACCESS - AI Chip Center for Emerging Smart Systems", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ust.hk;https://www.ncsu.edu;https://www.hku.hk;", "aff_unique_abbr": "HKUST;NCSU;HKU;", "aff_campus_unique_index": "0;0;0;0;2;0", "aff_campus_unique": "Guangzhou;;Hong Kong SAR", "aff_country_unique_index": "0+1;0;0;0;0+1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ziqing and Fang,\n Yuetong and Cao,\n Jiahang and Zhang,\n Qiang and Wang,\n Zhongrui and Xu,\n Renjing\n},\n title = {\n Masked Spiking Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1761-1771\n} \n}" }, { "title": "Mastering Spatial Graph Prediction of Road Networks", @@ -36173,7 +37387,7 @@ "author": "Anagnostidis Sotiris; Aurelien Lucchi; Thomas Hofmann", "abstract": "Accurately predicting road networks from satellite images requires a global understanding of the network topology. We propose to capture such high-level information by introducing a graph-based framework that given a partially generated graph, sequentially adds new edges. To deal with misalignment between the model predictions and the intended purpose, and to optimize over complex, non-continuous metrics of interest, we adopt a reinforcement learning (RL) approach that nominates modifications that maximize a cumulative reward. 
As opposed to standard supervised techniques that tend to be more restricted to commonly used surrogate losses, our framework yields more power and flexibility to encode problem-dependent knowledge. Empirical results on several benchmark datasets demonstrate enhanced performance and increased high-level reasoning about the graph topology when using a tree-based search. We further demonstrate the superiority of our approach in handling examples with substantial occlusion and additionally provide evidence that our predictions better match the statistical properties of the ground dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sotiris_Mastering_Spatial_Graph_Prediction_of_Road_Networks_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich; University of Basel; ETH Z\u00fcrich", + "aff": "ETH Zürich; University of Basel; ETH Zürich", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Sotiris_Mastering_Spatial_Graph_ICCV_2023_supplemental.pdf", @@ -36186,14 +37400,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sotiris_Mastering_Spatial_Graph_Prediction_of_Road_Networks_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "ETH Zurich;University of Basel", + "aff_unique_norm": "ETH Zürich;University of Basel", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.unibas.ch", "aff_unique_abbr": "ETHZ;UniBas", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Sotiris_2023_ICCV,\n \n author = {\n Sotiris,\n Anagnostidis and Lucchi,\n Aurelien and Hofmann,\n Thomas\n},\n title = {\n Mastering Spatial Graph Prediction of Road Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages 
= {\n 5408-5418\n} \n}" }, { "title": "MatrixCity: A Large-scale City Dataset for City-scale Neural Rendering and Beyond", @@ -36218,14 +37433,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_MatrixCity_A_Large-scale_City_Dataset_for_City-scale_Neural_Rendering_and_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0+1;1", - "aff_unique_norm": "Chinese University of Hong Kong;Shanghai AI Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;Shanghai AI Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "CUHK;SAIL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yixuan and Jiang,\n Lihan and Xu,\n Linning and Xiangli,\n Yuanbo and Wang,\n Zhenzhi and Lin,\n Dahua and Dai,\n Bo\n},\n title = {\n MatrixCity: A Large-scale City Dataset for City-scale Neural Rendering and Beyond\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3205-3215\n} \n}" }, { "title": "MatrixVT: Efficient Multi-Camera to BEV Transformation for 3D Perception", @@ -36250,14 +37466,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_MatrixVT_Efficient_Multi-Camera_to_BEV_Transformation_for_3D_Perception_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Megvii Technology", + "aff_unique_norm": "MEGVII Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.megvii.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Hongyu and Ge,\n Zheng and Li,\n Zeming and Zhang,\n Xiangyu\n},\n title = {\n MatrixVT: Efficient Multi-Camera to BEV Transformation for 3D Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8548-8557\n} \n}" }, { "title": "MeMOTR: Long-Term Memory-Augmented Transformer for Multi-Object Tracking", @@ -36289,7 +37506,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Ruopeng and Wang,\n Limin\n},\n title = {\n MeMOTR: Long-Term Memory-Augmented Transformer for Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9901-9910\n} \n}" }, { "title": "MeViS: A Large-scale Benchmark for Video Segmentation with Motion Expressions", @@ -36321,7 +37539,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n Henghui and Liu,\n Chang and He,\n Shuting and Jiang,\n Xudong and Loy,\n Chen Change\n},\n title = {\n MeViS: A Large-scale Benchmark for Video Segmentation with Motion Expressions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2694-2703\n} \n}" }, { "title": "Measuring Asymmetric Gradient Discrepancy in Parallel Continual Learning", @@ -36353,7 +37572,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": 
"China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lyu_2023_ICCV,\n \n author = {\n Lyu,\n Fan and Sun,\n Qing and Shang,\n Fanhua and Wan,\n Liang and Feng,\n Wei\n},\n title = {\n Measuring Asymmetric Gradient Discrepancy in Parallel Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11411-11420\n} \n}" }, { "title": "MedKLIP: Medical Knowledge Enhanced Language-Image Pre-Training for X-ray Diagnosis", @@ -36385,7 +37605,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Chaoyi and Zhang,\n Xiaoman and Zhang,\n Ya and Wang,\n Yanfeng and Xie,\n Weidi\n},\n title = {\n MedKLIP: Medical Knowledge Enhanced Language-Image Pre-Training for X-ray Diagnosis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21372-21383\n} \n}" }, { "title": "Membrane Potential Batch Normalization for Spiking Neural Networks", @@ -36393,6 +37614,7 @@ "status": "Poster", "track": "main", "pid": "5016", + "author_site": "Yufei Guo, Yuhan Zhang, Yuanpei Chen, Weihang Peng, Xiaode Liu, Liwen Zhang, Xuhui Huang, Zhe Ma", "author": "Yufei Guo, Yuhan Zhang, Yuanpei Chen, Weihang Peng, Xiaode Liu, Liwen Zhang, Xuhui Huang, Zhe Ma", "abstract": "As one of the energy-efficient alternatives of conventional neural networks (CNNs), spiking neural networks (SNNs) have gained more and more interest recently. To train the deep models, some effective batch normalization (BN) techniques are proposed in SNNs. All these BNs are suggested to be used after the convolution layer as usually doing in CNNs. 
However, the spiking neuron is much more complex with spatiotemporal dynamics. The regulated data flow after the BN layer will be disturbed again by the membrane potential updating operation before the firing function, i.e., the nonlinear activation. Therefore, we advocate adding another BN layer before the firing function to normalize the membrane potential again, called MPBN. To eliminate the induced time cost of MPBN, we also propose a training-inference-decoupled re-parameterization technique to fold the trained MPBN into the firing threshold. With the re-parameterization technique, the MPBN will not induce any extra time burden in the inference. Furthermore, the MPBN can also adopt the element-wised form, while the BN after the convolution layer can only use the channel-wised form. Experimental results show that the proposed MPBN performs well on both popular non-spiking static and neuromorphic datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Guo_Membrane_Potential_Batch_Normalization_for_Spiking_Neural_Networks_ICCV_2023_paper.pdf", @@ -36404,7 +37626,8 @@ "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11871747328105567152&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Membrane_Potential_Batch_Normalization_for_Spiking_Neural_Networks_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Membrane_Potential_Batch_Normalization_for_Spiking_Neural_Networks_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Yufei and Zhang,\n Yuhan and Chen,\n Yuanpei and Peng,\n Weihang and Liu,\n Xiaode and Zhang,\n Liwen and Huang,\n Xuhui and Ma,\n Zhe\n},\n title = {\n Membrane Potential Batch Normalization for Spiking Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2023\n},\n pages = {\n 19420-19430\n} \n}" }, { "title": "Memory-and-Anticipation Transformer for Online Action Understanding", @@ -36436,7 +37659,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jiahao and Chen,\n Guo and Huang,\n Yifei and Wang,\n Limin and Lu,\n Tong\n},\n title = {\n Memory-and-Anticipation Transformer for Online Action Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13824-13835\n} \n}" }, { "title": "MemorySeg: Online LiDAR Semantic Segmentation with a Latent Memory", @@ -36459,7 +37683,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_MemorySeg_Online_LiDAR_Semantic_Segmentation_with_a_Latent_Memory_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_MemorySeg_Online_LiDAR_Semantic_Segmentation_with_a_Latent_Memory_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Enxu and Casas,\n Sergio and Urtasun,\n Raquel\n},\n title = {\n MemorySeg: Online LiDAR Semantic Segmentation with a Latent Memory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 745-754\n} \n}" }, { "title": "Mesh2Tex: Generating Mesh Textures from Image Queries", @@ -36491,7 +37716,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Bokhovkin_2023_ICCV,\n \n author = {\n Bokhovkin,\n Alexey and Tulsiani,\n Shubham and Dai,\n 
Angela\n},\n title = {\n Mesh2Tex: Generating Mesh Textures from Image Queries\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8918-8928\n} \n}" }, { "title": "Meta OOD Learning For Continuously Adaptive OOD Detection", @@ -36523,7 +37749,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Sydney", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Xinheng and Lu,\n Jie and Fang,\n Zhen and Zhang,\n Guangquan\n},\n title = {\n Meta OOD Learning For Continuously Adaptive OOD Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19353-19364\n} \n}" }, { "title": "Meta-ZSDETR: Zero-shot DETR with Meta-learning", @@ -36555,7 +37782,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Lu and Zhang,\n Chenbo and Zhao,\n Jiajia and Guan,\n Jihong and Zhou,\n Shuigeng\n},\n title = {\n Meta-ZSDETR: Zero-shot DETR with Meta-learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6845-6854\n} \n}" }, { "title": "MetaBEV: Solving Sensor Failures for 3D Detection and Map Segmentation", @@ -36567,7 +37795,7 @@ "author": "Chongjian Ge; Junsong Chen; Enze Xie; Zhongdao Wang; Lanqing Hong; Huchuan Lu; Zhenguo Li; Ping Luo", "abstract": "Perception systems in modern autonomous driving vehicles typically take inputs from complementary multi-modal sensors, e.g., LiDAR and cameras. 
However, in real-world applications, sensor corruptions and failures lead to inferior performances, thus compromising autonomous safety. In this paper, we propose a robust framework, called MetaBEV, to address extreme real-world environments, involving overall six sensor corruptions and two extreme sensor-missing situations. In MetaBEV, signals from multiple sensors are first processed by modal-specific encoders. Subsequently, a set of dense BEV queries are initialized, termed meta-BEV. These queries are then processed iteratively by a BEV-Evolving decoder, which selectively aggregates deep features from either LiDAR, cameras, or both modalities. The updated BEV representations are further leveraged for multiple 3D prediction tasks. Additionally, we introduce a new \\moe structure to alleviate the performance drop on distinct tasks in multi-task joint learning. Finally, MetaBEV is evaluated on the nuScenes dataset with 3D object detection and BEV map segmentation tasks. Experiments show MetaBEV outperforms prior arts by a large margin on both full and corrupted modalities. For instance, when the LiDAR signal is missing, MetaBEV improves 35.5% detection NDS and 17.7% segmentation mIoU upon the vanilla BEVFusion model; and when the camera signal is absent, MetaBEV still achieves 69.2% NDS and 53.7%mIoU, which is even higher than previous works that perform on full-modalities. 
Moreover, MetaBEV performs moderately against previous methods in both canonical perception and multi-task learning settings, refreshing state-of-the-art nuScenes BEV map segmentation with 70.4% mIoU.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ge_MetaBEV_Solving_Sensor_Failures_for_3D_Detection_and_Map_Segmentation_ICCV_2023_paper.pdf", - "aff": "The University of Hong Kong; Huawei Noah\u2019s Ark Lab + The University of Hong Kong; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Dalian University of Technology; Huawei Noah\u2019s Ark Lab; The University of Hong Kong + Shanghai AI Laboratory", + "aff": "The University of Hong Kong; Huawei Noah’s Ark Lab + The University of Hong Kong; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Dalian University of Technology; Huawei Noah’s Ark Lab; The University of Hong Kong + Shanghai AI Laboratory", "project": "https://chongjiange.github.io/metabev.html", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Ge_MetaBEV_Solving_Sensor_ICCV_2023_supplemental.pdf", @@ -36580,14 +37808,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ge_MetaBEV_Solving_Sensor_Failures_for_3D_Detection_and_Map_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1+0;1;1;1;2;1;0+3", - "aff_unique_norm": "University of Hong Kong;Huawei;Dalian University of Technology;Shanghai AI Laboratory", - "aff_unique_dep": ";Noah\u2019s Ark Lab;;", + "aff_unique_norm": "The University of Hong Kong;Huawei;Dalian University of Technology;Shanghai AI Laboratory", + "aff_unique_dep": ";Noah’s Ark Lab;;", "aff_unique_url": "https://www.hku.hk;https://www.huawei.com;http://www.dlut.edu.cn/;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "HKU;Huawei;DUT;SAIL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0+0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Ge_2023_ICCV,\n \n author = {\n Ge,\n Chongjian and Chen,\n Junsong and Xie,\n Enze and Wang,\n Zhongdao and Hong,\n Lanqing and Lu,\n Huchuan and Li,\n Zhenguo and Luo,\n Ping\n},\n title = {\n MetaBEV: Solving Sensor Failures for 3D Detection and Map Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8721-8731\n} \n}" }, { "title": "MetaF2N: Blind Image Super-Resolution by Learning Efficient Model Adaptation from Faces", @@ -36612,14 +37841,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yin_MetaF2N_Blind_Image_Super-Resolution_by_Learning_Efficient_Model_Adaptation_from_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;1;0+2", - "aff_unique_norm": "Harbin Institute of Technology;Shanghai Transsion Co, Ltd;Pengcheng Laboratory", - "aff_unique_dep": ";;Peng Cheng Laboratory", + "aff_unique_norm": "Harbin Institute of Technology;Shanghai Transsion Co, Ltd;Peng Cheng Laboratory", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.hit.edu.cn/;;http://www.pcl.ac.cn", "aff_unique_abbr": "HIT;;PCL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Harbin;", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yin_2023_ICCV,\n \n author = {\n Yin,\n Zhicun and Liu,\n Ming and Li,\n Xiaoming and Yang,\n Hui and Xiao,\n Longan and Zuo,\n Wangmeng\n},\n title = {\n MetaF2N: Blind Image Super-Resolution by Learning Efficient Model Adaptation from Faces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13033-13044\n} \n}" }, { "title": "MetaGCD: Learning to Continually Learn in Generalized Category Discovery", @@ -36644,14 +37874,15 @@ 
"author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_MetaGCD_Learning_to_Continually_Learn_in_Generalized_Category_Discovery_ICCV_2023_paper.html", "aff_unique_index": "0+0;1;2;0+0", - "aff_unique_norm": "Beijing Jiao Tong University;University of Toronto;Concordia University", + "aff_unique_norm": "Beijing Jiaotong University;University of Toronto;Concordia University", "aff_unique_dep": "Key Laboratory of Big Data & Artificial Intelligence in Transportation;Department of Electrical and Computer Engineering;Department of Computer Science and Software Engineering", "aff_unique_url": "http://www.bjtu.edu.cn;https://www.utoronto.ca;https://www.concordia.ca", "aff_unique_abbr": "BJTU;U of T;Concordia", "aff_campus_unique_index": "0+0;1;2;0+0", "aff_campus_unique": "Beijing;Toronto;Montreal", "aff_country_unique_index": "0+0;1;1;0+0", - "aff_country_unique": "China;Canada" + "aff_country_unique": "China;Canada", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Yanan and Chi,\n Zhixiang and Wang,\n Yang and Feng,\n Songhe\n},\n title = {\n MetaGCD: Learning to Continually Learn in Generalized Category Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1655-1665\n} \n}" }, { "title": "Metric3D: Towards Zero-shot Metric 3D Prediction from A Single Image", @@ -36674,7 +37905,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yin_Metric3D_Towards_Zero-shot_Metric_3D_Prediction_from_A_Single_Image_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yin_Metric3D_Towards_Zero-shot_Metric_3D_Prediction_from_A_Single_Image_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yin_2023_ICCV,\n \n author = {\n Yin,\n Wei and Zhang,\n Chi and Chen,\n Hao and Cai,\n Zhipeng and Yu,\n Gang and Wang,\n Kaixuan and 
Chen,\n Xiaozhi and Shen,\n Chunhua\n},\n title = {\n Metric3D: Towards Zero-shot Metric 3D Prediction from A Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9043-9053\n} \n}" }, { "title": "Mimic3D: Thriving 3D-Aware GANs via 3D-to-2D Imitation", @@ -36697,7 +37929,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Mimic3D_Thriving_3D-Aware_GANs_via_3D-to-2D_Imitation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Mimic3D_Thriving_3D-Aware_GANs_via_3D-to-2D_Imitation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xingyu and Deng,\n Yu and Wang,\n Baoyuan\n},\n title = {\n Mimic3D: Thriving 3D-Aware GANs via 3D-to-2D Imitation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2338-2348\n} \n}" }, { "title": "MiniROAD: Minimal RNN Framework for Online Action Detection", @@ -36729,7 +37962,8 @@ "aff_campus_unique_index": "1+2", "aff_campus_unique": ";Merced;Mountain View", "aff_country_unique_index": "0;0;0;0+1+1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{An_2023_ICCV,\n \n author = {\n An,\n Joungbin and Kang,\n Hyolim and Han,\n Su Ho and Yang,\n Ming-Hsuan and Kim,\n Seon Joo\n},\n title = {\n MiniROAD: Minimal RNN Framework for Online Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10341-10350\n} \n}" }, { "title": "Minimal Solutions to Generalized Three-View Relative Pose Problem", @@ -36737,8 +37971,8 @@ "status": "Oral", "track": 
"main", "pid": "4934", - "author_site": "Yaqing Ding, Chiang-Heng Chien, Viktor Larsson, Karl \u00c5str\u00f6m, Benjamin Kimia", - "author": "Yaqing Ding; Chiang-Heng Chien; Viktor Larsson; Karl \u00c5str\u00f6m; Benjamin Kimia", + "author_site": "Yaqing Ding, Chiang-Heng Chien, Viktor Larsson, Karl Åström, Benjamin Kimia", + "author": "Yaqing Ding; Chiang-Heng Chien; Viktor Larsson; Karl Åström; Benjamin Kimia", "abstract": "For a generalized (or non-central) camera model, the minimal problem for two views of six points has efficient solvers. However, minimal problems of three views with four points and three views of six lines have not yet been explored and solved, despite the efforts from the computer vision community. This paper develops the formulations of these two minimal problems and shows how state-of-the-art GPU implementations of Homotopy Continuation solver can be used effectively. The proposed methods are evaluated on both synthetic and real datasets, demonstrating that they are fast, accurate and that they improve on structure from motion estimations, when employed in an hypothesis and test setting.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ding_Minimal_Solutions_to_Generalized_Three-View_Relative_Pose_Problem_ICCV_2023_paper.pdf", "aff": "Centre for Mathematical Sciences, Lund University; School of Engineering, Brown University; Centre for Mathematical Sciences, Lund University; Centre for Mathematical Sciences, Lund University; School of Engineering, Brown University", @@ -36761,7 +37995,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lund;", "aff_country_unique_index": "0;1;0;0;1", - "aff_country_unique": "Sweden;United States" + "aff_country_unique": "Sweden;United States", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n Yaqing and Chien,\n Chiang-Heng and Larsson,\n Viktor and \\r{A\n}str\\"om,\n Karl and Kimia,\n Benjamin\n},\n title = {\n Minimal Solutions to Generalized Three-View 
Relative Pose Problem\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8156-8164\n} \n}" }, { "title": "Minimal Solutions to Uncalibrated Two-view Geometry with Known Epipoles", @@ -36769,6 +38004,7 @@ "status": "Poster", "track": "main", "pid": "9386", + "author_site": "Gaku Nakano", "author": "Gaku Nakano", "abstract": "This paper proposes minimal solutions to uncalibrated two-view geometry with known epipoles. Exploiting the epipoles, we can reduce the number of point correspondences needed to find the fundamental matrix together with the intrinsic parameters: the focal length and the radial lens distortion. We define four cases by the number of available epipoles and unknown intrinsic parameters, then derive a closed-form solution for each case formulated as a higher-order polynomial in a single variable. The proposed solvers are more numerically stable and faster by orders of magnitude than the conventional 6- or 7-point algorithms. 
Moreover, we demonstrate by experiments on the human pose dataset that the proposed method can solve two-view geometry even with 2D human pose, of which point localization is noisier than general feature point detectors.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Nakano_Minimal_Solutions_to_Uncalibrated_Two-view_Geometry_with_Known_Epipoles_ICCV_2023_paper.pdf", @@ -36790,7 +38026,8 @@ "aff_unique_url": "https://www.nec.com", "aff_unique_abbr": "NEC", "aff_country_unique_index": "0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Nakano_2023_ICCV,\n \n author = {\n Nakano,\n Gaku\n},\n title = {\n Minimal Solutions to Uncalibrated Two-view Geometry with Known Epipoles\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13361-13370\n} \n}" }, { "title": "Minimum Latency Deep Online Video Stabilization", @@ -36822,7 +38059,8 @@ "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Zhuofan and Liu,\n Zhen and Tan,\n Ping and Zeng,\n Bing and Liu,\n Shuaicheng\n},\n title = {\n Minimum Latency Deep Online Video Stabilization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23030-23039\n} \n}" }, { "title": "Mining bias-target Alignment from Voronoi Cells", @@ -36830,8 +38068,8 @@ "status": "Poster", "track": "main", "pid": "12014", - "author_site": "R\u00e9mi Nahon, Van-Tam Nguyen, Enzo Tartaglione", - "author": "R\u00e9mi Nahon; Van-Tam Nguyen; Enzo Tartaglione", + "author_site": "Rémi Nahon, Van-Tam Nguyen, Enzo Tartaglione", + "author": "Rémi Nahon; Van-Tam Nguyen; Enzo 
Tartaglione", "abstract": "Despite significant research efforts, deep neural networks remain vulnerable to biases: this raises concerns about their fairness and limits their generalization. In this paper, we propose a bias-agnostic approach to mitigate the impact of biases in deep neural networks. Unlike traditional debiasing approaches, we rely on a metric to quantify \"bias alignment/misalignment\" on target classes and use this information to discourage the propagation of bias-target alignment information through the network. We conduct experiments on several commonly used datasets for debiasing and compare our method with supervised and bias-specific\n approaches. Our results indicate that the proposed method achieves comparable performance to state-of-the-art supervised approaches, despite being bias-agnostic, even in the presence of multiple biases in the same sample.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Nahon_Mining_bias-target_Alignment_from_Voronoi_Cells_ICCV_2023_paper.pdf", "aff": ";;", @@ -36845,7 +38083,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nahon_Mining_bias-target_Alignment_from_Voronoi_Cells_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nahon_Mining_bias-target_Alignment_from_Voronoi_Cells_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Nahon_2023_ICCV,\n \n author = {\n Nahon,\n R\\'emi and Nguyen,\n Van-Tam and Tartaglione,\n Enzo\n},\n title = {\n Mining bias-target Alignment from Voronoi Cells\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4946-4955\n} \n}" }, { "title": "Misalign, Contrast then Distill: Rethinking Misalignments in Language-Image Pre-training", @@ -36870,14 +38109,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Misalign_Contrast_then_Distill_Rethinking_Misalignments_in_Language-Image_Pre-training_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "LG", - "aff_unique_dep": "LG AI Research", + "aff_unique_norm": "LG AI Research", + "aff_unique_dep": "", "aff_unique_url": "https://www.lgaires.com", "aff_unique_abbr": "LG AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Bumsoo and Jo,\n Yeonsik and Kim,\n Jinhyung and Kim,\n Seunghwan\n},\n title = {\n Misalign,\n Contrast then Distill: Rethinking Misalignments in Language-Image Pre-training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2563-2572\n} \n}" }, { "title": "Mitigating Adversarial Vulnerability through Causal Parameter Estimation by Adversarial Double Machine Learning", @@ -36909,7 +38149,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Byung-Kwan and Kim,\n Junho and Ro,\n Yong Man\n},\n title = {\n Mitigating Adversarial Vulnerability through Causal Parameter Estimation by Adversarial Double Machine Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4499-4509\n} \n}" }, { "title": "Mitigating and Evaluating Static Bias of Action Representations in the Background and the Foreground", @@ -36941,7 +38182,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - 
"aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Haoxin and Liu,\n Yuan and Zhang,\n Hanwang and Li,\n Boyang\n},\n title = {\n Mitigating and Evaluating Static Bias of Action Representations in the Background and the Foreground\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19911-19923\n} \n}" }, { "title": "MixBag: Bag-Level Data Augmentation for Learning from Label Proportions", @@ -36973,7 +38215,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Fukuoka;", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Asanomi_2023_ICCV,\n \n author = {\n Asanomi,\n Takanori and Matsuo,\n Shinnosuke and Suehiro,\n Daiki and Bise,\n Ryoma\n},\n title = {\n MixBag: Bag-Level Data Augmentation for Learning from Label Proportions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16570-16579\n} \n}" }, { "title": "MixCycle: Mixup Assisted Semi-Supervised 3D Single Object Tracking with Cycle Consistency", @@ -36985,7 +38228,7 @@ "author": "Qiao Wu; Jiaqi Yang; Kun Sun; Chu'ai Zhang; Yanning Zhang; Mathieu Salzmann", "abstract": "3D single object tracking (SOT) is an indispensable part of automated driving. Existing approaches rely heavily on large, densely labeled datasets. However, annotating point clouds is both costly and time-consuming. Inspired by the great success of cycle tracking in unsupervised 2D SOT, we introduce the first semi-supervised approach to 3D SOT. 
Specifically, we introduce two cycle-consistency strategies for supervision: 1) Self tracking cycles, which leverage labels to help the model converge better in the early stages of training; 2) forward-backward cycles, which strengthen the tracker's robustness to motion variations and the template noise caused by the template update strategy. Furthermore, we propose a data augmentation strategy named SOTMixup to improve the tracker's robustness to point cloud diversity. SOTMixup generates training samples by sampling points in two point clouds with a mixing rate and assigns a reasonable loss weight for training according to the mixing rate. The resulting MixCycle approach generalizes to appearance matching-based trackers. On the KITTI benchmark, based on the P2B tracker, MixCycle trained with 10% labels outperforms P2B trained with 100% labels, and achieves a 28.4% precision improvement when using 1% labels. Our code will be released at https://github.com/Mumuqiao/MixCycle.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wu_MixCycle_Mixup_Assisted_Semi-Supervised_3D_Single_Object_Tracking_with_Cycle_ICCV_2023_paper.pdf", - "aff": "Northwestern Polytechnical University; Northwestern Polytechnical University; China University of Geosciences, Wuhan; Northwestern Polytechnical University; Northwestern Polytechnical University; \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne", + "aff": "Northwestern Polytechnical University; Northwestern Polytechnical University; China University of Geosciences, Wuhan; Northwestern Polytechnical University; Northwestern Polytechnical University; École Polytechnique Fédérale de Lausanne", "project": "", "github": "https://github.com/Mumuqiao/MixCycle", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Wu_MixCycle_Mixup_Assisted_ICCV_2023_supplemental.pdf", @@ -36998,14 +38241,15 @@ "author_num": 6, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Wu_MixCycle_Mixup_Assisted_Semi-Supervised_3D_Single_Object_Tracking_with_Cycle_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0;2", - "aff_unique_norm": "Northwestern Polytechnical University;China University of Geosciences;EPFL", + "aff_unique_norm": "Northwestern Polytechnical University;China University of Geosciences;École Polytechnique Fédérale de Lausanne", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nwpu.edu.cn;http://www.cug.edu.cn/;https://www.epfl.ch", "aff_unique_abbr": "NWPU;CUG;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Wuhan", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Qiao and Yang,\n Jiaqi and Sun,\n Kun and Zhang,\n Chu'ai and Zhang,\n Yanning and Salzmann,\n Mathieu\n},\n title = {\n MixCycle: Mixup Assisted Semi-Supervised 3D Single Object Tracking with Cycle Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13956-13966\n} \n}" }, { "title": "MixPath: A Unified Approach for One-shot Neural Architecture Search", @@ -37028,7 +38272,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chu_MixPath_A_Unified_Approach_for_One-shot_Neural_Architecture_Search_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chu_MixPath_A_Unified_Approach_for_One-shot_Neural_Architecture_Search_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chu_2023_ICCV,\n \n author = {\n Chu,\n Xiangxiang and Lu,\n Shun and Li,\n Xudong and Zhang,\n Bo\n},\n title = {\n MixPath: A Unified Approach for One-shot Neural Architecture Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5972-5981\n} \n}" }, { "title": "MixReorg: Cross-Modal Mixed Patch Reorganization is a Good Mask Learner for Open-World Semantic Segmentation", @@ -37051,7 +38296,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_MixReorg_Cross-Modal_Mixed_Patch_Reorganization_is_a_Good_Mask_Learner_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_MixReorg_Cross-Modal_Mixed_Patch_Reorganization_is_a_Good_Mask_Learner_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Kaixin and Ren,\n Pengzhen and Zhu,\n Yi and Xu,\n Hang and Liu,\n Jianzhuang and Li,\n Changlin and Wang,\n Guangrun and Liang,\n Xiaodan\n},\n title = {\n MixReorg: Cross-Modal Mixed Patch Reorganization is a Good Mask Learner for Open-World Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1196-1205\n} \n}" }, { "title": "MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition", @@ -37083,7 +38329,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Xize and Jin,\n Tao and Huang,\n Rongjie and Li,\n Linjun and Lin,\n Wang and Wang,\n Zehan and Wang,\n Ye and Liu,\n Huadai and Yin,\n Aoxiong and Zhao,\n Zhou\n},\n title = {\n MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 15735-15745\n} \n}" }, { "title": "MixSynthFormer: A Transformer Encoder-like Structure with Mixed Synthetic Self-attention for Efficient Human Pose Estimation", @@ -37108,14 +38355,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_MixSynthFormer_A_Transformer_Encoder-like_Structure_with_Mixed_Synthetic_Self-attention_for_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "University of Hong Kong", + "aff_unique_norm": "The University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Yuran and Dougherty,\n Alan William and Zhang,\n Zhuoying and Choi,\n Yi King and Wu,\n Chuan\n},\n title = {\n MixSynthFormer: A Transformer Encoder-like Structure with Mixed Synthetic Self-attention for Efficient Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14884-14893\n} \n}" }, { "title": "Mixed Neural Voxels for Fast Multi-view Video Synthesis", @@ -37143,11 +38391,12 @@ "aff_unique_norm": "Tsinghua University;Hong Kong University of Science and Technology;Alibaba Group", "aff_unique_dep": "Department of Computer Science and Technology;;XR Lab, DAMO Academy", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ust.hk;https://www.alibabagroup.com", - "aff_unique_abbr": "Tsinghua;HKUST;Alibaba", + "aff_unique_abbr": "THU;HKUST;Alibaba", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Beijing;Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": 
"@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Feng and Tan,\n Sinan and Li,\n Xinghang and Tian,\n Zeyue and Song,\n Yafei and Liu,\n Huaping\n},\n title = {\n Mixed Neural Voxels for Fast Multi-view Video Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19706-19716\n} \n}" }, { "title": "MoTIF: Learning Motion Trajectories with Local Implicit Neural Functions for Continuous Space-Time Video Super-Resolution", @@ -37179,7 +38428,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yi-Hsin and Chen,\n Si-Cun and Chen,\n Yi-Hsin and Lin,\n Yen-Yu and Peng,\n Wen-Hsiao\n},\n title = {\n MoTIF: Learning Motion Trajectories with Local Implicit Neural Functions for Continuous Space-Time Video Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23131-23141\n} \n}" }, { "title": "Modality Unifying Network for Visible-Infrared Person Re-Identification", @@ -37211,7 +38461,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;0;2", - "aff_country_unique": "China;United States;Finland" + "aff_country_unique": "China;United States;Finland", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Hao and Cheng,\n Xu and Peng,\n Wei and Liu,\n Weihao and Zhao,\n Guoying\n},\n title = {\n Modality Unifying Network for Visible-Infrared Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11185-11195\n} \n}" }, { "title": 
"Model Calibration in Dense Classification with Adaptive Label Perturbation", @@ -37243,7 +38494,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jiawei and Ye,\n Changkun and Wang,\n Shan and Cui,\n Ruikai and Zhang,\n Jing and Zhang,\n Kaihao and Barnes,\n Nick\n},\n title = {\n Model Calibration in Dense Classification with Adaptive Label Perturbation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1173-1184\n} \n}" }, { "title": "ModelGiF: Gradient Fields for Model Functional Distance", @@ -37275,7 +38527,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Jie and Xu,\n Zhengqi and Wu,\n Sai and Chen,\n Gang and Song,\n Mingli\n},\n title = {\n ModelGiF: Gradient Fields for Model Functional Distance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6125-6135\n} \n}" }, { "title": "Modeling the Relative Visual Tempo for Self-supervised Skeleton-based Action Recognition", @@ -37303,11 +38556,12 @@ "aff_unique_norm": "Nanjing University of Posts and Telecommunications;Chinese Academy of Sciences;University of the Chinese Academy of Sciences;Kunming University of Science and Technology;Southeast University", "aff_unique_dep": ";Institute of Computing Technology;;Faculty of Information Engineering and Automation;School of Automation", "aff_unique_url": 
"http://www.njupt.edu.cn;http://www.cas.cn;http://www.ucas.ac.cn;http://www.kust.edu.cn;https://www.seu.edu.cn/", - "aff_unique_abbr": "NJUPT;CAS;UCAS;KUST;SEU", + "aff_unique_abbr": "NJUPT;CAS;UCAS;;SEU", "aff_campus_unique_index": "0;;2", "aff_campus_unique": "Nanjing;;Kunming", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Yisheng and Han,\n Hu and Yu,\n Zhengtao and Liu,\n Guangcan\n},\n title = {\n Modeling the Relative Visual Tempo for Self-supervised Skeleton-based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13913-13922\n} \n}" }, { "title": "MolGrapher: Graph-based Visual Recognition of Chemical Structures", @@ -37339,7 +38593,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;0;0;0;0;1", - "aff_country_unique": "United States;Switzerland" + "aff_country_unique": "United States;Switzerland", + "bibtex": "@InProceedings{Morin_2023_ICCV,\n \n author = {\n Morin,\n Lucas and Danelljan,\n Martin and Agea,\n Maria Isabel and Nassar,\n Ahmed and Weber,\n Valery and Meijer,\n Ingmar and Staar,\n Peter and Yu,\n Fisher\n},\n title = {\n MolGrapher: Graph-based Visual Recognition of Chemical Structures\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19552-19561\n} \n}" }, { "title": "Moment Detection in Long Tutorial Videos", @@ -37362,7 +38617,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Croitoru_Moment_Detection_in_Long_Tutorial_Videos_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Croitoru_Moment_Detection_in_Long_Tutorial_Videos_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Croitoru_2023_ICCV,\n \n author = {\n Croitoru,\n Ioana and Bogolin,\n Simion-Vlad and Albanie,\n Samuel and Liu,\n Yang and Wang,\n Zhaowen and Yoon,\n Seunghyun and Dernoncourt,\n Franck and Jin,\n Hailin and Bui,\n Trung\n},\n title = {\n Moment Detection in Long Tutorial Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2594-2604\n} \n}" }, { "title": "MonoDETR: Depth-guided Transformer for Monocular 3D Object Detection", @@ -37394,7 +38650,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0+0;0;0;0;0+0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Renrui and Qiu,\n Han and Wang,\n Tai and Guo,\n Ziyu and Cui,\n Ziteng and Qiao,\n Yu and Li,\n Hongsheng and Gao,\n Peng\n},\n title = {\n MonoDETR: Depth-guided Transformer for Monocular 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9155-9166\n} \n}" }, { "title": "MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection", @@ -37419,14 +38676,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_MonoNeRD_NeRF-like_Representations_for_Monocular_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;1;1;2;0;0+1", - "aff_unique_norm": "Zhejiang University;Fabu Inc.;Fullong Inc", + "aff_unique_norm": "Zhejiang University;FABU Inc.;Fullong Inc", "aff_unique_dep": "State Key Lab of CAD & CG;;", "aff_unique_url": "http://www.zju.edu.cn;;", "aff_unique_abbr": "ZJU;;", 
"aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0+1;1;1;0;0+1", - "aff_country_unique": "China;United States;" + "aff_country_unique": "China;United States;", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Junkai and Peng,\n Liang and Cheng,\n Haoran and Li,\n Hao and Qian,\n Wei and Li,\n Ke and Wang,\n Wenxiao and Cai,\n Deng\n},\n title = {\n MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6814-6824\n} \n}" }, { "title": "MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular Videos", @@ -37451,14 +38709,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tian_MonoNeRF_Learning_a_Generalizable_Dynamic_Radiance_Field_from_Monocular_Videos_ICCV_2023_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Tsinghua University", + "aff_unique_norm": "Xi'an Jiaotong University;Tsinghua University", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;Department of Electronic Engineering", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "XJTU;THU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tian_2023_ICCV,\n \n author = {\n Tian,\n Fengrui and Du,\n Shaoyi and Duan,\n Yueqi\n},\n title = {\n MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17903-17913\n} \n}" }, { "title": "Monocular 3D Object Detection with Bounding Box Denoising in 
3D by Perceiver", @@ -37490,7 +38749,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;1;0+1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xianpeng and Zheng,\n Ce and Cheng,\n Kelvin B and Xue,\n Nan and Qi,\n Guo-Jun and Wu,\n Tianfu\n},\n title = {\n Monocular 3D Object Detection with Bounding Box Denoising in 3D by Perceiver\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6436-6446\n} \n}" }, { "title": "Monte Carlo Linear Clustering with Single-Point Supervision is Enough for Infrared Small Target Detection", @@ -37517,12 +38777,13 @@ "aff_unique_index": "0;0;1;2;0;0;0;0", "aff_unique_norm": "National University of Defense Technology;Aviation University of Air Force;Shanghai Jiao Tong University", "aff_unique_dep": ";;", - "aff_unique_url": "http://www.nudt.edu.cn/;;https://www.sjtu.edu.cn", - "aff_unique_abbr": "NUDT;;SJTU", + "aff_unique_url": "http://www.nudt.edu.cn/;http://www.auaf.edu.cn;https://www.sjtu.edu.cn", + "aff_unique_abbr": "NUDT;AUAF;SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Boyang and Wang,\n Yingqian and Wang,\n Longguang and Zhang,\n Fei and Liu,\n Ting and Lin,\n Zaiping and An,\n Wei and Guo,\n Yulan\n},\n title = {\n Monte Carlo Linear Clustering with Single-Point Supervision is Enough for Infrared Small Target Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1009-1019\n} \n}" }, { "title": "MoreauGrad: Sparse and Robust 
Interpretation of Neural Networks via Moreau Envelope", @@ -37547,14 +38808,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_MoreauGrad_Sparse_and_Robust_Interpretation_of_Neural_Networks_via_Moreau_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Chinese University of Hong Kong", + "aff_unique_norm": "The Chinese University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jingwei and Farnia,\n Farzan\n},\n title = {\n MoreauGrad: Sparse and Robust Interpretation of Neural Networks via Moreau Envelope\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2021-2030\n} \n}" }, { "title": "MosaiQ: Quantum Generative Adversarial Networks for Image Generation on NISQ Computers", @@ -37586,7 +38848,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Silver_2023_ICCV,\n \n author = {\n Silver,\n Daniel and Patel,\n Tirthak and Cutler,\n William and Ranjan,\n Aditya and Gandhi,\n Harshitta and Tiwari,\n Devesh\n},\n title = {\n MosaiQ: Quantum Generative Adversarial Networks for Image Generation on NISQ Computers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7030-7039\n} \n}" }, { "title": "Most Important Person-Guided Dual-Branch Cross-Patch Attention for Group Affect Recognition", @@ -37611,14 +38874,15 @@ "author_num": 7, 
"oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xie_Most_Important_Person-Guided_Dual-Branch_Cross-Patch_Attention_for_Group_Affect_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;0;0;2", - "aff_unique_norm": "National Yang Ming Chiao Tung University;University of Illinois Urbana-Champaign;National Taiwan University", + "aff_unique_norm": "National Yang Ming Chiao Tung University;University of Illinois at Urbana-Champaign;National Taiwan University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nycu.edu.tw;https://illinois.edu;https://www.ntu.edu.tw", "aff_unique_abbr": "NYCU;UIUC;NTU", "aff_campus_unique_index": "0;1;1;0;0;0;0", "aff_campus_unique": "Taiwan;Urbana-Champaign", "aff_country_unique_index": "0;1;1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Hongxia and Lee,\n Ming-Xian and Chen,\n Tzu-Jui and Chen,\n Hung-Jen and Liu,\n Hou-I and Shuai,\n Hong-Han and Cheng,\n Wen-Huang\n},\n title = {\n Most Important Person-Guided Dual-Branch Cross-Patch Attention for Group Affect Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20598-20608\n} \n}" }, { "title": "Motion-Guided Masking for Spatiotemporal Representation Learning", @@ -37642,15 +38906,16 @@ "email": "amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com;amazon.com", "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fan_Motion-Guided_Masking_for_Spatiotemporal_Representation_Learning_ICCV_2023_paper.html", - "aff_unique_index": "0;0;0;0;0;0;0;0", - "aff_unique_norm": "Amazon", - "aff_unique_dep": "Prime Video", - "aff_unique_url": "https://www.primevideo.com", - "aff_unique_abbr": "Amazon Prime Video", + "aff_unique_index": "0;0;0;1;0;0;0;0", + "aff_unique_norm": 
"Amazon;Amazon Web Services", + "aff_unique_dep": "Prime Video;AWS AI Labs", + "aff_unique_url": "https://www.primevideo.com;https://aws.amazon.com", + "aff_unique_abbr": "Amazon Prime Video;AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n David and Wang,\n Jue and Liao,\n Shuai and Zhu,\n Yi and Bhat,\n Vimal and Santos-Villalobos,\n Hector and MV,\n Rohith and Li,\n Xinyu\n},\n title = {\n Motion-Guided Masking for Spatiotemporal Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5619-5629\n} \n}" }, { "title": "MotionBERT: A Unified Perspective on Learning Human Motion Representations", @@ -37682,7 +38947,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Wentao and Ma,\n Xiaoxuan and Liu,\n Zhaoyang and Liu,\n Libin and Wu,\n Wayne and Wang,\n Yizhou\n},\n title = {\n MotionBERT: A Unified Perspective on Learning Human Motion Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15085-15099\n} \n}" }, { "title": "MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving Camera Videos with Spherical Buffers and Padded Convolutions", @@ -37707,14 +38973,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Parger_MotionDeltaCNN_Sparse_CNN_Inference_of_Frame_Differences_in_Moving_Camera_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1;1;1;0", - "aff_unique_norm": "Graz 
University of Technology;Meta", - "aff_unique_dep": ";Meta Reality Labs", + "aff_unique_norm": "Graz University of Technology;Meta Reality Labs", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tugraz.at;https://www.meta.com", "aff_unique_abbr": "TUGraz;MRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;1;0", - "aff_country_unique": "Austria;United States" + "aff_country_unique": "Austria;United States", + "bibtex": "@InProceedings{Parger_2023_ICCV,\n \n author = {\n Parger,\n Mathias and Tang,\n Chengcheng and Neff,\n Thomas and Twigg,\n Christopher D. and Keskin,\n Cem and Wang,\n Robert and Steinberger,\n Markus\n},\n title = {\n MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving Camera Videos with Spherical Buffers and Padded Convolutions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17292-17301\n} \n}" }, { "title": "MotionLM: Multi-Agent Motion Forecasting as Language Modeling", @@ -37726,7 +38993,7 @@ "author": "Ari Seff; Brian Cera; Dian Chen; Mason Ng; Aurick Zhou; Nigamaa Nayakanti; Khaled S. Refaat; Rami Al-Rfou; Benjamin Sapp", "abstract": "Reliable forecasting of the future behavior of road agents is a critical component to safe planning in autonomous vehicles. Here, we represent continuous trajectories as sequences of discrete motion tokens and cast multi-agent motion prediction as a language modeling task over this domain. Our model, MotionLM, provides several advantages: First, it does not require anchors or explicit latent variable optimization to learn multimodal distributions. Instead, we leverage a single standard language modeling objective, maximizing the average log probability over sequence tokens. Second, our approach bypasses post-hoc interaction heuristics where individual agent trajectory generation is conducted prior to interactive scoring. 
Instead, MotionLM produces joint distributions over interactive agent futures in a single autoregressive decoding process. In addition, the model's sequential factorization enables temporally causal conditional rollouts. The proposed approach establishes new state-of-the-art performance for multi-agent motion prediction on the Waymo Open Motion Dataset, ranking 1st on the interactive challenge leaderboard.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Seff_MotionLM_Multi-Agent_Motion_Forecasting_as_Language_Modeling_ICCV_2023_paper.pdf", - "aff": "Waymo; Waymo; Waymo+\u21e4; Waymo; Waymo; Waymo; Waymo; Waymo; Waymo", + "aff": "Waymo; Waymo; Waymo+⇤; Waymo; Waymo; Waymo; Waymo; Waymo; Waymo", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Seff_MotionLM_Multi-Agent_Motion_ICCV_2023_supplemental.zip", @@ -37746,7 +39013,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Seff_2023_ICCV,\n \n author = {\n Seff,\n Ari and Cera,\n Brian and Chen,\n Dian and Ng,\n Mason and Zhou,\n Aurick and Nayakanti,\n Nigamaa and Refaat,\n Khaled S. 
and Al-Rfou,\n Rami and Sapp,\n Benjamin\n},\n title = {\n MotionLM: Multi-Agent Motion Forecasting as Language Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8579-8590\n} \n}" }, { "title": "Movement Enhancement toward Multi-Scale Video Feature Representation for Temporal Action Detection", @@ -37778,7 +39046,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Zixuan and Wang,\n Dongqi and Zhao,\n Xu\n},\n title = {\n Movement Enhancement toward Multi-Scale Video Feature Representation for Temporal Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13555-13564\n} \n}" }, { "title": "Multi-Directional Subspace Editing in Style-Space", @@ -37786,6 +39055,7 @@ "status": "Poster", "track": "main", "pid": "10190", + "author_site": "Chen Naveh", "author": "Chen Naveh", "abstract": "This paper describes a new technique for finding disentangled semantic directions in the latent space of StyleGAN. Our method identifies meaningful orthogonal subspaces that allow editing of one human face attribute, while minimizing undesired changes in other attributes. Our model is capable of editing a single attribute in multiple directions, resulting in a range of possible generated images. We compare our scheme with three state-of-the-art models and show that our method outperforms them in terms of face editing and disentanglement capabilities. 
Additionally, we suggest quantitative measures for evaluating attribute separation and disentanglement, and exhibit the superiority of our model with respect to those measures.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Naveh_Multi-Directional_Subspace_Editing_in_Style-Space_ICCV_2023_paper.pdf", @@ -37807,7 +39077,8 @@ "aff_unique_url": "https://www.reichman.ac.il", "aff_unique_abbr": "", "aff_country_unique_index": "0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Naveh_2023_ICCV,\n \n author = {\n Naveh,\n Chen\n},\n title = {\n Multi-Directional Subspace Editing in Style-Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7138-7148\n} \n}" }, { "title": "Multi-Event Video-Text Retrieval", @@ -37834,12 +39105,13 @@ "aff_unique_index": "0+1;0;2;0+1", "aff_unique_norm": "Ludwig Maximilian University of Munich;Munich Center for Machine Learning;University of Oxford", "aff_unique_dep": ";;", - "aff_unique_url": "https://www.lmu.de;;https://www.ox.ac.uk", + "aff_unique_url": "https://www.lmu.de;https://www.munich-center-for-machine-learning.de;https://www.ox.ac.uk", "aff_unique_abbr": "LMU;;Oxford", "aff_campus_unique_index": "0+0;0;1;0+0", "aff_campus_unique": "Munich;Oxford", "aff_country_unique_index": "0+0;0;1;0+0", - "aff_country_unique": "Germany;United Kingdom" + "aff_country_unique": "Germany;United Kingdom", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Gengyuan and Ren,\n Jisen and Gu,\n Jindong and Tresp,\n Volker\n},\n title = {\n Multi-Event Video-Text Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22113-22123\n} \n}" }, { "title": "Multi-Frequency Representation Enhancement with Privilege Information for Video 
Super-Resolution", @@ -37864,14 +39136,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Multi-Frequency_Representation_Enhancement_with_Privilege_Information_for_Video_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;0", - "aff_unique_norm": "China Agricultural University;Tsinghua University;Samsung", - "aff_unique_dep": ";;Samsung Research", + "aff_unique_norm": "China Agricultural University;Tsinghua University;Samsung Research", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.cau.edu.cn/;https://www.tsinghua.edu.cn;https://www.samsung.com/cn/research/", "aff_unique_abbr": "CAU;THU;SRC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Fei and Zhang,\n Linfeng and Liu,\n Zikun and Lei,\n Juan and Li,\n Zhenbo\n},\n title = {\n Multi-Frequency Representation Enhancement with Privilege Information for Video Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12814-12825\n} \n}" }, { "title": "Multi-Label Knowledge Distillation", @@ -37896,14 +39169,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Multi-Label_Knowledge_Distillation_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;3;3+4;0+1", - "aff_unique_norm": "Nanjing University of Aeronautics and Astronautics;MIIT;Nanyang Technological University;RIKEN;University of Tokyo", + "aff_unique_norm": "Nanjing University of Aeronautics and Astronautics;MIIT;Nanyang Technological University;RIKEN;The University of Tokyo", "aff_unique_dep": "College of Computer Science and Technology;Key Laboratory of Pattern Analysis and Machine Intelligence;School of Computer Science and Engineering;Center for Advanced 
Intelligence Project;", "aff_unique_url": "http://www.nuaa.edu.cn;;https://www.ntu.edu.sg;https://www.riken.jp/en/;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "NUAA;MIIT;NTU;RIKEN;UTokyo", "aff_campus_unique_index": "0+0;0+0;0+0;1;3;0+0", "aff_campus_unique": "Nanjing;Singapore;;Tokyo", "aff_country_unique_index": "0+0;0+0;0+0;1;2;2+2;0+0", - "aff_country_unique": "China;Singapore;Japan" + "aff_country_unique": "China;Singapore;Japan", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Penghui and Xie,\n Ming-Kun and Zong,\n Chen-Chen and Feng,\n Lei and Niu,\n Gang and Sugiyama,\n Masashi and Huang,\n Sheng-Jun\n},\n title = {\n Multi-Label Knowledge Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17271-17280\n} \n}" }, { "title": "Multi-Label Self-Supervised Learning with Scene Images", @@ -37935,7 +39209,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Ke and Fu,\n Minghao and Wu,\n Jianxin\n},\n title = {\n Multi-Label Self-Supervised Learning with Scene Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6694-6703\n} \n}" }, { "title": "Multi-Metrics Adaptively Identifies Backdoors in Federated Learning", @@ -37967,7 +39242,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Siquan and Li,\n Yijiang and Chen,\n Chong and Shi,\n Leyu and Gao,\n Ying\n},\n title = {\n Multi-Metrics Adaptively 
Identifies Backdoors in Federated Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4652-4662\n} \n}" }, { "title": "Multi-Modal Continual Test-Time Adaptation for 3D Semantic Segmentation", @@ -37999,7 +39275,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Haozhi and Xu,\n Yuecong and Yang,\n Jianfei and Yin,\n Pengyu and Yuan,\n Shenghai and Xie,\n Lihua\n},\n title = {\n Multi-Modal Continual Test-Time Adaptation for 3D Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18809-18819\n} \n}" }, { "title": "Multi-Modal Gated Mixture of Local-to-Global Experts for Dynamic Image Fusion", @@ -38031,7 +39308,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Bing and Sun,\n Yiming and Zhu,\n Pengfei and Hu,\n Qinghua\n},\n title = {\n Multi-Modal Gated Mixture of Local-to-Global Experts for Dynamic Image Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23555-23564\n} \n}" }, { "title": "Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a Light-Weight ToF Sensor", @@ -38057,13 +39335,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Multi-Modal_Neural_Radiance_Field_for_Monocular_Dense_SLAM_with_a_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Zhejiang 
University;Google", - "aff_unique_dep": "State Key Lab of CAD&CG;Google", + "aff_unique_dep": "State Key Lab of CAD&CG;", "aff_unique_url": "http://www.zju.edu.cn;https://www.google.com", "aff_unique_abbr": "ZJU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xinyang and Li,\n Yijin and Teng,\n Yanbin and Bao,\n Hujun and Zhang,\n Guofeng and Zhang,\n Yinda and Cui,\n Zhaopeng\n},\n title = {\n Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a Light-Weight ToF Sensor\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1-11\n} \n}" }, { "title": "Multi-Object Discovery by Low-Dimensional Object Motion", @@ -38071,8 +39350,8 @@ "status": "Poster", "track": "main", "pid": "1532", - "author_site": "Sadra Safadoust, Fatma G\u00fcney", - "author": "Sadra Safadoust; Fatma G\u00fcney", + "author_site": "Sadra Safadoust, Fatma Güney", + "author": "Sadra Safadoust; Fatma Güney", "abstract": "Recent work in unsupervised multi-object segmentation shows impressive results by predicting motion from a single image despite the inherent ambiguity in predicting motion without the next image. On the other hand, the set of possible motions for an image can be constrained to a low-dimensional space by considering the scene structure and moving objects in it. We propose to model pixel-wise geometry and object motion to remove ambiguity in reconstructing flow from a single image. Specifically, we divide the image into coherently moving regions and use depth to construct flow bases that best explain the observed flow in each region. 
We achieve state-of-the-art results in unsupervised multi-object segmentation on synthetic and real-world datasets by modeling the scene structure and object motion. Our evaluation of the predicted depth maps shows reliable performance in monocular depth estimation.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Safadoust_Multi-Object_Discovery_by_Low-Dimensional_Object_Motion_ICCV_2023_paper.pdf", "aff": "KUIS AI Center and Department of Computer Engineering, Koc University; KUIS AI Center and Department of Computer Engineering, Koc University", @@ -38095,7 +39374,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "T\u00fcrkiye" + "aff_country_unique": "Turkey", + "bibtex": "@InProceedings{Safadoust_2023_ICCV,\n \n author = {\n Safadoust,\n Sadra and G\\\"uney,\n Fatma\n},\n title = {\n Multi-Object Discovery by Low-Dimensional Object Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 734-744\n} \n}" }, { "title": "Multi-Object Navigation with Dynamically Learned Neural Implicit Representations", @@ -38120,14 +39400,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Marza_Multi-Object_Navigation_with_Dynamically_Learned_Neural_Implicit_Representations_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2", - "aff_unique_norm": "INSA Lyon;University of California, Los Angeles;NAVER LABS", - "aff_unique_dep": ";;", + "aff_unique_norm": "INSA Lyon;University of California, Los Angeles;Naver Labs", + "aff_unique_dep": ";;Naver Labs Europe", "aff_unique_url": "https://www.insa-lyon.fr;https://www.ucla.edu;https://labs.naver.com", "aff_unique_abbr": "INSA Lyon;UCLA;NLE", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;2", - "aff_country_unique": "France;United States;Unknown" + 
"aff_country_unique": "France;United States;Unknown", + "bibtex": "@InProceedings{Marza_2023_ICCV,\n \n author = {\n Marza,\n Pierre and Matignon,\n Laetitia and Simonin,\n Olivier and Wolf,\n Christian\n},\n title = {\n Multi-Object Navigation with Dynamically Learned Neural Implicit Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11004-11015\n} \n}" }, { "title": "Multi-Scale Bidirectional Recurrent Network with Hybrid Correlation for Point Cloud Based Scene Flow Estimation", @@ -38159,7 +39440,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Wencan and Ko,\n Jong Hwan\n},\n title = {\n Multi-Scale Bidirectional Recurrent Network with Hybrid Correlation for Point Cloud Based Scene Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10041-10050\n} \n}" }, { "title": "Multi-Scale Residual Low-Pass Filter Network for Image Deblurring", @@ -38182,7 +39464,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_Multi-Scale_Residual_Low-Pass_Filter_Network_for_Image_Deblurring_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_Multi-Scale_Residual_Low-Pass_Filter_Network_for_Image_Deblurring_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Jiangxin and Pan,\n Jinshan and Yang,\n Zhongbao and Tang,\n Jinhui\n},\n title = {\n Multi-Scale Residual Low-Pass Filter Network for Image Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12345-12354\n} \n}" }, { "title": "Multi-Task Learning with Knowledge Distillation for Dense Prediction", @@ -38214,7 +39497,8 @@ "aff_campus_unique_index": "0+0;1;0", "aff_campus_unique": "Wuhan;Jeddah", "aff_country_unique_index": "0+0;1;0", - "aff_country_unique": "China;Saudi Arabia" + "aff_country_unique": "China;Saudi Arabia", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yangyang and Yang,\n Yibo and Zhang,\n Lefei\n},\n title = {\n Multi-Task Learning with Knowledge Distillation for Dense Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21550-21559\n} \n}" }, { "title": "Multi-View Active Fine-Grained Visual Recognition", @@ -38246,7 +39530,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Du_2023_ICCV,\n \n author = {\n Du,\n Ruoyi and Yu,\n Wenqing and Wang,\n Heqing and Lin,\n Ting-En and Chang,\n Dongliang and Ma,\n Zhanyu\n},\n title = {\n Multi-View Active Fine-Grained Visual Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1568-1578\n} \n}" }, { "title": "Multi-body Depth and Camera Pose Estimation from Multiple Views", @@ -38278,7 +39563,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Cin_2023_ICCV,\n \n author = {\n Cin,\n Andrea Porfiri Dal and Boracchi,\n Giacomo and Magri,\n Luca\n},\n title = {\n Multi-body Depth and Camera Pose Estimation from Multiple Views\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17804-17814\n} \n}" }, { "title": "Multi-grained Temporal Prototype Learning for Few-shot Video Object Segmentation", @@ -38290,7 +39576,7 @@ "author": "Nian Liu; Kepan Nan; Wangbo Zhao; Yuanwei Liu; Xiwen Yao; Salman Khan; Hisham Cholakkal; Rao Muhammad Anwer; Junwei Han; Fahad Shahbaz Khan", "abstract": "Few-Shot Video Object Segmentation (FSVOS) aims to segment objects in a query video with the same category defined by a few annotated support images. However, this task was seldom explored. In this work, based on IPMT, a state-of-the-art few-shot image segmentation method that combines external support guidance information with adaptive query guidance cues, we propose to leverage multi-grained temporal guidance information for handling the temporal correlation nature of video data. We decompose the query video information into a clip prototype and a memory prototype for capturing local and long-term internal temporal guidance, respectively. Frame prototypes are further used for each frame independently to handle fine-grained adaptive guidance and enable bidirectional clip-frame prototype communication. To reduce the influence of noisy memory, we propose to leverage the structural similarity relation among different predicted regions and the support for selecting reliable memory frames. Furthermore, a new segmentation loss is also proposed to enhance the category discriminability of the learned prototypes. Experimental results demonstrate that our proposed video IPMT model significantly outperforms previous models on two benchmark datasets. 
Code is available at https://github.com/nankepan/VIPMT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_Multi-grained_Temporal_Prototype_Learning_for_Few-shot_Video_Object_Segmentation_ICCV_2023_paper.pdf", - "aff": "Mohamed bin Zayed University of Artificial Intelligence; Northwestern Polytechnical University; National University of Singapore; Northwestern Polytechnical University; Northwestern Polytechnical University; Mohamed bin Zayed University of Artificial Intelligence; Mohamed bin Zayed University of Artificial Intelligence; Mohamed bin Zayed University of Artificial Intelligence; Northwestern Polytechnical University; CVL, Link \u00a8oping University", + "aff": "Mohamed bin Zayed University of Artificial Intelligence; Northwestern Polytechnical University; National University of Singapore; Northwestern Polytechnical University; Northwestern Polytechnical University; Mohamed bin Zayed University of Artificial Intelligence; Mohamed bin Zayed University of Artificial Intelligence; Mohamed bin Zayed University of Artificial Intelligence; Northwestern Polytechnical University; CVL, Linköping University", "project": "", "github": "https://github.com/nankepan/VIPMT", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Liu_Multi-grained_Temporal_Prototype_ICCV_2023_supplemental.zip", @@ -38303,14 +39589,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Multi-grained_Temporal_Prototype_Learning_for_Few-shot_Video_Object_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;1;0;0;0;1;3", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Northwestern Polytechnical University;National University of Singapore;Link\u00f6ping University", + "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Northwestern Polytechnical University;National University of Singapore;Linköping University", "aff_unique_dep": ";;;Computer Vision Laboratory 
(CVL)", - "aff_unique_url": "https://mbzuai.ac.ae;https://www.nwpu.edu.cn;https://www.nus.edu.sg;https://www.liu.se", + "aff_unique_url": "https://www.mbzuai.ac.ae;https://www.nwpu.edu.cn;https://www.nus.edu.sg;https://www.liu.se", "aff_unique_abbr": "MBZUAI;NWPU;NUS;LiU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;1;0;0;0;1;3", - "aff_country_unique": "United Arab Emirates;China;Singapore;Sweden" + "aff_country_unique": "United Arab Emirates;China;Singapore;Sweden", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Nian and Nan,\n Kepan and Zhao,\n Wangbo and Liu,\n Yuanwei and Yao,\n Xiwen and Khan,\n Salman and Cholakkal,\n Hisham and Anwer,\n Rao Muhammad and Han,\n Junwei and Khan,\n Fahad Shahbaz\n},\n title = {\n Multi-grained Temporal Prototype Learning for Few-shot Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18862-18871\n} \n}" }, { "title": "Multi-granularity Interaction Simulation for Unsupervised Interactive Segmentation", @@ -38335,14 +39622,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Multi-granularity_Interaction_Simulation_for_Unsupervised_Interactive_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0+0;0+0;1+0;0+0;0+0;2;0+1+0;2;0+1+0", - "aff_unique_norm": "Peking University;Pengcheng Laboratory;Tsinghua University", - "aff_unique_dep": "School of Electronic and Computer Engineering;Peng Cheng Laboratory;Department of Automation and BNRist", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory;Tsinghua University", + "aff_unique_dep": "School of Electronic and Computer Engineering;;Department of Automation and BNRist", "aff_unique_url": "http://www.pku.edu.cn;;https://www.tsinghua.edu.cn", "aff_unique_abbr": "PKU;;THU", "aff_campus_unique_index": 
"0+0;0+0;0+0;0+0;0+0;1;0+0+0;1;0+0+0", "aff_campus_unique": "Shenzhen;Beijing", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0;0+0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Kehan and Zhao,\n Yian and Wang,\n Zhennan and Cheng,\n Zesen and Jin,\n Peng and Ji,\n Xiangyang and Yuan,\n Li and Liu,\n Chang and Chen,\n Jie\n},\n title = {\n Multi-granularity Interaction Simulation for Unsupervised Interactive Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 666-676\n} \n}" }, { "title": "Multi-interactive Feature Learning and a Full-time Multi-modality Benchmark for Image Fusion and Segmentation", @@ -38367,14 +39655,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Multi-interactive_Feature_Learning_and_a_Full-time_Multi-modality_Benchmark_for_Image_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0+1;0;0;0", - "aff_unique_norm": "Dalian University of Technology;Pengcheng Laboratory", - "aff_unique_dep": "School of Mechanical Engineering;Peng Cheng Laboratory", - "aff_unique_url": "http://www.dlut.edu.cn;http://www.pcl.ac.cn", + "aff_unique_norm": "Dalian University of Technology;Peng Cheng Laboratory", + "aff_unique_dep": "School of Mechanical Engineering;", + "aff_unique_url": "http://www.dlut.edu.cn/;http://www.pcl.ac.cn", "aff_unique_abbr": "DUT;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jinyuan and Liu,\n Zhu and Wu,\n Guanyao and Ma,\n Long and Liu,\n Risheng and Zhong,\n Wei and Luo,\n Zhongxuan and Fan,\n Xin\n},\n title = {\n Multi-interactive Feature Learning and a Full-time Multi-modality 
Benchmark for Image Fusion and Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8115-8124\n} \n}" }, { "title": "Multi-label Affordance Mapping from Egocentric Vision", @@ -38406,7 +39695,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Mur-Labadia_2023_ICCV,\n \n author = {\n Mur-Labadia,\n Lorenzo and Guerrero,\n Jose J. and Martinez-Cantin,\n Ruben\n},\n title = {\n Multi-label Affordance Mapping from Egocentric Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5238-5249\n} \n}" }, { "title": "Multi-task View Synthesis with Neural Radiance Fields", @@ -38431,14 +39721,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_Multi-task_View_Synthesis_with_Neural_Radiance_Fields_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Carnegie Mellon University", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.cmu.edu", "aff_unique_abbr": "UIUC;CMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Shuhong and Bao,\n Zhipeng and Hebert,\n Martial and Wang,\n Yu-Xiong\n},\n title = {\n Multi-task View Synthesis with Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 21538-21549\n} \n}" }, { "title": "Multi-view Self-supervised Disentanglement for General Image Denoising", @@ -38470,7 +39761,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "United Kingdom;China;United States" + "aff_country_unique": "United Kingdom;China;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Hao and Qu,\n Chenyuan and Zhang,\n Yu and Chen,\n Chen and Jiao,\n Jianbo\n},\n title = {\n Multi-view Self-supervised Disentanglement for General Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12281-12291\n} \n}" }, { "title": "Multi-view Spectral Polarization Propagation for Video Glass Segmentation", @@ -38502,7 +39794,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Qiao_2023_ICCV,\n \n author = {\n Qiao,\n Yu and Dong,\n Bo and Jin,\n Ao and Fu,\n Yu and Baek,\n Seung-Hwan and Heide,\n Felix and Peers,\n Pieter and Wei,\n Xiaopeng and Yang,\n Xin\n},\n title = {\n Multi-view Spectral Polarization Propagation for Video Glass Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23218-23228\n} \n}" }, { "title": "Multi-weather Image Restoration via Domain Translation", @@ -38534,7 +39827,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Geelong Warun Ponds;", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "Australia;Ireland" + "aff_country_unique": "Australia;Ireland", + "bibtex": 
"@InProceedings{Patil_2023_ICCV,\n \n author = {\n Patil,\n Prashant W. and Gupta,\n Sunil and Rana,\n Santu and Venkatesh,\n Svetha and Murala,\n Subrahmanyam\n},\n title = {\n Multi-weather Image Restoration via Domain Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21696-21705\n} \n}" }, { "title": "Multi3DRefer: Grounding Text Description to Multiple 3D Objects", @@ -38566,7 +39860,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yiming and Gong,\n ZeMing and Chang,\n Angel X.\n},\n title = {\n Multi3DRefer: Grounding Text Description to Multiple 3D Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15225-15236\n} \n}" }, { "title": "Multimodal Distillation for Egocentric Action Recognition", @@ -38598,7 +39893,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Belgium" + "aff_country_unique": "Belgium", + "bibtex": "@InProceedings{Radevski_2023_ICCV,\n \n author = {\n Radevski,\n Gorjan and Grujicic,\n Dusan and Blaschko,\n Matthew and Moens,\n Marie-Francine and Tuytelaars,\n Tinne\n},\n title = {\n Multimodal Distillation for Egocentric Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5213-5224\n} \n}" }, { "title": "Multimodal Garment Designer: Human-Centric Latent Diffusion Models for Fashion Image Editing", @@ -38630,7 +39926,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", 
"aff_country_unique_index": "0+0;0+0;0;0;0;0+0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Baldrati_2023_ICCV,\n \n author = {\n Baldrati,\n Alberto and Morelli,\n Davide and Cartella,\n Giuseppe and Cornia,\n Marcella and Bertini,\n Marco and Cucchiara,\n Rita\n},\n title = {\n Multimodal Garment Designer: Human-Centric Latent Diffusion Models for Fashion Image Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23393-23402\n} \n}" }, { "title": "Multimodal High-order Relation Transformer for Scene Boundary Detection", @@ -38662,7 +39959,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Xi and Shi,\n Zhangxiang and Zhang,\n Tianzhu and Yu,\n Xiaoyuan and Xiao,\n Lei\n},\n title = {\n Multimodal High-order Relation Transformer for Scene Boundary Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22081-22090\n} \n}" }, { "title": "Multimodal Motion Conditioned Diffusion Model for Skeleton-based Video Anomaly Detection", @@ -38694,7 +39992,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Flaborea_2023_ICCV,\n \n author = {\n Flaborea,\n Alessandro and Collorone,\n Luca and di Melendugno,\n Guido Maria D'Amely and D'Arrigo,\n Stefano and Prenkaj,\n Bardh and Galasso,\n Fabio\n},\n title = {\n Multimodal Motion Conditioned Diffusion Model for Skeleton-based Video Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10318-10329\n} \n}" }, { "title": "Multimodal Optimal Transport-based Co-Attention Transformer with Global Structure Consistency for Survival Prediction", @@ -38726,7 +40025,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yingxue and Chen,\n Hao\n},\n title = {\n Multimodal Optimal Transport-based Co-Attention Transformer with Global Structure Consistency for Survival Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21241-21251\n} \n}" }, { "title": "Multimodal Variational Auto-encoder based Audio-Visual Segmentation", @@ -38758,7 +40058,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Mao_2023_ICCV,\n \n author = {\n Mao,\n Yuxin and Zhang,\n Jing and Xiang,\n Mochu and Zhong,\n Yiran and Dai,\n Yuchao\n},\n title = {\n Multimodal Variational Auto-encoder based Audio-Visual Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 954-965\n} \n}" }, { "title": "Multiple Instance Learning Framework with Masked Hard Instance Mining for Whole Slide Image Classification", @@ -38790,7 +40091,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n 
Wenhao and Huang,\n Sheng and Zhang,\n Xiaoxian and Zhou,\n Fengtao and Zhang,\n Yi and Liu,\n Bo\n},\n title = {\n Multiple Instance Learning Framework with Masked Hard Instance Mining for Whole Slide Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4078-4087\n} \n}" }, { "title": "Multiple Planar Object Tracking", @@ -38822,7 +40124,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Zhicheng and Liu,\n Shengzhe and Yang,\n Jufeng\n},\n title = {\n Multiple Planar Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23460-23470\n} \n}" }, { "title": "Multiscale Representation for Real-Time Anti-Aliasing Neural Rendering", @@ -38848,13 +40151,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Multiscale_Representation_for_Real-Time_Anti-Aliasing_Neural_Rendering_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;3+0;0", "aff_unique_norm": "University of Melbourne;Google;University of Sydney;Alibaba Group", - "aff_unique_dep": ";Google;;", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.unimelb.edu.au;https://www.google.com;https://www.sydney.edu.au;https://www.alibaba.com", "aff_unique_abbr": "UniMelb;Google;USYD;Alibaba", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;2+0;0", - "aff_country_unique": "Australia;United States;China" + "aff_country_unique": "Australia;United States;China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Dongting and Zhang,\n Zhenkai and Hou,\n Tingbo and Liu,\n Tongliang and Fu,\n 
Huan and Gong,\n Mingming\n},\n title = {\n Multiscale Representation for Real-Time Anti-Aliasing Neural Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17772-17783\n} \n}" }, { "title": "Multiscale Structure Guided Diffusion for Image Deblurring", @@ -38866,7 +40170,7 @@ "author": "Mengwei Ren; Mauricio Delbracio; Hossein Talebi; Guido Gerig; Peyman Milanfar", "abstract": "Diffusion Probabilistic Models (DPMs) have recently been employed for image deblurring, formulated as an image-conditioned generation process that maps Gaussian noise to the high-quality image, conditioned on the blurry input. Image-conditioned DPMs (icDPMs) have shown more realistic results than regression-based methods when trained on pairwise in-domain data. However, their robustness in restoring images is unclear when presented with out-of-domain images as they do not impose specific degradation models or intermediate constraints. To this end, we introduce a simple yet effective multiscale structure guidance as an implicit bias that informs the icDPM about the coarse structure of the sharp image at the intermediate layers. This guided formulation leads to a significant improvement of the deblurring results, particularly on unseen domain. The guidance is extracted from the latent space of a regression network trained to predict the clean-sharp target at multiple lower resolutions, thus maintaining the most salient sharp structures. With both the blurry input and multiscale guidance, the icDPM model can better understand the blur and recover the clean image. We evaluate a single-dataset trained model on diverse datasets and demonstrate more robust deblurring results with fewer artifacts on unseen data. 
Our method outperforms existing baselines, achieving state-of-the-art perceptual quality while keeping competitive distortion metrics.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ren_Multiscale_Structure_Guided_Diffusion_for_Image_Deblurring_ICCV_2023_paper.pdf", - "aff": "New York University\u2020; Google Research\u2021; Google Research\u2021; New York University\u2020; Google Research\u2021", + "aff": "New York University†; Google Research‡; Google Research‡; New York University†; Google Research‡", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Ren_Multiscale_Structure_Guided_ICCV_2023_supplemental.pdf", @@ -38886,7 +40190,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ren_2023_ICCV,\n \n author = {\n Ren,\n Mengwei and Delbracio,\n Mauricio and Talebi,\n Hossein and Gerig,\n Guido and Milanfar,\n Peyman\n},\n title = {\n Multiscale Structure Guided Diffusion for Image Deblurring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10721-10733\n} \n}" }, { "title": "Muscles in Action", @@ -38918,7 +40223,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chiquier_2023_ICCV,\n \n author = {\n Chiquier,\n Mia and Vondrick,\n Carl\n},\n title = {\n Muscles in Action\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22091-22101\n} \n}" }, { "title": "NAPA-VQ: Neighborhood-Aware Prototype Augmentation with Vector Quantization for Continual 
Learning", @@ -38943,14 +40249,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Malepathirana_NAPA-VQ_Neighborhood-Aware_Prototype_Augmentation_with_Vector_Quantization_for_Continual_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Melbourne", + "aff_unique_norm": "The University of Melbourne", "aff_unique_dep": "Dept. of Mechanical Engineering", "aff_unique_url": "https://www.unimelb.edu.au", "aff_unique_abbr": "UniMelb", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0;0", + "aff_campus_unique": "Melbourne", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Malepathirana_2023_ICCV,\n \n author = {\n Malepathirana,\n Tamasha and Senanayake,\n Damith and Halgamuge,\n Saman\n},\n title = {\n NAPA-VQ: Neighborhood-Aware Prototype Augmentation with Vector Quantization for Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11674-11684\n} \n}" }, { "title": "NCHO: Unsupervised Learning for Neural 3D Composition of Humans and Objects", @@ -38975,14 +40282,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_NCHO_Unsupervised_Learning_for_Neural_3D_Composition_of_Humans_and_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "Seoul National University;Meta", - "aff_unique_dep": ";Meta Reality Labs", + "aff_unique_norm": "Seoul National University;Meta Reality Labs", + "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.meta.com", "aff_unique_abbr": "SNU;MRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", 
+ "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Taeksoo and Saito,\n Shunsuke and Joo,\n Hanbyul\n},\n title = {\n NCHO: Unsupervised Learning for Neural 3D Composition of Humans and Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14817-14828\n} \n}" }, { "title": "NDC-Scene: Boost Monocular 3D Semantic Scene Completion in Normalized Device Coordinates Space", @@ -39014,7 +40322,8 @@ "aff_campus_unique_index": ";1;1;1;;1+1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1+2;2;2;2;1+2;2+2+2", - "aff_country_unique": "United States;Australia;China" + "aff_country_unique": "United States;Australia;China", + "bibtex": "@InProceedings{Yao_2023_ICCV,\n \n author = {\n Yao,\n Jiawei and Li,\n Chuming and Sun,\n Keqiang and Cai,\n Yingjie and Li,\n Hao and Ouyang,\n Wanli and Li,\n Hongsheng\n},\n title = {\n NDC-Scene: Boost Monocular 3D Semantic Scene Completion in Normalized Device Coordinates Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9455-9465\n} \n}" }, { "title": "NDDepth: Normal-Distance Assisted Monocular Depth Estimation", @@ -39037,7 +40346,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shao_NDDepth_Normal-Distance_Assisted_Monocular_Depth_Estimation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shao_NDDepth_Normal-Distance_Assisted_Monocular_Depth_Estimation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Shuwei and Pei,\n Zhongcai and Chen,\n Weihai and Wu,\n Xingming and Li,\n Zhengguo\n},\n title = {\n NDDepth: Normal-Distance Assisted Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7931-7940\n} \n}" }, { "title": "NEMTO: Neural Environment Matting for Novel View and Relighting Synthesis of Transparent Objects", @@ -39045,8 +40355,8 @@ "status": "Poster", "track": "main", "pid": "32", - "author_site": "Dongqing Wang, Tong Zhang, Sabine S\u00fcsstrunk", - "author": "Dongqing Wang; Tong Zhang; Sabine S\u00fcsstrunk", + "author_site": "Dongqing Wang, Tong Zhang, Sabine Süsstrunk", + "author": "Dongqing Wang; Tong Zhang; Sabine Süsstrunk", "abstract": "We propose NEMTO, the first end-to-end neural rendering pipeline to model 3D transparent objects with complex geometry and unknown indices of refraction. Commonly used appearance modeling such as the Disney BSDF model cannot accurately address this challenging problem due to the complex light paths bending through refractions and the strong dependency of surface appearance on illumination. With 2D images of the transparent object as input, our method is capable of high-quality novel view and relighting synthesis. We leverage implicit Signed Distance Functions (SDF) to model the object geometry and propose a refraction-aware ray bending network to model the effects of light refraction within the object. Our ray bending network is more tolerant to geometric inaccuracies than traditional physically-based methods for rendering transparent objects. 
We provide extensive evaluations on both synthetic and real-world datasets to demonstrate our high-quality synthesis and the applicability of our method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_NEMTO_Neural_Environment_Matting_for_Novel_View_and_Relighting_Synthesis_ICCV_2023_paper.pdf", "aff": "School of Computer and Communication Sciences, EPFL; School of Computer and Communication Sciences, EPFL; School of Computer and Communication Sciences, EPFL", @@ -39069,7 +40379,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Dongqing and Zhang,\n Tong and S\\"usstrunk,\n Sabine\n},\n title = {\n NEMTO: Neural Environment Matting for Novel View and Relighting Synthesis of Transparent Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 317-327\n} \n}" }, { "title": "NIR-assisted Video Enhancement via Unpaired 24-hour Data", @@ -39101,7 +40412,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Niu_2023_ICCV,\n \n author = {\n Niu,\n Muyao and Zhong,\n Zhihang and Zheng,\n Yinqiang\n},\n title = {\n NIR-assisted Video Enhancement via Unpaired 24-hour Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10778-10788\n} \n}" }, { "title": "NLOS-NeuS: Non-line-of-sight Neural Implicit Surface", @@ -39133,7 +40445,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + 
"bibtex": "@InProceedings{Fujimura_2023_ICCV,\n \n author = {\n Fujimura,\n Yuki and Kushida,\n Takahiro and Funatomi,\n Takuya and Mukaigawa,\n Yasuhiro\n},\n title = {\n NLOS-NeuS: Non-line-of-sight Neural Implicit Surface\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10532-10541\n} \n}" }, { "title": "NPC: Neural Point Characters from Video", @@ -39156,7 +40469,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Su_NPC_Neural_Point_Characters_from_Video_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Su_NPC_Neural_Point_Characters_from_Video_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Su_2023_ICCV,\n \n author = {\n Su,\n Shih-Yang and Bagautdinov,\n Timur and Rhodin,\n Helge\n},\n title = {\n NPC: Neural Point Characters from Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14795-14805\n} \n}" }, { "title": "NSF: Neural Surface Fields for Human Modeling from Monocular Depth", @@ -39168,7 +40482,7 @@ "author": "Yuxuan Xue; Bharat Lal Bhatnagar; Riccardo Marin; Nikolaos Sarafianos; Yuanlu Xu; Gerard Pons-Moll; Tony Tung", "abstract": "Obtaining personalized 3D animatable avatars from a monocular camera has several real world applications in gaming, virtual try-on, animation, and VR/XR, etc. However, it is very challenging to model dynamic and fine-grained clothing deformations from such sparse data. Existing methods for modeling 3D humans from depth data have limitations in terms of computational efficiency, mesh coherency, and flexibility in resolution and topology. 
For instance, reconstructing shapes using implicit functions and extracting explicit meshes per frame is computationally expensive and cannot ensure coherent meshes across frames. Moreover, predicting per-vertex deformations on a pre-designed human template with a discrete surface lacks flexibility in resolution and topology. To overcome these limitations, we propose a novel method 'NSF: Neural Surface Fields' for modeling 3D clothed humans from monocular depth. NSF defines a neural field solely on the base surface which models a continuous and flexible displacement field. NSF can be adapted to the base surface with different resolution and topology without retraining at inference time. Compared to existing approaches, our method eliminates the expensive per-frame surface extraction while maintaining mesh coherency, and is capable of reconstructing meshes with arbitrary resolution without retraining. To foster research in this direction, we release our code in project page at: https://yuxuan-xue.com/nsf.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Xue_NSF_Neural_Surface_Fields_for_Human_Modeling_from_Monocular_Depth_ICCV_2023_paper.pdf", - "aff": "T\u00fcbingen AI Center+University of T\u00fcbingen+Max Planck Institute for Informatics; T\u00fcbingen AI Center+University of T\u00fcbingen+Max Planck Institute for Informatics+Meta Reality Labs Research; T\u00fcbingen AI Center+University of T\u00fcbingen; Meta Reality Labs Research; Meta Reality Labs Research; T\u00fcbingen AI Center+University of T\u00fcbingen+Max Planck Institute for Informatics; Meta Reality Labs Research", + "aff": "Tübingen AI Center+University of Tübingen+Max Planck Institute for Informatics; Tübingen AI Center+University of Tübingen+Max Planck Institute for Informatics+Meta Reality Labs Research; Tübingen AI Center+University of Tübingen; Meta Reality Labs Research; Meta Reality Labs Research; Tübingen AI Center+University of Tübingen+Max Planck Institute for Informatics; 
Meta Reality Labs Research", "project": "https://yuxuan-xue.com/nsf", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Xue_NSF_Neural_Surface_ICCV_2023_supplemental.pdf", @@ -39181,14 +40495,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xue_NSF_Neural_Surface_Fields_for_Human_Modeling_from_Monocular_Depth_ICCV_2023_paper.html", "aff_unique_index": "0+0+1;0+0+1+2;0+0;2;2;0+0+1;2", - "aff_unique_norm": "University of T\u00fcbingen;Max Planck Institute for Informatics;Meta", + "aff_unique_norm": "University of Tübingen;Max Planck Institute for Informatics;Meta Reality Labs", "aff_unique_dep": "AI Center;;Research", "aff_unique_url": "https://www.uni-tuebingen.de/;https://mpi-inf.mpg.de;https://www.meta.com", - "aff_unique_abbr": "Uni T\u00fcbingen;MPII;MRL", + "aff_unique_abbr": "Uni Tübingen;MPII;MRL", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0+0+0;0+0+0+1;0+0;1;1;0+0+0;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Xue_2023_ICCV,\n \n author = {\n Xue,\n Yuxuan and Bhatnagar,\n Bharat Lal and Marin,\n Riccardo and Sarafianos,\n Nikolaos and Xu,\n Yuanlu and Pons-Moll,\n Gerard and Tung,\n Tony\n},\n title = {\n NSF: Neural Surface Fields for Human Modeling from Monocular Depth\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15049-15060\n} \n}" }, { "title": "Name Your Colour For the Task: Artificially Discover Colour Naming via Colour Quantisation Transformer", @@ -39220,7 +40535,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;1+1;0+0;0;1+1", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": 
"@InProceedings{Su_2023_ICCV,\n \n author = {\n Su,\n Shenghan and Gu,\n Lin and Yang,\n Yue and Zhang,\n Zenghui and Harada,\n Tatsuya\n},\n title = {\n Name Your Colour For the Task: Artificially Discover Colour Naming via Colour Quantisation Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12021-12031\n} \n}" }, { "title": "Narrator: Towards Natural Control of Human-Scene Interaction Generation via Relationship Reasoning", @@ -39252,7 +40568,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xuan_2023_ICCV,\n \n author = {\n Xuan,\n Haibiao and Li,\n Xiongzheng and Zhang,\n Jinsong and Zhang,\n Hongwen and Liu,\n Yebin and Li,\n Kun\n},\n title = {\n Narrator: Towards Natural Control of Human-Scene Interaction Generation via Relationship Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22268-22278\n} \n}" }, { "title": "NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent Semantic Navigation", @@ -39284,7 +40601,8 @@ "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Ningbo", "aff_country_unique_index": "0+0+0;0+0;0+0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Baao and Li,\n Bohan and Zhang,\n Zequn and Dong,\n Junting and Jin,\n Xin and Yang,\n Jingyu and Zeng,\n Wenjun\n},\n title = {\n NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent Semantic Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
17992-18002\n} \n}" }, { "title": "Navigating to Objects Specified by Images", @@ -39307,7 +40625,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Krantz_Navigating_to_Objects_Specified_by_Images_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Krantz_Navigating_to_Objects_Specified_by_Images_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Krantz_2023_ICCV,\n \n author = {\n Krantz,\n Jacob and Gervet,\n Theophile and Yadav,\n Karmesh and Wang,\n Austin and Paxton,\n Chris and Mottaghi,\n Roozbeh and Batra,\n Dhruv and Malik,\n Jitendra and Lee,\n Stefan and Chaplot,\n Devendra Singh\n},\n title = {\n Navigating to Objects Specified by Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10916-10925\n} \n}" }, { "title": "NeILF++: Inter-Reflectable Light Fields for Geometry and Material Estimation", @@ -39332,14 +40651,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_NeILF_Inter-Reflectable_Light_Fields_for_Geometry_and_Material_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0;0;2", - "aff_unique_norm": "Apple;Nanjing University;Hong Kong University of Science and Technology", - "aff_unique_dep": "Apple Inc.;;", + "aff_unique_norm": "Apple Inc.;Nanjing University;Hong Kong University of Science and Technology", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.apple.com;https://www.nju.edu.cn;https://www.ust.hk", "aff_unique_abbr": "Apple;Nanjing U;HKUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;0;0;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jingyang and Yao,\n Yao and 
Li,\n Shiwei and Liu,\n Jingbo and Fang,\n Tian and McKinnon,\n David and Tsin,\n Yanghai and Quan,\n Long\n},\n title = {\n NeILF++: Inter-Reflectable Light Fields for Geometry and Material Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3601-3610\n} \n}" }, { "title": "NeMF: Inverse Volume Rendering with Neural Microflake Field", @@ -39347,6 +40667,7 @@ "status": "Poster", "track": "main", "pid": "5427", + "author_site": "Youjia Zhang, Teng Xu, Junqing Yu, Yuteng Ye, Yanqing Jing, Junle Wang, Jingyi Yu, Wei Yang", "author": "Youjia Zhang, Teng Xu, Junqing Yu, Yuteng Ye, Yanqing Jing, Junle Wang, Jingyi Yu, Wei Yang", "abstract": "Recovering the physical attributes of an object's appearance from its images captured under an unknown illumination is challenging yet essential for photo-realistic rendering.Recent approaches adopt the emerging implicit scene representations and have shown impressive results.However, they unanimously adopt a surface-based representation,and hence can not well handle scenes with very complex geometry, translucent object and etc.In this paper, we propose to conduct inverse volume rendering, in contrast to surface-based, by representing a scene using microflake volume, which assumes the space is filled with infinite small flakes and light reflects or scatters at each spatial location according to microflake distributions. We further adopt the coordinate networks to implicitly encode the microflake volume, and develop a differentiable microflake volume renderer to train the network in an end-to-end way in principle.Our NeMF enables effective recovery of appearance attributes for highly complex geometry and scattering object, enables high-quality relighting, material editing, and especially simulates volume rendering effects, such as scattering, which is infeasible for surface-based approaches. 
Our data and code are available at: https://github.com/YoujiaZhang/NeMF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_NeMF_Inverse_Volume_Rendering_with_Neural_Microflake_Field_ICCV_2023_paper.pdf", @@ -39358,7 +40679,8 @@ "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4733369493879995493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_NeMF_Inverse_Volume_Rendering_with_Neural_Microflake_Field_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_NeMF_Inverse_Volume_Rendering_with_Neural_Microflake_Field_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Youjia and Xu,\n Teng and Yu,\n Junqing and Ye,\n Yuteng and Jing,\n Yanqing and Wang,\n Junle and Yu,\n Jingyi and Yang,\n Wei\n},\n title = {\n NeMF: Inverse Volume Rendering with Neural Microflake Field\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22919-22929\n} \n}" }, { "title": "NeO 360: Neural Fields for Sparse View Synthesis of Outdoor Scenes", @@ -39390,7 +40712,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Irshad_2023_ICCV,\n \n author = {\n Irshad,\n Muhammad Zubair and Zakharov,\n Sergey and Liu,\n Katherine and Guizilini,\n Vitor and Kollar,\n Thomas and Gaidon,\n Adrien and Kira,\n Zsolt and Ambrus,\n Rares\n},\n title = {\n NeO 360: Neural Fields for Sparse View Synthesis of Outdoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9187-9198\n} \n}" }, { "title": "NeRF-Det: Learning 
Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection", @@ -39415,14 +40738,15 @@ "author_num": 11, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_NeRF-Det_Learning_Geometry-Aware_Volumetric_Representation_for_Multi-View_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;0;1;0;1;1;0;0", - "aff_unique_norm": "University of California, Berkeley;Meta", + "aff_unique_norm": "University of California, Berkeley;Meta Platforms, Inc.", "aff_unique_dep": ";Meta AI", "aff_unique_url": "https://www.berkeley.edu;https://meta.com", "aff_unique_abbr": "UC Berkeley;Meta", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Chenfeng and Wu,\n Bichen and Hou,\n Ji and Tsai,\n Sam and Li,\n Ruilong and Wang,\n Jialiang and Zhan,\n Wei and He,\n Zijian and Vajda,\n Peter and Keutzer,\n Kurt and Tomizuka,\n Masayoshi\n},\n title = {\n NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23320-23330\n} \n}" }, { "title": "NeRF-LOAM: Neural Implicit Representation for Large-Scale Incremental LiDAR Odometry and Mapping", @@ -39454,7 +40778,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n Junyuan and Wu,\n Qi and Chen,\n Xieyuanli and Xia,\n Songpengcheng and Sun,\n Zhen and Liu,\n Guoqing and Yu,\n Wenxian and Pei,\n Ling\n},\n title = {\n NeRF-LOAM: Neural Implicit Representation for 
Large-Scale Incremental LiDAR Odometry and Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8218-8227\n} \n}" }, { "title": "NeRF-MS: Neural Radiance Fields with Multi-Sequence", @@ -39466,7 +40791,7 @@ "author": "Peihao Li; Shaohui Wang; Chen Yang; Bingbing Liu; Weichao Qiu; Haoqian Wang", "abstract": "Neural radiance fields (NeRF) achieve impressive performance in novel view synthesis when trained on only single sequence data. However, leveraging multiple sequences captured by different cameras at different times is essential for better reconstruction performance. Multi-sequence data takes two main challenges: appearance variation due to different lighting conditions and non-static objects like pedestrians. To address these issues, we propose NeRF-MS, a novel approach to training NeRF with multi-sequence data. Specifically, we utilize a triplet loss to regularize the distribution of per-image appearance code, which leads to better high-frequency texture and consistent appearance, such as specular reflections. Then, we explicitly model non-static objects to reduce floaters. Extensive results demonstrate that NeRF-MS not only outperforms state-of-the-art view synthesis methods on outdoor and synthetic scenes, but also achieves 3D consistent rendering and robust appearance controlling. 
Project page: https://nerf-ms.github.io/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_NeRF-MS_Neural_Radiance_Fields_with_Multi-Sequence_ICCV_2023_paper.pdf", - "aff": "Shenzhen International Graduate School, Tsinghua University; Shenzhen International Graduate School, Tsinghua University; Shanghai Jiao Tong University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Shenzhen International Graduate School, Tsinghua University + Shenzhen Institute of Future Media Technology", + "aff": "Shenzhen International Graduate School, Tsinghua University; Shenzhen International Graduate School, Tsinghua University; Shanghai Jiao Tong University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Shenzhen International Graduate School, Tsinghua University + Shenzhen Institute of Future Media Technology", "project": "https://nerf-ms.github.io/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_NeRF-MS_Neural_Radiance_ICCV_2023_supplemental.pdf", @@ -39480,13 +40805,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_NeRF-MS_Neural_Radiance_Fields_with_Multi-Sequence_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;2;0+3", "aff_unique_norm": "Tsinghua University;Shanghai Jiao Tong University;Huawei;Shenzhen Institute of Future Media Technology", - "aff_unique_dep": "Shenzhen International Graduate School;;Noah\u2019s Ark Lab;", + "aff_unique_dep": "Shenzhen International Graduate School;;Noah’s Ark Lab;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.sjtu.edu.cn;https://www.huawei.com;", "aff_unique_abbr": "THU;SJTU;Huawei;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Peihao and Wang,\n Shaohui and Yang,\n Chen and Liu,\n Bingbing and Qiu,\n Weichao and Wang,\n Haoqian\n},\n title = {\n 
NeRF-MS: Neural Radiance Fields with Multi-Sequence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18591-18600\n} \n}" }, { "title": "NeRFrac: Neural Radiance Fields through Refractive Surface", @@ -39511,14 +40837,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhan_NeRFrac_Neural_Radiance_Fields_through_Refractive_Surface_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "University of Tokyo;Kyoto University", + "aff_unique_norm": "The University of Tokyo;Kyoto University", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.kyoto-u.ac.jp", "aff_unique_abbr": "UTokyo;Kyoto U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Zhan_2023_ICCV,\n \n author = {\n Zhan,\n Yifan and Nobuhara,\n Shohei and Nishino,\n Ko and Zheng,\n Yinqiang\n},\n title = {\n NeRFrac: Neural Radiance Fields through Refractive Surface\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18402-18412\n} \n}" }, { "title": "NeSS-ST: Detecting Good and Stable Keypoints with a Neural Stability Score and the Shi-Tomasi detector", @@ -39550,7 +40877,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Russian Federation;United Kingdom" + "aff_country_unique": "Russia;United Kingdom", + "bibtex": "@InProceedings{Pakulev_2023_ICCV,\n \n author = {\n Pakulev,\n Konstantin and Vakhitov,\n Alexander and Ferrer,\n Gonzalo\n},\n title = {\n NeSS-ST: Detecting Good and Stable Keypoints with a Neural Stability Score and the Shi-Tomasi detector\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9578-9588\n} \n}" }, { "title": "NeTO:Neural Reconstruction of Transparent Objects with Self-Occlusion Aware Refraction-Tracing", @@ -39575,14 +40903,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_NeTONeural_Reconstruction_of_Transparent_Objects_with_Self-Occlusion_Aware_Refraction-Tracing_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;2;0;0", - "aff_unique_norm": "Wuhan University;University of Hong Kong;Texas A&M University", + "aff_unique_norm": "Wuhan University;The University of Hong Kong;Texas A&M University", "aff_unique_dep": "School of Computer Science;;", "aff_unique_url": "http://www.whu.edu.cn;https://www.hku.hk;https://www.tamu.edu", "aff_unique_abbr": "WHU;HKU;TAMU", "aff_campus_unique_index": "0;1;0;0;0;0", "aff_campus_unique": "Wuhan;Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zongcheng and Long,\n Xiaoxiao and Wang,\n Yusen and Cao,\n Tuo and Wang,\n Wenping and Luo,\n Fei and Xiao,\n Chunxia\n},\n title = {\n NeTO:Neural Reconstruction of Transparent Objects with Self-Occlusion Aware Refraction-Tracing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18547-18557\n} \n}" }, { "title": "Nearest Neighbor Guidance for Out-of-Distribution Detection", @@ -39614,7 +40943,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;0", - "aff_country_unique": "South Korea;;United States" + "aff_country_unique": "South Korea;;United States", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Jaewoo and Jung,\n Yoon Gyo and Teoh,\n Andrew Beng 
Jin\n},\n title = {\n Nearest Neighbor Guidance for Out-of-Distribution Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1686-1695\n} \n}" }, { "title": "Neglected Free Lunch - Learning Image Classifiers Using Annotation Byproducts", @@ -39626,7 +40956,7 @@ "author": "Dongyoon Han; Junsuk Choe; Seonghyeok Chun; John Joon Young Chung; Minsuk Chang; Sangdoo Yun; Jean Y. Song; Seong Joon Oh", "abstract": "Supervised learning of image classifiers distills human knowledge into a parametric model through pairs of images and corresponding labels (X,Y). We argue that this simple and widely used representation of human knowledge neglects rich auxiliary information from the annotation procedure, such as the time-series of mouse traces and clicks left after image selection. Our insight is that such annotation byproducts Z provide approximate human attention that weakly guides the model to focus on the foreground cues, reducing spurious correlations and discouraging shortcut learning. To verify this, we create ImageNet-AB and COCO-AB. They are ImageNet and COCO training sets enriched with sample-wise annotation byproducts, collected by replicating the respective original annotation tasks. We refer to the new paradigm of training models with annotation byproducts as learning using annotation byproducts (LUAB). We show that a simple multitask loss for regressing Z together with Y already improves the generalisability and robustness of the learned models. Compared to the original supervised learning, LUAB does not require extra annotation costs. 
ImageNet-AB and COCO-AB are at https://github.com/naver-ai/NeglectedFreeLunch.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Han_Neglected_Free_Lunch_-_Learning_Image_Classifiers_Using_Annotation_Byproducts_ICCV_2023_paper.pdf", - "aff": "NA VER AI Lab; Sogang University; Dante Company; University of Michigan; NA VER AI Lab + Google; NA VER AI Lab; DGIST; University of T\u00fcbingen", + "aff": "NA VER AI Lab; Sogang University; Dante Company; University of Michigan; NA VER AI Lab + Google; NA VER AI Lab; DGIST; University of Tübingen", "project": "", "github": "github.com/naver-ai/NeglectedFreeLunch", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Han_Neglected_Free_Lunch_ICCV_2023_supplemental.pdf", @@ -39639,14 +40969,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_Neglected_Free_Lunch_-_Learning_Image_Classifiers_Using_Annotation_Byproducts_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0+4;0;5;6", - "aff_unique_norm": "NAVER Corporation;Sogang University;Dante Company;University of Michigan;Google;Daegu Gyeongbuk Institute of Science and Technology;University of T\u00fcbingen", - "aff_unique_dep": "AI Lab;;;;Google;;", + "aff_unique_norm": "NAVER Corporation;Sogang University;Dante Company;University of Michigan;Google;Daegu Gyeongbuk Institute of Science and Technology;University of Tübingen", + "aff_unique_dep": "AI Lab;;;;;;", "aff_unique_url": "https://www.naver.com;https://www.sogang.ac.kr;;https://www.umich.edu;https://www.google.com;https://www.dgist.ac.kr;https://www.uni-tuebingen.de/", - "aff_unique_abbr": "NAVER;Sogang;;UM;Google;DGIST;Uni T\u00fcbingen", + "aff_unique_abbr": "NAVER;Sogang;;UM;Google;DGIST;Uni Tübingen", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;2;0+2;0;0;3", - "aff_country_unique": "South Korea;;United States;Germany" + "aff_country_unique": "South Korea;;United States;Germany", + "bibtex": 
"@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Dongyoon and Choe,\n Junsuk and Chun,\n Seonghyeok and Chung,\n John Joon Young and Chang,\n Minsuk and Yun,\n Sangdoo and Song,\n Jean Y. and Oh,\n Seong Joon\n},\n title = {\n Neglected Free Lunch - Learning Image Classifiers Using Annotation Byproducts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20200-20212\n} \n}" }, { "title": "NerfAcc: Efficient Sampling Accelerates NeRFs", @@ -39678,7 +41009,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ruilong and Gao,\n Hang and Tancik,\n Matthew and Kanazawa,\n Angjoo\n},\n title = {\n NerfAcc: Efficient Sampling Accelerates NeRFs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18537-18546\n} \n}" }, { "title": "Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs", @@ -39710,7 +41042,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Warburg_2023_ICCV,\n \n author = {\n Warburg,\n Frederik and Weber,\n Ethan and Tancik,\n Matthew and Holynski,\n Aleksander and Kanazawa,\n Angjoo\n},\n title = {\n Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18120-18130\n} \n}" }, { "title": "NeuRBF: A Neural Fields Representation with Adaptive Radial Basis 
Functions", @@ -39742,7 +41075,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zhang and Li,\n Zhong and Song,\n Liangchen and Chen,\n Lele and Yu,\n Jingyi and Yuan,\n Junsong and Xu,\n Yi\n},\n title = {\n NeuRBF: A Neural Fields Representation with Adaptive Radial Basis Functions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4182-4194\n} \n}" }, { "title": "NeuS2: Fast Learning of Neural Implicit Surfaces for Multi-view Reconstruction", @@ -39774,7 +41108,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;2;0;2;0+2", - "aff_country_unique": "United States;China;Germany" + "aff_country_unique": "United States;China;Germany", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yiming and Han,\n Qin and Habermann,\n Marc and Daniilidis,\n Kostas and Theobalt,\n Christian and Liu,\n Lingjie\n},\n title = {\n NeuS2: Fast Learning of Neural Implicit Surfaces for Multi-view Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3295-3306\n} \n}" }, { "title": "Neural Characteristic Function Learning for Conditional Image Generation", @@ -39806,7 +41141,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Shengxi and Zhang,\n Jialu and Li,\n Yifei and Xu,\n Mai and Deng,\n Xin and Li,\n Li\n},\n title = {\n Neural Characteristic Function Learning for 
Conditional Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7204-7214\n} \n}" }, { "title": "Neural Collage Transfer: Artistic Reconstruction via Material Manipulation", @@ -39838,7 +41174,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Ganghun and Kim,\n Minji and Lee,\n Yunsu and Lee,\n Minsu and Zhang,\n Byoung-Tak\n},\n title = {\n Neural Collage Transfer: Artistic Reconstruction via Material Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2394-2405\n} \n}" }, { "title": "Neural Deformable Models for 3D Bi-Ventricular Heart Shape Reconstruction and Modeling from 2D Sparse Cardiac Magnetic Resonance Imaging", @@ -39863,14 +41200,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Neural_Deformable_Models_for_3D_Bi-Ventricular_Heart_Shape_Reconstruction_and_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;0", - "aff_unique_norm": "Rutgers University;NVIDIA;New York University", - "aff_unique_dep": ";NVIDIA Corporation;School of Medicine", + "aff_unique_norm": "Rutgers University;NVIDIA Corporation;New York University", + "aff_unique_dep": ";;School of Medicine", "aff_unique_url": "https://www.rutgers.edu;https://www.nvidia.com;https://nyu.edu", "aff_unique_abbr": "Rutgers;NVIDIA;NYU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Meng and Yang,\n Dong and 
Kanski,\n Mikael and Axel,\n Leon and Metaxas,\n Dimitris\n},\n title = {\n Neural Deformable Models for 3D Bi-Ventricular Heart Shape Reconstruction and Modeling from 2D Sparse Cardiac Magnetic Resonance Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14247-14256\n} \n}" }, { "title": "Neural Fields for Structured Lighting", @@ -39893,7 +41231,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shandilya_Neural_Fields_for_Structured_Lighting_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shandilya_Neural_Fields_for_Structured_Lighting_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Shandilya_2023_ICCV,\n \n author = {\n Shandilya,\n Aarrushi and Attal,\n Benjamin and Richardt,\n Christian and Tompkin,\n James and O'toole,\n Matthew\n},\n title = {\n Neural Fields for Structured Lighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3512-3522\n} \n}" }, { "title": "Neural Haircut: Prior-Guided Strand-Based Hair Reconstruction", @@ -39905,7 +41244,7 @@ "author": "Vanessa Sklyarova; Jenya Chelishev; Andreea Dogaru; Igor Medvedev; Victor Lempitsky; Egor Zakharov", "abstract": "Generating realistic human 3D reconstructions using image or video data is essential for various communication and entertainment applications. While existing methods achieved impressive results for body and facial regions, realistic hair modeling still remains challenging due to its high mechanical complexity. This work proposes an approach capable of accurate hair geometry reconstruction at a strand level from a monocular video or multi-view images captured in uncontrolled lighting conditions. 
Our method has two stages, with the first stage performing joint reconstruction of coarse hair and bust shapes and hair orientation using implicit volumetric representations. The second stage then estimates a strand-level hair reconstruction by reconciling in a single optimization process the coarse volumetric constraints with hair strand and hairstyle priors learned from the synthetic data. To further increase the reconstruction fidelity, we incorporate image-based losses into the fitting process using a new differentiable renderer. The combined system, named Neural Haircut, achieves high realism and personalization of the reconstructed hairstyles.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sklyarova_Neural_Haircut_Prior-Guided_Strand-Based_Hair_Reconstruction_ICCV_2023_paper.pdf", - "aff": "Samsung AI Center; Rockstar Games; FAU Erlangen-N\u00fcrnberg; Cinemersive Labs; Samsung AI Center; Samsung AI Center", + "aff": "Samsung AI Center; Rockstar Games; FAU Erlangen-Nürnberg; Cinemersive Labs; Samsung AI Center; Samsung AI Center", "project": "https://samsunglabs.github.io/NeuralHaircut/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Sklyarova_Neural_Haircut_Prior-Guided_ICCV_2023_supplemental.pdf", @@ -39918,14 +41257,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sklyarova_Neural_Haircut_Prior-Guided_Strand-Based_Hair_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0;0", - "aff_unique_norm": "Samsung;Rockstar Games;Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg;Cinemersive Labs", + "aff_unique_norm": "Samsung AI Center;Rockstar Games;Friedrich-Alexander-Universität Erlangen-Nürnberg;Cinemersive Labs", "aff_unique_dep": "AI Center;;;", "aff_unique_url": "https://www.samsung.com/global/careers/ai-center/;https://www.rockstargames.com;https://www.fau.de;", "aff_unique_abbr": "Samsung AI;Rockstar Games;FAU;", "aff_campus_unique_index": "1", - 
"aff_campus_unique": ";Erlangen-N\u00fcrnberg", + "aff_campus_unique": ";Erlangen-Nürnberg", "aff_country_unique_index": "0;1;2;0;0", - "aff_country_unique": "South Korea;United States;Germany;" + "aff_country_unique": "South Korea;United States;Germany;", + "bibtex": "@InProceedings{Sklyarova_2023_ICCV,\n \n author = {\n Sklyarova,\n Vanessa and Chelishev,\n Jenya and Dogaru,\n Andreea and Medvedev,\n Igor and Lempitsky,\n Victor and Zakharov,\n Egor\n},\n title = {\n Neural Haircut: Prior-Guided Strand-Based Hair Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19762-19773\n} \n}" }, { "title": "Neural Implicit Surface Evolution", @@ -39948,7 +41288,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Novello_Neural_Implicit_Surface_Evolution_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Novello_Neural_Implicit_Surface_Evolution_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Novello_2023_ICCV,\n \n author = {\n Novello,\n Tiago and da Silva,\n Vinicius and Schardong,\n Guilherme and Schirmer,\n Luiz and Lopes,\n Helio and Velho,\n Luiz\n},\n title = {\n Neural Implicit Surface Evolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14279-14289\n} \n}" }, { "title": "Neural Interactive Keypoint Detection", @@ -39973,14 +41314,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Neural_Interactive_Keypoint_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0", - "aff_unique_norm": "International Digital Economy Academy;Chinese University of Hong Kong, Shenzhen", + "aff_unique_norm": "International Digital Economy Academy;The Chinese University of Hong Kong, 
Shenzhen", "aff_unique_dep": ";School of Data Science", "aff_unique_url": ";https://www.szhk.edu.cn", "aff_unique_abbr": ";CUHK(SZ)", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Jie and Zeng,\n Ailing and Li,\n Feng and Liu,\n Shilong and Zhang,\n Ruimao and Zhang,\n Lei\n},\n title = {\n Neural Interactive Keypoint Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15122-15132\n} \n}" }, { "title": "Neural LiDAR Fields for Novel View Synthesis", @@ -40003,7 +41345,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Neural_LiDAR_Fields_for_Novel_View_Synthesis_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Neural_LiDAR_Fields_for_Novel_View_Synthesis_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Shengyu and Gojcic,\n Zan and Wang,\n Zian and Williams,\n Francis and Kasten,\n Yoni and Fidler,\n Sanja and Schindler,\n Konrad and Litany,\n Or\n},\n title = {\n Neural LiDAR Fields for Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18236-18246\n} \n}" }, { "title": "Neural Microfacet Fields for Inverse Rendering", @@ -40035,7 +41378,8 @@ "aff_campus_unique_index": "0;1;0;2", "aff_campus_unique": "San Diego;Mountain View;Berkeley", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Mai_2023_ICCV,\n \n author = {\n Mai,\n Alexander and Verbin,\n Dor and 
Kuester,\n Falko and Fridovich-Keil,\n Sara\n},\n title = {\n Neural Microfacet Fields for Inverse Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 408-418\n} \n}" }, { "title": "Neural Radiance Field with LiDAR maps", @@ -40058,7 +41402,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chang_Neural_Radiance_Field_with_LiDAR_maps_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chang_Neural_Radiance_Field_with_LiDAR_maps_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chang_2023_ICCV,\n \n author = {\n Chang,\n MingFang and Sharma,\n Akash and Kaess,\n Michael and Lucey,\n Simon\n},\n title = {\n Neural Radiance Field with LiDAR maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17914-17923\n} \n}" }, { "title": "Neural Reconstruction of Relightable Human Model from Monocular Video", @@ -40090,7 +41435,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Wenzhang and Che,\n Yunlong and Huang,\n Han and Guo,\n Yandong\n},\n title = {\n Neural Reconstruction of Relightable Human Model from Monocular Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 397-407\n} \n}" }, { "title": "Neural Video Depth Stabilizer", @@ -40122,7 +41468,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;2;2", - "aff_country_unique": "China;United States;Singapore" + 
"aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yiran and Shi,\n Min and Li,\n Jiaqi and Huang,\n Zihao and Cao,\n Zhiguo and Zhang,\n Jianming and Xian,\n Ke and Lin,\n Guosheng\n},\n title = {\n Neural Video Depth Stabilizer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9466-9476\n} \n}" }, { "title": "Neural-PBIR Reconstruction of Shape, Material, and Illumination", @@ -40154,7 +41501,8 @@ "aff_campus_unique_index": "1;2;2;3;2", "aff_campus_unique": ";Taiwan;Irvine;College Park", "aff_country_unique_index": "0+1;0+0;0;0;0;0;0+0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Cheng and Cai,\n Guangyan and Li,\n Zhengqin and Yan,\n Kai and Zhang,\n Cheng and Marshall,\n Carl and Huang,\n Jia-Bin and Zhao,\n Shuang and Dong,\n Zhao\n},\n title = {\n Neural-PBIR Reconstruction of Shape,\n Material,\n and Illumination\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18046-18056\n} \n}" }, { "title": "No Fear of Classifier Biases: Neural Collapse Inspired Federated Learning with Synthetic and Fixed Classifier", @@ -40186,7 +41534,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zexi and Shang,\n Xinyi and He,\n Rui and Lin,\n Tao and Wu,\n Chao\n},\n title = {\n No Fear of Classifier Biases: Neural Collapse Inspired Federated Learning with Synthetic and Fixed Classifier\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5319-5329\n} \n}" }, { "title": "Noise-Aware Learning from Web-Crawled Image-Text Data for Image Captioning", @@ -40218,7 +41567,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kang_2023_ICCV,\n \n author = {\n Kang,\n Wooyoung and Mun,\n Jonghwan and Lee,\n Sungjun and Roh,\n Byungseok\n},\n title = {\n Noise-Aware Learning from Web-Crawled Image-Text Data for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2942-2952\n} \n}" }, { "id": "b612f21880", @@ -40238,15 +41588,16 @@ "aff_domain": "connect.ust.hk;connect.ust.hk;hkust-gz.edu.cn;ust.hk", "email": "connect.ust.hk;connect.ust.hk;hkust-gz.edu.cn;ust.hk", "author_num": 4, - "aff_unique_index": "0+0;0+0;0+0;0+0", - "aff_unique_norm": "Hong Kong University of Science and Technology", - "aff_unique_dep": "", - "aff_unique_url": "https://www.ust.hk", - "aff_unique_abbr": "HKUST", + "aff_unique_index": "0+1;0+1;0+1;0+1", + "aff_unique_norm": "Hong Kong University of Science and Technology;The Hong Kong University of Science and Technology", + "aff_unique_dep": ";", + "aff_unique_url": "https://www.ust.hk;https://www.ust.hk", + "aff_unique_abbr": "HKUST;HKUST", "aff_campus_unique_index": "0+1;0+1;0+1;0+1", "aff_campus_unique": "Hong Kong;Guangzhou", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jiachuan and Di,\n Shimin and Chen,\n Lei and Ng,\n Charles Wang Wai\n},\n title = {\n Noise2Info: Noisy Image to Information of Noise for Self-Supervised Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16034-16043\n} \n}" }, { "title": "Non-Coaxial Event-Guided Motion Deblurring with Spatial Alignment", @@ -40278,7 +41629,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Cho_2023_ICCV,\n \n author = {\n Cho,\n Hoonhee and Jeong,\n Yuhwan and Kim,\n Taewoo and Yoon,\n Kuk-Jin\n},\n title = {\n Non-Coaxial Event-Guided Motion Deblurring with Spatial Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12492-12503\n} \n}" }, { "title": "Non-Semantics Suppressed Mask Learning for Unsupervised Video Semantic Compression", @@ -40286,6 +41638,7 @@ "status": "Poster", "track": "main", "pid": "6934", + "author_site": "Yuan Tian, Guo Lu, Guangtao Zhai, Zhiyong Gao", "author": "Yuan Tian, Guo Lu, Guangtao Zhai, Zhiyong Gao", "abstract": "Most video compression methods aim to improve the decoded video visual quality, instead of particularly guaranteeing the semantic-completeness, which deteriorates downstream video analysis tasks, e.g., action recognition. In this paper, we focus on a novel unsupervised video semantic compression problem, where video semantics is compressed in a downstream task-agnostic manner. To tackle this problem, we first propose a Semantic-Mining-then-Compensation (SMC) framework to enhance the plain video codec with powerful semantic coding capability. Then, we optimize the framework with only unlabeled video data, by masking out a proportion of the compressed video and reconstructing the masked regions of the original video, which is inspired by recent masked image modeling (MIM) methods. 
Although the MIM scheme learns generalizable semantic features, its inner generative learning paradigm may also facilitate the coding framework memorizing non-semantic information with extra bitcosts. To suppress this deficiency, we explicitly decrease the non-semantic information entropy of the decoded video features, by formulating it as a parametrized Gaussian Mixture Model conditioned on the mined video semantics. Comprehensive experimental results demonstrate the proposed approach shows remarkable superiority over previous traditional, learnable and perceptual-quality-oriented video codecs, on three video analysis tasks and seven datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Tian_Non-Semantics_Suppressed_Mask_Learning_for_Unsupervised_Video_Semantic_Compression_ICCV_2023_paper.pdf", @@ -40297,7 +41650,8 @@ "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16825658398318656954&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tian_Non-Semantics_Suppressed_Mask_Learning_for_Unsupervised_Video_Semantic_Compression_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tian_Non-Semantics_Suppressed_Mask_Learning_for_Unsupervised_Video_Semantic_Compression_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Tian_2023_ICCV,\n \n author = {\n Tian,\n Yuan and Lu,\n Guo and Zhai,\n Guangtao and Gao,\n Zhiyong\n},\n title = {\n Non-Semantics Suppressed Mask Learning for Unsupervised Video Semantic Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13610-13622\n} \n}" }, { "title": "Nonrigid Object Contact Estimation With Regional Unwrapping Transformer", @@ -40329,7 +41683,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Wei and Zhao,\n Zimeng and Li,\n Shiying and Zuo,\n Binghui and Wang,\n Yangang\n},\n title = {\n Nonrigid Object Contact Estimation With Regional Unwrapping Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9342-9351\n} \n}" }, { "title": "Normalizing Flows for Human Pose Anomaly Detection", @@ -40361,7 +41716,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Hirschorn_2023_ICCV,\n \n author = {\n Hirschorn,\n Or and Avidan,\n Shai\n},\n title = {\n Normalizing Flows for Human Pose Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13545-13554\n} \n}" }, { "title": "Not All Features Matter: Enhancing Few-shot CLIP with Adaptive Prior Refinement", @@ -40386,14 +41742,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_Not_All_Features_Matter_Enhancing_Few-shot_CLIP_with_Adaptive_Prior_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;0;1;2;2;2", - "aff_unique_norm": "City University of Hong Kong;Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory", + "aff_unique_norm": "City University of Hong Kong;The Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.cuhk.edu.hk;http://www.shailab.org/", "aff_unique_abbr": "CityU;CUHK;Shanghai AI Lab", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0+0;0;0;0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Xiangyang and Zhang,\n Renrui and He,\n Bowei and Zhou,\n Aojun and Wang,\n Dong and Zhao,\n Bin and Gao,\n Peng\n},\n title = {\n Not All Features Matter: Enhancing Few-shot CLIP with Adaptive Prior Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2605-2615\n} \n}" }, { "title": "Not All Steps are Created Equal: Selective Diffusion Distillation for Image Manipulation", @@ -40417,15 +41774,16 @@ "email": "; ; ; ", "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Not_All_Steps_are_Created_Equal_Selective_Diffusion_Distillation_for_ICCV_2023_paper.html", - "aff_unique_index": "0;0;1;0+0", - "aff_unique_norm": "Hong Kong University of Science and Technology;SmartMore", - "aff_unique_dep": ";", - "aff_unique_url": "https://www.ust.hk;", - "aff_unique_abbr": "HKUST;", + "aff_unique_index": "0;0;1;2+2", + "aff_unique_norm": "The Hong Kong University of Science and Technology;SmartMore;Hong Kong University of Science and Technology", + "aff_unique_dep": ";;", + "aff_unique_url": "https://www.ust.hk;;https://www.ust.hk", + "aff_unique_abbr": "HKUST;;HKUST", "aff_campus_unique_index": "0;0;2+0", "aff_campus_unique": "Guangzhou;;Hong Kong SAR", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Luozhou and Yang,\n Shuai and Liu,\n Shu and Chen,\n Ying-cong\n},\n title = {\n Not All Steps are Created Equal: Selective Diffusion Distillation for Image Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7472-7481\n} \n}" }, { "title": "Not 
Every Side Is Equal: Localization Uncertainty Estimation for Semi-Supervised 3D Object Detection", @@ -40457,7 +41815,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Chuxin and Yang,\n Wenfei and Zhang,\n Tianzhu\n},\n title = {\n Not Every Side Is Equal: Localization Uncertainty Estimation for Semi-Supervised 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3814-3824\n} \n}" }, { "title": "Novel Scenes & Classes: Towards Adaptive Open-set Object Detection", @@ -40482,14 +41841,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Novel_Scenes__Classes_Towards_Adaptive_Open-set_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1+0;2", - "aff_unique_norm": "City University of Hong Kong;University of Oxford;Chinese University of Hong Kong", + "aff_unique_norm": "City University of Hong Kong;University of Oxford;The Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.ox.ac.uk;https://www.cuhk.edu.hk", "aff_unique_abbr": "CityU;Oxford;CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1+0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Wuyang and Guo,\n Xiaoqing and Yuan,\n Yixuan\n},\n title = {\n Novel Scenes \\& Classes: Towards Adaptive Open-set Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15780-15790\n} \n}" }, { "title": 
"Novel-View Synthesis and Pose Estimation for Hand-Object Interaction from Sparse Views", @@ -40515,13 +41875,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qu_Novel-View_Synthesis_and_Pose_Estimation_for_Hand-Object_Interaction_from_Sparse_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;0+1;0+1;0+1;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Zhejiang University;Google", - "aff_unique_dep": "Institute of Software;;State Key Lab of CAD &CG;Google", + "aff_unique_dep": "Institute of Software;;State Key Lab of CAD &CG;", "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;http://www.zju.edu.cn;https://www.google.com", "aff_unique_abbr": "CAS;UCAS;ZJU;Google", "aff_campus_unique_index": ";1;;;;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;1;0+0;0+0;0+0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Qu_2023_ICCV,\n \n author = {\n Qu,\n Wentian and Cui,\n Zhaopeng and Zhang,\n Yinda and Meng,\n Chenyu and Ma,\n Cuixia and Deng,\n Xiaoming and Wang,\n Hongan\n},\n title = {\n Novel-View Synthesis and Pose Estimation for Hand-Object Interaction from Sparse Views\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15100-15111\n} \n}" }, { "title": "OCHID-Fi: Occlusion-Robust Hand Pose Estimation in 3D via RF-Vision", @@ -40553,7 +41914,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2;2;0", - "aff_country_unique": "Singapore;China;Australia" + "aff_country_unique": "Singapore;China;Australia", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Shujie and Zheng,\n Tianyue and Chen,\n Zhe and Hu,\n Jingzhi and Khamis,\n Abdelwahed and Liu,\n Jiajun and Luo,\n Jun\n},\n title = {\n OCHID-Fi: 
Occlusion-Robust Hand Pose Estimation in 3D via RF-Vision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15112-15121\n} \n}" }, { "title": "OFVL-MS: Once for Visual Localization across Multiple Indoor Scenes", @@ -40585,7 +41947,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0+2;0", "aff_campus_unique": "Harbin;;Zhengzhou", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Tao and Dai,\n Kun and Lu,\n Siyi and Wang,\n Ke and Jiang,\n Zhiqiang and Gao,\n Jinghan and Liu,\n Dedong and Xu,\n Jie and Zhao,\n Lijun and Li,\n Ruifeng\n},\n title = {\n OFVL-MS: Once for Visual Localization across Multiple Indoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5516-5526\n} \n}" }, { "title": "OPERA: Omni-Supervised Representation Learning with Hierarchical Supervisions", @@ -40617,7 +41980,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Chengkun and Zheng,\n Wenzhao and Zhu,\n Zheng and Zhou,\n Jie and Lu,\n Jiwen\n},\n title = {\n OPERA: Omni-Supervised Representation Learning with Hierarchical Supervisions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5559-5570\n} \n}" }, { "title": "ORC: Network Group-based Knowledge Distillation using Online Role Change", @@ -40642,14 +42006,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Choi_ORC_Network_Group-based_Knowledge_Distillation_using_Online_Role_Change_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0+2", - "aff_unique_norm": "Ajou University;Hyundai Motor Company;NAVER Corporation", + "aff_unique_norm": "Ajou University;Hyundai Motor Company;Naver Corporation", "aff_unique_dep": ";;Naver AI Lab", "aff_unique_url": "https://www.ajou.ac.kr;https://www.hyundai.com;https://www.naver.com", "aff_unique_abbr": "Ajou;HMC;Naver", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Choi_2023_ICCV,\n \n author = {\n Choi,\n Junyong and Cho,\n Hyeon and Cheung,\n Seokhwa and Hwang,\n Wonjun\n},\n title = {\n ORC: Network Group-based Knowledge Distillation using Online Role Change\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17381-17390\n} \n}" }, { "title": "Object as Query: Lifting Any 2D Object Detector to 3D Detection", @@ -40677,11 +42042,12 @@ "aff_unique_norm": "Beihang University;TuSimple", "aff_unique_dep": "Institute of Artificial Intelligence;", "aff_unique_url": "http://www.buaa.edu.cn;https://www.tusimple.com", - "aff_unique_abbr": "BUAA;TuSimple", + "aff_unique_abbr": "Beihang;TuSimple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Zitian and Huang,\n Zehao and Fu,\n Jiahui and Wang,\n Naiyan and Liu,\n Si\n},\n title = {\n Object as Query: Lifting Any 2D Object Detector to 3D Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 3791-3800\n} \n}" }, { "title": "Object-Centric Multiple Object Tracking", @@ -40706,14 +42072,15 @@ "author_num": 16, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Object-Centric_Multiple_Object_Tracking_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;0;0;0;0;0;0;0;0;2;0;0;0", - "aff_unique_norm": "Amazon;Chinese University of Hong Kong;Fudan University", - "aff_unique_dep": "Amazon Web Services;;", + "aff_unique_norm": "Amazon Web Services;The Chinese University of Hong Kong;Fudan University", + "aff_unique_dep": ";;", "aff_unique_url": "https://aws.amazon.com;https://www.cuhk.edu.hk;https://www.fudan.edu.cn", "aff_unique_abbr": "AWS;CUHK;Fudan", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0;1;0;0;0;0;0;0;0;0;1;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Zixu and Wang,\n Jiaze and Horn,\n Max and Ding,\n Yizhuo and He,\n Tong and Bai,\n Zechen and Zietlow,\n Dominik and Simon-Gabriel,\n Carl-Johann and Shuai,\n Bing and Tu,\n Zhuowen and Brox,\n Thomas and Schiele,\n Bernt and Fu,\n Yanwei and Locatello,\n Francesco and Zhang,\n Zheng and Xiao,\n Tianjun\n},\n title = {\n Object-Centric Multiple Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16601-16611\n} \n}" }, { "title": "Object-aware Gaze Target Detection", @@ -40745,7 +42112,8 @@ "aff_campus_unique_index": "0+0;0+1;0;0+0", "aff_campus_unique": "Trento;Pisa", "aff_country_unique_index": "0+0;0+0;0;0+0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Tonini_2023_ICCV,\n \n author = {\n Tonini,\n Francesco and Dall'Asen,\n Nicola and Beyan,\n Cigdem and Ricci,\n Elisa\n},\n 
title = {\n Object-aware Gaze Target Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21860-21869\n} \n}" }, { "title": "ObjectFusion: Multi-modal 3D Object Detection with Object-Centric Fusion", @@ -40777,7 +42145,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;2", - "aff_country_unique": "China;United States;Singapore" + "aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Qi and Pan,\n Yingwei and Yao,\n Ting and Ngo,\n Chong-Wah and Mei,\n Tao\n},\n title = {\n ObjectFusion: Multi-modal 3D Object Detection with Object-Centric Fusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18067-18076\n} \n}" }, { "title": "ObjectSDF++: Improved Object-Compositional Neural Implicit Surfaces", @@ -40809,7 +42178,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "Australia;United Kingdom;Singapore" + "aff_country_unique": "Australia;United Kingdom;Singapore", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Qianyi and Wang,\n Kaisiyuan and Li,\n Kejie and Zheng,\n Jianmin and Cai,\n Jianfei\n},\n title = {\n ObjectSDF++: Improved Object-Compositional Neural Implicit Surfaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21764-21774\n} \n}" }, { "title": "Objects Do Not Disappear: Video Object Detection by Single-Frame Object Location Anticipation", @@ -40841,7 +42211,8 @@ "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Delft;Amsterdam", "aff_country_unique_index": "0;0;0;0;0", - 
"aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xin and Nejadasl,\n Fatemeh Karimi and van Gemert,\n Jan C. and Booij,\n Olaf and Pintea,\n Silvia L.\n},\n title = {\n Objects Do Not Disappear: Video Object Detection by Single-Frame Object Location Anticipation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6950-6961\n} \n}" }, { "title": "OccFormer: Dual-path Transformer for Vision-based 3D Semantic Occupancy Prediction", @@ -40873,7 +42244,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yunpeng and Zhu,\n Zheng and Du,\n Dalong\n},\n title = {\n OccFormer: Dual-path Transformer for Vision-based 3D Semantic Occupancy Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9433-9443\n} \n}" }, { "title": "Occ^2Net: Robust Image Matching Based on 3D Occupancy Estimation for Occluded Regions", @@ -40898,14 +42270,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fan_Occ2Net_Robust_Image_Matching_Based_on_3D_Occupancy_Estimation_for_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Megvii Technology", + "aff_unique_norm": "MEGVII Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.megvii.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Miao and Chen,\n Mingrui and Hu,\n Chen and Zhou,\n 
Shuchang\n},\n title = {\n Occ{\\textasciicircum\n}2Net: Robust Image Matching Based on 3D Occupancy Estimation for Occluded Regions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9652-9662\n} \n}" }, { "title": "OmniLabel: A Challenging Benchmark for Language-Based Object Detection", @@ -40937,7 +42310,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Schulter_2023_ICCV,\n \n author = {\n Schulter,\n Samuel and G,\n Vijay Kumar B and Suh,\n Yumin and Dafnis,\n Konstantinos M. and Zhang,\n Zhixing and Zhao,\n Shiyu and Metaxas,\n Dimitris\n},\n title = {\n OmniLabel: A Challenging Benchmark for Language-Based Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11953-11962\n} \n}" }, { "title": "OmniZoomer: Learning to Move and Zoom in on Sphere at High-Resolution", @@ -40969,7 +42343,8 @@ "aff_campus_unique_index": "0;0;2+0", "aff_campus_unique": "Guangzhou;;Hong Kong SAR", "aff_country_unique_index": "0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Zidong and Ai,\n Hao and Cao,\n Yan-Pei and Shan,\n Ying and Qie,\n Xiaohu and Wang,\n Lin\n},\n title = {\n OmniZoomer: Learning to Move and Zoom in on Sphere at High-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12897-12907\n} \n}" }, { "title": "Omnidirectional Information Gathering for Knowledge Transfer-Based Audio-Visual Navigation", @@ -40994,14 +42369,15 @@ "author_num": 
5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Omnidirectional_Information_Gathering_for_Knowledge_Transfer-Based_Audio-Visual_Navigation_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;1", - "aff_unique_norm": "Beihang University;Zhejiang University;Chinese University of Hong Kong", + "aff_unique_norm": "Beihang University;Zhejiang University;The Chinese University of Hong Kong", "aff_unique_dep": "Institute of Artificial Intelligence;ReLER, CCAI;", "aff_unique_url": "http://www.buaa.edu.cn;http://www.zju.edu.cn;https://www.cuhk.edu.hk", - "aff_unique_abbr": "BUAA;ZJU;CUHK", + "aff_unique_abbr": "Beihang;ZJU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Jinyu and Wang,\n Wenguan and Liu,\n Si and Li,\n Hongsheng and Yang,\n Yi\n},\n title = {\n Omnidirectional Information Gathering for Knowledge Transfer-Based Audio-Visual Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10993-11003\n} \n}" }, { "title": "OmnimatteRF: Robust Omnimatte with 3D Background Modeling", @@ -41024,7 +42400,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_OmnimatteRF_Robust_Omnimatte_with_3D_Background_Modeling_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lin_OmnimatteRF_Robust_Omnimatte_with_3D_Background_Modeling_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Geng and Gao,\n Chen and Huang,\n Jia-Bin and Kim,\n Changil and Wang,\n Yipeng and Zwicker,\n Matthias and Saraf,\n Ayush\n},\n title = {\n OmnimatteRF: Robust Omnimatte with 3D Background Modeling\n},\n booktitle = 
{\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23471-23480\n} \n}" }, { "title": "On the Audio-visual Synchronization for Lip-to-Speech Synthesis", @@ -41049,14 +42426,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Niu_On_the_Audio-visual_Synchronization_for_Lip-to-Speech_Synthesis_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Hong Kong University of Science and Technology", + "aff_unique_norm": "The Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Niu_2023_ICCV,\n \n author = {\n Niu,\n Zhe and Mak,\n Brian\n},\n title = {\n On the Audio-visual Synchronization for Lip-to-Speech Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7843-7852\n} \n}" }, { "title": "On the Effectiveness of Spectral Discriminators for Perceptual Quality Improvement", @@ -41088,7 +42466,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hefei", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Xin and Zhu,\n Yunan and Xu,\n Shunxin and Liu,\n Dong\n},\n title = {\n On the Effectiveness of Spectral Discriminators for Perceptual Quality Improvement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13243-13253\n} \n}" }, { 
"title": "On the Robustness of Normalizing Flows for Inverse Problems in Imaging", @@ -41120,7 +42499,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Hong_2023_ICCV,\n \n author = {\n Hong,\n Seongmin and Park,\n Inbum and Chun,\n Se Young\n},\n title = {\n On the Robustness of Normalizing Flows for Inverse Problems in Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10745-10755\n} \n}" }, { "title": "On the Robustness of Open-World Test-Time Training: Self-Training with Dynamic Prototype Expansion", @@ -41128,6 +42508,7 @@ "status": "Oral", "track": "main", "pid": "4446", + "author_site": "Yushu Li, Xun Xu, Yongyi Su, Kui Jia", "author": "Yushu Li, Xun Xu, Yongyi Su, Kui Jia", "abstract": "Generalizing deep learning models to unknown target domain distribution with low latency has motivated research into test-time training/adaptation (TTT/TTA). Existing approaches often focus on improving test-time training performance under well-curated target domain data. As figured out in this work, many state-of-the-art methods fail to maintain the performance when the target domain is contaminated with strong out-of-distribution (OOD) data, a.k.a. open-world test-time training (OWTTT). The failure is mainly due to the inability to distinguish strong OOD samples from regular weak OOD samples. To improve the robustness of OWTTT we first develop an adaptive strong OOD pruning which improves the efficacy of the self-training TTT method. We further propose a way to dynamically expand the prototypes to represent strong OOD samples for an improved weak/strong OOD data separation. Finally, we regularize self-training with distribution alignment and the combination yields the state-of-the-art performance on 5 OWTTT benchmarks. 
The code is available at https://github.com/Yushu-Li/OWTTT.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_On_the_Robustness_of_Open-World_Test-Time_Training_Self-Training_with_Dynamic_ICCV_2023_paper.pdf", @@ -41139,7 +42520,8 @@ "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1482379448776293966&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_On_the_Robustness_of_Open-World_Test-Time_Training_Self-Training_with_Dynamic_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_On_the_Robustness_of_Open-World_Test-Time_Training_Self-Training_with_Dynamic_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yushu and Xu,\n Xun and Su,\n Yongyi and Jia,\n Kui\n},\n title = {\n On the Robustness of Open-World Test-Time Training: Self-Training with Dynamic Prototype Expansion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11836-11846\n} \n}" }, { "title": "Once Detected, Never Lost: Surpassing Human Performance in Offline LiDAR based 3D Object Detection", @@ -41171,7 +42553,8 @@ "aff_campus_unique_index": ";;1;;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0+0;0+0;0;0;1;1+0;0+0+0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Lue and Yang,\n Yuxue and Mao,\n Yiming and Wang,\n Feng and Chen,\n Yuntao and Wang,\n Naiyan and Zhang,\n Zhaoxiang\n},\n title = {\n Once Detected,\n Never Lost: Surpassing Human Performance in Offline LiDAR based 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
19820-19829\n} \n}" }, { "title": "One-Shot Generative Domain Adaptation", @@ -41196,14 +42579,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_One-Shot_Generative_Domain_Adaptation_ICCV_2023_paper.html", "aff_unique_index": "0+1;2+3;2;1;4;5;6", - "aff_unique_norm": "Shanghai AI Laboratory;Chinese University of Hong Kong;ByteDance;Ant Group;Hong Kong University of Science and Technology;Microsoft;University of California, Los Angeles", - "aff_unique_dep": ";;;;;Microsoft Research Asia;", + "aff_unique_norm": "Shanghai AI Laboratory;The Chinese University of Hong Kong;ByteDance;Ant Group;Hong Kong University of Science and Technology;Microsoft Research Asia;University of California, Los Angeles", + "aff_unique_dep": ";;;;;;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.cuhk.edu.hk;https://www.bytedance.com;https://www.antgroup.com;https://www.ust.hk;https://www.msra.cn;https://www.ucla.edu", "aff_unique_abbr": "SAIL;CUHK;ByteDance;Ant Group;HKUST;MSRA;UCLA", "aff_campus_unique_index": "1;;1;1;2", "aff_campus_unique": ";Hong Kong SAR;Los Angeles", "aff_country_unique_index": "0+0;0+0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Ceyuan and Shen,\n Yujun and Zhang,\n Zhiyi and Xu,\n Yinghao and Zhu,\n Jiapeng and Wu,\n Zhirong and Zhou,\n Bolei\n},\n title = {\n One-Shot Generative Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7733-7742\n} \n}" }, { "title": "One-Shot Recognition of Any Material Anywhere Using Contrastive Learning with Physics-Based Rendering", @@ -41235,7 +42619,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;1+1;1;1;1+1", - "aff_country_unique": "Germany;Canada" + "aff_country_unique": 
"Germany;Canada", + "bibtex": "@InProceedings{Drehwald_2023_ICCV,\n \n author = {\n Drehwald,\n Manuel S. and Eppel,\n Sagi and Li,\n Jolina and Hao,\n Han and Aspuru-Guzik,\n Alan\n},\n title = {\n One-Shot Recognition of Any Material Anywhere Using Contrastive Learning with Physics-Based Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23524-23533\n} \n}" }, { "title": "One-bit Flip is All You Need: When Bit-flip Attack Meets Model Training", @@ -41267,7 +42652,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0+0;1;0+0;0+0;0+0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Jianshuo and Qiu,\n Han and Li,\n Yiming and Zhang,\n Tianwei and Li,\n Yuanjie and Lai,\n Zeqi and Zhang,\n Chao and Xia,\n Shu-Tao\n},\n title = {\n One-bit Flip is All You Need: When Bit-flip Attack Meets Model Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4688-4698\n} \n}" }, { "title": "One-shot Implicit Animatable Avatars with Model-based Priors", @@ -41279,7 +42665,7 @@ "author": "Yangyi Huang; Hongwei Yi; Weiyang Liu; Haofan Wang; Boxi Wu; Wenxiao Wang; Binbin Lin; Debing Zhang; Deng Cai", "abstract": "Existing neural rendering methods for creating human avatars typically either require dense input signals such as video or multi-view images, or leverage a learned prior from large-scale specific 3D human datasets such that reconstruction can be performed with sparse-view inputs. Most of these methods fail to achieve realistic reconstruction when only a single image is available. 
To enable the data-efficient creation of realistic animatable 3D humans, we propose ELICIT, a novel method for learning human-specific neural radiance fields from a single image. Inspired by the fact that humans can effortlessly estimate the body geometry and imagine full-body clothing from a single image, we leverage two priors in ELICIT: 3D geometry prior and visual semantic prior. Specifically, ELICIT utilizes the 3D body shape geometry prior from a skinned vertex-based template model (i.e., SMPL) and implements the visual clothing semantic prior with the CLIP-based pretrained models. Both priors are used to jointly guide the optimization for creating plausible content in the invisible areas. Taking advantage of the CLIP models, ELICIT can use text descriptions to generate text-conditioned unseen regions. In order to further improve visual details, we propose a segmentation-based sampling strategy that locally refines different parts of the avatar. Comprehensive evaluations on multiple popular benchmarks, including ZJU-MoCAP, Human3.6M, and DeepFashion, show that ELICIT has outperformed strong baseline methods of avatar creation when only a single image is available. 
The code is public for research purposes at https://huangyangyi.github.io/ELICIT/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Huang_One-shot_Implicit_Animatable_Avatars_with_Model-based_Priors_ICCV_2023_paper.pdf", - "aff": "State Key Lab of CAD & CG, Zhejiang University; Max Planck Institute for Intelligent Systems, T\u00fcbingen; University of Cambridge; Xiaohongshu Inc.; School of Software Technology, Zhejiang University; Fullong Inc.; School of Software Technology, Zhejiang University; Xiaohongshu Inc.; State Key Lab of CAD & CG, Zhejiang University", + "aff": "State Key Lab of CAD & CG, Zhejiang University; Max Planck Institute for Intelligent Systems, Tübingen; University of Cambridge; Xiaohongshu Inc.; School of Software Technology, Zhejiang University; Fullong Inc.; School of Software Technology, Zhejiang University; Xiaohongshu Inc.; State Key Lab of CAD & CG, Zhejiang University", "project": "", "github": "https://huangyangyi.github.io/ELICIT", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Huang_One-shot_Implicit_Animatable_ICCV_2023_supplemental.pdf", @@ -41297,9 +42683,10 @@ "aff_unique_url": "http://www.zju.edu.cn;https://www.mpi-is.mpg.de;https://www.cam.ac.uk;https://www.xiaohongshu.com;", "aff_unique_abbr": "ZJU;MPI-IS;Cambridge;Xiaohongshu;", "aff_campus_unique_index": "1;2", - "aff_campus_unique": ";T\u00fcbingen;Cambridge", + "aff_campus_unique": ";Tübingen;Cambridge", "aff_country_unique_index": "0;1;2;0;0;3;0;0;0", - "aff_country_unique": "China;Germany;United Kingdom;United States" + "aff_country_unique": "China;Germany;United Kingdom;United States", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Yangyi and Yi,\n Hongwei and Liu,\n Weiyang and Wang,\n Haofan and Wu,\n Boxi and Wang,\n Wenxiao and Lin,\n Binbin and Zhang,\n Debing and Cai,\n Deng\n},\n title = {\n One-shot Implicit Animatable Avatars with Model-based Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8974-8985\n} \n}" }, { "title": "Online Class Incremental Learning on Stochastic Blurry Task Boundary via Mask and Visual Prompt Tuning", @@ -41331,7 +42718,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Moon_2023_ICCV,\n \n author = {\n Moon,\n Jun-Yeong and Park,\n Keon-Hee and Kim,\n Jung Uk and Park,\n Gyeong-Moon\n},\n title = {\n Online Class Incremental Learning on Stochastic Blurry Task Boundary via Mask and Visual Prompt Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11731-11741\n} \n}" }, { "title": "Online Clustered Codebook", @@ -41363,7 +42751,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Chuanxia and Vedaldi,\n Andrea\n},\n title = {\n Online Clustered Codebook\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22798-22807\n} \n}" }, { "title": "Online Continual Learning on Hierarchical Label Expansion", @@ -41395,7 +42784,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;", - "aff_country_unique": ";South Korea" + "aff_country_unique": ";South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Byung Hyun and Jung,\n Okchul and Choi,\n Jonghyun and Chun,\n Se Young\n},\n title = {\n Online Continual Learning on Hierarchical Label Expansion\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11761-11770\n} \n}" }, { "title": "Online Prototype Learning for Online Continual Learning", @@ -41420,14 +42810,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Online_Prototype_Learning_for_Online_Continual_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0+0+1", - "aff_unique_norm": "Fudan University;Shanghai Center for Brain Science and Brain-Inspired Technology", + "aff_unique_norm": "Fudan University;Shanghai Center for Brain Science and Brain-inspired Technology", "aff_unique_dep": "Institute of Science and Technology for Brain-inspired Intelligence;", "aff_unique_url": "https://www.fudan.edu.cn;", "aff_unique_abbr": "Fudan;", "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Yujie and Ye,\n Jiaxin and Huang,\n Zhizhong and Zhang,\n Junping and Shan,\n Hongming\n},\n title = {\n Online Prototype Learning for Online Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18764-18774\n} \n}" }, { "title": "OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation", @@ -41452,14 +42843,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_OnlineRefer_A_Simple_Online_Baseline_for_Referring_Video_Object_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3+4;4", - "aff_unique_norm": "Beijing Institute of Technology;Megvii Technology;Shanghai Jiao Tong University;Beijing Academy of Artificial Intelligence;University of Macau", + "aff_unique_norm": "Beijing Institute of Technology;MEGVII 
Technology;Shanghai Jiao Tong University;Beijing Academy of Artificial Intelligence;University of Macau", "aff_unique_dep": ";;;;Department of Computer and Information Science", "aff_unique_url": "http://www.bit.edu.cn/;https://www.megvii.com;https://www.sjtu.edu.cn;https://www.baaic.cn;https://www.um.edu.mo", "aff_unique_abbr": "BIT;;SJTU;BAAI;UM", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Macau SAR", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Dongming and Wang,\n Tiancai and Zhang,\n Yuang and Zhang,\n Xiangyu and Shen,\n Jianbing\n},\n title = {\n OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2761-2770\n} \n}" }, { "title": "Open Set Video HOI detection from Action-Centric Chain-of-Look Prompting", @@ -41484,14 +42876,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xi_Open_Set_Video_HOI_detection_from_Action-Centric_Chain-of-Look_Prompting_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - "aff_unique_norm": "State University of New York at Buffalo;Amazon", - "aff_unique_dep": ";Amazon.com, Inc.", + "aff_unique_norm": "State University of New York at Buffalo;Amazon.com, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.buffalo.edu;https://www.amazon.com", "aff_unique_abbr": "SUNY Buffalo;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Buffalo;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xi_2023_ICCV,\n \n author = {\n Xi,\n Nan and Meng,\n Jingjing and Yuan,\n Junsong\n},\n title = {\n Open Set Video HOI detection from Action-Centric Chain-of-Look 
Prompting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3079-3089\n} \n}" }, { "title": "Open-Vocabulary Object Detection With an Open Corpus", @@ -41523,7 +42916,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jiong and Zhang,\n Huiming and Hong,\n Haiwen and Jin,\n Xuan and He,\n Yuan and Xue,\n Hui and Zhao,\n Zhou\n},\n title = {\n Open-Vocabulary Object Detection With an Open Corpus\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6759-6769\n} \n}" }, { "title": "Open-Vocabulary Semantic Segmentation with Decoupled One-Pass Network", @@ -41548,14 +42942,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_Open-Vocabulary_Semantic_Segmentation_with_Decoupled_One-Pass_Network_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;1;0+1", - "aff_unique_norm": "Meituan Inc.;University of Hong Kong", + "aff_unique_norm": "Meituan Inc.;The University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.meituan.com;https://www.hku.hk", "aff_unique_abbr": "Meituan;HKU", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Cong and Zhong,\n Yujie and Li,\n Dengjie and Han,\n Kai and Ma,\n Lin\n},\n title = {\n Open-Vocabulary Semantic Segmentation with Decoupled One-Pass Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 1086-1096\n} \n}" }, { "title": "Open-domain Visual Entity Recognition: Towards Recognizing Millions of Wikipedia Entities", @@ -41578,7 +42973,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hu_Open-domain_Visual_Entity_Recognition_Towards_Recognizing_Millions_of_Wikipedia_Entities_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Hexiang and Luan,\n Yi and Chen,\n Yang and Khandelwal,\n Urvashi and Joshi,\n Mandar and Lee,\n Kenton and Toutanova,\n Kristina and Chang,\n Ming-Wei\n},\n title = {\n Open-domain Visual Entity Recognition: Towards Recognizing Millions of Wikipedia Entities\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12065-12075\n} \n}" }, { "title": "Open-vocabulary Object Segmentation with Diffusion Models", @@ -41610,7 +43006,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ziyi and Zhou,\n Qinye and Zhang,\n Xiaoyun and Zhang,\n Ya and Wang,\n Yanfeng and Xie,\n Weidi\n},\n title = {\n Open-vocabulary Object Segmentation with Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7667-7676\n} \n}" }, { "title": "Open-vocabulary Panoptic Segmentation with Embedding Modulation", @@ -41635,14 +43032,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Open-vocabulary_Panoptic_Segmentation_with_Embedding_Modulation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;0+1", - "aff_unique_norm": "University of Hong Kong;Massachusetts Institute of Technology;Meta", + "aff_unique_norm": "The University of Hong Kong;Massachusetts Institute of Technology;Meta Platforms, Inc.", "aff_unique_dep": ";;Meta AI", "aff_unique_url": "https://www.hku.hk;https://web.mit.edu;https://meta.com", "aff_unique_abbr": "HKU;MIT;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;1;0+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xi and Li,\n Shuang and Lim,\n Ser-Nam and Torralba,\n Antonio and Zhao,\n Hengshuang\n},\n title = {\n Open-vocabulary Panoptic Segmentation with Embedding Modulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1141-1150\n} \n}" }, { "title": "Open-vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models", @@ -41674,7 +43072,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ko_2023_ICCV,\n \n author = {\n Ko,\n Dohwan and Lee,\n Ji Soo and Choi,\n Miso and Chu,\n Jaewon and Park,\n Jihwan and Kim,\n Hyunwoo J.\n},\n title = {\n Open-vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3101-3112\n} 
\n}" }, { "title": "OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception", @@ -41706,7 +43105,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xiaofeng and Zhu,\n Zheng and Xu,\n Wenbo and Zhang,\n Yunpeng and Wei,\n Yi and Chi,\n Xu and Ye,\n Yun and Du,\n Dalong and Lu,\n Jiwen and Wang,\n Xingang\n},\n title = {\n OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17850-17859\n} \n}" }, { "title": "Optimizing the Placement of Roadside LiDARs for Autonomous Driving", @@ -41738,7 +43138,8 @@ "aff_campus_unique_index": ";1;1;;", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+0+1;0;0;2;2;0+1;1;0+1", - "aff_country_unique": "China;Singapore;United States" + "aff_country_unique": "China;Singapore;United States", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Wentao and Xiang,\n Hao and Cai,\n Xinyu and Xu,\n Runsheng and Ma,\n Jiaqi and Li,\n Yikang and Lee,\n Gim Hee and Liu,\n Si\n},\n title = {\n Optimizing the Placement of Roadside LiDARs for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18381-18390\n} \n}" }, { "title": "Ord2Seq: Regarding Ordinal Regression as Label Sequence Prediction", @@ -41770,7 +43171,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n 
Wang,\n Jinhong and Cheng,\n Yi and Chen,\n Jintai and Chen,\n TingTing and Chen,\n Danny and Wu,\n Jian\n},\n title = {\n Ord2Seq: Regarding Ordinal Regression as Label Sequence Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5865-5875\n} \n}" }, { "title": "Order-Prompted Tag Sequence Generation for Video Tagging", @@ -41802,7 +43204,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0;0+0+0;0+0+0;0;0+0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Zongyang and Zhang,\n Ziqi and Chen,\n Yuxin and Qi,\n Zhongang and Luo,\n Yingmin and Li,\n Zekun and Yuan,\n Chunfeng and Li,\n Bing and Qie,\n Xiaohu and Shan,\n Ying and Hu,\n Weiming\n},\n title = {\n Order-Prompted Tag Sequence Generation for Video Tagging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15681-15690\n} \n}" }, { "title": "Order-preserving Consistency Regularization for Domain Adaptation and Generalization", @@ -41834,7 +43237,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1+0;0;1", - "aff_country_unique": "China;Netherlands" + "aff_country_unique": "China;Netherlands", + "bibtex": "@InProceedings{Jing_2023_ICCV,\n \n author = {\n Jing,\n Mengmeng and Zhen,\n Xiantong and Li,\n Jingjing and Snoek,\n Cees G. 
M.\n},\n title = {\n Order-preserving Consistency Regularization for Domain Adaptation and Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18916-18927\n} \n}" }, { "title": "Ordered Atomic Activity for Fine-grained Interactive Traffic Scenario Understanding", @@ -41866,7 +43270,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Agarwal_2023_ICCV,\n \n author = {\n Agarwal,\n Nakul and Chen,\n Yi-Ting\n},\n title = {\n Ordered Atomic Activity for Fine-grained Interactive Traffic Scenario Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8624-8636\n} \n}" }, { "title": "Ordinal Label Distribution Learning", @@ -41898,7 +43303,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wen_2023_ICCV,\n \n author = {\n Wen,\n Changsong and Zhang,\n Xin and Yao,\n Xingxu and Yang,\n Jufeng\n},\n title = {\n Ordinal Label Distribution Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23481-23491\n} \n}" }, { "title": "OrthoPlanes: A Novel Representation for Better 3D-Awareness of GANs", @@ -41930,7 +43336,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+1;0;0;0", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Honglin and Yang,\n 
Zhuoqian and Li,\n Shikai and Dai,\n Bo and Wu,\n Wayne\n},\n title = {\n OrthoPlanes: A Novel Representation for Better 3D-Awareness of GANs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22996-23007\n} \n}" }, { "title": "Out-of-Distribution Detection for Monocular Depth Estimation", @@ -41942,7 +43349,7 @@ "author": "Julia Hornauer; Adrian Holzbock; Vasileios Belagiannis", "abstract": "In monocular depth estimation, uncertainty estimation approaches mainly target the data uncertainty introduced by image noise. In contrast to prior work, we address the uncertainty due to lack of knowledge, which is relevant for the detection of data not represented by the training distribution, the so-called out-of-distribution (OOD) data. Motivated by anomaly detection, we propose to detect OOD images from an encoder-decoder depth estimation model based on the reconstruction error. Given the features extracted with the fixed depth encoder, we train an image decoder for image reconstruction using only in-distribution data. Consequently, OOD images result in a high reconstruction error, which we use to distinguish between in- and out-of-distribution samples. We built our experiments on the standard NYU Depth V2 and KITTI benchmarks as in-distribution data. 
Our post hoc method performs astonishingly well on different models and outperforms existing uncertainty estimation approaches without modifying the trained encoder-decoder depth estimation model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hornauer_Out-of-Distribution_Detection_for_Monocular_Depth_Estimation_ICCV_2023_paper.pdf", - "aff": "Ulm University, Germany; Ulm University, Germany; Friedrich-Alexander-Universit \u00a8at Erlangen-N \u00a8urnberg, Germany", + "aff": "Ulm University, Germany; Ulm University, Germany; Friedrich-Alexander-Universität Erlangen-Nürnberg, Germany", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Hornauer_Out-of-Distribution_Detection_for_ICCV_2023_supplemental.pdf", @@ -41955,14 +43362,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hornauer_Out-of-Distribution_Detection_for_Monocular_Depth_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;0;1", - "aff_unique_norm": "Ulm University;Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg", + "aff_unique_norm": "Ulm University;Friedrich-Alexander-Universität Erlangen-Nürnberg", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-ulm.de;https://www fau.de", "aff_unique_abbr": "U Ulm;FAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Hornauer_2023_ICCV,\n \n author = {\n Hornauer,\n Julia and Holzbock,\n Adrian and Belagiannis,\n Vasileios\n},\n title = {\n Out-of-Distribution Detection for Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1911-1921\n} \n}" }, { "title": "Out-of-Domain GAN Inversion via Invertibility Decomposition for Photo-Realistic Human Face Manipulation", @@
-41994,7 +43402,8 @@ "aff_campus_unique_index": "0+1+0;;0+1+0", "aff_campus_unique": "Guangzhou;Hong Kong SAR;", "aff_country_unique_index": "0+0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Xin and XU,\n Xiaogang and Chen,\n Yingcong\n},\n title = {\n Out-of-Domain GAN Inversion via Invertibility Decomposition for Photo-Realistic Human Face Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7492-7501\n} \n}" }, { "title": "Overcoming Forgetting Catastrophe in Quantization-Aware Training", @@ -42026,7 +43435,8 @@ "aff_campus_unique_index": "0+0+0;0+0;0+0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0+0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ting-An and Yang,\n De-Nian and Chen,\n Ming-Syan\n},\n title = {\n Overcoming Forgetting Catastrophe in Quantization-Aware Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17358-17367\n} \n}" }, { "title": "Overwriting Pretrained Bias with Finetuning Data", @@ -42058,7 +43468,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Angelina and Russakovsky,\n Olga\n},\n title = {\n Overwriting Pretrained Bias with Finetuning Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3957-3968\n} \n}" }, { "title": "OxfordTVG-HIC: Can Machine Make Humorous 
Captions from Images?", @@ -42090,7 +43501,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United Kingdom;Saudi Arabia" + "aff_country_unique": "United Kingdom;Saudi Arabia", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Runjia and Sun,\n Shuyang and Elhoseiny,\n Mohamed and Torr,\n Philip\n},\n title = {\n OxfordTVG-HIC: Can Machine Make Humorous Captions from Images?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20293-20303\n} \n}" }, { "title": "P1AC: Revisiting Absolute Pose From a Single Affine Correspondence", @@ -42098,11 +43510,11 @@ "status": "Oral", "track": "main", "pid": "10244", - "author_site": "Jonathan Ventura, Zuzana Kukelova, Torsten Sattler, D\u00e1niel Bar\u00e1th", - "author": "Jonathan Ventura; Zuzana Kukelova; Torsten Sattler; D\u00e1niel Bar\u00e1th", + "author_site": "Jonathan Ventura, Zuzana Kukelova, Torsten Sattler, Dániel Baráth", + "author": "Jonathan Ventura; Zuzana Kukelova; Torsten Sattler; Dániel Baráth", "abstract": "Affine correspondences have traditionally been used to improve feature matching over wide baselines. While recent work has successfully used affine correspondences to solve various relative camera pose estimation problems, less attention has been given to their use in absolute pose estimation. We introduce the first general solution to the problem of estimating the pose of a calibrated camera given a single observation of an oriented point and an affine correspondence. The advantage of our approach (P1AC) is that it requires only a single correspondence, in comparison to the traditional point-based approach (P3P), significantly reducing the combinatorics in robust estimation. 
P1AC provides a general solution that removes restrictive assumptions made in prior work and is applicable to large-scale image-based localization. We propose a minimal solution to the P1AC problem and evaluate our novel solver on synthetic data, showing its numerical stability and performance under various types of noise. On standard image-based localization benchmarks we show that P1AC achieves more accurate results than the widely used P3P algorithm. Code for our method is available at https://github.com/jonathanventura/P1AC/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ventura_P1AC_Revisiting_Absolute_Pose_From_a_Single_Affine_Correspondence_ICCV_2023_paper.pdf", - "aff": "Department of Computer Science & Software Engineering, Cal Poly, San Luis Obispo; Visual Recognition Group, Faculty of Electrical Engineering, Czech Technical University in Prague; Czech Institute of Informatics, Robotics and Cybernetics, Czech Technical University in Prague; Computer Vision and Geometry Group, ETH Z \u00a8urich", + "aff": "Department of Computer Science & Software Engineering, Cal Poly, San Luis Obispo; Visual Recognition Group, Faculty of Electrical Engineering, Czech Technical University in Prague; Czech Institute of Informatics, Robotics and Cybernetics, Czech Technical University in Prague; Computer Vision and Geometry Group, ETH Zürich", "project": "", "github": "https://github.com/jonathanventura/P1AC/", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Ventura_P1AC_Revisiting_Absolute_ICCV_2023_supplemental.pdf", @@ -42115,14 +43527,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ventura_P1AC_Revisiting_Absolute_Pose_From_a_Single_Affine_Correspondence_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2", - "aff_unique_norm": "California Polytechnic State University;Czech Technical University in Prague;ETH Zurich", + "aff_unique_norm": "California Polytechnic State University;Czech Technical
University in Prague;ETH Zürich", "aff_unique_dep": "Department of Computer Science & Software Engineering;Faculty of Electrical Engineering;Computer Vision and Geometry Group", "aff_unique_url": "https://www.calpoly.edu;https://www.cvut.cz;https://www.ethz.ch", "aff_unique_abbr": "Cal Poly;CTU;ETHZ", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "San Luis Obispo;Prague;", "aff_country_unique_index": "0;1;1;2", - "aff_country_unique": "United States;Czech Republic;Switzerland" + "aff_country_unique": "United States;Czech Republic;Switzerland", + "bibtex": "@InProceedings{Ventura_2023_ICCV,\n \n author = {\n Ventura,\n Jonathan and Kukelova,\n Zuzana and Sattler,\n Torsten and Bar\\'ath,\n D\\'aniel\n},\n title = {\n P1AC: Revisiting Absolute Pose From a Single Affine Correspondence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19751-19761\n} \n}" }, { "title": "P2C: Self-Supervised Point Cloud Completion from Single Partial Clouds", @@ -42154,7 +43567,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "Australia;Saudi Arabia" + "aff_country_unique": "Australia;Saudi Arabia", + "bibtex": "@InProceedings{Cui_2023_ICCV,\n \n author = {\n Cui,\n Ruikai and Qiu,\n Shi and Anwar,\n Saeed and Liu,\n Jiawei and Xing,\n Chaoyue and Zhang,\n Jing and Barnes,\n Nick\n},\n title = {\n P2C: Self-Supervised Point Cloud Completion from Single Partial Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14351-14360\n} \n}" }, { "title": "PADCLIP: Pseudo-labeling with Adaptive Debiasing in CLIP for Unsupervised Domain Adaptation", @@ -42179,14 +43593,15 @@ "author_num": 8, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Lai_PADCLIP_Pseudo-labeling_with_Adaptive_Debiasing_in_CLIP_for_Unsupervised_Domain_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;1;1;1;0+1", - "aff_unique_norm": "University of California, Davis;Amazon", - "aff_unique_dep": ";Amazon.com, Inc.", + "aff_unique_norm": "University of California, Davis;Amazon.com, Inc.", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ucdavis.edu;https://www.amazon.com", "aff_unique_abbr": "UC Davis;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Davis;", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lai_2023_ICCV,\n \n author = {\n Lai,\n Zhengfeng and Vesdapunt,\n Noranart and Zhou,\n Ning and Wu,\n Jun and Huynh,\n Cong Phuoc and Li,\n Xuelu and Fu,\n Kah Kuen and Chuah,\n Chen-Nee\n},\n title = {\n PADCLIP: Pseudo-labeling with Adaptive Debiasing in CLIP for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16155-16165\n} \n}" }, { "title": "PADDLES: Phase-Amplitude Spectrum Disentangled Early Stopping for Learning with Noisy Labels", @@ -42218,7 +43633,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Huaxi and Kang,\n Hui and Liu,\n Sheng and Salvado,\n Olivier and Rakotoarivelo,\n Thierry and Wang,\n Dadong and Liu,\n Tongliang\n},\n title = {\n PADDLES: Phase-Amplitude Spectrum Disentangled Early Stopping for Learning with Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 16719-16730\n} \n}" }, { "title": "PARF: Primitive-Aware Radiance Fusion for Indoor Scene Novel View Synthesis", @@ -42250,7 +43666,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ying_2023_ICCV,\n \n author = {\n Ying,\n Haiyang and Jiang,\n Baowei and Zhang,\n Jinzhi and Xu,\n Di and Yu,\n Tao and Dai,\n Qionghai and Fang,\n Lu\n},\n title = {\n PARF: Primitive-Aware Radiance Fusion for Indoor Scene Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17706-17716\n} \n}" }, { "title": "PARIS: Part-level Reconstruction and Motion Analysis for Articulated Objects", @@ -42273,7 +43690,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_PARIS_Part-level_Reconstruction_and_Motion_Analysis_for_Articulated_Objects_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_PARIS_Part-level_Reconstruction_and_Motion_Analysis_for_Articulated_Objects_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jiayi and Mahdavi-Amiri,\n Ali and Savva,\n Manolis\n},\n title = {\n PARIS: Part-level Reconstruction and Motion Analysis for Articulated Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 352-363\n} \n}" }, { "title": "PARTNER: Level up the Polar Representation for LiDAR 3D Object Detection", @@ -42285,7 +43703,7 @@ "author": "Ming Nie; Yujing Xue; Chunwei Wang; Chaoqiang Ye; Hang Xu; Xinge Zhu; Qingqiu Huang; Michael Bi Mi; Xinchao Wang; Li Zhang", "abstract": "Recently, polar-based 
representation has shown promising properties in perceptual tasks. In addition to Cartesian-based approaches, which separate point clouds unevenly, representing point clouds as polar grids has been recognized as an alternative due to (1) its advantage in robust performance under different resolutions and (2) its superiority in streaming-based approaches. However, state-of-the-art polar-based detection methods inevitably suffer from the feature distortion problem because of the non-uniform division of polar representation, resulting in a non-negligible performance gap compared to Cartesian-based approaches. To tackle this issue, we present PARTNER, a novel 3D object detector in the polar coordinate. PARTNER alleviates the dilemma of feature distortion with global representation re-alignment and facilitates the regression by introducing instance-level geometric information into the detection head. Extensive experiments show overwhelming advantages in streaming-based detection and different resolutions. 
Furthermore, our method outperforms the previous polar-based works with remarkable margins of 3.68% and 9.15% on Waymo and ONCE validation set, thus achieving competitive results over the state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Nie_PARTNER_Level_up_the_Polar_Representation_for_LiDAR_3D_Object_ICCV_2023_paper.pdf", - "aff": "School of Data Science, Fudan University; National University of Singapore + Huawei International Pte Ltd; Huawei Noah\u2019s Ark Lab; Huawei ADS; Huawei ADS; Huawei International Pte Ltd; Huawei Noah\u2019s Ark Lab; Huawei International Pte Ltd; National University of Singapore; School of Data Science, Fudan University", + "aff": "School of Data Science, Fudan University; National University of Singapore + Huawei International Pte Ltd; Huawei Noah’s Ark Lab; Huawei ADS; Huawei ADS; Huawei International Pte Ltd; Huawei Noah’s Ark Lab; Huawei International Pte Ltd; National University of Singapore; School of Data Science, Fudan University", "project": "", "github": "", "supp": "", @@ -42297,15 +43715,16 @@ "email": "fudan.edu.cn;u.nus.edu;huawei.com;huawei.com;huawei.com;huawei.com;huawei.com;huawei.com;u.nus.edu;fudan.edu.cn", "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nie_PARTNER_Level_up_the_Polar_Representation_for_LiDAR_3D_Object_ICCV_2023_paper.html", - "aff_unique_index": "0;1+2;2;2;2;2;2;2;1;0", - "aff_unique_norm": "Fudan University;National University of Singapore;Huawei", - "aff_unique_dep": "School of Data Science;;Huawei International Pte Ltd", - "aff_unique_url": "https://www.fudan.edu.cn;https://www.nus.edu.sg;https://www.huawei.com/en/", - "aff_unique_abbr": "Fudan;NUS;Huawei", + "aff_unique_index": "0;1+2;3;3;3;2;3;2;1;0", + "aff_unique_norm": "Fudan University;National University of Singapore;Huawei International Pte Ltd;Huawei", + "aff_unique_dep": "School of Data Science;;;Noah’s Ark Lab", + "aff_unique_url": 
"https://www.fudan.edu.cn;https://www.nus.edu.sg;https://www.huawei.com/en/;https://www.huawei.com", + "aff_unique_abbr": "Fudan;NUS;Huawei;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+1;0;0;0;1;0;1;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Nie_2023_ICCV,\n \n author = {\n Nie,\n Ming and Xue,\n Yujing and Wang,\n Chunwei and Ye,\n Chaoqiang and Xu,\n Hang and Zhu,\n Xinge and Huang,\n Qingqiu and Mi,\n Michael Bi and Wang,\n Xinchao and Zhang,\n Li\n},\n title = {\n PARTNER: Level up the Polar Representation for LiDAR 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3801-3813\n} \n}" }, { "title": "PASTA: Proportional Amplitude Spectrum Training Augmentation for Syn-to-Real Domain Generalization", @@ -42337,7 +43756,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chattopadhyay_2023_ICCV,\n \n author = {\n Chattopadhyay,\n Prithvijit and Sarangmath,\n Kartik and Vijaykumar,\n Vivek and Hoffman,\n Judy\n},\n title = {\n PASTA: Proportional Amplitude Spectrum Training Augmentation for Syn-to-Real Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19288-19300\n} \n}" }, { "title": "PATMAT: Person Aware Tuning of Mask-Aware Transformer for Face Inpainting", @@ -42345,8 +43765,8 @@ "status": "Poster", "track": "main", "pid": "1076", - "author_site": "Saman Motamed, Jianjin Xu, Chen Henry Wu, Christian H\u00e4ne, Jean-Charles Bazin, Fernando De la Torre", - "author": "Saman Motamed; Jianjin Xu; Chen Henry Wu; 
Christian H\u00e4ne; Jean-Charles Bazin; Fernando De la Torre", + "author_site": "Saman Motamed, Jianjin Xu, Chen Henry Wu, Christian Häne, Jean-Charles Bazin, Fernando De la Torre", + "author": "Saman Motamed; Jianjin Xu; Chen Henry Wu; Christian Häne; Jean-Charles Bazin; Fernando De la Torre", "abstract": "Generative models such as StyleGAN2 and Stable Diffusion have achieved state-of-the-art performance in computer vision tasks such as image synthesis, inpainting, and de-noising. However, current generative models for face inpainting often fail to preserve fine facial details and the identity of the person, despite creating aesthetically convincing image structures and textures. In this work, we propose Person Aware Tuning (PAT) of Mask-Aware Transformer (MAT) for face inpainting, which addresses this issue. Our proposed method, PATMAT, effectively preserves identity by incorporating reference images of a subject and fine-tuning a MAT architecture trained on faces. By using 40 reference images, PATMAT creates anchor points in MAT's style module, and tunes the model using the fixed anchors to adapt the model to a new face identity. Moreover, PATMAT's use of multiple images per anchor during training allows the model to use fewer reference images than competing methods. We demonstrate that PATMAT outperforms state-of-the-art models in terms of image quality, the preservation of person-specific details, and the identity of the subject. 
Our results suggest that PATMAT can be a promising approach for improving the quality of personalized face inpainting.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Motamed_PATMAT_Person_Aware_Tuning_of_Mask-Aware_Transformer_for_Face_Inpainting_ICCV_2023_paper.pdf", "aff": "Robotics Institute, Carnegie Mellon University, Pittsburgh, PA + INSAIT, Sofia University, Bulgaria; Robotics Institute, Carnegie Mellon University, Pittsburgh, PA; Robotics Institute, Carnegie Mellon University, Pittsburgh, PA; Independent Researcher; Independent Researcher; Robotics Institute, Carnegie Mellon University, Pittsburgh, PA", @@ -42369,7 +43789,8 @@ "aff_campus_unique_index": "0+1;0;0;0", "aff_campus_unique": "Pittsburgh;Sofia;", "aff_country_unique_index": "0+1;0;0;0", - "aff_country_unique": "United States;Bulgaria;" + "aff_country_unique": "United States;Bulgaria;", + "bibtex": "@InProceedings{Motamed_2023_ICCV,\n \n author = {\n Motamed,\n Saman and Xu,\n Jianjin and Wu,\n Chen Henry and H\\"ane,\n Christian and Bazin,\n Jean-Charles and De la Torre,\n Fernando\n},\n title = {\n PATMAT: Person Aware Tuning of Mask-Aware Transformer for Face Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22778-22787\n} \n}" }, { "title": "PC-Adapter: Topology-Aware Adapter for Efficient Domain Adaption on Point Clouds with Rectified Pseudo-label", @@ -42401,7 +43822,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Joonhyung and Seo,\n Hyunjin and Yang,\n Eunho\n},\n title = {\n PC-Adapter: Topology-Aware Adapter for Efficient Domain Adaption on Point Clouds with Rectified Pseudo-label\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11530-11540\n} \n}" }, { "title": "PDiscoNet: Semantically consistent part discovery for fine-grained recognition", @@ -42424,7 +43846,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/van_der_Klis_PDiscoNet_Semantically_consistent_part_discovery_for_fine-grained_recognition_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/van_der_Klis_PDiscoNet_Semantically_consistent_part_discovery_for_fine-grained_recognition_ICCV_2023_paper.html", + "bibtex": "@InProceedings{van_der_Klis_2023_ICCV,\n \n author = {\n van der Klis,\n Robert and Alaniz,\n Stephan and Mancini,\n Massimiliano and Dantas,\n Cassio F. and Ienco,\n Dino and Akata,\n Zeynep and Marcos,\n Diego\n},\n title = {\n PDiscoNet: Semantically consistent part discovery for fine-grained recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1866-1876\n} \n}" }, { "title": "PEANUT: Predicting and Navigating to Unseen Targets", @@ -42449,14 +43872,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhai_PEANUT_Predicting_and_Navigating_to_Unseen_Targets_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign", + "aff_unique_norm": "University of Illinois at Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Albert J. 
and Wang,\n Shenlong\n},\n title = {\n PEANUT: Predicting and Navigating to Unseen Targets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10926-10935\n} \n}" }, { "title": "PETRv2: A Unified Framework for 3D Perception from Multi-Camera Images", @@ -42468,7 +43892,7 @@ "author": "Yingfei Liu; Junjie Yan; Fan Jia; Shuailin Li; Aqi Gao; Tiancai Wang; Xiangyu Zhang", "abstract": "In this paper, we propose PETRv2, a unified framework for 3D perception from multi-view images. Based on PETR, PETRv2 explores the effectiveness of temporal modeling, which utilizes the temporal information of previous frames to boost 3D object detection. More specifically, we extend the 3D position embedding (3D PE) in PETR for temporal modeling. The 3D PE achieves the temporal alignment on object position of different frames. To support for multi-task learning (e.g., BEV segmentation and 3D lane detection), PETRv2 provides a simple yet effective solution by introducing task-specific queries, which are initialized under different spaces. PETRv2 achieves state-of-the-art performance on 3D object detection, BEV segmentation and 3D lane detection. Detailed robustness analysis is also conducted on PETR framework. 
Code is available at https://github.com/megvii-research/PETR.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_PETRv2_A_Unified_Framework_for_3D_Perception_from_Multi-Camera_Images_ICCV_2023_paper.pdf", - "aff": "MEGVII Technology\u2020; MEGVII Technology\u2020; MEGVII Technology\u2020; MEGVII Technology\u2020; MEGVII Technology\u2020; MEGVII Technology; MEGVII Technology", + "aff": "MEGVII Technology†; MEGVII Technology†; MEGVII Technology†; MEGVII Technology†; MEGVII Technology†; MEGVII Technology; MEGVII Technology", "project": "", "github": "https://github.com/megvii-research/PETR", "supp": "", @@ -42481,14 +43905,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_PETRv2_A_Unified_Framework_for_3D_Perception_from_Multi-Camera_Images_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", - "aff_unique_norm": "Megvii Technology", + "aff_unique_norm": "MEGVII Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.megvii.com", "aff_unique_abbr": "MEGVII", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yingfei and Yan,\n Junjie and Jia,\n Fan and Li,\n Shuailin and Gao,\n Aqi and Wang,\n Tiancai and Zhang,\n Xiangyu\n},\n title = {\n PETRv2: A Unified Framework for 3D Perception from Multi-Camera Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3262-3272\n} \n}" }, { "title": "PG-RCNN: Semantic Surface Point Generation for 3D Object Detection", @@ -42520,7 +43945,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Koo_2023_ICCV,\n \n 
author = {\n Koo,\n Inyong and Lee,\n Inyoung and Kim,\n Se-Ho and Kim,\n Hee-Seon and Jeon,\n Woo-jin and Kim,\n Changick\n},\n title = {\n PG-RCNN: Semantic Surface Point Generation for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18142-18151\n} \n}" }, { "title": "PGFed: Personalize Each Client's Global Objective for Federated Learning", @@ -42552,7 +43978,8 @@ "aff_campus_unique_index": "0;1;1;0+0+0+0", "aff_campus_unique": "Pittsburgh;Orlando", "aff_country_unique_index": "0;0;0;0+0+0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Jun and Mendieta,\n Matias and Chen,\n Chen and Wu,\n Shandong\n},\n title = {\n PGFed: Personalize Each Client's Global Objective for Federated Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3946-3956\n} \n}" }, { "title": "PHRIT: Parametric Hand Representation with Implicit Template", @@ -42584,7 +44011,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zhisheng and Chen,\n Yujin and Kang,\n Di and Zhang,\n Jinlu and Tu,\n Zhigang\n},\n title = {\n PHRIT: Parametric Hand Representation with Implicit Template\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14974-14984\n} \n}" }, { "title": "PIDRo: Parallel Isomeric Attention with Dynamic Routing for Text-Video Retrieval", @@ -42596,7 +44024,7 @@ "author": "Peiyan Guan; Renjing Pei; Bin Shao; 
Jianzhuang Liu; Weimian Li; Jiaxi Gu; Hang Xu; Songcen Xu; Youliang Yan; Edmund Y. Lam", "abstract": "Text-video retrieval is a fundamental task with high practical value in multi-modal research. Inspired by the great success of pre-trained image-text models with large-scale data, such as CLIP, many methods are proposed to transfer the strong representation learning capability of CLIP to text-video retrieval. However, due to the modality difference between videos and images, how to effectively adapt CLIP to the video domain is still underexplored. In this paper, we investigate this problem from two aspects. First, we enhance the transferred image encoder of CLIP for fine-grained video understanding in a seamless fashion. Second, we conduct fine-grained contrast between videos and texts from both model improvement and loss design. Particularly, we propose a fine-grained contrastive model equipped with parallel isomeric attention and dynamic routing, namely PIDRo, for text-video retrieval. The parallel isomeric attention module is used as the video encoder, which consists of two parallel branches modeling the spatial-temporal information of videos from both patch and frame levels. The dynamic routing module is constructed to enhance the text encoder of CLIP, generating informative word representations by distributing the fine-grained information to the related word tokens within a sentence. Such model design provides us with informative patch, frame and word representations. We then conduct token-wise interaction upon them. With the enhanced encoders and the token-wise loss, we are able to achieve finer-grained text-video alignment and more accurate retrieval. 
PIDRo obtains state-of-the-art performance over various text-video retrieval benchmarks, including MSR-VTT, MSVD, LSMDC, DiDeMo and ActivityNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Guan_PIDRo_Parallel_Isomeric_Attention_with_Dynamic_Routing_for_Text-Video_Retrieval_ICCV_2023_paper.pdf", - "aff": "The University of Hong Kong; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; The University of Hong Kong+Huawei Noah\u2019s Ark Lab", + "aff": "The University of Hong Kong; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; The University of Hong Kong+Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Guan_PIDRo_Parallel_Isomeric_ICCV_2023_supplemental.pdf", @@ -42609,14 +44037,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guan_PIDRo_Parallel_Isomeric_Attention_with_Dynamic_Routing_for_Text-Video_Retrieval_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1;1;1;1;0+1", - "aff_unique_norm": "University of Hong Kong;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_norm": "The University of Hong Kong;Huawei", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.hku.hk;https://www.huawei.com", "aff_unique_abbr": "HKU;Huawei", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guan_2023_ICCV,\n \n author = {\n Guan,\n Peiyan and Pei,\n Renjing and Shao,\n Bin and Liu,\n Jianzhuang and Li,\n Weimian and Gu,\n Jiaxi and Xu,\n Hang and Xu,\n 
Songcen and Yan,\n Youliang and Lam,\n Edmund Y.\n},\n title = {\n PIDRo: Parallel Isomeric Attention with Dynamic Routing for Text-Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11164-11173\n} \n}" }, { "title": "PIRNet: Privacy-Preserving Image Restoration Network via Wavelet Lifting", @@ -42648,7 +44077,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n Xin and Gao,\n Chao and Xu,\n Mai\n},\n title = {\n PIRNet: Privacy-Preserving Image Restoration Network via Wavelet Lifting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22368-22377\n} \n}" }, { "title": "PNI : Industrial Anomaly Detection using Position and Neighborhood Information", @@ -42680,7 +44110,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Bae_2023_ICCV,\n \n author = {\n Bae,\n Jaehyeok and Lee,\n Jae-Han and Kim,\n Seyun\n},\n title = {\n PNI : Industrial Anomaly Detection using Position and Neighborhood Information\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6373-6383\n} \n}" }, { "title": "PODA: Prompt-driven Zero-shot Domain Adaptation", @@ -42688,8 +44119,8 @@ "status": "Poster", "track": "main", "pid": "6091", - "author_site": "Mohammad Fahes, Tuan-Hung Vu, Andrei Bursuc, Patrick P\u00e9rez, Raoul de Charette", - "author": "Mohammad Fahes; Tuan-Hung 
Vu; Andrei Bursuc; Patrick P\u00e9rez; Raoul de Charette", + "author_site": "Mohammad Fahes, Tuan-Hung Vu, Andrei Bursuc, Patrick Pérez, Raoul de Charette", + "author": "Mohammad Fahes; Tuan-Hung Vu; Andrei Bursuc; Patrick Pérez; Raoul de Charette", "abstract": "Domain adaptation has been vastly investigated in computer vision but still requires access to target images at train time, which might be intractable in some uncommon conditions. In this paper, we propose the task of 'Prompt-driven Zero-shot Domain Adaptation', where we adapt a model trained on a source domain using only a general description in natural language of the target domain, i.e., a prompt. First, we leverage a pretrained contrastive vision-language model (CLIP) to optimize affine transformations of source features, steering them towards the target text embedding while preserving their content and semantics. To achieve this, we propose Prompt-driven Instance Normalization (PIN). Second, we show that these prompt-driven augmentations can be used to perform zero-shot domain adaptation for semantic segmentation. Experiments demonstrate that our method significantly outperforms CLIP-based style transfer baselines on several datasets for the downstream task at hand, even surpassing one-shot unsupervised domain adaptation. A similar boost is observed on object detection and image classification. 
The code is available at https://github.com/astra-vision/PODA .", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Fahes_PODA_Prompt-driven_Zero-shot_Domain_Adaptation_ICCV_2023_paper.pdf", "aff": "Inria; Inria+Valeo.ai; Inria+Valeo.ai; Inria+Valeo.ai; Inria", @@ -42705,14 +44136,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fahes_PODA_Prompt-driven_Zero-shot_Domain_Adaptation_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0+1;0+1;0", - "aff_unique_norm": "INRIA;Valeo", + "aff_unique_norm": "Inria;Valeo", "aff_unique_dep": ";Valeo.ai", "aff_unique_url": "https://www.inria.fr;https://www.valeo.com", "aff_unique_abbr": "Inria;Valeo", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0+0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Fahes_2023_ICCV,\n \n author = {\n Fahes,\n Mohammad and Vu,\n Tuan-Hung and Bursuc,\n Andrei and P\\'erez,\n Patrick and de Charette,\n Raoul\n},\n title = {\n PODA: Prompt-driven Zero-shot Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18623-18633\n} \n}" }, { "title": "PODIA-3D: Domain Adaptation of 3D Generative Model Across Large Domain Gap Using Pose-Preserved Text-to-Image Diffusion", @@ -42744,7 +44176,8 @@ "aff_campus_unique_index": "0;0;1+0", "aff_campus_unique": "Seoul;Los Angeles", "aff_country_unique_index": "0;0;1+0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Gwanghyun and Jang,\n Ji Ha and Chun,\n Se Young\n},\n title = {\n PODIA-3D: Domain Adaptation of 3D Generative Model Across Large Domain Gap Using Pose-Preserved Text-to-Image Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International 
Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22603-22612\n} \n}" }, { "title": "PPR: Physically Plausible Reconstruction from Monocular Videos", @@ -42776,7 +44209,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Gengshan and Yang,\n Shuo and Zhang,\n John Z. and Manchester,\n Zachary and Ramanan,\n Deva\n},\n title = {\n PPR: Physically Plausible Reconstruction from Monocular Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3914-3924\n} \n}" }, { "title": "PRANC: Pseudo RAndom Networks for Compacting Deep Models", @@ -42808,7 +44242,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Davis;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nooralinejad_2023_ICCV,\n \n author = {\n Nooralinejad,\n Parsa and Abbasi,\n Ali and Koohpayegani,\n Soroush Abbasi and Meibodi,\n Kossar Pourahmadi and Khan,\n Rana Muhammad Shahroz and Kolouri,\n Soheil and Pirsiavash,\n Hamed\n},\n title = {\n PRANC: Pseudo RAndom Networks for Compacting Deep Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17021-17031\n} \n}" }, { "title": "PRIOR: Prototype Representation Joint Learning from Medical Images and Reports", @@ -42833,14 +44268,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cheng_PRIOR_Prototype_Representation_Joint_Learning_from_Medical_Images_and_Reports_ICCV_2023_paper.html", "aff_unique_index": "0+0;1;2;3;4;0+0", - 
"aff_unique_norm": "Southern University of Science and Technology;University of Hong Kong;University of Queensland;University of British Columbia;Sun Yat-sen University", + "aff_unique_norm": "Southern University of Science and Technology;The University of Hong Kong;The University of Queensland;University of British Columbia;Sun Yat-sen University", "aff_unique_dep": "Department of Electronic and Electrical Engineering;Department of Electrical and Electronic Engineering;Queensland Brain Institute;School of Biomedical Engineering;", "aff_unique_url": "https://www.sustech.edu.cn;https://www.hku.hk;https://www.uq.edu.au;https://www.ubc.ca;http://www.sysu.edu.cn/", "aff_unique_abbr": "SUSTech;HKU;UQ;UBC;SYSU", "aff_campus_unique_index": "1;2;3;4;5;1", "aff_campus_unique": ";Jiaxing;Hong Kong SAR;Queensland;Vancouver;Shenzhen", "aff_country_unique_index": "0+0;0;1;2;0;0+0", - "aff_country_unique": "China;Australia;Canada" + "aff_country_unique": "China;Australia;Canada", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Pujin and Lin,\n Li and Lyu,\n Junyan and Huang,\n Yijin and Luo,\n Wenhan and Tang,\n Xiaoying\n},\n title = {\n PRIOR: Prototype Representation Joint Learning from Medical Images and Reports\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21361-21371\n} \n}" }, { "title": "PVT++: A Simple End-to-End Latency-Aware Visual Tracking Framework", @@ -42863,7 +44299,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_PVT_A_Simple_End-to-End_Latency-Aware_Visual_Tracking_Framework_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_PVT_A_Simple_End-to-End_Latency-Aware_Visual_Tracking_Framework_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Bowen and Huang,\n Ziyuan and 
Ye,\n Junjie and Li,\n Yiming and Scherer,\n Sebastian and Zhao,\n Hang and Fu,\n Changhong\n},\n title = {\n PVT++: A Simple End-to-End Latency-Aware Visual Tracking Framework\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10006-10016\n} \n}" }, { "title": "Pairwise Similarity Learning is SimPLE", @@ -42871,8 +44308,8 @@ "status": "Poster", "track": "main", "pid": "10034", - "author_site": "Yandong Wen, Weiyang Liu, Yao Feng, Bhiksha Raj, Rita Singh, Adrian Weller, Michael J. Black, Bernhard Sch\u00f6lkopf", - "author": "Yandong Wen; Weiyang Liu; Yao Feng; Bhiksha Raj; Rita Singh; Adrian Weller; Michael J. Black; Bernhard Sch\u00f6lkopf", + "author_site": "Yandong Wen, Weiyang Liu, Yao Feng, Bhiksha Raj, Rita Singh, Adrian Weller, Michael J. Black, Bernhard Schölkopf", + "author": "Yandong Wen; Weiyang Liu; Yao Feng; Bhiksha Raj; Rita Singh; Adrian Weller; Michael J. Black; Bernhard Schölkopf", "abstract": "In this paper, we focus on a general yet important learning problem, pairwise similarity learning (PSL). PSL subsumes a wide range of important applications, such as open-set face recognition, speaker verification, image retrieval and person re-identification. The goal of PSL is to learn a pairwise similarity function assigning a higher similarity score to positive pairs (i.e., a pair of samples with the same label) than to negative pairs (i.e., a pair of samples with different label). We start by identifying a key desideratum for PSL, and then discuss how existing methods can achieve this desideratum. We then propose a surprisingly simple proxy-free method, called SimPLE, which requires neither feature/proxy normalization nor angular margin and yet is able to generalize well in open-set recognition. We apply the proposed method to three challenging PSL tasks: open-set face recognition, image retrieval and speaker verification. 
Comprehensive experimental results on large-scale benchmarks show that our method performs significantly better than current state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wen_Pairwise_Similarity_Learning_is_SimPLE_ICCV_2023_paper.pdf", "aff": "Max Planck Institute for Intelligent Systems; Max Planck Institute for Intelligent Systems+University of Cambridge; Max Planck Institute for Intelligent Systems; Carnegie Mellon University+Mohamed bin Zayed University of Artificial Intelligence; Carnegie Mellon University; University of Cambridge+The Alan Turing Institute; Max Planck Institute for Intelligent Systems; Max Planck Institute for Intelligent Systems", @@ -42888,14 +44325,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wen_Pairwise_Similarity_Learning_is_SimPLE_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;2+3;2;1+4;0;0", - "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Cambridge;Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;Alan Turing Institute", + "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Cambridge;Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;The Alan Turing Institute", "aff_unique_dep": "Intelligent Systems;;;;", - "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.cam.ac.uk;https://www.cmu.edu;https://mbzuai.ac.ae;https://www.turing.ac.uk", + "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.cam.ac.uk;https://www.cmu.edu;https://www.mbzuai.ac.ae;https://www.turing.ac.uk", "aff_unique_abbr": "MPI-IS;Cambridge;CMU;MBZUAI;ATI", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0+1;0;2+3;2;1+1;0;0", - "aff_country_unique": "Germany;United Kingdom;United States;United Arab Emirates" + "aff_country_unique": "Germany;United Kingdom;United States;United Arab Emirates", + "bibtex": 
"@InProceedings{Wen_2023_ICCV,\n \n author = {\n Wen,\n Yandong and Liu,\n Weiyang and Feng,\n Yao and Raj,\n Bhiksha and Singh,\n Rita and Weller,\n Adrian and Black,\n Michael J. and Sch\\"olkopf,\n Bernhard\n},\n title = {\n Pairwise Similarity Learning is SimPLE\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5308-5318\n} \n}" }, { "title": "PanFlowNet: A Flow-Based Deep Network for Pan-Sharpening", @@ -42907,7 +44345,7 @@ "author": "Gang Yang; Xiangyong Cao; Wenzhe Xiao; Man Zhou; Aiping Liu; Xun Chen; Deyu Meng", "abstract": "Pan-sharpening aims to generate a high-resolution multispectral (HRMS) image by integrating the spectral information of a low-resolution multispectral (LRMS) image with the texture details of a high-resolution panchromatic (PAN) image. It essentially inherits the ill-posed nature of the super-resolution (SR) task that diverse HRMS images can degrade into an LRMS image. However, existing deep learning-based methods recover only one HRMS image from the LRMS image and PAN image using a deterministic mapping, thus ignoring the diversity of the HRMS image. In this paper, to alleviate this ill-posed issue, we propose a flow-based pan-sharpening network (PanFlowNet) to directly learn the conditional distribution of HRMS image given LRMS image and PAN image instead of learning a deterministic mapping. Specifically, we first transform this unknown conditional distribution into a given Gaussian distribution by an invertible network, and the conditional distribution can thus be explicitly defined. Then, we design an invertible Conditional Affine Coupling Block (CACB) and further build the architecture of PanFlowNet by stacking a series of CACBs. Finally, the PanFlowNet is trained by maximizing the log-likelihood of the conditional distribution given a training set and can then be used to predict diverse HRMS images. 
The experimental results verify that the proposed PanFlowNet can generate various HRMS images given an LRMS image and a PAN image. Additionally, the experimental results on different kinds of satellite datasets also demonstrate the superiority of our PanFlowNet compared with other state-of-the-art methods both visually and quantitatively. Code is available at Github.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yang_PanFlowNet_A_Flow-Based_Deep_Network_for_Pan-Sharpening_ICCV_2023_paper.pdf", - "aff": "University of Science and Technology of China; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; Nanyang Technological University; University of Science and Technology of China; University of Science and Technology of China; Xi\u2019an Jiaotong University+Macao Institute of Systems Engineering, Macau University of Science and Technology, Taipa, Macao", + "aff": "University of Science and Technology of China; Xi’an Jiaotong University; Xi’an Jiaotong University; Nanyang Technological University; University of Science and Technology of China; University of Science and Technology of China; Xi’an Jiaotong University+Macao Institute of Systems Engineering, Macau University of Science and Technology, Taipa, Macao", "project": "", "github": "", "supp": "", @@ -42920,14 +44358,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_PanFlowNet_A_Flow-Based_Deep_Network_for_Pan-Sharpening_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;0;0;1+3", - "aff_unique_norm": "University of Science and Technology of China;Xi'an Jiao Tong University;Nanyang Technological University;Macau University of Science and Technology", + "aff_unique_norm": "University of Science and Technology of China;Xi'an Jiaotong University;Nanyang Technological University;Macau University of Science and Technology", "aff_unique_dep": ";;;Macao Institute of Systems Engineering", "aff_unique_url": 
"http://www.ustc.edu.cn;https://www.xjtu.edu.cn;https://www.ntu.edu.sg;https://www.must.edu.mo", "aff_unique_abbr": "USTC;XJTU;NTU;MUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Macau SAR", "aff_country_unique_index": "0;0;0;1;0;0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Gang and Cao,\n Xiangyong and Xiao,\n Wenzhe and Zhou,\n Man and Liu,\n Aiping and Chen,\n Xun and Meng,\n Deyu\n},\n title = {\n PanFlowNet: A Flow-Based Deep Network for Pan-Sharpening\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16857-16867\n} \n}" }, { "title": "Panoramas from Photons", @@ -42959,7 +44398,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jungerman_2023_ICCV,\n \n author = {\n Jungerman,\n Sacha and Ingle,\n Atul and Gupta,\n Mohit\n},\n title = {\n Panoramas from Photons\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10626-10636\n} \n}" }, { "title": "ParCNetV2: Oversized Kernel with Enhanced Attention", @@ -42991,7 +44431,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Ruihan and Zhang,\n Haokui and Hu,\n Wenze and Zhang,\n Shiliang and Wang,\n Xiaoyu\n},\n title = {\n ParCNetV2: Oversized Kernel with Enhanced Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 5752-5762\n} \n}" }, { "title": "Parallax-Tolerant Unsupervised Deep Image Stitching", @@ -43016,14 +44457,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nie_Parallax-Tolerant_Unsupervised_Deep_Image_Stitching_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;0+1", - "aff_unique_norm": "Beijing Jiao Tong University;Beijing Key Laboratory of Advanced Information Science and Network;University of Electronic Science and Technology of China", + "aff_unique_norm": "Beijing Jiaotong University;Beijing Key Laboratory of Advanced Information Science and Network;University of Electronic Science and Technology of China", "aff_unique_dep": "Institute of Information Science;Advanced Information Science and Network;", "aff_unique_url": "http://www.bjtu.edu.cn;;http://www.uestc.edu.cn", "aff_unique_abbr": "BJTU;;UESTC", "aff_campus_unique_index": "0+0;0+0;0+0;1;0+0", "aff_campus_unique": "Beijing;Chengdu", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Nie_2023_ICCV,\n \n author = {\n Nie,\n Lang and Lin,\n Chunyu and Liao,\n Kang and Liu,\n Shuaicheng and Zhao,\n Yao\n},\n title = {\n Parallax-Tolerant Unsupervised Deep Image Stitching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7399-7408\n} \n}" }, { "title": "Parallel Attention Interaction Network for Few-Shot Skeleton-Based Action Recognition", @@ -43035,7 +44477,7 @@ "author": "Xingyu Liu; Sanping Zhou; Le Wang; Gang Hua", "abstract": "Learning discriminative features from very few labeled samples to identify novel classes has received increasing attention in skeleton-based action recognition. 
Existing works aim to learn action-specific embeddings by exploiting either intra-skeleton or inter-skeleton spatial associations, which may lead to less discriminative representations. To address these issues, we propose a novel Parallel Attention Interaction Network (PAINet) that incorporates two complementary branches to strengthen the match by inter-skeleton and intra-skeleton correlation. Specifically, a topology encoding module utilizing topology and physical information is proposed to enhance the modeling of interactive parts and joint pairs in both branches. In the Cross Spatial Alignment branch, we employ a spatial cross-attention module to establish joint associations across sequences, and a directional Average Symmetric Surface Metric is introduced to locate the closest temporal similarity. In parallel, the Cross Temporal Alignment branch incorporates a spatial self-attention module to aggregate spatial context within sequences as well as applies the temporal cross-attention network to correct misalignment temporally and calculate similarity. 
Extensive experiments on three skeleton benchmarks, namely NTU-T, NTU-S, and Kinetics, demonstrate the superiority of our framework and consistently outperform state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_Parallel_Attention_Interaction_Network_for_Few-Shot_Skeleton-Based_Action_Recognition_ICCV_2023_paper.pdf", - "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Wormpex AI Research", + "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Wormpex AI Research", "project": "", "github": "", "supp": "", @@ -43048,14 +44490,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Parallel_Attention_Interaction_Network_for_Few-Shot_Skeleton-Based_Action_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Wormpex AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;Wormpex AI Research", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;AI Research", "aff_unique_url": "http://www.xjtu.edu.cn;", "aff_unique_abbr": "XJTU;Wormpex AI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xingyu and Zhou,\n Sanping and Wang,\n Le and Hua,\n Gang\n},\n title = {\n Parallel Attention Interaction Network for Few-Shot Skeleton-Based Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1379-1388\n} \n}" }, { "title": "Parameterized Cost Volume for Stereo Matching", @@ -43063,6 +44506,7 @@ "status": "Poster", "track": "main", "pid": "3418", + "author_site": "Jiaxi Zeng, Chengtang Yao, Lidong Yu, Yuwei Wu, Yunde Jia", "author": "Jiaxi Zeng, Chengtang Yao, Lidong Yu, Yuwei Wu, Yunde Jia", "abstract": "Stereo matching becomes computationally challenging when dealing with a large disparity range. Prior methods mainly alleviate the computation through dynamic cost volume by focusing on a local disparity space, but it requires many iterations to get close to the ground truth due to the lack of a global view. We find that the dynamic cost volume approximately encodes the disparity space as a single Gaussian distribution with a fixed and small variance at each iteration, which results in an inadequate global view over disparity space and a small update step at every iteration. 
In this paper, we propose a parameterized cost volume to encode the entire disparity space using multi-Gaussian distribution. The disparity distribution of each pixel is parameterized by weights, means, and variances. The means and variances are used to sample disparity candidates for cost computation, while the weights and means are used to calculate the disparity output. The above parameters are computed through a JS-divergence-based optimization, which is realized as a gradient descent update in a feed-forward differential module. Experiments show that our method speeds up the runtime of RAFT-Stereo by 4 15 times, achieving real-time performance and comparable accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zeng_Parameterized_Cost_Volume_for_Stereo_Matching_ICCV_2023_paper.pdf", @@ -43074,7 +44518,8 @@ "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16922848387664216054&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zeng_Parameterized_Cost_Volume_for_Stereo_Matching_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zeng_Parameterized_Cost_Volume_for_Stereo_Matching_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zeng_2023_ICCV,\n \n author = {\n Zeng,\n Jiaxi and Yao,\n Chengtang and Yu,\n Lidong and Wu,\n Yuwei and Jia,\n Yunde\n},\n title = {\n Parameterized Cost Volume for Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18347-18357\n} \n}" }, { "title": "Parametric Classification for Generalized Category Discovery: A Baseline Study", @@ -43099,14 +44544,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wen_Parametric_Classification_for_Generalized_Category_Discovery_A_Baseline_Study_ICCV_2023_paper.html", "aff_unique_index": "0;1;0", - 
"aff_unique_norm": "University of Hong Kong;University of Edinburgh", + "aff_unique_norm": "The University of Hong Kong;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.hku.hk;https://www.ed.ac.uk", "aff_unique_abbr": "HKU;Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Wen_2023_ICCV,\n \n author = {\n Wen,\n Xin and Zhao,\n Bingchen and Qi,\n Xiaojuan\n},\n title = {\n Parametric Classification for Generalized Category Discovery: A Baseline Study\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16590-16600\n} \n}" }, { "title": "Parametric Depth Based Feature Representation Learning for Object Detection and Segmentation in Bird's-Eye View", @@ -43131,14 +44577,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Parametric_Depth_Based_Feature_Representation_Learning_for_Object_Detection_and_ICCV_2023_paper.html", "aff_unique_index": "0;1;0+2;2", - "aff_unique_norm": "Australian National University;University of Hong Kong;NVIDIA", - "aff_unique_dep": ";;NVIDIA Corporation", + "aff_unique_norm": "Australian National University;The University of Hong Kong;NVIDIA Corporation", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.anu.edu.au;https://www.hku.hk;https://www.nvidia.com", "aff_unique_abbr": "ANU;HKU;NVIDIA", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;0+2;2", - "aff_country_unique": "Australia;China;United States" + "aff_country_unique": "Australia;China;United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Jiayu and Xie,\n Enze and Liu,\n Miaomiao and Alvarez,\n Jose M.\n},\n title = {\n 
Parametric Depth Based Feature Representation Learning for Object Detection and Segmentation in Bird's-Eye View\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8483-8492\n} \n}" }, { "title": "Parametric Information Maximization for Generalized Category Discovery", @@ -43150,7 +44597,7 @@ "author": "Florent Chiaroni; Jose Dolz; Ziko Imtiaz Masud; Amar Mitiche; Ismail Ben Ayed", "abstract": "We introduce a Parametric Information Maximization (PIM) model for the Generalized Category Discovery (GCD) problem. Specifically, we propose a bi-level optimization formulation, which explores a parameterized family of objective functions, each evaluating a weighted mutual information between the features and the latent labels, subject to supervision constraints from the labeled samples. Our formulation mitigates the class-balance bias encoded in standard information maximization approaches, thereby handling effectively both short-tailed and long-tailed data sets. We report extensive experiments and comparisons demonstrating that our PIM model consistently sets new state-of-the-art performances in GCD across six different datasets, more so when dealing with challenging fine-grained problems. 
Our code: https://github.com/ThalesGroup/pim-generalized-category-discovery.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chiaroni_Parametric_Information_Maximization_for_Generalized_Category_Discovery_ICCV_2023_paper.pdf", - "aff": "\u00b4ETS Montreal + Thales Digital Solutions; \u00b4ETS Montreal; Thales Digital Solutions; INRS; \u00b4ETS Montreal", + "aff": "´ETS Montreal + Thales Digital Solutions; ´ETS Montreal; Thales Digital Solutions; INRS; ´ETS Montreal", "project": "", "github": "https://github.com/ThalesGroup/pim-generalized-category-discovery", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Chiaroni_Parametric_Information_Maximization_ICCV_2023_supplemental.pdf", @@ -43163,14 +44610,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chiaroni_Parametric_Information_Maximization_for_Generalized_Category_Discovery_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;1;2;0", - "aff_unique_norm": "\u00c9cole de technologie sup\u00e9rieure;Thales;Institut National de la Recherche Scientifique", + "aff_unique_norm": "École de technologie supérieure;Thales;Institut National de la Recherche Scientifique", "aff_unique_dep": ";Digital Solutions;", "aff_unique_url": "https://www.etsmtl.ca;https://www.thalesgroup.com;https://www.inrs.ca", "aff_unique_abbr": "ETS;Thales;INRS", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0+1;0;1;0;0", - "aff_country_unique": "Canada;France" + "aff_country_unique": "Canada;France", + "bibtex": "@InProceedings{Chiaroni_2023_ICCV,\n \n author = {\n Chiaroni,\n Florent and Dolz,\n Jose and Masud,\n Ziko Imtiaz and Mitiche,\n Amar and Ben Ayed,\n Ismail\n},\n title = {\n Parametric Information Maximization for Generalized Category Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1729-1739\n} 
\n}" }, { "title": "Part-Aware Transformer for Generalizable Person Re-identification", @@ -43178,6 +44626,7 @@ "status": "Poster", "track": "main", "pid": "6052", + "author_site": "Hao Ni, Yuke Li, Lianli Gao, Heng Tao Shen, Jingkuan Song", "author": "Hao Ni, Yuke Li, Lianli Gao, Heng Tao Shen, Jingkuan Song", "abstract": "Domain generalization person re-identification (DG ReID) aims to train a model on source domains and generalize well on unseen domains. Vision Transformer usually yields better generalization ability than common CNN networks under distribution shifts. However, Transformer-based ReID models inevitably overfit to domain-specific biases due to the supervised learning strategy on the source domain. We observe that while the global images of different IDs should have different features, their similar local parts (e.g., black backpack) are not bounded by this constraint. Motivated by this, we propose a pure Transformer model (termed Part-aware Transformer) for DG-ReID by designing a proxy task, named Cross-ID Similarity Learning (CSL), to mine local visual information shared by different IDs. This proxy task allows the model to learn generic features because it only cares about the visual similarity of the parts regardless of the ID labels, thus alleviating the side effect of domain-specific biases. Based on the local similarity obtained in CSL, a Part-guided Self-Distillation (PSD) is proposed to further improve the generalization of global features. Our method achieves state-of-the-art performance under most DG ReID settings. 
The code is available at https://github.com/liyuke65535/Part-Aware-Transformer.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ni_Part-Aware_Transformer_for_Generalizable_Person_Re-identification_ICCV_2023_paper.pdf", @@ -43189,7 +44638,8 @@ "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6177184922429834583&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ni_Part-Aware_Transformer_for_Generalizable_Person_Re-identification_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ni_Part-Aware_Transformer_for_Generalizable_Person_Re-identification_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ni_2023_ICCV,\n \n author = {\n Ni,\n Hao and Li,\n Yuke and Gao,\n Lianli and Shen,\n Heng Tao and Song,\n Jingkuan\n},\n title = {\n Part-Aware Transformer for Generalizable Person Re-identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11280-11289\n} \n}" }, { "title": "Partition Speeds Up Learning Implicit Neural Representations Based on Exponential-Increase Hypothesis", @@ -43214,14 +44664,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Partition_Speeds_Up_Learning_Implicit_Neural_Representations_Based_on_Exponential-Increase_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;2", - "aff_unique_norm": "Zhejiang University;University of Melbourne;Hong Kong Baptist University", + "aff_unique_norm": "Zhejiang University;The University of Melbourne;Hong Kong Baptist University", "aff_unique_dep": "College of Computer Science;School of Computing and Information Systems;Department of Computer Science", "aff_unique_url": "http://www.zju.edu.cn;https://www.unimelb.edu.au;https://www.hkbu.edu.hk", "aff_unique_abbr": "ZJU;UniMelb;HKBU", "aff_campus_unique_index": 
"0;1;0;0;0;2", "aff_campus_unique": "Hangzhou;Melbourne;Hong Kong", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Ke and Liu,\n Feng and Wang,\n Haishuai and Ma,\n Ning and Bu,\n Jiajun and Han,\n Bo\n},\n title = {\n Partition Speeds Up Learning Implicit Neural Representations Based on Exponential-Increase Hypothesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5474-5483\n} \n}" }, { "title": "Partition-And-Debias: Agnostic Biases Mitigation via a Mixture of Biases-Specific Experts", @@ -43246,14 +44697,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Partition-And-Debias_Agnostic_Biases_Mitigation_via_a_Mixture_of_Biases-Specific_Experts_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Tokyo", + "aff_unique_norm": "The University of Tokyo", "aff_unique_dep": "", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Jiaxuan and Vo,\n Duc Minh and Nakayama,\n Hideki\n},\n title = {\n Partition-And-Debias: Agnostic Biases Mitigation via a Mixture of Biases-Specific Experts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4924-4934\n} \n}" }, { "title": "Passive Ultra-Wideband Single-Photon Imaging", @@ -43285,7 +44737,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Canada" + 
"aff_country_unique": "Canada", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Mian and Nousias,\n Sotiris and Gulve,\n Rahul and Lindell,\n David B. and Kutulakos,\n Kiriakos N.\n},\n title = {\n Passive Ultra-Wideband Single-Photon Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8135-8146\n} \n}" }, { "title": "PatchCT: Aligning Patch Set and Label Set with Conditional Transport for Multi-Label Image Classification", @@ -43297,7 +44750,7 @@ "author": "Miaoge Li; Dongsheng Wang; Xinyang Liu; Zequn Zeng; Ruiying Lu; Bo Chen; Mingyuan Zhou", "abstract": "Multi-label image classification is a prediction task that aims to identify more than one label from a given image. This paper considers the semantic consistency of the latent space between the visual patch and linguistic label domains and introduces the conditional transport (CT) theory to bridge the acknowledged gap. While recent cross-modal attention-based studies have attempted to align such two representations and achieved impressive performance, they required carefully-designed alignment modules and extra complex operations in the attention computation. We find that by formulating the multi-label classification as a CT problem, we can exploit the interactions between the image and label efficiently by minimizing the bidirectional CT cost. Specifically, after feeding the images and textual labels into the modality-specific encoders, we view each image as a mixture of patch embeddings and a mixture of label embeddings, which capture the local region features and the class prototypes, respectively. CT is then employed to learn and align those two semantic sets by defining the forward and backward navigators. 
Importantly, the defined navigators in CT distance model the similarities between patches and labels, which provides an interpretable tool to visualize the learned prototypes. Extensive experiments on three public image benchmarks show that the proposed model consistently outperforms the previous methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_PatchCT_Aligning_Patch_Set_and_Label_Set_with_Conditional_Transport_ICCV_2023_paper.pdf", - "aff": "National Key Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, Shanxi 710071, China; McCombs School of Business, The University of Texas at Austin, Austin, TX 78712, USA", + "aff": "National Key Laboratory of Radar Signal Processing, Xidian University, Xi’an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi’an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi’an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi’an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi’an, Shanxi 710071, China; National Key Laboratory of Radar Signal Processing, Xidian University, Xi’an, Shanxi 710071, China; McCombs School of Business, The University of Texas at Austin, Austin, TX 78712, USA", "project": "", "github": "", "supp": "", @@ -43310,14 +44763,15 @@ "author_num": 7, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Li_PatchCT_Aligning_Patch_Set_and_Label_Set_with_Conditional_Transport_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;1", - "aff_unique_norm": "Xidian University;University of Texas at Austin", + "aff_unique_norm": "Xidian University;The University of Texas at Austin", "aff_unique_dep": "National Key Laboratory of Radar Signal Processing;McCombs School of Business", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.mccombs.utexas.edu", "aff_unique_abbr": "Xidian;UT Austin", "aff_campus_unique_index": "0;0;0;0;0;0;1", "aff_campus_unique": "Xi'an;Austin", "aff_country_unique_index": "0;0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Miaoge and Wang,\n Dongsheng and Liu,\n Xinyang and Zeng,\n Zequn and Lu,\n Ruiying and Chen,\n Bo and Zhou,\n Mingyuan\n},\n title = {\n PatchCT: Aligning Patch Set and Label Set with Conditional Transport for Multi-Label Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15348-15358\n} \n}" }, { "title": "Perceptual Artifacts Localization for Image Synthesis Tasks", @@ -43340,7 +44794,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Perceptual_Artifacts_Localization_for_Image_Synthesis_Tasks_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Perceptual_Artifacts_Localization_for_Image_Synthesis_Tasks_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Lingzhi and Xu,\n Zhengjie and Barnes,\n Connelly and Zhou,\n Yuqian and Liu,\n Qing and Zhang,\n He and Amirghodsi,\n Sohrab and Lin,\n Zhe and Shechtman,\n Eli and Shi,\n Jianbo\n},\n title = 
{\n Perceptual Artifacts Localization for Image Synthesis Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7579-7590\n} \n}" }, { "title": "Perceptual Grouping in Contrastive Vision-Language Models", @@ -43352,7 +44807,7 @@ "author": "Kanchana Ranasinghe; Brandon McKinzie; Sachin Ravi; Yinfei Yang; Alexander Toshev; Jonathon Shlens", "abstract": "Recent advances in zero-shot image recognition suggest that vision-language models learn generic visual representations with a high degree of semantic information that may be arbitrarily probed with natural language phrases. Understanding an image, however, is not just about understanding what content resides within an image, but importantly, where that content resides. In this work we examine how well vision-language models are able to understand where objects reside within an image and group together visually related parts of the imagery. We demonstrate how contemporary vision and language representation learning models based on contrastive losses and large web-based data capture limited object localization information. We propose a minimal set of modifications that results in models that uniquely learn both semantic and spatial information. We measure this performance in terms of zero-shot image recognition, unsupervised bottom-up and top-down semantic segmentations, as well as robustness analyses. 
We find that the resulting model achieves state-of-the-art results in terms of unsupervised segmentation, and demonstrate that the learned representations are uniquely robust to spurious correlations in datasets designed to probe the causal behavior of vision models.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ranasinghe_Perceptual_Grouping_in_Contrastive_Vision-Language_Models_ICCV_2023_paper.pdf", - "aff": "Apple*; Apple; Apple; Apple; Apple; Apple\u2020", + "aff": "Apple*; Apple; Apple; Apple; Apple; Apple†", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Ranasinghe_Perceptual_Grouping_in_ICCV_2023_supplemental.pdf", @@ -43365,14 +44820,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ranasinghe_Perceptual_Grouping_in_Contrastive_Vision-Language_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0", - "aff_unique_norm": "Apple", - "aff_unique_dep": "Apple Inc.", + "aff_unique_norm": "Apple Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ranasinghe_2023_ICCV,\n \n author = {\n Ranasinghe,\n Kanchana and McKinzie,\n Brandon and Ravi,\n Sachin and Yang,\n Yinfei and Toshev,\n Alexander and Shlens,\n Jonathon\n},\n title = {\n Perceptual Grouping in Contrastive Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5571-5584\n} \n}" }, { "title": "Periodically Exchange Teacher-Student for Source-Free Object Detection", @@ -43404,7 +44860,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": 
"China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Qipeng and Lin,\n Luojun and Shen,\n Zhifeng and Yang,\n Zhifeng\n},\n title = {\n Periodically Exchange Teacher-Student for Source-Free Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6414-6424\n} \n}" }, { "title": "Perpetual Humanoid Control for Real-time Simulated Avatars", @@ -43436,7 +44893,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Zhengyi and Cao,\n Jinkun and AlexanderWinkler and Kitani,\n Kris and Xu,\n Weipeng\n},\n title = {\n Perpetual Humanoid Control for Real-time Simulated Avatars\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10895-10904\n} \n}" }, { "title": "Persistent-Transient Duality: A Multi-Mechanism Approach for Modeling Human-Object Interaction", @@ -43461,14 +44919,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tran_Persistent-Transient_Duality_A_Multi-Mechanism_Approach_for_Modeling_Human-Object_Interaction_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0", - "aff_unique_norm": "Deakin University;Amazon", - "aff_unique_dep": "Applied AI Institute;Amazon.com, Inc.", + "aff_unique_norm": "Deakin University;Amazon.com, Inc.", + "aff_unique_dep": "Applied AI Institute;", "aff_unique_url": "https://www.deakin.edu.au;https://www.amazon.com", "aff_unique_abbr": ";Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": 
"Australia;United States", + "bibtex": "@InProceedings{Tran_2023_ICCV,\n \n author = {\n Tran,\n Hung and Le,\n Vuong and Venkatesh,\n Svetha and Tran,\n Truyen\n},\n title = {\n Persistent-Transient Duality: A Multi-Mechanism Approach for Modeling Human-Object Interaction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9858-9867\n} \n}" }, { "title": "Person Re-Identification without Identification via Event anonymization", @@ -43500,7 +44959,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Ahmad_2023_ICCV,\n \n author = {\n Ahmad,\n Shafiq and Morerio,\n Pietro and Del Bue,\n Alessio\n},\n title = {\n Person Re-Identification without Identification via Event anonymization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11132-11141\n} \n}" }, { "title": "Personalized Image Generation for Color Vision Deficiency Population", @@ -43525,14 +44985,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jiang_Personalized_Image_Generation_for_Color_Vision_Deficiency_Population_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "University of Sydney;Pengcheng Laboratory", - "aff_unique_dep": ";Peng Cheng Laboratory", + "aff_unique_norm": "University of Sydney;Peng Cheng Laboratory", + "aff_unique_dep": ";", "aff_unique_url": "https://www.sydney.edu.au;http://www.pcl.ac.cn", "aff_unique_abbr": "USYD;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n 
Jiang,\n Shuyi and Liu,\n Daochang and Li,\n Dingquan and Xu,\n Chang\n},\n title = {\n Personalized Image Generation for Color Vision Deficiency Population\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22571-22580\n} \n}" }, { "title": "Personalized Semantics Excitation for Federated Image Classification", @@ -43564,7 +45025,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Haifeng and Li,\n Kai and Ding,\n Zhengming\n},\n title = {\n Personalized Semantics Excitation for Federated Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19301-19310\n} \n}" }, { "title": "PhaseMP: Robust 3D Pose Estimation via Phase-conditioned Human Motion Prior", @@ -43589,14 +45051,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shi_PhaseMP_Robust_3D_Pose_Estimation_via_Phase-conditioned_Human_Motion_Prior_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;2", - "aff_unique_norm": "University of Hong Kong;Meta;Seoul National University", - "aff_unique_dep": ";Meta Platforms, Inc.;", + "aff_unique_norm": "The University of Hong Kong;Meta Platforms, Inc.;Seoul National University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.hku.hk;https://meta.com;https://www.snu.ac.kr", "aff_unique_abbr": "HKU;Meta;SNU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;0;2", - "aff_country_unique": "China;United States;South Korea" + "aff_country_unique": "China;United States;South Korea", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n 
Shi,\n Mingyi and Starke,\n Sebastian and Ye,\n Yuting and Komura,\n Taku and Won,\n Jungdam\n},\n title = {\n PhaseMP: Robust 3D Pose Estimation via Phase-conditioned Human Motion Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14725-14737\n} \n}" }, { "title": "Phasic Content Fusing Diffusion Model with Directional Distribution Consistency for Few-Shot Model Adaption", @@ -43628,7 +45091,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0;0;0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Teng and Zhang,\n Jiangning and Liu,\n Liang and Yi,\n Ran and Kou,\n Siqi and Zhu,\n Haokun and Chen,\n Xu and Wang,\n Yabiao and Wang,\n Chengjie and Ma,\n Lizhuang\n},\n title = {\n Phasic Content Fusing Diffusion Model with Directional Distribution Consistency for Few-Shot Model Adaption\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2406-2415\n} \n}" }, { "title": "PhysDiff: Physics-Guided Human Motion Diffusion Model", @@ -43653,14 +45117,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yuan_PhysDiff_Physics-Guided_Human_Motion_Diffusion_Model_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "NVIDIA", - "aff_unique_dep": "NVIDIA Corporation", + "aff_unique_norm": "NVIDIA Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n 
Ye and Song,\n Jiaming and Iqbal,\n Umar and Vahdat,\n Arash and Kautz,\n Jan\n},\n title = {\n PhysDiff: Physics-Guided Human Motion Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16010-16021\n} \n}" }, { "title": "Physically-Plausible Illumination Distribution Estimation", @@ -43683,7 +45148,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ershov_Physically-Plausible_Illumination_Distribution_Estimation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ershov_Physically-Plausible_Illumination_Distribution_Estimation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ershov_2023_ICCV,\n \n author = {\n Ershov,\n Egor and Tesalin,\n Vasily and Ermakov,\n Ivan and Brown,\n Michael S.\n},\n title = {\n Physically-Plausible Illumination Distribution Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12928-12936\n} \n}" }, { "title": "Physics-Augmented Autoencoder for 3D Skeleton-Based Gait Recognition", @@ -43715,7 +45181,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Troy", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Hongji and Ji,\n Qiang\n},\n title = {\n Physics-Augmented Autoencoder for 3D Skeleton-Based Gait Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19627-19638\n} \n}" }, { "title": "Physics-Driven Turbulence Image Restoration with Stochastic Refinement", @@ -43747,7 +45214,8 @@ "aff_campus_unique_index": "0;0", 
"aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jaiswal_2023_ICCV,\n \n author = {\n Jaiswal,\n Ajay and Zhang,\n Xingguang and Chan,\n Stanley H. and Wang,\n Zhangyang\n},\n title = {\n Physics-Driven Turbulence Image Restoration with Stochastic Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12170-12181\n} \n}" }, { "title": "PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction", @@ -43772,14 +45240,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ding_PivotNet_Vectorized_Pivot_Learning_for_End-to-end_HD_Map_Construction_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Megvii Technology", + "aff_unique_norm": "MEGVII Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.megvii.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n Wenjie and Qiao,\n Limeng and Qiu,\n Xi and Zhang,\n Chi\n},\n title = {\n PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3672-3682\n} \n}" }, { "title": "Pix2Video: Video Editing using Image Diffusion", @@ -43811,7 +45280,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Ceylan_2023_ICCV,\n \n author = {\n Ceylan,\n 
Duygu and Huang,\n Chun-Hao P. and Mitra,\n Niloy J.\n},\n title = {\n Pix2Video: Video Editing using Image Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23206-23217\n} \n}" }, { "title": "Pixel Adaptive Deep Unfolding Transformer for Hyperspectral Image Reconstruction", @@ -43823,7 +45293,7 @@ "author": "Miaoyu Li; Ying Fu; Ji Liu; Yulun Zhang", "abstract": "Hyperspectral Image (HSI) reconstruction has made gratifying progress with the deep unfolding framework by formulating the problem into a data module and a prior module. Nevertheless, existing methods still face the problem of insufficient matching with HSI data. The issues lie in three aspects: 1) fixed gradient descent step in the data module while the degradation of HSI is agnostic in the pixel-level. 2) inadequate prior module for 3D HSI cube. 3) stage interaction ignoring the differences in features at different stages. To address these issues, in this work, we propose a Pixel Adaptive Deep Unfolding Transformer (PADUT) for HSI reconstruction. In the data module, a pixel adaptive descent step is employed to focus on pixel-level agnostic degradation. In the prior module, we introduce the Non-local Spectral Transformer (NST) to emphasize the 3D characteristics of HSI for recovering. Moreover, inspired by the diverse expression of features in different stages and depths, the stage interaction is improved by the Fast Fourier Transform (FFT). Experimental results on both simulated and real scenes exhibit the superior performance of our method compared to state-of-the-art HSI reconstruction methods. 
The code is released at: https://github.com/MyuLi/PADUT", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_Pixel_Adaptive_Deep_Unfolding_Transformer_for_Hyperspectral_Image_Reconstruction_ICCV_2023_paper.pdf", - "aff": "Beijing Institute of Technology; Beijing Institute of Technology; Baidu Inc.; ETH Z\u00fcrich", + "aff": "Beijing Institute of Technology; Beijing Institute of Technology; Baidu Inc.; ETH Zürich", "project": "", "github": "https://github.com/MyuLi/PADUT", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_Pixel_Adaptive_Deep_ICCV_2023_supplemental.pdf", @@ -43836,14 +45306,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Pixel_Adaptive_Deep_Unfolding_Transformer_for_Hyperspectral_Image_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2", - "aff_unique_norm": "Beijing Institute of Technology;Baidu;ETH Zurich", - "aff_unique_dep": ";Baidu Inc.;", + "aff_unique_norm": "Beijing Institute of Technology;Baidu Inc.;ETH Zürich", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.bit.edu.cn/;https://www.baidu.com;https://www.ethz.ch", "aff_unique_abbr": "BIT;Baidu;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Miaoyu and Fu,\n Ying and Liu,\n Ji and Zhang,\n Yulun\n},\n title = {\n Pixel Adaptive Deep Unfolding Transformer for Hyperspectral Image Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12959-12968\n} \n}" }, { "title": "Pixel-Aligned Recurrent Queries for Multi-View 3D Object Detection", @@ -43868,14 +45339,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Xie_Pixel-Aligned_Recurrent_Queries_for_Multi-View_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2", - "aff_unique_norm": "Northeastern University;California Institute of Technology;Meta", + "aff_unique_norm": "Northeastern University;California Institute of Technology;Meta Reality Labs", "aff_unique_dep": ";;Research", "aff_unique_url": "https://www.northeastern.edu;https://www.caltech.edu;https://www.meta.com", "aff_unique_abbr": "NEU;Caltech;MRL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Yiming and Jiang,\n Huaizu and Gkioxari,\n Georgia and Straub,\n Julian\n},\n title = {\n Pixel-Aligned Recurrent Queries for Multi-View 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18370-18380\n} \n}" }, { "title": "Pixel-Wise Contrastive Distillation", @@ -43887,7 +45359,7 @@ "author": "Junqiang Huang; Zichao Guo", "abstract": "We present a simple but effective pixel-level self-supervised distillation framework friendly to dense prediction tasks. Our method, called Pixel-Wise Contrastive Distillation (PCD), distills knowledge by attracting the corresponding pixels from student's and teacher's output feature maps. PCD includes a novel design called SpatialAdaptor which \"reshapes\" a part of the teacher network while preserving the distribution of its output features. Our ablation experiments suggest that this reshaping behavior enables more informative pixel-to-pixel distillation. 
Moreover, we utilize a plug-in multi-head self-attention module that explicitly relates the pixels of student's feature maps to enhance the effective receptive field, leading to a more competitive student. PCD outperforms previous self-supervised distillation methods on various dense prediction tasks. A backbone of ResNet-18-FPN distilled by PCD achieves 37.4 AP-bbox and 34.0 AP-mask on COCO dataset using the detector of Mask R-CNN. We hope our study will inspire future research on how to pre-train a small model friendly to dense prediction tasks in a self-supervised fashion.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Huang_Pixel-Wise_Contrastive_Distillation_ICCV_2023_paper.pdf", - "aff": "Shopee\u2020\u2021; Shopee\u2020", + "aff": "Shopee†‡; Shopee†", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Huang_Pixel-Wise_Contrastive_Distillation_ICCV_2023_supplemental.pdf", @@ -43907,7 +45379,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Junqiang and Guo,\n Zichao\n},\n title = {\n Pixel-Wise Contrastive Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16359-16369\n} \n}" }, { "title": "PlanarTrack: A Large-scale Challenging Benchmark for Planar Object Tracking", @@ -43939,7 +45412,8 @@ "aff_campus_unique_index": "0+0;1;1;1;0+0;1;1;1", "aff_campus_unique": "Beijing;Denton;", "aff_country_unique_index": "0+0;1;1;1;0+0;1;1;1", - "aff_country_unique": "China;United States;" + "aff_country_unique": "China;United States;", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Xinran and Liu,\n Xiaoqiong and Yi,\n Ziruo and Zhou,\n Xin and Le,\n Thanh and Zhang,\n Libo 
and Huang,\n Yan and Yang,\n Qing and Fan,\n Heng\n},\n title = {\n PlanarTrack: A Large-scale Challenging Benchmark for Planar Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20449-20458\n} \n}" }, { "title": "PlaneRecTR: Unified Query Learning for 3D Plane Recovery from a Single View", @@ -43971,7 +45445,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Jingjia and Zhi,\n Shuaifeng and Xu,\n Kai\n},\n title = {\n PlaneRecTR: Unified Query Learning for 3D Plane Recovery from a Single View\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9377-9386\n} \n}" }, { "title": "PlankAssembly: Robust 3D Reconstruction from Three Orthographic Views with Learnt Shape Programs", @@ -43996,14 +45471,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hu_PlankAssembly_Robust_3D_Reconstruction_from_Three_Orthographic_Views_with_Learnt_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;3;3;0+1;2", - "aff_unique_norm": "Sun Yat-sen University;Guangdong Key Laboratory of Big Data Analysis and Processing;Manycore Tech Inc.;University of Electronic Science and Technology of China", + "aff_unique_norm": "Sun Yat-Sen University;Guangdong Key Laboratory of Big Data Analysis and Processing;Manycore Tech Inc.;University of Electronic Science and Technology of China", "aff_unique_dep": ";Big Data Analysis and Processing;;", "aff_unique_url": "http://www.sysu.edu.cn/;;;https://www.uestc.edu.cn", "aff_unique_abbr": "SYSU;;;UESTC", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0;0;0+0;1", - 
"aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Wentao and Zheng,\n Jia and Zhang,\n Zixin and Yuan,\n Xiaojun and Yin,\n Jian and Zhou,\n Zihan\n},\n title = {\n PlankAssembly: Robust 3D Reconstruction from Three Orthographic Views with Learnt Shape Programs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18495-18505\n} \n}" }, { "title": "Plausible Uncertainties for Human Pose Regression", @@ -44011,8 +45487,8 @@ "status": "Poster", "track": "main", "pid": "12289", - "author_site": "Lennart Bramlage, Michelle Karg, Crist\u00f3bal Curio", - "author": "Lennart Bramlage; Michelle Karg; Crist\u00f3bal Curio", + "author_site": "Lennart Bramlage, Michelle Karg, Cristóbal Curio", + "author": "Lennart Bramlage; Michelle Karg; Cristóbal Curio", "abstract": "Human pose estimation (HPE) is integral to scene understanding in numerous safety-critical domains involving human-machine interaction, such as autonomous driving or semi-automated work environments. Avoiding costly mistakes is synonymous with anticipating failure in model predictions, which necessitates meta-judgments on the accuracy of the applied models. Here, we propose a straightforward human pose regression framework to examine the behavior of two established methods for simultaneous aleatoric and epistemic uncertainty estimation: maximum a-posteriori (MAP) estimation with Monte-Carlo variational inference and deep evidential regression (DER). First, we evaluate both approaches on the quality of their predicted variances and whether these truly capture the expected model error. The initial assessment indicates that both methods exhibit the overconfidence issue common in deep probabilistic models. 
This observation motivates our implementation of an additional recalibration step to extract reliable confidence intervals. We then take a closer look at deep evidential regression, which, to our knowledge, is applied comprehensively for the first time to the HPE problem. Experimental results indicate that DER behaves as expected in challenging and adverse conditions commonly occurring in HPE and that the predicted uncertainties match their purported aleatoric and epistemic sources. Notably, DER achieves smooth uncertainty estimates without the need for a costly sampling step, making it an attractive candidate for uncertainty estimation on resource-limited platforms.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bramlage_Plausible_Uncertainties_for_Human_Pose_Regression_ICCV_2023_paper.pdf", "aff": "Cognitive Systems Group, Reutlingen University, Germany; Continental AG; Cognitive Systems Group, Reutlingen University, Germany", @@ -44035,7 +45511,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Bramlage_2023_ICCV,\n \n author = {\n Bramlage,\n Lennart and Karg,\n Michelle and Curio,\n Crist\\'obal\n},\n title = {\n Plausible Uncertainties for Human Pose Regression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15133-15142\n} \n}" }, { "title": "Pluralistic Aging Diffusion Autoencoder", @@ -44067,7 +45544,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Peipei and Wang,\n Rui and Huang,\n Huaibo and He,\n Ran and He,\n Zhaofeng\n},\n title = {\n Pluralistic Aging Diffusion Autoencoder\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22613-22623\n} \n}" }, { "title": "Poincare ResNet", @@ -44090,7 +45568,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/van_Spengler_Poincare_ResNet_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/van_Spengler_Poincare_ResNet_ICCV_2023_paper.html", + "bibtex": "@InProceedings{van_Spengler_2023_ICCV,\n \n author = {\n van Spengler,\n Max and Berkhout,\n Erwin and Mettes,\n Pascal\n},\n title = {\n Poincare ResNet\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5419-5428\n} \n}" }, { "title": "Point Contrastive Prediction with Semantic Clustering for Self-Supervised Learning on Point Cloud Videos", @@ -44117,12 +45596,13 @@ "aff_unique_index": "0;0;0+1;1;2;3", "aff_unique_norm": "Shanghai Jiao Tong University;Aviation University of Air Force;Sun Yat-sen University;Zhejiang University", "aff_unique_dep": ";;;", - "aff_unique_url": "https://www.sjtu.edu.cn;;http://www.sysu.edu.cn/;https://www.zju.edu.cn", - "aff_unique_abbr": "SJTU;;SYSU;ZJU", + "aff_unique_url": "https://www.sjtu.edu.cn;http://www.auaf.edu.cn;http://www.sysu.edu.cn/;https://www.zju.edu.cn", + "aff_unique_abbr": "SJTU;AUAF;SYSU;ZJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sheng_2023_ICCV,\n \n author = {\n Sheng,\n Xiaoxiao and Shen,\n Zhiqiang and Xiao,\n Gang and Wang,\n Longguang and Guo,\n Yulan and Fan,\n Hehe\n},\n title = {\n Point Contrastive Prediction with Semantic Clustering for Self-Supervised Learning on Point Cloud Videos\n},\n booktitle = {\n Proceedings of 
the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16515-16524\n} \n}" }, { "title": "Point-Query Quadtree for Crowd Counting, Localization, and More", @@ -44147,14 +45627,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Point-Query_Quadtree_for_Crowd_Counting_Localization_and_More_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Huazhong University of Science and Technology;University of Sydney", + "aff_unique_norm": "Huazhong University of Science and Technology;The University of Sydney", "aff_unique_dep": "School of Artificial Intelligence and Automation;", "aff_unique_url": "http://www.hust.edu.cn;https://www.sydney.edu.au", "aff_unique_abbr": "HUST;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Chengxin and Lu,\n Hao and Cao,\n Zhiguo and Liu,\n Tongliang\n},\n title = {\n Point-Query Quadtree for Crowd Counting,\n Localization,\n and More\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1676-1685\n} \n}" }, { "title": "Point-SLAM: Dense Neural Point Cloud-based SLAM", @@ -44162,11 +45643,11 @@ "status": "Poster", "track": "main", "pid": "1873", - "author_site": "Erik Sandstr\u00f6m, Yue Li, Luc Van Gool, Martin R. Oswald", - "author": "Erik Sandstr\u00f6m; Yue Li; Luc Van Gool; Martin R. Oswald", + "author_site": "Erik Sandström, Yue Li, Luc Van Gool, Martin R. Oswald", + "author": "Erik Sandström; Yue Li; Luc Van Gool; Martin R. 
Oswald", "abstract": "We propose a dense neural simultaneous localization and mapping (SLAM) approach for monocular RGBD input which anchors the features of a neural scene representation in a point cloud that is iteratively generated in an input-dependent data-driven manner. We demonstrate that both tracking and mapping can be performed with the same point-based neural scene representation by minimizing an RGBD-based re-rendering loss. In contrast to recent dense neural SLAM methods which anchor the scene features in a sparse grid, our point-based approach allows to dynamically adapt the anchor point density to the information density of the input. This strategy reduces runtime and memory usage in regions with fewer details and dedicates higher point density to resolve fine details. Our approach performs either better or competitive to existing dense neural RGBD SLAM methods in tracking, mapping and rendering accuracy on the Replica, TUM-RGBD and ScanNet datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sandstrom_Point-SLAM_Dense_Neural_Point_Cloud-based_SLAM_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich, Switzerland; ETH Z\u00fcrich, Switzerland; ETH Z\u00fcrich, Switzerland+KU Leuven, Belgium; ETH Z\u00fcrich, Switzerland+University of Amsterdam, Netherlands", + "aff": "ETH Zürich, Switzerland; ETH Zürich, Switzerland; ETH Zürich, Switzerland+KU Leuven, Belgium; ETH Zürich, Switzerland+University of Amsterdam, Netherlands", "project": "", "github": "https://github.com/eriksandstroem/Point-SLAM", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Sandstrom_Point-SLAM_Dense_Neural_ICCV_2023_supplemental.zip", @@ -44179,14 +45660,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sandstrom_Point-SLAM_Dense_Neural_Point_Cloud-based_SLAM_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1;0+2", - "aff_unique_norm": "ETH Zurich;KU Leuven;University of Amsterdam", + "aff_unique_norm": "ETH 
Zürich;KU Leuven;University of Amsterdam", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.kuleuven.be;https://www.uva.nl", "aff_unique_abbr": "ETHZ;KU Leuven;UvA", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0+2", - "aff_country_unique": "Switzerland;Belgium;Netherlands" + "aff_country_unique": "Switzerland;Belgium;Netherlands", + "bibtex": "@InProceedings{Sandstrom_2023_ICCV,\n \n author = {\n Sandstr\\\"om,\n Erik and Li,\n Yue and Van Gool,\n Luc and Oswald,\n Martin R.\n},\n title = {\n Point-SLAM: Dense Neural Point Cloud-based SLAM\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18433-18444\n} \n}" }, { "title": "Point-TTA: Test-Time Adaptation for Point Cloud Registration Using Multitask Meta-Auxiliary Learning", @@ -44218,7 +45700,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Canada" + "aff_country_unique": "Canada", + "bibtex": "@InProceedings{Hatem_2023_ICCV,\n \n author = {\n Hatem,\n Ahmed and Qian,\n Yiming and Wang,\n Yang\n},\n title = {\n Point-TTA: Test-Time Adaptation for Point Cloud Registration Using Multitask Meta-Auxiliary Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16494-16504\n} \n}" }, { "title": "Point2Mask: Point-supervised Panoptic Segmentation via Optimal Transport", @@ -44250,7 +45733,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Wentong and Yuan,\n Yuqian and Wang,\n Song and Zhu,\n Jianke and Li,\n Jianshu and Liu,\n Jian and Zhang,\n 
Lei\n},\n title = {\n Point2Mask: Point-supervised Panoptic Segmentation via Optimal Transport\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 572-581\n} \n}" }, { "title": "PointCLIP V2: Prompting CLIP and GPT for Powerful 3D Open-world Learning", @@ -44275,14 +45759,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_PointCLIP_V2_Prompting_CLIP_and_GPT_for_Powerful_3D_Open-world_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;0;1+2;3;1;4;2", - "aff_unique_norm": "City University of Hong Kong;Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory;Yale University;Peking University", + "aff_unique_norm": "City University of Hong Kong;The Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory;Yale University;Peking University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.cuhk.edu.hk;http://www.shailab.org/;https://www.yale.edu;http://www.pku.edu.cn", "aff_unique_abbr": "CityU;CUHK;Shanghai AI Lab;Yale;Peking U", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0+0;0;0+0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Xiangyang and Zhang,\n Renrui and He,\n Bowei and Guo,\n Ziyu and Zeng,\n Ziyao and Qin,\n Zipeng and Zhang,\n Shanghang and Gao,\n Peng\n},\n title = {\n PointCLIP V2: Prompting CLIP and GPT for Powerful 3D Open-world Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2639-2650\n} \n}" }, { "title": "PointDC: Unsupervised Semantic Segmentation of 3D Point Clouds via Cross-Modal Distillation and Super-Voxel Clustering", 
@@ -44314,7 +45799,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zisheng and Xu,\n Hongbin and Chen,\n Weitao and Zhou,\n Zhipeng and Xiao,\n Haihong and Sun,\n Baigui and Xie,\n Xuansong and kang,\n Wenxiong\n},\n title = {\n PointDC: Unsupervised Semantic Segmentation of 3D Point Clouds via Cross-Modal Distillation and Super-Voxel Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14290-14299\n} \n}" }, { "title": "PointMBF: A Multi-scale Bidirectional Fusion Network for Unsupervised RGB-D Point Cloud Registration", @@ -44322,6 +45808,7 @@ "status": "Poster", "track": "main", "pid": "1765", + "author_site": "Mingzhi Yuan, Kexue Fu, Zhihao Li, Yucong Meng, Manning Wang", "author": "Mingzhi Yuan, Kexue Fu, Zhihao Li, Yucong Meng, Manning Wang", "abstract": "Point cloud registration is a task to estimate the rigid transformation between two unaligned scans, which plays an important role in many computer vision applications. Previous learning-based works commonly focus on supervised registration, which have limitations in practice. Recently, with the advance of inexpensive RGB-D sensors, several learning-based works utilize RGB-D data to achieve unsupervised registration. However, most of existing unsupervised methods follow a cascaded design or fuse RGB-D data in a unidirectional manner, which do not fully exploit the complementary information in the RGB-D data. To leverage the complementary information more effectively, we propose a network implementing multi-scale bidirectional fusion between RGB images and point clouds generated from depth images. 
By bidirectionally fusing visual and geometric features in multi-scales, more distinctive deep features for correspondence estimation can be obtained, making our registration more accurate. Extensive experiments on ScanNet and 3DMatch demonstrate that our method achieves new state-of-the-art performance. Code will be released at https://github.com/phdymz/PointMBF.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yuan_PointMBF_A_Multi-scale_Bidirectional_Fusion_Network_for_Unsupervised_RGB-D_Point_ICCV_2023_paper.pdf", @@ -44333,7 +45820,8 @@ "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13808121679718950635&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yuan_PointMBF_A_Multi-scale_Bidirectional_Fusion_Network_for_Unsupervised_RGB-D_Point_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yuan_PointMBF_A_Multi-scale_Bidirectional_Fusion_Network_for_Unsupervised_RGB-D_Point_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Mingzhi and Fu,\n Kexue and Li,\n Zhihao and Meng,\n Yucong and Wang,\n Manning\n},\n title = {\n PointMBF: A Multi-scale Bidirectional Fusion Network for Unsupervised RGB-D Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17694-17705\n} \n}" }, { "title": "PointOdyssey: A Large-Scale Synthetic Dataset for Long-Term Point Tracking", @@ -44365,7 +45853,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Yang and Harley,\n Adam W. 
and Shen,\n Bokui and Wetzstein,\n Gordon and Guibas,\n Leonidas J.\n},\n title = {\n PointOdyssey: A Large-Scale Synthetic Dataset for Long-Term Point Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19855-19865\n} \n}" }, { "title": "PolicyCleanse: Backdoor Detection and Mitigation for Competitive Reinforcement Learning", @@ -44397,7 +45886,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "College Park;;Riverside", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Junfeng and Li,\n Ang and Wang,\n Lixu and Liu,\n Cong\n},\n title = {\n PolicyCleanse: Backdoor Detection and Mitigation for Competitive Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4699-4708\n} \n}" }, { "title": "Ponder: Point Cloud Pre-training via Neural Rendering", @@ -44429,7 +45919,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Di and Peng,\n Sida and He,\n Tong and Yang,\n Honghui and Zhou,\n Xiaowei and Ouyang,\n Wanli\n},\n title = {\n Ponder: Point Cloud Pre-training via Neural Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16089-16098\n} \n}" }, { "title": "Pose-Free Neural Radiance Fields via Implicit Pose Regularization", @@ -44461,7 +45952,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;1;0;0;0;2;2;0", - "aff_country_unique": "Singapore;Germany;China" + "aff_country_unique": "Singapore;Germany;China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jiahui and Zhan,\n Fangneng and Yu,\n Yingchen and Liu,\n Kunhao and Wu,\n Rongliang and Zhang,\n Xiaoqin and Shao,\n Ling and Lu,\n Shijian\n},\n title = {\n Pose-Free Neural Radiance Fields via Implicit Pose Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3534-3543\n} \n}" }, { "title": "PoseDiffusion: Solving Pose Estimation via Diffusion-aided Bundle Adjustment", @@ -44486,14 +45978,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_PoseDiffusion_Solving_Pose_Estimation_via_Diffusion-aided_Bundle_Adjustment_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;1", - "aff_unique_norm": "University of Oxford;Meta", + "aff_unique_norm": "University of Oxford;Meta Platforms, Inc.", "aff_unique_dep": "Visual Geometry Group;Meta AI", "aff_unique_url": "https://www.ox.ac.uk;https://meta.com", "aff_unique_abbr": "Oxford;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0+1;0;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jianyuan and Rupprecht,\n Christian and Novotny,\n David\n},\n title = {\n PoseDiffusion: Solving Pose Estimation via Diffusion-aided Bundle Adjustment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9773-9783\n} \n}" }, { "title": "PoseFix: Correcting 3D Human Poses with Natural Language", @@ -44501,8 +45994,8 @@ "status": "Poster", "track": "main", "pid": "10123", - "author_site": "Ginger Delmas, 
Philippe Weinzaepfel, Francesc Moreno-Noguer, Gr\u00e9gory Rogez", - "author": "Ginger Delmas; Philippe Weinzaepfel; Francesc Moreno-Noguer; Gr\u00e9gory Rogez", + "author_site": "Ginger Delmas, Philippe Weinzaepfel, Francesc Moreno-Noguer, Grégory Rogez", + "author": "Ginger Delmas; Philippe Weinzaepfel; Francesc Moreno-Noguer; Grégory Rogez", "abstract": "Automatically producing instructions to modify one's posture could open the door to endless applications, such as personalized coaching and in-home physical therapy. Tackling the reverse problem (i.e., refining a 3D pose based on some natural language feedback) could help for assisted 3D character animation or robot teaching, for instance.\n Although a few recent works explore the connections between natural language and 3D human pose, none focus on describing 3D body pose differences. In this paper, we tackle the problem of correcting 3D human poses with natural language.\n To this end, we introduce the PoseFix dataset, which consists of several thousand paired 3D poses and their corresponding text feedback, that describe how the source pose needs to be modified to obtain the target pose. We demonstrate the potential of this dataset on two tasks: (1) text-based pose editing, that aims at generating corrected 3D body poses given a query pose and a text modifier; and (2) correctional text generation, where instructions are generated based on the differences between two body poses. 
The dataset and the code are available at https://europe.naverlabs.com/research/computer-vision/posefix/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Delmas_PoseFix_Correcting_3D_Human_Poses_with_Natural_Language_ICCV_2023_paper.pdf", "aff": "Institut de Rob `otica i Inform `atica Industrial, CSIC-UPC, Barcelona, Spain+NA VER LABS Europe; NA VER LABS Europe; Institut de Rob `otica i Inform `atica Industrial, CSIC-UPC, Barcelona, Spain; NA VER LABS Europe", @@ -44518,14 +46011,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Delmas_PoseFix_Correcting_3D_Human_Poses_with_Natural_Language_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;0;1", - "aff_unique_norm": "Institut de Rob\u00f2tica i Inform\u00e0tica Industrial;NAVER LABS Europe", + "aff_unique_norm": "Institut de Robòtica i Informàtica Industrial;NAVER LABS Europe", "aff_unique_dep": "CSIC-UPC;", "aff_unique_url": "http://www.iri.upc.edu/;https://www.naverlabs.com/europe", "aff_unique_abbr": "IRI;NAVER LABS Europe", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Barcelona;", "aff_country_unique_index": "0+1;1;0;1", - "aff_country_unique": "Spain;Unknown" + "aff_country_unique": "Spain;Unknown", + "bibtex": "@InProceedings{Delmas_2023_ICCV,\n \n author = {\n Delmas,\n Ginger and Weinzaepfel,\n Philippe and Moreno-Noguer,\n Francesc and Rogez,\n Gr\\'egory\n},\n title = {\n PoseFix: Correcting 3D Human Poses with Natural Language\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15018-15028\n} \n}" }, { "title": "PourIt!: Weakly-Supervised Liquid Perception from a Single Image for Visual Closed-Loop Robotic Pouring", @@ -44537,7 +46031,7 @@ "author": "Haitao Lin; Yanwei Fu; Xiangyang Xue", "abstract": "Liquid perception is critical for robotic pouring tasks. It usually requires the robust visual detection of flowing liquid. 
However, while recent works have shown promising results in liquid perception, they typically require labeled data for model training, a process that is both time-consuming and reliant on human labor. To this end, this paper proposes a simple yet effective framework PourIt!, to serve as a tool for robotic pouring tasks. We design a simple data collection pipeline that only needs image-level labels to reduce the reliance on tedious pixel-wise annotations. Then, a binary classification model is trained to generate Class Activation Map (CAM) that focuses on the visual difference between these two kinds of collected data, i.e., the existence of liquid drop or not. We also devise a feature contrast strategy to improve the quality of the CAM, thus entirely and tightly covering the actual liquid regions. Then, the container pose is further utilized to facilitate the 3D point cloud recovery of the detected liquid region. Finally, the liquid-to-container distance is calculated for visual closed-loop control of the physical robot. To validate the effectiveness of our proposed method, we also contribute a novel dataset for our task and name it PourIt! dataset. Extensive results on this dataset and physical Franka robot have shown the utility and effectiveness of our method in the robotic pouring tasks. 
Our dataset, code and pre-trained models will be available on the project page.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lin_PourIt_Weakly-Supervised_Liquid_Perception_from_a_Single_Image_for_Visual_ICCV_2023_paper.pdf", - "aff": "Academy for Engineering and Technology, and Engineering Research Center of AI and Robotics, Fudan University; School of Data Science, Fudan University, and Fudan ISTBI\u2014ZJNU Algorithm Centre for Brain-inspired Intelligence, Zhejiang Normal University, Jinhua, China; Fudan University", + "aff": "Academy for Engineering and Technology, and Engineering Research Center of AI and Robotics, Fudan University; School of Data Science, Fudan University, and Fudan ISTBI—ZJNU Algorithm Centre for Brain-inspired Intelligence, Zhejiang Normal University, Jinhua, China; Fudan University", "project": "https://hetolin.github.io/PourIt", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Lin_PourIt_Weakly-Supervised_Liquid_ICCV_2023_supplemental.zip", @@ -44557,7 +46051,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Fudan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Haitao and Fu,\n Yanwei and Xue,\n Xiangyang\n},\n title = {\n PourIt!: Weakly-Supervised Liquid Perception from a Single Image for Visual Closed-Loop Robotic Pouring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 241-251\n} \n}" }, { "title": "Practical Membership Inference Attacks Against Large-Scale Multi-Modal Models: A Pilot Study", @@ -44589,7 +46084,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";St. 
Louis", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ko_2023_ICCV,\n \n author = {\n Ko,\n Myeongseob and Jin,\n Ming and Wang,\n Chenguang and Jia,\n Ruoxi\n},\n title = {\n Practical Membership Inference Attacks Against Large-Scale Multi-Modal Models: A Pilot Study\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4871-4881\n} \n}" }, { "title": "Pre-Training-Free Image Manipulation Localization through Non-Mutually Exclusive Contrastive Learning", @@ -44614,14 +46110,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_Pre-Training-Free_Image_Manipulation_Localization_through_Non-Mutually_Exclusive_Contrastive_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "Sichuan University;Xiamen University of Technology;Mohamed bin Zayed University of Artificial Intelligence", + "aff_unique_norm": "Sichuan University;Xiamen University of Technology;Mohamed Bin Zayed University for Humanities", "aff_unique_dep": "College of Computer Science;School of Computer and Information Engineering;Strategy Affairs Office", "aff_unique_url": "https://www.scu.edu.cn;;https://www.mbzuh.edu.ae", "aff_unique_abbr": ";;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "China;United Arab Emirates" + "aff_country_unique": "China;United Arab Emirates", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Jizhe and Ma,\n Xiaochen and Du,\n Xia and Alhammadi,\n Ahmed Y. 
and Feng,\n Wentao\n},\n title = {\n Pre-Training-Free Image Manipulation Localization through Non-Mutually Exclusive Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22346-22356\n} \n}" }, { "title": "Pre-training Vision Transformers with Very Limited Synthesized Images", @@ -44644,7 +46141,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nakamura_Pre-training_Vision_Transformers_with_Very_Limited_Synthesized_Images_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Nakamura_Pre-training_Vision_Transformers_with_Very_Limited_Synthesized_Images_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Nakamura_2023_ICCV,\n \n author = {\n Nakamura,\n Ryo and Kataoka,\n Hirokatsu and Takashima,\n Sora and Noriega,\n Edgar Josafat Martinez and Yokota,\n Rio and Inoue,\n Nakamasa\n},\n title = {\n Pre-training Vision Transformers with Very Limited Synthesized Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20360-20369\n} \n}" }, { "title": "PreSTU: Pre-Training for Scene-Text Understanding", @@ -44669,14 +46167,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kil_PreSTU_Pre-Training_for_Scene-Text_Understanding_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;0;1", - "aff_unique_norm": "Ohio State University;Google", + "aff_unique_norm": "The Ohio State University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.osu.edu;https://research.google", "aff_unique_abbr": "OSU;Google Research", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United 
States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kil_2023_ICCV,\n \n author = {\n Kil,\n Jihyung and Changpinyo,\n Soravit and Chen,\n Xi and Hu,\n Hexiang and Goodman,\n Sebastian and Chao,\n Wei-Lun and Soricut,\n Radu\n},\n title = {\n PreSTU: Pre-Training for Scene-Text Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15270-15280\n} \n}" }, { "title": "Predict to Detect: Prediction-guided 3D Object Detection using Sequential Images", @@ -44708,7 +46207,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Sanmin and Kim,\n Youngseok and Lee,\n In-Jae and Kum,\n Dongsuk\n},\n title = {\n Predict to Detect: Prediction-guided 3D Object Detection using Sequential Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18057-18066\n} \n}" }, { "title": "Preface: A Data-driven Volumetric Prior for Few-shot Ultra High-resolution Face Synthesis", @@ -44716,8 +46216,8 @@ "status": "Poster", "track": "main", "pid": "10231", - "author_site": "Marcel C. B\u00fchler, Kripasindhu Sarkar, Tanmay Shah, Gengyan Li, Daoye Wang, Leonhard Helminger, Sergio Orts-Escolano, Dmitry Lagun, Otmar Hilliges, Thabo Beeler, Abhimitra Meka", - "author": "Marcel C. B\u00fchler; Kripasindhu Sarkar; Tanmay Shah; Gengyan Li; Daoye Wang; Leonhard Helminger; Sergio Orts-Escolano; Dmitry Lagun; Otmar Hilliges; Thabo Beeler; Abhimitra Meka", + "author_site": "Marcel C. 
Bühler, Kripasindhu Sarkar, Tanmay Shah, Gengyan Li, Daoye Wang, Leonhard Helminger, Sergio Orts-Escolano, Dmitry Lagun, Otmar Hilliges, Thabo Beeler, Abhimitra Meka", + "author": "Marcel C. Bühler; Kripasindhu Sarkar; Tanmay Shah; Gengyan Li; Daoye Wang; Leonhard Helminger; Sergio Orts-Escolano; Dmitry Lagun; Otmar Hilliges; Thabo Beeler; Abhimitra Meka", "abstract": "NeRFs have enabled highly realistic synthesis of human faces including complex appearance and reflectance effects of hair and skin. These methods typically require a large number of multi-view input images, making the process hardware intensive and cumbersome, limiting applicability to unconstrained settings. We propose a novel volumetric human face prior that enables the synthesis of ultra high-resolution novel views of subjects that are not part of the prior's training distribution. This prior model consists of an identity-conditioned NeRF, trained on a dataset of low-resolution multi-view images of diverse humans with known camera calibration. A simple sparse landmark-based 3D alignment of the training dataset allows our model to learn a smooth latent space of geometry and appearance despite a limited number of training identities. A high-quality volumetric representation of a novel subject can be obtained by model fitting to 2 or 3 camera views of arbitrary resolution. 
Importantly, our method requires as few as two views of casually captured images as input at inference time.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Buhler_Preface_A_Data-driven_Volumetric_Prior_for_Few-shot_Ultra_High-resolution_Face_ICCV_2023_paper.pdf", "aff": ";;;;;;;;;;", @@ -44731,7 +46231,8 @@ "aff_domain": ";;;;;;;;;;", "email": ";;;;;;;;;;", "author_num": 11, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Buhler_Preface_A_Data-driven_Volumetric_Prior_for_Few-shot_Ultra_High-resolution_Face_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Buhler_Preface_A_Data-driven_Volumetric_Prior_for_Few-shot_Ultra_High-resolution_Face_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Buhler_2023_ICCV,\n \n author = {\n B\\\"uhler,\n Marcel C. and Sarkar,\n Kripasindhu and Shah,\n Tanmay and Li,\n Gengyan and Wang,\n Daoye and Helminger,\n Leonhard and Orts-Escolano,\n Sergio and Lagun,\n Dmitry and Hilliges,\n Otmar and Beeler,\n Thabo and Meka,\n Abhimitra\n},\n title = {\n Preface: A Data-driven Volumetric Prior for Few-shot Ultra High-resolution Face Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3402-3413\n} \n}" }, { "title": "Preparing the Future for Continual Semantic Segmentation", @@ -44763,7 +46264,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Zihan and Wang,\n Zilei and Zhang,\n Yixin\n},\n title = {\n Preparing the Future for Continual Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11910-11920\n} \n}" }, { "title": "Preserve Your 
Own Correlation: A Noise Prior for Video Diffusion Models", @@ -44788,14 +46290,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ge_Preserve_Your_Own_Correlation_A_Noise_Prior_for_Video_Diffusion_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;1;1;0;0;1;1", - "aff_unique_norm": "University of Maryland;NVIDIA;University of Chicago", - "aff_unique_dep": ";NVIDIA Corporation;", + "aff_unique_norm": "University of Maryland;NVIDIA Corporation;University of Chicago", + "aff_unique_dep": ";;", "aff_unique_url": "https://www/umd.edu;https://www.nvidia.com;https://www.uchicago.edu", "aff_unique_abbr": "UMD;NVIDIA;UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ge_2023_ICCV,\n \n author = {\n Ge,\n Songwei and Nah,\n Seungjun and Liu,\n Guilin and Poon,\n Tyler and Tao,\n Andrew and Catanzaro,\n Bryan and Jacobs,\n David and Huang,\n Jia-Bin and Liu,\n Ming-Yu and Balaji,\n Yogesh\n},\n title = {\n Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22930-22941\n} \n}" }, { "title": "Preserving Modality Structure Improves Multi-Modal Learning", @@ -44821,13 +46324,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Swetha_Preserving_Modality_Structure_Improves_Multi-Modal_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;1+2;1+2+3;0", "aff_unique_norm": "University of Central Florida;Goethe University Frankfurt;University of Bonn;Massachusetts Institute of Technology", - "aff_unique_dep": "Center for Research in Computer Vision;;;IBM Watson AI Lab", + "aff_unique_dep": "CRCV;;;IBM Watson AI Lab", "aff_unique_url": 
"https://www.ucf.edu;https://www.uni-frankfurt.de;https://www.uni-bonn.de;https://www.mitibmwatsonailab.org", "aff_unique_abbr": "UCF;GU Frankfurt;UBonn;MIT-IBM AI Lab", - "aff_campus_unique_index": "0;0;1;1;0", - "aff_campus_unique": "Orlando;Frankfurt;", + "aff_campus_unique_index": "1;1", + "aff_campus_unique": ";Frankfurt", "aff_country_unique_index": "0;0;1+1;1+1+0;0", - "aff_country_unique": "United States;Germany" + "aff_country_unique": "United States;Germany", + "bibtex": "@InProceedings{Swetha_2023_ICCV,\n \n author = {\n Swetha,\n Sirnam and Rizve,\n Mamshad Nayeem and Shvetsova,\n Nina and Kuehne,\n Hilde and Shah,\n Mubarak\n},\n title = {\n Preserving Modality Structure Improves Multi-Modal Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21993-22003\n} \n}" }, { "title": "Preserving Tumor Volumes for Unsupervised Medical Image Registration", @@ -44859,7 +46363,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Qihua and Du,\n Hao and Song,\n Ying and Xu,\n Yan and Liao,\n Jing\n},\n title = {\n Preserving Tumor Volumes for Unsupervised Medical Image Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21208-21218\n} \n}" }, { "title": "Pretrained Language Models as Visual Planners for Human Assistance", @@ -44884,14 +46389,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Patel_Pretrained_Language_Models_as_Visual_Planners_for_Human_Assistance_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;0;0+1", - "aff_unique_norm": "Meta;University of Massachusetts Amherst", 
- "aff_unique_dep": "Meta Platforms, Inc.;", + "aff_unique_norm": "Meta Platforms, Inc.;University of Massachusetts Amherst", + "aff_unique_dep": ";", "aff_unique_url": "https://meta.com;https://www.umass.edu", "aff_unique_abbr": "Meta;UMass Amherst", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0+0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Patel_2023_ICCV,\n \n author = {\n Patel,\n Dhruvesh and Eghbalzadeh,\n Hamid and Kamra,\n Nitin and Iuzzolino,\n Michael Louis and Jain,\n Unnat and Desai,\n Ruta\n},\n title = {\n Pretrained Language Models as Visual Planners for Human Assistance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15302-15314\n} \n}" }, { "title": "Preventing Zero-Shot Transfer Degradation in Continual Learning of Vision-Language Models", @@ -44916,14 +46422,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_Preventing_Zero-Shot_Transfer_Degradation_in_Continual_Learning_of_Vision-Language_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;2;0", - "aff_unique_norm": "National University of Singapore;University of California, Berkeley;Chinese University of Hong Kong", + "aff_unique_norm": "National University of Singapore;University of California, Berkeley;The Chinese University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.berkeley.edu;https://www.cuhk.edu.hk", "aff_unique_abbr": "NUS;UC Berkeley;CUHK", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Hong Kong SAR", "aff_country_unique_index": "0;1;0;0;2;0", - "aff_country_unique": "Singapore;United States;China" + "aff_country_unique": "Singapore;United States;China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n 
Zangwei and Ma,\n Mingyuan and Wang,\n Kai and Qin,\n Ziheng and Yue,\n Xiangyu and You,\n Yang\n},\n title = {\n Preventing Zero-Shot Transfer Degradation in Continual Learning of Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19125-19136\n} \n}" }, { "title": "Prior-guided Source-free Domain Adaptation for Human Pose Estimation", @@ -44948,14 +46455,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Raychaudhuri_Prior-guided_Source-free_Domain_Adaptation_for_Human_Pose_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;0", - "aff_unique_norm": "University of California, Riverside;Amazon", + "aff_unique_norm": "University of California, Riverside;Amazon Web Services", "aff_unique_dep": ";AWS AI Labs", "aff_unique_url": "https://www.ucr.edu;https://aws.amazon.com", "aff_unique_abbr": "UCR;AWS", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Riverside;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Raychaudhuri_2023_ICCV,\n \n author = {\n Raychaudhuri,\n Dripta S. 
and Ta,\n Calvin-Khang and Dutta,\n Arindam and Lal,\n Rohit and Roy-Chowdhury,\n Amit K.\n},\n title = {\n Prior-guided Source-free Domain Adaptation for Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14996-15006\n} \n}" }, { "title": "Priority-Centric Human Motion Generation in Discrete Latent Space", @@ -44980,14 +46488,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kong_Priority-Centric_Human_Motion_Generation_in_Discrete_Latent_Space_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "National University of Singapore;Huawei", - "aff_unique_dep": ";Huawei International Pte Ltd", + "aff_unique_norm": "National University of Singapore;Huawei International Pte Ltd", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.huawei.com/en/", "aff_unique_abbr": "NUS;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Kong_2023_ICCV,\n \n author = {\n Kong,\n Hanyang and Gong,\n Kehong and Lian,\n Dongze and Mi,\n Michael Bi and Wang,\n Xinchao\n},\n title = {\n Priority-Centric Human Motion Generation in Discrete Latent Space\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14806-14816\n} \n}" }, { "title": "Privacy Preserving Localization via Coordinate Permutations", @@ -44995,8 +46504,8 @@ "status": "Poster", "track": "main", "pid": "7428", - "author_site": "Linfei Pan, Johannes L. Sch\u00f6nberger, Viktor Larsson, Marc Pollefeys", - "author": "Linfei Pan; Johannes L. Sch\u00f6nberger; Viktor Larsson; Marc Pollefeys", + "author_site": "Linfei Pan, Johannes L. 
Schönberger, Viktor Larsson, Marc Pollefeys", + "author": "Linfei Pan; Johannes L. Schönberger; Viktor Larsson; Marc Pollefeys", "abstract": "Recent methods on privacy-preserving image-based localization use a random line parameterization to protect the privacy of query images and database maps. The lifting of points to lines effectively drops one of the two geometric constraints traditionally used with point-to-point correspondences in structure-based localization. This leads to a significant loss of accuracy for the privacy-preserving methods. In this paper, we overcome this limitation by devising a coordinate permutation scheme that allows for recovering the original point positions during pose estimation. The recovered points provide the full 2D geometric constraints and enable us to close the gap between privacy-preserving and traditional methods in terms of accuracy. Another limitation of random line methods is their vulnerability to density based 3D line cloud inversion attacks. Our method not only provides better accuracy than the original random line based approach but also provides stronger privacy guarantees against these recently proposed attacks. 
Extensive experiments on standard benchmark datasets demonstrate these improvements consistently across both scenarios of protecting the privacy of query images as well as the database map.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Pan_Privacy_Preserving_Localization_via_Coordinate_Permutations_ICCV_2023_paper.pdf", "aff": "ETH Zurich; Microsoft; Lund University; ETH Zurich+Microsoft", @@ -45012,14 +46521,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Privacy_Preserving_Localization_via_Coordinate_Permutations_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0+1", - "aff_unique_norm": "ETH Zurich;Microsoft;Lund University", - "aff_unique_dep": ";Microsoft Corporation;", + "aff_unique_norm": "ETH Zurich;Microsoft Corporation;Lund University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.microsoft.com;https://www.lunduniversity.lu.se", "aff_unique_abbr": "ETHZ;Microsoft;LU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0+1", - "aff_country_unique": "Switzerland;United States;Sweden" + "aff_country_unique": "Switzerland;United States;Sweden", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Linfei and Sch\\\"onberger,\n Johannes L. 
and Larsson,\n Viktor and Pollefeys,\n Marc\n},\n title = {\n Privacy Preserving Localization via Coordinate Permutations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18174-18183\n} \n}" }, { "title": "Privacy-Preserving Face Recognition Using Random Frequency Components", @@ -45051,7 +46561,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Mi_2023_ICCV,\n \n author = {\n Mi,\n Yuxi and Huang,\n Yuge and Ji,\n Jiazhen and Zhao,\n Minyi and Wu,\n Jiaxiang and Xu,\n Xingkun and Ding,\n Shouhong and Zhou,\n Shuigeng\n},\n title = {\n Privacy-Preserving Face Recognition Using Random Frequency Components\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19673-19684\n} \n}" }, { "title": "ProPainter: Improving Propagation and Transformer for Video Inpainting", @@ -45083,7 +46594,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Shangchen and Li,\n Chongyi and Chan,\n Kelvin C.K. 
and Loy,\n Chen Change\n},\n title = {\n ProPainter: Improving Propagation and Transformer for Video Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10477-10486\n} \n}" }, { "title": "ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models", @@ -45095,7 +46607,7 @@ "author": "Uddeshya Upadhyay; Shyamgopal Karthik; Massimiliano Mancini; Zeynep Akata", "abstract": "Large-scale vision-language models (VLMs) like CLIP successfully find correspondences between images and text. Through the standard deterministic mapping process, an image or a text sample is mapped to a single vector in the embedding space. This is problematic: as multiple samples (images or text) can abstract the same concept in the physical world, deterministic embeddings do not reflect the inherent ambiguity in the embedding space. We propose ProbVLM, a probabilistic adapter that estimates probability distributions for the embeddings of pre-trained VLMs via inter/intra-modal alignment in a post-hoc manner without needing large-scale datasets or computing. On four challenging datasets, i.e., COCO, Flickr, CUB, and Oxford-flowers, we estimate the multi-modal embedding uncertainties for two VLMs, i.e., CLIP and BLIP, quantify the calibration of embedding uncertainties in retrieval tasks and show that ProbVLM outperforms other methods. Furthermore, we propose active learning and model selection as two real-world downstream tasks for VLMs and show that the estimated uncertainty aids both tasks. 
Lastly, we present a novel technique for visualizing the embedding distributions using a large-scale pre-trained latent diffusion model.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Upadhyay_ProbVLM_Probabilistic_Adapter_for_Frozen_Vison-Language_Models_ICCV_2023_paper.pdf", - "aff": "University of T\u00fcbingen; University of T\u00fcbingen; University of Trento; University of T\u00fcbingen+MPI for Intelligent Systems", + "aff": "University of Tübingen; University of Tübingen; University of Trento; University of Tübingen+MPI for Intelligent Systems", "project": "", "github": "https://github.com/ExplainableML/ProbVLM", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Upadhyay_ProbVLM_Probabilistic_Adapter_ICCV_2023_supplemental.pdf", @@ -45108,14 +46620,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Upadhyay_ProbVLM_Probabilistic_Adapter_for_Frozen_Vison-Language_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0+2", - "aff_unique_norm": "University of T\u00fcbingen;University of Trento;Max Planck Institute for Intelligent Systems", + "aff_unique_norm": "University of Tübingen;University of Trento;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.unitn.it;https://www.mpi-is.mpg.de", - "aff_unique_abbr": "Uni T\u00fcbingen;UniTN;MPI-IS", + "aff_unique_abbr": "Uni Tübingen;UniTN;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+0", - "aff_country_unique": "Germany;Italy" + "aff_country_unique": "Germany;Italy", + "bibtex": "@InProceedings{Upadhyay_2023_ICCV,\n \n author = {\n Upadhyay,\n Uddeshya and Karthik,\n Shyamgopal and Mancini,\n Massimiliano and Akata,\n Zeynep\n},\n title = {\n ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1899-1910\n} \n}" }, { "title": "Probabilistic Human Mesh Recovery in 3D Scenes from Egocentric Views", @@ -45127,7 +46640,7 @@ "author": "Siwei Zhang; Qianli Ma; Yan Zhang; Sadegh Aliakbarian; Darren Cosker; Siyu Tang", "abstract": "Automatic perception of human behaviors during social interactions is crucial for AR/VR applications, and an essential component is estimation of plausible 3D human pose and shape of our social partners from the egocentric view. One of the biggest challenges of this task is severe body truncation due to close social distances in egocentric scenarios, which brings large pose ambiguities for unseen body parts. To tackle this challenge, we propose a novel scene-conditioned diffusion method to model the body pose distribution. Conditioned on the 3D scene geometry, the diffusion model generates bodies in plausible human-scene interactions, with the sampling guided by a physics-based collision score to further resolve human-scene interpenetrations. The classifier-free training enables flexible sampling with different conditions and enhanced diversity. A visibility-aware graph convolution model guided by per-joint visibility serves as the diffusion denoiser to incorporate inter-joint dependencies and per-body-part control. Extensive evaluations show that our method generates bodies in plausible interactions with 3D scenes, achieving both superior accuracy for visible joints and diversity for invisible body parts. 
The code is available at https://sanweiliti.github.io/egohmr/egohmr.html.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_Probabilistic_Human_Mesh_Recovery_in_3D_Scenes_from_Egocentric_Views_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich + Max Planck Institute for Intelligent Systems; ETH Z\u00fcrich; Microsoft; Microsoft; ETH Z\u00fcrich", + "aff": "ETH Zürich; ETH Zürich + Max Planck Institute for Intelligent Systems; ETH Zürich; Microsoft; Microsoft; ETH Zürich", "project": "", "github": "https://sanweiliti.github.io/egohmr/egohmr.html", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhang_Probabilistic_Human_Mesh_ICCV_2023_supplemental.zip", @@ -45140,14 +46653,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Probabilistic_Human_Mesh_Recovery_in_3D_Scenes_from_Egocentric_Views_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;2;2;0", - "aff_unique_norm": "ETH Zurich;Max Planck Institute for Intelligent Systems;Microsoft", - "aff_unique_dep": ";Intelligent Systems;Microsoft Corporation", + "aff_unique_norm": "ETH Zürich;Max Planck Institute for Intelligent Systems;Microsoft Corporation", + "aff_unique_dep": ";Intelligent Systems;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de;https://www.microsoft.com", "aff_unique_abbr": "ETHZ;MPI-IS;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;2;2;0", - "aff_country_unique": "Switzerland;Germany;United States" + "aff_country_unique": "Switzerland;Germany;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Siwei and Ma,\n Qianli and Zhang,\n Yan and Aliakbarian,\n Sadegh and Cosker,\n Darren and Tang,\n Siyu\n},\n title = {\n Probabilistic Human Mesh Recovery in 3D Scenes from Egocentric Views\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 7989-8000\n} \n}" }, { "title": "Probabilistic Modeling of Inter- and Intra-observer Variability in Medical Image Segmentation", @@ -45155,8 +46669,8 @@ "status": "Poster", "track": "main", "pid": "10858", - "author_site": "Arne Schmidt, Pablo Morales-\u00c1lvarez, Rafael Molina", - "author": "Arne Schmidt; Pablo Morales-\u00c1lvarez; Rafael Molina", + "author_site": "Arne Schmidt, Pablo Morales-Álvarez, Rafael Molina", + "author": "Arne Schmidt; Pablo Morales-Álvarez; Rafael Molina", "abstract": "Medical image segmentation is a challenging task, particularly due to inter- and intra-observer variability, even between medical experts. In this paper, we propose a novel model, called Probabilistic Inter-Observer and iNtra-Observer variation NetwOrk (Pionono). It captures the labeling behavior of each rater with a multidimensional probability distribution and integrates this information with the feature maps of the image to produce probabilistic segmentation predictions. The model is optimized by variational inference and can be trained end-to-end. It outperforms state-of-the-art models such as STAPLE, Probabilistic U-Net, and models based on confusion matrices. Additionally, Pionono predicts multiple coherent segmentation maps that mimic the rater's expert opinion, which provides additional valuable information for the diagnostic process. 
Experiments on real-world cancer segmentation datasets demonstrate the high accuracy and efficiency of Pionono, making it a powerful tool for medical image analysis.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Schmidt_Probabilistic_Modeling_of_Inter-_and_Intra-observer_Variability_in_Medical_Image_ICCV_2023_paper.pdf", "aff": "Universidad de Granada, Granada, Spain; Universidad de Granada, Granada, Spain; Universidad de Granada, Granada, Spain", @@ -45179,7 +46693,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Granada", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Spain" + "aff_country_unique": "Spain", + "bibtex": "@InProceedings{Schmidt_2023_ICCV,\n \n author = {\n Schmidt,\n Arne and Morales-\\'Alvarez,\n Pablo and Molina,\n Rafael\n},\n title = {\n Probabilistic Modeling of Inter- and Intra-observer Variability in Medical Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21097-21106\n} \n}" }, { "title": "Probabilistic Precision and Recall Towards Reliable Evaluation of Generative Models", @@ -45211,7 +46726,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Dogyun and Kim,\n Suhyun\n},\n title = {\n Probabilistic Precision and Recall Towards Reliable Evaluation of Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20099-20109\n} \n}" }, { "title": "Probabilistic Triangulation for Uncalibrated Multi-View 3D Human Pose Estimation", @@ -45243,7 +46759,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Boyuan and Hu,\n Lei and Xia,\n Shihong\n},\n title = {\n Probabilistic Triangulation for Uncalibrated Multi-View 3D Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14850-14860\n} \n}" }, { "title": "Progressive Spatio-Temporal Prototype Matching for Text-Video Retrieval", @@ -45275,7 +46792,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Pandeng and Xie,\n Chen-Wei and Zhao,\n Liming and Xie,\n Hongtao and Ge,\n Jiannan and Zheng,\n Yun and Zhao,\n Deli and Zhang,\n Yongdong\n},\n title = {\n Progressive Spatio-Temporal Prototype Matching for Text-Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4100-4110\n} \n}" }, { "title": "Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval", @@ -45307,7 +46825,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Adelaide;", "aff_country_unique_index": "0;0;1;2;0", - "aff_country_unique": "Australia;China;United Kingdom" + "aff_country_unique": "Australia;China;United Kingdom", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n Chaorui and Chen,\n Qi and Qin,\n Pengda and Chen,\n Da and Wu,\n Qi\n},\n title = {\n Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15648-15658\n} \n}" }, { "title": "Prompt Tuning 
Inversion for Text-driven Image Editing Using Diffusion Models", @@ -45339,7 +46858,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Wenkai and Xue,\n Song and Duan,\n Xiaoyue and Han,\n Shumin\n},\n title = {\n Prompt Tuning Inversion for Text-driven Image Editing Using Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7430-7440\n} \n}" }, { "title": "Prompt-aligned Gradient for Prompt Tuning", @@ -45371,7 +46891,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0", - "aff_country_unique": "Singapore;United States;China" + "aff_country_unique": "Singapore;United States;China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Beier and Niu,\n Yulei and Han,\n Yucheng and Wu,\n Yue and Zhang,\n Hanwang\n},\n title = {\n Prompt-aligned Gradient for Prompt Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15659-15669\n} \n}" }, { "title": "PromptCap: Prompt-Guided Image Captioning for VQA with GPT-3", @@ -45396,14 +46917,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hu_PromptCap_Prompt-Guided_Image_Captioning_for_VQA_with_GPT-3_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0;0;1", - "aff_unique_norm": "University of Washington;University of Rochester;Microsoft", - "aff_unique_dep": ";;Microsoft Corporation", + "aff_unique_norm": "University of Washington;University of Rochester;Microsoft Corporation", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.washington.edu;https://www.rochester.edu;https://www.microsoft.com", 
"aff_unique_abbr": "UW;U of R;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Yushi and Hua,\n Hang and Yang,\n Zhengyuan and Shi,\n Weijia and Smith,\n Noah A. and Luo,\n Jiebo\n},\n title = {\n PromptCap: Prompt-Guided Image Captioning for VQA with GPT-3\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2963-2975\n} \n}" }, { "title": "PromptStyler: Prompt-driven Style Generation for Source-free Domain Generalization", @@ -45426,7 +46948,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cho_PromptStyler_Prompt-driven_Style_Generation_for_Source-free_Domain_Generalization_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cho_PromptStyler_Prompt-driven_Style_Generation_for_Source-free_Domain_Generalization_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Cho_2023_ICCV,\n \n author = {\n Cho,\n Junhyeong and Nam,\n Gilhyun and Kim,\n Sungyeon and Yang,\n Hunmin and Kwak,\n Suha\n},\n title = {\n PromptStyler: Prompt-driven Style Generation for Source-free Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15702-15712\n} \n}" }, { "title": "ProtoFL: Unsupervised Federated Learning via Prototypical Distillation", @@ -45449,7 +46972,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_ProtoFL_Unsupervised_Federated_Learning_via_Prototypical_Distillation_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Kim_ProtoFL_Unsupervised_Federated_Learning_via_Prototypical_Distillation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Hansol and Kwak,\n Youngjun and Jung,\n Minyoung and Shin,\n Jinho and Kim,\n Youngsung and Kim,\n Changick\n},\n title = {\n ProtoFL: Unsupervised Federated Learning via Prototypical Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6470-6479\n} \n}" }, { "title": "ProtoTransfer: Cross-Modal Prototype Transfer for Point Cloud Segmentation", @@ -45478,10 +47002,11 @@ "aff_unique_dep": "AI Institute;Australian Institute for Machine Learning", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.adelaide.edu.au", "aff_unique_abbr": "SJTU;Adelaide", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0", + "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Pin and Xu,\n Hai-Ming and Ma,\n Chao\n},\n title = {\n ProtoTransfer: Cross-Modal Prototype Transfer for Point Cloud Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3337-3347\n} \n}" }, { "title": "Prototype Reminiscence and Augmented Asymmetric Knowledge Aggregation for Non-Exemplar Class-Incremental Learning", @@ -45513,7 +47038,8 @@ "aff_campus_unique_index": "0;0+0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Wuxuan and Ye,\n Mang\n},\n title = {\n Prototype Reminiscence 
and Augmented Asymmetric Knowledge Aggregation for Non-Exemplar Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1772-1781\n} \n}" }, { "title": "Prototype-based Dataset Comparison", @@ -45521,6 +47047,7 @@ "status": "Poster", "track": "main", "pid": "11046", + "author_site": "Nanne van Noord", "author": "Nanne van Noord", "abstract": "Dataset summarisation is a fruitful approach to dataset inspection. However, when applied to a single dataset the discovery of visual concepts is restricted to those most prominent. We argue that a comparative approach can expand upon this paradigm to enable richer forms of dataset inspection that go beyond the most prominent concepts. To enable dataset comparison we present a module that learns concept-level prototypes across datasets. We leverage self-supervised learning to discover these prototypes without supervision, and we demonstrate the benefits of our approach in two case-studies. Our findings show that dataset comparison extends dataset inspection and we hope to encourage more works in this direction. 
Code and usage instructions available at https://github.com/Nanne/ProtoSim", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/van_Noord_Protoype-based_Dataset_Comparison_ICCV_2023_paper.pdf", @@ -45535,7 +47062,8 @@ "aff_domain": "", "email": "", "author_num": 1, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/van_Noord_Protoype-based_Dataset_Comparison_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/van_Noord_Protoype-based_Dataset_Comparison_ICCV_2023_paper.html", + "bibtex": "@InProceedings{van_Noord_2023_ICCV,\n \n author = {\n van Noord,\n Nanne\n},\n title = {\n Prototype-based Dataset Comparison\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1944-1954\n} \n}" }, { "title": "Prototypes-oriented Transductive Few-shot Learning with Conditional Transport", @@ -45567,7 +47095,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "1+1;1+1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Tian_2023_ICCV,\n \n author = {\n Tian,\n Long and Feng,\n Jingyi and Chai,\n Xiaoqiang and Chen,\n Wenchao and Wang,\n Liming and Liu,\n Xiyang and Chen,\n Bo\n},\n title = {\n Prototypes-oriented Transductive Few-shot Learning with Conditional Transport\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16317-16326\n} \n}" }, { "title": "Prototypical Kernel Learning and Open-set Foreground Perception for Generalized Few-shot Semantic Segmentation", @@ -45599,7 +47128,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Kai 
and Wang,\n Feigege and Xi,\n Ye and Gao,\n Yutao\n},\n title = {\n Prototypical Kernel Learning and Open-set Foreground Perception for Generalized Few-shot Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19256-19265\n} \n}" }, { "title": "Prototypical Mixing and Retrieval-Based Refinement for Label Noise-Resistant Image Retrieval", @@ -45607,6 +47137,7 @@ "status": "Poster", "track": "main", "pid": "4539", + "author_site": "Xinlong Yang, Haixin Wang, Jinan Sun, Shikun Zhang, Chong Chen, Xian-Sheng Hua, Xiao Luo", "author": "Xinlong Yang, Haixin Wang, Jinan Sun, Shikun Zhang, Chong Chen, Xian-Sheng Hua, Xiao Luo", "abstract": "Label noise is pervasive in real-world applications, which influences the optimization of neural network models. This paper investigates a realistic but understudied problem of image retrieval under label noise, which could lead to severe overfitting or memorization of noisy samples during optimization. Moreover, identifying noisy samples correctly is still a challenging problem for retrieval models. In this paper, we propose a novel approach called Prototypical Mixing and Retrieval-based Refinement (TITAN) for label noise-resistant image retrieval, which corrects label noise and mitigates the effects of the memorization simultaneously. Specifically, we first characterize numerous prototypes with Gaussian distributions in the hidden space, which would direct the Mixing procedure in providing synthesized samples. These samples are fed into a similarity learning framework with varying emphasis based on the prototypical structure to learn semantics with reduced overfitting. In addition, we retrieve comparable samples for each prototype from simple to complex, which refine noisy samples in an accurate and class-balanced manner. 
Comprehensive experiments on five benchmark datasets demonstrate the superiority of our proposed TITAN compared with various competing baselines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yang_Prototypical_Mixing_and_Retrieval-Based_Refinement_for_Label_Noise-Resistant_Image_Retrieval_ICCV_2023_paper.pdf", @@ -45618,7 +47149,8 @@ "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6666026789177198950&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Prototypical_Mixing_and_Retrieval-Based_Refinement_for_Label_Noise-Resistant_Image_Retrieval_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Prototypical_Mixing_and_Retrieval-Based_Refinement_for_Label_Noise-Resistant_Image_Retrieval_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Xinlong and Wang,\n Haixin and Sun,\n Jinan and Zhang,\n Shikun and Chen,\n Chong and Hua,\n Xian-Sheng and Luo,\n Xiao\n},\n title = {\n Prototypical Mixing and Retrieval-Based Refinement for Label Noise-Resistant Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11239-11249\n} \n}" }, { "title": "Proxy Anchor-based Unsupervised Learning for Continuous Generalized Category Discovery", @@ -45643,14 +47175,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Proxy_Anchor-based_Unsupervised_Learning_for_Continuous_Generalized_Category_Discovery_ICCV_2023_paper.html", "aff_unique_index": "0+1;2+3;1;1;1;0", - "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Samsung;German Research Center for Artificial Intelligence;RPTU Kaiserslautern-Landau", - "aff_unique_dep": ";Samsung Electro-Mechanics;;Department of Computer Science", + "aff_unique_norm": "Korea 
Advanced Institute of Science and Technology;Samsung Electro-Mechanics;German Research Center for Artificial Intelligence;RPTU Kaiserslautern-Landau", + "aff_unique_dep": ";;;Department of Computer Science", "aff_unique_url": "https://www.kaist.ac.kr;https://www.samsungsem.com;https://www.dFKI.de;https://www.rptu.de", "aff_unique_abbr": "KAIST;SEM;DFKI;", "aff_campus_unique_index": "0+1;2+2;1;1;1;0", "aff_campus_unique": "Daejeon;Suwon;Kaiserslautern", "aff_country_unique_index": "0+0;1+1;0;0;0;0", - "aff_country_unique": "South Korea;Germany" + "aff_country_unique": "South Korea;Germany", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Hyungmin and Suh,\n Sungho and Kim,\n Daehwan and Jeong,\n Daun and Cho,\n Hansang and Kim,\n Junmo\n},\n title = {\n Proxy Anchor-based Unsupervised Learning for Continuous Generalized Category Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16688-16697\n} \n}" }, { "title": "Prune Spatio-temporal Tokens by Semantic-aware Temporal Accumulation", @@ -45675,14 +47208,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ding_Prune_Spatio-temporal_Tokens_by_Semantic-aware_Temporal_Accumulation_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;2;1;0;2", - "aff_unique_norm": "Shanghai Jiao Tong University;Chinese University of Hong Kong;Huawei", + "aff_unique_norm": "Shanghai Jiao Tong University;The Chinese University of Hong Kong;Huawei", "aff_unique_dep": ";;Huawei Cloud", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.cuhk.edu.hk;https://www.huaweicloud.com", "aff_unique_abbr": "SJTU;CUHK;Huawei Cloud", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n 
Shuangrui and Zhao,\n Peisen and Zhang,\n Xiaopeng and Qian,\n Rui and Xiong,\n Hongkai and Tian,\n Qi\n},\n title = {\n Prune Spatio-temporal Tokens by Semantic-aware Temporal Accumulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16945-16956\n} \n}" }, { "title": "Pseudo Flow Consistency for Self-Supervised 6D Object Pose Estimation", @@ -45714,7 +47248,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Hai_2023_ICCV,\n \n author = {\n Hai,\n Yang and Song,\n Rui and Li,\n Jiaojiao and Ferstl,\n David and Hu,\n Yinlin\n},\n title = {\n Pseudo Flow Consistency for Self-Supervised 6D Object Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14075-14085\n} \n}" }, { "title": "Pseudo-label Alignment for Semi-supervised Instance Segmentation", @@ -45742,11 +47277,12 @@ "aff_unique_norm": "Xiamen University;Contemporary Amperex Technology Co. 
Limited", "aff_unique_dep": "Key Laboratory of Multimedia Trusted Perception and Efficient Computing;", "aff_unique_url": "https://www.xmu.edu.cn;https://www.catlglobal.com", - "aff_unique_abbr": ";CATL", + "aff_unique_abbr": "XMU;CATL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Jie and Chen,\n Chen and Cao,\n Liujuan and Zhang,\n Shengchuan and Shu,\n Annan and Jiang,\n Guannan and Ji,\n Rongrong\n},\n title = {\n Pseudo-label Alignment for Semi-supervised Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16337-16347\n} \n}" }, { "title": "Pyramid Dual Domain Injection Network for Pan-sharpening", @@ -45778,7 +47314,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Xuanhua and Yan,\n Keyu and Li,\n Rui and Xie,\n Chengjun and Zhang,\n Jie and Zhou,\n Man\n},\n title = {\n Pyramid Dual Domain Injection Network for Pan-sharpening\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12908-12917\n} \n}" }, { "title": "Q-Diffusion: Quantizing Diffusion Models", @@ -45801,7 +47338,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Q-Diffusion_Quantizing_Diffusion_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Q-Diffusion_Quantizing_Diffusion_Models_ICCV_2023_paper.html", + "bibtex": 
"@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiuyu and Liu,\n Yijiang and Lian,\n Long and Yang,\n Huanrui and Dong,\n Zhen and Kang,\n Daniel and Zhang,\n Shanghang and Keutzer,\n Kurt\n},\n title = {\n Q-Diffusion: Quantizing Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17535-17545\n} \n}" }, { "title": "QD-BEV : Quantization-aware View-guided Distillation for Multi-view 3D Object Detection", @@ -45833,7 +47371,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;0;0;0;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yifan and Dong,\n Zhen and Yang,\n Huanrui and Lu,\n Ming and Tseng,\n Cheng-Ching and Du,\n Yuan and Keutzer,\n Kurt and Du,\n Li and Zhang,\n Shanghang\n},\n title = {\n QD-BEV : Quantization-aware View-guided Distillation for Multi-view 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3825-3835\n} \n}" }, { "title": "Quality Diversity for Visual Pre-Training", @@ -45858,14 +47397,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chavhan_Quality_Diversity_for_Visual_Pre-Training_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0+1", - "aff_unique_norm": "University of Edinburgh;Samsung", + "aff_unique_norm": "University of Edinburgh;Samsung AI Center", "aff_unique_dep": ";AI Center", "aff_unique_url": "https://www.ed.ac.uk;https://www.samsung.com/global/research-innovation/ai-research-centers/samsung-ai-center-cambridge/", "aff_unique_abbr": "Edinburgh;SAC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": 
"0;0;0;0+0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Chavhan_2023_ICCV,\n \n author = {\n Chavhan,\n Ruchika and Gouk,\n Henry and Li,\n Da and Hospedales,\n Timothy\n},\n title = {\n Quality Diversity for Visual Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5384-5394\n} \n}" }, { "title": "Quality-Agnostic Deepfake Detection with Intra-model Collaborative Learning", @@ -45897,7 +47437,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Suwon", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Le_2023_ICCV,\n \n author = {\n Le,\n Binh M. and Woo,\n Simon S.\n},\n title = {\n Quality-Agnostic Deepfake Detection with Intra-model Collaborative Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22378-22389\n} \n}" }, { "title": "Query Refinement Transformer for 3D Instance Segmentation", @@ -45929,7 +47470,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Jiahao and Deng,\n Jiacheng and Wang,\n Chuxin and He,\n Jianfeng and Zhang,\n Tianzhu\n},\n title = {\n Query Refinement Transformer for 3D Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18516-18526\n} \n}" }, { "title": "Query6DoF: Learning Sparse Queries as Implicit Shape Prior for Category-Level 6DoF Pose Estimation", @@ -45961,7 +47503,8 @@ "aff_campus_unique_index": 
";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ruiqi and Wang,\n Xinggang and Li,\n Te and Yang,\n Rong and Wan,\n Minhong and Liu,\n Wenyu\n},\n title = {\n Query6DoF: Learning Sparse Queries as Implicit Shape Prior for Category-Level 6DoF Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14055-14064\n} \n}" }, { "title": "R-Pred: Two-Stage Motion Prediction Via Tube-Query Attention-Based Trajectory Refinement", @@ -45984,7 +47527,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Choi_R-Pred_Two-Stage_Motion_Prediction_Via_Tube-Query_Attention-Based_Trajectory_Refinement_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Choi_R-Pred_Two-Stage_Motion_Prediction_Via_Tube-Query_Attention-Based_Trajectory_Refinement_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Choi_2023_ICCV,\n \n author = {\n Choi,\n Sehwan and Kim,\n Jungho and Yun,\n Junyong and Choi,\n Jun Won\n},\n title = {\n R-Pred: Two-Stage Motion Prediction Via Tube-Query Attention-Based Trajectory Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8525-8535\n} \n}" }, { "title": "R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras", @@ -45996,7 +47540,7 @@ "author": "Aron Schmied; Tobias Fischer; Martin Danelljan; Marc Pollefeys; Fisher Yu", "abstract": "Dense 3D reconstruction and ego-motion estimation are key challenges in autonomous driving and robotics. 
Compared to the complex, multi-modal systems deployed today, multi-camera systems provide a simpler, low-cost alternative. However, camera-based 3D reconstruction of complex dynamic scenes has proven extremely difficult, as existing solutions often produce incomplete or incoherent results. We propose R3D3, a multi-camera system for dense 3D reconstruction and ego-motion estimation. Our approach iterates between geometric estimation that exploits spatial-temporal information from multiple cameras, and monocular depth refinement. We integrate multi-camera feature correlation and dense bundle adjustment operators that yield robust geometric depth and pose estimates. To improve reconstruction where geometric depth is unreliable, e.g. for moving objects or low-textured regions, we introduce learnable scene priors via a depth refinement network. We show that this design enables a dense, consistent 3D reconstruction of challenging, dynamic outdoor environments. Consequently, we achieve state-of-the-art dense depth prediction on the DDAD and NuScenes benchmarks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Schmied_R3D3_Dense_3D_Reconstruction_of_Dynamic_Scenes_from_Multiple_Cameras_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich+Microsoft; ETH Z\u00fcrich", + "aff": "ETH Zürich; ETH Zürich; ETH Zürich; ETH Zürich+Microsoft; ETH Zürich", "project": "https://www.vis.xyz/pub/r3d3/", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Schmied_R3D3_Dense_3D_ICCV_2023_supplemental.pdf", @@ -46009,14 +47553,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Schmied_R3D3_Dense_3D_Reconstruction_of_Dynamic_Scenes_from_Multiple_Cameras_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1;0", - "aff_unique_norm": "ETH Zurich;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "ETH Zürich;Microsoft Corporation", + 
"aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.microsoft.com", "aff_unique_abbr": "ETHZ;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1;0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Schmied_2023_ICCV,\n \n author = {\n Schmied,\n Aron and Fischer,\n Tobias and Danelljan,\n Martin and Pollefeys,\n Marc and Yu,\n Fisher\n},\n title = {\n R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3216-3226\n} \n}" }, { "title": "RANA: Relightable Articulated Neural Avatars", @@ -46041,14 +47586,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Iqbal_RANA_Relightable_Articulated_Neural_Avatars_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0", - "aff_unique_norm": "NVIDIA;Flawless AI", - "aff_unique_dep": "NVIDIA Corporation;", + "aff_unique_norm": "NVIDIA Corporation;Flawless AI", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nvidia.com;", "aff_unique_abbr": "NVIDIA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States;" + "aff_country_unique": "United States;", + "bibtex": "@InProceedings{Iqbal_2023_ICCV,\n \n author = {\n Iqbal,\n Umar and Caliskan,\n Akin and Nagano,\n Koki and Khamis,\n Sameh and Molchanov,\n Pavlo and Kautz,\n Jan\n},\n title = {\n RANA: Relightable Articulated Neural Avatars\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23142-23153\n} \n}" }, { "title": "RCA-NOC: Relative Contrastive Alignment for Novel Object Captioning", @@ -46080,7 +47626,8 @@ 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Jiashuo and Liang,\n Yaoyuan and Liu,\n Leyao and Huang,\n Shaolun and Zhang,\n Lei\n},\n title = {\n RCA-NOC: Relative Contrastive Alignment for Novel Object Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15510-15520\n} \n}" }, { "title": "REAP: A Large-Scale Realistic Adversarial Patch Benchmark", @@ -46105,14 +47652,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hingun_REAP_A_Large-Scale_Realistic_Adversarial_Patch_Benchmark_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "University of California, Berkeley;Microsoft", - "aff_unique_dep": ";Microsoft Corporation", + "aff_unique_norm": "University of California, Berkeley;Microsoft Corporation", + "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.microsoft.com", "aff_unique_abbr": "UC Berkeley;Microsoft", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hingun_2023_ICCV,\n \n author = {\n Hingun,\n Nabeel and Sitawarin,\n Chawin and Li,\n Jerry and Wagner,\n David\n},\n title = {\n REAP: A Large-Scale Realistic Adversarial Patch Benchmark\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4640-4651\n} \n}" }, { "title": "RED-PSM: Regularization by Denoising of Partially Separable Models for Dynamic Imaging", @@ -46137,14 +47685,15 @@ "author_num": 3, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Iskender_RED-PSM_Regularization_by_Denoising_of_Partially_Separable_Models_for_Dynamic_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Los Alamos National Laboratory", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Los Alamos National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.lanl.gov", "aff_unique_abbr": "UIUC;LANL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Iskender_2023_ICCV,\n \n author = {\n Iskender,\n Berk and Klasky,\n Marc L. and Bresler,\n Yoram\n},\n title = {\n RED-PSM: Regularization by Denoising of Partially Separable Models for Dynamic Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10595-10604\n} \n}" }, { "title": "RFD-ECNet: Extreme Underwater Image Compression with Reference to Feature Dictionary", @@ -46176,7 +47725,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Mengyao and Shen,\n Liquan and Ye,\n Peng and Feng,\n Guorui and Wang,\n Zheyin\n},\n title = {\n RFD-ECNet: Extreme Underwater Image Compression with Reference to Feature Dictionary\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12980-12989\n} \n}" }, { "title": "RFLA: A Stealthy Reflected Light Adversarial Attack in the Physical World", @@ -46208,7 +47758,8 @@ 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Donghua and Yao,\n Wen and Jiang,\n Tingsong and Li,\n Chao and Chen,\n Xiaoqian\n},\n title = {\n RFLA: A Stealthy Reflected Light Adversarial Attack in the Physical World\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4455-4465\n} \n}" }, { "title": "RICO: Regularizing the Unobservable for Indoor Compositional Reconstruction", @@ -46233,14 +47784,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_RICO_Regularizing_the_Unobservable_for_Indoor_Compositional_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0", - "aff_unique_norm": "Zhejiang University;University of Hong Kong", + "aff_unique_norm": "Zhejiang University;The University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.hku.hk", "aff_unique_abbr": "ZJU;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zizhang and Lyu,\n Xiaoyang and Ding,\n Yuanyuan and Wang,\n Mengmeng and Liao,\n Yiyi and Liu,\n Yong\n},\n title = {\n RICO: Regularizing the Unobservable for Indoor Compositional Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17761-17771\n} \n}" }, { "title": "RIGID: Recurrent GAN Inversion and Editing of Real Face Videos", @@ -46265,14 +47817,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Xu_RIGID_Recurrent_GAN_Inversion_and_Editing_of_Real_Face_Videos_ICCV_2023_paper.html", "aff_unique_index": "0;1;0+2;0+2", - "aff_unique_norm": "University of Hong Kong;Singapore Management University;Shanghai AI Laboratory", + "aff_unique_norm": "The University of Hong Kong;Singapore Management University;Shanghai AI Laboratory", "aff_unique_dep": "Department of Computer Science;School of Computing and Information Systems;", "aff_unique_url": "https://www.hku.hk;https://www.smu.edu.sg;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "HKU;SMU;SAIL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0+0;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yangyang and He,\n Shengfeng and Wong,\n Kwan-Yee K. and Luo,\n Ping\n},\n title = {\n RIGID: Recurrent GAN Inversion and Editing of Real Face Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13691-13701\n} \n}" }, { "title": "RLIPv2: Fast Scaling of Relational Language-Image Pre-Training", @@ -46304,7 +47857,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;1;2;0;0;0;0;0", - "aff_country_unique": "China;United Kingdom;Singapore" + "aff_country_unique": "China;United Kingdom;Singapore", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Hangjie and Zhang,\n Shiwei and Wang,\n Xiang and Albanie,\n Samuel and Pan,\n Yining and Feng,\n Tao and Jiang,\n Jianwen and Ni,\n Dong and Zhang,\n Yingya and Zhao,\n Deli\n},\n title = {\n RLIPv2: Fast Scaling of Relational Language-Image Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 21649-21661\n} \n}" }, { "title": "RLSAC: Reinforcement Learning Enhanced Sample Consensus for End-to-End Robust Estimation", @@ -46316,7 +47870,7 @@ "author": "Chang Nie; Guangming Wang; Zhe Liu; Luca Cavalli; Marc Pollefeys; Hesheng Wang", "abstract": "Robust estimation is a crucial and still challenging task, which involves estimating model parameters in noisy environments. Although conventional sampling consensus-based algorithms sample several times to achieve robustness, these algorithms cannot use data features and historical information effectively. In this paper, we propose RLSAC, a novel Reinforcement Learning enhanced SAmple Consensus framework for end-to-end robust estimation. RLSAC employs a graph neural network to utilize both data and memory features to guide exploring directions for sampling the next minimum set. The feedback of downstream tasks serves as the reward for unsupervised training. Therefore, RLSAC can avoid differentiating to learn the features and the feedback of downstream tasks for end-to-end robust estimation. In addition, RLSAC integrates a state transition module that encodes both data and memory features. Our experimental results demonstrate that RLSAC can learn from features to gradually explore a better hypothesis. Through analysis, it is apparent that RLSAC can be easily transferred to other sampling consensus-based robust estimation tasks. To the best of our knowledge, RLSAC is also the first method that uses reinforcement learning to sample consensus for end-to-end robust estimation. 
We\n release our codes at https://github.com/IRMVLab/RLSAC.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Nie_RLSAC_Reinforcement_Learning_Enhanced_Sample_Consensus_for_End-to-End_Robust_Estimation_ICCV_2023_paper.pdf", - "aff": "Department of Automation, Key Laboratory of System Control and Information Processing of Ministry of Education, Shanghai Jiao Tong University; Department of Automation, Key Laboratory of System Control and Information Processing of Ministry of Education, Shanghai Jiao Tong University + Department of Computer Science, ETH Z\u00fcrich + Microsoft Mixed Reality and AI Z\u00fcrich Lab; MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; Department of Computer Science, ETH Z\u00fcrich; Department of Computer Science, ETH Z\u00fcrich + Microsoft Mixed Reality and AI Z\u00fcrich Lab; Department of Automation, Key Laboratory of System Control and Information Processing of Ministry of Education, Shanghai Jiao Tong University", + "aff": "Department of Automation, Key Laboratory of System Control and Information Processing of Ministry of Education, Shanghai Jiao Tong University; Department of Automation, Key Laboratory of System Control and Information Processing of Ministry of Education, Shanghai Jiao Tong University + Department of Computer Science, ETH Zürich + Microsoft Mixed Reality and AI Zürich Lab; MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University; Department of Computer Science, ETH Zürich; Department of Computer Science, ETH Zürich + Microsoft Mixed Reality and AI Zürich Lab; Department of Automation, Key Laboratory of System Control and Information Processing of Ministry of Education, Shanghai Jiao Tong University", "project": "", "github": "https://github.com/IRMVLab/RLSAC", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Nie_RLSAC_Reinforcement_Learning_ICCV_2023_supplemental.pdf", @@ -46329,14 +47883,15 @@ "author_num": 6, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Nie_RLSAC_Reinforcement_Learning_Enhanced_Sample_Consensus_for_End-to-End_Robust_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;0+1+2;0;1;1+2;0", - "aff_unique_norm": "Shanghai Jiao Tong University;ETH Zurich;Microsoft", + "aff_unique_norm": "Shanghai Jiao Tong University;ETH Zürich;Microsoft", "aff_unique_dep": "Department of Automation;Department of Computer Science;Mixed Reality and AI", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ethz.ch;https://www.microsoft.com", "aff_unique_abbr": "SJTU;ETHZ;Microsoft", - "aff_campus_unique_index": "1;1", - "aff_campus_unique": ";Z\u00fcrich", + "aff_campus_unique_index": "1;2;1", + "aff_campus_unique": ";Zürich;Shanghai", "aff_country_unique_index": "0;0+1+1;0;1;1+1;0", - "aff_country_unique": "China;Switzerland" + "aff_country_unique": "China;Switzerland", + "bibtex": "@InProceedings{Nie_2023_ICCV,\n \n author = {\n Nie,\n Chang and Wang,\n Guangming and Liu,\n Zhe and Cavalli,\n Luca and Pollefeys,\n Marc and Wang,\n Hesheng\n},\n title = {\n RLSAC: Reinforcement Learning Enhanced Sample Consensus for End-to-End Robust Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9891-9900\n} \n}" }, { "title": "RMP-Loss: Regularizing Membrane Potential Distribution for Spiking Neural Networks", @@ -46364,11 +47919,12 @@ "aff_unique_norm": "Intelligent Science & Technology Academy;Scientific Research Laboratory of Aerospace Intelligent Systems and Technology", "aff_unique_dep": "CASIC;Aerospace Intelligent Systems and Technology", "aff_unique_url": ";", - "aff_unique_abbr": "ISTA;", + "aff_unique_abbr": ";", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n 
Guo,\n Yufei and Liu,\n Xiaode and Chen,\n Yuanpei and Zhang,\n Liwen and Peng,\n Weihang and Zhang,\n Yuhan and Huang,\n Xuhui and Ma,\n Zhe\n},\n title = {\n RMP-Loss: Regularizing Membrane Potential Distribution for Spiking Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17391-17401\n} \n}" }, { "title": "ROME: Robustifying Memory-Efficient NAS via Topology Disentanglement and Gradient Accumulation", @@ -46400,7 +47956,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xiaoxing and Chu,\n Xiangxiang and Fan,\n Yuda and Zhang,\n Zhexi and Zhang,\n Bo and Yang,\n Xiaokang and Yan,\n Junchi\n},\n title = {\n ROME: Robustifying Memory-Efficient NAS via Topology Disentanglement and Gradient Accumulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5939-5949\n} \n}" }, { "title": "RPEFlow: Multimodal Fusion of RGB-PointCloud-Event for Joint Optical Flow and Scene Flow Estimation", @@ -46432,7 +47989,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wan_2023_ICCV,\n \n author = {\n Wan,\n Zhexiong and Mao,\n Yuxin and Zhang,\n Jing and Dai,\n Yuchao\n},\n title = {\n RPEFlow: Multimodal Fusion of RGB-PointCloud-Event for Joint Optical Flow and Scene Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10030-10040\n} \n}" }, { "title": 
"RPG-Palm: Realistic Pseudo-data Generation for Palmprint Recognition", @@ -46459,12 +48017,13 @@ "aff_unique_index": "0;1;0;1;2;0;0;0;1;1", "aff_unique_norm": "Tencent;Hefei University of Technology;University of California, Los Angeles", "aff_unique_dep": "Youtu Lab;;", - "aff_unique_url": "https://www.tencent.com;http://www.hfut.edu.cn/;https://www.ucla.edu", + "aff_unique_url": "https://www.tencent.com;http://www.hfut.edu.cn;https://www.ucla.edu", "aff_unique_abbr": "Tencent;HUT;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Shen_2023_ICCV,\n \n author = {\n Shen,\n Lei and Jin,\n Jianlong and Zhang,\n Ruixin and Li,\n Huaen and Zhao,\n Kai and Zhang,\n Yingyi and Zhang,\n Jingyun and Ding,\n Shouhong and Zhao,\n Yang and Jia,\n Wei\n},\n title = {\n RPG-Palm: Realistic Pseudo-data Generation for Palmprint Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19605-19616\n} \n}" }, { "title": "RSFNet: A White-Box Image Retouching Approach using Region-Specific Color Filters", @@ -46496,7 +48055,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Ouyang_2023_ICCV,\n \n author = {\n Ouyang,\n Wenqi and Dong,\n Yi and Kang,\n Xiaoyang and Ren,\n Peiran and Xu,\n Xin and Xie,\n Xuansong\n},\n title = {\n RSFNet: A White-Box Image Retouching Approach using Region-Specific Color Filters\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12160-12169\n} \n}" }, { "title": 
"Random Boxes Are Open-world Object Detectors", @@ -46528,7 +48088,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "Singapore;China;" + "aff_country_unique": "Singapore;China;", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yanghao and Yue,\n Zhongqi and Hua,\n Xian-Sheng and Zhang,\n Hanwang\n},\n title = {\n Random Boxes Are Open-world Object Detectors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6233-6243\n} \n}" }, { "title": "Random Sub-Samples Generation for Self-Supervised Real Image Denoising", @@ -46553,14 +48114,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Random_Sub-Samples_Generation_for_Self-Supervised_Real_Image_Denoising_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "Sichuan University;Beijing Jiao Tong University", + "aff_unique_norm": "Sichuan University;Beijing Jiaotong University", "aff_unique_dep": "College of Electronics and Information Engineering;School of Computer and Information Technology", "aff_unique_url": "https://www.scu.edu.cn;http://www.bjtu.edu.cn", "aff_unique_abbr": ";BJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Yizhong and Liu,\n Xiao and Liao,\n Xiangyu and Cao,\n Yuanzhouhan and Ren,\n Chao\n},\n title = {\n Random Sub-Samples Generation for Self-Supervised Real Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12150-12159\n} \n}" }, { "title": "Randomized Quantization: A Generic Augmentation for Data Agnostic 
Self-supervised Learning", @@ -46585,14 +48147,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Randomized_Quantization_A_Generic_Augmentation_for_Data_Agnostic_Self-supervised_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;0;0;4;4", - "aff_unique_norm": "Hong Kong University of Science and Technology;Chinese Academy of Sciences, Institute of Automation;Shanghai AI Lab;Peking University;Microsoft", - "aff_unique_dep": ";CAIR (Committee on Artificial Intelligence and Robotics);;;Research", + "aff_unique_norm": "Hong Kong University of Science and Technology;Chinese Academy of Sciences, Institute of Automation;Shanghai AI Lab;Peking University;Microsoft Research", + "aff_unique_dep": ";CAIR (Committee on Automation and Robotics);;;Research", "aff_unique_url": "https://www.ust.hk;http://www.ia.cas.cn;https://www.shanghaiailab.com;http://www.pku.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "HKUST;CAS;SAIL;Peking U;MSR Asia", "aff_campus_unique_index": "0;0;0;2;2", "aff_campus_unique": "Hong Kong SAR;;Asia", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Huimin and Lei,\n Chenyang and Sun,\n Xiao and Wang,\n Peng-Shuai and Chen,\n Qifeng and Cheng,\n Kwang-Ting and Lin,\n Stephen and Wu,\n Zhirong\n},\n title = {\n Randomized Quantization: A Generic Augmentation for Data Agnostic Self-supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16305-16316\n} \n}" }, { "title": "RankMatch: Fostering Confidence and Consistency in Learning with Noisy Labels", @@ -46615,7 +48178,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_RankMatch_Fostering_Confidence_and_Consistency_in_Learning_with_Noisy_Labels_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_RankMatch_Fostering_Confidence_and_Consistency_in_Learning_with_Noisy_Labels_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Ziyi and Chen,\n Weikai and Fang,\n Chaowei and Li,\n Zhen and Chen,\n Lechao and Lin,\n Liang and Li,\n Guanbin\n},\n title = {\n RankMatch: Fostering Confidence and Consistency in Learning with Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1644-1654\n} \n}" }, { "title": "RankMixup: Ranking-Based Mixup Training for Network Calibration", @@ -46638,7 +48202,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Noh_RankMixup_Ranking-Based_Mixup_Training_for_Network_Calibration_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Noh_RankMixup_Ranking-Based_Mixup_Training_for_Network_Calibration_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Noh_2023_ICCV,\n \n author = {\n Noh,\n Jongyoun and Park,\n Hyekang and Lee,\n Junghyup and Ham,\n Bumsub\n},\n title = {\n RankMixup: Ranking-Based Mixup Training for Network Calibration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1358-1368\n} \n}" }, { "title": "Rapid Adaptation in Online Continual Learning: Are We Evaluating It Right?", @@ -46663,14 +48228,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Al_Kader_Hammoud_Rapid_Adaptation_in_Online_Continual_Learning_Are_We_Evaluating_It_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;0;1", - 
"aff_unique_norm": "King Abdullah University of Science and Technology;University of Oxford;Meta", - "aff_unique_dep": ";;Meta AI3", + "aff_unique_norm": "King Abdullah University of Science and Technology;University of Oxford;Meta AI3", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaust.edu.sa;https://www.ox.ac.uk;https://meta.com", "aff_unique_abbr": "KAUST;Oxford;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;0;1", - "aff_country_unique": "Saudi Arabia;United Kingdom;United States" + "aff_country_unique": "Saudi Arabia;United Kingdom;United States", + "bibtex": "@InProceedings{Al_Kader_Hammoud_2023_ICCV,\n \n author = {\n Al Kader Hammoud,\n Hasan Abed and Prabhu,\n Ameya and Lim,\n Ser-Nam and Torr,\n Philip H.S. and Bibi,\n Adel and Ghanem,\n Bernard\n},\n title = {\n Rapid Adaptation in Online Continual Learning: Are We Evaluating It Right?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18852-18861\n} \n}" }, { "title": "Rapid Network Adaptation: Learning to Adapt Neural Networks Using Test-Time Feedback", @@ -46679,7 +48245,7 @@ "track": "main", "pid": "5339", "author_site": "Teresa Yeo, O?uzhan Fatih Kar, Zahra Sodagar, Amir Zamir", - "author": "Teresa Yeo; O\u011fuzhan Fatih Kar; Zahra Sodagar; Amir Zamir", + "author": "Teresa Yeo; Oğuzhan Fatih Kar; Zahra Sodagar; Amir Zamir", "abstract": "We propose a method for adapting neural networks to distribution shifts at test-time. In contrast to training-time robustness mechanisms that attempt to anticipate the shift, we create a closed-loop system and make use of test-time feedback signal to adapt a network. We show that this loop can be effectively implemented using a learning-based function, which realizes an amortized optimizer for the network. 
This leads to an adaptation method, named Rapid Network Adaptation (RNA), that is notably more flexible and orders of magnitude faster than the baselines. Through a broad set of experiments using various adaptation signals and target tasks, we study the generality, efficiency, and flexibility of this method. We perform the evaluations using various datasets (Taskonomy, Replica, ScanNet, Hypersim, COCO, ImageNet), tasks (depth, optical flow, semantic segmentation, classification), and distribution shifts (Cross-datasets, 2D and 3D Common Corruptions) with promising results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yeo_Rapid_Network_Adaptation_Learning_to_Adapt_Neural_Networks_Using_Test-Time_ICCV_2023_paper.pdf", "aff": ";;;", @@ -46693,7 +48259,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yeo_Rapid_Network_Adaptation_Learning_to_Adapt_Neural_Networks_Using_Test-Time_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yeo_Rapid_Network_Adaptation_Learning_to_Adapt_Neural_Networks_Using_Test-Time_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yeo_2023_ICCV,\n \n author = {\n Yeo,\n Teresa and Kar,\n O\u{g}uzhan Fatih and Sodagar,\n Zahra and Zamir,\n Amir\n},\n title = {\n Rapid Network Adaptation: Learning to Adapt Neural Networks Using Test-Time Feedback\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4674-4687\n} \n}" }, { "title": "RawHDR: High Dynamic Range Image Reconstruction from a Single Raw Image", @@ -46725,7 +48292,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zou_2023_ICCV,\n \n author = {\n Zou,\n Yunhao and Yan,\n Chenggang and Fu,\n Ying\n},\n title = {\n 
RawHDR: High Dynamic Range Image Reconstruction from a Single Raw Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12334-12344\n} \n}" }, { "title": "Ray Conditioning: Trading Photo-consistency for Photo-realism in Multi-view Image Generation", @@ -46757,7 +48325,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Eric Ming and Holalkere,\n Sidhanth and Yan,\n Ruyu and Zhang,\n Kai and Davis,\n Abe\n},\n title = {\n Ray Conditioning: Trading Photo-consistency for Photo-realism in Multi-view Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23242-23251\n} \n}" }, { "title": "RbA: Segmenting Unknown Regions Rejected by All", @@ -46765,11 +48334,11 @@ "status": "Poster", "track": "main", "pid": "1530", - "author_site": "Nazir Nayal, Misra Yavuz, Jo\u00e3o F. Henriques, Fatma G\u00fcney", - "author": "Nazir Nayal; Misra Yavuz; Jo\u00e3o F. Henriques; Fatma G\u00fcney", + "author_site": "Nazir Nayal, Misra Yavuz, João F. Henriques, Fatma Güney", + "author": "Nazir Nayal; Misra Yavuz; João F. Henriques; Fatma Güney", "abstract": "Standard semantic segmentation models owe their success to curated datasets with a fixed set of semantic categories, without contemplating the possibility of identifying unknown objects from novel categories. Existing methods in outlier detection suffer from a lack of smoothness and objectness in their predictions, due to limitations of the per-pixel classification paradigm. Furthermore, additional training for detecting outliers harms the performance of known classes. 
In this paper, we explore another paradigm with region-level classification to better segment unknown objects. We show that the object queries in mask classification tend to behave like one vs. all classifiers. Based on this finding, we propose a novel outlier scoring function called RbA by defining the event of being an outlier as being rejected by all known classes. Our extensive experiments show that mask classification improves the performance of the existing outlier detection methods, and the best results are achieved with the proposed RbA. We also propose an objective to optimize RbA using minimal outlier supervision. Further fine-tuning with outliers improves the unknown performance, and unlike previous methods, it does not degrade the inlier performance. \n Project page: https://kuis-ai.github.io/RbA", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Nayal_RbA_Segmenting_Unknown_Regions_Rejected_by_All_ICCV_2023_paper.pdf", - "aff": "KUIS AI Center and Department of Computer Engineering, Koc \u00b8 University; KUIS AI Center and Department of Computer Engineering, Koc \u00b8 University; University of Oxford; KUIS AI Center and Department of Computer Engineering, Koc \u00b8 University", + "aff": "KUIS AI Center and Department of Computer Engineering, Koç University; KUIS AI Center and Department of Computer Engineering, Koç University; University of Oxford; KUIS AI Center and Department of Computer Engineering, Koç University", "project": "https://kuis-ai.github.io/RbA", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Nayal_RbA_Segmenting_Unknown_ICCV_2023_supplemental.pdf", @@ -46784,12 +48353,13 @@ "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Koc University;University of Oxford", "aff_unique_dep": "Department of Computer Engineering;", - "aff_unique_url": "https://www.kocuni.edu.tr;https://www.ox.ac.uk", - "aff_unique_abbr": "Koc Uni;Oxford", + "aff_unique_url": 
"https://www.ku.edu.tr;https://www.ox.ac.uk", + "aff_unique_abbr": "Koc U;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "T\u00fcrkiye;United Kingdom" + "aff_country_unique": "Turkey;United Kingdom", + "bibtex": "@InProceedings{Nayal_2023_ICCV,\n \n author = {\n Nayal,\n Nazir and Yavuz,\n Misra and Henriques,\n Jo\\~ao F. and G\\"uney,\n Fatma\n},\n title = {\n RbA: Segmenting Unknown Regions Rejected by All\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 711-722\n} \n}" }, { "title": "Re-ReND: Real-Time Rendering of NeRFs across Devices", @@ -46797,8 +48367,8 @@ "status": "Poster", "track": "main", "pid": "3760", - "author_site": "Sara Rojas, Jesus Zarzar, Juan C. P\u00e9rez, Artsiom Sanakoyeu, Ali Thabet, Albert Pumarola, Bernard Ghanem", - "author": "Sara Rojas; Jesus Zarzar; Juan C. P\u00e9rez; Artsiom Sanakoyeu; Ali Thabet; Albert Pumarola; Bernard Ghanem", + "author_site": "Sara Rojas, Jesus Zarzar, Juan C. Pérez, Artsiom Sanakoyeu, Ali Thabet, Albert Pumarola, Bernard Ghanem", + "author": "Sara Rojas; Jesus Zarzar; Juan C. Pérez; Artsiom Sanakoyeu; Ali Thabet; Albert Pumarola; Bernard Ghanem", "abstract": "This paper proposes a novel approach for rendering a pre-trained Neural Radiance Field (NeRF) in real-time on resource-constrained devices. We introduce Re-ReND, a method enabling Real-time Rendering of NeRFs across Devices. Re-ReND is designed to achieve real-time performance by converting the NeRF into a representation that can be efficiently processed by standard graphics pipelines. The proposed method distills the NeRF by extracting the learned density into a mesh, while the learned color information is factorized into a set of matrices that represent the scene's light field. 
Factorization implies the field is queried via inexpensive MLP-free matrix multiplications, while using a light field allows rendering a pixel by querying the field a single time--as opposed to hundreds of queries when employing a radiance field. Since the proposed representation can be implemented using a fragment shader, it can be directly integrated with standard rasterization frameworks. Our flexible implementation can render a NeRF in real-time with low memory requirements and on a wide range of resource-constrained devices, including mobiles and AR/VR headsets. Notably, we find that Re-ReND can achieve over a 2.6-fold increase in rendering speed versus the state-of-the-art without perceptible losses in quality.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Rojas_Re-ReND_Real-Time_Rendering_of_NeRFs_across_Devices_ICCV_2023_paper.pdf", "aff": "King Abdullah University of Science and Technology (KAUST); King Abdullah University of Science and Technology (KAUST); King Abdullah University of Science and Technology (KAUST); Meta Research; Meta Research; Meta Research; King Abdullah University of Science and Technology (KAUST)", @@ -46814,14 +48384,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Rojas_Re-ReND_Real-Time_Rendering_of_NeRFs_across_Devices_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;1;1;0", - "aff_unique_norm": "King Abdullah University of Science and Technology;Meta", - "aff_unique_dep": ";Meta Research", + "aff_unique_norm": "King Abdullah University of Science and Technology;Meta Research", + "aff_unique_dep": ";", "aff_unique_url": "https://www.kaust.edu.sa;https://research.facebook.com", "aff_unique_abbr": "KAUST;Meta Research", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;1;0", - "aff_country_unique": "Saudi Arabia;United States" + "aff_country_unique": "Saudi Arabia;United States", + "bibtex": "@InProceedings{Rojas_2023_ICCV,\n \n author 
= {\n Rojas,\n Sara and Zarzar,\n Jesus and P\\'erez,\n Juan C. and Sanakoyeu,\n Artsiom and Thabet,\n Ali and Pumarola,\n Albert and Ghanem,\n Bernard\n},\n title = {\n Re-ReND: Real-Time Rendering of NeRFs across Devices\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3632-3641\n} \n}" }, { "title": "Re-mine, Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection", @@ -46853,7 +48424,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Yichao and Tang,\n Qingfei and Yang,\n Feng and Su,\n Xiu and You,\n Shan and Lu,\n Xiaobo and Xu,\n Chang\n},\n title = {\n Re-mine,\n Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23492-23503\n} \n}" }, { "title": "Re:PolyWorld - A Graph Neural Network for Polygonal Scene Parsing", @@ -46885,7 +48457,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Austria" + "aff_country_unique": "Austria", + "bibtex": "@InProceedings{Zorzi_2023_ICCV,\n \n author = {\n Zorzi,\n Stefano and Fraundorfer,\n Friedrich\n},\n title = {\n Re:PolyWorld - A Graph Neural Network for Polygonal Scene Parsing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16762-16771\n} \n}" }, { "title": "ReFit: Recurrent Fitting Network for 3D Human Recovery", @@ -46917,7 +48490,8 @@ 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yufu and Daniilidis,\n Kostas\n},\n title = {\n ReFit: Recurrent Fitting Network for 3D Human Recovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14644-14654\n} \n}" }, { "title": "ReGen: A good Generative Zero-Shot Video Classifier Should be Rewarded", @@ -46940,7 +48514,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bulat_ReGen_A_good_Generative_Zero-Shot_Video_Classifier_Should_be_Rewarded_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bulat_ReGen_A_good_Generative_Zero-Shot_Video_Classifier_Should_be_Rewarded_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Bulat_2023_ICCV,\n \n author = {\n Bulat,\n Adrian and Sanchez,\n Enrique and Martinez,\n Brais and Tzimiropoulos,\n Georgios\n},\n title = {\n ReGen: A good Generative Zero-Shot Video Classifier Should be Rewarded\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13523-13533\n} \n}" }, { "title": "ReLeaPS : Reinforcement Learning-based Illumination Planning for Generalized Photometric Stereo", @@ -46972,7 +48547,8 @@ "aff_campus_unique_index": ";;1;2;;", "aff_campus_unique": ";Beijing;Shanghai", "aff_country_unique_index": "0+0;0+0;0+1;0;0+0;0+0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": "@InProceedings{Chan_2023_ICCV,\n \n author = {\n Chan,\n Jun Hoong and Yu,\n Bohan and Guo,\n Heng and Ren,\n Jieji and Lu,\n Zongqing and Shi,\n Boxin\n},\n title = {\n ReLeaPS : 
Reinforcement Learning-based Illumination Planning for Generalized Photometric Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9167-9175\n} \n}" }, { "title": "ReMoDiffuse: Retrieval-Augmented Motion Diffusion Model", @@ -46995,7 +48571,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_ReMoDiffuse_Retrieval-Augmented_Motion_Diffusion_Model_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_ReMoDiffuse_Retrieval-Augmented_Motion_Diffusion_Model_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Mingyuan and Guo,\n Xinying and Pan,\n Liang and Cai,\n Zhongang and Hong,\n Fangzhou and Li,\n Huirong and Yang,\n Lei and Liu,\n Ziwei\n},\n title = {\n ReMoDiffuse: Retrieval-Augmented Motion Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 364-373\n} \n}" }, { "title": "ReNeRF: Relightable Neural Radiance Fields with Nearfield Lighting", @@ -47007,7 +48584,7 @@ "author": "Yingyan Xu; Gaspard Zoss; Prashanth Chandran; Markus Gross; Derek Bradley; Paulo Gotardo", "abstract": "Recent work on radiance fields and volumetric inverse rendering (e.g., NeRFs) has provided excellent results in building data-driven models of real scenes for novel view synthesis with high photorealism. While full control over viewpoint is achieved, scene lighting is typically \"baked\" into the model and cannot be changed; other methods only capture limited variation in lighting or make restrictive assumptions about the captured scene. These limitations prevent the application on arbitrary materials and novel 3D environments with complex, distinct lighting. 
In this paper, we target the application scenario of capturing high-fidelity assets for neural relighting in controlled studio conditions, but without requiring a dense light stage. Instead, we leverage a small number of area lights commonly used in photogrammetry. We propose ReNeRF, a relightable radiance field model based on the intuitive and powerful approach of image-based relighting, which implicitly captures global light transport (for arbitrary objects) without complex, error-prone simulations. Thus, our new method is simple and provides full control over viewpoint and lighting, without simplistic assumptions about how light interacts with the scene. In addition, ReNeRF does not rely on the usual assumption of distant lighting - during training, we explicitly account for the distance between 3D points in the volume and point samples on the light sources. Thus, at test time, we achieve better generalization to novel, continuous lighting directions, including nearfield lighting effects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Xu_ReNeRF_Relightable_Neural_Radiance_Fields_with_Nearfield_Lighting_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich+DisneyResearch|Studios; DisneyResearch|Studios; DisneyResearch|Studios; ETH Z\u00fcrich+DisneyResearch|Studios; DisneyResearch|Studios; DisneyResearch|Studios", + "aff": "ETH Zürich+DisneyResearch|Studios; DisneyResearch|Studios; DisneyResearch|Studios; ETH Zürich+DisneyResearch|Studios; DisneyResearch|Studios; DisneyResearch|Studios", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Xu_ReNeRF_Relightable_Neural_ICCV_2023_supplemental.zip", @@ -47020,14 +48597,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_ReNeRF_Relightable_Neural_Radiance_Fields_with_Nearfield_Lighting_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;0+1;1;1", - "aff_unique_norm": "ETH Zurich;Disney Research", + "aff_unique_norm": "ETH 
Zürich;Disney Research", "aff_unique_dep": ";Studios", - "aff_unique_url": "https://www.ethz.ch;https://www.disneyresearch.com", + "aff_unique_url": "https://www.ethz.ch;https://research.disney.com", "aff_unique_abbr": "ETHZ;Disney Research", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;0+1;1;1", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yingyan and Zoss,\n Gaspard and Chandran,\n Prashanth and Gross,\n Markus and Bradley,\n Derek and Gotardo,\n Paulo\n},\n title = {\n ReNeRF: Relightable Neural Radiance Fields with Nearfield Lighting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22581-22591\n} \n}" }, { "title": "ReST: A Reconfigurable Spatial-Temporal Graph Model for Multi-Camera Multi-Object Tracking", @@ -47059,7 +48637,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Cheng-Che and Qiu,\n Min-Xuan and Chiang,\n Chen-Kuo and Lai,\n Shang-Hong\n},\n title = {\n ReST: A Reconfigurable Spatial-Temporal Graph Model for Multi-Camera Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10051-10060\n} \n}" }, { "title": "ReactioNet: Learning High-Order Facial Behavior from Universal Stimulus-Reaction by Dyadic Relation Reasoning", @@ -47091,7 +48670,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Binghamton", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United 
States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiaotian and Wang,\n Taoyue and Zhao,\n Geran and Zhang,\n Xiang and Kang,\n Xi and Yin,\n Lijun\n},\n title = {\n ReactioNet: Learning High-Order Facial Behavior from Universal Stimulus-Reaction by Dyadic Relation Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20774-20785\n} \n}" }, { "title": "Read-only Prompt Optimization for Vision-Language Few-shot Learning", @@ -47123,7 +48703,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Dongjun and Song,\n Seokwon and Suh,\n Jihee and Choi,\n Joonmyeong and Lee,\n Sanghyeok and Kim,\n Hyunwoo J.\n},\n title = {\n Read-only Prompt Optimization for Vision-Language Few-shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1401-1411\n} \n}" }, { "title": "Real-Time Neural Rasterization for Large Scenes", @@ -47155,7 +48736,8 @@ "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "1+1;1;1;1;1;1", - "aff_country_unique": ";Canada" + "aff_country_unique": ";Canada", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jeffrey Yunfan and Chen,\n Yun and Yang,\n Ze and Wang,\n Jingkang and Manivasagam,\n Sivabalan and Urtasun,\n Raquel\n},\n title = {\n Real-Time Neural Rasterization for Large Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8416-8427\n} \n}" }, { "title": "RealGraph: A Multiview Dataset for 4D Real-world Context 
Graph Generation", @@ -47187,7 +48769,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Haozhe and Chen,\n Zequn and Zhang,\n Jinzhi and Bai,\n Bing and Wang,\n Yu and Huang,\n Ruqi and Fang,\n Lu\n},\n title = {\n RealGraph: A Multiview Dataset for 4D Real-world Context Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3758-3768\n} \n}" }, { "title": "Realistic Full-Body Tracking from Sparse Observations via Joint-Level Modeling", @@ -47219,7 +48802,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Xiaozheng and Su,\n Zhuo and Wen,\n Chao and Xue,\n Zhou and Jin,\n Xiaojie\n},\n title = {\n Realistic Full-Body Tracking from Sparse Observations via Joint-Level Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14678-14688\n} \n}" }, { "title": "RecRecNet: Rectangling Rectified Wide-Angle Images by Thin-Plate Spline Model and DoF-based Curriculum Learning", @@ -47244,14 +48828,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liao_RecRecNet_Rectangling_Rectified_Wide-Angle_Images_by_Thin-Plate_Spline_Model_and_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "Beijing Jiao Tong University;Beijing Key Laboratory of Advanced Information Science and Network", + "aff_unique_norm": "Beijing Jiaotong University;Beijing Key Laboratory of Advanced Information Science and Network", 
"aff_unique_dep": "Institute of Information Science;Advanced Information Science and Network", "aff_unique_url": "http://www.bjtu.edu.cn;", "aff_unique_abbr": "BJTU;", - "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Beijing;", + "aff_campus_unique_index": "", + "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liao_2023_ICCV,\n \n author = {\n Liao,\n Kang and Nie,\n Lang and Lin,\n Chunyu and Zheng,\n Zishuo and Zhao,\n Yao\n},\n title = {\n RecRecNet: Rectangling Rectified Wide-Angle Images by Thin-Plate Spline Model and DoF-based Curriculum Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10800-10809\n} \n}" }, { "title": "Reconciling Object-Level and Global-Level Objectives for Long-Tail Detection", @@ -47283,7 +48868,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Shaoyu and Chen,\n Chen and Peng,\n Silong\n},\n title = {\n Reconciling Object-Level and Global-Level Objectives for Long-Tail Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18982-18992\n} \n}" }, { "title": "Reconstructed Convolution Module Based Look-Up Tables for Efficient Image Super-Resolution", @@ -47315,7 +48901,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Guandu and Ding,\n Yukang and Li,\n Mading and Sun,\n Ming 
and Wen,\n Xing and Wang,\n Bin\n},\n title = {\n Reconstructed Convolution Module Based Look-Up Tables for Efficient Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12217-12226\n} \n}" }, { "title": "Reconstructing Groups of People with Hypergraph Relational Reasoning", @@ -47327,7 +48914,7 @@ "author": "Buzhen Huang; Jingyi Ju; Zhihao Li; Yangang Wang", "abstract": "Due to the mutual occlusion, severe scale variation, and complex spatial distribution, the current multi-person mesh recovery methods cannot produce accurate absolute body poses and shapes in large-scale crowded scenes. To address the obstacles, we fully exploit crowd features for reconstructing groups of people from a monocular image. A novel hypergraph relational reasoning network is proposed to formulate the complex and high-order relation correlations among individuals and groups in the crowd. We first extract compact human features and location information from the original high-resolution image. By conducting the relational reasoning on the extracted individual features, the underlying crowd collectiveness and interaction relationship can provide additional group information for the reconstruction. Finally, the updated individual features and the localization information are used to regress human meshes in camera coordinates. To facilitate the network training, we further build pseudo ground-truth on two crowd datasets, which may also promote future research on pose estimation and human behavior understanding in crowded scenes. The experimental results show that our approach outperforms other baseline methods both in crowded and common scenarios. 
The code and datasets are publicly available at https://github.com/boycehbz/GroupRec.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Huang_Reconstructing_Groups_of_People_with_Hypergraph_Relational_Reasoning_ICCV_2023_paper.pdf", - "aff": "Southeast University, China; Southeast University, China; Huawei Noah\u2019s Ark Lab; Southeast University, China", + "aff": "Southeast University, China; Southeast University, China; Huawei Noah’s Ark Lab; Southeast University, China", "project": "", "github": "https://github.com/boycehbz/GroupRec", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Huang_Reconstructing_Groups_of_ICCV_2023_supplemental.pdf", @@ -47341,13 +48928,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Reconstructing_Groups_of_People_with_Hypergraph_Relational_Reasoning_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Southeast University;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.seu.edu.cn/;https://www.huawei.com", "aff_unique_abbr": "SEU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Buzhen and Ju,\n Jingyi and Li,\n Zhihao and Wang,\n Yangang\n},\n title = {\n Reconstructing Groups of People with Hypergraph Relational Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14873-14883\n} \n}" }, { "title": "Reconstructing Interacting Hands with Interaction Prior from Monocular Images", @@ -47379,7 +48967,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + 
"bibtex": "@InProceedings{Zuo_2023_ICCV,\n \n author = {\n Zuo,\n Binghui and Zhao,\n Zimeng and Sun,\n Wenqian and Xie,\n Wei and Xue,\n Zhou and Wang,\n Yangang\n},\n title = {\n Reconstructing Interacting Hands with Interaction Prior from Monocular Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9054-9064\n} \n}" }, { "title": "Recovering a Molecule's 3D Dynamics from Liquid-phase Electron Microscopy Movies", @@ -47403,15 +48992,16 @@ "email": "stu.pku.edu.cn;dp.tech;pku.edu.cn;pku.edu.cn;ccme.pku.edu.cn;pku.edu.cn", "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Recovering_a_Molecules_3D_Dynamics_from_Liquid-phase_Electron_Microscopy_Movies_ICCV_2023_paper.html", - "aff_unique_index": "0+0;0;0;0+0;0+0", - "aff_unique_norm": "Peking University;", + "aff_unique_index": "0+0;1;0;0;0+0;0+0", + "aff_unique_norm": "Peking University;DP Technology", "aff_unique_dep": "National Biomedical Imaging Center;", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": "0+0;0;0;0+0;0+0", "aff_campus_unique": "Beijing;", - "aff_country_unique_index": "0+0;0;0;0+0;0+0", - "aff_country_unique": "China;" + "aff_country_unique_index": "0+0;0;0;0;0+0;0+0", + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Enze and Wang,\n Yuhang and Zhang,\n Hong and Gao,\n Yiqin and Wang,\n Huan and Sun,\n He\n},\n title = {\n Recovering a Molecule's 3D Dynamics from Liquid-phase Electron Microscopy Movies\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10767-10777\n} \n}" }, { "title": "Recursive Video Lane Detection", @@ -47443,7 +49033,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Jin_2023_ICCV,\n \n author = {\n Jin,\n Dongkwon and Kim,\n Dahyun and Kim,\n Chang-Su\n},\n title = {\n Recursive Video Lane Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8473-8482\n} \n}" }, { "title": "RecursiveDet: End-to-End Region-Based Recursive Object Detection", @@ -47475,7 +49066,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Jing and Sun,\n Li and Li,\n Qingli\n},\n title = {\n RecursiveDet: End-to-End Region-Based Recursive Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6307-6316\n} \n}" }, { "title": "Reducing Training Time in Cross-Silo Federated Learning Using Multigraph Topology", @@ -47507,7 +49099,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;2", - "aff_country_unique": "Singapore;Vietnam;United Kingdom" + "aff_country_unique": "Singapore;Vietnam;United Kingdom", + "bibtex": "@InProceedings{Do_2023_ICCV,\n \n author = {\n Do,\n Tuong and Nguyen,\n Binh X. and Pham,\n Vuong and Tran,\n Toan and Tjiputra,\n Erman and Tran,\n Quang D. 
and Nguyen,\n Anh\n},\n title = {\n Reducing Training Time in Cross-Silo Federated Learning Using Multigraph Topology\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19409-19419\n} \n}" }, { "title": "Ref-NeuS: Ambiguity-Reduced Neural Implicit Surface Learning for Multi-View Reconstruction with Reflection", @@ -47532,14 +49125,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ge_Ref-NeuS_Ambiguity-Reduced_Neural_Implicit_Surface_Learning_for_Multi-View_Reconstruction_with_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;0+0", - "aff_unique_norm": "Hong Kong University of Science and Technology;Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "Hong Kong University of Science and Technology;The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ust.hk;https://www.cuhk.edu.hk;", "aff_unique_abbr": "HKUST;CUHK;", "aff_campus_unique_index": "0;1;0;0+1", "aff_campus_unique": "Guangzhou;Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Ge_2023_ICCV,\n \n author = {\n Ge,\n Wenhang and Hu,\n Tao and Zhao,\n Haoyu and Liu,\n Shu and Chen,\n Ying-Cong\n},\n title = {\n Ref-NeuS: Ambiguity-Reduced Neural Implicit Surface Learning for Multi-View Reconstruction with Reflection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4251-4260\n} \n}" }, { "title": "RefEgo: Referring Expression Comprehension Dataset from First-Person Perception of Ego4D", @@ -47571,7 +49165,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": 
"@InProceedings{Kurita_2023_ICCV,\n \n author = {\n Kurita,\n Shuhei and Katsura,\n Naoki and Onami,\n Eri\n},\n title = {\n RefEgo: Referring Expression Comprehension Dataset from First-Person Perception of Ego4D\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15214-15224\n} \n}" }, { "title": "Reference-guided Controllable Inpainting of Neural Radiance Fields", @@ -47594,7 +49189,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mirzaei_Reference-guided_Controllable_Inpainting_of_Neural_Radiance_Fields_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mirzaei_Reference-guided_Controllable_Inpainting_of_Neural_Radiance_Fields_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Mirzaei_2023_ICCV,\n \n author = {\n Mirzaei,\n Ashkan and Aumentado-Armstrong,\n Tristan and Brubaker,\n Marcus A. and Kelly,\n Jonathan and Levinshtein,\n Alex and Derpanis,\n Konstantinos G. 
and Gilitschenski,\n Igor\n},\n title = {\n Reference-guided Controllable Inpainting of Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17815-17825\n} \n}" }, { "title": "Referring Image Segmentation Using Text Supervision", @@ -47626,7 +49222,8 @@ "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Fang and Liu,\n Yuhao and Kong,\n Yuqiu and Xu,\n Ke and Zhang,\n Lihe and Yin,\n Baocai and Hancke,\n Gerhard and Lau,\n Rynson\n},\n title = {\n Referring Image Segmentation Using Text Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22124-22134\n} \n}" }, { "title": "RegFormer: An Efficient Projection-Aware Transformer Network for Large-Scale Point Cloud Registration", @@ -47651,14 +49248,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_RegFormer_An_Efficient_Projection-Aware_Transformer_Network_for_Large-Scale_Point_Cloud_ICCV_2023_paper.html", "aff_unique_index": "0;0+1+2;0;3;1;0+1+2", - "aff_unique_norm": "Shanghai Jiao Tong University;ETH Zurich;Microsoft;China University of Mining and Technology", - "aff_unique_dep": "Department of Automation;;Microsoft Corporation;", + "aff_unique_norm": "Shanghai Jiao Tong University;ETH Zurich;Microsoft Corporation;China University of Mining and Technology", + "aff_unique_dep": "Department of Automation;;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ethz.ch;https://www.microsoft.com;http://www.cumt.edu.cn/", "aff_unique_abbr": "SJTU;ETHZ;Microsoft;CUMT", - "aff_campus_unique_index": ";", - 
"aff_campus_unique": "", + "aff_campus_unique_index": ";1;", + "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0+1+2;0;0;1;0+1+2", - "aff_country_unique": "China;Switzerland;United States" + "aff_country_unique": "China;Switzerland;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Jiuming and Wang,\n Guangming and Liu,\n Zhe and Jiang,\n Chaokang and Pollefeys,\n Marc and Wang,\n Hesheng\n},\n title = {\n RegFormer: An Efficient Projection-Aware Transformer Network for Large-Scale Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8451-8460\n} \n}" }, { "title": "Regularized Mask Tuning: Uncovering Hidden Knowledge in Pre-Trained Vision-Language Models", @@ -47690,7 +49288,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Kecheng and Wu,\n Wei and Feng,\n Ruili and Zhu,\n Kai and Liu,\n Jiawei and Zhao,\n Deli and Zha,\n Zheng-Jun and Chen,\n Wei and Shen,\n Yujun\n},\n title = {\n Regularized Mask Tuning: Uncovering Hidden Knowledge in Pre-Trained Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11663-11673\n} \n}" }, { "title": "Regularized Primitive Graph Learning for Unified Vector Mapping", @@ -47715,14 +49314,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Regularized_Primitive_Graph_Learning_for_Unified_Vector_Mapping_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Huawei", + "aff_unique_norm": "Huawei Technologies", "aff_unique_dep": "Riemann Lab", "aff_unique_url": 
"https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Lei and Dai,\n Min and He,\n Jianan and Huang,\n Jingwei\n},\n title = {\n Regularized Primitive Graph Learning for Unified Vector Mapping\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16817-16826\n} \n}" }, { "title": "Rehearsal-Free Domain Continual Face Anti-Spoofing: Generalize More and Forget Less", @@ -47747,14 +49347,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_Rehearsal-Free_Domain_Continual_Face_Anti-Spoofing_Generalize_More_and_Forget_Less_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3+4;5;6;0", - "aff_unique_norm": "Nanyang Technological University;University of Oulu;ByteDance;Great Bay University;Great Bay Institute for Advanced Study;City University of Hong Kong;South China University of Technology", + "aff_unique_norm": "Nanyang Technological University;University of Oulu;Bytedance;Great Bay University;Great Bay Institute for Advanced Study;City University of Hong Kong;South China University of Technology", "aff_unique_dep": "Rapid-Rich Object Search (ROSE) Lab;;;School of Computing and Information Technology;;;", "aff_unique_url": "https://www.ntu.edu.sg;https://www.oulu.fi;https://www.bytedance.com;;;https://www.cityu.edu.hk;https://www.scut.edu.cn", "aff_unique_abbr": "NTU;UOulu;Bytedance;;;CityU;SCUT", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;2;3+3;2;2;0", - "aff_country_unique": "Singapore;Finland;China;United States" + "aff_country_unique": "Singapore;Finland;China;United States", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = 
{\n Cai,\n Rizhao and Cui,\n Yawen and Li,\n Zhi and Yu,\n Zitong and Li,\n Haoliang and Hu,\n Yongjian and Kot,\n Alex\n},\n title = {\n Rehearsal-Free Domain Continual Face Anti-Spoofing: Generalize More and Forget Less\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8037-8048\n} \n}" }, { "title": "Reinforce Data, Multiply Impact: Improved Model Accuracy and Robustness with Dataset Reinforcement", @@ -47779,14 +49380,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Faghri_Reinforce_Data_Multiply_Impact_Improved_Model_Accuracy_and_Robustness_with_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0", - "aff_unique_norm": "Apple", - "aff_unique_dep": "Apple Inc.", + "aff_unique_norm": "Apple Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Faghri_2023_ICCV,\n \n author = {\n Faghri,\n Fartash and Pouransari,\n Hadi and Mehta,\n Sachin and Farajtabar,\n Mehrdad and Farhadi,\n Ali and Rastegari,\n Mohammad and Tuzel,\n Oncel\n},\n title = {\n Reinforce Data,\n Multiply Impact: Improved Model Accuracy and Robustness with Dataset Reinforcement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17032-17043\n} \n}" }, { "title": "Reinforced Disentanglement for Face Swapping without Skip Connection", @@ -47818,7 +49420,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ren_2023_ICCV,\n \n author = {\n 
Ren,\n Xiaohang and Chen,\n Xingyu and Yao,\n Pengfei and Shum,\n Heung-Yeung and Wang,\n Baoyuan\n},\n title = {\n Reinforced Disentanglement for Face Swapping without Skip Connection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20665-20675\n} \n}" }, { "title": "Relightify: Relightable 3D Faces from a Single Image via Diffusion Models", @@ -47830,7 +49433,7 @@ "author": "Foivos Paraperas Papantoniou; Alexandros Lattas; Stylianos Moschoglou; Stefanos Zafeiriou", "abstract": "Following the remarkable success of diffusion models on image generation, recent works have also demonstrated their impressive ability to address a number of inverse problems in an unsupervised way, by properly constraining the sampling process based on a conditioning input. Motivated by this, in this paper, we present the first approach to use diffusion models as a prior for highly accurate 3D facial BRDF reconstruction from a single image. We start by leveraging a high-quality UV dataset of facial reflectance (diffuse and specular albedo and normals), which we render under varying illumination settings to simulate natural RGB textures and, then, train an unconditional diffusion model on concatenated pairs of rendered textures and reflectance components. At test time, we fit a 3D morphable model to the given image and unwrap the face in a partial UV texture. By sampling from the diffusion model, while retaining the observed texture part intact, the model inpaints not only the self-occluded areas but also the unknown reflectance components, in a single sequence of denoising steps. In contrast to existing methods, we directly acquire the observed texture from the input image, thus, resulting in more faithful and consistent reflectance estimation. 
Through a series of qualitative and quantitative comparisons, we demonstrate superior performance in both texture completion as well as reflectance reconstruction tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Papantoniou_Relightify_Relightable_3D_Faces_from_a_Single_Image_via_Diffusion_ICCV_2023_paper.pdf", - "aff": "Imperial College London+Huawei Noah\u2019s Ark Lab; Imperial College London+Huawei Noah\u2019s Ark Lab; Imperial College London+Huawei Noah\u2019s Ark Lab; Imperial College London+Huawei Noah\u2019s Ark Lab", + "aff": "Imperial College London+Huawei Noah’s Ark Lab; Imperial College London+Huawei Noah’s Ark Lab; Imperial College London+Huawei Noah’s Ark Lab; Imperial College London+Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Papantoniou_Relightify_Relightable_3D_ICCV_2023_supplemental.pdf", @@ -47844,13 +49447,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Papantoniou_Relightify_Relightable_3D_Faces_from_a_Single_Image_via_Diffusion_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;0+1", "aff_unique_norm": "Imperial College London;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.imperial.ac.uk;https://www.huawei.com", "aff_unique_abbr": "ICL;Huawei", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0+1;0+1", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Papantoniou_2023_ICCV,\n \n author = {\n Papantoniou,\n Foivos Paraperas and Lattas,\n Alexandros and Moschoglou,\n Stylianos and Zafeiriou,\n Stefanos\n},\n title = {\n Relightify: Relightable 3D Faces from a Single Image via Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 8806-8817\n} \n}" }, { "title": "Remembering Normality: Memory-guided Knowledge Distillation for Unsupervised Anomaly Detection", @@ -47882,7 +49486,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gu_2023_ICCV,\n \n author = {\n Gu,\n Zhihao and Liu,\n Liang and Chen,\n Xu and Yi,\n Ran and Zhang,\n Jiangning and Wang,\n Yabiao and Wang,\n Chengjie and Shu,\n Annan and Jiang,\n Guannan and Ma,\n Lizhuang\n},\n title = {\n Remembering Normality: Memory-guided Knowledge Distillation for Unsupervised Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16401-16409\n} \n}" }, { "title": "Removing Anomalies as Noises for Industrial Defect Localization", @@ -47907,14 +49512,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lu_Removing_Anomalies_as_Noises_for_Industrial_Defect_Localization_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;SmartMore", + "aff_unique_norm": "The Chinese University of Hong Kong;SmartMore", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;", "aff_unique_abbr": "CUHK;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Fanbin and Yao,\n Xufeng and Fu,\n Chi-Wing and Jia,\n Jiaya\n},\n title = {\n Removing Anomalies as Noises for Industrial Defect Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
16166-16175\n} \n}" }, { "title": "RenderIH: A Large-Scale Synthetic Dataset for 3D Interacting Hand Pose Estimation", @@ -47946,7 +49552,8 @@ "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Shenzhen;Orlando", "aff_country_unique_index": "0+0;0;0;0;0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Lijun and Tian,\n Linrui and Zhang,\n Xindi and Wang,\n Qi and Zhang,\n Bang and Bo,\n Liefeng and Liu,\n Mengyuan and Chen,\n Chen\n},\n title = {\n RenderIH: A Large-Scale Synthetic Dataset for 3D Interacting Hand Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20395-20405\n} \n}" }, { "title": "Rendering Humans from Object-Occluded Monocular Videos", @@ -47978,7 +49585,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xiang_2023_ICCV,\n \n author = {\n Xiang,\n Tiange and Sun,\n Adam and Wu,\n Jiajun and Adeli,\n Ehsan and Fei-Fei,\n Li\n},\n title = {\n Rendering Humans from Object-Occluded Monocular Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3239-3250\n} \n}" }, { "title": "RepQ-ViT: Scale Reparameterization for Post-Training Quantization of Vision Transformers", @@ -48010,7 +49618,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zhikai and Xiao,\n Junrui and Yang,\n Lianwei and Gu,\n Qingyi\n},\n title = {\n RepQ-ViT: 
Scale Reparameterization for Post-Training Quantization of Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17227-17236\n} \n}" }, { "title": "Replay: Multi-modal Multi-view Acted Videos for Casual Holography", @@ -48035,14 +49644,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shapovalov_Replay_Multi-modal_Multi-view_Acted_Videos_for_Casual_Holography_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1;0;1;0;0;0", - "aff_unique_norm": "Meta;University of Texas at Austin", - "aff_unique_dep": "Meta Platforms, Inc.;", + "aff_unique_norm": "Meta Platforms, Inc.;University of Texas at Austin", + "aff_unique_dep": ";", "aff_unique_url": "https://meta.com;https://www.utexas.edu", "aff_unique_abbr": "Meta;UT Austin", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shapovalov_2023_ICCV,\n \n author = {\n Shapovalov,\n Roman and Kleiman,\n Yanir and Rocco,\n Ignacio and Novotny,\n David and Vedaldi,\n Andrea and Chen,\n Changan and Kokkinos,\n Filippos and Graham,\n Ben and Neverova,\n Natalia\n},\n title = {\n Replay: Multi-modal Multi-view Acted Videos for Casual Holography\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20338-20348\n} \n}" }, { "title": "Representation Disparity-aware Distillation for 3D Object Detection", @@ -48067,14 +49677,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Representation_Disparity-aware_Distillation_for_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0+2+3;0", - "aff_unique_norm": "Beihang 
University;Tencent;Zhongguancun Laboratory;Nanchang Institute of Technology", - "aff_unique_dep": ";Tencent Holdings Limited;;", + "aff_unique_norm": "Beihang University;Tencent Holdings Limited;Zhongguancun Laboratory;Nanchang Institute of Technology", + "aff_unique_dep": ";;;", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.tencent.com;;http://www.njit.edu.cn", "aff_unique_abbr": "BUAA;Tencent;;NIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yanjing and Xu,\n Sheng and Lin,\n Mingbao and Yin,\n Jihao and Zhang,\n Baochang and Cao,\n Xianbin\n},\n title = {\n Representation Disparity-aware Distillation for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6715-6724\n} \n}" }, { "title": "Representation Uncertainty in Self-Supervised Learning as Variational Inference", @@ -48106,7 +49717,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Nakamura_2023_ICCV,\n \n author = {\n Nakamura,\n Hiroki and Okada,\n Masashi and Taniguchi,\n Tadahiro\n},\n title = {\n Representation Uncertainty in Self-Supervised Learning as Variational Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16484-16493\n} \n}" }, { "title": "ResQ: Residual Quantization for Video Perception", @@ -48138,7 +49750,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": 
"@InProceedings{Abati_2023_ICCV,\n \n author = {\n Abati,\n Davide and Ben Yahia,\n Haitam and Nagel,\n Markus and Habibian,\n Amirhossein\n},\n title = {\n ResQ: Residual Quantization for Video Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17119-17129\n} \n}" }, { "title": "Residual Pattern Learning for Pixel-Wise Out-of-Distribution Detection in Semantic Segmentation", @@ -48161,7 +49774,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Residual_Pattern_Learning_for_Pixel-Wise_Out-of-Distribution_Detection_in_Semantic_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Residual_Pattern_Learning_for_Pixel-Wise_Out-of-Distribution_Detection_in_Semantic_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yuyuan and Ding,\n Choubo and Tian,\n Yu and Pang,\n Guansong and Belagiannis,\n Vasileios and Reid,\n Ian and Carneiro,\n Gustavo\n},\n title = {\n Residual Pattern Learning for Pixel-Wise Out-of-Distribution Detection in Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1151-1161\n} \n}" }, { "title": "Rethinking Amodal Video Segmentation from Learning Supervised Signals with Object-centric Representation", @@ -48186,14 +49800,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fan_Rethinking_Amodal_Video_Segmentation_from_Learning_Supervised_Signals_with_Object-centric_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;0;1;1;1;0", - "aff_unique_norm": "Fudan University;Amazon", - "aff_unique_dep": ";Amazon Web Services", + "aff_unique_norm": "Fudan University;Amazon Web Services", + 
"aff_unique_dep": ";", "aff_unique_url": "https://www.fudan.edu.cn;https://aws.amazon.com", "aff_unique_abbr": "Fudan;AWS", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;0;0;1;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Ke and Lei,\n Jingshi and Qian,\n Xuelin and Yu,\n Miaopeng and Xiao,\n Tianjun and He,\n Tong and Zhang,\n Zheng and Fu,\n Yanwei\n},\n title = {\n Rethinking Amodal Video Segmentation from Learning Supervised Signals with Object-centric Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1272-1281\n} \n}" }, { "title": "Rethinking Data Distillation: Do Not Overlook Calibration", @@ -48225,7 +49840,8 @@ "aff_campus_unique_index": "0;2", "aff_campus_unique": "San Diego;;College Park", "aff_country_unique_index": "0;0;1;0;0;0;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Dongyao and Lei,\n Bowen and Zhang,\n Jie and Fang,\n Yanbo and Xie,\n Yiqun and Zhang,\n Ruqi and Xu,\n Dongkuan\n},\n title = {\n Rethinking Data Distillation: Do Not Overlook Calibration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4935-4945\n} \n}" }, { "title": "Rethinking Fast Fourier Convolution in Image Inpainting", @@ -48252,12 +49868,13 @@ "aff_unique_index": "0;0;0;0;0;1;0;0;0", "aff_unique_norm": "Zhejiang University;Zhejiang Gongshang University", "aff_unique_dep": "College of Computer Science and Technology;", - "aff_unique_url": "http://www.zju.edu.cn;http://www.hzic.edu.cn", - "aff_unique_abbr": "ZJU;ZJGSU", + "aff_unique_url": 
"http://www.zju.edu.cn;http://www.hgh.edu.cn", + "aff_unique_abbr": "ZJU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chu_2023_ICCV,\n \n author = {\n Chu,\n Tianyi and Chen,\n Jiafu and Sun,\n Jiakai and Lian,\n Shuobin and Wang,\n Zhizhong and Zuo,\n Zhiwen and Zhao,\n Lei and Xing,\n Wei and Lu,\n Dongming\n},\n title = {\n Rethinking Fast Fourier Convolution in Image Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23195-23205\n} \n}" }, { "title": "Rethinking Mobile Block for Efficient Attention-based Models", @@ -48289,7 +49906,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jiangning and Li,\n Xiangtai and Li,\n Jian and Liu,\n Liang and Xue,\n Zhucun and Zhang,\n Boshen and Jiang,\n Zhengkai and Huang,\n Tianxin and Wang,\n Yabiao and Wang,\n Chengjie\n},\n title = {\n Rethinking Mobile Block for Efficient Attention-based Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1389-1400\n} \n}" }, { "title": "Rethinking Multi-Contrast MRI Super-Resolution: Rectangle-Window Cross-Attention Transformer and Arbitrary-Scale Upsampling", @@ -48321,7 +49939,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Guangyuan and Zhao,\n Lei and Sun,\n Jiakai and Lan,\n Zehua and Zhang,\n 
Zhanjie and Chen,\n Jiafu and Lin,\n Zhijie and Lin,\n Huaizhong and Xing,\n Wei\n},\n title = {\n Rethinking Multi-Contrast MRI Super-Resolution: Rectangle-Window Cross-Attention Transformer and Arbitrary-Scale Upsampling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21230-21240\n} \n}" }, { "title": "Rethinking Point Cloud Registration as Masking and Reconstruction", @@ -48353,7 +49972,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Guangyan and Wang,\n Meiling and Yuan,\n Li and Yang,\n Yi and Yue,\n Yufeng\n},\n title = {\n Rethinking Point Cloud Registration as Masking and Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17717-17727\n} \n}" }, { "title": "Rethinking Pose Estimation in Crowds: Overcoming the Detection Information Bottleneck and Ambiguity", @@ -48365,7 +49985,7 @@ "author": "Mu Zhou; Lucas Stoffl; Mackenzie Weygandt Mathis; Alexander Mathis", "abstract": "Frequent interactions between individuals are a fundamental challenge for pose estimation algorithms. Current pipelines either use an object detector together with a pose estimator (top-down approach), or localize all body parts first and then link them to predict the pose of individuals (bottom-up). Yet, when individuals closely interact, top-down methods are ill-defined due to overlapping individuals, and bottom-up methods often falsely infer connections to distant bodyparts. Thus, we propose a novel pipeline called bottom-up conditioned top-down pose estimation (BUCTD) that combines the strengths of bottom-up and top-down methods. 
Specifically, we propose to use a bottom-up model as the detector, which in addition to an estimated bounding box provides a pose proposal that is fed as condition to an attention-based top-down model. We demonstrate the performance and efficiency of our approach on animal and human pose estimation benchmarks. On CrowdPose and OCHuman, we outperform previous state-of-the-art models by a significant margin. We achieve 78.5 AP on CrowdPose and 48.5 AP on OCHuman, an improvement of 8.6% and 7.8% over the prior art, respectively. Furthermore, we show that our method strongly improves the performance on multi-animal benchmarks involving fish and monkeys. The code is available at https://github.com/amathislab/BUCTD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhou_Rethinking_Pose_Estimation_in_Crowds_Overcoming_the_Detection_Information_Bottleneck_ICCV_2023_paper.pdf", - "aff": "\u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL); \u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL); \u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL); \u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL)", + "aff": "´Ecole Polytechnique F´ed´erale de Lausanne (EPFL); ´Ecole Polytechnique F´ed´erale de Lausanne (EPFL); ´Ecole Polytechnique F´ed´erale de Lausanne (EPFL); ´Ecole Polytechnique F´ed´erale de Lausanne (EPFL)", "project": "", "github": "https://github.com/amathislab/BUCTD", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhou_Rethinking_Pose_Estimation_ICCV_2023_supplemental.pdf", @@ -48378,14 +49998,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_Rethinking_Pose_Estimation_in_Crowds_Overcoming_the_Detection_Information_Bottleneck_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "Ecole Polytechnique Fédérale de Lausanne", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", 
"aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Mu and Stoffl,\n Lucas and Mathis,\n Mackenzie Weygandt and Mathis,\n Alexander\n},\n title = {\n Rethinking Pose Estimation in Crowds: Overcoming the Detection Information Bottleneck and Ambiguity\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14689-14699\n} \n}" }, { "title": "Rethinking Range View Representation for LiDAR Segmentation", @@ -48408,7 +50029,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kong_Rethinking_Range_View_Representation_for_LiDAR_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kong_Rethinking_Range_View_Representation_for_LiDAR_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kong_2023_ICCV,\n \n author = {\n Kong,\n Lingdong and Liu,\n Youquan and Chen,\n Runnan and Ma,\n Yuexin and Zhu,\n Xinge and Li,\n Yikang and Hou,\n Yuenan and Qiao,\n Yu and Liu,\n Ziwei\n},\n title = {\n Rethinking Range View Representation for LiDAR Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 228-240\n} \n}" }, { "title": "Rethinking Safe Semi-supervised Learning: Transferring the Open-set Problem to A Close-set One", @@ -48440,7 +50062,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Chengdu", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n 
Qiankun and Gao,\n Jiyao and Zhan,\n Bo and Guo,\n Yunpeng and Zhou,\n Jiliu and Wang,\n Yan\n},\n title = {\n Rethinking Safe Semi-supervised Learning: Transferring the Open-set Problem to A Close-set One\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16370-16379\n} \n}" }, { "title": "Rethinking Video Frame Interpolation from Shutter Mode Induced Degradation", @@ -48465,14 +50088,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ji_Rethinking_Video_Frame_Interpolation_from_Shutter_Mode_Induced_Degradation_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;0", - "aff_unique_norm": "University of Tokyo;National Institute of Informatics", + "aff_unique_norm": "The University of Tokyo;National Institute of Informatics", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.nii.ac.jp", "aff_unique_abbr": "UTokyo;NII", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Ji_2023_ICCV,\n \n author = {\n Ji,\n Xiang and Wang,\n Zhixiang and Zhong,\n Zhihang and Zheng,\n Yinqiang\n},\n title = {\n Rethinking Video Frame Interpolation from Shutter Mode Induced Degradation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12259-12268\n} \n}" }, { "title": "Rethinking Vision Transformers for MobileNet Size and Speed", @@ -48504,7 +50128,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yanyu and Hu,\n Ju and Wen,\n Yang and 
Evangelidis,\n Georgios and Salahi,\n Kamyar and Wang,\n Yanzhi and Tulyakov,\n Sergey and Ren,\n Jian\n},\n title = {\n Rethinking Vision Transformers for MobileNet Size and Speed\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16889-16900\n} \n}" }, { "title": "Rethinking the Role of Pre-Trained Networks in Source-Free Domain Adaptation", @@ -48528,15 +50153,16 @@ "email": ";;", "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Rethinking_the_Role_of_Pre-Trained_Networks_in_Source-Free_Domain_Adaptation_ICCV_2023_paper.html", - "aff_unique_index": "1", - "aff_unique_norm": ";Agency for Science, Technology and Research", - "aff_unique_dep": ";Centre for Frontier AI Research", - "aff_unique_url": ";https://www.a-star.edu.sg", - "aff_unique_abbr": ";A*STAR", + "aff_unique_index": "0;0;0", + "aff_unique_norm": "Agency for Science, Technology and Research", + "aff_unique_dep": "Institute for Infocomm Research", + "aff_unique_url": "https://www.a-star.edu.sg", + "aff_unique_abbr": "A*STAR", "aff_campus_unique_index": "", "aff_campus_unique": "", - "aff_country_unique_index": "1", - "aff_country_unique": ";Singapore" + "aff_country_unique_index": "0;0;0", + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Wenyu and Shen,\n Li and Foo,\n Chuan-Sheng\n},\n title = {\n Rethinking the Role of Pre-Trained Networks in Source-Free Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18841-18851\n} \n}" }, { "title": "Retinexformer: One-stage Retinex-based Transformer for Low-light Image Enhancement", @@ -48548,7 +50174,7 @@ "author": "Yuanhao Cai; Hao Bian; Jing Lin; Haoqian Wang; Radu Timofte; Yulun Zhang", "abstract": "When enhancing 
low-light images, many deep learning algorithms are based on the Retinex theory. However, the Retinex model does not consider the corruptions hidden in the dark or introduced by the light-up process. Besides, these methods usually require a tedious multi-stage training pipeline and rely on convolutional neural networks, showing limitations in capturing long-range dependencies. In this paper, we formulate a simple yet principled One-stage Retinex-based Framework (ORF). ORF first estimates the illumination information to light up the low-light image and then restores the corruption to produce the enhanced image. We design an Illumination-Guided Transformer (IGT) that utilizes illumination representations to direct the modeling of non-local interactions of regions with different lighting conditions. By plugging IGT into ORF, we obtain our algorithm, Retinexformer. Comprehensive quantitative and qualitative experiments demonstrate that our Retinexformer significantly outperforms state-of-the-art methods on thirteen benchmarks. The user study and application on low-light object detection also reveal the latent practical values of our method. 
Code is available at https://github.com/caiyuanhao1998/Retinexformer", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Cai_Retinexformer_One-stage_Retinex-based_Transformer_for_Low-light_Image_Enhancement_ICCV_2023_paper.pdf", - "aff": "Tsinghua University; Tsinghua University; Tsinghua University; Tsinghua University+ETH Z\u00fcrich; University of W\u00fcrzburg; ETH Z\u00fcrich", + "aff": "Tsinghua University; Tsinghua University; Tsinghua University; Tsinghua University+ETH Zürich; University of Würzburg; ETH Zürich", "project": "", "github": "https://github.com/caiyuanhao1998/Retinexformer", "supp": "", @@ -48561,14 +50187,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cai_Retinexformer_One-stage_Retinex-based_Transformer_for_Low-light_Image_Enhancement_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1;2;1", - "aff_unique_norm": "Tsinghua University;ETH Zurich;University of W\u00fcrzburg", + "aff_unique_norm": "Tsinghua University;ETH Zürich;University of Würzburg", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ethz.ch;https://www.uni-wuerzburg.de", "aff_unique_abbr": "THU;ETHZ;UWue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1;2;1", - "aff_country_unique": "China;Switzerland;Germany" + "aff_country_unique": "China;Switzerland;Germany", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Yuanhao and Bian,\n Hao and Lin,\n Jing and Wang,\n Haoqian and Timofte,\n Radu and Zhang,\n Yulun\n},\n title = {\n Retinexformer: One-stage Retinex-based Transformer for Low-light Image Enhancement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12504-12513\n} \n}" }, { "title": "Retro-FPN: Retrospective Feature Pyramid Network for Point Cloud Semantic Segmentation", @@ -48600,7 +50227,8 @@ 
"aff_campus_unique_index": "1;1", "aff_campus_unique": ";Abu Dhabi", "aff_country_unique_index": "0;0;0+1;0;1;2", - "aff_country_unique": "China;United Arab Emirates;United States" + "aff_country_unique": "China;United Arab Emirates;United States", + "bibtex": "@InProceedings{Xiang_2023_ICCV,\n \n author = {\n Xiang,\n Peng and Wen,\n Xin and Liu,\n Yu-Shen and Zhang,\n Hui and Fang,\n Yi and Han,\n Zhizhong\n},\n title = {\n Retro-FPN: Retrospective Feature Pyramid Network for Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17826-17838\n} \n}" }, { "title": "Revisit PCA-based Technique for Out-of-Distribution Detection", @@ -48625,14 +50253,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guan_Revisit_PCA-based_Technique_for_Out-of-Distribution_Detection_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;0;0+2+1", - "aff_unique_norm": "Sun Yat-sen University;3Key Laboratory of Machine Intelligence and Advanced Computing;Pengcheng Laboratory", - "aff_unique_dep": "School of Computer Science and Engineering;MOE;Peng Cheng Laboratory", + "aff_unique_norm": "Sun Yat-sen University;3Key Laboratory of Machine Intelligence and Advanced Computing;Peng Cheng Laboratory", + "aff_unique_dep": "School of Computer Science and Engineering;MOE;", "aff_unique_url": "http://www.sysu.edu.cn;;", "aff_unique_abbr": "SYSU;;", "aff_campus_unique_index": "0;0;0;0;0+2", "aff_campus_unique": "Guangzhou;;Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guan_2023_ICCV,\n \n author = {\n Guan,\n Xiaoyuan and Liu,\n Zhouwu and Zheng,\n Wei-Shi and Zhou,\n Yuren and Wang,\n Ruixuan\n},\n title = {\n Revisit PCA-based Technique for Out-of-Distribution Detection\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19431-19439\n} \n}" }, { "title": "Revisiting Domain-Adaptive 3D Object Detection by Reliable, Diverse and Class-balanced Pseudo-Labeling", @@ -48657,14 +50286,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Revisiting_Domain-Adaptive_3D_Object_Detection_by_Reliable_Diverse_and_Class-balanced_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "University of Queensland;University of Electronic Science and Technology of China", + "aff_unique_norm": "The University of Queensland;University of Electronic Science and Technology of China", "aff_unique_dep": ";", "aff_unique_url": "https://www.uq.edu.au;https://www.uestc.edu.cn", "aff_unique_abbr": "UQ;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zhuoxiao and Luo,\n Yadan and Wang,\n Zheng and Baktashmotlagh,\n Mahsa and Huang,\n Zi\n},\n title = {\n Revisiting Domain-Adaptive 3D Object Detection by Reliable,\n Diverse and Class-balanced Pseudo-Labeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3714-3726\n} \n}" }, { "title": "Revisiting Foreground and Background Separation in Weakly-supervised Temporal Action Localization: A Clustering-based Approach", @@ -48696,7 +50326,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Qinying and Wang,\n Zilei and Rong,\n Shenghai and Li,\n Junjie and Zhang,\n Yixin\n},\n 
title = {\n Revisiting Foreground and Background Separation in Weakly-supervised Temporal Action Localization: A Clustering-based Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10433-10443\n} \n}" }, { "title": "Revisiting Scene Text Recognition: A Data Perspective", @@ -48728,7 +50359,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Qing and Wang,\n Jiapeng and Peng,\n Dezhi and Liu,\n Chongyu and Jin,\n Lianwen\n},\n title = {\n Revisiting Scene Text Recognition: A Data Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20543-20554\n} \n}" }, { "title": "Revisiting Vision Transformer from the View of Path Ensemble", @@ -48753,14 +50385,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chang_Revisiting_Vision_Transformer_from_the_View_of_Path_Ensemble_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;1;1;0", - "aff_unique_norm": "National University of Singapore;Alibaba Group;Amazon", - "aff_unique_dep": "Show Lab;;Amazon.com, Inc.", + "aff_unique_norm": "National University of Singapore;Alibaba Group;Amazon.com, Inc.", + "aff_unique_dep": "Show Lab;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.alibaba.com;https://www.amazon.com", "aff_unique_abbr": "NUS;Alibaba;Amazon", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;1+2;1;1;0", - "aff_country_unique": "Singapore;China;United States" + "aff_country_unique": "Singapore;China;United States", + "bibtex": "@InProceedings{Chang_2023_ICCV,\n \n author = {\n Chang,\n Shuning and Wang,\n 
Pichao and Luo,\n Hao and Wang,\n Fan and Shou,\n Mike Zheng\n},\n title = {\n Revisiting Vision Transformer from the View of Path Ensemble\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19889-19899\n} \n}" }, { "title": "Revisiting the Parameter Efficiency of Adapters from the Perspective of Precision Redundancy", @@ -48792,7 +50425,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jie_2023_ICCV,\n \n author = {\n Jie,\n Shibo and Wang,\n Haoqing and Deng,\n Zhi-Hong\n},\n title = {\n Revisiting the Parameter Efficiency of Adapters from the Perspective of Precision Redundancy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17217-17226\n} \n}" }, { "title": "Rickrolling the Artist: Injecting Backdoors into Text Encoders for Text-to-Image Synthesis", @@ -48824,7 +50458,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0+0", - "aff_country_unique": "Germany;" + "aff_country_unique": "Germany;", + "bibtex": "@InProceedings{Struppek_2023_ICCV,\n \n author = {\n Struppek,\n Lukas and Hintersdorf,\n Dominik and Kersting,\n Kristian\n},\n title = {\n Rickrolling the Artist: Injecting Backdoors into Text Encoders for Text-to-Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4584-4596\n} \n}" }, { "title": "Robo3D: Towards Robust and Reliable 3D Perception against Corruptions", @@ -48847,7 +50482,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Kong_Robo3D_Towards_Robust_and_Reliable_3D_Perception_against_Corruptions_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kong_Robo3D_Towards_Robust_and_Reliable_3D_Perception_against_Corruptions_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kong_2023_ICCV,\n \n author = {\n Kong,\n Lingdong and Liu,\n Youquan and Li,\n Xin and Chen,\n Runnan and Zhang,\n Wenwei and Ren,\n Jiawei and Pan,\n Liang and Chen,\n Kai and Liu,\n Ziwei\n},\n title = {\n Robo3D: Towards Robust and Reliable 3D Perception against Corruptions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19994-20006\n} \n}" }, { "title": "Robust Evaluation of Diffusion-Based Adversarial Purification", @@ -48879,7 +50515,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Minjong and Kim,\n Dongwoo\n},\n title = {\n Robust Evaluation of Diffusion-Based Adversarial Purification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 134-144\n} \n}" }, { "title": "Robust Frame-to-Frame Camera Rotation Estimation in Crowded Scenes", @@ -48911,7 +50548,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Delattre_2023_ICCV,\n \n author = {\n Delattre,\n Fabien and Dirnfeld,\n David and Nguyen,\n Phat and Scarano,\n Stephen K and Jones,\n Michael J and Miraldo,\n Pedro and Learned-Miller,\n Erik\n},\n title = {\n Robust 
Frame-to-Frame Camera Rotation Estimation in Crowded Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9752-9762\n} \n}" }, { "title": "Robust Geometry-Preserving Depth Estimation Using Differentiable Rendering", @@ -48923,7 +50561,7 @@ "author": "Chi Zhang; Wei Yin; Gang Yu; Zhibin Wang; Tao Chen; Bin Fu; Joey Tianyi Zhou; Chunhua Shen", "abstract": "In this study, we address the challenge of 3D scene structure recovery from monocular depth estimation. While traditional depth estimation methods leverage labeled datasets to directly predict absolute depth, recent advancements advocate for mix-dataset training, enhancing generalization across diverse scenes. However, such mixed dataset training yields depth predictions only up to an unknown scale and shift, hindering accurate 3D reconstructions. Existing solutions necessitate extra 3D datasets or geometry-complete depth annotations, constraints that limit their versatility. In this paper, we propose a learning framework that trains models to predict geometry-preserving depth without requiring extra data or annotations. To produce realistic 3D structures, we render novel views of the reconstructed scenes and design loss functions to promote depth estimation consistency across different views. Comprehensive experiments underscore our framework's superior generalization capabilities, surpassing existing state-of-the-art methods on several benchmark datasets without leveraging extra training information. 
Moreover, our innovative loss functions empower the model to autonomously recover domain-specific scale-and-shift coefficients using solely unlabeled images.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_Robust_Geometry-Preserving_Depth_Estimation_Using_Differentiable_Rendering_ICCV_2023_paper.pdf", - "aff": "Tencent; DJI Technology; Tencent + Centre for Frontier AI Research, A\u2217STAR + Institute of High Performance Computing, A\u2217STAR; Tencent; Fudan University; Tencent; Centre for Frontier AI Research, A\u2217STAR + Institute of High Performance Computing, A\u2217STAR; Zhejiang University", + "aff": "Tencent; DJI Technology; Tencent + Centre for Frontier AI Research, A∗STAR + Institute of High Performance Computing, A∗STAR; Tencent; Fudan University; Tencent; Centre for Frontier AI Research, A∗STAR + Institute of High Performance Computing, A∗STAR; Zhejiang University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhang_Robust_Geometry-Preserving_Depth_ICCV_2023_supplemental.zip", @@ -48936,14 +50574,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Robust_Geometry-Preserving_Depth_Estimation_Using_Differentiable_Rendering_ICCV_2023_paper.html", "aff_unique_index": "0;1;0+2+3;0;4;0;2+3;5", - "aff_unique_norm": "Tencent;DJI Technology;A*STAR;A*STAR Institute of High Performance Computing;Fudan University;Zhejiang University", - "aff_unique_dep": "Tencent Holdings Limited;;Centre for Frontier AI Research;Institute of High Performance Computing;;", + "aff_unique_norm": "Tencent Holdings Limited;DJI Technology;A*STAR;A*STAR Institute of High Performance Computing;Fudan University;Zhejiang University", + "aff_unique_dep": ";;Centre for Frontier AI Research;Institute of High Performance Computing;;", "aff_unique_url": 
"https://www.tencent.com;https://www.dji.com;https://www.a-star.edu.sg;https://www.ihpc.a-star.edu.sg;https://www.fudan.edu.cn;https://www.zju.edu.cn", "aff_unique_abbr": "Tencent;DJI;A*STAR;IHPC;Fudan;ZJU", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1+1;0;0;0;1+1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Chi and Yin,\n Wei and Yu,\n Gang and Wang,\n Zhibin and Chen,\n Tao and Fu,\n Bin and Zhou,\n Joey Tianyi and Shen,\n Chunhua\n},\n title = {\n Robust Geometry-Preserving Depth Estimation Using Differentiable Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8951-8961\n} \n}" }, { "title": "Robust Heterogeneous Federated Learning under Data Corruption", @@ -48975,7 +50614,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Xiuwen and Ye,\n Mang and Yang,\n Xiyuan\n},\n title = {\n Robust Heterogeneous Federated Learning under Data Corruption\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5020-5030\n} \n}" }, { "title": "Robust Mixture-of-Expert Training for Convolutional Neural Networks", @@ -48998,7 +50638,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Robust_Mixture-of-Expert_Training_for_Convolutional_Neural_Networks_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Robust_Mixture-of-Expert_Training_for_Convolutional_Neural_Networks_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yihua and Cai,\n Ruisi and Chen,\n Tianlong and Zhang,\n Guanhua and Zhang,\n Huan and Chen,\n Pin-Yu and Chang,\n Shiyu and Wang,\n Zhangyang and Liu,\n Sijia\n},\n title = {\n Robust Mixture-of-Expert Training for Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 90-101\n} \n}" }, { "id": "5b5decee16", @@ -49020,13 +50661,14 @@ "author_num": 5, "aff_unique_index": "0;0;0;0;0+1", "aff_unique_norm": "Technical University of Munich;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.google.com", "aff_unique_abbr": "TUM;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Gasperini_2023_ICCV,\n \n author = {\n Gasperini,\n Stefano and Morbitzer,\n Nils and Jung,\n HyunJun and Navab,\n Nassir and Tombari,\n Federico\n},\n title = {\n Robust Monocular Depth Estimation under Challenging Conditions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8177-8186\n} \n}" }, { "title": "Robust Object Modeling for Visual Tracking", @@ -49058,7 +50700,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cai_2023_ICCV,\n \n author = {\n Cai,\n Yidong and Liu,\n Jie and Tang,\n Jie and Wu,\n Gangshan\n},\n title = 
{\n Robust Object Modeling for Visual Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9589-9600\n} \n}" }, { "title": "Robust One-Shot Face Video Re-enactment using Hybrid Latent Spaces of StyleGAN2", @@ -49090,7 +50733,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Oorloff_2023_ICCV,\n \n author = {\n Oorloff,\n Trevine and Yacoob,\n Yaser\n},\n title = {\n Robust One-Shot Face Video Re-enactment using Hybrid Latent Spaces of StyleGAN2\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20947-20957\n} \n}" }, { "title": "Robust Referring Video Object Segmentation with Cyclic Structural Consensus", @@ -49115,14 +50759,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Robust_Referring_Video_Object_Segmentation_with_Cyclic_Structural_Consensus_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;0+3;1", - "aff_unique_norm": "Carnegie Mellon University;Microsoft;University of Michigan;Mohamed bin Zayed University of Artificial Intelligence", + "aff_unique_norm": "Carnegie Mellon University;Microsoft Research;University of Michigan;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";Research;;", - "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com/en-us/research/group/asia;https://www.umich.edu;https://mbzuai.ac.ae", + "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com/en-us/research/group/asia;https://www.umich.edu;https://www.mbzuai.ac.ae", "aff_unique_abbr": "CMU;MSR Asia;UM;MBZUAI", "aff_campus_unique_index": "1;1;;1", "aff_campus_unique": ";Asia", 
"aff_country_unique_index": "0;1;0;1;0+2;1", - "aff_country_unique": "United States;China;United Arab Emirates" + "aff_country_unique": "United States;China;United Arab Emirates", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiang and Wang,\n Jinglu and Xu,\n Xiaohao and Li,\n Xiao and Raj,\n Bhiksha and Lu,\n Yan\n},\n title = {\n Robust Referring Video Object Segmentation with Cyclic Structural Consensus\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22236-22245\n} \n}" }, { "title": "Robust e-NeRF: NeRF from Sparse & Noisy Events under Non-Uniform Motion", @@ -49134,7 +50779,7 @@ "author": "Weng Fei Low; Gim Hee Lee", "abstract": "Event cameras offer many advantages over standard cameras due to their distinctive principle of operation: low power, low latency, high temporal resolution and high dynamic range. Nonetheless, the success of many downstream visual applications also hinges on an efficient and effective scene representation, where Neural Radiance Field (NeRF) is seen as the leading candidate. Such promise and potential of event cameras and NeRF inspired recent works to investigate on the reconstruction of NeRF from moving event cameras. However, these works are mainly limited in terms of the dependence on dense and low-noise event streams, as well as generalization to arbitrary contrast threshold values and camera speed profiles. In this work, we propose Robust e-NeRF, a novel method to directly and robustly reconstruct NeRFs from moving event cameras under various real-world conditions, especially from sparse and noisy events generated under non-uniform motion. It consists of two key components: a realistic event generation model that accounts for various intrinsic parameters (e.g. time-independent, asymmetric threshold and refractory period) and non-idealities (e.g. 
pixel-to-pixel threshold variation), as well as a complementary pair of normalized reconstruction losses that can effectively generalize to arbitrary speed profiles and intrinsic parameter values without such prior knowledge. Experiments on real and novel realistically simulated sequences verify our effectiveness. Our code, synthetic dataset and improved event simulator are public.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Low_Robust_e-NeRF_NeRF_from_Sparse__Noisy_Events_under_Non-Uniform_ICCV_2023_paper.pdf", - "aff": "The NUS Graduate School\u2019s Integrative Sciences and Engineering Programme (ISEP) + Institute of Data Science (IDS), National University of Singapore + Department of Computer Science, National University of Singapore; The NUS Graduate School\u2019s Integrative Sciences and Engineering Programme (ISEP) + Institute of Data Science (IDS), National University of Singapore + Department of Computer Science, National University of Singapore", + "aff": "The NUS Graduate School’s Integrative Sciences and Engineering Programme (ISEP) + Institute of Data Science (IDS), National University of Singapore + Department of Computer Science, National University of Singapore; The NUS Graduate School’s Integrative Sciences and Engineering Programme (ISEP) + Institute of Data Science (IDS), National University of Singapore + Department of Computer Science, National University of Singapore", "project": "", "github": "https://wengflow.github.io/robust-e-nerf", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Low_Robust_e-NeRF_NeRF_ICCV_2023_supplemental.pdf", @@ -49154,7 +50799,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Low_2023_ICCV,\n \n author = {\n Low,\n Weng Fei and Lee,\n Gim Hee\n},\n title = {\n Robust e-NeRF: NeRF from Sparse \\& Noisy Events under 
Non-Uniform Motion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18335-18346\n} \n}" }, { "title": "Robustifying Token Attention for Vision Transformers", @@ -49186,7 +50832,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Saarland", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Yong and Stutz,\n David and Schiele,\n Bernt\n},\n title = {\n Robustifying Token Attention for Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17557-17568\n} \n}" }, { "title": "Role-Aware Interaction Generation from Textual Description", @@ -49218,7 +50865,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Tanaka_2023_ICCV,\n \n author = {\n Tanaka,\n Mikihiro and Fujiwara,\n Kent\n},\n title = {\n Role-Aware Interaction Generation from Textual Description\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15999-16009\n} \n}" }, { "title": "Root Pose Decomposition Towards Generic Non-rigid 3D Reconstruction with Monocular Videos", @@ -49250,7 +50898,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0+1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yikai and Dong,\n Yinpeng and Sun,\n Fuchun and Yang,\n Xiao\n},\n title = {\n Root Pose Decomposition Towards Generic 
Non-rigid 3D Reconstruction with Monocular Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13890-13900\n} \n}" }, { "title": "Rosetta Neurons: Mining the Common Units in a Model Zoo", @@ -49273,7 +50922,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dravid_Rosetta_Neurons_Mining_the_Common_Units_in_a_Model_Zoo_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dravid_Rosetta_Neurons_Mining_the_Common_Units_in_a_Model_Zoo_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Dravid_2023_ICCV,\n \n author = {\n Dravid,\n Amil and Gandelsman,\n Yossi and Efros,\n Alexei A. and Shocher,\n Assaf\n},\n title = {\n Rosetta Neurons: Mining the Common Units in a Model Zoo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1934-1943\n} \n}" }, { "title": "S-TREK: Sequential Translation and Rotation Equivariant Keypoints for Local Feature Extraction", @@ -49305,7 +50955,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "Austria;Unknown" + "aff_country_unique": "Austria;Unknown", + "bibtex": "@InProceedings{Santellani_2023_ICCV,\n \n author = {\n Santellani,\n Emanuele and Sormann,\n Christian and Rossi,\n Mattia and Kuhn,\n Andreas and Fraundorfer,\n Friedrich\n},\n title = {\n S-TREK: Sequential Translation and Rotation Equivariant Keypoints for Local Feature Extraction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9728-9737\n} \n}" }, { "title": "S-VolSDF: Sparse Multi-View Stereo Regularization of Neural Implicit Surfaces", @@ -49337,7 
+50988,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Haoyu and Graikos,\n Alexandros and Samaras,\n Dimitris\n},\n title = {\n S-VolSDF: Sparse Multi-View Stereo Regularization of Neural Implicit Surfaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3556-3568\n} \n}" }, { "title": "S3IM: Stochastic Structural SIMilarity and Its Unreasonable Effectiveness for Neural Fields", @@ -49369,7 +51021,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Zeke and Yang,\n Xindi and Yang,\n Yujie and Sun,\n Qi and Jiang,\n Yixiang and Wang,\n Haoran and Cai,\n Yunfeng and Sun,\n Mingming\n},\n title = {\n S3IM: Stochastic Structural SIMilarity and Its Unreasonable Effectiveness for Neural Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18024-18034\n} \n}" }, { "title": "SA-BEV: Generating Semantic-Aware Bird's-Eye-View Feature for Multi-view 3D Object Detection", @@ -49401,7 +51054,8 @@ "aff_campus_unique_index": "0;0;0+2;0+2", "aff_campus_unique": "Beijing;;Hangzhou", "aff_country_unique_index": "0;0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jinqing and Zhang,\n Yanan and Liu,\n Qingjie and Wang,\n Yunhong\n},\n title = {\n SA-BEV: Generating Semantic-Aware Bird's-Eye-View Feature for Multi-view 3D Object Detection\n},\n booktitle = {\n Proceedings 
of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3348-3357\n} \n}" }, { "title": "SAFARI: Versatile and Efficient Evaluations for Robustness of Interpretability", @@ -49427,13 +51081,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_SAFARI_Versatile_and_Efficient_Evaluations_for_Robustness_of_Interpretability_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;1+3;1", "aff_unique_norm": "Purple Mountain Laboratories;University of Liverpool;University of Warwick;Chinese Academy of Sciences", - "aff_unique_dep": ";;WMG;Institute of Software", - "aff_unique_url": "http://www.pmlab.com.cn;https://www.liverpool.ac.uk;https://www.wmg.warwick.ac.uk/;http://www.ios.ac.cn", + "aff_unique_dep": ";;WMG (Warwick Manufacturing Group);Institute of Software", + "aff_unique_url": "http://www.pmlab.com.cn;https://www.liverpool.ac.uk;https://www.wmg.ac.uk;http://www.ios.ac.cn", "aff_unique_abbr": ";Liv Uni;WMG;CAS", - "aff_campus_unique_index": ";", - "aff_campus_unique": "", + "aff_campus_unique_index": "1;", + "aff_campus_unique": ";Warwick", "aff_country_unique_index": "0;1+1;1+0;1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Wei and Zhao,\n Xingyu and Jin,\n Gaojie and Huang,\n Xiaowei\n},\n title = {\n SAFARI: Versatile and Efficient Evaluations for Robustness of Interpretability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1988-1998\n} \n}" }, { "title": "SAFE: Machine Unlearning With Shard Graphs", @@ -49458,14 +51113,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dukler_SAFE_Machine_Unlearning_With_Shard_Graphs_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;0+1;0;0", - "aff_unique_norm": 
"Amazon;University of California, Los Angeles", + "aff_unique_norm": "Amazon Web Services;University of California, Los Angeles", "aff_unique_dep": "AWS AI Labs;", "aff_unique_url": "https://aws.amazon.com;https://www.ucla.edu", "aff_unique_abbr": "AWS;UCLA", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+0;0+0;0;0+0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Dukler_2023_ICCV,\n \n author = {\n Dukler,\n Yonatan and Bowman,\n Benjamin and Achille,\n Alessandro and Golatkar,\n Aditya and Swaminathan,\n Ashwin and Soatto,\n Stefano\n},\n title = {\n SAFE: Machine Unlearning With Shard Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17108-17118\n} \n}" }, { "title": "SAFE: Sensitivity-Aware Features for Out-of-Distribution Object Detection", @@ -49473,8 +51129,8 @@ "status": "Poster", "track": "main", "pid": "5333", - "author_site": "Samuel Wilson, Tobias Fischer, Feras Dayoub, Dimity Miller, Niko S\u00fcnderhauf", - "author": "Samuel Wilson; Tobias Fischer; Feras Dayoub; Dimity Miller; Niko S\u00fcnderhauf", + "author_site": "Samuel Wilson, Tobias Fischer, Feras Dayoub, Dimity Miller, Niko Sünderhauf", + "author": "Samuel Wilson; Tobias Fischer; Feras Dayoub; Dimity Miller; Niko Sünderhauf", "abstract": "We address the problem of out-of-distribution (OOD) detection for the task of object detection. We show that residual convolutional layers with batch normalisation produce Sensitivity-Aware FEatures (SAFE) that are consistently powerful for distinguishing in-distribution from out-of-distribution detections. We extract SAFE vectors for every detected object, and train a multilayer perceptron on the surrogate task of distinguishing adversarially perturbed from clean in-distribution examples. 
This circumvents the need for realistic OOD training data, computationally expensive generative models, or retraining of the base object detector. SAFE outperforms the state-of-the-art OOD object detectors on multiple benchmarks by large margins, e.g. reducing the FPR95 by an absolute 30.6% from 48.3% to 17.7% on the OpenImages dataset.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wilson_SAFE_Sensitivity-Aware_Features_for_Out-of-Distribution_Object_Detection_ICCV_2023_paper.pdf", "aff": "QUT Centre for Robotics, Queensland University of Technology; QUT Centre for Robotics, Queensland University of Technology; Australian Institute for Machine Learning, University of Adelaide; QUT Centre for Robotics, Queensland University of Technology; QUT Centre for Robotics, Queensland University of Technology", @@ -49497,7 +51153,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Wilson_2023_ICCV,\n \n author = {\n Wilson,\n Samuel and Fischer,\n Tobias and Dayoub,\n Feras and Miller,\n Dimity and S\\"underhauf,\n Niko\n},\n title = {\n SAFE: Sensitivity-Aware Features for Out-of-Distribution Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23565-23576\n} \n}" }, { "title": "SAFL-Net: Semantic-Agnostic Feature Learning Network with Auxiliary Plugins for Image Manipulation Detection", @@ -49529,7 +51186,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Zhihao and Jiang,\n Haoran and Wang,\n Danding and Li,\n Xirong and Cao,\n Juan\n},\n title = {\n SAFL-Net: Semantic-Agnostic Feature 
Learning Network with Auxiliary Plugins for Image Manipulation Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22424-22433\n} \n}" }, { "id": "1ac347f96a", @@ -49557,7 +51215,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Israel" + "aff_country_unique": "Israel", + "bibtex": "@InProceedings{Stolik_2023_ICCV,\n \n author = {\n Stolik,\n Tomer and Lang,\n Itai and Avidan,\n Shai\n},\n title = {\n SAGA: Spectral Adversarial Geometric Attack on 3D Meshes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4284-4294\n} \n}" }, { "title": "SAL-ViT: Towards Latency Efficient Private Inference on ViT using Selective Attention Search with a Learnable Softmax Approximation", @@ -49582,14 +51241,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_SAL-ViT_Towards_Latency_Efficient_Private_Inference_on_ViT_using_Selective_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "University of Southern California;Intel", + "aff_unique_norm": "University of Southern California;Intel Corporation", "aff_unique_dep": ";Intel Labs", "aff_unique_url": "https://www.usc.edu;https://www.intel.com", "aff_unique_abbr": "USC;Intel", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yuke and Chen,\n Dake and Kundu,\n Souvik and Li,\n Chenghao and Beerel,\n Peter A.\n},\n title = {\n SAL-ViT: Towards Latency Efficient Private Inference on ViT using Selective Attention Search with a Learnable Softmax 
Approximation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5116-5125\n} \n}" }, { "title": "SALAD: Part-Level Latent Diffusion for 3D Shape Generation and Manipulation", @@ -49621,7 +51281,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Koo_2023_ICCV,\n \n author = {\n Koo,\n Juil and Yoo,\n Seungwoo and Nguyen,\n Minh Hieu and Sung,\n Minhyuk\n},\n title = {\n SALAD: Part-Level Latent Diffusion for 3D Shape Generation and Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14441-14451\n} \n}" }, { "title": "SAMPLING: Scene-adaptive Hierarchical Multiplane Images Representation for Novel View Synthesis from a Single Image", @@ -49653,7 +51314,8 @@ "aff_campus_unique_index": "1;1;1+2", "aff_campus_unique": ";Mountain View;Merced", "aff_country_unique_index": "0;0;0+1;0;1;1+1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Xiaoyu and Lin,\n Zhiwei and Shan,\n Xiaojun and Wang,\n Yongtao and Sun,\n Deqing and Yang,\n Ming-Hsuan\n},\n title = {\n SAMPLING: Scene-adaptive Hierarchical Multiplane Images Representation for Novel View Synthesis from a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22830-22840\n} \n}" }, { "title": "SATR: Zero-Shot Semantic Segmentation of 3D Shapes", @@ -49685,7 +51347,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": 
"Saudi Arabia;France" + "aff_country_unique": "Saudi Arabia;France", + "bibtex": "@InProceedings{Abdelreheem_2023_ICCV,\n \n author = {\n Abdelreheem,\n Ahmed and Skorokhodov,\n Ivan and Ovsjanikov,\n Maks and Wonka,\n Peter\n},\n title = {\n SATR: Zero-Shot Semantic Segmentation of 3D Shapes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15166-15179\n} \n}" }, { "title": "SC3K: Self-supervised and Coherent 3D Keypoints Estimation from Rotated, Noisy, and Decimated Point Cloud Data", @@ -49711,13 +51374,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zohaib_SC3K_Self-supervised_and_Coherent_3D_Keypoints_Estimation_from_Rotated_Noisy_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Pattern Analysis & Computer Vision;Italian Institute of Technology", - "aff_unique_dep": "PAVIS;", + "aff_unique_dep": "Computer Vision;", "aff_unique_url": ";https://www.iit.it", - "aff_unique_abbr": ";IIT", + "aff_unique_abbr": "PAVIS;IIT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Genoa", "aff_country_unique_index": "1;1", - "aff_country_unique": ";Italy" + "aff_country_unique": ";Italy", + "bibtex": "@InProceedings{Zohaib_2023_ICCV,\n \n author = {\n Zohaib,\n Mohammad and Del Bue,\n Alessio\n},\n title = {\n SC3K: Self-supervised and Coherent 3D Keypoints Estimation from Rotated,\n Noisy,\n and Decimated Point Cloud Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22509-22519\n} \n}" }, { "title": "SCANet: Scene Complexity Aware Network for Weakly-Supervised Video Moment Retrieval", @@ -49749,7 +51413,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": 
"@InProceedings{Yoon_2023_ICCV,\n \n author = {\n Yoon,\n Sunjae and Koo,\n Gwanhyeong and Kim,\n Dahyun and Yoo,\n Chang D.\n},\n title = {\n SCANet: Scene Complexity Aware Network for Weakly-Supervised Video Moment Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13576-13586\n} \n}" }, { "title": "SCOB: Universal Text Understanding via Character-wise Supervised Contrastive Learning with Online Text Rendering for Bridging Domain Gap", @@ -49781,7 +51446,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Daehee and Kim,\n Yoonsik and Kim,\n DongHyun and Lim,\n Yumin and Kim,\n Geewook and Kil,\n Taeho\n},\n title = {\n SCOB: Universal Text Understanding via Character-wise Supervised Contrastive Learning with Online Text Rendering for Bridging Domain Gap\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19562-19573\n} \n}" }, { "title": "SEFD: Learning to Distill Complex Pose and Occlusion", @@ -49806,14 +51472,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_SEFD_Learning_to_Distill_Complex_Pose_and_Occlusion_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+0;3;3;3;0", - "aff_unique_norm": "Sogang University;Pusan National University;Samsung;NAVER Cloud Corp", - "aff_unique_dep": ";;Samsung Electronics;", + "aff_unique_norm": "Sogang University;Pusan National University;Samsung Electronics;NAVER Cloud Corp", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sogang.ac.kr;https://www.pnu.ac.kr;https://www.samsung.com;https://www.naver.com", "aff_unique_abbr": 
"Sogang;PNU;Samsung;NCC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n ChangHee and Kong,\n Kyeongbo and Min,\n SungJun and Wee,\n Dongyoon and Jang,\n Ho-Deok and Cha,\n Geonho and Kang,\n SukJu\n},\n title = {\n SEFD: Learning to Distill Complex Pose and Occlusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14941-14952\n} \n}" }, { "title": "SEMPART: Self-supervised Multi-resolution Partitioning of Image Semantics", @@ -49838,14 +51505,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ravindran_SEMPART_Self-supervised_Multi-resolution_Partitioning_of_Image_Semantics_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Adobe", - "aff_unique_dep": "Adobe Inc.", + "aff_unique_norm": "Adobe Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://www.adobe.com", "aff_unique_abbr": "Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ravindran_2023_ICCV,\n \n author = {\n Ravindran,\n Sriram and Basu,\n Debraj\n},\n title = {\n SEMPART: Self-supervised Multi-resolution Partitioning of Image Semantics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 723-733\n} \n}" }, { "title": "SFHarmony: Source Free Domain Adaptation for Distributed Neuroimaging Analysis", @@ -49877,7 +51545,8 @@ "aff_campus_unique_index": "0;0+1+1;0+0", "aff_campus_unique": "Oxford;Adelaide", "aff_country_unique_index": "0;0+1+1;0+0", - 
"aff_country_unique": "United Kingdom;Australia" + "aff_country_unique": "United Kingdom;Australia", + "bibtex": "@InProceedings{Dinsdale_2023_ICCV,\n \n author = {\n Dinsdale,\n Nicola K and Jenkinson,\n Mark and Namburete,\n Ana IL\n},\n title = {\n SFHarmony: Source Free Domain Adaptation for Distributed Neuroimaging Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11494-11505\n} \n}" }, { "title": "SG-Former: Self-guided Transformer with Evolving Token Reallocation", @@ -49909,7 +51578,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Ren_2023_ICCV,\n \n author = {\n Ren,\n Sucheng and Yang,\n Xingyi and Liu,\n Songhua and Wang,\n Xinchao\n},\n title = {\n SG-Former: Self-guided Transformer with Evolving Token Reallocation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6003-6014\n} \n}" }, { "title": "SGAligner: 3D Scene Alignment with Scene Graphs", @@ -49941,7 +51611,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Sarkar_2023_ICCV,\n \n author = {\n Sarkar,\n Sayan Deb and Miksik,\n Ondrej and Pollefeys,\n Marc and Barath,\n Daniel and Armeni,\n Iro\n},\n title = {\n SGAligner: 3D Scene Alignment with Scene Graphs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21927-21937\n} \n}" }, { "title": "SHACIRA: Scalable HAsh-grid Compression for Implicit Neural Representations", @@ -49973,7 
+51644,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Girish_2023_ICCV,\n \n author = {\n Girish,\n Sharath and Shrivastava,\n Abhinav and Gupta,\n Kamal\n},\n title = {\n SHACIRA: Scalable HAsh-grid Compression for Implicit Neural Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17513-17524\n} \n}" }, { "title": "SHERF: Generalizable Human NeRF from a Single Image", @@ -50005,7 +51677,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Shoukang and Hong,\n Fangzhou and Pan,\n Liang and Mei,\n Haiyi and Yang,\n Lei and Liu,\n Ziwei\n},\n title = {\n SHERF: Generalizable Human NeRF from a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9352-9364\n} \n}" }, { "title": "SHIFT3D: Synthesizing Hard Inputs For Tricking 3D Detectors", @@ -50037,7 +51710,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Hongge and Chen,\n Zhao and Meyer,\n Gregory P. 
and Park,\n Dennis and Vondrick,\n Carl and Shrivastava,\n Ashish and Chai,\n Yuning\n},\n title = {\n SHIFT3D: Synthesizing Hard Inputs For Tricking 3D Detectors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8493-8503\n} \n}" }, { "title": "SIDGAN: High-Resolution Dubbed Video Generation via Shift-Invariant Learning", @@ -50069,7 +51743,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Muaz_2023_ICCV,\n \n author = {\n Muaz,\n Urwa and Jang,\n Wondong and Tripathi,\n Rohun and Mani,\n Santhosh and Ouyang,\n Wenbin and Gadde,\n Ravi Teja and Gecer,\n Baris and Elizondo,\n Sergio and Madad,\n Reza and Nair,\n Naveen\n},\n title = {\n SIDGAN: High-Resolution Dubbed Video Generation via Shift-Invariant Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7833-7842\n} \n}" }, { "title": "SIGMA: Scale-Invariant Global Sparse Shape Matching", @@ -50077,8 +51752,8 @@ "status": "Poster", "track": "main", "pid": "1652", - "author_site": "Maolin Gao, Paul Roetzer, Marvin Eisenberger, Zorah L\u00e4hner, Michael Moeller, Daniel Cremers, Florian Bernard", - "author": "Maolin Gao; Paul Roetzer; Marvin Eisenberger; Zorah L\u00e4hner; Michael Moeller; Daniel Cremers; Florian Bernard", + "author_site": "Maolin Gao, Paul Roetzer, Marvin Eisenberger, Zorah Lähner, Michael Moeller, Daniel Cremers, Florian Bernard", + "author": "Maolin Gao; Paul Roetzer; Marvin Eisenberger; Zorah Lähner; Michael Moeller; Daniel Cremers; Florian Bernard", "abstract": "We propose a novel mixed-integer programming (MIP) formulation for generating precise sparse correspondences for highly non-rigid 
shapes. To this end, we introduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic and extrinsic geometric information to measure the deformation quality induced by predicted correspondences. We integrate the PLBO, together with an orientation-aware regulariser, into a novel MIP formulation that can be solved to global optimality for many practical problems. In contrast to previous methods, our approach is provably invariant to rigid transformations and global scaling, initialisation-free, has optimality guarantees, and scales to high resolution meshes with (empirically observed) linear time. We show state-of-the-art results for sparse non-rigid matching on several challenging 3D datasets, including data with inconsistent meshing, as well as applications in mesh-to-point-cloud matching.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Gao_SIGMA_Scale-Invariant_Global_Sparse_Shape_Matching_ICCV_2023_paper.pdf", "aff": ";;;;;;", @@ -50092,7 +51767,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_SIGMA_Scale-Invariant_Global_Sparse_Shape_Matching_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_SIGMA_Scale-Invariant_Global_Sparse_Shape_Matching_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Maolin and Roetzer,\n Paul and Eisenberger,\n Marvin and L\\"ahner,\n Zorah and Moeller,\n Michael and Cremers,\n Daniel and Bernard,\n Florian\n},\n title = {\n SIGMA: Scale-Invariant Global Sparse Shape Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 645-654\n} \n}" }, { "title": "SILT: Shadow-Aware Iterative Label Tuning for Learning to Detect Shadows from Noisy Labels", @@ -50117,14 +51793,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Yang_SILT_Shadow-Aware_Iterative_Label_Tuning_for_Learning_to_Detect_Shadows_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2+0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shun Hing Institute of Advanced Engineering;Shanghai Artificial Intelligence Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;The Shun Hing Institute of Advanced Engineering;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;;http://www.shailab.org/", "aff_unique_abbr": "CUHK;;Shanghai AI Lab", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Han and Wang,\n Tianyu and Hu,\n Xiaowei and Fu,\n Chi-Wing\n},\n title = {\n SILT: Shadow-Aware Iterative Label Tuning for Learning to Detect Shadows from Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12687-12698\n} \n}" }, { "title": "SINC: Self-Supervised In-Context Learning for Vision-Language Tasks", @@ -50149,14 +51826,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_SINC_Self-Supervised_In-Context_Learning_for_Vision-Language_Tasks_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;1;0", - "aff_unique_norm": "National Yang Ming Chiao Tung University;Microsoft", + "aff_unique_norm": "National Yang Ming Chiao Tung University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.nycu.edu.tw;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "NYCU;MSR Asia", "aff_campus_unique_index": "0;0;0;1;1;0", "aff_campus_unique": "Taiwan;Asia", "aff_country_unique_index": 
"0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yi-Syuan and Song,\n Yun-Zhu and Yeo,\n Cheng Yu and Liu,\n Bei and Fu,\n Jianlong and Shuai,\n Hong-Han\n},\n title = {\n SINC: Self-Supervised In-Context Learning for Vision-Language Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15430-15442\n} \n}" }, { "title": "SINC: Spatial Composition of 3D Human Motions for Simultaneous Action Generation", @@ -50164,11 +51842,11 @@ "status": "Poster", "track": "main", "pid": "3650", - "author_site": "Nikos Athanasiou, Mathis Petrovich, Michael J. Black, G\u00fcl Varol", - "author": "Nikos Athanasiou; Mathis Petrovich; Michael J. Black; G\u00fcl Varol", + "author_site": "Nikos Athanasiou, Mathis Petrovich, Michael J. Black, Gül Varol", + "author": "Nikos Athanasiou; Mathis Petrovich; Michael J. Black; Gül Varol", "abstract": "Our goal is to synthesize 3D human motions given textual inputs describing simultaneous actions, for example `waving hand' while `walking' at the same time. We refer to generating such simultaneous movements as performing `spatial compositions'. In contrast to `temporal compositions' that seek to transition from one action to another, spatial compositing requires understanding which body parts are involved with which action, to be able to move them simultaneously. Motivated by the observation that the correspondence between actions and body parts is encoded in powerful language models, we extract this knowledge by prompting GPT-3 with text such as \"what are the body parts involved in the action ?\", while also providing the parts list and a few examples. Given this action-part mapping, we combine body parts from two motions together and establish the first automated method to spatially compose two actions. 
However, training data with compositional actions is always limited by the combinatorics. Hence, we further create synthetic data with this approach, and use it to train a new state-of-the-art text-to-motion generation model, called SINC (\"SImultaneous actioN Compositions for 3D human motions\"). In our experiments, we find that training with such GPT-guided synthetic data improves spatial composition generation over baselines.\n Our code is publicly available at https://sinc.is.tue.mpg.de/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Athanasiou_SINC_Spatial_Composition_of_3D_Human_Motions_for_Simultaneous_Action_ICCV_2023_paper.pdf", - "aff": "Max Planck Institute for Intelligent Systems, T\u00a8ubingen, Germany; Max Planck Institute for Intelligent Systems, T\u00a8ubingen, Germany + LIGM, \u00b4Ecole des Ponts, Univ Gustave Eiffel, CNRS, France; Max Planck Institute for Intelligent Systems, T\u00a8ubingen, Germany; LIGM, \u00b4Ecole des Ponts, Univ Gustave Eiffel, CNRS, France", + "aff": "Max Planck Institute for Intelligent Systems, T¨ubingen, Germany; Max Planck Institute for Intelligent Systems, T¨ubingen, Germany + LIGM, ´Ecole des Ponts, Univ Gustave Eiffel, CNRS, France; Max Planck Institute for Intelligent Systems, T¨ubingen, Germany; LIGM, ´Ecole des Ponts, Univ Gustave Eiffel, CNRS, France", "project": "sinc.is.tue.mpg.de", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Athanasiou_SINC_Spatial_Composition_of_3D_Human_Motions_for_Simultaneous_Action_ICCV_2023_supplemental.pdf", @@ -50186,9 +51864,10 @@ "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.ponts.org", "aff_unique_abbr": "MPI-IS;ENPC", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0;0+1;0;1", - "aff_country_unique": "Germany;France" + "aff_country_unique": "Germany;France", + "bibtex": "@InProceedings{Athanasiou_2023_ICCV,\n \n author = 
{\n Athanasiou,\n Nikos and Petrovich,\n Mathis and Black,\n Michael J. and Varol,\n G\\"ul\n},\n title = {\n SINC: Spatial Composition of 3D Human Motions for Simultaneous Action Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9984-9995\n} \n}" }, { "title": "SIRA-PCR: Sim-to-Real Adaptation for 3D Point Cloud Registration", @@ -50213,14 +51892,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_SIRA-PCR_Sim-to-Real_Adaptation_for_3D_Point_Cloud_Registration_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;1;0", - "aff_unique_norm": "University of Electronic Science and Technology of China;Chinese University of Hong Kong;Harbin Institute of Technology;Megvii Technology", + "aff_unique_norm": "University of Electronic Science and Technology of China;The Chinese University of Hong Kong;Harbin Institute of Technology;Megvii Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.cuhk.edu.hk;http://www.hit.edu.cn/;https://www.megvii.com", "aff_unique_abbr": "UESTC;CUHK;HIT;Megvii", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Hong Kong SAR;Harbin", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Suyi and Xu,\n Hao and Li,\n Ru and Liu,\n Guanghui and Fu,\n Chi-Wing and Liu,\n Shuaicheng\n},\n title = {\n SIRA-PCR: Sim-to-Real Adaptation for 3D Point Cloud Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14394-14405\n} \n}" }, { "title": "SKED: Sketch-guided Text-based 3D Editing", @@ -50245,14 +51925,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Mikaeili_SKED_Sketch-guided_Text-based_3D_Editing_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;0", - "aff_unique_norm": "Simon Fraser University;NVIDIA;Tel Aviv University", - "aff_unique_dep": ";NVIDIA Corporation;", + "aff_unique_norm": "Simon Fraser University;NVIDIA Corporation;Tel Aviv University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.sfu.ca;https://www.nvidia.com;https://www.tau.ac.il", "aff_unique_abbr": "SFU;NVIDIA;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0", - "aff_country_unique": "Canada;United States;Israel" + "aff_country_unique": "Canada;United States;Israel", + "bibtex": "@InProceedings{Mikaeili_2023_ICCV,\n \n author = {\n Mikaeili,\n Aryan and Perel,\n Or and Safaee,\n Mehdi and Cohen-Or,\n Daniel and Mahdavi-Amiri,\n Ali\n},\n title = {\n SKED: Sketch-guided Text-based 3D Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14607-14619\n} \n}" }, { "title": "SKiT: a Fast Key Information Video Transformer for Online Surgical Phase Recognition", @@ -50260,6 +51941,7 @@ "status": "Poster", "track": "main", "pid": "9344", + "author_site": "Yang Liu, Jiayu Huo, Jingjing Peng, Rachel Sparks, Prokar Dasgupta, Alejandro Granados, Sebastien Ourselin", "author": "Yang Liu, Jiayu Huo, Jingjing Peng, Rachel Sparks, Prokar Dasgupta, Alejandro Granados, Sebastien Ourselin", "abstract": "This paper introduces SKiT, a fast Key information Transformer for phase recognition of videos. Unlike previous methods that rely on complex models to capture long-term temporal information, SKiT accurately recognizes high-level stages of videos using an efficient key pooling operation. 
This operation records important key information by retaining the maximum value recorded from the beginning up to the current video frame, with a time complexity of O(1). Experimental results on Cholec80 and AutoLaparo surgical datasets demonstrate the ability of our model to recognize phases in an online manner. SKiT achieves higher performance than state-of-the-art methods with an accuracy of 92.5% and 82.9% on Cholec80 and AutoLaparo, respectively, while running the temporal model eight times faster ( 7ms v.s. 55ms) than LoViT, which uses ProbSparse to capture global information. We highlight that the inference time of SKiT is constant, and independent from the input length, making it a stable choice for keeping a record of important global information, that appears on long surgical videos, essential for phase recognition. To sum up, we propose an effective and efficient model for surgical phase recognition that leverages key global information. This has an intrinsic value when performing this task in an online manner on long surgical videos for stable real-time surgical recognition systems.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_SKiT_a_Fast_Key_Information_Video_Transformer_for_Online_Surgical_ICCV_2023_paper.pdf", @@ -50271,7 +51953,8 @@ "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8608673569704002278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_SKiT_a_Fast_Key_Information_Video_Transformer_for_Online_Surgical_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_SKiT_a_Fast_Key_Information_Video_Transformer_for_Online_Surgical_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yang and Huo,\n Jiayu and Peng,\n Jingjing and Sparks,\n Rachel and Dasgupta,\n Prokar and Granados,\n Alejandro and Ourselin,\n Sebastien\n},\n title = {\n SKiT: a Fast Key 
Information Video Transformer for Online Surgical Phase Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21074-21084\n} \n}" }, { "title": "SLAN: Self-Locator Aided Network for Vision-Language Understanding", @@ -50303,7 +51986,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Jiang-Tian and Zhang,\n Qi and Wu,\n Tong and Chen,\n Xing-Yu and Liu,\n Jiang-Jiang and Cheng,\n Ming-Ming\n},\n title = {\n SLAN: Self-Locator Aided Network for Vision-Language Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21949-21958\n} \n}" }, { "title": "SLCA: Slow Learner with Classifier Alignment for Continual Learning on a Pre-trained Model", @@ -50328,14 +52012,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_SLCA_Slow_Learner_with_Classifier_Alignment_for_Continual_Learning_on_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+3;0;4+5", - "aff_unique_norm": "University of Technology Sydney;Tsinghua University;Beihang University;Zhongguancun Laboratory;Beijing Jiao Tong University;Beijing Key Laboratory of Advanced Information Science and Network", + "aff_unique_norm": "University of Technology Sydney;Tsinghua University;Beihang University;Zhongguancun Laboratory;Beijing Jiaotong University;Beijing Key Laboratory of Advanced Information Science and Network", "aff_unique_dep": ";;;;Institute of Information Science;Advanced Information Science and Network", "aff_unique_url": "https://www.uts.edu.au;https://www.tsinghua.edu.cn;http://www.buaa.edu.cn/;;http://www.bjtu.edu.cn;", "aff_unique_abbr": 
"UTS;THU;BUAA;;BJTU;", - "aff_campus_unique_index": ";1", - "aff_campus_unique": ";Beijing", + "aff_campus_unique_index": ";", + "aff_campus_unique": "", "aff_country_unique_index": "0;1;1+1;0;1+1", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Gengwei and Wang,\n Liyuan and Kang,\n Guoliang and Chen,\n Ling and Wei,\n Yunchao\n},\n title = {\n SLCA: Slow Learner with Classifier Alignment for Continual Learning on a Pre-trained Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19148-19158\n} \n}" }, { "title": "SMAUG: Sparse Masked Autoencoder for Efficient Video-Language Pre-Training", @@ -50367,7 +52052,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Yuanze and Wei,\n Chen and Wang,\n Huiyu and Yuille,\n Alan and Xie,\n Cihang\n},\n title = {\n SMAUG: Sparse Masked Autoencoder for Efficient Video-Language Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2459-2469\n} \n}" }, { "title": "SMMix: Self-Motivated Image Mixing for Vision Transformers", @@ -50399,7 +52085,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Mengzhao and Lin,\n Mingbao and Lin,\n Zhihang and Zhang,\n Yuxin and Chao,\n Fei and Ji,\n Rongrong\n},\n title = {\n SMMix: Self-Motivated Image Mixing for Vision Transformers\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17260-17270\n} \n}" }, { "title": "SOAR: Scene-debiasing Open-set Action Recognition", @@ -50431,7 +52118,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Yuanhao and Liu,\n Ziyi and Wu,\n Zhenyu and Wu,\n Yi and Zhou,\n Chunluan and Doermann,\n David and Yuan,\n Junsong and Hua,\n Gang\n},\n title = {\n SOAR: Scene-debiasing Open-set Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10244-10254\n} \n}" }, { "title": "SOCS: Semantically-Aware Object Coordinate Space for Category-Level 6D Object Pose Estimation under Large Shape Variations", @@ -50463,7 +52151,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wan_2023_ICCV,\n \n author = {\n Wan,\n Boyan and Shi,\n Yifei and Xu,\n Kai\n},\n title = {\n SOCS: Semantically-Aware Object Coordinate Space for Category-Level 6D Object Pose Estimation under Large Shape Variations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14065-14074\n} \n}" }, { "title": "SPACE: Speech-driven Portrait Animation with Controllable Expression", @@ -50488,14 +52177,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gururani_SPACE_Speech-driven_Portrait_Animation_with_Controllable_Expression_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": 
"NVIDIA", - "aff_unique_dep": "NVIDIA Corporation", + "aff_unique_norm": "NVIDIA Corporation", + "aff_unique_dep": "", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gururani_2023_ICCV,\n \n author = {\n Gururani,\n Siddharth and Mallya,\n Arun and Wang,\n Ting-Chun and Valle,\n Rafael and Liu,\n Ming-Yu\n},\n title = {\n SPACE: Speech-driven Portrait Animation with Controllable Expression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20914-20923\n} \n}" }, { "title": "SPANet: Frequency-balancing Token Mixer using Spectral Pooling Aggregation Modulation", @@ -50527,7 +52217,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0+0;0+0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yun_2023_ICCV,\n \n author = {\n Yun,\n Guhnoo and Yoo,\n Juhan and Kim,\n Kijung and Lee,\n Jeongho and Kim,\n Dong Hwan\n},\n title = {\n SPANet: Frequency-balancing Token Mixer using Spectral Pooling Aggregation Modulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6113-6124\n} \n}" }, { "title": "SQAD: Automatic Smartphone Camera Quality Assessment and Benchmarking", @@ -50552,14 +52243,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fang_SQAD_Automatic_Smartphone_Camera_Quality_Assessment_and_Benchmarking_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1+2", - "aff_unique_norm": "National University of Singapore;ETH Zurich;University of W\u00fcrzburg", + "aff_unique_norm": 
"National University of Singapore;ETH Zurich;University of Würzburg", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.ethz.ch;https://www.uni-wuerzburg.de", - "aff_unique_abbr": "NUS;ETHZ;Uni W\u00fcrzburg", + "aff_unique_abbr": "NUS;ETHZ;Uni Würzburg", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1+2", - "aff_country_unique": "Singapore;Switzerland;Germany" + "aff_country_unique": "Singapore;Switzerland;Germany", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Zilin and Ignatov,\n Andrey and Zamfir,\n Eduard and Timofte,\n Radu\n},\n title = {\n SQAD: Automatic Smartphone Camera Quality Assessment and Benchmarking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20532-20542\n} \n}" }, { "title": "SRFormer: Permuted Self-Attention for Single Image Super-Resolution", @@ -50591,7 +52283,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Yupeng and Li,\n Zhen and Guo,\n Chun-Le and Bai,\n Song and Cheng,\n Ming-Ming and Hou,\n Qibin\n},\n title = {\n SRFormer: Permuted Self-Attention for Single Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12780-12791\n} \n}" }, { "title": "SSB: Simple but Strong Baseline for Boosting Performance of Open-Set Semi-Supervised Learning", @@ -50603,7 +52296,7 @@ "author": "Yue Fan; Anna Kukleva; Dengxin Dai; Bernt Schiele", "abstract": "Semi-supervised learning (SSL) methods effectively leverage unlabeled data to improve model generalization. 
However, SSL models often underperform in open-set scenarios, where unlabeled data contain outliers from novel categories that do not appear in the labeled set. In this paper, we study the challenging and realistic open-set SSL setting, where the goal is to both correctly classify inliers and to detect outliers. Intuitively, the inlier classifier should be trained on inlier data only. However, we find that inlier classification performance can be largely improved by incorporating high-confidence pseudo-labeled data, regardless of whether they are inliers or outliers. Also, we propose to utilize non-linear transformations to separate the features used for inlier classification and outlier detection in the multi-task learning framework, preventing adverse effects between them. Additionally, we introduce pseudo-negative mining, which further boosts outlier detection performance. The three ingredients lead to what we call Simple but Strong Baseline (SSB) for open-set SSL. In experiments, SSB greatly improves both inlier classification and outlier detection performance, outperforming existing methods by a large margin. 
Our code will be released at https://github.com/YUE-FAN/SSB.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Fan_SSB_Simple_but_Strong_Baseline_for_Boosting_Performance_of_Open-Set_ICCV_2023_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarbr\u00fccken, Germany", + "aff": "Max Planck Institute for Informatics, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarbrücken, Germany", "project": "", "github": "https://github.com/YUE-FAN/SSB", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Fan_SSB_Simple_but_ICCV_2023_supplemental.pdf", @@ -50621,9 +52314,10 @@ "aff_unique_url": "https://mpi-inf.mpg.de", "aff_unique_abbr": "MPII", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Saarbr\u00fccken", + "aff_campus_unique": "Saarbrücken", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Yue and Kukleva,\n Anna and Dai,\n Dengxin and Schiele,\n Bernt\n},\n title = {\n SSB: Simple but Strong Baseline for Boosting Performance of Open-Set Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16068-16078\n} \n}" }, { "title": "SSDA: Secure Source-Free Domain Adaptation", @@ -50646,7 +52340,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ahmed_SSDA_Secure_Source-Free_Domain_Adaptation_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Ahmed_SSDA_Secure_Source-Free_Domain_Adaptation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ahmed_2023_ICCV,\n \n author = {\n Ahmed,\n Sabbir and Al Arafat,\n Abdullah and Rizve,\n Mamshad Nayeem and Hossain,\n Rahim and Guo,\n Zhishan and Rakin,\n Adnan Siraj\n},\n title = {\n SSDA: Secure Source-Free Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19180-19190\n} \n}" }, { "title": "SSF: Accelerating Training of Spiking Neural Networks with Stabilized Spiking Flow", @@ -50658,7 +52353,7 @@ "author": "Jingtao Wang; Zengjie Song; Yuxi Wang; Jun Xiao; Yuran Yang; Shuqi Mei; Zhaoxiang Zhang", "abstract": "Surrogate gradient (SG) is one of the most effective approaches for training spiking neural networks (SNNs). While assisting SNNs to achieve classification performance comparable to artificial neural networks, SG suffers from the problem of time-consuming training, preventing it from efficient learning. In this paper, we formally analyze the backward process of classic SG and find that the membrane accumulation through time leads to exponential growth of training time. With this discovery, we propose Stabilized Spiking Flow (SSF), a simple yet effective approach to accelerate training of SG-based SNNs. For each spiking neuron, SSF averages its input and output activations over time to yield stabilized input and output, respectively. Then, instead of back propagating all errors that are related to current neuron and inherently entangled in time domain, the auxiliary gradient is directly propagated from the stabilized output to input through a devised relationship mapping. Additionally, SSF method is suitable to different neuron models. 
Extensive experiments on both static and neuromorphic datasets demonstrate that SNNs trained with SSF approach can achieve performance comparable to the original counterparts, while reducing the training time significantly. In particular, SSF speeds up the training process of state-of-the-art SNN models up to 10x when time steps equal to 80.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_SSF_Accelerating_Training_of_Spiking_Neural_Networks_with_Stabilized_Spiking_ICCV_2023_paper.pdf", - "aff": "University of Chinese Academy of Sciences+Centre for Arti\ufb01cial Intelligence and Robotics, HKISI-CAS+Institute of Automation, Chinese Academy of Sciences+State Key Laboratory of Multimodal Arti\ufb01cial Intelligence Systems; Xi\u2019an Jiaotong University; Centre for Arti\ufb01cial Intelligence and Robotics, HKISI-CAS; University of Chinese Academy of Sciences+State Key Laboratory of Multimodal Arti\ufb01cial Intelligence Systems; Tencent; Tencent; University of Chinese Academy of Sciences+Centre for Arti\ufb01cial Intelligence and Robotics, HKISI-CAS+Institute of Automation, Chinese Academy of Sciences+State Key Laboratory of Multimodal Arti\ufb01cial Intelligence Systems", + "aff": "University of Chinese Academy of Sciences+Centre for Artificial Intelligence and Robotics, HKISI-CAS+Institute of Automation, Chinese Academy of Sciences+State Key Laboratory of Multimodal Artificial Intelligence Systems; Xi’an Jiaotong University; Centre for Artificial Intelligence and Robotics, HKISI-CAS; University of Chinese Academy of Sciences+State Key Laboratory of Multimodal Artificial Intelligence Systems; Tencent; Tencent; University of Chinese Academy of Sciences+Centre for Artificial Intelligence and Robotics, HKISI-CAS+Institute of Automation, Chinese Academy of Sciences+State Key Laboratory of Multimodal Artificial Intelligence Systems", "project": "", "github": "", "supp": 
"https://openaccess.thecvf.com/content/ICCV2023/supplemental/Wang_SSF_Accelerating_Training_ICCV_2023_supplemental.pdf", @@ -50671,14 +52366,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_SSF_Accelerating_Training_of_Spiking_Neural_Networks_with_Stabilized_Spiking_ICCV_2023_paper.html", "aff_unique_index": "0+1+2+3;4;1;0+3;5;5;0+1+2+3", - "aff_unique_norm": "University of Chinese Academy of Sciences;Hong Kong Institute of Science and Technology;Chinese Academy of Sciences;State Key Laboratory of Multimodal Artificial Intelligence Systems;Xi'an Jiao Tong University;Tencent", - "aff_unique_dep": ";Centre for Arti\ufb01cial Intelligence and Robotics;Institute of Automation;;;Tencent Holdings Limited", + "aff_unique_norm": "University of Chinese Academy of Sciences;Hong Kong Institute of Science and Technology;Chinese Academy of Sciences;State Key Laboratory of Multimodal Artificial Intelligence Systems;Xi'an Jiaotong University;Tencent Holdings Limited", + "aff_unique_dep": ";Centre for Artificial Intelligence and Robotics;Institute of Automation;;;", "aff_unique_url": "http://www.ucas.ac.cn;;http://www.ia.cas.cn;;https://www.xjtu.edu.cn;https://www.tencent.com", "aff_unique_abbr": "UCAS;HKISI-CAS;CAS;;XJTU;Tencent", "aff_campus_unique_index": "1;1;;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0+0+0;0;0;0+0;0;0;0+0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jingtao and Song,\n Zengjie and Wang,\n Yuxi and Xiao,\n Jun and Yang,\n Yuran and Mei,\n Shuqi and Zhang,\n Zhaoxiang\n},\n title = {\n SSF: Accelerating Training of Spiking Neural Networks with Stabilized Spiking Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5982-5991\n} \n}" }, { "title": "STEERER: Resolving Scale Variations 
for Counting and Localization via Selective Inheritance Learning", @@ -50703,14 +52399,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_STEERER_Resolving_Scale_Variations_for_Counting_and_Localization_via_Selective_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0", - "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Hong Kong Polytechnic University", + "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;The Hong Kong Polytechnic University", "aff_unique_dep": ";", "aff_unique_url": "http://www.shailab.org/;https://www.polyu.edu.hk", "aff_unique_abbr": "Shanghai AI Lab;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Tao and Bai,\n Lei and Liu,\n Lingbo and Ouyang,\n Wanli\n},\n title = {\n STEERER: Resolving Scale Variations for Counting and Localization via Selective Inheritance Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21848-21859\n} \n}" }, { "title": "STEPs: Self-Supervised Key Step Extraction and Localization from Unlabeled Procedural Videos", @@ -50742,7 +52439,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shah_2023_ICCV,\n \n author = {\n Shah,\n Anshul and Lundell,\n Benjamin and Sawhney,\n Harpreet and Chellappa,\n Rama\n},\n title = {\n STEPs: Self-Supervised Key Step Extraction and Localization from Unlabeled Procedural Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
10375-10387\n} \n}" }, { "title": "STPrivacy: Spatio-Temporal Privacy-Preserving Action Recognition", @@ -50754,7 +52452,7 @@ "author": "Ming Li; Xiangyu Xu; Hehe Fan; Pan Zhou; Jun Liu; Jia-Wei Liu; Jiahe Li; Jussi Keppo; Mike Zheng Shou; Shuicheng Yan", "abstract": "Existing methods of privacy-preserving action recognition (PPAR) mainly focus on frame-level (spatial) privacy removal through 2D CNNs. Unfortunately, they have two major drawbacks. First, they may compromise temporal dynamics in input videos, which are critical for accurate action recognition. Second, they are vulnerable to practical attacking scenarios where attackers probe for privacy from an entire video rather than individual frames. To address these issues, we propose a novel framework STPrivacy to perform video-level PPAR. For the first time, we introduce vision Transformers into PPAR by treating a video as a tubelet sequence, and accordingly design two complementary mechanisms, i.e., sparsification and anonymization, to remove privacy from a spatio-temporal perspective. In specific, our privacy sparsification mechanism applies adaptive token selection to abandon action-irrelevant tubelets. Then, our anonymization mechanism implicitly manipulates the remaining action-tubelets to erase privacy in the embedding space through adversarial learning. These mechanisms provide significant advantages in terms of privacy preservation for human eyes and action-privacy trade-off adjustment during deployment. We additionally contribute the first two large-scale PPAR benchmarks, VP-HMDB51 and VP-UCF101, to the community. 
Extensive evaluations on them, as well as two other tasks, validate the effectiveness and generalization capability of our framework.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_STPrivacy_Spatio-Temporal_Privacy-Preserving_Action_Recognition_ICCV_2023_paper.pdf", - "aff": "National University of Singapore; Xi\u2019an Jiaotong University; Zhejiang University; Sea AI Lab; Singapore University of Technology and Design; National University of Singapore; National University of Singapore; National University of Singapore; National University of Singapore; Sea AI Lab", + "aff": "National University of Singapore; Xi’an Jiaotong University; Zhejiang University; Sea AI Lab; Singapore University of Technology and Design; National University of Singapore; National University of Singapore; National University of Singapore; National University of Singapore; Sea AI Lab", "project": "", "github": "", "supp": "", @@ -50767,14 +52465,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_STPrivacy_Spatio-Temporal_Privacy-Preserving_Action_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;0;0;0;0;3", - "aff_unique_norm": "National University of Singapore;Xi'an Jiao Tong University;Zhejiang University;Sea AI Lab;Singapore University of Technology and Design", + "aff_unique_norm": "National University of Singapore;Xi'an Jiaotong University;Zhejiang University;Sea AI Lab;Singapore University of Technology and Design", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.xjtu.edu.cn;https://www.zju.edu.cn;;https://www.sutd.edu.sg", "aff_unique_abbr": "NUS;XJTU;ZJU;;SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;0;0;0", - "aff_country_unique": "Singapore;China;" + "aff_country_unique": "Singapore;China;", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Ming and Xu,\n Xiangyu and Fan,\n Hehe and Zhou,\n Pan and Liu,\n Jun 
and Liu,\n Jia-Wei and Li,\n Jiahe and Keppo,\n Jussi and Shou,\n Mike Zheng and Yan,\n Shuicheng\n},\n title = {\n STPrivacy: Spatio-Temporal Privacy-Preserving Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5106-5115\n} \n}" }, { "title": "SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal Targets", @@ -50799,14 +52498,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Simons_SUMMIT_Source-Free_Adaptation_of_Uni-Modal_Models_to_Multi-Modal_Targets_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;2;0;0", - "aff_unique_norm": "University of California, Riverside;Amazon;United States Army Research Laboratory", + "aff_unique_norm": "University of California, Riverside;Amazon Web Services;United States Army Research Laboratory", "aff_unique_dep": ";AWS AI Labs;Army Research Laboratory", "aff_unique_url": "https://www.ucr.edu;https://aws.amazon.com;https://www.arl.army.mil", "aff_unique_abbr": "UCR;AWS;ARL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Riverside;", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Simons_2023_ICCV,\n \n author = {\n Simons,\n Cody and Raychaudhuri,\n Dripta S. 
and Ahmed,\n Sk Miraj and You,\n Suya and Karydis,\n Konstantinos and Roy-Chowdhury,\n Amit K.\n},\n title = {\n SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal Targets\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1239-1249\n} \n}" }, { "title": "SVDFormer: Complementing Point Cloud via Self-view Augmentation and Self-structure Dual-generator", @@ -50831,14 +52531,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_SVDFormer_Complementing_Point_Cloud_via_Self-view_Augmentation_and_Self-structure_Dual-generator_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;2;0", - "aff_unique_norm": "Nanjing University of Aeronautics and Astronautics;Hong Kong Metropolitan University;Hong Kong Polytechnic University", + "aff_unique_norm": "Nanjing University of Aeronautics and Astronautics;Hong Kong Metropolitan University;The Hong Kong Polytechnic University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.nuaa.edu.cn;https://www.hkmu.edu.hk;https://www.polyu.edu.hk", "aff_unique_abbr": "NUAA;HKMU;PolyU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Zhe and Chen,\n Honghua and He,\n Xing and Wang,\n Weiming and Qin,\n Jing and Wei,\n Mingqiang\n},\n title = {\n SVDFormer: Complementing Point Cloud via Self-view Augmentation and Self-structure Dual-generator\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14508-14518\n} \n}" }, { "title": "SVDiff: Compact Parameter Space for Diffusion Fine-Tuning", @@ -50861,7 +52562,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - 
"oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_SVDiff_Compact_Parameter_Space_for_Diffusion_Fine-Tuning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_SVDiff_Compact_Parameter_Space_for_Diffusion_Fine-Tuning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Ligong and Li,\n Yinxiao and Zhang,\n Han and Milanfar,\n Peyman and Metaxas,\n Dimitris and Yang,\n Feng\n},\n title = {\n SVDiff: Compact Parameter Space for Diffusion Fine-Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7323-7334\n} \n}" }, { "title": "SVQNet: Sparse Voxel-Adjacent Query Network for 4D Spatio-Temporal LiDAR Semantic Segmentation", @@ -50884,7 +52586,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_SVQNet_Sparse_Voxel-Adjacent_Query_Network_for_4D_Spatio-Temporal_LiDAR_Semantic_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_SVQNet_Sparse_Voxel-Adjacent_Query_Network_for_4D_Spatio-Temporal_LiDAR_Semantic_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xuechao and Xu,\n Shuangjie and Zou,\n Xiaoyi and Cao,\n Tongyi and Yeung,\n Dit-Yan and Fang,\n Lu\n},\n title = {\n SVQNet: Sparse Voxel-Adjacent Query Network for 4D Spatio-Temporal LiDAR Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8569-8578\n} \n}" }, { "title": "SYENet: A Simple Yet Effective Network for Multiple Low-Level Vision Tasks with Real-Time Performance on Mobile Device", @@ -50916,7 +52619,8 @@ "aff_campus_unique_index": ";;;;;;", "aff_campus_unique": "", "aff_country_unique_index": 
"0+0;0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gou_2023_ICCV,\n \n author = {\n Gou,\n Weiran and Yi,\n Ziyao and Xiang,\n Yan and Li,\n Shaoqing and Liu,\n Zibin and Kong,\n Dehui and Xu,\n Ke\n},\n title = {\n SYENet: A Simple Yet Effective Network for Multiple Low-Level Vision Tasks with Real-Time Performance on Mobile Device\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12182-12195\n} \n}" }, { "title": "Saliency Regularization for Self-Training with Partial Annotations", @@ -50948,7 +52652,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Shouwen and Wan,\n Qian and Xiang,\n Xiang and Zeng,\n Zhigang\n},\n title = {\n Saliency Regularization for Self-Training with Partial Annotations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1611-1620\n} \n}" }, { "title": "Sample-adaptive Augmentation for Point Cloud Recognition Against Real-world Corruptions", @@ -50973,14 +52678,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Sample-adaptive_Augmentation_for_Point_Cloud_Recognition_Against_Real-world_Corruptions_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;1;0", - "aff_unique_norm": "Beijing Institute of Technology;Chinese University of Hong Kong", + "aff_unique_norm": "Beijing Institute of Technology;The Chinese University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "http://www.bit.edu.cn/;https://www.cuhk.edu.hk", "aff_unique_abbr": "BIT;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", 
"aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jie and Ding,\n Lihe and Xu,\n Tingfa and Dong,\n Shaocong and Xu,\n Xinli and Bai,\n Long and Li,\n Jianan\n},\n title = {\n Sample-adaptive Augmentation for Point Cloud Recognition Against Real-world Corruptions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14330-14339\n} \n}" }, { "title": "Sample-wise Label Confidence Incorporation for Learning with Noisy Labels", @@ -51005,14 +52711,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ahn_Sample-wise_Label_Confidence_Incorporation_for_Learning_with_Noisy_Labels_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", - "aff_unique_norm": "Samsung", - "aff_unique_dep": "Samsung Advanced Institute of Technology", + "aff_unique_norm": "Samsung Advanced Institute of Technology", + "aff_unique_dep": "", "aff_unique_url": "https://www.sait.samsung.com", "aff_unique_abbr": "SAIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ahn_2023_ICCV,\n \n author = {\n Ahn,\n Chanho and Kim,\n Kikyung and Baek,\n Ji-won and Lim,\n Jongin and Han,\n Seungju\n},\n title = {\n Sample-wise Label Confidence Incorporation for Learning with Noisy Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1823-1832\n} \n}" }, { "title": "Sample4Geo: Hard Negative Sampling For Cross-View Geo-Localisation", @@ -51044,7 +52751,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0;0", - 
"aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Deuser_2023_ICCV,\n \n author = {\n Deuser,\n Fabian and Habel,\n Konrad and Oswald,\n Norbert\n},\n title = {\n Sample4Geo: Hard Negative Sampling For Cross-View Geo-Localisation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16847-16856\n} \n}" }, { "title": "Sat2Density: Faithful Density Learning from Satellite-Ground Image Pairs", @@ -51076,7 +52784,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Wuhan;", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qian_2023_ICCV,\n \n author = {\n Qian,\n Ming and Xiong,\n Jincheng and Xia,\n Gui-Song and Xue,\n Nan\n},\n title = {\n Sat2Density: Faithful Density Learning from Satellite-Ground Image Pairs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3683-3692\n} \n}" }, { "title": "SatlasPretrain: A Large-Scale Dataset for Remote Sensing Image Understanding", @@ -51108,7 +52817,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bastani_2023_ICCV,\n \n author = {\n Bastani,\n Favyen and Wolters,\n Piper and Gupta,\n Ritwik and Ferdinando,\n Joe and Kembhavi,\n Aniruddha\n},\n title = {\n SatlasPretrain: A Large-Scale Dataset for Remote Sensing Image Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16772-16782\n} \n}" }, { "title": "Scalable Diffusion Models with Transformers", @@ -51140,7 +52850,8 @@ 
"aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Peebles_2023_ICCV,\n \n author = {\n Peebles,\n William and Xie,\n Saining\n},\n title = {\n Scalable Diffusion Models with Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4195-4205\n} \n}" }, { "title": "Scalable Multi-Temporal Remote Sensing Change Data Generation via Simulating Stochastic Change Process", @@ -51172,7 +52883,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0+1;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Zhuo and Tian,\n Shiqi and Ma,\n Ailong and Zhang,\n Liangpei and Zhong,\n Yanfei\n},\n title = {\n Scalable Multi-Temporal Remote Sensing Change Data Generation via Simulating Stochastic Change Process\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21818-21827\n} \n}" }, { "title": "Scalable Video Object Segmentation with Simplified Framework", @@ -51204,7 +52916,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Qiangqiang and Yang,\n Tianyu and Wu,\n Wei and Chan,\n Antoni B.\n},\n title = {\n Scalable Video Object Segmentation with Simplified Framework\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
13879-13889\n} \n}" }, { "title": "Scale-Aware Modulation Meet Transformer", @@ -51236,7 +52949,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Weifeng and Wu,\n Ziheng and Chen,\n Jiayu and Huang,\n Jun and Jin,\n Lianwen\n},\n title = {\n Scale-Aware Modulation Meet Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6015-6026\n} \n}" }, { "title": "Scale-MAE: A Scale-Aware Masked Autoencoder for Multiscale Geospatial Representation Learning", @@ -51261,14 +52975,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Reed_Scale-MAE_A_Scale-Aware_Masked_Autoencoder_for_Multiscale_Geospatial_Representation_Learning_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;2;2;0;1;1;0", - "aff_unique_norm": "University of California, Berkeley;Meta;Kitware Inc.", + "aff_unique_norm": "University of California, Berkeley;Facebook AI Research;Kitware Inc.", "aff_unique_dep": "Berkeley AI Research;Facebook AI Research;", "aff_unique_url": "https://www.berkeley.edu;https://research.facebook.com;https://www.kitware.com", "aff_unique_abbr": "UC Berkeley;FAIR;Kitware", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Reed_2023_ICCV,\n \n author = {\n Reed,\n Colorado J and Gupta,\n Ritwik and Li,\n Shufan and Brockman,\n Sarah and Funk,\n Christopher and Clipp,\n Brian and Keutzer,\n Kurt and Candido,\n Salvatore and Uyttendaele,\n Matt and Darrell,\n Trevor\n},\n title = {\n Scale-MAE: A Scale-Aware Masked Autoencoder for Multiscale Geospatial 
Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4088-4099\n} \n}" }, { "title": "Scaling Data Generation in Vision-and-Language Navigation", @@ -51300,7 +53015,8 @@ "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;1;0+2;2;0;1;0;1;2", - "aff_country_unique": "Australia;United States;China" + "aff_country_unique": "Australia;United States;China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Zun and Li,\n Jialu and Hong,\n Yicong and Wang,\n Yi and Wu,\n Qi and Bansal,\n Mohit and Gould,\n Stephen and Tan,\n Hao and Qiao,\n Yu\n},\n title = {\n Scaling Data Generation in Vision-and-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12009-12020\n} \n}" }, { "title": "ScanNet++: A High-Fidelity Dataset of 3D Indoor Scenes", @@ -51308,8 +53024,8 @@ "status": "Oral", "track": "main", "pid": "9268", - "author_site": "Chandan Yeshwanth, Yueh-Cheng Liu, Matthias Nie\u00dfner, Angela Dai", - "author": "Chandan Yeshwanth; Yueh-Cheng Liu; Matthias Nie\u00dfner; Angela Dai", + "author_site": "Chandan Yeshwanth, Yueh-Cheng Liu, Matthias Nießner, Angela Dai", + "author": "Chandan Yeshwanth; Yueh-Cheng Liu; Matthias Nießner; Angela Dai", "abstract": "We present ScanNet++, a large-scale dataset that couples together capture of high-quality and commodity-level geometry and color of indoor scenes. Each scene is captured with a high-end laser scanner at sub-millimeter resolution, along with registered 33-megapixel images from a DSLR camera, and RGB-D streams from an iPhone. 
Scene reconstructions are further annotated with an open vocabulary of semantics, with label-ambiguous scenarios explicitly annotated for comprehensive semantic understanding. ScanNet++ enables a new real-world benchmark for novel view synthesis, both from high-quality RGB capture, and importantly also from commodity-level images, in addition to a new benchmark for 3D semantic scene understanding that comprehensively encapsulates diverse and ambiguous semantic labeling scenarios. Currently, ScanNet++ contains 460 scenes, 280,000 captured DSLR images, and over 3.7M iPhone RGBD frames.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yeshwanth_ScanNet_A_High-Fidelity_Dataset_of_3D_Indoor_Scenes_ICCV_2023_paper.pdf", "aff": ";;;", @@ -51323,7 +53039,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yeshwanth_ScanNet_A_High-Fidelity_Dataset_of_3D_Indoor_Scenes_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yeshwanth_ScanNet_A_High-Fidelity_Dataset_of_3D_Indoor_Scenes_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Yeshwanth_2023_ICCV,\n \n author = {\n Yeshwanth,\n Chandan and Liu,\n Yueh-Cheng and Nie{\\ss\n}ner,\n Matthias and Dai,\n Angela\n},\n title = {\n ScanNet++: A High-Fidelity Dataset of 3D Indoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12-22\n} \n}" }, { "title": "Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos", @@ -51346,7 +53063,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Scanning_Only_Once_An_End-to-end_Framework_for_Fast_Temporal_Grounding_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Pan_Scanning_Only_Once_An_End-to-end_Framework_for_Fast_Temporal_Grounding_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Yulin and He,\n Xiangteng and Gong,\n Biao and Lv,\n Yiliang and Shen,\n Yujun and Peng,\n Yuxin and Zhao,\n Deli\n},\n title = {\n Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13767-13777\n} \n}" }, { "title": "ScatterNeRF: Seeing Through Fog with Physically-Based Inverse Neural Rendering", @@ -51378,7 +53096,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Ramazzina_2023_ICCV,\n \n author = {\n Ramazzina,\n Andrea and Bijelic,\n Mario and Walz,\n Stefanie and Sanvito,\n Alessandro and Scheuble,\n Dominik and Heide,\n Felix\n},\n title = {\n ScatterNeRF: Seeing Through Fog with Physically-Based Inverse Neural Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17957-17968\n} \n}" }, { "title": "Scene Graph Contrastive Learning for Embodied Navigation", @@ -51401,7 +53120,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Singh_Scene_Graph_Contrastive_Learning_for_Embodied_Navigation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Singh_Scene_Graph_Contrastive_Learning_for_Embodied_Navigation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Singh_2023_ICCV,\n \n author = {\n Singh,\n Kunal Pratap and Salvador,\n Jordi and Weihs,\n Luca and 
Kembhavi,\n Aniruddha\n},\n title = {\n Scene Graph Contrastive Learning for Embodied Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10884-10894\n} \n}" }, { "title": "Scene Matters: Model-based Deep Video Compression", @@ -51433,7 +53153,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Lv and Zhang,\n Xinfeng and Zhang,\n Gai and Ma,\n Xiaoqi\n},\n title = {\n Scene Matters: Model-based Deep Video Compression\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12481-12491\n} \n}" }, { "title": "Scene as Occupancy", @@ -51458,14 +53179,15 @@ "author_num": 11, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tong_Scene_as_Occupancy_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+2+1;0+3;0+1;2;2;0;2;1;0+3;0", - "aff_unique_norm": "Shanghai AI Laboratory;University of Hong Kong;SenseTime;Chinese University of Hong Kong", + "aff_unique_norm": "Shanghai AI Laboratory;The University of Hong Kong;SenseTime;The Chinese University of Hong Kong", "aff_unique_dep": ";;SenseTime Research;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.hku.hk;https://www.sensetime.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "SAIL;HKU;SenseTime;CUHK", "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0+0;0+0;0+0;0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tong_2023_ICCV,\n \n author = {\n Tong,\n Wenwen and Sima,\n Chonghao and Wang,\n Tai and Chen,\n Li and Wu,\n Silei and Deng,\n Hanming and 
Gu,\n Yi and Lu,\n Lewei and Luo,\n Ping and Lin,\n Dahua and Li,\n Hongyang\n},\n title = {\n Scene as Occupancy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8406-8415\n} \n}" }, { "title": "Scene-Aware Feature Matching", @@ -51497,7 +53219,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Xiaoyong and Yan,\n Yaping and Wei,\n Tong and Du,\n Songlin\n},\n title = {\n Scene-Aware Feature Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3704-3713\n} \n}" }, { "title": "Scene-Aware Label Graph Learning for Multi-Label Image Classification", @@ -51529,7 +53252,8 @@ "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Hangzhou;Nanjing", "aff_country_unique_index": "0;0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Xuelin and Liu,\n Jian and Liu,\n Weijia and Ge,\n Jiawei and Liu,\n Bo and Cao,\n Jiuxin\n},\n title = {\n Scene-Aware Label Graph Learning for Multi-Label Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1473-1482\n} \n}" }, { "title": "SceneRF: Self-Supervised Monocular 3D Scene Reconstruction with Radiance Fields", @@ -51554,14 +53278,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cao_SceneRF_Self-Supervised_Monocular_3D_Scene_Reconstruction_with_Radiance_Fields_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "INRIA", + 
"aff_unique_norm": "Inria", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "Inria", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Anh-Quan and de Charette,\n Raoul\n},\n title = {\n SceneRF: Self-Supervised Monocular 3D Scene Reconstruction with Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9387-9398\n} \n}" }, { "title": "Scenimefy: Learning to Craft Anime Scene via Semi-Supervised Image-to-Image Translation", @@ -51593,7 +53318,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Yuxin and Jiang,\n Liming and Yang,\n Shuai and Loy,\n Chen Change\n},\n title = {\n Scenimefy: Learning to Craft Anime Scene via Semi-Supervised Image-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7357-7367\n} \n}" }, { "title": "Score Priors Guided Deep Variational Inference for Unsupervised Real-World Single Image Denoising", @@ -51625,7 +53351,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Jun and Liu,\n Tao and Tan,\n Shan\n},\n title = {\n Score Priors Guided Deep Variational Inference for Unsupervised Real-World Single Image Denoising\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12937-12948\n} \n}" }, { "title": "Score-Based Diffusion Models as Principled Priors for Inverse Imaging", @@ -51648,7 +53375,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Feng_Score-Based_Diffusion_Models_as_Principled_Priors_for_Inverse_Imaging_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Feng_Score-Based_Diffusion_Models_as_Principled_Priors_for_Inverse_Imaging_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Berthy T. and Smith,\n Jamie and Rubinstein,\n Michael and Chang,\n Huiwen and Bouman,\n Katherine L. and Freeman,\n William T.\n},\n title = {\n Score-Based Diffusion Models as Principled Priors for Inverse Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10520-10531\n} \n}" }, { "title": "Scratch Each Other's Back: Incomplete Multi-Modal Brain Tumor Segmentation via Category Aware Group Self-Support Learning", @@ -51680,7 +53408,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qiu_2023_ICCV,\n \n author = {\n Qiu,\n Yansheng and Chen,\n Delin and Yao,\n Hongdou and Xu,\n Yongchao and Wang,\n Zheng\n},\n title = {\n Scratch Each Other's Back: Incomplete Multi-Modal Brain Tumor Segmentation via Category Aware Group Self-Support Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21317-21326\n} \n}" }, { "title": "Scratching Visual Transformer's Back with Uniform Attention", @@ -51692,7 +53421,7 @@ "author": 
"Nam Hyeon-Woo; Kim Yu-Ji; Byeongho Heo; Dongyoon Han; Seong Joon Oh; Tae-Hyun Oh", "abstract": "The favorable performance of Vision Transformers (ViTs) is often attributed to the multi-head self-attention (MSA), which enables global interactions at each layer of\n a ViT model. Previous works acknowledge the property of long-range dependency for the effectiveness in MSA. In this work, we study the role of MSA in terms of the different axis, density. Our preliminary analyses suggest that the spatial interactions of learned attention maps are close to dense interactions rather than sparse ones. This is a curious phenomenon because dense attention maps are harder for the model to learn due to softmax. We interpret this opposite behavior against softmax as a strong preference for the ViT models to include dense interaction. We thus manually insert the dense uniform attention to each layer of the ViT models to supply the much-needed dense interactions. We call this method Context Broadcasting, CB. Our study demonstrates the inclusion of CB takes the role of dense attention, and thereby reduces the degree of density in the original attention maps by complying softmax in MSA. 
We also show that, with negligible costs of CB (1 line in your model code and no additional parameters), both the capacity and generalizability of the ViT models are increased.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hyeon-Woo_Scratching_Visual_Transformers_Back_with_Uniform_Attention_ICCV_2023_paper.pdf", - "aff": "POSTECH; POSTECH; NA VER AI Lab; NA VER AI Lab; T\u00a8ubingen University; POSTECH+Institute for Convergence Research and Education in Advanced Technology, Yonsei University", + "aff": "POSTECH; POSTECH; NA VER AI Lab; NA VER AI Lab; T¨ubingen University; POSTECH+Institute for Convergence Research and Education in Advanced Technology, Yonsei University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Hyeon-Woo_Scratching_Visual_Transformers_ICCV_2023_supplemental.pdf", @@ -51705,14 +53434,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hyeon-Woo_Scratching_Visual_Transformers_Back_with_Uniform_Attention_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;1;2;0+3", - "aff_unique_norm": "Pohang University of Science and Technology;NAVER Corporation;University of T\u00fcbingen;Yonsei University", + "aff_unique_norm": "Pohang University of Science and Technology;NAVER Corporation;University of Tübingen;Yonsei University", "aff_unique_dep": ";AI Lab;;Institute for Convergence Research and Education in Advanced Technology", "aff_unique_url": "https://www.postech.ac.kr;https://www.naver.com;https://www.uni-tuebingen.de/;https://www.yonsei.ac.kr", - "aff_unique_abbr": "POSTECH;NAVER;Uni T\u00fcbingen;Yonsei", + "aff_unique_abbr": "POSTECH;NAVER;Uni Tübingen;Yonsei", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pohang;", "aff_country_unique_index": "0;0;0;0;1;0+0", - "aff_country_unique": "South Korea;Germany" + "aff_country_unique": "South Korea;Germany", + "bibtex": "@InProceedings{Hyeon-Woo_2023_ICCV,\n \n author = {\n Hyeon-Woo,\n Nam and 
Yu-Ji,\n Kim and Heo,\n Byeongho and Han,\n Dongyoon and Oh,\n Seong Joon and Oh,\n Tae-Hyun\n},\n title = {\n Scratching Visual Transformer's Back with Uniform Attention\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5807-5818\n} \n}" }, { "title": "Seal-3D: Interactive Pixel-Level Editing for Neural Radiance Fields", @@ -51744,7 +53474,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xiangyu and Zhu,\n Jingsen and Ye,\n Qi and Huo,\n Yuchi and Ran,\n Yunlong and Zhong,\n Zhihua and Chen,\n Jiming\n},\n title = {\n Seal-3D: Interactive Pixel-Level Editing for Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17683-17693\n} \n}" }, { "title": "Search for or Navigate to? Dual Adaptive Thinking for Object Navigation", @@ -51776,7 +53507,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Suzhou", "aff_country_unique_index": "0;0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dang_2023_ICCV,\n \n author = {\n Dang,\n Ronghao and Wang,\n Liuyi and He,\n Zongtao and Su,\n Shuai and Tang,\n Jiagui and Liu,\n Chengju and Chen,\n Qijun\n},\n title = {\n Search for or Navigate to? 
Dual Adaptive Thinking for Object Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8250-8259\n} \n}" }, { "title": "See More and Know More: Zero-shot Point Cloud Segmentation via Multi-modal Visual Data", @@ -51801,14 +53533,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lu_See_More_and_Know_More_Zero-shot_Point_Cloud_Segmentation_via_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;3;0", - "aff_unique_norm": "ShanghaiTech University;University of Hong Kong;Shanghai AI Laboratory;Chinese University of Hong Kong", + "aff_unique_norm": "ShanghaiTech University;The University of Hong Kong;Shanghai AI Laboratory;The Chinese University of Hong Kong", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.hku.hk;https://www.shanghai-ai-lab.com;https://www.cuhk.edu.hk", "aff_unique_abbr": "ShanghaiTech;HKU;SAIL;CUHK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Yuhang and Jiang,\n Qi and Chen,\n Runnan and Hou,\n Yuenan and Zhu,\n Xinge and Ma,\n Yuexin\n},\n title = {\n See More and Know More: Zero-shot Point Cloud Segmentation via Multi-modal Visual Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21674-21684\n} \n}" }, { "title": "SeeABLE: Soft Discrepancies and Bounded Contrastive Learning for Exposing Deepfakes", @@ -51840,7 +53573,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cergy;", "aff_country_unique_index": "0+1;0;1;1;0", - "aff_country_unique": "France;Slovenia" + "aff_country_unique": "France;Slovenia", + "bibtex": 
"@InProceedings{Larue_2023_ICCV,\n \n author = {\n Larue,\n Nicolas and Vu,\n Ngoc-Son and Struc,\n Vitomir and Peer,\n Peter and Christophides,\n Vassilis\n},\n title = {\n SeeABLE: Soft Discrepancies and Bounded Contrastive Learning for Exposing Deepfakes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21011-21021\n} \n}" }, { "title": "Seeing Beyond the Patch: Scale-Adaptive Semantic Segmentation of High-resolution Remote Sensing Imagery based on Reinforcement Learning", @@ -51872,7 +53606,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yinhe and Shi,\n Sunan and Wang,\n Junjue and Zhong,\n Yanfei\n},\n title = {\n Seeing Beyond the Patch: Scale-Adaptive Semantic Segmentation of High-resolution Remote Sensing Imagery based on Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16868-16878\n} \n}" }, { "title": "SegGPT: Towards Segmenting Everything in Context", @@ -51904,7 +53639,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xinlong and Zhang,\n Xiaosong and Cao,\n Yue and Wang,\n Wen and Shen,\n Chunhua and Huang,\n Tiejun\n},\n title = {\n SegGPT: Towards Segmenting Everything in Context\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1130-1140\n} \n}" }, { "title": "SegPrompt: Boosting Open-World 
Segmentation via Category-Level Prompt Learning", @@ -51936,7 +53672,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1+0;0;1;0+0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Muzhi and Li,\n Hengtao and Chen,\n Hao and Fan,\n Chengxiang and Mao,\n Weian and Jing,\n Chenchen and Liu,\n Yifan and Shen,\n Chunhua\n},\n title = {\n SegPrompt: Boosting Open-World Segmentation via Category-Level Prompt Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 999-1008\n} \n}" }, { "title": "SegRCDB: Semantic Segmentation via Formula-Driven Supervised Learning", @@ -51959,7 +53696,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shinoda_SegRCDB_Semantic_Segmentation_via_Formula-Driven_Supervised_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shinoda_SegRCDB_Semantic_Segmentation_via_Formula-Driven_Supervised_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Shinoda_2023_ICCV,\n \n author = {\n Shinoda,\n Risa and Hayamizu,\n Ryo and Nakashima,\n Kodai and Inoue,\n Nakamasa and Yokota,\n Rio and Kataoka,\n Hirokatsu\n},\n title = {\n SegRCDB: Semantic Segmentation via Formula-Driven Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20054-20063\n} \n}" }, { "title": "Segment Anything", @@ -51984,14 +53722,15 @@ "author_num": 12, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kirillov_Segment_Anything_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Meta AI 
Research", "aff_unique_dep": "FAIR", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "Meta AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Kirillov_2023_ICCV,\n \n author = {\n Kirillov,\n Alexander and Mintun,\n Eric and Ravi,\n Nikhila and Mao,\n Hanzi and Rolland,\n Chloe and Gustafson,\n Laura and Xiao,\n Tete and Whitehead,\n Spencer and Berg,\n Alexander C. and Lo,\n Wan-Yen and Dollar,\n Piotr and Girshick,\n Ross\n},\n title = {\n Segment Anything\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4015-4026\n} \n}" }, { "title": "Segment Every Reference Object in Spatial and Temporal Spaces", @@ -52016,14 +53755,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Segment_Every_Reference_Object_in_Spatial_and_Temporal_Spaces_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;1;0+3", - "aff_unique_norm": "University of Hong Kong;ByteDance;Dalian University of Technology;Shanghai AI Laboratory", + "aff_unique_norm": "The University of Hong Kong;ByteDance;Dalian University of Technology;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.hku.hk;https://www.bytedance.com;http://www.dlut.edu.cn/;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "HKU;ByteDance;DUT;SAIL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Jiannan and Jiang,\n Yi and Yan,\n Bin and Lu,\n Huchuan and Yuan,\n Zehuan and Luo,\n Ping\n},\n title = {\n Segment Every Reference Object in Spatial and Temporal Spaces\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2538-2550\n} \n}" }, { "title": "Segmentation of Tubular Structures Using Iterative Training with Tailored Samples", @@ -52031,6 +53771,7 @@ "status": "Poster", "track": "main", "pid": "3587", + "author_site": "Wei Liao", "author": "Wei Liao", "abstract": "We propose a minimal path method to simultaneously compute segmentation masks and extract centerlines of tubular structures with line-topology. Minimal path methods are commonly used for the segmentation of tubular structures in a wide variety of applications. Recent methods use features extracted by CNNs, and often outperform methods using hand-tuned features. However, for CNN-based methods, the samples used for training may be generated inappropriately, so that they can be very different from samples encountered during inference. We approach this discrepancy by introducing a novel iterative training scheme, which enables generating better training samples specifically tailored for the minimal path methods without changing existing annotations. In our method, segmentation masks and centerlines are not determined after one another by post-processing, but obtained using the same steps. Our method requires only very few annotated training images. 
Comparison with seven previous approaches on three public datasets, including satellite images and medical images, shows that our method achieves state-of-the-art results both for segmentation masks and centerlines.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liao_Segmentation_of_Tubular_Structures_Using_Iterative_Training_with_Tailored_Samples_ICCV_2023_paper.pdf", @@ -52050,7 +53791,8 @@ "aff_unique_norm": "Independent Researcher", "aff_unique_dep": "", "aff_unique_url": "", - "aff_unique_abbr": "" + "aff_unique_abbr": "", + "bibtex": "@InProceedings{Liao_2023_ICCV,\n \n author = {\n Liao,\n Wei\n},\n title = {\n Segmentation of Tubular Structures Using Iterative Training with Tailored Samples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23643-23652\n} \n}" }, { "title": "Segmenting Known Objects and Unseen Unknowns without Prior Knowledge", @@ -52076,13 +53818,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gasperini_Segmenting_Known_Objects_and_Unseen_Unknowns_without_Prior_Knowledge_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;0;0;0+2", "aff_unique_norm": "Technical University of Munich;BMW Group;Google", - "aff_unique_dep": ";;Google", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;https://www.bmwgroup.com;https://www.google.com", "aff_unique_abbr": "TUM;BMW;Google", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;0;0;0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Gasperini_2023_ICCV,\n \n author = {\n Gasperini,\n Stefano and Marcos-Ramiro,\n Alvaro and Schmidt,\n Michael and Navab,\n Nassir and Busam,\n Benjamin and Tombari,\n Federico\n},\n title = {\n Segmenting Known Objects and Unseen Unknowns without Prior Knowledge\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19321-19332\n} \n}" }, { "title": "SeiT: Storage-Efficient Vision Training with Tokens Using 1% of Pixel Storage", @@ -52114,7 +53857,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Song and Chun,\n Sanghyuk and Heo,\n Byeongho and Kim,\n Wonjae and Yun,\n Sangdoo\n},\n title = {\n SeiT: Storage-Efficient Vision Training with Tokens Using 1\\% of Pixel Storage\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17248-17259\n} \n}" }, { "title": "Self-Calibrated Cross Attention Network for Few-Shot Segmentation", @@ -52146,7 +53890,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Qianxiong and Zhao,\n Wenting and Lin,\n Guosheng and Long,\n Cheng\n},\n title = {\n Self-Calibrated Cross Attention Network for Few-Shot Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 655-665\n} \n}" }, { "title": "Self-Evolved Dynamic Expansion Model for Task-Free Continual Learning", @@ -52154,6 +53899,7 @@ "status": "Poster", "track": "main", "pid": "8916", + "author_site": "Fei Ye, Adrian G. Bors", "author": "Fei Ye, Adrian G. Bors", "abstract": "Task-Free Continual Learning (TFCL) aims to learn new concepts from a stream of data without any task information. 
The Dynamic Expansion Model (DEM) has shown promising results in TFCL by dynamically expanding the model's capacity to deal with shifts in the data distribution. However, existing approaches only consider the recognition of the input shift as the expansion signal and ignore the correlation between the newly incoming data and previously learned knowledge, resulting in adding and training unnecessary parameters. In this paper, we propose a novel and effective framework for TFCL, which dynamically expands the architecture of a DEM model through a self-assessment mechanism evaluating the diversity of knowledge among existing experts as expansion signals. This mechanism ensures learning additional underlying data distributions with a compact model structure. A novelty-aware sample selection approach is proposed to manage the memory buffer that forces the newly added expert to learn novel information from a data stream, which further promotes the diversity among experts. Moreover, we also propose to reuse previously learned representation information for learning new incoming data by using knowledge transfer in TFCL, which has not been explored before. The DEM expansion and training are regularized through a gradient updating mechanism to gradually explore the positive forward transfer, further improving the performance. Empirical results on TFCL benchmarks show that the proposed framework outperforms the state-of-the-art while using a reasonable number of parameters. 
The code is available at https://github.com/dtuzi123/SEDEM/.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ye_Self-Evolved_Dynamic_Expansion_Model_for_Task-Free_Continual_Learning_ICCV_2023_paper.pdf", @@ -52165,7 +53911,8 @@ "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16987989937128754486&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Self-Evolved_Dynamic_Expansion_Model_for_Task-Free_Continual_Learning_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye_Self-Evolved_Dynamic_Expansion_Model_for_Task-Free_Continual_Learning_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Fei and Bors,\n Adrian G.\n},\n title = {\n Self-Evolved Dynamic Expansion Model for Task-Free Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22102-22112\n} \n}" }, { "title": "Self-Feedback DETR for Temporal Action Detection", @@ -52197,7 +53944,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Jihwan and Lee,\n Miso and Heo,\n Jae-Pil\n},\n title = {\n Self-Feedback DETR for Temporal Action Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10286-10296\n} \n}" }, { "title": "Self-Ordering Point Clouds", @@ -52229,7 +53977,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author 
= {\n Yang,\n Pengwan and Snoek,\n Cees G. M. and Asano,\n Yuki M.\n},\n title = {\n Self-Ordering Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15813-15822\n} \n}" }, { "title": "Self-Organizing Pathway Expansion for Non-Exemplar Class-Incremental Learning", @@ -52261,7 +54010,8 @@ "aff_campus_unique_index": ";;;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Kai and Zheng,\n Kecheng and Feng,\n Ruili and Zhao,\n Deli and Cao,\n Yang and Zha,\n Zheng-Jun\n},\n title = {\n Self-Organizing Pathway Expansion for Non-Exemplar Class-Incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19204-19213\n} \n}" }, { "title": "Self-Supervised Burst Super-Resolution", @@ -52269,8 +54019,8 @@ "status": "Poster", "track": "main", "pid": "11156", - "author_site": "Goutam Bhat, Micha\u00ebl Gharbi, Jiawen Chen, Luc Van Gool, Zhihao Xia", - "author": "Goutam Bhat; Micha\u00ebl Gharbi; Jiawen Chen; Luc Van Gool; Zhihao Xia", + "author_site": "Goutam Bhat, Michaël Gharbi, Jiawen Chen, Luc Van Gool, Zhihao Xia", + "author": "Goutam Bhat; Michaël Gharbi; Jiawen Chen; Luc Van Gool; Zhihao Xia", "abstract": "We introduce a self-supervised training strategy for burst super-resolution that only uses noisy low-resolution bursts during training. Our approach eliminates the need to carefully tune synthetic data simulation pipelines, which often do not match real-world image statistics. 
Compared to weakly-paired training strategies, which require noisy smartphone burst photos of static scenes, paired with a clean reference obtained from a tripod-mounted DSLR camera, our approach is more scalable, and avoids the color mismatch between the smartphone and DSLR. To achieve this, we propose a new self-supervised objective that uses a forward imaging model to recover a high-resolution image from aliased high frequencies in the burst. Our approach does not require any manual tuning of the forward model's parameters; we learn them from data. Furthermore, we show our training strategy is robust to dynamic scene motion in the burst, which enables training burst super-resolution models using in-the-wild data. Extensive experiments on real and synthetic data show that, despite only using noisy bursts during training, models trained with our self-supervised strategy match, and sometimes surpass, the quality of fully-supervised baselines trained with synthetic data or weakly-paired ground-truth. 
Finally, we show our training strategy is general using four different burst super-resolution architectures.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bhat_Self-Supervised_Burst_Super-Resolution_ICCV_2023_paper.pdf", "aff": "CVL, ETH Zurich; Adobe; Adobe; CVL, ETH Zurich; Adobe", @@ -52286,14 +54036,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bhat_Self-Supervised_Burst_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;1", - "aff_unique_norm": "ETH Zurich;Adobe", - "aff_unique_dep": "Computer Vision Laboratory;Adobe Inc.", + "aff_unique_norm": "ETH Zurich;Adobe Inc.", + "aff_unique_dep": "Computer Vision Laboratory;", "aff_unique_url": "https://www.ethz.ch;https://www.adobe.com", "aff_unique_abbr": "ETHZ;Adobe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Bhat_2023_ICCV,\n \n author = {\n Bhat,\n Goutam and Gharbi,\n Micha\\"el and Chen,\n Jiawen and Van Gool,\n Luc and Xia,\n Zhihao\n},\n title = {\n Self-Supervised Burst Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10605-10614\n} \n}" }, { "title": "Self-Supervised Character-to-Character Distillation for Text Recognition", @@ -52322,10 +54073,11 @@ "aff_unique_dep": "AI Institute", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", - "aff_campus_unique_index": "", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0;0;0;0", + "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guan_2023_ICCV,\n \n author = {\n Guan,\n Tongkun and Shen,\n Wei and Yang,\n Xue and Feng,\n Qi 
and Jiang,\n Zekun and Yang,\n Xiaokang\n},\n title = {\n Self-Supervised Character-to-Character Distillation for Text Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19473-19484\n} \n}" }, { "title": "Self-Supervised Monocular Depth Estimation by Direction-aware Cumulative Convolution Network", @@ -52357,7 +54109,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Macau SAR;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Wencheng and Yin,\n Junbo and Shen,\n Jianbing\n},\n title = {\n Self-Supervised Monocular Depth Estimation by Direction-aware Cumulative Convolution Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8613-8623\n} \n}" }, { "title": "Self-Supervised Object Detection from Egocentric Videos", @@ -52380,7 +54133,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Akiva_Self-Supervised_Object_Detection_from_Egocentric_Videos_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Akiva_Self-Supervised_Object_Detection_from_Egocentric_Videos_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Akiva_2023_ICCV,\n \n author = {\n Akiva,\n Peri and Huang,\n Jing and Liang,\n Kevin J and Kovvuri,\n Rama and Chen,\n Xingyu and Feiszli,\n Matt and Dana,\n Kristin and Hassner,\n Tal\n},\n title = {\n Self-Supervised Object Detection from Egocentric Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5225-5237\n} \n}" }, { "title": "Self-regulating Prompts: 
Foundational Model Adaptation without Forgetting", @@ -52392,7 +54146,7 @@ "author": "Muhammad Uzair Khattak; Syed Talal Wasim; Muzammal Naseer; Salman Khan; Ming-Hsuan Yang; Fahad Shahbaz Khan", "abstract": "Prompt learning has emerged as an efficient alternative for fine-tuning foundational models, such as CLIP, for various downstream tasks. Conventionally trained using the task-specific objective, i.e., cross-entropy loss, prompts tend to overfit downstream data distributions and find it challenging to capture task-agnostic general features from the frozen CLIP. This leads to the loss of the model's original generalization capability. To address this issue, our work introduces a self-regularization framework for prompting called PromptSRC (Prompting with Self-regulating Constraints). PromptSRC guides the prompts to optimize for both task-specific and task-agnostic general representations using a three-pronged approach by: (a) regulating prompted representations via mutual agreement maximization with the frozen model, (b) regulating with self-ensemble of prompts over the training trajectory to encode their complementary strengths, and (c) regulating with textual diversity to mitigate sample diversity imbalance with the visual branch. To the best of our knowledge, this is the first regularization framework for prompt learning that avoids overfitting by jointly attending to pre-trained model features, the training trajectory during prompting, and the textual diversity. PromptSRC explicitly steers the prompts to learn a representation space that maximizes performance on downstream tasks without compromising CLIP generalization. We perform extensive experiments on 4 benchmarks where PromptSRC overall performs favorably well compared to the existing methods. 
Our code and pre-trained models are publicly available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Khattak_Self-regulating_Prompts_Foundational_Model_Adaptation_without_Forgetting_ICCV_2023_paper.pdf", - "aff": "Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI+Australian National University; University of California, Merced; Mohamed bin Zayed University of AI+Link \u00a8oping University+Google Research", + "aff": "Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI+Australian National University; University of California, Merced; Mohamed bin Zayed University of AI+Link ¨oping University+Google Research", "project": "", "github": "https://github.com/muzairkhattak/PromptSRC", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Khattak_Self-regulating_Prompts_Foundational_ICCV_2023_supplemental.pdf", @@ -52405,14 +54159,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Khattak_Self-regulating_Prompts_Foundational_Model_Adaptation_without_Forgetting_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1;2;0+3+4", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;University of California, Merced;Link\u00f6ping University;Google", + "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;University of California, Merced;Linköping University;Google", "aff_unique_dep": ";;;;Google Research", "aff_unique_url": "https://mbzuai.ac.ae;https://www.anu.edu.au;https://www.ucmerced.edu;https://www.liu.se;https://research.google", "aff_unique_abbr": "MBZUAI;ANU;UC Merced;LiU;Google Research", "aff_campus_unique_index": ";1;2", "aff_campus_unique": ";Merced;Mountain View", "aff_country_unique_index": "0;0;0;0+1;2;0+3+2", - 
"aff_country_unique": "United Arab Emirates;Australia;United States;Sweden" + "aff_country_unique": "United Arab Emirates;Australia;United States;Sweden", + "bibtex": "@InProceedings{Khattak_2023_ICCV,\n \n author = {\n Khattak,\n Muhammad Uzair and Wasim,\n Syed Talal and Naseer,\n Muzammal and Khan,\n Salman and Yang,\n Ming-Hsuan and Khan,\n Fahad Shahbaz\n},\n title = {\n Self-regulating Prompts: Foundational Model Adaptation without Forgetting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15190-15200\n} \n}" }, { "title": "Self-similarity Driven Scale-invariant Learning for Weakly Supervised Person Search", @@ -52420,6 +54175,7 @@ "status": "Poster", "track": "main", "pid": "4902", + "author_site": "Benzhi Wang, Yang Yang, Jinlin Wu, Guo-jun Qi, Zhen Lei", "author": "Benzhi Wang, Yang Yang, Jinlin Wu, Guo-jun Qi, Zhen Lei", "abstract": "Weakly supervised person search aims to jointly detect and match persons with only bounding box annotations. Existing approaches typically focus on improving the features by exploring the relations of persons. However, scale variation problem is a more severe obstacle and under-studied that a person often owns images with different scales (resolutions). For one thing, small-scale images contain less information of a person, thus affecting the accuracy of the generated pseudo labels. For another, different similarities between cross-scale images of a person increase the difficulty of matching. In this paper, we address it by proposing a novel one-step framework, named Self-similarity driven Scale-invariant Learning (SSL). Scale invariance can be explored based on the self-similarity prior that it shows the same statistical properties of an image at different scales. 
To this end, we introduce a Multi-scale Exemplar Branch to guide the network in concentrating on the foreground and learning scale-invariant features by hard exemplars mining. To enhance the discriminative power of the learned features, we further introduce a dynamic pseudo label prediction that progressively seeks true labels for training. Experimental results on two standard benchmarks, i.e., PRW and CUHK-SYSU datasets, demonstrate that the proposed method can solve scale variation problem effectively and perform favorably against state-of-the-art methods. Code is available at https://github.com/Wangbenzhi/SSL.git.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_Self-similarity_Driven_Scale-invariant_Learning_for_Weakly_Supervised_Person_Search_ICCV_2023_paper.pdf", @@ -52431,7 +54187,8 @@ "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17858099963403912572&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Self-similarity_Driven_Scale-invariant_Learning_for_Weakly_Supervised_Person_Search_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Self-similarity_Driven_Scale-invariant_Learning_for_Weakly_Supervised_Person_Search_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Benzhi and Yang,\n Yang and Wu,\n Jinlin and Qi,\n Guo-jun and Lei,\n Zhen\n},\n title = {\n Self-similarity Driven Scale-invariant Learning for Weakly Supervised Person Search\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1813-1822\n} \n}" }, { "title": "Self-supervised Cross-view Representation Reconstruction for Change Captioning", @@ -52456,14 +54213,15 @@ "author_num": 6, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Tu_Self-supervised_Cross-view_Representation_Reconstruction_for_Change_Captioning_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;0+1+2;3;4+4;0+1+2", - "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;Pengcheng Laboratory;University of Science and Technology of China;Hangzhou Dianzi University", - "aff_unique_dep": ";Institute of Computing Technology;Peng Cheng Laboratory;;", + "aff_unique_norm": "University of Chinese Academy of Sciences;Chinese Academy of Sciences;Peng Cheng Laboratory;University of Science and Technology of China;Hangzhou Dianzi University", + "aff_unique_dep": ";Institute of Computing Technology;;;", "aff_unique_url": "http://www.ucas.ac.cn;http://www.cas.cn;http://www.pcl.ac.cn;http://www.ustc.edu.cn;http://www.hdu.edu.cn/", "aff_unique_abbr": "UCAS;CAS;PCL;USTC;HGHDU", "aff_campus_unique_index": ";;1;", "aff_campus_unique": ";Lishui", "aff_country_unique_index": "0;0+0;0+0+0;0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tu_2023_ICCV,\n \n author = {\n Tu,\n Yunbin and Li,\n Liang and Su,\n Li and Zha,\n Zheng-Jun and Yan,\n Chenggang and Huang,\n Qingming\n},\n title = {\n Self-supervised Cross-view Representation Reconstruction for Change Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2805-2815\n} \n}" }, { "title": "Self-supervised Image Denoising with Downsampled Invariance Loss and Conditional Blind-Spot Network", @@ -52495,7 +54253,8 @@ "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;1;0+0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Jang_2023_ICCV,\n \n author = {\n Jang,\n Yeong Il and Lee,\n Keuntek and 
Park,\n Gu Yong and Kim,\n Seyun and Cho,\n Nam Ik\n},\n title = {\n Self-supervised Image Denoising with Downsampled Invariance Loss and Conditional Blind-Spot Network\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12196-12205\n} \n}" }, { "title": "Self-supervised Learning of Implicit Shape Representation with Dense Correspondence for Deformable Objects", @@ -52518,7 +54277,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Self-supervised_Learning_of_Implicit_Shape_Representation_with_Dense_Correspondence_for_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Self-supervised_Learning_of_Implicit_Shape_Representation_with_Dense_Correspondence_for_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Baowen and Li,\n Jiahe and Deng,\n Xiaoming and Zhang,\n Yinda and Ma,\n Cuixia and Wang,\n Hongan\n},\n title = {\n Self-supervised Learning of Implicit Shape Representation with Dense Correspondence for Deformable Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14268-14278\n} \n}" }, { "title": "Self-supervised Learning to Bring Dual Reversed Rolling Shutter Images Alive", @@ -52543,14 +54303,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shang_Self-supervised_Learning_to_Bring_Dual_Reversed_Rolling_Shutter_Images_Alive_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0+2", - "aff_unique_norm": "Harbin Institute of Technology;City University of Hong Kong;Pengcheng Laboratory", - "aff_unique_dep": "School of Computer Science and Technology;;Peng Cheng Laboratory", + "aff_unique_norm": "Harbin Institute of Technology;City University of 
Hong Kong;Peng Cheng Laboratory", + "aff_unique_dep": "School of Computer Science and Technology;;", "aff_unique_url": "http://www.hit.edu.cn/;https://www.cityu.edu.hk;", "aff_unique_abbr": "HIT;CityU;", - "aff_campus_unique_index": "0+1;0;0+2", - "aff_campus_unique": "Harbin;Hong Kong SAR;Shenzhen", + "aff_campus_unique_index": "0+1;0;0", + "aff_campus_unique": "Harbin;Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shang_2023_ICCV,\n \n author = {\n Shang,\n Wei and Ren,\n Dongwei and Feng,\n Chaoyu and Wang,\n Xiaotao and Lei,\n Lei and Zuo,\n Wangmeng\n},\n title = {\n Self-supervised Learning to Bring Dual Reversed Rolling Shutter Images Alive\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13086-13094\n} \n}" }, { "title": "Self-supervised Monocular Depth Estimation: Let's Talk About The Weather", @@ -52582,7 +54343,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Saunders_2023_ICCV,\n \n author = {\n Saunders,\n Kieran and Vogiatzis,\n George and Manso,\n Luis J.\n},\n title = {\n Self-supervised Monocular Depth Estimation: Let's Talk About The Weather\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8907-8917\n} \n}" }, { "title": "Self-supervised Monocular Underwater Depth Recovery, Image Restoration, and a Real-sea Video Dataset", @@ -52614,7 +54376,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madras", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": 
"@InProceedings{Varghese_2023_ICCV,\n \n author = {\n Varghese,\n Nisha and Kumar,\n Ashish and Rajagopalan,\n A. N.\n},\n title = {\n Self-supervised Monocular Underwater Depth Recovery,\n Image Restoration,\n and a Real-sea Video Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12248-12258\n} \n}" }, { "title": "Self-supervised Pre-training for Mirror Detection", @@ -52646,7 +54409,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Jiaying and Lau,\n Rynson W.H.\n},\n title = {\n Self-supervised Pre-training for Mirror Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12227-12236\n} \n}" }, { "title": "SemARFlow: Injecting Semantics into Unsupervised Optical Flow Estimation for Autonomous Driving", @@ -52678,7 +54442,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Shuai and Yu,\n Shuzhi and Kim,\n Hannah and Tomasi,\n Carlo\n},\n title = {\n SemARFlow: Injecting Semantics into Unsupervised Optical Flow Estimation for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9566-9577\n} \n}" }, { "title": "Semantic Attention Flow Fields for Monocular Dynamic Scene Decomposition", @@ -52710,7 +54475,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Yiqing and Laidlaw,\n Eliot and Meyerowitz,\n Alexander and Sridhar,\n Srinath and Tompkin,\n James\n},\n title = {\n Semantic Attention Flow Fields for Monocular Dynamic Scene Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21797-21806\n} \n}" }, { "title": "Semantic Information in Contrastive Learning", @@ -52735,14 +54501,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Quan_Semantic_Information_in_Contrastive_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Tokyo", + "aff_unique_norm": "The University of Tokyo", "aff_unique_dep": "Graduate School of Engineering", "aff_unique_url": "https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTokyo", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Quan_2023_ICCV,\n \n author = {\n Quan,\n Shengjiang and Hirano,\n Masahiro and Yamakawa,\n Yuji\n},\n title = {\n Semantic Information in Contrastive Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5686-5696\n} \n}" }, { "title": "Semantic-Aware Dynamic Parameter for Video Inpainting Transformer", @@ -52774,7 +54541,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Eunhye and Yoo,\n Jinsu and Yang,\n Yunjeong and Baik,\n Sungyong and Kim,\n Tae Hyun\n},\n title = {\n Semantic-Aware Dynamic 
Parameter for Video Inpainting Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12949-12958\n} \n}" }, { "title": "Semantic-Aware Implicit Template Learning via Part Deformation Consistency", @@ -52806,7 +54574,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Sihyeon and Joo,\n Minseok and Lee,\n Jaewon and Ko,\n Juyeon and Cha,\n Juhan and Kim,\n Hyunwoo J.\n},\n title = {\n Semantic-Aware Implicit Template Learning via Part Deformation Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 593-603\n} \n}" }, { "title": "Semantically Structured Image Compression via Irregular Group-Based Decoupling", @@ -52838,7 +54607,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Ningbo", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Ruoyu and Gao,\n Yixin and Jin,\n Xin and Feng,\n Runsen and Chen,\n Zhibo\n},\n title = {\n Semantically Structured Image Compression via Irregular Group-Based Decoupling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17237-17247\n} \n}" }, { "title": "Semantics Meets Temporal Correspondence: Self-supervised Object-centric Learning in Videos", @@ -52863,14 +54633,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Qian_Semantics_Meets_Temporal_Correspondence_Self-supervised_Object-centric_Learning_in_Videos_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.shailab.org/", "aff_unique_abbr": "CUHK;Shanghai AI Lab", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qian_2023_ICCV,\n \n author = {\n Qian,\n Rui and Ding,\n Shuangrui and Liu,\n Xian and Lin,\n Dahua\n},\n title = {\n Semantics Meets Temporal Correspondence: Self-supervised Object-centric Learning in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16675-16687\n} \n}" }, { "title": "Semantics-Consistent Feature Search for Self-Supervised Visual Representation Learning", @@ -52902,7 +54673,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Kaiyou and Zhang,\n Shan and Luo,\n Zimeng and Wang,\n Tong and Xie,\n Jin\n},\n title = {\n Semantics-Consistent Feature Search for Self-Supervised Visual Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16099-16108\n} \n}" }, { "title": "Semantify: Simplifying the Control of 3D Morphable Models Using CLIP", @@ -52925,7 +54697,8 @@ 
"aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gralnik_Semantify_Simplifying_the_Control_of_3D_Morphable_Models_Using_CLIP_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gralnik_Semantify_Simplifying_the_Control_of_3D_Morphable_Models_Using_CLIP_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Gralnik_2023_ICCV,\n \n author = {\n Gralnik,\n Omer and Gafni,\n Guy and Shamir,\n Ariel\n},\n title = {\n Semantify: Simplifying the Control of 3D Morphable Models Using CLIP\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14554-14564\n} \n}" }, { "title": "Semi-Supervised Learning via Weight-Aware Distillation under Class Distribution Mismatch", @@ -52957,7 +54730,8 @@ "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Du_2023_ICCV,\n \n author = {\n Du,\n Pan and Zhao,\n Suyun and Sheng,\n Zisen and Li,\n Cuiping and Chen,\n Hong\n},\n title = {\n Semi-Supervised Learning via Weight-Aware Distillation under Class Distribution Mismatch\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16410-16420\n} \n}" }, { "title": "Semi-Supervised Semantic Segmentation under Label Noise via Diverse Learning Groups", @@ -52980,7 +54754,8 @@ "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "author_num": 10, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Semi-Supervised_Semantic_Segmentation_under_Label_Noise_via_Diverse_Learning_Groups_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Li_Semi-Supervised_Semantic_Segmentation_under_Label_Noise_via_Diverse_Learning_Groups_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Peixia and Purkait,\n Pulak and Ajanthan,\n Thalaiyasingam and Abdolshah,\n Majid and Garg,\n Ravi and Husain,\n Hisham and Xu,\n Chenchen and Gould,\n Stephen and Ouyang,\n Wanli and van den Hengel,\n Anton\n},\n title = {\n Semi-Supervised Semantic Segmentation under Label Noise via Diverse Learning Groups\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1229-1238\n} \n}" }, { "title": "Semi-supervised Semantics-guided Adversarial Training for Robust Trajectory Prediction", @@ -53012,7 +54787,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jiao_2023_ICCV,\n \n author = {\n Jiao,\n Ruochen and Liu,\n Xiangguo and Sato,\n Takami and Chen,\n Qi Alfred and Zhu,\n Qi\n},\n title = {\n Semi-supervised Semantics-guided Adversarial Training for Robust Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8207-8217\n} \n}" }, { "title": "Semi-supervised Speech-driven 3D Facial Animation via Cross-modal Encoding", @@ -53038,13 +54814,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Semi-supervised_Speech-driven_3D_Facial_Animation_via_Cross-modal_Encoding_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tencent", - "aff_unique_dep": "Tencent", + "aff_unique_dep": "", "aff_unique_url": "https://www.tencent.com", "aff_unique_abbr": "Tencent", "aff_campus_unique_index": "0;0;0;0", 
"aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Peiji and Wei,\n Huawei and Zhong,\n Yicheng and Wang,\n Zhisheng\n},\n title = {\n Semi-supervised Speech-driven 3D Facial Animation via Cross-modal Encoding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21032-21041\n} \n}" }, { "title": "Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning", @@ -53076,7 +54853,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Haoyu and Cai,\n Jianfei and Zhang,\n Jing and Tao,\n Dacheng and Zhuang,\n Bohan\n},\n title = {\n Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11825-11835\n} \n}" }, { "title": "Sentence Attention Blocks for Answer Grounding", @@ -53108,7 +54886,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khoshsirat_2023_ICCV,\n \n author = {\n Khoshsirat,\n Seyedalireza and Kambhamettu,\n Chandra\n},\n title = {\n Sentence Attention Blocks for Answer Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6080-6090\n} \n}" }, { "title": "Sequential Texts Driven Cohesive Motions Synthesis with Natural Transitions", @@ -53140,7 +54919,8 @@ 
"aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Shuai and Zhuang,\n Sisi and Song,\n Wenfeng and Zhang,\n Xinyu and Chen,\n Hejia and Hao,\n Aimin\n},\n title = {\n Sequential Texts Driven Cohesive Motions Synthesis with Natural Transitions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9498-9508\n} \n}" }, { "title": "Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models", @@ -53165,14 +54945,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lu_Set-level_Guidance_Attack_Boosting_Adversarial_Transferability_of_Vision-Language_Pre-training_Models_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1;2;3;0+4", - "aff_unique_norm": "Southern University of Science and Technology;University of Hong Kong;Monash University;Temple University;Pengcheng Laboratory", - "aff_unique_dep": ";;;;Peng Cheng Laboratory", + "aff_unique_norm": "Southern University of Science and Technology;The University of Hong Kong;Monash University;Temple University;Peng Cheng Laboratory", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.sustech.edu.cn;https://www.hku.hk;https://www.monash.edu;https://www.temple.edu;http://www.pcl.ac.cn", "aff_unique_abbr": "SUSTech;HKU;Monash;Temple;PCL", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;1;2;0+0", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Dong and Wang,\n Zhiqiang and Wang,\n Teng and Guan,\n Weili and Gao,\n Hongchang and Zheng,\n Feng\n},\n title = {\n 
Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 102-111\n} \n}" }, { "title": "Shape Analysis of Euclidean Curves under Frenet-Serret Framework", @@ -53200,11 +54981,12 @@ "aff_unique_norm": "University of Paris-Saclay;ENSIIE;Quantmetry", "aff_unique_dep": "LaMME;;", "aff_unique_url": "https://www.universite-paris-saclay.fr;https://www.ensiie.fr;https://www.quantmetry.com", - "aff_unique_abbr": ";ENSIIE;", + "aff_unique_abbr": ";;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Evry", "aff_country_unique_index": "0;0+0;0+0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Chassat_2023_ICCV,\n \n author = {\n Chassat,\n Perrine and Park,\n Juhyun and Brunel,\n Nicolas\n},\n title = {\n Shape Analysis of Euclidean Curves under Frenet-Serret Framework\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4027-4036\n} \n}" }, { "title": "Shape Anchor Guided Holistic Indoor Scene Understanding", @@ -53236,7 +55018,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Wuhan;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Mingyue and Huan,\n Linxi and Xiong,\n Hanjiang and Shen,\n Shuhan and Zheng,\n Xianwei\n},\n title = {\n Shape Anchor Guided Holistic Indoor Scene Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21916-21926\n} \n}" }, { "title": "ShapeScaffolder: Structure-Aware 3D Shape Generation from Text", @@ 
-53268,7 +55051,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "United Kingdom;Australia" + "aff_country_unique": "United Kingdom;Australia", + "bibtex": "@InProceedings{Tian_2023_ICCV,\n \n author = {\n Tian,\n Xi and Yang,\n Yong-Liang and Wu,\n Qi\n},\n title = {\n ShapeScaffolder: Structure-Aware 3D Shape Generation from Text\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2715-2724\n} \n}" }, { "title": "Shatter and Gather: Learning Referring Image Segmentation with Text Supervision", @@ -53291,7 +55075,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Shatter_and_Gather_Learning_Referring_Image_Segmentation_with_Text_Supervision_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Kim_Shatter_and_Gather_Learning_Referring_Image_Segmentation_with_Text_Supervision_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Dongwon and Kim,\n Namyup and Lan,\n Cuiling and Kwak,\n Suha\n},\n title = {\n Shatter and Gather: Learning Referring Image Segmentation with Text Supervision\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15547-15557\n} \n}" }, { "title": "Shift from Texture-bias to Shape-bias: Edge Deformation-based Augmentation for Robust Object Recognition", @@ -53323,7 +55108,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0+0+0;1;0+0+0;0+0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Xilin and Lin,\n Qinliang and 
Luo,\n Cheng and Xie,\n Weicheng and Song,\n Siyang and Liu,\n Feng and Shen,\n Linlin\n},\n title = {\n Shift from Texture-bias to Shape-bias: Edge Deformation-based Augmentation for Robust Object Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1526-1535\n} \n}" }, { "title": "ShiftNAS: Improving One-shot NAS via Probability Shift", @@ -53355,7 +55141,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Mingyang and Yu,\n Xinyi and Zhao,\n Haodong and Ou,\n Linlin\n},\n title = {\n ShiftNAS: Improving One-shot NAS via Probability Shift\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5919-5928\n} \n}" }, { "title": "Shortcut-V2V: Compression Framework for Video-to-Video Translation Based on Temporal Redundancy Reduction", @@ -53387,7 +55174,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Chung_2023_ICCV,\n \n author = {\n Chung,\n Chaeyeon and Park,\n Yeojeong and Choi,\n Seunghwan and Ganbat,\n Munkhsoyol and Choo,\n Jaegul\n},\n title = {\n Shortcut-V2V: Compression Framework for Video-to-Video Translation Based on Temporal Redundancy Reduction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7612-7622\n} \n}" }, { "title": "Shrinking Class Space for Enhanced Certainty in Semi-Supervised Learning", @@ -53412,14 +55200,15 @@ "author_num": 6, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Shrinking_Class_Space_for_Enhanced_Certainty_in_Semi-Supervised_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;0", - "aff_unique_norm": "University of Hong Kong;University of Sydney;Southeast University;Shanghai AI Laboratory;Nanjing University", + "aff_unique_norm": "The University of Hong Kong;University of Sydney;Southeast University;Shanghai AI Laboratory;Nanjing University", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.hku.hk;https://www.sydney.edu.au;https://www.seu.edu.cn/;https://www.shanghai-ai-lab.com;https://www.nju.edu.cn", "aff_unique_abbr": "HKU;USYD;SEU;SAIL;Nanjing U", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Lihe and Zhao,\n Zhen and Qi,\n Lei and Qiao,\n Yu and Shi,\n Yinghuan and Zhao,\n Hengshuang\n},\n title = {\n Shrinking Class Space for Enhanced Certainty in Semi-Supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16187-16196\n} \n}" }, { "title": "SiLK: Simple Learned Keypoints", @@ -53444,14 +55233,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gleize_SiLK_Simple_Learned_Keypoints_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Meta Platforms, Inc.", "aff_unique_dep": "Meta AI", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gleize_2023_ICCV,\n \n author = {\n Gleize,\n Pierre and Wang,\n 
Weiyao and Feiszli,\n Matt\n},\n title = {\n SiLK: Simple Learned Keypoints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22499-22508\n} \n}" }, { "title": "Sigmoid Loss for Language Image Pre-Training", @@ -53463,7 +55253,7 @@ "author": "Xiaohua Zhai; Basil Mustafa; Alexander Kolesnikov; Lucas Beyer", "abstract": "We propose a simple pairwise sigmoid loss for image-text pre-training. Unlike standard contrastive learning with softmax normalization, the sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. The sigmoid loss simultaneously allows further scaling up the batch size, while also performing better at smaller batch sizes. With only four TPUv4 chips, we can train a Base CLIP model at 4k batch size and a Large LiT model at 20k batch size, the latter achieves 84.5% ImageNet zero-shot accuracy in two days. This disentanglement of the batch size from the loss further allows us to study the impact of examples vs pairs and negative to positive ratio. Finally, we push the batch size to the extreme, up to one million, and find that the benefits of growing batch size quickly diminish, with a more reasonable batch size of 32k being sufficient. 
We hope our research motivates further explorations in improving the quality and efficiency of language-image pre-training.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhai_Sigmoid_Loss_for_Language_Image_Pre-Training_ICCV_2023_paper.pdf", - "aff": "Google DeepMind, Z\u00fcrich, Switzerland; Google DeepMind, Z\u00fcrich, Switzerland; Google DeepMind, Z\u00fcrich, Switzerland; Google DeepMind, Z\u00fcrich, Switzerland", + "aff": "Google DeepMind, Zürich, Switzerland; Google DeepMind, Zürich, Switzerland; Google DeepMind, Zürich, Switzerland; Google DeepMind, Zürich, Switzerland", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhai_Sigmoid_Loss_for_ICCV_2023_supplemental.pdf", @@ -53477,13 +55267,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhai_Sigmoid_Loss_for_Language_Image_Pre-Training_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", - "aff_unique_dep": "Google DeepMind", + "aff_unique_dep": "DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Z\u00fcrich", + "aff_campus_unique": "Zürich", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Xiaohua and Mustafa,\n Basil and Kolesnikov,\n Alexander and Beyer,\n Lucas\n},\n title = {\n Sigmoid Loss for Language Image Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11975-11986\n} \n}" }, { "title": "Sign Language Translation with Iterative Prototype", @@ -53515,7 +55306,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0+0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2023_ICCV,\n \n author = {\n Yao,\n Huijie and Zhou,\n Wengang and Feng,\n Hao and Hu,\n Hezhen and Zhou,\n Hao and Li,\n Houqiang\n},\n title = {\n Sign Language Translation with Iterative Prototype\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15592-15601\n} \n}" }, { "title": "SimFIR: A Simple Framework for Fisheye Image Rectification with Self-supervised Representation Learning", @@ -53538,7 +55330,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Feng_SimFIR_A_Simple_Framework_for_Fisheye_Image_Rectification_with_Self-supervised_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Feng_SimFIR_A_Simple_Framework_for_Fisheye_Image_Rectification_with_Self-supervised_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Hao and Wang,\n Wendi and Deng,\n Jiajun and Zhou,\n Wengang and Li,\n Li and Li,\n Houqiang\n},\n title = {\n SimFIR: A Simple Framework for Fisheye Image Rectification with Self-supervised Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12418-12427\n} \n}" }, { "title": "SimMatchV2: Semi-Supervised Learning with Graph Consistency", @@ -53563,14 +55356,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zheng_SimMatchV2_Semi-Supervised_Learning_with_Graph_Consistency_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;1;0", - "aff_unique_norm": "University of Sydney;SenseTime;University of Tokyo;University of Science and Technology of China;State Grid Anhui Electric Power Research Institute", + "aff_unique_norm": "The University of 
Sydney;SenseTime;University of Tokyo;University of Science and Technology of China;State Grid Anhui Electric Power Research Institute", "aff_unique_dep": "School of Computer Science;SenseTime Research;;;Electric Power Research", "aff_unique_url": "https://www.sydney.edu.au;https://www.sensetime.com;https://www.u-tokyo.ac.jp;http://www.ustc.edu.cn;", "aff_unique_abbr": "USYD;SenseTime;UTokyo;USTC;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;1;1;0", - "aff_country_unique": "Australia;China;Japan" + "aff_country_unique": "Australia;China;Japan", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Mingkai and You,\n Shan and Huang,\n Lang and Luo,\n Chen and Wang,\n Fei and Qian,\n Chen and Xu,\n Chang\n},\n title = {\n SimMatchV2: Semi-Supervised Learning with Graph Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16432-16442\n} \n}" }, { "title": "SimNP: Learning Self-Similarity Priors Between Neural Points", @@ -53602,7 +55396,8 @@ "aff_campus_unique_index": "0;1;0+1;0", "aff_campus_unique": "Saarland;Saarland Informatics Campus", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Wewer_2023_ICCV,\n \n author = {\n Wewer,\n Christopher and Ilg,\n Eddy and Schiele,\n Bernt and Lenssen,\n Jan Eric\n},\n title = {\n SimNP: Learning Self-Similarity Priors Between Neural Points\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8841-8852\n} \n}" }, { "title": "Similarity Min-Max: Zero-Shot Day-Night Domain Adaptation", @@ -53627,14 +55422,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Luo_Similarity_Min-Max_Zero-Shot_Day-Night_Domain_Adaptation_ICCV_2023_paper.html", "aff_unique_index": "0+0+0;0;1;0", - "aff_unique_norm": "Peking University;Pengcheng Laboratory", - "aff_unique_dep": "Wangxuan Institute of Computer Technology;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory", + "aff_unique_dep": "Wangxuan Institute of Computer Technology;", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "PKU;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Luo_2023_ICCV,\n \n author = {\n Luo,\n Rundong and Wang,\n Wenjing and Yang,\n Wenhan and Liu,\n Jiaying\n},\n title = {\n Similarity Min-Max: Zero-Shot Day-Night Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8104-8114\n} \n}" }, { "id": "5be399a1f4", @@ -53653,7 +55449,8 @@ "gs_version_total": 4, "aff_domain": ";;;;;", "email": ";;;;;", - "author_num": 6 + "author_num": 6, + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Yangru and Peng,\n Peixi and Zhao,\n Yifan and Zhai,\n Yunpeng and Xu,\n Haoran and Tian,\n Yonghong\n},\n title = {\n Simoun: Synergizing Interactive Motion-appearance Understanding for Vision-based Reinforcement Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 176-185\n} \n}" }, { "title": "Simple Baselines for Interactive Video Retrieval with Questions and Answers", @@ -53685,7 +55482,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1", - "aff_country_unique": "United States;United 
Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Liang_2023_ICCV,\n \n author = {\n Liang,\n Kaiqu and Albanie,\n Samuel\n},\n title = {\n Simple Baselines for Interactive Video Retrieval with Questions and Answers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11091-11101\n} \n}" }, { "title": "Simple and Effective Out-of-Distribution Detection via Cosine-based Softmax Loss", @@ -53717,7 +55515,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Noh_2023_ICCV,\n \n author = {\n Noh,\n SoonCheol and Jeong,\n DongEon and Lee,\n Jee-Hyong\n},\n title = {\n Simple and Effective Out-of-Distribution Detection via Cosine-based Softmax Loss\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16560-16569\n} \n}" }, { "title": "SimpleClick: Interactive Image Segmentation with Simple Vision Transformers", @@ -53749,7 +55548,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Qin and Xu,\n Zhenlin and Bertasius,\n Gedas and Niethammer,\n Marc\n},\n title = {\n SimpleClick: Interactive Image Segmentation with Simple Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22290-22300\n} \n}" }, { "title": "Simulating Fluids in Real-World Still Images", @@ -53774,14 +55574,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Fan_Simulating_Fluids_in_Real-World_Still_Images_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0+1;2+1+3;2+1", - "aff_unique_norm": "SenseTime;Chinese University of Hong Kong;Shanghai AI Laboratory;Center for Process Innovation and Integration", + "aff_unique_norm": "SenseTime;The Chinese University of Hong Kong;Shanghai AI Laboratory;Center for Process Innovation and Integration", "aff_unique_dep": "SenseTime Research;;;", "aff_unique_url": "https://www.sensetime.com;https://www.cuhk.edu.hk;https://www.shanghai-ai-lab.com;", "aff_unique_abbr": "SenseTime;CUHK;SAIL;CPII", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Siming and Piao,\n Jingtan and Qian,\n Chen and Li,\n Hongsheng and Lin,\n Kwan-Yee\n},\n title = {\n Simulating Fluids in Real-World Still Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15922-15931\n} \n}" }, { "title": "Single Depth-image 3D Reflection Symmetry and Shape Prediction", @@ -53813,7 +55614,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;1;1;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Zhaoxuan and Dong,\n Bo and Li,\n Tong and Heide,\n Felix and Peers,\n Pieter and Yin,\n Baocai and Yang,\n Xin\n},\n title = {\n Single Depth-image 3D Reflection Symmetry and Shape Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8896-8906\n} \n}" }, { 
"title": "Single Image Deblurring with Row-dependent Blur Magnitude", @@ -53838,14 +55640,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ji_Single_Image_Deblurring_with_Row-dependent_Blur_Magnitude_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;1+0;0", - "aff_unique_norm": "University of Tokyo;National Institute of Informatics", + "aff_unique_norm": "The University of Tokyo;National Institute of Informatics", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.nii.ac.jp", "aff_unique_abbr": "UTokyo;NII", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Ji_2023_ICCV,\n \n author = {\n Ji,\n Xiang and Wang,\n Zhixiang and Satoh,\n Shin'ichi and Zheng,\n Yinqiang\n},\n title = {\n Single Image Deblurring with Row-dependent Blur Magnitude\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12269-12280\n} \n}" }, { "title": "Single Image Defocus Deblurring via Implicit Neural Inverse Kernels", @@ -53877,7 +55680,8 @@ "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Guangzhou;", "aff_country_unique_index": "0+0;0+0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Quan_2023_ICCV,\n \n author = {\n Quan,\n Yuhui and Yao,\n Xin and Ji,\n Hui\n},\n title = {\n Single Image Defocus Deblurring via Implicit Neural Inverse Kernels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12600-12610\n} \n}" }, { "title": "Single Image Reflection Separation via Component Synergy", @@ -53909,7 +55713,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tianjin", 
"aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Qiming and Guo,\n Xiaojie\n},\n title = {\n Single Image Reflection Separation via Component Synergy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13138-13147\n} \n}" }, { "title": "Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and Reconstruction", @@ -53932,7 +55737,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Single-Stage_Diffusion_NeRF_A_Unified_Approach_to_3D_Generation_and_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Single-Stage_Diffusion_NeRF_A_Unified_Approach_to_3D_Generation_and_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Hansheng and Gu,\n Jiatao and Chen,\n Anpei and Tian,\n Wei and Tu,\n Zhuowen and Liu,\n Lingjie and Su,\n Hao\n},\n title = {\n Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2416-2425\n} \n}" }, { "title": "Size Does Matter: Size-aware Virtual Try-on via Clothing-oriented Transformation Try-on Network", @@ -53964,7 +55770,8 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "1;1;1;1", - "aff_country_unique": ";China" + "aff_country_unique": ";China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Chieh-Yun and Chen,\n Yi-Chung and Shuai,\n Hong-Han and Cheng,\n Wen-Huang\n},\n title = {\n Size Does Matter: Size-aware Virtual Try-on via Clothing-oriented Transformation Try-on Network\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7513-7522\n} \n}" }, { "title": "SkeleTR: Towards Skeleton-based Action Recognition in the Wild", @@ -53989,14 +55796,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Duan_SkeleTR_Towards_Skeleton-based_Action_Recognition_in_the_Wild_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1;1", - "aff_unique_norm": "Chinese University of Hong Kong;Amazon", + "aff_unique_norm": "The Chinese University of Hong Kong;Amazon Web Services", "aff_unique_dep": ";AWS AI Labs", "aff_unique_url": "https://www.cuhk.edu.hk;https://aws.amazon.com", "aff_unique_abbr": "CUHK;AWS", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1;1;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Duan_2023_ICCV,\n \n author = {\n Duan,\n Haodong and Xu,\n Mingze and Shuai,\n Bing and Modolo,\n Davide and Tu,\n Zhuowen and Tighe,\n Joseph and Bergamo,\n Alessandro\n},\n title = {\n SkeleTR: Towards Skeleton-based Action Recognition in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13634-13644\n} \n}" }, { "title": "SkeletonMAE: Graph-based Masked Autoencoder for Skeleton Sequence Pre-training", @@ -54021,14 +55829,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yan_SkeletonMAE_Graph-based_Masked_Autoencoder_for_Skeleton_Sequence_Pre-training_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "Sun Yat-sen University;Chinese University of Hong Kong", + "aff_unique_norm": "Sun Yat-sen University;The Chinese University of Hong Kong", "aff_unique_dep": "School of Computer Science and 
Engineering;", "aff_unique_url": "http://www.sysu.edu.cn;https://www.cuhk.edu.cn", "aff_unique_abbr": "SYSU;CUHK", "aff_campus_unique_index": "0;0;0;1;0;0", "aff_campus_unique": "Guangzhou;Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Hong and Liu,\n Yang and Wei,\n Yushen and Li,\n Zhen and Li,\n Guanbin and Lin,\n Liang\n},\n title = {\n SkeletonMAE: Graph-based Masked Autoencoder for Skeleton Sequence Pre-training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5606-5618\n} \n}" }, { "title": "Sketch and Text Guided Diffusion Model for Colored Point Cloud Generation", @@ -54060,7 +55869,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Zijie and Wang,\n Yaonan and Feng,\n Mingtao and Xie,\n He and Mian,\n Ajmal\n},\n title = {\n Sketch and Text Guided Diffusion Model for Colored Point Cloud Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8929-8939\n} \n}" }, { "title": "Skill Transformer: A Monolithic Policy for Mobile Manipulation", @@ -54083,7 +55893,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Skill_Transformer_A_Monolithic_Policy_for_Mobile_Manipulation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Skill_Transformer_A_Monolithic_Policy_for_Mobile_Manipulation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n 
Xiaoyu and Batra,\n Dhruv and Rai,\n Akshara and Szot,\n Andrew\n},\n title = {\n Skill Transformer: A Monolithic Policy for Mobile Manipulation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10852-10862\n} \n}" }, { "title": "Skip-Plan: Procedure Planning in Instructional Videos via Condensed Action Space Learning", @@ -54115,7 +55926,8 @@ "aff_campus_unique_index": "1;1;;", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zhiheng and Geng,\n Wenjia and Li,\n Muheng and Chen,\n Lei and Tang,\n Yansong and Lu,\n Jiwen and Zhou,\n Jie\n},\n title = {\n Skip-Plan: Procedure Planning in Instructional Videos via Condensed Action Space Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10297-10306\n} \n}" }, { "title": "SlaBins: Fisheye Depth Estimation using Slanted Bins on Road Environments", @@ -54127,7 +55939,7 @@ "author": "Jongsung Lee; Gyeongsu Cho; Jeongin Park; Kyongjun Kim; Seongoh Lee; Jung-Hee Kim; Seong-Gyun Jeong; Kyungdon Joo", "abstract": "Although 3D perception for autonomous vehicles has focused on frontal-view information, more than half of fatal accidents occur due to side impacts in practice (e.g., T-bone crash). Motivated by this fact, we investigate the problem of side-view depth estimation, especially for monocular fisheye cameras, which provide wide FoV information. However, since fisheye cameras head road areas, it observes road areas mostly and results in severe distortion on object areas, such as vehicles or pedestrians. 
To alleviate these issues, we propose a new fisheye depth estimation network, SlaBins, that infers an accurate and dense depth map based on a geometric property of road environments; most objects are standing (i.e., orthogonal) on the road environments. Concretely, we introduce a slanted multi-cylindrical image (MCI) representation, which allows us to describe a distance as a radius to a cylindrical layer orthogonal to the ground regardless of the camera viewing direction. Based on the slanted MCI, we estimate a set of adaptive bins and a per-pixel probability map for depth estimation. Then by combining it with the estimated slanted angle of viewing direction, we directly infer a dense and accurate depth map for fisheye cameras. Experiments demonstrate that SlaBins outperforms the state-of-the-art methods in both qualitative and quantitative evaluation on the SynWoodScape and KITTI-360 depth datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lee_SlaBins_Fisheye_Depth_Estimation_using_Slanted_Bins_on_Road_Environments_ICCV_2023_paper.pdf", - "aff": "Arti\ufb01cial Intelligence Graduate School, UNIST,242dot Inc.; Arti\ufb01cial Intelligence Graduate School, UNIST,242dot Inc.; Arti\ufb01cial Intelligence Graduate School, UNIST,242dot Inc.; Arti\ufb01cial Intelligence Graduate School, UNIST,242dot Inc.; Arti\ufb01cial Intelligence Graduate School, UNIST,242dot Inc.; 42dot.ai; 42dot.ai; Arti\ufb01cial Intelligence Graduate School, UNIST,242dot Inc.", + "aff": "Artificial Intelligence Graduate School, UNIST,242dot Inc.; Artificial Intelligence Graduate School, UNIST,242dot Inc.; Artificial Intelligence Graduate School, UNIST,242dot Inc.; Artificial Intelligence Graduate School, UNIST,242dot Inc.; Artificial Intelligence Graduate School, UNIST,242dot Inc.; 42dot.ai; 42dot.ai; Artificial Intelligence Graduate School, UNIST,242dot Inc.", "project": "https://syniez.github.io/SlaBins/", "github": "", "supp": 
"https://openaccess.thecvf.com/content/ICCV2023/supplemental/Lee_SlaBins_Fisheye_Depth_ICCV_2023_supplemental.pdf", @@ -54141,13 +55953,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lee_SlaBins_Fisheye_Depth_Estimation_using_Slanted_Bins_on_Road_Environments_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;1;1;0", "aff_unique_norm": "UNIST;42dot.ai", - "aff_unique_dep": "Arti\ufb01cial Intelligence Graduate School;", + "aff_unique_dep": "Artificial Intelligence Graduate School;", "aff_unique_url": "https://www.unist.ac.kr;https://42dot.ai", "aff_unique_abbr": "UNIST;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jongsung and Cho,\n Gyeongsu and Park,\n Jeongin and Kim,\n Kyongjun and Lee,\n Seongoh and Kim,\n Jung-Hee and Jeong,\n Seong-Gyun and Joo,\n Kyungdon\n},\n title = {\n SlaBins: Fisheye Depth Estimation using Slanted Bins on Road Environments\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8765-8774\n} \n}" }, { "title": "Small Object Detection via Coarse-to-fine Proposal Generation and Imitation Learning", @@ -54159,7 +55972,7 @@ "author": "Xiang Yuan; Gong Cheng; Kebing Yan; Qinghua Zeng; Junwei Han", "abstract": "The past few years have witnessed the immense success of object detection, while current excellent detectors struggle on tackling size-limited instances. Concretely, the well-known challenge of low overlaps between the priors and object regions leads to a constrained sample pool for optimization, and the paucity of discriminative information further aggravates the recognition. 
To alleviate the aforementioned issues, we propose CFINet, a two-stage framework tailored for small object detection based on the Coarse-to-fine pipeline and Feature Imitation learning. Firstly, we introduce Coarse-to-fine RPN (CRPN) to ensure sufficient and high-quality proposals for small objects through the dynamic anchor selection strategy and cascade regression. Then, we equip the conventional detection head with a Feature Imitation (FI) branch to facilitate the region representations of size-limited instances that perplex the model in an imitation manner. Moreover, an auxiliary imitation loss following supervised contrastive learning paradigm is devised to optimize this branch. When integrated with Faster RCNN, CFINet achieves state-of-the-art performance on the large-scale small object detection benchmarks, SODA-D and SODA-A, underscoring its superiority over baseline detector and other mainstream detection approaches. Code is available at https://github.com/shaunyuan22/CFINet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yuan_Small_Object_Detection_via_Coarse-to-fine_Proposal_Generation_and_Imitation_Learning_ICCV_2023_paper.pdf", - "aff": "School of Automation, Northwestern Polytechnical University, Xi\u2019an, China; School of Automation, Northwestern Polytechnical University, Xi\u2019an, China; School of Automation, Northwestern Polytechnical University, Xi\u2019an, China; School of Automation, Northwestern Polytechnical University, Xi\u2019an, China; School of Automation, Northwestern Polytechnical University, Xi\u2019an, China", + "aff": "School of Automation, Northwestern Polytechnical University, Xi’an, China; School of Automation, Northwestern Polytechnical University, Xi’an, China; School of Automation, Northwestern Polytechnical University, Xi’an, China; School of Automation, Northwestern Polytechnical University, Xi’an, China; School of Automation, Northwestern Polytechnical University, Xi’an, China", "project": "", "github": 
"https://github.com/shaunyuan22/CFINet", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Yuan_Small_Object_Detection_ICCV_2023_supplemental.pdf", @@ -54179,7 +55992,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Xi'an", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yuan_2023_ICCV,\n \n author = {\n Yuan,\n Xiang and Cheng,\n Gong and Yan,\n Kebing and Zeng,\n Qinghua and Han,\n Junwei\n},\n title = {\n Small Object Detection via Coarse-to-fine Proposal Generation and Imitation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6317-6327\n} \n}" }, { "title": "Smoothness Similarity Regularization for Few-Shot GAN Adaptation", @@ -54211,7 +56025,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Sushko_2023_ICCV,\n \n author = {\n Sushko,\n Vadim and Wang,\n Ruyu and Gall,\n Juergen\n},\n title = {\n Smoothness Similarity Regularization for Few-Shot GAN Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7073-7082\n} \n}" }, { "title": "Snow Removal in Video: A New Dataset and A Novel Method", @@ -54235,15 +56050,16 @@ "email": ";;;;;;", "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Snow_Removal_in_Video_A_New_Dataset_and_A_Novel_ICCV_2023_paper.html", - "aff_unique_index": "0;0;1;0;2;3;0+0", - "aff_unique_norm": "Hong Kong University of Science and Technology;University of Sydney;La Trobe University;University of Maryland", - "aff_unique_dep": ";;;", - "aff_unique_url": 
"https://www.ust.hk;https://www.sydney.edu.au;https://www.latrobe.edu.au;https://www/umd.edu", - "aff_unique_abbr": "HKUST;USYD;LTU;UMD", + "aff_unique_index": "0;0;1;0;2;3;4+0", + "aff_unique_norm": "The Hong Kong University of Science and Technology;University of Sydney;La Trobe University;University of Maryland;Hong Kong University of Science and Technology", + "aff_unique_dep": ";;;;", + "aff_unique_url": "https://www.ust.hk;https://www.sydney.edu.au;https://www.latrobe.edu.au;https://www.umd.edu;https://www.ust.hk", + "aff_unique_abbr": "HKUST;USYD;LTU;UMD;HKUST", "aff_campus_unique_index": "0;0;0;2+0", "aff_campus_unique": "Guangzhou;;Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;1;2;0+0", - "aff_country_unique": "China;Australia;United States" + "aff_country_unique": "China;Australia;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Haoyu and Ren,\n Jingjing and Gu,\n Jinjin and Wu,\n Hongtao and Lu,\n Xuequan and Cai,\n Haoming and Zhu,\n Lei\n},\n title = {\n Snow Removal in Video: A New Dataset and A Novel Method\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13211-13222\n} \n}" }, { "title": "SoDaCam: Software-defined Cameras via Single-Photon Imaging", @@ -54255,7 +56071,7 @@ "author": "Varun Sundar; Andrei Ardelean; Tristan Swedish; Claudio Bruschini; Edoardo Charbon; Mohit Gupta", "abstract": "Reinterpretable cameras are defined by their post-processing capabilities that exceed traditional imaging. We present \"SoDaCam\" that provides reinterpretable cameras at the granularity of photons, from photon-cubes acquired by single-photon devices. Photon-cubes represent the spatio-temporal detections of photons as a sequence of binary frames, at frame-rates as high as 100 kHz.
We show that simple transformations of the photon-cube, or photon-cube projections, provide the functionality of numerous imaging systems including: exposure bracketing, flutter shutter cameras, video compressive systems, event cameras, and even cameras that move during exposure. Our photon-cube projections offer the flexibility of being software-defined constructs that are only limited by what is computable, and shot-noise. We exploit this flexibility to provide new capabilities for the emulated cameras. As an added benefit, our projections provide camera-dependent compression of photon-cubes, which we demonstrate using an implementation of our projections on a novel compute architecture that is designed for single-photon imaging.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sundar_SoDaCam_Software-defined_Cameras_via_Single-Photon_Imaging_ICCV_2023_paper.pdf", - "aff": "University of Wisconsin-Madison; Ecole polytechnique f \u00b4ed\u00b4erale de Lausanne; Ubicept; Ecole polytechnique f \u00b4ed\u00b4erale de Lausanne; Ecole polytechnique f \u00b4ed\u00b4erale de Lausanne; University of Wisconsin-Madison+Ubicept", + "aff": "University of Wisconsin-Madison; École polytechnique fédérale de Lausanne; Ubicept; École polytechnique fédérale de Lausanne; École polytechnique fédérale de Lausanne; University of Wisconsin-Madison+Ubicept", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Sundar_SoDaCam_Software-defined_Cameras_ICCV_2023_supplemental.zip", @@ -54268,14 +56084,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sundar_SoDaCam_Software-defined_Cameras_via_Single-Photon_Imaging_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;1;0+2", - "aff_unique_norm": "University of Wisconsin-Madison;EPFL;Ubicept", + "aff_unique_norm": "University of Wisconsin-Madison;Ecole Polytechnique Fédérale de Lausanne;Ubicept", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.wisc.edu;https://www.epfl.ch;", "aff_unique_abbr": "UW-Madison;EPFL;", "aff_campus_unique_index": "0;1;1;1;0", "aff_campus_unique": "Madison;Lausanne;", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "United States;Switzerland;" + "aff_country_unique": "United States;Switzerland;", + "bibtex": "@InProceedings{Sundar_2023_ICCV,\n \n author = {\n Sundar,\n Varun and Ardelean,\n Andrei and Swedish,\n Tristan and Bruschini,\n Claudio and Charbon,\n Edoardo and Gupta,\n Mohit\n},\n title = {\n SoDaCam: Software-defined Cameras via Single-Photon Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8165-8176\n} \n}" }, { "title": "Social Diffusion: Long-term Multiple Human Motion Anticipation", @@ -54307,7 +56124,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;1;1;1;0+1;1", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Tanke_2023_ICCV,\n \n author = {\n Tanke,\n Julian and Zhang,\n Linguang and Zhao,\n Amy and Tang,\n Chengcheng and Cai,\n Yujun and Wang,\n Lezi and Wu,\n Po-Chen and Gall,\n Juergen and Keskin,\n Cem\n},\n title = {\n Social Diffusion: Long-term Multiple Human Motion Anticipation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9601-9611\n} \n}" }, { "title": "Sound Localization from Motion: Jointly Learning Sound Direction and Camera Rotation", @@ -54330,7 +56148,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Sound_Localization_from_Motion_Jointly_Learning_Sound_Direction_and_Camera_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Sound_Localization_from_Motion_Jointly_Learning_Sound_Direction_and_Camera_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ziyang and Qian,\n Shengyi and Owens,\n Andrew\n},\n title = {\n Sound Localization from Motion: Jointly Learning Sound Direction and Camera Rotation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7897-7908\n} \n}" }, { "title": "Sound Source Localization is All about Cross-Modal Alignment", @@ -54353,7 +56172,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Senocak_Sound_Source_Localization_is_All_about_Cross-Modal_Alignment_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Senocak_Sound_Source_Localization_is_All_about_Cross-Modal_Alignment_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Senocak_2023_ICCV,\n \n author = {\n Senocak,\n Arda and Ryu,\n Hyeonggon and Kim,\n Junsik and Oh,\n Tae-Hyun and Pfister,\n Hanspeter and Chung,\n Joon Son\n},\n title = {\n Sound Source Localization is All about Cross-Modal Alignment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7777-7787\n} \n}" }, { "title": "Source-free Depth for Object Pop-out", @@ -54361,8 +56181,8 @@ "status": "Poster", "track": "main", "pid": "5190", - "author_site": "Zongwei WU, Danda Pani Paudel, Deng-Ping Fan, Jingjing Wang, Shuo Wang, C\u00e9dric Demonceaux, Radu Timofte, Luc Van Gool", - "author": "Zongwei WU; Danda Pani Paudel; Deng-Ping Fan; Jingjing Wang; Shuo Wang; C\u00e9dric Demonceaux; Radu Timofte; Luc Van Gool", + "author_site": "Zongwei WU, Danda Pani Paudel, Deng-Ping Fan, Jingjing Wang, Shuo Wang, Cédric Demonceaux, 
Radu Timofte, Luc Van Gool", + "author": "Zongwei WU; Danda Pani Paudel; Deng-Ping Fan; Jingjing Wang; Shuo Wang; Cédric Demonceaux; Radu Timofte; Luc Van Gool", "abstract": "Depth cues are known to be useful for visual perception. However, direct measurement of depth is often impracticable. Fortunately, though, modern learning-based methods offer promising depth maps by inference in the wild. In this work, we adapt such depth inference models for object segmentation using the objects' \"pop-out\" prior in 3D. The \"pop-out\" is a simple composition prior that assumes objects reside on the background surface. Such compositional prior allows us to reason about objects in the 3D space. More specifically, we adapt the inferred depth maps such that objects can be localized using only 3D information. Such separation, however, requires knowledge about contact surface which we learn using the weak supervision of the segmentation mask. Our intermediate representation of contact surface, and thereby reasoning about objects purely in 3D, allows us to better transfer the depth knowledge into semantics. The proposed adaptation method uses only the depth model without needing the source data used for training, making the learning process efficient and practical. Our experiments on eight datasets of two challenging tasks, namely salient object detection and camouflaged object detection, consistently demonstrate the benefit of our method in terms of both performance and generalizability. 
The source code is publicly available at https://github.com/Zongwei97/PopNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/WU_Source-free_Depth_for_Object_Pop-out_ICCV_2023_paper.pdf", "aff": "CVL, ETH Zurich + University of Burgundy, CNRS, ICB + Computer Vision Lab, CAIDAS & IFI, University of Wurzburg; CVL, ETH Zurich + INSAIT, Sofia University; CVL, ETH Zurich; AUST; CVL, ETH Zurich; University of Burgundy, CNRS, ICB + University of Lorraine, CNRS, Inria, Loria; Computer Vision Lab, CAIDAS & IFI, University of Wurzburg; CVL, ETH Zurich + INSAIT, Sofia University", @@ -54385,7 +56205,8 @@ "aff_campus_unique_index": ";1;;1", "aff_campus_unique": ";Sofia", "aff_country_unique_index": "0+1+2;0+3;0;4;0;1+1;2;0+3", - "aff_country_unique": "Switzerland;France;Germany;Bulgaria;Lebanon" + "aff_country_unique": "Switzerland;France;Germany;Bulgaria;Lebanon", + "bibtex": "@InProceedings{WU_2023_ICCV,\n \n author = {\n WU,\n Zongwei and Paudel,\n Danda Pani and Fan,\n Deng-Ping and Wang,\n Jingjing and Wang,\n Shuo and Demonceaux,\n C\\'edric and Timofte,\n Radu and Van Gool,\n Luc\n},\n title = {\n Source-free Depth for Object Pop-out\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1032-1042\n} \n}" }, { "title": "Source-free Domain Adaptive Human Pose Estimation", @@ -54417,7 +56238,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Orlando", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Qucheng and Zheng,\n Ce and Chen,\n Chen\n},\n title = {\n Source-free Domain Adaptive Human Pose Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4826-4836\n} \n}" }, { 
"title": "Space Engage: Collaborative Space Supervision for Contrastive-Based Semi-Supervised Semantic Segmentation", @@ -54442,14 +56264,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Space_Engage_Collaborative_Space_Supervision_for_Contrastive-Based_Semi-Supervised_Semantic_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0+2;3", - "aff_unique_norm": "Northeastern University;Microsoft;Northeast University;Chinese University of Hong Kong", + "aff_unique_norm": "Northeastern University;Microsoft Research;Northeast University;The Chinese University of Hong Kong", "aff_unique_dep": "School of Computer Science and Engineering;Research;Key Laboratory of Intelligent Computing in Medical Image;", "aff_unique_url": "http://www.neu.edu.cn/;https://www.microsoft.com/en-us/research/group/asia;http://www.neu.edu.cn/;https://www.cuhk.edu.hk", "aff_unique_abbr": "NEU;MSR Asia;NEU;CUHK", "aff_campus_unique_index": "0;0;1;0;3", "aff_campus_unique": "Shenyang;Asia;;Hong Kong SAR", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Changqi and Xie,\n Haoyu and Yuan,\n Yuhui and Fu,\n Chong and Yue,\n Xiangyu\n},\n title = {\n Space Engage: Collaborative Space Supervision for Contrastive-Based Semi-Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 931-942\n} \n}" }, { "title": "Space-time Prompting for Video Class-incremental Learning", @@ -54461,7 +56284,7 @@ "author": "Yixuan Pei; Zhiwu Qing; Shiwei Zhang; Xiang Wang; Yingya Zhang; Deli Zhao; Xueming Qian", "abstract": "Recently, prompt-based learning has made impressive progress on image class-incremental learning, but it still lacks sufficient exploration in the video domain. 
In this paper, we will fill this gap by learning multiple prompts based on a powerful image-language pre-trained model, i.e., CLIP, making it fit for video class-incremental learning (VCIL). For this purpose, we present a space-time prompting approach (ST-Prompt) which contains two kinds of prompts, i.e., task-specific prompts and task-agnostic prompts. The task-specific prompts are to address the catastrophic forgetting problem by learning multi-grained prompts, i.e., spatial prompts, temporal prompts and comprehensive prompts, for accurate task identification. The task-agnostic prompts maintain a globally-shared prompt pool, which can empower the pre-trained image models with temporal perception abilities by exchanging contexts between frames. By this means, ST-Prompt can transfer the plentiful knowledge in the image-language pre-trained models to the VCIL task with only a tiny set of prompts to be optimized. To evaluate ST-Prompt, we conduct extensive experiments on three standard benchmarks. 
The results show that ST-Prompt can significantly surpass the state-of-the-art VCIL methods, especially it gains 9.06% on HMDB51 dataset under the 1*25 stage setting.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Pei_Space-time_Prompting_for_Video_Class-incremental_Learning_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Huazhong University of Science and Technology; Alibaba Group; Alibaba Group; Alibaba Group; Alibaba Group; Xi\u2019an Jiaotong University + Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", + "aff": "Xi’an Jiaotong University; Huazhong University of Science and Technology; Alibaba Group; Alibaba Group; Alibaba Group; Alibaba Group; Xi’an Jiaotong University + Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Pei_Space-time_Prompting_for_Video_Class-incremental_Learning_ICCV_2023_supplemental.pdf", @@ -54474,14 +56297,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pei_Space-time_Prompting_for_Video_Class-incremental_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;2;2;0+3", - "aff_unique_norm": "Xi'an Jiao Tong University;Huazhong University of Science and Technology;Alibaba Group;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", + "aff_unique_norm": "Xi'an Jiaotong University;Huazhong University of Science and Technology;Alibaba Group;Shaanxi Yulan Jiuzhou Intelligent Optoelectronic Technology Co., Ltd", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.xjtu.edu.cn;http://www.hust.edu.cn;https://www.alibaba.com;", "aff_unique_abbr": "XJTU;HUST;Alibaba;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pei_2023_ICCV,\n \n author = {\n Pei,\n Yixuan and Qing,\n 
Zhiwu and Zhang,\n Shiwei and Wang,\n Xiang and Zhang,\n Yingya and Zhao,\n Deli and Qian,\n Xueming\n},\n title = {\n Space-time Prompting for Video Class-incremental Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11932-11942\n} \n}" }, { "title": "SpaceEvo: Hardware-Friendly Search Space Design for Efficient INT8 Inference", @@ -54506,14 +56330,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_SpaceEvo_Hardware-Friendly_Search_Space_Design_for_Efficient_INT8_Inference_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;1;1;1;1;1", - "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft", + "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft Corporation", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "SJTU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;1;1;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xudong and Zhang,\n Li Lyna and Xu,\n Jiahang and Zhang,\n Quanlu and Wang,\n Yujing and Yang,\n Yuqing and Zheng,\n Ningxin and Cao,\n Ting and Yang,\n Mao\n},\n title = {\n SpaceEvo: Hardware-Friendly Search Space Design for Efficient INT8 Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5819-5828\n} \n}" }, { "title": "Spacetime Surface Regularization for Neural Dynamic Scene Reconstruction", @@ -54538,14 +56363,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Choe_Spacetime_Surface_Regularization_for_Neural_Dynamic_Scene_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;2;1;0+3", - "aff_unique_norm": "NVIDIA;Korea Advanced Institute of Science and Technology;Seoul National University;California Institute of Technology", - "aff_unique_dep": "NVIDIA Corporation;;;", + "aff_unique_norm": "NVIDIA Corporation;Korea Advanced Institute of Science and Technology;Seoul National University;California Institute of Technology", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nvidia.com;https://www.kaist.ac.kr;https://www.snu.ac.kr;https://www.caltech.edu", "aff_unique_abbr": "NVIDIA;KAIST;SNU;Caltech", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0+1;0;1;1;0+0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + "bibtex": "@InProceedings{Choe_2023_ICCV,\n \n author = {\n Choe,\n Jaesung and Choy,\n Christopher and Park,\n Jaesik and Kweon,\n In So and Anandkumar,\n Anima\n},\n title = {\n Spacetime Surface Regularization for Neural Dynamic Scene Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17871-17881\n} \n}" }, { "title": "Sparse Instance Conditioned Multimodal Trajectory Prediction", @@ -54570,14 +56396,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dong_Sparse_Instance_Conditioned_Multimodal_Trajectory_Prediction_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Wormpex AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;Wormpex AI Research", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;AI Research", "aff_unique_url": "http://www.xjtu.edu.cn;", "aff_unique_abbr": "XJTU;Wormpex AI", 
"aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Yonghao and Wang,\n Le and Zhou,\n Sanping and Hua,\n Gang\n},\n title = {\n Sparse Instance Conditioned Multimodal Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9763-9772\n} \n}" }, { "title": "Sparse Point Guided 3D Lane Detection", @@ -54609,7 +56436,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2023_ICCV,\n \n author = {\n Yao,\n Chengtang and Yu,\n Lidong and Wu,\n Yuwei and Jia,\n Yunde\n},\n title = {\n Sparse Point Guided 3D Lane Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8363-8372\n} \n}" }, { "title": "Sparse Sampling Transformer with Uncertainty-Driven Ranking for Unified Removal of Raindrops and Rain Streaks", @@ -54633,15 +56461,16 @@ "email": "hkust-gz.edu.cn;hkust-gz.edu.cn;u.nus.edu;jmu.edu.cn;gmail.com;ust.hk", "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Sparse_Sampling_Transformer_with_Uncertainty-Driven_Ranking_for_Unified_Removal_of_ICCV_2023_paper.html", - "aff_unique_index": "0;0;1;2;3;0", - "aff_unique_norm": "Hong Kong University of Science and Technology;National University of Singapore;Jimei University;Xinjiang University", - "aff_unique_dep": ";;School of Ocean Information Engineering;", - "aff_unique_url": "https://www.ust.hk;https://www.nus.edu.sg;http://www.jimei.edu.cn;http://www.xju.edu.cn", - 
"aff_unique_abbr": "HKUST;NUS;;XJU", + "aff_unique_index": "0;0;1;2;3;4", + "aff_unique_norm": "The Hong Kong University of Science and Technology;National University of Singapore;Jimei University;Xinjiang University;Hong Kong University of Science and Technology", + "aff_unique_dep": ";;School of Ocean Information Engineering;;", + "aff_unique_url": "https://www.ust.hk;https://www.nus.edu.sg;http://www.jimei.edu.cn;http://www.xju.edu.cn;https://www.ust.hk", + "aff_unique_abbr": "HKUST;NUS;;XJU;HKUST", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Guangzhou;;Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Sixiang and Ye,\n Tian and Bai,\n Jinbin and Chen,\n Erkang and Shi,\n Jun and Zhu,\n Lei\n},\n title = {\n Sparse Sampling Transformer with Uncertainty-Driven Ranking for Unified Removal of Raindrops and Rain Streaks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13106-13117\n} \n}" }, { "title": "SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera Videos", @@ -54673,7 +56502,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Haisong and Teng,\n Yao and Lu,\n Tao and Wang,\n Haiguang and Wang,\n Limin\n},\n title = {\n SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18580-18590\n} \n}" }, { "title": "SparseDet: Improving Sparsely Annotated Object Detection with 
Pseudo-positive Mining", @@ -54705,7 +56535,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Suri_2023_ICCV,\n \n author = {\n Suri,\n Saksham and Rambhatla,\n Saketh and Chellappa,\n Rama and Shrivastava,\n Abhinav\n},\n title = {\n SparseDet: Improving Sparsely Annotated Object Detection with Pseudo-positive Mining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6770-6781\n} \n}" }, { "title": "SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection", @@ -54728,7 +56559,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xie_SparseFusion_Fusing_Multi-Modal_Sparse_Representations_for_Multi-Sensor_3D_Object_Detection_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xie_SparseFusion_Fusing_Multi-Modal_Sparse_Representations_for_Multi-Sensor_3D_Object_Detection_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xie_2023_ICCV,\n \n author = {\n Xie,\n Yichen and Xu,\n Chenfeng and Rakotosaona,\n Marie-Julie and Rim,\n Patrick and Tombari,\n Federico and Keutzer,\n Kurt and Tomizuka,\n Masayoshi and Zhan,\n Wei\n},\n title = {\n SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17591-17602\n} \n}" }, { "title": "SparseMAE: Sparse Training Meets Masked Autoencoders", @@ -54753,14 +56585,15 @@ "author_num": 9, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Zhou_SparseMAE_Sparse_Training_Meets_Masked_Autoencoders_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0+2;1;2;0", - "aff_unique_norm": "Chinese University of Hong Kong;SenseTime;Shanghai AI Lab", + "aff_unique_norm": "The Chinese University of Hong Kong;SenseTime;Shanghai AI Lab", "aff_unique_dep": ";SenseTime Research;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.sensetime.com;https://www.shanghaiailab.com", "aff_unique_abbr": "CUHK;SenseTime;SAIL", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Aojun and Li,\n Yang and Qin,\n Zipeng and Liu,\n Jianbo and Pan,\n Junting and Zhang,\n Renrui and Zhao,\n Rui and Gao,\n Peng and Li,\n Hongsheng\n},\n title = {\n SparseMAE: Sparse Training Meets Masked Autoencoders\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16176-16186\n} \n}" }, { "title": "SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis", @@ -54792,7 +56625,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Guangcong and Chen,\n Zhaoxi and Loy,\n Chen Change and Liu,\n Ziwei\n},\n title = {\n SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9065-9076\n} \n}" }, { "title": "Spatial Self-Distillation for Object Detection with Inaccurate Bounding Boxes", @@ 
-54824,7 +56658,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Di and Chen,\n Pengfei and Yu,\n Xuehui and Li,\n Guorong and Han,\n Zhenjun and Jiao,\n Jianbin\n},\n title = {\n Spatial Self-Distillation for Object Detection with Inaccurate Bounding Boxes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6855-6865\n} \n}" }, { "title": "Spatial-Aware Token for Weakly Supervised Object Localization", @@ -54856,7 +56691,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hefei", "aff_country_unique_index": "0;0+0;0+0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Pingyu and Zhai,\n Wei and Cao,\n Yang and Luo,\n Jiebo and Zha,\n Zheng-Jun\n},\n title = {\n Spatial-Aware Token for Weakly Supervised Object Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1844-1854\n} \n}" }, { "title": "Spatially and Spectrally Consistent Deep Functional Maps", @@ -54868,7 +56704,7 @@ "author": "Mingze Sun; Shiwei Mao; Puhua Jiang; Maks Ovsjanikov; Ruqi Huang", "abstract": "Cycle consistency has long been exploited as a powerful prior for jointly optimizing maps within a collection of shapes. In this paper, we investigate its utility in the approaches of Deep Functional Maps, which are considered state-of-the-art in non-rigid shape matching. We first justify that under certain conditions, the learned maps, when represented in the spectral domain, are already cycle consistent. 
Furthermore, we identify the discrepancy that spectrally consistent maps are not necessarily spatially, or point-wise, consistent. In light of this, we present a novel design of unsupervised Deep Functional Maps, which effectively enforces the harmony of learned maps under the spectral and the point-wise representation. By taking advantage of cycle consistency, our framework produces state-of-the-art results in mapping shapes even under significant distortions. Beyond that, by independently estimating maps in both spectral and spatial domains, our method naturally alleviates over-fitting in network training, yielding superior generalization performance and accuracy within an array of challenging tests for both near-isometric and non-isometric datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sun_Spatially_and_Spectrally_Consistent_Deep_Functional_Maps_ICCV_2023_paper.pdf", - "aff": "Tsinghua Shenzhen International Graduate School, China; Tsinghua Shenzhen International Graduate School, China; Tsinghua Shenzhen International Graduate School, China + Peng Cheng Laboratory, China; LIX, \u00b4Ecole polytechnique, IP Paris, France; Tsinghua Shenzhen International Graduate School, China", + "aff": "Tsinghua Shenzhen International Graduate School, China; Tsinghua Shenzhen International Graduate School, China; Tsinghua Shenzhen International Graduate School, China + Peng Cheng Laboratory, China; LIX, École polytechnique, IP Paris, France; Tsinghua Shenzhen International Graduate School, China", "project": "", "github": "https://github.com/rqhuang88/Spatially-and-Spectrally-Consistent-Deep-Functional-Maps", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Sun_Spatially_and_Spectrally_ICCV_2023_supplemental.pdf", @@ -54881,14 +56717,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_Spatially_and_Spectrally_Consistent_Deep_Functional_Maps_ICCV_2023_paper.html", "aff_unique_index": "0;0;0+1;2;0", 
- "aff_unique_norm": "Tsinghua University;Pengcheng Laboratory;Ecole Polytechnique", - "aff_unique_dep": "International Graduate School;Peng Cheng Laboratory;LIX", + "aff_unique_norm": "Tsinghua University;Peng Cheng Laboratory;Ecole polytechnique", + "aff_unique_dep": "International Graduate School;;LIX", "aff_unique_url": "https://www.tsinghua.edu.cn;;https://www.ecolepolytechnique.fr", "aff_unique_abbr": "THU;;X", - "aff_campus_unique_index": "0;0;0;2;0", - "aff_campus_unique": "Shenzhen;;Paris", + "aff_campus_unique_index": "0;0;0;0", + "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0+0;1;0", - "aff_country_unique": "China;France" + "aff_country_unique": "China;France", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Mingze and Mao,\n Shiwei and Jiang,\n Puhua and Ovsjanikov,\n Maks and Huang,\n Ruqi\n},\n title = {\n Spatially and Spectrally Consistent Deep Functional Maps\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14497-14507\n} \n}" }, { "title": "Spatially-Adaptive Feature Modulation for Efficient Image Super-Resolution", @@ -54911,7 +56748,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_Spatially-Adaptive_Feature_Modulation_for_Efficient_Image_Super-Resolution_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sun_Spatially-Adaptive_Feature_Modulation_for_Efficient_Image_Super-Resolution_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Long and Dong,\n Jiangxin and Tang,\n Jinhui and Pan,\n Jinshan\n},\n title = {\n Spatially-Adaptive Feature Modulation for Efficient Image Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages 
= {\n 13190-13199\n} \n}" }, { "title": "Spatio-Temporal Crop Aggregation for Video Representation Learning", @@ -54943,7 +56781,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Sameni_2023_ICCV,\n \n author = {\n Sameni,\n Sepehr and Jenni,\n Simon and Favaro,\n Paolo\n},\n title = {\n Spatio-Temporal Crop Aggregation for Video Representation Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5664-5674\n} \n}" }, { "title": "Spatio-Temporal Domain Awareness for Multi-Agent Collaborative Perception", @@ -54968,14 +56807,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Spatio-Temporal_Domain_Awareness_for_Multi-Agent_Collaborative_Perception_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;0;0;0;0;2;0", - "aff_unique_norm": "Fudan University;Meta;Duke Kunshan University", - "aff_unique_dep": "Academy for Engineering and Technology;Institute of Meta-Medical;", - "aff_unique_url": "https://www.fudan.edu.cn;;https://www.duk/Dk.edu", + "aff_unique_norm": "Fudan University;Institute of Meta-Medical;Duke Kunshan University", + "aff_unique_dep": "Academy for Engineering and Technology;;", + "aff_unique_url": "https://www.fudan.edu.cn;;https://www.dukekunshan.edu.cn", "aff_unique_abbr": "Fudan;;DKU", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Kunshan", "aff_country_unique_index": "0+1;0+1;0;0;0;0;0;0;0", - "aff_country_unique": "China;Unknown" + "aff_country_unique": "China;Unknown", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Kun and Yang,\n Dingkang and Zhang,\n Jingyu and Li,\n Mingcheng and Liu,\n Yang and Liu,\n Jing and Wang,\n Hanqi and Sun,\n Peng and Song,\n Liang\n},\n title = {\n 
Spatio-Temporal Domain Awareness for Multi-Agent Collaborative Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23383-23392\n} \n}" }, { "title": "Spatio-temporal Prompting Network for Robust Video Feature Extraction", @@ -54987,7 +56827,7 @@ "author": "Guanxiong Sun; Chi Wang; Zhaoyu Zhang; Jiankang Deng; Stefanos Zafeiriou; Yang Hua", "abstract": "The frame quality deterioration problem is one of the main challenges in the field of video understanding. To compensate for the information loss caused by deteriorated frames, recent approaches exploit transformer-based integration modules to obtain spatio-temporal information. However, these integration modules are heavy and complex. Furthermore, each integration module is specifically tailored for its target task, making it difficult to generalise to multiple tasks. In this paper, we present a neat and unified framework, called Spatio-Temporal Prompting Network (STPN). It can efficiently extract robust and accurate video features by dynamically adjusting the input features in the backbone network. Specifically, STPN predicts several video prompts containing spatio-temporal information of neighbour frames. Then, these video prompts are prepended to the patch embeddings of the current frame as the updated input for video feature extraction. Moreover, STPN is easy to generalise to various video tasks because it does not contain task-specific modules. Without bells and whistles, STPN achieves state-of-the-art performance on three widely-used datasets for different video understanding tasks, i.e., ImageNetVID for video object detection, YouTubeVIS for video instance segmentation, and GOT-10k for visual object tracking. 
Codes are available at https://github.com/guanxiongsun/STPN.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Sun_Spatio-temporal_Prompting_Network_for_Robust_Video_Feature_Extraction_ICCV_2023_paper.pdf", - "aff": "Queen\u2019s University Belfast+Huawei UKRD; Queen\u2019s University Belfast; Queen\u2019s University Belfast; Huawei UKRD+Imperial College London; Imperial College London; Queen\u2019s University Belfast", + "aff": "Queen’s University Belfast+Huawei UKRD; Queen’s University Belfast; Queen’s University Belfast; Huawei UKRD+Imperial College London; Imperial College London; Queen’s University Belfast", "project": "", "github": "https://github.com/guanxiongsun/STPN", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Sun_Spatio-temporal_Prompting_Network_ICCV_2023_supplemental.pdf", @@ -55007,7 +56847,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Guanxiong and Wang,\n Chi and Zhang,\n Zhaoyu and Deng,\n Jiankang and Zafeiriou,\n Stefanos and Hua,\n Yang\n},\n title = {\n Spatio-temporal Prompting Network for Robust Video Feature Extraction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13587-13597\n} \n}" }, { "title": "Spectral Graphormer: Spectral Graph-Based Transformer for Egocentric Two-Hand Reconstruction using Multi-View Color Images", @@ -55033,13 +56874,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tse_Spectral_Graphormer_Spectral_Graph-Based_Transformer_for_Egocentric_Two-Hand_Reconstruction_using_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;1;0;0", "aff_unique_norm": "Google;University of Birmingham", - "aff_unique_dep": "Google;", + "aff_unique_dep": 
";", "aff_unique_url": "https://www.google.com;https://www.birmingham.ac.uk", "aff_unique_abbr": "Google;Birmingham", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;1;0;0", - "aff_country_unique": "United States;United Kingdom" + "aff_country_unique": "United States;United Kingdom", + "bibtex": "@InProceedings{Tse_2023_ICCV,\n \n author = {\n Tse,\n Tze Ho Elden and Mueller,\n Franziska and Shen,\n Zhengyang and Tang,\n Danhang and Beeler,\n Thabo and Dou,\n Mingsong and Zhang,\n Yinda and Petrovic,\n Sasa and Chang,\n Hyung Jin and Taylor,\n Jonathan and Doosti,\n Bardia\n},\n title = {\n Spectral Graphormer: Spectral Graph-Based Transformer for Egocentric Two-Hand Reconstruction using Multi-View Color Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14666-14677\n} \n}" }, { "title": "Spectrum-guided Multi-granularity Referring Video Object Segmentation", @@ -55051,7 +56893,7 @@ "author": "Bo Miao; Mohammed Bennamoun; Yongsheng Gao; Ajmal Mian", "abstract": "Current referring video object segmentation (R-VOS) techniques extract conditional kernels from encoded (low-resolution) vision-language features to segment the decoded high-resolution features. We discovered that this causes significant feature drift, which the segmentation kernels struggle to perceive during the forward computation. This negatively affects the ability of segmentation kernels. To address the drift problem, we propose a Spectrum-guided Multi-granularity (SgMg) approach, which performs direct segmentation on the encoded features and employs visual details to further optimize the masks. In addition, we propose Spectrum-guided Cross-modal Fusion (SCF) to perform intra-frame global interactions in the spectral domain for effective multimodal representation. 
Finally, we extend SgMg to perform multi-object R-VOS, a new paradigm that enables simultaneous segmentation of multiple referred objects in a video. This not only makes R-VOS faster, but also more practical. Extensive experiments show that SgMg achieves state-of-the-art performance on four video benchmark datasets, outperforming the nearest competitor by 2.8% points on Ref-YouTube-VOS. Our extended SgMg enables multi-object R-VOS, runs about 3 times faster while maintaining satisfactory performance. Code is available at https://github.com/bo-miao/SgMg.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Miao_Spectrum-guided_Multi-granularity_Referring_Video_Object_Segmentation_ICCV_2023_paper.pdf", - "aff": "The University of Western Australia; The University of Western Australia; Grif\ufb01th University; The University of Western Australia", + "aff": "The University of Western Australia; The University of Western Australia; Griffith University; The University of Western Australia", "project": "", "github": "https://github.com/bo-miao/SgMg", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Miao_Spectrum-guided_Multi-granularity_Referring_ICCV_2023_supplemental.zip", @@ -55071,7 +56913,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": "@InProceedings{Miao_2023_ICCV,\n \n author = {\n Miao,\n Bo and Bennamoun,\n Mohammed and Gao,\n Yongsheng and Mian,\n Ajmal\n},\n title = {\n Spectrum-guided Multi-granularity Referring Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 920-930\n} \n}" }, { "title": "Speech2Lip: High-fidelity Speech to Lip Generation by Learning from a Short Video", @@ -55096,14 +56939,15 @@ "author_num": 9, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Speech2Lip_High-fidelity_Speech_to_Lip_Generation_by_Learning_from_a_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+2;0;2;2;1;2;0", - "aff_unique_norm": "University of Hong Kong;Tsinghua University;Tencent", + "aff_unique_norm": "The University of Hong Kong;Tsinghua University;Tencent", "aff_unique_dep": ";;ARC Lab", "aff_unique_url": "https://www.hku.hk;https://www.tsinghua.edu.cn;https://www.tencent.com", "aff_unique_abbr": "HKU;THU;Tencent", "aff_campus_unique_index": "0;;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Xiuzhe and Hu,\n Pengfei and Wu,\n Yang and Lyu,\n Xiaoyang and Cao,\n Yan-Pei and Shan,\n Ying and Yang,\n Wenming and Sun,\n Zhongqian and Qi,\n Xiaojuan\n},\n title = {\n Speech2Lip: High-fidelity Speech to Lip Generation by Learning from a Short Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22168-22177\n} \n}" }, { "title": "Speech4Mesh: Speech-Assisted Monocular 3D Facial Reconstruction for Speech-Driven 3D Facial Animation", @@ -55128,14 +56972,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/He_Speech4Mesh_Speech-Assisted_Monocular_3D_Facial_Reconstruction_for_Speech-Driven_3D_Facial_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;1;1;1;0;2", - "aff_unique_norm": "University of Science and Technology of China;iFLYTEK;University of Sydney", + "aff_unique_norm": "University of Science and Technology of China;IFLYTEK;University of Sydney", "aff_unique_dep": ";Research;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.iflytek.com;https://www.sydney.edu.au", "aff_unique_abbr": "USTC;IFLYTEK;USYD", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;1", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Shan and He,\n Haonan and Yang,\n Shuo and Wu,\n Xiaoyan and Xia,\n Pengcheng and Yin,\n Bing and Liu,\n Cong and Dai,\n Lirong and Xu,\n Chang\n},\n title = {\n Speech4Mesh: Speech-Assisted Monocular 3D Facial Reconstruction for Speech-Driven 3D Facial Animation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14192-14202\n} \n}" }, { "title": "Spherical Space Feature Decomposition for Guided Depth Map Super-Resolution", @@ -55147,7 +56992,7 @@ "author": "Zixiang Zhao; Jiangshe Zhang; Xiang Gu; Chengli Tan; Shuang Xu; Yulun Zhang; Radu Timofte; Luc Van Gool", "abstract": "Guided depth map super-resolution (GDSR), as a hot topic in multi-modal image processing, aims to upsample low-resolution (LR) depth maps with additional information involved in high-resolution (HR) RGB images from the same scene. The critical step of this task is to effectively extract domain-shared and domain-private RGB/depth features. In addition, three detailed issues, namely blurry edges, noisy surfaces, and over-transferred RGB texture, need to be addressed. In this paper, we propose the Spherical Space feature Decomposition Network (SSDNet) to solve the above issues. To better model cross-modality features, Restormer block-based RGB/depth encoders are employed for extracting local-global features. Then, the extracted features are mapped to the spherical space to complete the separation of private features and the alignment of shared features. Shared features of RGB are fused with the depth features to complete the GDSR task. Subsequently, a spherical contrast refinement (SCR) module is proposed to further address the detail issues. 
Patches that are classified according to imperfect categories are input into the SCR module, where the patch features are pulled closer to the ground truth and pushed away from the corresponding imperfect samples in the spherical feature space via contrastive learning. Extensive experiments demonstrate that our method can achieve state-of-the-art results on four test datasets, as well as successfully generalize to real-world scenes. The code is available at https://github.com/Zhaozixiang1228/GDSR-SSDNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhao_Spherical_Space_Feature_Decomposition_for_Guided_Depth_Map_Super-Resolution_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University+Computer Vision Lab, ETH Z\u00fcrich; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; Northwestern Polytechnical University; Computer Vision Lab, ETH Z\u00fcrich; Computer Vision Lab, ETH Z\u00fcrich+University of W\u00fcrzburg; Computer Vision Lab, ETH Z\u00fcrich", + "aff": "Xi’an Jiaotong University+Computer Vision Lab, ETH Zürich; Xi’an Jiaotong University; Xi’an Jiaotong University; Xi’an Jiaotong University; Northwestern Polytechnical University; Computer Vision Lab, ETH Zürich; Computer Vision Lab, ETH Zürich+University of Würzburg; Computer Vision Lab, ETH Zürich", "project": "", "github": "https://github.com/Zhaozixiang1228/GDSR-SSDNet", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhao_Spherical_Space_Feature_Decomposition_for_Guided_Depth_Map_Super-Resolution_ICCV_2023_supplemental.pdf", @@ -55160,14 +57005,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Spherical_Space_Feature_Decomposition_for_Guided_Depth_Map_Super-Resolution_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;2;1;1+3;1", - "aff_unique_norm": "Xi'an Jiao Tong University;ETH Zurich;Northwestern Polytechnical University;University of W\u00fcrzburg", + "aff_unique_norm": 
"Xi'an Jiaotong University;ETH Zürich;Northwestern Polytechnical University;University of Würzburg", "aff_unique_dep": ";Computer Vision Lab;;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.ethz.ch;https://www.nwpu.edu.cn;https://www.uni-wuerzburg.de", "aff_unique_abbr": "XJTU;ETHZ;NWPU;UWue", "aff_campus_unique_index": "1;1;1;1", - "aff_campus_unique": ";Z\u00fcrich", + "aff_campus_unique": ";Zürich", "aff_country_unique_index": "0+1;0;0;0;0;1;1+2;1", - "aff_country_unique": "China;Switzerland;Germany" + "aff_country_unique": "China;Switzerland;Germany", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Zixiang and Zhang,\n Jiangshe and Gu,\n Xiang and Tan,\n Chengli and Xu,\n Shuang and Zhang,\n Yulun and Timofte,\n Radu and Van Gool,\n Luc\n},\n title = {\n Spherical Space Feature Decomposition for Guided Depth Map Super-Resolution\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12547-12558\n} \n}" }, { "title": "SpinCam: High-Speed Imaging via a Rotating Point-Spread Function", @@ -55199,7 +57045,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chan_2023_ICCV,\n \n author = {\n Chan,\n Dorian and Sheinin,\n Mark and O'Toole,\n Matthew\n},\n title = {\n SpinCam: High-Speed Imaging via a Rotating Point-Spread Function\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10789-10799\n} \n}" }, { "title": "SportsMOT: A Large Multi-Object Tracking Dataset in Multiple Sports Scenes", @@ -55231,7 +57078,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Cui_2023_ICCV,\n \n author = {\n Cui,\n Yutao and Zeng,\n Chenkai and Zhao,\n Xiaoyu and Yang,\n Yichun and Wu,\n Gangshan and Wang,\n Limin\n},\n title = {\n SportsMOT: A Large Multi-Object Tracking Dataset in Multiple Sports Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9921-9931\n} \n}" }, { "title": "Spurious Features Everywhere - Large-Scale Detection of Harmful Spurious Features in ImageNet", @@ -55243,7 +57091,7 @@ "author": "Yannic Neuhaus; Maximilian Augustin; Valentyn Boreiko; Matthias Hein", "abstract": "Benchmark performance of deep learning classifiers alone is not a reliable predictor for the performance of a deployed model. In particular, if the image classifier has picked up spurious features in the training data, its predictions can fail in unexpected ways. In this paper, we develop a framework that allows us to systematically identify spurious features in large datasets like ImageNet. It is based on our neural PCA components and their visualization. Previous work on spurious features often operates in toy settings or requires costly pixel-wise annotations. In contrast, we work with ImageNet and validate our results by showing that presence of the harmful spurious feature of a class alone is sufficient to trigger the prediction of that class. We introduce the novel dataset \"Spurious ImageNet\" which allows to measure the reliance of any ImageNet classifier on harmful spurious features. Moreover, we introduce SpuFix as a simple mitigation method to reduce the dependence of any ImageNet classifier on previously identified harmful spurious features without requiring additional labels or retraining of the model. 
We provide code and data at https://github.com/YanNeu/spurious_imagenet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Neuhaus_Spurious_Features_Everywhere_-_Large-Scale_Detection_of_Harmful_Spurious_Features_ICCV_2023_paper.pdf", - "aff": "T\u00a8ubingen AI Center \u2013 University of T \u00a8ubingen; T\u00a8ubingen AI Center \u2013 University of T \u00a8ubingen; T\u00a8ubingen AI Center \u2013 University of T \u00a8ubingen; T\u00a8ubingen AI Center \u2013 University of T \u00a8ubingen", + "aff": "Tübingen AI Center – University of Tübingen; Tübingen AI Center – University of Tübingen; Tübingen AI Center – University of Tübingen; Tübingen AI Center – University of Tübingen", "project": "", "github": "https://github.com/YanNeu/spurious_imagenet", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Neuhaus_Spurious_Features_Everywhere_ICCV_2023_supplemental.pdf", @@ -55256,14 +57104,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Neuhaus_Spurious_Features_Everywhere_-_Large-Scale_Detection_of_Harmful_Spurious_Features_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "University of T\u00fcbingen", - "aff_unique_dep": "T\u00fcbingen AI Center", + "aff_unique_norm": "University of Tübingen", + "aff_unique_dep": "Tübingen AI Center", "aff_unique_url": "https://www.uni-tuebingen.de/", - "aff_unique_abbr": "Uni T\u00fcbingen", + "aff_unique_abbr": "Uni Tübingen", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "T\u00fcbingen", + "aff_campus_unique": "Tübingen", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Neuhaus_2023_ICCV,\n \n author = {\n Neuhaus,\n Yannic and Augustin,\n Maximilian and Boreiko,\n Valentyn and Hein,\n Matthias\n},\n title = {\n Spurious Features Everywhere - Large-Scale Detection of Harmful Spurious Features in ImageNet\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20235-20246\n} \n}" }, { "title": "Stabilizing Visual Reinforcement Learning via Asymmetric Interactive Cooperation", @@ -55288,14 +57137,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhai_Stabilizing_Visual_Reinforcement_Learning_via_Asymmetric_Interactive_Cooperation_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;0;0+0+1", - "aff_unique_norm": "Peking University;Pengcheng Laboratory", - "aff_unique_dep": "School of Computer Science;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory", + "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": "0;1+2;0;0;0+1+2", "aff_campus_unique": "Beijing;Shenzhen Graduate School;Shenzhen", "aff_country_unique_index": "0;0+0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Yunpeng and Peng,\n Peixi and Zhao,\n Yifan and Huang,\n Yangru and Tian,\n Yonghong\n},\n title = {\n Stabilizing Visual Reinforcement Learning via Asymmetric Interactive Cooperation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 207-216\n} \n}" }, { "title": "Stable Cluster Discrimination for Deep Clustering", @@ -55303,6 +57153,7 @@ "status": "Poster", "track": "main", "pid": "11010", + "author_site": "Qi Qian", "author": "Qi Qian", "abstract": "Deep clustering can optimize representations of instances (i.e., representation learning) and explore the inherent data distribution (i.e., clustering) simultaneously, which demonstrates a superior performance over conventional clustering methods with given features. 
However, the coupled objective implies a trivial solution that all instances collapse to the uniform features. To tackle the challenge, a two-stage training strategy is developed for decoupling, where it introduces an additional pre-training stage for representation learning and then fine-tunes the obtained model for clustering. Meanwhile, one-stage methods are developed mainly for representation learning rather than clustering, where various constraints for cluster assignments are designed to avoid collapsing explicitly. Despite the success of these methods, an appropriate learning objective tailored for deep clustering has not been investigated sufficiently. In this work, we first show that the prevalent discrimination task in supervised learning is unstable for one-stage clustering due to the lack of ground-truth labels and positive instances for certain clusters in each mini-batch. To mitigate the issue, a novel stable cluster discrimination (SeCu) task is proposed and a new hardness-aware clustering criterion can be obtained accordingly. Moreover, a global entropy constraint for cluster assignments is studied with efficient optimization. Extensive experiments are conducted on benchmark data sets and ImageNet. 
SeCu achieves state-of-the-art performance on all of them, which demonstrates the effectiveness of one-stage deep clustering.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Qian_Stable_Cluster_Discrimination_for_Deep_Clustering_ICCV_2023_paper.pdf", @@ -55326,7 +57177,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Bellevue", "aff_country_unique_index": "0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Qian_2023_ICCV,\n \n author = {\n Qian,\n Qi\n},\n title = {\n Stable Cluster Discrimination for Deep Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16645-16654\n} \n}" }, { "title": "Stable and Causal Inference for Discriminative Self-supervised Deep Visual Representations", @@ -55358,7 +57210,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yuewei and Li,\n Hai and Chen,\n Yiran\n},\n title = {\n Stable and Causal Inference for Discriminative Self-supervised Deep Visual Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16109-16120\n} \n}" }, { "title": "StableVideo: Text-driven Consistency-aware Diffusion Video Editing", @@ -55383,14 +57236,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chai_StableVideo_Text-driven_Consistency-aware_Diffusion_Video_Editing_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;1", - "aff_unique_norm": "Zhejiang University;Microsoft", + "aff_unique_norm": "Zhejiang University;Microsoft Research", "aff_unique_dep": ";Research", "aff_unique_url": 
"https://www.zju.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "ZJU;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chai_2023_ICCV,\n \n author = {\n Chai,\n Wenhao and Guo,\n Xun and Wang,\n Gaoang and Lu,\n Yan\n},\n title = {\n StableVideo: Text-driven Consistency-aware Diffusion Video Editing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23040-23050\n} \n}" }, { "title": "StageInteractor: Query-based Object Detector with Cross-stage Interaction", @@ -55422,7 +57276,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Teng_2023_ICCV,\n \n author = {\n Teng,\n Yao and Liu,\n Haisong and Guo,\n Sheng and Wang,\n Limin\n},\n title = {\n StageInteractor: Query-based Object Detector with Cross-stage Interaction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6577-6588\n} \n}" }, { "title": "Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis", @@ -55454,7 +57309,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Nair_2023_ICCV,\n \n author = {\n Nair,\n Nithin Gopalakrishnan and Cherian,\n Anoop and Lohit,\n Suhas and Wang,\n Ye and Koike-Akino,\n Toshiaki and Patel,\n Vishal M. 
and Marks,\n Tim K.\n},\n title = {\n Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20850-20860\n} \n}" }, { "title": "StegaNeRF: Embedding Invisible Information within Neural Radiance Fields", @@ -55486,7 +57342,8 @@ "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Hong Kong SAR;;Austin", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Chenxin and Feng,\n Brandon Y. and Fan,\n Zhiwen and Pan,\n Panwang and Wang,\n Zhangyang\n},\n title = {\n StegaNeRF: Embedding Invisible Information within Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 441-453\n} \n}" }, { "title": "Stochastic Segmentation with Conditional Categorical Diffusion Models", @@ -55494,8 +57351,8 @@ "status": "Poster", "track": "main", "pid": "10145", - "author_site": "Lukas Zbinden, Lars Doorenbos, Theodoros Pissas, Adrian Thomas Huber, Raphael Sznitman, Pablo M\u00e1rquez-Neila", - "author": "Lukas Zbinden; Lars Doorenbos; Theodoros Pissas; Adrian Thomas Huber; Raphael Sznitman; Pablo M\u00e1rquez-Neila", + "author_site": "Lukas Zbinden, Lars Doorenbos, Theodoros Pissas, Adrian Thomas Huber, Raphael Sznitman, Pablo Márquez-Neila", + "author": "Lukas Zbinden; Lars Doorenbos; Theodoros Pissas; Adrian Thomas Huber; Raphael Sznitman; Pablo Márquez-Neila", "abstract": "Semantic segmentation has made significant progress in recent years thanks to deep neural networks, but the common objective of generating a single segmentation output that accurately matches the image's content may not be suitable for 
safety-critical domains such as medical diagnostics and autonomous driving. Instead, multiple possible correct segmentation maps may be required to reflect the true distribution of annotation maps. In this context, stochastic semantic segmentation methods must learn to predict conditional distributions of labels given the image, but this is challenging due to the typically multimodal distributions, high-dimensional output spaces, and limited annotation data. To address these challenges, we propose a conditional categorical diffusion model (CCDM) for semantic segmentation based on Denoising Diffusion Probabilistic Models. Our model is conditioned to the input image, enabling it to generate multiple segmentation label maps that account for the aleatoric uncertainty arising from divergent ground truth annotations. Our experimental results show that CCDM achieves state-of-the-art performance on LIDC, a stochastic semantic segmentation dataset, and outperforms established baselines on the classical segmentation dataset Cityscapes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zbinden_Stochastic_Segmentation_with_Conditional_Categorical_Diffusion_Models_ICCV_2023_paper.pdf", "aff": "University of Bern, Bern, Switzerland; University of Bern, Bern, Switzerland; University of Bern, Bern, Switzerland; University of Bern, Bern, Switzerland; University of Bern, Bern, Switzerland; University of Bern, Bern, Switzerland", @@ -55518,7 +57375,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Bern", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Zbinden_2023_ICCV,\n \n author = {\n Zbinden,\n Lukas and Doorenbos,\n Lars and Pissas,\n Theodoros and Huber,\n Adrian Thomas and Sznitman,\n Raphael and M\\'arquez-Neila,\n Pablo\n},\n title = {\n Stochastic Segmentation with Conditional Categorical Diffusion Models\n},\n booktitle = {\n Proceedings of 
the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1119-1129\n} \n}" }, { "title": "Story Visualization by Online Text Augmentation with Context Memory", @@ -55543,14 +57401,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ahn_Story_Visualization_by_Online_Text_Augmentation_with_Context_Memory_ICCV_2023_paper.html", "aff_unique_index": "0;1;2+3;2;3;4;0", - "aff_unique_norm": "Yonsei University;Gwangju Institute of Science and Technology;LG;University of Michigan;University of Minnesota", - "aff_unique_dep": ";;LG AI Research;;", + "aff_unique_norm": "Yonsei University;Gwangju Institute of Science and Technology;LG AI Research;University of Michigan;University of Minnesota", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.gist.ac.kr;https://www.lgaires.com;https://www.umich.edu;https://www.minnesota.edu", "aff_unique_abbr": "Yonsei;GIST;LG AI;UM;UMN", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Gwangju", "aff_country_unique_index": "0;0;0+1;0;1;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Ahn_2023_ICCV,\n \n author = {\n Ahn,\n Daechul and Kim,\n Daneul and Song,\n Gwangmo and Kim,\n Seung Hwan and Lee,\n Honglak and Kang,\n Dongyeop and Choi,\n Jonghyun\n},\n title = {\n Story Visualization by Online Text Augmentation with Context Memory\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3125-3135\n} \n}" }, { "title": "Strata-NeRF : Neural Radiance Fields for Stratified Scenes", @@ -55575,14 +57434,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dhiman_Strata-NeRF__Neural_Radiance_Fields_for_Stratified_Scenes_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;2;0", 
- "aff_unique_norm": "Indian Institute of Science;Samsung;Brown University", - "aff_unique_dep": "Vision and AI Lab;Samsung R & D Institute India;", + "aff_unique_norm": "Indian Institute of Science;Samsung R & D Institute India;Brown University", + "aff_unique_dep": "Vision and AI Lab;;", "aff_unique_url": "https://www.iisc.ac.in;https://www.samsung.com/in/;https://www.brown.edu", "aff_unique_abbr": "IISc;SRII;Brown", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Bangalore;", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "India;United States" + "aff_country_unique": "India;United States", + "bibtex": "@InProceedings{Dhiman_2023_ICCV,\n \n author = {\n Dhiman,\n Ankit and Srinath,\n R and Rangwani,\n Harsh and Parihar,\n Rishubh and Boregowda,\n Lokesh R and Sridhar,\n Srinath and Babu,\n R Venkatesh\n},\n title = {\n Strata-NeRF : Neural Radiance Fields for Stratified Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17603-17614\n} \n}" }, { "title": "Strip-MLP: Efficient Token Interaction for Vision MLP", @@ -55607,14 +57467,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cao_Strip-MLP_Efficient_Token_Interaction_for_Vision_MLP_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1;1;1;0+1", - "aff_unique_norm": "Southern University of Science and Technology;Pengcheng Laboratory", - "aff_unique_dep": ";Peng Cheng Laboratory", + "aff_unique_norm": "Southern University of Science and Technology;Peng Cheng Laboratory", + "aff_unique_dep": ";", "aff_unique_url": "https://www.sustech.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "SUSTech;PCL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n 
Cao,\n Guiping and Luo,\n Shengda and Huang,\n Wenjian and Lan,\n Xiangyuan and Jiang,\n Dongmei and Wang,\n Yaowei and Zhang,\n Jianguo\n},\n title = {\n Strip-MLP: Efficient Token Interaction for Vision MLP\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1494-1504\n} \n}" }, { "title": "Strivec: Sparse Tri-Vector Radiance Fields", @@ -55646,7 +57507,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Los Angeles;San Diego;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Quankai and Xu,\n Qiangeng and Su,\n Hao and Neumann,\n Ulrich and Xu,\n Zexiang\n},\n title = {\n Strivec: Sparse Tri-Vector Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17569-17579\n} \n}" }, { "title": "Structural Alignment for Network Pruning through Partial Regularization", @@ -55678,7 +57540,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Shangqian and Zhang,\n Zeyu and Zhang,\n Yanfu and Huang,\n Feihu and Huang,\n Heng\n},\n title = {\n Structural Alignment for Network Pruning through Partial Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17402-17412\n} \n}" }, { "title": "Structure Invariant Transformation for better Adversarial Transferability", @@ -55710,7 +57573,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong 
SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Xiaosen and Zhang,\n Zeliang and Zhang,\n Jianping\n},\n title = {\n Structure Invariant Transformation for better Adversarial Transferability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4607-4619\n} \n}" }, { "title": "Structure and Content-Guided Video Synthesis with Diffusion Models", @@ -55742,7 +57606,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Esser_2023_ICCV,\n \n author = {\n Esser,\n Patrick and Chiu,\n Johnathan and Atighehchian,\n Parmida and Granskog,\n Jonathan and Germanidis,\n Anastasis\n},\n title = {\n Structure and Content-Guided Video Synthesis with Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7346-7356\n} \n}" }, { "title": "Structure-Aware Surface Reconstruction via Primitive Assembly", @@ -55767,14 +57632,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jiang_Structure-Aware_Surface_Reconstruction_via_Primitive_Assembly_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0+3;4;5;0+1;1+2;0+1", - "aff_unique_norm": "Chinese Academy of Sciences Institute of Automation;Chinese Academy of Sciences;University and Colleges Admissions Service;Beijing Academy of Artificial Intelligence;Shandong University;University of Hong Kong", + "aff_unique_norm": "Chinese Academy of Sciences Institute of Automation;Chinese Academy of Sciences;University and Colleges Admissions Service;Beijing Academy of Artificial Intelligence;Shandong 
University;The University of Hong Kong", "aff_unique_dep": "MAIS & NLPR;Kavli Institute for Mathematics and Mechanics, Academy of Mathematics and Systems Science;;;;", "aff_unique_url": "http://www.ia.cas.cn;http://www.cas.cn;https://www.ucas.com;https://www.baaic.cn;http://www.sdu.edu.cn;https://www.hku.hk", "aff_unique_abbr": "CASIA;CAS;UCAS;BAAI;SDU;HKU", "aff_campus_unique_index": ";;1;;;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0+1;0+0;0;0;0+0;0+1;0+0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Jingen and Zhao,\n Mingyang and Xin,\n Shiqing and Yang,\n Yanchao and Wang,\n Hanxiao and Jia,\n Xiaohong and Yan,\n Dong-Ming\n},\n title = {\n Structure-Aware Surface Reconstruction via Primitive Assembly\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14171-14180\n} \n}" }, { "title": "Studying How to Efficiently and Effectively Guide Models with Explanations", @@ -55782,11 +57648,11 @@ "status": "Poster", "track": "main", "pid": "10062", - "author_site": "Sukrut Rao, Moritz B\u00f6hle, Amin Parchami-Araghi, Bernt Schiele", - "author": "Sukrut Rao; Moritz B\u00f6hle; Amin Parchami-Araghi; Bernt Schiele", + "author_site": "Sukrut Rao, Moritz Böhle, Amin Parchami-Araghi, Bernt Schiele", + "author": "Sukrut Rao; Moritz Böhle; Amin Parchami-Araghi; Bernt Schiele", "abstract": "Despite being highly performant, deep neural networks might base their decisions on features that spuriously correlate with the provided labels, thus hurting generalization. To mitigate this, 'model guidance' has recently gained popularity, i.e. the idea of regularizing the models' explanations to ensure that they are \"right for the right reasons\". 
While various techniques to achieve such model guidance have been proposed, experimental validation of these approaches has thus far been limited to relatively simple and / or synthetic datasets. To better understand the effectiveness of the various design choices that have been explored in the context of model guidance, in this work we conduct an in-depth evaluation across various loss functions, attribution methods, models, and 'guidance depths' on the PASCAL VOC 2007 and MS COCO 2014 datasets. As annotation costs for model guidance can limit its applicability, we also place a particular focus on efficiency. Specifically, we guide the models via bounding box annotations, which are much cheaper to obtain than the commonly used segmentation masks, and evaluate the robustness of model guidance under limited (e.g. with only 1% of annotated images) or overly coarse annotations. Further, we propose using the EPG score as an additional evaluation metric and loss function ('Energy loss'). We show that optimizing for the Energy loss leads to models that exhibit a distinct focus on object-specific features, despite only using bounding box annotations that also include background regions. Lastly, we show that such model guidance can improve generalization under distribution shifts. 
Code available at: https://github.com/sukrutrao/Model-Guidance", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Rao_Studying_How_to_Efficiently_and_Effectively_Guide_Models_with_Explanations_ICCV_2023_paper.pdf", - "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbr\u00fccken, Germany", + "aff": "Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany; Max Planck Institute for Informatics, Saarland Informatics Campus, Saarbrücken, Germany", "project": "", "github": "https://github.com/sukrutrao/Model-Guidance", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Rao_Studying_How_to_ICCV_2023_supplemental.pdf", @@ -55804,9 +57670,10 @@ "aff_unique_url": "https://mpi-inf.mpg.de", "aff_unique_abbr": "MPII", "aff_campus_unique_index": "0;0;0;0", - "aff_campus_unique": "Saarbr\u00fccken", + "aff_campus_unique": "Saarbrücken", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Rao_2023_ICCV,\n \n author = {\n Rao,\n Sukrut and B\\\"ohle,\n Moritz and Parchami-Araghi,\n Amin and Schiele,\n Bernt\n},\n title = {\n Studying How to Efficiently and Effectively Guide Models with Explanations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1922-1933\n} \n}" }, { "title": "StyleDiffusion: Controllable Disentangled Style Transfer via 
Diffusion Models", @@ -55838,7 +57705,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Zhizhong and Zhao,\n Lei and Xing,\n Wei\n},\n title = {\n StyleDiffusion: Controllable Disentangled Style Transfer via Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7677-7689\n} \n}" }, { "title": "StyleDomain: Efficient and Lightweight Parameterizations of StyleGAN for One-shot and Few-shot Domain Adaptation", @@ -55870,7 +57738,8 @@ "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Moscow", "aff_country_unique_index": "0+1;1;1+0;0+1", - "aff_country_unique": "Russian Federation;Japan" + "aff_country_unique": "Russia;Japan", + "bibtex": "@InProceedings{Alanov_2023_ICCV,\n \n author = {\n Alanov,\n Aibek and Titov,\n Vadim and Nakhodnov,\n Maksim and Vetrov,\n Dmitry\n},\n title = {\n StyleDomain: Efficient and Lightweight Parameterizations of StyleGAN for One-shot and Few-shot Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2184-2194\n} \n}" }, { "title": "StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces", @@ -55902,7 +57771,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Shuai and Jiang,\n Liming and Liu,\n Ziwei and Loy,\n Chen Change\n},\n title = {\n StyleGANEX: StyleGAN-Based Manipulation Beyond Cropped Aligned Faces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21000-21010\n} \n}" }, { "title": "StyleInV: A Temporal Style Modulated Inversion Network for Unconditional Video Generation", @@ -55934,7 +57804,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yuhan and Jiang,\n Liming and Loy,\n Chen Change\n},\n title = {\n StyleInV: A Temporal Style Modulated Inversion Network for Unconditional Video Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22851-22861\n} \n}" }, { "title": "StyleLipSync: Style-based Personalized Lip-sync Video Generation", @@ -55966,7 +57837,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Ki_2023_ICCV,\n \n author = {\n Ki,\n Taekyung and Min,\n Dongchan\n},\n title = {\n StyleLipSync: Style-based Personalized Lip-sync Video Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22841-22850\n} \n}" }, { "title": "StylerDALLE: Language-Guided Style Transfer Using a Vector-Quantized Tokenizer of a Large-Scale Generative Model", @@ -55998,7 +57870,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Zipeng and Sangineto,\n Enver and Sebe,\n Nicu\n},\n title = {\n StylerDALLE: Language-Guided Style Transfer Using a Vector-Quantized Tokenizer of a Large-Scale Generative 
Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7601-7611\n} \n}" }, { "title": "SuS-X: Training-Free Name-Only Transfer of Vision-Language Models", @@ -56030,7 +57903,8 @@ "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Cambridge;London", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Udandarao_2023_ICCV,\n \n author = {\n Udandarao,\n Vishaal and Gupta,\n Ankush and Albanie,\n Samuel\n},\n title = {\n SuS-X: Training-Free Name-Only Transfer of Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2725-2736\n} \n}" }, { "title": "Subclass-balancing Contrastive Learning for Long-tailed Recognition", @@ -56062,7 +57936,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1", - "aff_country_unique": "China;United States;Singapore" + "aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Hou_2023_ICCV,\n \n author = {\n Hou,\n Chengkai and Zhang,\n Jieyu and Wang,\n Haonan and Zhou,\n Tianyi\n},\n title = {\n Subclass-balancing Contrastive Learning for Long-tailed Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5395-5407\n} \n}" }, { "title": "SupFusion: Supervised LiDAR-Camera Fusion for 3D Object Detection", @@ -56087,14 +57962,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qin_SupFusion_Supervised_LiDAR-Camera_Fusion_for_3D_Object_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;0;2;2;0;0", - "aff_unique_norm": "Chinese University of Hong Kong, Shenzhen;;NIO2", 
+ "aff_unique_norm": "The Chinese University of Hong Kong, Shenzhen;;NIO2", "aff_unique_dep": "School of Data Science;;", "aff_unique_url": "https://www.cuhk.edu.cn/en/shenzhen;;", "aff_unique_abbr": "CUHK-Shenzhen;;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Qin_2023_ICCV,\n \n author = {\n Qin,\n Yiran and Wang,\n Chaoqun and Kang,\n Zijian and Ma,\n Ningning and Li,\n Zhen and Zhang,\n Ruimao\n},\n title = {\n SupFusion: Supervised LiDAR-Camera Fusion for 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22014-22024\n} \n}" }, { "title": "Supervised Homography Learning with Realistic Dataset Generation", @@ -56126,7 +58002,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Hai and Li,\n Haipeng and Han,\n Songchen and Fan,\n Haoqiang and Zeng,\n Bing and Liu,\n Shuaicheng\n},\n title = {\n Supervised Homography Learning with Realistic Dataset Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9806-9815\n} \n}" }, { "title": "Surface Extraction from Neural Unsigned Distance Fields", @@ -56151,14 +58028,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Surface_Extraction_from_Neural_Unsigned_Distance_Fields_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;0;2;2;2", - "aff_unique_norm": "University of Hong Kong;TransGP;Texas A&M University", + "aff_unique_norm": "The University of Hong Kong;TransGP;Texas 
A&M University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hku.hk;;https://www.tamu.edu", "aff_unique_abbr": "HKU;;TAMU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;2;0;2;2;2", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Congyi and Lin,\n Guying and Yang,\n Lei and Li,\n Xin and Komura,\n Taku and Schaefer,\n Scott and Keyser,\n John and Wang,\n Wenping\n},\n title = {\n Surface Extraction from Neural Unsigned Distance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22531-22540\n} \n}" }, { "title": "Surface Normal Clustering for Implicit Representation of Manhattan Scenes", @@ -56170,7 +58048,7 @@ "author": "Nikola Popovic; Danda Pani Paudel; Luc Van Gool", "abstract": "Novel view synthesis and 3D modeling using implicit neural field representation are shown to be very effective for calibrated multi-view cameras. Such representations are known to benefit from additional geometric and semantic supervision. Most existing methods that exploit additional supervision require dense pixel-wise labels or localized scene priors. These methods cannot benefit from high-level vague scene priors provided in terms of scenes' descriptions. In this work, we aim to leverage the geometric prior of Manhattan scenes to improve the implicit neural radiance field representations. More precisely, we assume that only the knowledge of the indoor scene (under investigation) being Manhattan is known -- with no additional information whatsoever -- with an unknown Manhattan coordinate frame. Such high-level prior is used to self-supervise the surface normals derived explicitly in the implicit neural fields. 
Our modeling allows us to cluster the derived normals and exploit their orthogonality constraints for self-supervision. Our exhaustive experiments on datasets of diverse indoor scenes demonstrate the significant benefit of the proposed method over the established baselines. The source code will be available at https://github.com/nikola3794/normal-clustering-nerf.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Popovic_Surface_Normal_Clustering_for_Implicit_Representation_of_Manhattan_Scenes_ICCV_2023_paper.pdf", - "aff": "Computer Vision Laboratory, ETH Zurich, Switzerland + INSAIT, So\ufb01a University, Bulgaria; Computer Vision Laboratory, ETH Zurich, Switzerland + INSAIT, So\ufb01a University, Bulgaria; Computer Vision Laboratory, ETH Zurich, Switzerland + INSAIT, So\ufb01a University, Bulgaria", + "aff": "Computer Vision Laboratory, ETH Zurich, Switzerland + INSAIT, Sofia University, Bulgaria; Computer Vision Laboratory, ETH Zurich, Switzerland + INSAIT, Sofia University, Bulgaria; Computer Vision Laboratory, ETH Zurich, Switzerland + INSAIT, Sofia University, Bulgaria", "project": "", "github": "https://github.com/nikola3794/normal-clustering-nerf", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Popovic_Surface_Normal_Clustering_ICCV_2023_supplemental.pdf", @@ -56183,14 +58061,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Popovic_Surface_Normal_Clustering_for_Implicit_Representation_of_Manhattan_Scenes_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1", - "aff_unique_norm": "ETH Zurich;So\ufb01a University", + "aff_unique_norm": "ETH Zurich;Sofia University", "aff_unique_dep": "Computer Vision Laboratory;INSAIT", "aff_unique_url": "https://www.ethz.ch;https://www.uni-sofia.bg", "aff_unique_abbr": "ETHZ;", - "aff_campus_unique_index": ";;", - "aff_campus_unique": "", + "aff_campus_unique_index": "0;0;0", + "aff_campus_unique": "Zurich;", "aff_country_unique_index": "0+1;0+1;0+1", - 
"aff_country_unique": "Switzerland;Bulgaria" + "aff_country_unique": "Switzerland;Bulgaria", + "bibtex": "@InProceedings{Popovic_2023_ICCV,\n \n author = {\n Popovic,\n Nikola and Paudel,\n Danda Pani and Van Gool,\n Luc\n},\n title = {\n Surface Normal Clustering for Implicit Representation of Manhattan Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17860-17870\n} \n}" }, { "title": "SurfsUP: Learning Fluid Simulation for Novel Surfaces", @@ -56222,7 +58101,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1+1;0;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Mani_2023_ICCV,\n \n author = {\n Mani,\n Arjun and Chandratreya,\n Ishaan Preetam and Creager,\n Elliot and Vondrick,\n Carl and Zemel,\n Richard\n},\n title = {\n SurfsUP: Learning Fluid Simulation for Novel Surfaces\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14225-14235\n} \n}" }, { "title": "SurroundOcc: Multi-camera 3D Occupancy Prediction for Autonomous Driving", @@ -56245,7 +58125,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_SurroundOcc_Multi-camera_3D_Occupancy_Prediction_for_Autonomous_Driving_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_SurroundOcc_Multi-camera_3D_Occupancy_Prediction_for_Autonomous_Driving_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Yi and Zhao,\n Linqing and Zheng,\n Wenzhao and Zhu,\n Zheng and Zhou,\n Jie and Lu,\n Jiwen\n},\n title = {\n SurroundOcc: Multi-camera 3D Occupancy Prediction for Autonomous Driving\n},\n booktitle = {\n Proceedings 
of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21729-21740\n} \n}" }, { "title": "SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications", @@ -56270,14 +58151,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shaker_SwiftFormer_Efficient_Additive_Attention_for_Transformer-based_Real-time_Mobile_Vision_Applications_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1+2+3;4", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;University of California, Merced;Yonsei University;Google;Link\u00f6ping University", + "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;University of California, Merced;Yonsei University;Google;Linköping University", "aff_unique_dep": ";;;Google Research;", "aff_unique_url": "https://mbzuai.ac.ae;https://www.ucmerced.edu;https://www.yonsei.ac.kr;https://research.google;https://www.liu.se", "aff_unique_abbr": "MBZUAI;UC Merced;Yonsei;Google Research;LiU", "aff_campus_unique_index": "1+2", "aff_campus_unique": ";Merced;Mountain View", "aff_country_unique_index": "0;0;0;0;1+2+1;3", - "aff_country_unique": "United Arab Emirates;United States;South Korea;Sweden" + "aff_country_unique": "United Arab Emirates;United States;South Korea;Sweden", + "bibtex": "@InProceedings{Shaker_2023_ICCV,\n \n author = {\n Shaker,\n Abdelrahman and Maaz,\n Muhammad and Rasheed,\n Hanoona and Khan,\n Salman and Yang,\n Ming-Hsuan and Khan,\n Fahad Shahbaz\n},\n title = {\n SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17425-17436\n} \n}" }, { "title": "SwinLSTM: Improving Spatiotemporal Prediction Accuracy using Swin Transformer and 
LSTM", @@ -56309,7 +58191,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Song and Li,\n Chuang and Zhang,\n Pu and Tang,\n RongNian\n},\n title = {\n SwinLSTM: Improving Spatiotemporal Prediction Accuracy using Swin Transformer and LSTM\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13470-13479\n} \n}" }, { "title": "SynBody: Synthetic Dataset with Layered Human Models for 3D Human Perception and Modeling", @@ -56334,14 +58217,15 @@ "author_num": 15, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_SynBody_Synthetic_Dataset_with_Layered_Human_Models_for_3D_Human_ICCV_2023_paper.html", "aff_unique_index": "0;0+1+2;0;1;2;0;0;0;0;1;1;0;3;2;0+1", - "aff_unique_norm": "SenseTime;Shanghai AI Laboratory;Nanyang Technological University;Chinese University of Hong Kong", + "aff_unique_norm": "SenseTime;Shanghai AI Laboratory;Nanyang Technological University;The Chinese University of Hong Kong", "aff_unique_dep": "SenseTime Research;;S-Lab;", "aff_unique_url": "https://www.sensetime.com;https://www.shanghai-ai-lab.com;https://www.ntu.edu.sg;https://www.cuhk.edu.hk", "aff_unique_abbr": "SenseTime;SAIL;NTU;CUHK", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0+0+1;0;0;1;0;0;0;0;0;0;0;0;1;0+0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Zhitao and Cai,\n Zhongang and Mei,\n Haiyi and Liu,\n Shuai and Chen,\n Zhaoxi and Xiao,\n Weiye and Wei,\n Yukun and Qing,\n Zhongfei and Wei,\n Chen and Dai,\n Bo and Wu,\n Wayne and Qian,\n Chen and Lin,\n Dahua and Liu,\n Ziwei and Yang,\n Lei\n},\n title = 
{\n SynBody: Synthetic Dataset with Layered Human Models for 3D Human Perception and Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20282-20292\n} \n}" }, { "title": "Synchronize Feature Extracting and Matching: A Single Branch Framework for 3D Object Tracking", @@ -56353,7 +58237,7 @@ "author": "Teli Ma; Mengmeng Wang; Jimin Xiao; Huifeng Wu; Yong Liu", "abstract": "Siamese network has been a de facto benchmark framework for 3D LiDAR object tracking with a shared-parametric encoder extracting features from template and search region, respectively. This paradigm relies heavily on an additional matching network to model the cross-correlation/similarity of the template and search region. In this paper, we forsake the conventional Siamese paradigm and propose a novel single-branch framework, SyncTrack, synchronizing the feature extracting and matching to avoid forwarding encoder twice for template and search region as well as introducing extra parameters of matching network. The synchronization mechanism is based on the dynamic affinity of the Transformer, and an in-depth analysis of the relevance is provided theoretically. Moreover, based on the synchronization, we introduce a novel Attentive Points-Sampling strategy into the Transformer layers (APST), replacing the random/Farthest Points Sampling (FPS) method with sampling under the supervision of attentive relations between the template and search region. It implies connecting point-wise sampling with the feature learning, beneficial to aggregating more distinctive and geometric features for tracking with sparse points. 
Extensive experiments on two benchmark datasets (KITTI and NuScenes) show that SyncTrack achieves state-of-the-art performance in real-time tracking.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ma_Synchronize_Feature_Extracting_and_Matching_A_Single_Branch_Framework_for_ICCV_2023_paper.pdf", - "aff": "The Hong Kong University of Science and Technology, Guangzhou+Zhejiang University; Zhejiang University; Xi\u2019an Jiaotong-Liverpool University; Hangzhou Dianzi University; Zhejiang University", + "aff": "The Hong Kong University of Science and Technology, Guangzhou+Zhejiang University; Zhejiang University; Xi’an Jiaotong-Liverpool University; Hangzhou Dianzi University; Zhejiang University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Ma_Synchronize_Feature_Extracting_ICCV_2023_supplemental.pdf", @@ -56366,14 +58250,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_Synchronize_Feature_Extracting_and_Matching_A_Single_Branch_Framework_for_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;2;3;1", - "aff_unique_norm": "Hong Kong University of Science and Technology;Zhejiang University;Xi'an Jiao Tong-Liverpool University;Hangzhou Dianzi University", + "aff_unique_norm": "The Hong Kong University of Science and Technology;Zhejiang University;Xi'an Jiaotong-Liverpool University;Hangzhou Dianzi University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ust.hk;https://www.zju.edu.cn;https://www.xjtu.edu.cn;http://www.hdu.edu.cn/", "aff_unique_abbr": "HKUST;ZJU;XJTLU;HGHDU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Teli and Wang,\n Mengmeng and Xiao,\n Jimin and Wu,\n Huifeng and Liu,\n Yong\n},\n title = {\n Synchronize Feature Extracting and Matching: A 
Single Branch Framework for 3D Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9953-9963\n} \n}" }, { "title": "Synthesizing Diverse Human Motions in 3D Indoor Scenes", @@ -56385,7 +58270,7 @@ "author": "Kaifeng Zhao; Yan Zhang; Shaofei Wang; Thabo Beeler; Siyu Tang", "abstract": "We present a novel method for populating 3D indoor scenes with virtual humans that can navigate in the environment and interact with objects in a realistic manner. Existing approaches rely on high-quality training sequences that contain captured human motions and the 3D scenes they interact with. However, such interaction data are costly, difficult to capture, and can hardly cover the full range of plausible human-scene interactions in complex indoor environments. To address these challenges, we propose a reinforcement learning-based approach that enables virtual humans to navigate in 3D scenes and interact with objects realistically and autonomously, driven by learned motion control policies. The motion control policies employ latent motion action spaces, which correspond to realistic motion primitives and are learned from large-scale motion capture data using a powerful generative motion model. For navigation in a 3D environment, we propose a scene-aware policy with novel state and reward designs for collision avoidance. Combined with navigation mesh-based path-finding algorithms to generate intermediate waypoints, our approach enables the synthesis of diverse human motions navigating in 3D indoor scenes and avoiding obstacles. To generate fine-grained human-object interactions, we carefully curate interaction goal guidance using a marker-based body representation and leverage features based on the signed distance field (SDF) to encode human-scene proximity relations. 
Our method can synthesize realistic and diverse human-object interactions (e.g., sitting on a chair and then getting up) even for out-of-distribution test scenarios with different object shapes, orientations, starting body positions, and poses. Experimental results demonstrate that our approach outperforms state-of-the-art human-scene interaction synthesis methods in terms of both motion naturalness and diversity. Code, models, and demonstrative video results are publicly available at: https://zkf1997.github.io/DIMOS.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhao_Synthesizing_Diverse_Human_Motions_in_3D_Indoor_Scenes_ICCV_2023_paper.pdf", - "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich; Google; ETH Z\u00fcrich", + "aff": "ETH Zürich; ETH Zürich; ETH Zürich; Google; ETH Zürich", "project": "", "github": "https://zkf1997.github.io/DIMOS", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhao_Synthesizing_Diverse_Human_ICCV_2023_supplemental.pdf", @@ -56398,14 +58283,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Synthesizing_Diverse_Human_Motions_in_3D_Indoor_Scenes_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0", - "aff_unique_norm": "ETH Zurich;Google", - "aff_unique_dep": ";Google", + "aff_unique_norm": "ETH Zürich;Google", + "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.google.com", "aff_unique_abbr": "ETHZ;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1;0", - "aff_country_unique": "Switzerland;United States" + "aff_country_unique": "Switzerland;United States", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Kaifeng and Zhang,\n Yan and Wang,\n Shaofei and Beeler,\n Thabo and Tang,\n Siyu\n},\n title = {\n Synthesizing Diverse Human Motions in 3D Indoor Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer 
Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14738-14749\n} \n}" }, { "title": "TALL: Thumbnail Layout for Deepfake Video Detection", @@ -56437,7 +58323,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Yuting and Liang,\n Jian and Jia,\n Gengyun and Yang,\n Ziming and Zhang,\n Yanhao and He,\n Ran\n},\n title = {\n TALL: Thumbnail Layout for Deepfake Video Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22658-22668\n} \n}" }, { "title": "TAPIR: Tracking Any Point with Per-Frame Initialization and Temporal Refinement", @@ -56460,7 +58347,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Doersch_TAPIR_Tracking_Any_Point_with_Per-Frame_Initialization_and_Temporal_Refinement_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Doersch_TAPIR_Tracking_Any_Point_with_Per-Frame_Initialization_and_Temporal_Refinement_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Doersch_2023_ICCV,\n \n author = {\n Doersch,\n Carl and Yang,\n Yi and Vecerik,\n Mel and Gokay,\n Dilara and Gupta,\n Ankush and Aytar,\n Yusuf and Carreira,\n Joao and Zisserman,\n Andrew\n},\n title = {\n TAPIR: Tracking Any Point with Per-Frame Initialization and Temporal Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10061-10072\n} \n}" }, { "title": "TARGET: Federated Class-Continual Learning via Exemplar-Free Distillation", @@ -56472,7 +58360,7 @@ "author": "Jie Zhang; Chen Chen; Weiming Zhuang; Lingjuan Lyu", 
"abstract": "This paper focuses on an under-explored yet important problem: Federated Class-Continual Learning (FCCL), where new classes are dynamically added in federated learning. Existing FCCL works suffer from various limitations, such as requiring additional datasets or storing the private data from previous tasks. In response, we first demonstrate that non-IID data exacerbates catastrophic forgetting issue in FL. Then we propose a novel method called TARGET (federatTed clAss-continual leaRninG via Exemplar-free disTillation), which alleviates catastrophic forgetting in FCCL while preserving client data privacy. Our proposed method leverages the previously trained global model to transfer knowledge of old tasks to the current task at the model level. Moreover, a generator is trained to produce synthetic data to simulate the global distribution of data on each client at the data level. Compared to previous FCCL methods, TARGET does not require any additional datasets or storing real data from previous tasks, which makes it ideal for data-sensitive scenarios.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_TARGET_Federated_Class-Continual_Learning_via_Exemplar-Free_Distillation_ICCV_2023_paper.pdf", - "aff": "ETH Zurich*; Sony AI; Sony AI; Sony AI\u2020", + "aff": "ETH Zurich*; Sony AI; Sony AI; Sony AI†", "project": "", "github": "", "supp": "", @@ -56492,7 +58380,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", - "aff_country_unique": "Switzerland;Japan" + "aff_country_unique": "Switzerland;Japan", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Jie and Chen,\n Chen and Zhuang,\n Weiming and Lyu,\n Lingjuan\n},\n title = {\n TARGET: Federated Class-Continual Learning via Exemplar-Free Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
4782-4793\n} \n}" }, { "title": "TCOVIS: Temporally Consistent Online Video Instance Segmentation", @@ -56524,7 +58413,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Junlong and Yu,\n Bingyao and Rao,\n Yongming and Zhou,\n Jie and Lu,\n Jiwen\n},\n title = {\n TCOVIS: Temporally Consistent Online Video Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1097-1107\n} \n}" }, { "title": "TEMPO: Efficient Multi-View Pose Estimation, Tracking, and Forecasting", @@ -56532,8 +58422,8 @@ "status": "Poster", "track": "main", "pid": "6415", - "author_site": "Rohan Choudhury, Kris M. Kitani, L\u00e1szl\u00f3 A. Jeni", - "author": "Rohan Choudhury; Kris M. Kitani; L\u00e1szl\u00f3 A. Jeni", + "author_site": "Rohan Choudhury, Kris M. Kitani, László A. Jeni", + "author": "Rohan Choudhury; Kris M. Kitani; László A. Jeni", "abstract": "Existing volumetric methods for predicting 3D human pose estimation are accurate, but computationally expensive and optimized for single time-step prediction. We present TEMPO, an efficient multi-view pose estimation model that learns a robust spatiotemporal representation, improving pose accuracy while also tracking and forecasting human pose. We significantly reduce computation compared to the state-of-the-art by recurrently computing per-person 2D pose features, fusing both spatial and temporal information into a single representation. In doing so, our model is able to use spatiotemporal context to predict more accurate human poses without sacrificing efficiency. We further use this representation to track human poses over time as well as predict future poses. 
Finally, we demonstrate that our model is able to generalize across datasets without scene-specific fine-tuning. TEMPO achieves 10% better MPJPE with a 33x improvement in FPS compared to TesseTrack on the challenging CMU Panoptic Studio dataset. Our code and demos are available at https://rccchoudhury.github.io/tempo2023.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Choudhury_TEMPO_Efficient_Multi-View_Pose_Estimation_Tracking_and_Forecasting_ICCV_2023_paper.pdf", "aff": "Robotics Institute, Carnegie Mellon University; Robotics Institute, Carnegie Mellon University; Robotics Institute, Carnegie Mellon University", @@ -56556,7 +58446,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Choudhury_2023_ICCV,\n \n author = {\n Choudhury,\n Rohan and Kitani,\n Kris M. and Jeni,\n L\\'aszl\\'o A.\n},\n title = {\n TEMPO: Efficient Multi-View Pose Estimation,\n Tracking,\n and Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14750-14760\n} \n}" }, { "title": "TF-ICON: Diffusion-Based Training-Free Cross-Domain Image Composition", @@ -56588,7 +58479,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Shilin and Liu,\n Yanzhu and Kong,\n Adams Wai-Kin\n},\n title = {\n TF-ICON: Diffusion-Based Training-Free Cross-Domain Image Composition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2294-2305\n} \n}" }, { "title": "TIFA: Accurate and Interpretable 
Text-to-Image Faithfulness Evaluation with Question Answering", @@ -56620,7 +58512,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Yushi and Liu,\n Benlin and Kasai,\n Jungo and Wang,\n Yizhong and Ostendorf,\n Mari and Krishna,\n Ranjay and Smith,\n Noah A.\n},\n title = {\n TIFA: Accurate and Interpretable Text-to-Image Faithfulness Evaluation with Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20406-20417\n} \n}" }, { "title": "TIJO: Trigger Inversion with Joint Optimization for Defending Multimodal Backdoored Models", @@ -56652,7 +58545,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sur_2023_ICCV,\n \n author = {\n Sur,\n Indranil and Sikka,\n Karan and Walmer,\n Matthew and Koneripalli,\n Kaushik and Roy,\n Anirban and Lin,\n Xiao and Divakaran,\n Ajay and Jha,\n Susmit\n},\n title = {\n TIJO: Trigger Inversion with Joint Optimization for Defending Multimodal Backdoored Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 165-175\n} \n}" }, { "title": "TM2D: Bimodality Driven 3D Dance Generation via Music-Text Integration", @@ -56677,14 +58571,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gong_TM2D_Bimodality_Driven_3D_Dance_Generation_via_Music-Text_Integration_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;1;0;1;1;0", - "aff_unique_norm": "National University of Singapore;Huawei", - 
"aff_unique_dep": ";Huawei Technologies", + "aff_unique_norm": "National University of Singapore;Huawei Technologies", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.huawei.com", "aff_unique_abbr": "NUS;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Gong_2023_ICCV,\n \n author = {\n Gong,\n Kehong and Lian,\n Dongze and Chang,\n Heng and Guo,\n Chuan and Jiang,\n Zihang and Zuo,\n Xinxin and Mi,\n Michael Bi and Wang,\n Xinchao\n},\n title = {\n TM2D: Bimodality Driven 3D Dance Generation via Music-Text Integration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9942-9952\n} \n}" }, { "title": "TMA: Temporal Motion Aggregation for Event-based Optical Flow", @@ -56716,7 +58611,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Haotian and Chen,\n Guang and Qu,\n Sanqing and Zhang,\n Yanping and Li,\n Zhijun and Knoll,\n Alois and Jiang,\n Changjun\n},\n title = {\n TMA: Temporal Motion Aggregation for Event-based Optical Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9685-9694\n} \n}" }, { "title": "TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion Synthesis", @@ -56724,11 +58620,11 @@ "status": "Poster", "track": "main", "pid": "5332", - "author_site": "Mathis Petrovich, Michael J. Black, G\u00fcl Varol", - "author": "Mathis Petrovich; Michael J. 
Black; G\u00fcl Varol", + "author_site": "Mathis Petrovich, Michael J. Black, Gül Varol", + "author": "Mathis Petrovich; Michael J. Black; Gül Varol", "abstract": "In this paper, we present TMR, a simple yet effective approach for text to 3D human motion retrieval. While previous work has only treated retrieval as a proxy evaluation metric, we tackle it as a standalone task.\n Our method extends the state-of-the-art text-to-motion synthesis model TEMOS, and incorporates a contrastive loss to better structure the cross-modal latent space. We show that maintaining the motion generation loss, along with the contrastive training, is crucial to obtain good performance. We introduce a benchmark for evaluation and provide an in-depth analysis by reporting results on several protocols. Our extensive experiments on the KIT-ML and HumanML3D datasets show that TMR outperforms the prior work by a significant margin, for example reducing the median rank from 54 to 19. Finally, we showcase the potential of our approach on moment retrieval. 
Our code and models are publicly available\n at https://mathis.petrovich.fr/tmr.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Petrovich_TMR_Text-to-Motion_Retrieval_Using_Contrastive_3D_Human_Motion_Synthesis_ICCV_2023_paper.pdf", - "aff": "LIGM, \u00b4Ecole des Ponts, Univ Gustave Eiffel, CNRS, France+Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany; Max Planck Institute for Intelligent Systems, T \u00a8ubingen, Germany; LIGM, \u00b4Ecole des Ponts, Univ Gustave Eiffel, CNRS, France", + "aff": "LIGM, ´Ecole des Ponts, Univ Gustave Eiffel, CNRS, France+Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany; Max Planck Institute for Intelligent Systems, T ¨ubingen, Germany; LIGM, ´Ecole des Ponts, Univ Gustave Eiffel, CNRS, France", "project": "https://mathis.petrovich.fr/tmr", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Petrovich_TMR_Text-to-Motion_Retrieval_ICCV_2023_supplemental.pdf", @@ -56746,9 +58642,10 @@ "aff_unique_url": "https://www.ponts.org;https://www.mpi-is.mpg.de", "aff_unique_abbr": "ENPC;MPI-IS", "aff_campus_unique_index": "1;1", - "aff_campus_unique": ";T\u00fcbingen", + "aff_campus_unique": ";Tübingen", "aff_country_unique_index": "0+1;1;0", - "aff_country_unique": "France;Germany" + "aff_country_unique": "France;Germany", + "bibtex": "@InProceedings{Petrovich_2023_ICCV,\n \n author = {\n Petrovich,\n Mathis and Black,\n Michael J. 
and Varol,\n G\\"ul\n},\n title = {\n TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9488-9497\n} \n}" }, { "title": "TORE: Token Reduction for Efficient Human Mesh Recovery with Transformer", @@ -56773,14 +58670,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Dou_TORE_Token_Reduction_for_Efficient_Human_Mesh_Recovery_with_Transformer_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;0;0;5", - "aff_unique_norm": "University of Hong Kong;University of Oxford;Tencent;University of Cambridge;City University of Hong Kong;Texas A&M University", + "aff_unique_norm": "The University of Hong Kong;University of Oxford;Tencent Holdings Limited;University of Cambridge;City University of Hong Kong;Texas A&M University", "aff_unique_dep": ";;Tencent Games;;;", "aff_unique_url": "https://www.hku.hk;https://www.ox.ac.uk;https://games.qq.com;https://www.cam.ac.uk;https://www.cityu.edu.hk;https://www.tamu.edu", "aff_unique_abbr": "HKU;Oxford;Tencent Games;Cambridge;CityU;TAMU", "aff_campus_unique_index": "0;2;0;0;0", "aff_campus_unique": "Hong Kong SAR;;Cambridge", "aff_country_unique_index": "0;1;0;1;0;0;0;2", - "aff_country_unique": "China;United Kingdom;United States" + "aff_country_unique": "China;United Kingdom;United States", + "bibtex": "@InProceedings{Dou_2023_ICCV,\n \n author = {\n Dou,\n Zhiyang and Wu,\n Qingxuan and Lin,\n Cheng and Cao,\n Zeyu and Wu,\n Qiangqiang and Wan,\n Weilin and Komura,\n Taku and Wang,\n Wenping\n},\n title = {\n TORE: Token Reduction for Efficient Human Mesh Recovery with Transformer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15143-15155\n} \n}" }, { "title": "TRM-UAP: Enhancing the 
Transferability of Data-Free Universal Adversarial Perturbation via Truncated Ratio Maximization", @@ -56812,7 +58710,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yiran and Feng,\n Xin and Wang,\n Yunlong and Yang,\n Wu and Ming,\n Di\n},\n title = {\n TRM-UAP: Enhancing the Transferability of Data-Free Universal Adversarial Perturbation via Truncated Ratio Maximization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4762-4771\n} \n}" }, { "title": "Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models", @@ -56844,7 +58743,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ziyi and Yu,\n Xumin and Rao,\n Yongming and Zhou,\n Jie and Lu,\n Jiwen\n},\n title = {\n Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5640-5650\n} \n}" }, { "title": "Talking Head Generation with Probabilistic Audio-to-Visual Diffusion Priors", @@ -56868,15 +58768,16 @@ "email": "xiaobing.ai;connect.ust.hk;connect.hkust-gz.edu.cn;xiaobing.ai;xiaobing.ai;xiaobing.ai", "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_Talking_Head_Generation_with_Probabilistic_Audio-to-Visual_Diffusion_Priors_ICCV_2023_paper.html", - "aff_unique_index": "0;0+1;0+1;0;0;0", - "aff_unique_norm": "Xiaobing.AI;Hong Kong University of Science and Technology", - "aff_unique_dep": ";", - 
"aff_unique_url": "https://xiaobing.ai;https://www.ust.hk", - "aff_unique_abbr": "Xiaobing.AI;HKUST", + "aff_unique_index": "0;0+1;0+2;0;0;0", + "aff_unique_norm": "Xiaobing.AI;Hong Kong University of Science and Technology;The Hong Kong University of Science and Technology", + "aff_unique_dep": ";;", + "aff_unique_url": "https://xiaobing.ai;https://www.ust.hk;https://www.ust.hk", + "aff_unique_abbr": "Xiaobing.AI;HKUST;HKUST", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Guangzhou", "aff_country_unique_index": "0;0+0;0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Zhentao and Yin,\n Zixin and Zhou,\n Deyu and Wang,\n Duomin and Wong,\n Finn and Wang,\n Baoyuan\n},\n title = {\n Talking Head Generation with Probabilistic Audio-to-Visual Diffusion Priors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7645-7655\n} \n}" }, { "title": "Taming Contrast Maximization for Learning Sequential, Low-latency, Event-based Optical Flow", @@ -56884,8 +58785,8 @@ "status": "Poster", "track": "main", "pid": "10716", - "author_site": "Federico Paredes-Vall\u00e9s, Kirk Y. W. Scheper, Christophe De Wagter, Guido C. H. E. de Croon", - "author": "Federico Paredes-Vall\u00e9s; Kirk Y. W. Scheper; Christophe De Wagter; Guido C. H. E. de Croon", + "author_site": "Federico Paredes-Vallés, Kirk Y. W. Scheper, Christophe De Wagter, Guido C. H. E. de Croon", + "author": "Federico Paredes-Vallés; Kirk Y. W. Scheper; Christophe De Wagter; Guido C. H. E. de Croon", "abstract": "Event cameras have recently gained significant traction since they open up new avenues for low-latency and low-power solutions to complex computer vision problems. To unlock these solutions, it is necessary to develop algorithms that can leverage the unique nature of event data. 
However, the current state-of-the-art is still highly influenced by the frame-based literature, and usually fails to deliver on these promises. In this work, we take this into consideration and propose a novel self-supervised learning pipeline for the sequential estimation of event-based optical flow that allows for the scaling of the models to high inference frequencies. At its core, we have a continuously-running stateful neural model that is trained using a novel formulation of contrast maximization that makes it robust to nonlinearities and varying statistics in the input events. Results across multiple datasets confirm the effectiveness of our method, which establishes a new state of the art in terms of accuracy for approaches trained or optimized without ground truth.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Paredes-Valles_Taming_Contrast_Maximization_for_Learning_Sequential_Low-latency_Event-based_Optical_Flow_ICCV_2023_paper.pdf", "aff": ";;;", @@ -56899,7 +58800,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Paredes-Valles_Taming_Contrast_Maximization_for_Learning_Sequential_Low-latency_Event-based_Optical_Flow_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Paredes-Valles_Taming_Contrast_Maximization_for_Learning_Sequential_Low-latency_Event-based_Optical_Flow_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Paredes-Valles_2023_ICCV,\n \n author = {\n Paredes-Vall\\'es,\n Federico and Scheper,\n Kirk Y. W. and De Wagter,\n Christophe and de Croon,\n Guido C. H. 
E.\n},\n title = {\n Taming Contrast Maximization for Learning Sequential,\n Low-latency,\n Event-based Optical Flow\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9695-9705\n} \n}" }, { "title": "Tangent Model Composition for Ensembling and Continual Fine-tuning", @@ -56931,7 +58833,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Tian Yu and Soatto,\n Stefano\n},\n title = {\n Tangent Model Composition for Ensembling and Continual Fine-tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18676-18686\n} \n}" }, { "title": "Tangent Sampson Error: Fast Approximate Two-view Reprojection Error for Central Camera Models", @@ -56963,7 +58866,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", - "aff_country_unique": "Switzerland;Sweden" + "aff_country_unique": "Switzerland;Sweden", + "bibtex": "@InProceedings{Terekhov_2023_ICCV,\n \n author = {\n Terekhov,\n Mikhail and Larsson,\n Viktor\n},\n title = {\n Tangent Sampson Error: Fast Approximate Two-view Reprojection Error for Central Camera Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3370-3378\n} \n}" }, { "title": "Task Agnostic Restoration of Natural Video Dynamics", @@ -56995,7 +58899,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": 
"@InProceedings{Ali_2023_ICCV,\n \n author = {\n Ali,\n Muhammad Kashif and Kim,\n Dongjin and Kim,\n Tae Hyun\n},\n title = {\n Task Agnostic Restoration of Natural Video Dynamics\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13534-13544\n} \n}" }, { "title": "Task-Oriented Multi-Modal Mutual Leaning for Vision-Language Models", @@ -57018,7 +58923,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Long_Task-Oriented_Multi-Modal_Mutual_Leaning_for_Vision-Language_Models_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Long_Task-Oriented_Multi-Modal_Mutual_Leaning_for_Vision-Language_Models_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Long_2023_ICCV,\n \n author = {\n Long,\n Sifan and Zhao,\n Zhen and Yuan,\n Junkun and Tan,\n Zichang and Liu,\n Jiangjiang and Zhou,\n Luping and Wang,\n Shengsheng and Wang,\n Jingdong\n},\n title = {\n Task-Oriented Multi-Modal Mutual Leaning for Vision-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21959-21969\n} \n}" }, { "title": "Task-aware Adaptive Learning for Cross-domain Few-shot Learning", @@ -57050,7 +58956,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Yurong and Du,\n Ruoyi and Dong,\n Yuan and Hospedales,\n Timothy and Song,\n Yi-Zhe and Ma,\n Zhanyu\n},\n title = {\n Task-aware Adaptive Learning for Cross-domain Few-shot Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1590-1599\n} \n}" }, { "title": "TaskExpert: Dynamically Assembling Multi-Task Representations with Memorial Mixture-of-Experts", @@ -57082,7 +58989,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Hanrong and Xu,\n Dan\n},\n title = {\n TaskExpert: Dynamically Assembling Multi-Task Representations with Memorial Mixture-of-Experts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21828-21837\n} \n}" }, { "title": "Taxonomy Adaptive Cross-Domain Adaptation in Medical Imaging via Optimization Trajectory Distillation", @@ -57107,14 +59015,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fan_Taxonomy_Adaptive_Cross-Domain_Adaptation_in_Medical_Imaging_via_Optimization_Trajectory_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;3;0", - "aff_unique_norm": "University of Sydney;Lawrence Berkeley National Laboratory;University of Maryland;Microsoft", - "aff_unique_dep": ";;;Microsoft Corporation", + "aff_unique_norm": "University of Sydney;Lawrence Berkeley National Laboratory;University of Maryland;Microsoft Corporation", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sydney.edu.au;https://www.lbl.gov;https://www/umd.edu;https://www.microsoft.com", - "aff_unique_abbr": "USYD;LBNL;UMD;Microsoft", + "aff_unique_abbr": "USYD;LBL;UMD;Microsoft", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;College Park", "aff_country_unique_index": "0;0;1;1;1;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Jianan and Liu,\n Dongnan and 
Chang,\n Hang and Huang,\n Heng and Chen,\n Mei and Cai,\n Weidong\n},\n title = {\n Taxonomy Adaptive Cross-Domain Adaptation in Medical Imaging via Optimization Trajectory Distillation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21174-21184\n} \n}" }, { "title": "TeD-SPAD: Temporal Distinctiveness for Self-Supervised Privacy-Preservation for Video Anomaly Detection", @@ -57146,7 +59055,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Orlando", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fioresi_2023_ICCV,\n \n author = {\n Fioresi,\n Joseph and Dave,\n Ishan Rajendrakumar and Shah,\n Mubarak\n},\n title = {\n TeD-SPAD: Temporal Distinctiveness for Self-Supervised Privacy-Preservation for Video Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13598-13609\n} \n}" }, { "title": "Teaching CLIP to Count to Ten", @@ -57169,7 +59079,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Paiss_Teaching_CLIP_to_Count_to_Ten_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Paiss_Teaching_CLIP_to_Count_to_Ten_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Paiss_2023_ICCV,\n \n author = {\n Paiss,\n Roni and Ephrat,\n Ariel and Tov,\n Omer and Zada,\n Shiran and Mosseri,\n Inbar and Irani,\n Michal and Dekel,\n Tali\n},\n title = {\n Teaching CLIP to Count to Ten\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3170-3180\n} \n}" }, { "title": "Tem-Adapter: Adapting Image-Text 
Pretraining for Video Question Answer", @@ -57201,7 +59112,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;2;0;2;3+4;3", - "aff_country_unique": "United States;Netherlands;United Kingdom;China;Canada" + "aff_country_unique": "United States;Netherlands;United Kingdom;China;Canada", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Guangyi and Liu,\n Xiao and Wang,\n Guangrun and Zhang,\n Kun and Torr,\n Philip H.S. and Zhang,\n Xiao-Ping and Tang,\n Yansong\n},\n title = {\n Tem-Adapter: Adapting Image-Text Pretraining for Video Question Answer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13945-13955\n} \n}" }, { "title": "Template Inversion Attack against Face Recognition Systems using 3D Face Reconstruction", @@ -57209,11 +59121,11 @@ "status": "Poster", "track": "main", "pid": "11713", - "author_site": "Hatef Otroshi Shahreza, S\u00e9bastien Marcel", - "author": "Hatef Otroshi Shahreza; S\u00e9bastien Marcel", + "author_site": "Hatef Otroshi Shahreza, Sébastien Marcel", + "author": "Hatef Otroshi Shahreza; Sébastien Marcel", "abstract": "Face recognition systems are increasingly being used in different applications. In such systems, some features (also known as embeddings or templates) are extracted from each face image. Then, the extracted templates are stored in the system's database during the enrollment stage and are later used for recognition. In this paper, we focus on template inversion attacks against face recognition systems and introduce a novel method (dubbed GaFaR) to reconstruct 3D face from facial templates. To this end, we use a geometry-aware generator network based on generative neural radiance fields (GNeRF), and learn a mapping from facial templates to the intermediate latent space of the generator network. 
We train our network with a semi-supervised learning approach using real and synthetic images simultaneously. For the real training data, we use a Generative Adversarial Network (GAN) based framework to learn the distribution of the latent space. For the synthetic training data, where we have the true latent code, we directly train in the latent space of the generator network. In addition, during the inference stage, we also propose optimization on the camera parameters to generate face images to improve the success attack rate (up to 17.14% in our experiments). We evaluate the performance of our method in the whitebox and blackbox attacks against state-of-the-art face recognition models on the LFW and MOBIO datasets. To our knowledge, this paper is the first work on 3D face reconstruction from facial templates. The project page is available at: https://www.idiap.ch/paper/gafar", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shahreza_Template_Inversion_Attack_against_Face_Recognition_Systems_using_3D_Face_ICCV_2023_paper.pdf", - "aff": "Idiap Research Institute, Martigny, Switzerland + \u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne (EPFL), Lausanne, Switzerland; Idiap Research Institute, Martigny, Switzerland + Universit\u00b4e de Lausanne (UNIL), Lausanne, Switzerland", + "aff": "Idiap Research Institute, Martigny, Switzerland + ´Ecole Polytechnique F´ed´erale de Lausanne (EPFL), Lausanne, Switzerland; Idiap Research Institute, Martigny, Switzerland + Universit´e de Lausanne (UNIL), Lausanne, Switzerland", "project": "https://www.idiap.ch/paper/gafar", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Shahreza_Template_Inversion_Attack_ICCV_2023_supplemental.pdf", @@ -57226,14 +59138,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shahreza_Template_Inversion_Attack_against_Face_Recognition_Systems_using_3D_Face_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+2", - 
"aff_unique_norm": "Idiap Research Institute;EPFL;Universit\u00e9 de Lausanne", + "aff_unique_norm": "Idiap Research Institute;Ecole Polytechnique Fédérale de Lausanne;Université de Lausanne", "aff_unique_dep": ";;", "aff_unique_url": "https://www.idiap.ch;https://www.epfl.ch;https://www.unil.ch", "aff_unique_abbr": "Idiap;EPFL;UNIL", "aff_campus_unique_index": "0+1;0+1", "aff_campus_unique": "Martigny;Lausanne", "aff_country_unique_index": "0+0;0+0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Shahreza_2023_ICCV,\n \n author = {\n Shahreza,\n Hatef Otroshi and Marcel,\n S\\'ebastien\n},\n title = {\n Template Inversion Attack against Face Recognition Systems using 3D Face Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19662-19672\n} \n}" }, { "title": "Template-guided Hierarchical Feature Restoration for Anomaly Detection", @@ -57258,14 +59171,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Template-guided_Hierarchical_Feature_Restoration_for_Anomaly_Detection_ICCV_2023_paper.html", "aff_unique_index": "0+1;2+1;1;2+1;1;1;2+1;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;Microsoft;Tsinghua University", + "aff_unique_norm": "Chinese Academy of Sciences;Microsoft Research;Tsinghua University", "aff_unique_dep": "Institute of Automation;Research;", "aff_unique_url": "http://www.ia.cas.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.tsinghua.edu.cn", "aff_unique_abbr": "CAS;MSR Asia;THU", "aff_campus_unique_index": "1;1;1;1;1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0+0;0+0;0;0+0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Hewei and Ren,\n Liping and Fu,\n Jingjing and Wang,\n Yuwang 
and Zhang,\n Zhizheng and Lan,\n Cuiling and Wang,\n Haoqian and Hou,\n Xinwen\n},\n title = {\n Template-guided Hierarchical Feature Restoration for Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6447-6458\n} \n}" }, { "title": "Temporal Collection and Distribution for Referring Video Object Segmentation", @@ -57297,7 +59211,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Jiajin and Zheng,\n Ge and Yang,\n Sibei\n},\n title = {\n Temporal Collection and Distribution for Referring Video Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15466-15476\n} \n}" }, { "title": "Temporal Enhanced Training of Multi-view 3D Object Detector via Historical Object Prediction", @@ -57329,7 +59244,8 @@ "aff_campus_unique_index": "1;1;2;1", "aff_campus_unique": ";Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0;0+0;0;0;0;0+0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Zong_2023_ICCV,\n \n author = {\n Zong,\n Zhuofan and Jiang,\n Dongzhi and Song,\n Guanglu and Xue,\n Zeyue and Su,\n Jingyong and Li,\n Hongsheng and Liu,\n Yu\n},\n title = {\n Temporal Enhanced Training of Multi-view 3D Object Detector via Historical Object Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3781-3790\n} \n}" }, { "title": "Temporal-Coded Spiking Neural Networks with Dynamic Firing Threshold: Learning with Event-Driven Backpropagation", @@ -57337,6 
+59253,7 @@ "status": "Poster", "track": "main", "pid": "11688", + "author_site": "Wenjie Wei, Malu Zhang, Hong Qu, Ammar Belatreche, Jian Zhang, Hong Chen", "author": "Wenjie Wei, Malu Zhang, Hong Qu, Ammar Belatreche, Jian Zhang, Hong Chen", "abstract": "Spiking Neural Networks (SNNs) offer a highly promising computing paradigm due to their biological plausibility, exceptional spatiotemporal information processing capability and low power consumption. As a temporal encoding scheme for SNNs, Time-To-First-Spike (TTFS) encodes information using the timing of a single spike, which allows spiking neurons to transmit information through sparse spike trains and results in lower power consumption and higher computational efficiency compared to traditional rate-based encoding counterparts. However, despite the advantages of the TTFS encoding scheme, the effective and efficient training of TTFS-based deep SNNs remains a significant and open research problem. In this work, we first examine the factors underlying the limitations of applying existing TTFS-based learning algorithms to deep SNNs. Specifically, we investigate issues related to over-sparsity of spikes and the complexity of finding the `causal set'. We then propose a simple yet efficient dynamic firing threshold (DFT) mechanism for spiking neurons to address these issues. Building upon the proposed DFT mechanism, we further introduce a novel direct training algorithm for TTFS-based deep SNNs, called DTA-TTFS. This method utilizes event-driven processing and spike timing to enable efficient learning of deep SNNs. 
The proposed training method was validated on the image classification task and experimental results clearly demonstrate that our proposed method achieves state-of-the-art accuracy in comparison to existing TTFS-based learning algorithms, while maintaining high levels of sparsity and energy efficiency on neuromorphic inference accelerator.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wei_Temporal-Coded_Spiking_Neural_Networks_with_Dynamic_Firing_Threshold_Learning_with_ICCV_2023_paper.pdf", @@ -57348,7 +59265,8 @@ "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18358858900879369919&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Temporal-Coded_Spiking_Neural_Networks_with_Dynamic_Firing_Threshold_Learning_with_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Temporal-Coded_Spiking_Neural_Networks_with_Dynamic_Firing_Threshold_Learning_with_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Wenjie and Zhang,\n Malu and Qu,\n Hong and Belatreche,\n Ammar and Zhang,\n Jian and Chen,\n Hong\n},\n title = {\n Temporal-Coded Spiking Neural Networks with Dynamic Firing Threshold: Learning with Event-Driven Backpropagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10552-10562\n} \n}" }, { "title": "Test Time Adaptation for Blind Image Quality Assessment", @@ -57380,7 +59298,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Bengaluru", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "India" + "aff_country_unique": "India", + "bibtex": "@InProceedings{Roy_2023_ICCV,\n \n author = {\n Roy,\n Subhadeep and Mitra,\n Shankhanil and Biswas,\n Soma and Soundararajan,\n Rajiv\n},\n title = {\n Test Time Adaptation for Blind Image 
Quality Assessment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16742-16751\n} \n}" }, { "title": "Test-time Personalizable Forecasting of 3D Human Poses", @@ -57412,7 +59331,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Cui_2023_ICCV,\n \n author = {\n Cui,\n Qiongjie and Sun,\n Huaijiang and Lu,\n Jianfeng and Li,\n Weiqing and Li,\n Bin and Yi,\n Hongwei and Wang,\n Haofan\n},\n title = {\n Test-time Personalizable Forecasting of 3D Human Poses\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 274-283\n} \n}" }, { "title": "Tetra-NeRF: Representing Neural Radiance Fields Using Tetrahedra", @@ -57444,7 +59364,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0", - "aff_country_unique": "Czech Republic" + "aff_country_unique": "Czech Republic", + "bibtex": "@InProceedings{Kulhanek_2023_ICCV,\n \n author = {\n Kulhanek,\n Jonas and Sattler,\n Torsten\n},\n title = {\n Tetra-NeRF: Representing Neural Radiance Fields Using Tetrahedra\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18458-18469\n} \n}" }, { "title": "TexFusion: Synthesizing 3D Textures with Text-Guided Image Diffusion Models", @@ -57469,14 +59390,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cao_TexFusion_Synthesizing_3D_Textures_with_Text-Guided_Image_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0;0+1+2;0;0", - "aff_unique_norm": "NVIDIA;University of Toronto;Vector Institute", 
- "aff_unique_dep": "NVIDIA Corporation;;", + "aff_unique_norm": "NVIDIA Corporation;University of Toronto;Vector Institute", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.nvidia.com;https://www.utoronto.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "NVIDIA;U of T;Vector Institute", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1+1;0;0+1+1;0;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Cao_2023_ICCV,\n \n author = {\n Cao,\n Tianshi and Kreis,\n Karsten and Fidler,\n Sanja and Sharp,\n Nicholas and Yin,\n Kangxue\n},\n title = {\n TexFusion: Synthesizing 3D Textures with Text-Guided Image Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4169-4181\n} \n}" }, { "title": "Text-Conditioned Sampling Framework for Text-to-Image Generation with Masked Generative Models", @@ -57508,7 +59430,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jaewoong and Jang,\n Sangwon and Jo,\n Jaehyeong and Yoon,\n Jaehong and Kim,\n Yunji and Kim,\n Jin-Hwa and Ha,\n Jung-Woo and Hwang,\n Sung Ju\n},\n title = {\n Text-Conditioned Sampling Framework for Text-to-Image Generation with Masked Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23252-23262\n} \n}" }, { "title": "Text-Driven Generative Domain Adaptation with Spectral Consistency Regularization", @@ -57533,14 +59456,15 @@ "author_num": 5, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Text-Driven_Generative_Domain_Adaptation_with_Spectral_Consistency_Regularization_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0+1;2;0+1+3", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Science and Technology of China;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Computing Technology;;;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of Science and Technology of China;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Computing Technology;;;", "aff_unique_url": "http://www.cas.cn/;http://www.ucas.ac.cn;http://www.ustc.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "CAS;UCAS;USTC;PCL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Zhenhuan and Li,\n Liang and Xiao,\n Jiayu and Zha,\n Zheng-Jun and Huang,\n Qingming\n},\n title = {\n Text-Driven Generative Domain Adaptation with Spectral Consistency Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7019-7029\n} \n}" }, { "title": "Text2Performer: Text-Driven Human Video Generation", @@ -57572,7 +59496,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Yuming and Yang,\n Shuai and Koh,\n Tong Liang and Wu,\n Wayne and Loy,\n Chen Change and Liu,\n Ziwei\n},\n title = {\n Text2Performer: Text-Driven Human Video Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22747-22757\n} \n}" }, { "title": "Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models", @@ -57580,8 +59505,8 @@ "status": "Poster", "track": "main", "pid": "2901", - "author_site": "Lukas H\u00f6llein, Ang Cao, Andrew Owens, Justin Johnson, Matthias Nie\u00dfner", - "author": "Lukas H\u00f6llein; Ang Cao; Andrew Owens; Justin Johnson; Matthias Nie\u00dfner", + "author_site": "Lukas Höllein, Ang Cao, Andrew Owens, Justin Johnson, Matthias Nießner", + "author": "Lukas Höllein; Ang Cao; Andrew Owens; Justin Johnson; Matthias Nießner", "abstract": "We present Text2Room, a method for generating room-scale textured 3D meshes from a given text prompt as input. To this end, we leverage pre-trained 2D text-to-image models to synthesize a sequence of images from different poses. In order to lift these outputs into a consistent 3D scene representation, we combine monocular depth estimation with a text-conditioned inpainting model. The core idea of our approach is a tailored viewpoint selection such that the content of each image can be fused into a seamless, textured 3D mesh. More specifically, we propose a continuous alignment strategy that iteratively fuses scene frames with the existing geometry to create a seamless mesh. Unlike existing works that focus on generating single objects [56, 41] or zoom-out trajectories [18] from text, our method generates complete 3D scenes with multiple objects and explicit 3D geometry. 
We evaluate our approach using qualitative and quantitative metrics, demonstrating it as the first method to generate room-scale 3D geometry with compelling textures from only text as input.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hollein_Text2Room_Extracting_Textured_3D_Meshes_from_2D_Text-to-Image_Models_ICCV_2023_paper.pdf", "aff": "Technical University of Munich; University of Michigan; University of Michigan; University of Michigan; Technical University of Munich", @@ -57604,7 +59529,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Hollein_2023_ICCV,\n \n author = {\n H\\\"ollein,\n Lukas and Cao,\n Ang and Owens,\n Andrew and Johnson,\n Justin and Nie{\\ss\n}ner,\n Matthias\n},\n title = {\n Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7909-7920\n} \n}" }, { "title": "Text2Tex: Text-driven Texture Synthesis via Diffusion Models", @@ -57612,8 +59538,8 @@ "status": "Poster", "track": "main", "pid": "4677", - "author_site": "Dave Zhenyu Chen, Yawar Siddiqui, Hsin-Ying Lee, Sergey Tulyakov, Matthias Nie\u00dfner", - "author": "Dave Zhenyu Chen; Yawar Siddiqui; Hsin-Ying Lee; Sergey Tulyakov; Matthias Nie\u00dfner", + "author_site": "Dave Zhenyu Chen, Yawar Siddiqui, Hsin-Ying Lee, Sergey Tulyakov, Matthias Nießner", + "author": "Dave Zhenyu Chen; Yawar Siddiqui; Hsin-Ying Lee; Sergey Tulyakov; Matthias Nießner", "abstract": "We present Text2Tex, a novel method for generating high-quality textures for 3D meshes from the given text prompts.
Our method incorporates inpainting into a pre-trained depth-aware image diffusion model to progressively synthesize high resolution partial textures from multiple viewpoints. To avoid accumulating inconsistent and stretched artifacts across views, we dynamically segment the rendered view into a generation mask, which represents the generation status of each visible texel. This partitioned view representation guides the depth-aware inpainting model to generate and update partial textures for the corresponding regions. Furthermore, we propose an automatic view sequence generation scheme to determine the next best view for updating the partial texture. Extensive experiments demonstrate that our method significantly outperforms the existing text-driven approaches and GAN-based methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_Text2Tex_Text-driven_Texture_Synthesis_via_Diffusion_Models_ICCV_2023_paper.pdf", "aff": "Technical University of Munich; Technical University of Munich; Snap Research; Snap Research; Technical University of Munich", @@ -57636,7 +59562,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "Germany;United States" + "aff_country_unique": "Germany;United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Dave Zhenyu and Siddiqui,\n Yawar and Lee,\n Hsin-Ying and Tulyakov,\n Sergey and Nie{\\ss\n}ner,\n Matthias\n},\n title = {\n Text2Tex: Text-driven Texture Synthesis via Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18558-18568\n} \n}" }, { "title": "Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators", @@ -57668,7 +59595,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Austin;Atlanta", "aff_country_unique_index": "0;0;0;0;0+0;0;0", - 
"aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khachatryan_2023_ICCV,\n \n author = {\n Khachatryan,\n Levon and Movsisyan,\n Andranik and Tadevosyan,\n Vahram and Henschel,\n Roberto and Wang,\n Zhangyang and Navasardyan,\n Shant and Shi,\n Humphrey\n},\n title = {\n Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15954-15964\n} \n}" }, { "title": "TextManiA: Enriching Visual Feature by Text-driven Manifold Augmentation", @@ -57693,14 +59621,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ye-Bin_TextManiA_Enriching_Visual_Feature_by_Text-driven_Manifold_Augmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4", - "aff_unique_norm": "POSTECH;Columbia University;Sungkyunkwan University;Microsoft;Yonsei University", + "aff_unique_norm": "POSTECH;Columbia University;Sungkyunkwan University;Microsoft Corporation;Yonsei University", "aff_unique_dep": "Dept. 
of Electrical Engineering;;;Azure Division;Institute for Convergence Research and Education in Advanced Technology", "aff_unique_url": "https://www.postech.ac.kr;https://www.columbia.edu;https://www.skku.edu;https://azure.microsoft.com;https://www.yonsei.ac.kr", "aff_unique_abbr": "POSTECH;Columbia;SKKU;Microsoft Azure;Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Ye-Bin_2023_ICCV,\n \n author = {\n Ye-Bin,\n Moon and Kim,\n Jisoo and Kim,\n Hongyeob and Son,\n Kilho and Oh,\n Tae-Hyun\n},\n title = {\n TextManiA: Enriching Visual Feature by Text-driven Manifold Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2526-2537\n} \n}" }, { "title": "TextPSG: Panoptic Scene Graph Generation from Textual Descriptions", @@ -57732,7 +59661,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Amherst", "aff_country_unique_index": "0;1;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Chengyang and Shen,\n Yikang and Chen,\n Zhenfang and Ding,\n Mingyu and Gan,\n Chuang\n},\n title = {\n TextPSG: Panoptic Scene Graph Generation from Textual Descriptions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2839-2850\n} \n}" }, { "title": "Texture Generation on 3D Meshes with Point-UV Diffusion", @@ -57757,14 +59687,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_Texture_Generation_on_3D_Meshes_with_Point-UV_Diffusion_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;1;0", - 
"aff_unique_norm": "University of Hong Kong;Chinese University of Hong Kong;TCL Communication", + "aff_unique_norm": "The University of Hong Kong;The Chinese University of Hong Kong;TCL Communication", "aff_unique_dep": ";;Corporate Research", "aff_unique_url": "https://www.hku.hk;https://www.cuhk.edu.hk;https://www.tcl.com", "aff_unique_abbr": "HKU;CUHK;TCL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Xin and Dai,\n Peng and Li,\n Wenbo and Ma,\n Lan and Liu,\n Zhengzhe and Qi,\n Xiaojuan\n},\n title = {\n Texture Generation on 3D Meshes with Point-UV Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4206-4216\n} \n}" }, { "title": "Texture Learning Domain Randomization for Domain Generalized Segmentation", @@ -57791,12 +59722,13 @@ "aff_unique_index": "0;0;0", "aff_unique_norm": "Agency for Defense Development", "aff_unique_dep": "", - "aff_unique_url": "http://www.add.re.kr/", + "aff_unique_url": "https://www.add.re.kr", "aff_unique_abbr": "ADD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Kim_2023_ICCV,\n \n author = {\n Kim,\n Sunghwan and Kim,\n Dae-hwan and Kim,\n Hoseong\n},\n title = {\n Texture Learning Domain Randomization for Domain Generalized Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 677-687\n} \n}" }, { "title": "The Devil is in the Crack Orientation: A New Perspective for Crack Detection", @@ -57828,7 +59760,8 @@ 
"aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zhuangzhuang and Zhang,\n Jin and Lai,\n Zhuonan and Zhu,\n Guanming and Liu,\n Zun and Chen,\n Jie and Li,\n Jianqiang\n},\n title = {\n The Devil is in the Crack Orientation: A New Perspective for Crack Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6653-6663\n} \n}" }, { "title": "The Devil is in the Upsampling: Architectural Decisions Made Simpler for Denoising with Deep Image Prior", @@ -57860,7 +59793,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yilin and Li,\n Jiang and Pang,\n Yunkui and Nie,\n Dong and Yap,\n Pew-Thian\n},\n title = {\n The Devil is in the Upsampling: Architectural Decisions Made Simpler for Denoising with Deep Image Prior\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12408-12417\n} \n}" }, { "title": "The Effectiveness of MAE Pre-Pretraining for Billion-Scale Pretraining", @@ -57885,14 +59819,15 @@ "author_num": 12, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Singh_The_Effectiveness_of_MAE_Pre-Pretraining_for_Billion-Scale_Pretraining_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", - "aff_unique_norm": "Meta", + "aff_unique_norm": "Meta Platforms, Inc.", "aff_unique_dep": "Meta AI", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Singh_2023_ICCV,\n \n author = {\n Singh,\n Mannat and Duval,\n Quentin and Alwala,\n Kalyan Vasudev and Fan,\n Haoqi and Aggarwal,\n Vaibhav and Adcock,\n Aaron and Joulin,\n Armand and Dollar,\n Piotr and Feichtenhofer,\n Christoph and Girshick,\n Ross and Girdhar,\n Rohit and Misra,\n Ishan\n},\n title = {\n The Effectiveness of MAE Pre-Pretraining for Billion-Scale Pretraining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5484-5494\n} \n}" }, { "title": "The Euclidean Space is Evil: Hyperbolic Attribute Editing for Few-shot Image Generation", @@ -57924,7 +59859,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", - "aff_country_unique": "United States;United Kingdom;China" + "aff_country_unique": "United States;United Kingdom;China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Lingxiao and Zhang,\n Yi and Wang,\n Shuhui\n},\n title = {\n The Euclidean Space is Evil: Hyperbolic Attribute Editing for Few-shot Image Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22714-22724\n} \n}" }, { "title": "The Making and Breaking of Camouflage", @@ -57956,7 +59892,8 @@ "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "Oxford;Shanghai", "aff_country_unique_index": "0;0+1;0", - "aff_country_unique": "United Kingdom;China" + "aff_country_unique": "United Kingdom;China", + "bibtex": "@InProceedings{Lamdouar_2023_ICCV,\n \n author = {\n Lamdouar,\n Hala and Xie,\n Weidi and Zisserman,\n Andrew\n},\n title = {\n The Making and Breaking of Camouflage\n},\n booktitle = {\n 
Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 832-842\n} \n}" }, { "title": "The Perils of Learning From Unlabeled Data: Backdoor Attacks on Semi-supervised Learning", @@ -57988,7 +59925,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "United States;Japan" + "aff_country_unique": "United States;Japan", + "bibtex": "@InProceedings{Shejwalkar_2023_ICCV,\n \n author = {\n Shejwalkar,\n Virat and Lyu,\n Lingjuan and Houmansadr,\n Amir\n},\n title = {\n The Perils of Learning From Unlabeled Data: Backdoor Attacks on Semi-supervised Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4730-4740\n} \n}" }, { "title": "The Power of Sound (TPoS): Audio Reactive Video Generation with Stable Diffusion", @@ -58000,7 +59938,7 @@ "author": "Yujin Jeong; Wonjeong Ryoo; Seunghyun Lee; Dabin Seo; Wonmin Byeon; Sangpil Kim; Jinkyu Kim", "abstract": "In recent years, video generation has become a prominent generative tool and has drawn significant attention. However, there is little consideration in audio-to-video generation, though audio contains unique qualities like temporal semantics and magnitude. Hence, we propose The Power of Sound (TPoS) model to incorporate audio input that includes both changeable temporal semantics and magnitude. To generate video frames, TPoS utilizes a latent stable diffusion model with textual semantic information, which is then guided by the sequential audio embedding from our pretrained Audio Encoder. As a result, this method produces audio reactive video contents. We demonstrate the effectiveness of TPoS across various tasks and compare its results with current state-of-the-art techniques in the field of audio-to-video generation. 
More examples are available at https://ku-vai.github.io/TPoS/", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Jeong_The_Power_of_Sound_TPoS_Audio_Reactive_Video_Generation_with_ICCV_2023_paper.pdf", - "aff": "Department of Computer Science and Engineering, Korea University; Department of Arti\ufb01cial Intelligence, Korea University; Department of Arti\ufb01cial Intelligence, Korea University; Department of Computer Science and Engineering, Korea University; NVIDIA Research; Department of Arti\ufb01cial Intelligence, Korea University + NVIDIA Research; Department of Computer Science and Engineering, Korea University + Department of Arti\ufb01cial Intelligence, Korea University", + "aff": "Department of Computer Science and Engineering, Korea University; Department of Artificial Intelligence, Korea University; Department of Artificial Intelligence, Korea University; Department of Computer Science and Engineering, Korea University; NVIDIA Research; Department of Artificial Intelligence, Korea University + NVIDIA Research; Department of Computer Science and Engineering, Korea University + Department of Artificial Intelligence, Korea University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Jeong_The_Power_of_ICCV_2023_supplemental.pdf", @@ -58013,14 +59951,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Jeong_The_Power_of_Sound_TPoS_Audio_Reactive_Video_Generation_with_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0+1;0+0", - "aff_unique_norm": "Korea University;NVIDIA", + "aff_unique_norm": "Korea University;NVIDIA Corporation", "aff_unique_dep": "Department of Computer Science and Engineering;NVIDIA Research", "aff_unique_url": "https://www.korea.ac.kr;https://www.nvidia.com/research", "aff_unique_abbr": "KU;NVIDIA", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0+1;0+0", - "aff_country_unique": "South 
Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Jeong_2023_ICCV,\n \n author = {\n Jeong,\n Yujin and Ryoo,\n Wonjeong and Lee,\n Seunghyun and Seo,\n Dabin and Byeon,\n Wonmin and Kim,\n Sangpil and Kim,\n Jinkyu\n},\n title = {\n The Power of Sound (TPoS): Audio Reactive Video Generation with Stable Diffusion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7822-7832\n} \n}" }, { "title": "The Stable Signature: Rooting Watermarks in Latent Diffusion Models", @@ -58028,11 +59967,11 @@ "status": "Poster", "track": "main", "pid": "5015", - "author_site": "Pierre Fernandez, Guillaume Couairon, Herv\u00e9 J\u00e9gou, Matthijs Douze, Teddy Furon", - "author": "Pierre Fernandez; Guillaume Couairon; Herv\u00e9 J\u00e9gou; Matthijs Douze; Teddy Furon", + "author_site": "Pierre Fernandez, Guillaume Couairon, Hervé Jégou, Matthijs Douze, Teddy Furon", + "author": "Pierre Fernandez; Guillaume Couairon; Hervé Jégou; Matthijs Douze; Teddy Furon", "abstract": "Generative image modeling enables a wide range of applications but raises ethical concerns about responsible deployment. This paper introduces an active strategy combining image watermarking and Latent Diffusion Models. The goal is for all generated images to conceal a watermark allowing for future detection and/or identification. The method quickly fine-tunes the image generator, conditioned on a binary signature. A pre-trained watermark extractor recovers the hidden signature from any generated image and a statistical test then determines whether it comes from the generative model. We evaluate the invisibility and robustness of our watermark on a variety of generation tasks, showing that Stable Signature works even after the images are modified. 
For instance, it detects the origin of an image generated from a text prompt, then cropped to keep 10% of the content, with 90+% accuracy at a false positive rate below 1e-6.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Fernandez_The_Stable_Signature_Rooting_Watermarks_in_Latent_Diffusion_Models_ICCV_2023_paper.pdf", - "aff": "Meta AI+Centre Inria de l\u2019Universit\u00e9 de Rennes+Sorbonne University; Meta AI+Centre Inria de l\u2019Universit\u00e9 de Rennes+Sorbonne University; Meta AI; Meta AI; Centre Inria de l\u2019Universit\u00e9 de Rennes", + "aff": "Meta AI+Centre Inria de l’Université de Rennes+Sorbonne University; Meta AI+Centre Inria de l’Université de Rennes+Sorbonne University; Meta AI; Meta AI; Centre Inria de l’Université de Rennes", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Fernandez_The_Stable_Signature_ICCV_2023_supplemental.pdf", @@ -58045,14 +59984,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fernandez_The_Stable_Signature_Rooting_Watermarks_in_Latent_Diffusion_Models_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0+1+2;0;0;1", - "aff_unique_norm": "Meta;INRIA;Sorbonne University", + "aff_unique_norm": "Meta Platforms, Inc.;Inria;Sorbonne University", "aff_unique_dep": "Meta AI;;", "aff_unique_url": "https://meta.com;https://www.inria.fr;https://www.sorbonne.universite.fr", "aff_unique_abbr": "Meta;Inria;Sorbonne", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Rennes", "aff_country_unique_index": "0+1+1;0+1+1;0;0;1", - "aff_country_unique": "United States;France" + "aff_country_unique": "United States;France", + "bibtex": "@InProceedings{Fernandez_2023_ICCV,\n \n author = {\n Fernandez,\n Pierre and Couairon,\n Guillaume and J\\'egou,\n Herv\\'e and Douze,\n Matthijs and Furon,\n Teddy\n},\n title = {\n The Stable Signature: Rooting Watermarks in Latent Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22466-22477\n} \n}" }, { "title": "The Unreasonable Effectiveness of Large Language-Vision Models for Source-Free Video Domain Adaptation", @@ -58060,11 +60000,11 @@ "status": "Poster", "track": "main", "pid": "10154", - "author_site": "Giacomo Zara, Alessandro Conti, Subhankar Roy, St\u00e9phane Lathuili\u00e8re, Paolo Rota, Elisa Ricci", - "author": "Giacomo Zara; Alessandro Conti; Subhankar Roy; St\u00e9phane Lathuili\u00e8re; Paolo Rota; Elisa Ricci", + "author_site": "Giacomo Zara, Alessandro Conti, Subhankar Roy, Stéphane Lathuilière, Paolo Rota, Elisa Ricci", + "author": "Giacomo Zara; Alessandro Conti; Subhankar Roy; Stéphane Lathuilière; Paolo Rota; Elisa Ricci", "abstract": "Source-Free Video Unsupervised Domain Adaptation (SFVUDA) task consists in adapting an action recognition model, trained on a labelled source dataset, to an unlabelled target dataset, without accessing the actual source data. The previous approaches have attempted to address SFVUDA by leveraging self-supervision (e.g., enforcing temporal consistency) derived from the target data itself. In this work, we take an orthogonal approach by exploiting \"web-supervision\" from Large Language-Vision Models (LLVMs), driven by the rationale that LLVMs contain a rich world prior surprisingly robust to domain-shift. We showcase the unreasonable effectiveness of integrating LLVMs for SFVUDA by devising an intuitive and parameter-efficient method, which we name Domain Adaptation with Large Language-Vision models (DALL-V), that distills the world prior and complementary source model information into a student network tailored for the target. 
Despite the simplicity, DALL-V achieves significant improvement over state-of-the-art SFVUDA methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zara_The_Unreasonable_Effectiveness_of_Large_Language-Vision_Models_for_Source-Free_Video_ICCV_2023_paper.pdf", - "aff": "University of Trento, Italy; University of Trento, Italy; LTCI, T \u00b4el\u00b4ecom Paris, Institut polytechnique de Paris, France; LTCI, T \u00b4el\u00b4ecom Paris, Institut polytechnique de Paris, France; University of Trento, Italy; University of Trento, Italy+Fondazione Bruno Kessler, Italy", + "aff": "University of Trento, Italy; University of Trento, Italy; LTCI, Télécom Paris, Institut polytechnique de Paris, France; LTCI, Télécom Paris, Institut polytechnique de Paris, France; University of Trento, Italy; University of Trento, Italy+Fondazione Bruno Kessler, Italy", "project": "", "github": "https://github.com/giaczara/dallv", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zara_The_Unreasonable_Effectiveness_ICCV_2023_supplemental.pdf", @@ -58077,14 +60017,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zara_The_Unreasonable_Effectiveness_of_Large_Language-Vision_Models_for_Source-Free_Video_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;1;0;0+2", - "aff_unique_norm": "University of Trento;T\u00e9l\u00e9com Paris;Fondazione Bruno Kessler", + "aff_unique_norm": "University of Trento;Télécom Paris;Fondazione Bruno Kessler", "aff_unique_dep": ";LTCI;", "aff_unique_url": "https://www.unitn.it;https://www.telecom-paris.fr;https://www.fbk.eu", - "aff_unique_abbr": "UniTN;T\u00e9l\u00e9com Paris;FBK", + "aff_unique_abbr": "UniTN;Télécom Paris;FBK", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0+0", - "aff_country_unique": "Italy;France" + "aff_country_unique": "Italy;France", + "bibtex": "@InProceedings{Zara_2023_ICCV,\n \n author = {\n Zara,\n Giacomo and Conti,\n 
Alessandro and Roy,\n Subhankar and Lathuili\\`ere,\n St\\'ephane and Rota,\n Paolo and Ricci,\n Elisa\n},\n title = {\n The Unreasonable Effectiveness of Large Language-Vision Models for Source-Free Video Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10307-10317\n} \n}" }, { "title": "The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a Clean Model on Poisoned Data", @@ -58116,7 +60057,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Zixuan and Wang,\n Rui and Zou,\n Cong and Jing,\n Lihua\n},\n title = {\n The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a Clean Model on Poisoned Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 155-164\n} \n}" }, { "title": "Theoretical and Numerical Analysis of 3D Reconstruction Using Point and Line Incidences", @@ -58124,11 +60066,11 @@ "status": "Poster", "track": "main", "pid": "12433", - "author_site": "Felix Rydell, Elima Shehu, Ang\u00e9lica Torres", - "author": "Felix Rydell; Elima Shehu; Ang\u00e9lica Torres", + "author_site": "Felix Rydell, Elima Shehu, Angélica Torres", + "author": "Felix Rydell; Elima Shehu; Angélica Torres", "abstract": "We study the joint image of lines incident to points, meaning the set of image tuples obtained from fixed cameras observing a varying 3D point-line incidence. We prove a formula for the number of complex critical points of the triangulation problem that aims to compute a 3D point-line incidence from noisy images. 
Our formula works for an arbitrary number of images and measures the intrinsic difficulty of this triangulation. Additionally, we conduct numerical experiments using homotopy continuation methods, comparing different approaches of triangulation of such incidences. In our setup, exploiting the incidence relations gives a notably faster point reconstruction with comparable accuracy.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Rydell_Theoretical_and_Numerical_Analysis_of_3D_Reconstruction_Using_Point_and_ICCV_2023_paper.pdf", - "aff": "KTH Royal Institute of Technology; University of Osnabr \u00a8uck + MPI MiS; Centre de Recerca Matem `atica", + "aff": "KTH Royal Institute of Technology; University of Osnabrück + MPI MiS; Centre de Recerca Matemàtica", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Rydell_Theoretical_and_Numerical_ICCV_2023_supplemental.zip", @@ -58141,14 +60083,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Rydell_Theoretical_and_Numerical_Analysis_of_3D_Reconstruction_Using_Point_and_ICCV_2023_paper.html", "aff_unique_index": "0;1+2;3", - "aff_unique_norm": "KTH Royal Institute of Technology;University of Osnabr\u00fcck;Max Planck Institute for Mathematics in the Sciences;Centre de Recerca Matem\u00e0tica", - "aff_unique_dep": ";;;Mathematical Research Center", + "aff_unique_norm": "KTH Royal Institute of Technology;University of Osnabrück;Max Planck Institute for Mathematics in the Sciences;Centre de Recerca Matemàtica", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.kth.se;https://www.uni-osnabrueck.de;https://www.mis.mpg.de;", "aff_unique_abbr": "KTH;UOS;MPI MiS;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+1;2", - "aff_country_unique": "Sweden;Germany;Spain" + "aff_country_unique": "Sweden;Germany;Spain", + "bibtex": "@InProceedings{Rydell_2023_ICCV,\n \n author = {\n Rydell,\n 
Felix and Shehu,\n Elima and Torres,\n Ang\\'elica\n},\n title = {\n Theoretical and Numerical Analysis of 3D Reconstruction Using Point and Line Incidences\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3748-3757\n} \n}" }, { "title": "Thinking Image Color Aesthetics Assessment: Models, Datasets and Benchmarks", @@ -58180,7 +60123,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Shuai and Ming,\n Anlong and Li,\n Yaqi and Sun,\n Jinyuan and Zheng,\n ShunTian and Ma,\n Huadong\n},\n title = {\n Thinking Image Color Aesthetics Assessment: Models,\n Datasets and Benchmarks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21838-21847\n} \n}" }, { "title": "TiDAL: Learning Training Dynamics for Active Learning", @@ -58212,7 +60156,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2", - "aff_country_unique": ";United States;South Korea" + "aff_country_unique": ";United States;South Korea", + "bibtex": "@InProceedings{Kye_2023_ICCV,\n \n author = {\n Kye,\n Seong Min and Choi,\n Kwanghee and Byun,\n Hyeongmin and Chang,\n Buru\n},\n title = {\n TiDAL: Learning Training Dynamics for Active Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22335-22345\n} \n}" }, { "title": "TiDy-PSFs: Computational Imaging with Time-Averaged Dynamic Point-Spread-Functions", @@ -58244,7 +60189,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park", 
"aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Shah_2023_ICCV,\n \n author = {\n Shah,\n Sachin and Kulshrestha,\n Sakshum and Metzler,\n Christopher A.\n},\n title = {\n TiDy-PSFs: Computational Imaging with Time-Averaged Dynamic Point-Spread-Functions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10657-10667\n} \n}" }, { "title": "Tiled Multiplane Images for Practical 3D Photography", @@ -58269,14 +60215,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Khan_Tiled_Multiplane_Images_for_Practical_3D_Photography_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "Meta", - "aff_unique_dep": "Meta Platforms, Inc.", + "aff_unique_norm": "Meta Platforms, Inc.", + "aff_unique_dep": "", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Khan_2023_ICCV,\n \n author = {\n Khan,\n Numair and Xiao,\n Lei and Lanman,\n Douglas\n},\n title = {\n Tiled Multiplane Images for Practical 3D Photography\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10454-10464\n} \n}" }, { "title": "Time Does Tell: Self-Supervised Time-Tuning of Dense Image Representations", @@ -58308,7 +60255,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Salehi_2023_ICCV,\n \n author = {\n Salehi,\n Mohammadreza and Gavves,\n Efstratios and Snoek,\n Cees G.M. 
and Asano,\n Yuki M.\n},\n title = {\n Time Does Tell: Self-Supervised Time-Tuning of Dense Image Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16536-16547\n} \n}" }, { "title": "Time-to-Contact Map by Joint Estimation of Up-to-Scale Inverse Depth and Global Motion using a Single Event Camera", @@ -58340,7 +60288,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Nunes_2023_ICCV,\n \n author = {\n Nunes,\n Urbano Miguel and Perrinet,\n Laurent Udo and Ieng,\n Sio-Hoi\n},\n title = {\n Time-to-Contact Map by Joint Estimation of Up-to-Scale Inverse Depth and Global Motion using a Single Event Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23653-23663\n} \n}" }, { "title": "Tiny Updater: Towards Efficient Neural Network-Driven Software Updating", @@ -58372,7 +60321,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Linfeng and Ma,\n Kaisheng\n},\n title = {\n Tiny Updater: Towards Efficient Neural Network-Driven Software Updating\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23447-23459\n} \n}" }, { "title": "TinyCLIP: CLIP Distillation via Affinity Mimicking and Weight Inheritance", @@ -58397,14 +60347,15 @@ "author_num": 12, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Wu_TinyCLIP_CLIP_Distillation_via_Affinity_Mimicking_and_Weight_Inheritance_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;1;1;1;1;1;1;2;0;1", - "aff_unique_norm": "Sun Yat-sen University;Microsoft;Huazhong University of Science and Technology", - "aff_unique_dep": ";Microsoft Corporation;", + "aff_unique_norm": "Sun Yat-sen University;Microsoft Corporation;Huazhong University of Science and Technology", + "aff_unique_dep": ";;", "aff_unique_url": "http://www.sysu.edu.cn/;https://www.microsoft.com;http://www.hust.edu.cn", "aff_unique_abbr": "SYSU;Microsoft;HUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;1;1;1;1;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Kan and Peng,\n Houwen and Zhou,\n Zhenghong and Xiao,\n Bin and Liu,\n Mengchen and Yuan,\n Lu and Xuan,\n Hong and Valenzuela,\n Michael and Chen,\n Xi (Stephen) and Wang,\n Xinggang and Chao,\n Hongyang and Hu,\n Han\n},\n title = {\n TinyCLIP: CLIP Distillation via Affinity Mimicking and Weight Inheritance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21970-21980\n} \n}" }, { "title": "To Adapt or Not to Adapt? 
Real-Time Adaptation for Semantic Segmentation", @@ -58412,8 +60363,8 @@ "status": "Poster", "track": "main", "pid": "8838", - "author_site": "Marc Botet Colomer, Pier Luigi Dovesi, Theodoros Panagiotakopoulos, Joao Frederico Carvalho, Linus H\u00e4renstam-Nielsen, Hossein Azizpour, Hedvig Kjellstr\u00f6m, Daniel Cremers, Matteo Poggi", - "author": "Marc Botet Colomer; Pier Luigi Dovesi; Theodoros Panagiotakopoulos; Joao Frederico Carvalho; Linus H\u00e4renstam-Nielsen; Hossein Azizpour; Hedvig Kjellstr\u00f6m; Daniel Cremers; Matteo Poggi", + "author_site": "Marc Botet Colomer, Pier Luigi Dovesi, Theodoros Panagiotakopoulos, Joao Frederico Carvalho, Linus Härenstam-Nielsen, Hossein Azizpour, Hedvig Kjellström, Daniel Cremers, Matteo Poggi", + "author": "Marc Botet Colomer; Pier Luigi Dovesi; Theodoros Panagiotakopoulos; Joao Frederico Carvalho; Linus Härenstam-Nielsen; Hossein Azizpour; Hedvig Kjellström; Daniel Cremers; Matteo Poggi", "abstract": "The goal of Online Domain Adaptation for semantic segmentation is to handle unforeseeable domain changes that occur during deployment, like sudden weather events. However, the high computational costs associated with brute-force adaptation make this paradigm unfeasible for real-world applications. In this paper we propose HAMLET, a Hardware-Aware Modular Least Expensive Training framework for real-time domain adaptation. Our approach includes a hardware-aware back-propagation orchestration agent (HAMT) and a dedicated domain-shift detector that enables active control over when and how the model is adapted (LT). Thanks to these advancements, our approach is capable of performing semantic segmentation while simultaneously adapting at more than 29FPS on a single consumer-grade GPU. 
Our framework's encouraging accuracy and speed trade-off is demonstrated on OnDA and SHIFT benchmarks through experimental results.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Colomer_To_Adapt_or_Not_to_Adapt_Real-Time_Adaptation_for_Semantic_ICCV_2023_paper.pdf", "aff": ";;;;;;;;", @@ -58427,7 +60378,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Colomer_To_Adapt_or_Not_to_Adapt_Real-Time_Adaptation_for_Semantic_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Colomer_To_Adapt_or_Not_to_Adapt_Real-Time_Adaptation_for_Semantic_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Colomer_2023_ICCV,\n \n author = {\n Colomer,\n Marc Botet and Dovesi,\n Pier Luigi and Panagiotakopoulos,\n Theodoros and Carvalho,\n Joao Frederico and H\\\"arenstam-Nielsen,\n Linus and Azizpour,\n Hossein and Kjellstr\\\"om,\n Hedvig and Cremers,\n Daniel and Poggi,\n Matteo\n},\n title = {\n To Adapt or Not to Adapt? 
Real-Time Adaptation for Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16548-16559\n} \n}" }, { "title": "Token-Label Alignment for Vision Transformers", @@ -58450,7 +60402,8 @@ "aff_domain": ";;;;", "email": ";;;;", "author_num": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiao_Token-Label_Alignment_for_Vision_Transformers_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xiao_Token-Label_Alignment_for_Vision_Transformers_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xiao_2023_ICCV,\n \n author = {\n Xiao,\n Han and Zheng,\n Wenzhao and Zhu,\n Zheng and Zhou,\n Jie and Lu,\n Jiwen\n},\n title = {\n Token-Label Alignment for Vision Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5495-5504\n} \n}" }, { "title": "Too Large; Data Reduction for Vision-Language Pre-Training", @@ -58482,7 +60435,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Singapore", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Alex Jinpeng and Lin,\n Kevin Qinghong and Zhang,\n David Junhao and Lei,\n Stan Weixian and Shou,\n Mike Zheng\n},\n title = {\n Too Large; Data Reduction for Vision-Language Pre-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3147-3157\n} \n}" }, { "title": "ToonTalker: Cross-Domain Face Reenactment", @@ -58490,6 +60444,7 @@ "status": "Poster", "track": "main", "pid": "8738", + "author_site": "Yuan Gong, Yong Zhang, Xiaodong Cun, Fei Yin, Yanbo Fan, Xuan Wang, Baoyuan 
Wu, Yujiu Yang", "author": "Yuan Gong, Yong Zhang, Xiaodong Cun, Fei Yin, Yanbo Fan, Xuan Wang, Baoyuan Wu, Yujiu Yang", "abstract": "We target cross-domain face reenactment in this paper, i.e., driving a cartoon image with the video of a real person and vice versa. Recently, many works have focused on one-shot talking face generation to drive a portrait with a real video, i.e., within-domain reenactment. Straightforwardly applying those methods to cross-domain animation will cause inaccurate expression transfer, blur effects, and even apparent artifacts due to the domain shift between cartoon and real faces. Only a few works attempt to settle cross-domain face reenactment. The most related work AnimeCeleb requires constructing a dataset with pose vector and cartoon image pairs by animating 3D characters, which makes it inapplicable anymore if no paired data is available. In this paper, we propose a novel method for cross-domain reenactment without paired data. Specifically, we propose a transformer-based framework to align the motions from different domains into a common latent space where motion transfer is conducted via latent code addition. Two domain-specific motion encoders and two learnable motion base memories are used to capture domain properties. A source query transformer and a driving one are exploited to project domain-specific motion to the canonical space. The edited motion is projected back to the domain of the source with a transformer. Moreover, since no paired data is provided, we propose a novel cross-domain training scheme using data from two domains with the designed analogy constraint. Besides, we contribute a cartoon dataset in Disney style. 
Extensive evaluations demonstrate the superiority of our method over competing methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Gong_ToonTalker_Cross-Domain_Face_Reenactment_ICCV_2023_paper.pdf", @@ -58501,7 +60456,8 @@ "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13208841070685422542&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gong_ToonTalker_Cross-Domain_Face_Reenactment_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gong_ToonTalker_Cross-Domain_Face_Reenactment_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Gong_2023_ICCV,\n \n author = {\n Gong,\n Yuan and Zhang,\n Yong and Cun,\n Xiaodong and Yin,\n Fei and Fan,\n Yanbo and Wang,\n Xuan and Wu,\n Baoyuan and Yang,\n Yujiu\n},\n title = {\n ToonTalker: Cross-Domain Face Reenactment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7690-7700\n} \n}" }, { "title": "TopoSeg: Topology-Aware Nuclear Instance Segmentation", @@ -58509,6 +60465,7 @@ "status": "Poster", "track": "main", "pid": "4580", + "author_site": "Hongliang He, Jun Wang, Pengxu Wei, Fan Xu, Xiangyang Ji, Chang Liu, Jie Chen", "author": "Hongliang He, Jun Wang, Pengxu Wei, Fan Xu, Xiangyang Ji, Chang Liu, Jie Chen", "abstract": "Nuclear instance segmentation has been critical for pathology image analysis in medical science, e.g., cancer diagnosis. Current methods typically adopt pixel-wise optimization for nuclei boundary exploration, where rich structural information could be lost for subsequent quantitative morphology assessment. 
To address this issue, we develop a topology-aware segmentation approach, termed TopoSeg, which exploits topological structure information to keep the predictions rational, especially in common situations with densely touching and overlapping nucleus instances. Concretely, TopoSeg builds on a topology-aware module (TAM), which encodes dynamic changes of different topology structures within the three-class probability maps (inside, boundary, and background) of the nuclei to persistence barcodes and makes the topology-aware loss function. To efficiently focus on regions with high topological errors, we propose an adaptive topology-aware selection (ATS) strategy to enhance the topology-aware optimization procedure further. Experiments on three nuclear instance segmentation datasets justify the superiority of TopoSeg, which achieves state-of-the-art performance. The code is available at https://github.com/hhlisme/toposeg.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/He_TopoSeg_Topology-Aware_Nuclear_Instance_Segmentation_ICCV_2023_paper.pdf", @@ -58520,7 +60477,8 @@ "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9109896013160872443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/He_TopoSeg_Topology-Aware_Nuclear_Instance_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/He_TopoSeg_Topology-Aware_Nuclear_Instance_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Hongliang and Wang,\n Jun and Wei,\n Pengxu and Xu,\n Fan and Ji,\n Xiangyang and Liu,\n Chang and Chen,\n Jie\n},\n title = {\n TopoSeg: Topology-Aware Nuclear Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21307-21316\n} \n}" }, { "title": "Total-Recon: 
Deformable Scene Reconstruction for Embodied View Synthesis", @@ -58552,7 +60510,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Chonghyuk and Yang,\n Gengshan and Deng,\n Kangle and Zhu,\n Jun-Yan and Ramanan,\n Deva\n},\n title = {\n Total-Recon: Deformable Scene Reconstruction for Embodied View Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17671-17682\n} \n}" }, { "title": "Toward Multi-Granularity Decision-Making: Explicit Visual Reasoning with Hierarchical Knowledge", @@ -58584,7 +60543,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yifeng and Chen,\n Shi and Zhao,\n Qi\n},\n title = {\n Toward Multi-Granularity Decision-Making: Explicit Visual Reasoning with Hierarchical Knowledge\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2573-2583\n} \n}" }, { "title": "Toward Unsupervised Realistic Visual Question Answering", @@ -58616,7 +60576,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yuwei and Ho,\n Chih-Hui and Vasconcelos,\n Nuno\n},\n title = {\n Toward Unsupervised Realistic Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision 
(ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15613-15624\n} \n}" }, { "title": "Towards Attack-tolerant Federated Learning via Critical Parameter Analysis", @@ -58641,14 +60602,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_Towards_Attack-tolerant_Federated_Learning_via_Critical_Parameter_Analysis_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;1;1;3+0", - "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Microsoft;Gwangju Institute of Science and Technology;Institute for Basic Science", + "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Microsoft Research;Gwangju Institute of Science and Technology;Institute for Basic Science", "aff_unique_dep": ";Research;;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.microsoft.com/en-us/research/group/asia;https://www.gist.ac.kr;https://www.ibs.re.kr", "aff_unique_abbr": "KAIST;MSR Asia;GIST;IBS", "aff_campus_unique_index": "1;2;1;1;", "aff_campus_unique": ";Asia;Gwangju", "aff_country_unique_index": "0;0;1;0;1;1;0+0", - "aff_country_unique": "South Korea;China" + "aff_country_unique": "South Korea;China", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Sungwon and Park,\n Sungwon and Wu,\n Fangzhao and Kim,\n Sundong and Zhu,\n Bin and Xie,\n Xing and Cha,\n Meeyoung\n},\n title = {\n Towards Attack-tolerant Federated Learning via Critical Parameter Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4999-5008\n} \n}" }, { "title": "Towards Authentic Face Restoration with Iterative Diffusion Models and Beyond", @@ -58674,13 +60636,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhao_Towards_Authentic_Face_Restoration_with_Iterative_Diffusion_Models_and_Beyond_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Google", - 
"aff_unique_dep": "Google", + "aff_unique_dep": "", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Yang and Hou,\n Tingbo and Su,\n Yu-Chuan and Jia,\n Xuhui and Li,\n Yandong and Grundmann,\n Matthias\n},\n title = {\n Towards Authentic Face Restoration with Iterative Diffusion Models and Beyond\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7312-7322\n} \n}" }, { "title": "Towards Better Robustness against Common Corruptions for Unsupervised Domain Adaptation", @@ -58692,7 +60655,7 @@ "author": "Zhiqiang Gao; Kaizhu Huang; Rui Zhang; Dawei Liu; Jieming Ma", "abstract": "Recent studies have investigated how to achieve robustness for unsupervised domain adaptation (UDA). While most efforts focus on adversarial robustness, i.e. how the model performs against unseen malicious adversarial perturbations, robustness against benign common corruption (RaCC) surprisingly remains under-explored for UDA. Towards improving RaCC for UDA methods in an unsupervised manner, we propose a novel Distributionally and Discretely Adversarial Regularization (DDAR) framework in this paper. Formulated as a min-max optimization with a distribution distance, DDAR is theoretically well-founded to ensure generalization over unknown common corruptions. Meanwhile, we show that our regularization scheme effectively reduces a surrogate of RaCC, i.e., the perceptual distance between natural data and common corruption. 
To enable a better adversarial regularization, the design of the optimization pipeline relies on an image discretization scheme that can transform \"out-of-distribution\" adversarial data into \"in-distribution\" data augmentation. Through extensive experiments, in terms of RaCC, our method is superior to conventional unsupervised regularization mechanisms, widely improves the robustness of existing UDA methods, and achieves state-of-the-art performance.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Gao_Towards_Better_Robustness_against_Common_Corruptions_for_Unsupervised_Domain_Adaptation_ICCV_2023_paper.pdf", - "aff": "Duke Kunshan University, Kunshan, China+Xi\u2019an Jiatong-Liverpool University, Suzhou, China; Duke Kunshan University, Kunshan, China; Xi\u2019an Jiatong-Liverpool University, Suzhou, China; Duke Kunshan University, Kunshan, China; Xi\u2019an Jiatong-Liverpool University, Suzhou, China", + "aff": "Duke Kunshan University, Kunshan, China+Xi’an Jiatong-Liverpool University, Suzhou, China; Duke Kunshan University, Kunshan, China; Xi’an Jiatong-Liverpool University, Suzhou, China; Duke Kunshan University, Kunshan, China; Xi’an Jiatong-Liverpool University, Suzhou, China", "project": "", "github": "https://github.com/gzqhappy/DDAR", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Gao_Towards_Better_Robustness_ICCV_2023_supplemental.pdf", @@ -58712,7 +60675,8 @@ "aff_campus_unique_index": "0+1;0;1;0;1", "aff_campus_unique": "Kunshan;Suzhou", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Zhiqiang and Huang,\n Kaizhu and Zhang,\n Rui and Liu,\n Dawei and Ma,\n Jieming\n},\n title = {\n Towards Better Robustness against Common Corruptions for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 18882-18893\n} \n}" }, { "title": "Towards Building More Robust Models with Frequency Bias", @@ -58737,14 +60701,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bu_Towards_Building_More_Robust_Models_with_Frequency_Bias_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;0+2", - "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Shanghai Jiao Tong University;University of Hong Kong", + "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Shanghai Jiao Tong University;The University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "http://www.shailab.org/;https://www.sjtu.edu.cn;https://www.hku.hk", "aff_unique_abbr": "Shanghai AI Lab;SJTU;HKU", "aff_campus_unique_index": ";1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Bu_2023_ICCV,\n \n author = {\n Bu,\n Qingwen and Huang,\n Dong and Cui,\n Heming\n},\n title = {\n Towards Building More Robust Models with Frequency Bias\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4402-4411\n} \n}" }, { "title": "Towards Content-based Pixel Retrieval in Revisited Oxford and Paris", @@ -58776,7 +60741,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1+1;0", - "aff_country_unique": "South Korea;China" + "aff_country_unique": "South Korea;China", + "bibtex": "@InProceedings{An_2023_ICCV,\n \n author = {\n An,\n Guoyuan and Kim,\n Woo Jae and Yang,\n Saelyne and Li,\n Rong and Huo,\n Yuchi and Yoon,\n Sun-Eui\n},\n title = {\n Towards Content-based Pixel Retrieval in Revisited Oxford and Paris\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n 
year = {\n 2023\n},\n pages = {\n 20507-20518\n} \n}" }, { "title": "Towards Deeply Unified Depth-aware Panoptic Segmentation with Bi-directional Guidance Learning", @@ -58808,7 +60774,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Junwen and Wang,\n Yifan and Wang,\n Lijun and Lu,\n Huchuan and Luo,\n Bin and He,\n Jun-Yan and Lan,\n Jin-Peng and Geng,\n Yifeng and Xie,\n Xuansong\n},\n title = {\n Towards Deeply Unified Depth-aware Panoptic Segmentation with Bi-directional Guidance Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4111-4121\n} \n}" }, { "title": "Towards Effective Instance Discrimination Contrastive Loss for Unsupervised Domain Adaptation", @@ -58840,7 +60807,8 @@ "aff_campus_unique_index": "0", "aff_campus_unique": "Hefei;", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Yixin and Wang,\n Zilei and Li,\n Junjie and Zhuang,\n Jiafan and Lin,\n Zihan\n},\n title = {\n Towards Effective Instance Discrimination Contrastive Loss for Unsupervised Domain Adaptation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11388-11399\n} \n}" }, { "title": "Towards Fair and Comprehensive Comparisons for Image-Based 3D Object Detection", @@ -58872,7 +60840,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0+1;0;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author 
= {\n Ma,\n Xinzhu and Wang,\n Yongtao and Zhang,\n Yinmin and Xia,\n Zhiyi and Meng,\n Yuan and Wang,\n Zhihui and Li,\n Haojie and Ouyang,\n Wanli\n},\n title = {\n Towards Fair and Comprehensive Comparisons for Image-Based 3D Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6425-6435\n} \n}" }, { "title": "Towards Fairness-aware Adversarial Network Pruning", @@ -58884,7 +60853,7 @@ "author": "Lei Zhang; Zhibo Wang; Xiaowei Dong; Yunhe Feng; Xiaoyi Pang; Zhifei Zhang; Kui Ren", "abstract": "Network pruning aims to compress models while minimizing loss in accuracy. With the increasing focus on bias in AI systems, the bias inheriting or even magnification nature of traditional network pruning methods has raised a new perspective towards fairness-aware network pruning. Straightforward pruning plus debias methods and recent designs for monitoring disparities of demographic attributes during pruning have endeavored to enhance fairness in pruning. However, neither simple assembling of two tasks nor specifically designed pruning strategies could achieve the optimal trade-off among pruning ratio, accuracy, and fairness. This paper proposes an end-to-end learnable framework for fairness-aware network pruning, which optimizes both pruning and debias tasks jointly by adversarial training against those final evaluation metrics like accuracy for pruning, and disparate impact (DI) and equalized odds (DEO) for fairness. In other words, our fairness-aware adversarial pruning method would learn to prune without any handcraft rules. Therefore, our approach could flexibly adapt to variate network structures. Exhaustive experimentation demonstrates the generalization capacity of our approach, as well as superior performance on pruning and debias simultaneously. 
To highlight, the proposed method could preserve the SOTA pruning performance while significantly improving fairness by around 50% as compared to traditional pruning methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_Towards_Fairness-aware_Adversarial_Network_Pruning_ICCV_2023_paper.pdf", - "aff": "Zhejiang University; Zhejiang University\u2021; Wuhan University; University of North Texas; Wuhan University; Adobe Research; Zhejiang University", + "aff": "Zhejiang University; Zhejiang University‡; Wuhan University; University of North Texas; Wuhan University; Adobe Research; Zhejiang University", "project": "", "github": "", "supp": "", @@ -58904,7 +60873,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Lei and Wang,\n Zhibo and Dong,\n Xiaowei and Feng,\n Yunhe and Pang,\n Xiaoyi and Zhang,\n Zhifei and Ren,\n Kui\n},\n title = {\n Towards Fairness-aware Adversarial Network Pruning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5168-5177\n} \n}" }, { "title": "Towards General Low-Light Raw Noise Synthesis and Modeling", @@ -58936,7 +60906,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Feng and Xu,\n Bin and Li,\n Zhiqiang and Liu,\n Xinran and Lu,\n Qingbo and Gao,\n Changxin and Sang,\n Nong\n},\n title = {\n Towards General Low-Light Raw Noise Synthesis and Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = 
{\n 2023\n},\n pages = {\n 10820-10830\n} \n}" }, { "title": "Towards Generic Image Manipulation Detection with Weakly-Supervised Self-Consistency Learning", @@ -58968,7 +60939,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhai_2023_ICCV,\n \n author = {\n Zhai,\n Yuanhao and Luan,\n Tianyu and Doermann,\n David and Yuan,\n Junsong\n},\n title = {\n Towards Generic Image Manipulation Detection with Weakly-Supervised Self-Consistency Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22390-22400\n} \n}" }, { "title": "Towards Geospatial Foundation Models via Continual Pretraining", @@ -58976,8 +60948,8 @@ "status": "Poster", "track": "main", "pid": "10512", - "author_site": "Mat\u00edas Mendieta, Boran Han, Xingjian Shi, Yi Zhu, Chen Chen", - "author": "Mat\u00edas Mendieta; Boran Han; Xingjian Shi; Yi Zhu; Chen Chen", + "author_site": "Matías Mendieta, Boran Han, Xingjian Shi, Yi Zhu, Chen Chen", + "author": "Matías Mendieta; Boran Han; Xingjian Shi; Yi Zhu; Chen Chen", "abstract": "Geospatial technologies are becoming increasingly essential in our world for a wide range of applications, including agriculture, urban planning, and disaster response. To help improve the applicability and performance of deep learning models on these geospatial tasks, various works have begun investigating foundation models for this domain. Researchers have explored two prominent approaches for introducing such models in geospatial applications, but both have drawbacks in terms of limited performance benefit or prohibitive training cost. Therefore, in this work, we propose a novel paradigm for building highly effective geospatial foundation models with minimal resource cost and carbon impact. 
We first construct a compact yet diverse dataset from multiple sources to promote feature diversity, which we term GeoPile. Then, we investigate the potential of continual pretraining from large-scale ImageNet-22k models and propose a multi-objective continual pretraining paradigm, which leverages the strong representations of ImageNet while simultaneously providing the freedom to learn valuable in-domain features. Our approach outperforms previous state-of-the-art geospatial pretraining methods in an extensive evaluation on seven downstream datasets covering various tasks such as change detection, classification, multi-label classification, semantic segmentation, and super-resolution. Code is available at https://github.com/mmendiet/GFM.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Mendieta_Towards_Geospatial_Foundation_Models_via_Continual_Pretraining_ICCV_2023_paper.pdf", "aff": "Center for Research in Computer Vision, University of Central Florida; Amazon Web Services; Boson AI; Boson AI; Center for Research in Computer Vision, University of Central Florida", @@ -58993,14 +60965,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Mendieta_Towards_Geospatial_Foundation_Models_via_Continual_Pretraining_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;2;0", - "aff_unique_norm": "University of Central Florida;Amazon;Boson AI", - "aff_unique_dep": "Center for Research in Computer Vision;Amazon Web Services;", + "aff_unique_norm": "University of Central Florida;Amazon Web Services;Boson AI", + "aff_unique_dep": "Center for Research in Computer Vision;;", "aff_unique_url": "https://www.ucf.edu;https://aws.amazon.com;https://www.boson.ai", "aff_unique_abbr": "UCF;AWS;Boson AI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Orlando;", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": 
"@InProceedings{Mendieta_2023_ICCV,\n \n author = {\n Mendieta,\n Mat{\\'\\i\n}as and Han,\n Boran and Shi,\n Xingjian and Zhu,\n Yi and Chen,\n Chen\n},\n title = {\n Towards Geospatial Foundation Models via Continual Pretraining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16806-16816\n} \n}" }, { "title": "Towards Grand Unified Representation Learning for Unsupervised Visible-Infrared Person Re-Identification", @@ -59032,7 +61005,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Bin and Chen,\n Jun and Ye,\n Mang\n},\n title = {\n Towards Grand Unified Representation Learning for Unsupervised Visible-Infrared Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11069-11079\n} \n}" }, { "title": "Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation Using only Images", @@ -59044,7 +61018,7 @@ "author": "Cuican Yu; Guansong Lu; Yihan Zeng; Jian Sun; Xiaodan Liang; Huibin Li; Zongben Xu; Songcen Xu; Wei Zhang; Hang Xu", "abstract": "Generating 3D faces from textual descriptions has a multitude of applications, such as gaming, movie and robotics. Recent progresses have demonstrated the success of unconditional 3D face generation and text-to-3D shape generation. However, due to the limited text-3D face data pairs, text-driven 3D face generation remains an open problem. In this paper, we propose a text-guided 3D faces generation method, refer as TG-3DFace, for generating realistic 3D face using text guidance. 
Specifically, we adopt an unconditional 3D face generation framework and equip it with text conditions, which learns the text-guided 3D face generation with only text-2D face data. On top of that, we propose two text-to-face cross-modal alignment techniques, including the global contrastive learning and the fine-grained alignment module, to facilitate high semantic consistency between generated 3D faces and input texts. Besides, we present directional classifier guidance during the inference process, which encourages creativity for out-of-domain generations. Compared to the existing methods, TG-3DFace creates more realistic and aesthetically pleasing 3D faces, boosting 9% multi-view consistency (MVIC) over Latent3D. The rendered face images generated by TG-3DFace achieve higher FID and CLIP score than text-to-2D face/image generation models, demonstrating our superiority in generating realistic and semantic-consistent textures.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Yu_Towards_High-Fidelity_Text-Guided_3D_Face_Generation_and_Manipulation_Using_only_ICCV_2023_paper.pdf", - "aff": "Xi\u2019an Jiaotong University; Huawei Noah\u2019s Ark Lab; Sun Yat-sen University; Xi\u2019an Jiaotong University; Sun Yat-sen University; Xi\u2019an Jiaotong University; Xi\u2019an Jiaotong University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "Xi’an Jiaotong University; Huawei Noah’s Ark Lab; Sun Yat-sen University; Xi’an Jiaotong University; Sun Yat-sen University; Xi’an Jiaotong University; Xi’an Jiaotong University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Yu_Towards_High-Fidelity_Text-Guided_ICCV_2023_supplemental.zip", @@ -59057,14 +61031,15 @@ "author_num": 10, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Yu_Towards_High-Fidelity_Text-Guided_3D_Face_Generation_and_Manipulation_Using_only_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0;2;0;0;1;1;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Huawei;Sun Yat-sen University", - "aff_unique_dep": ";Noah\u2019s Ark Lab;", + "aff_unique_norm": "Xi'an Jiaotong University;Huawei;Sun Yat-sen University", + "aff_unique_dep": ";Noah’s Ark Lab;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.huawei.com;http://www.sysu.edu.cn/", "aff_unique_abbr": "XJTU;Huawei;SYSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Cuican and Lu,\n Guansong and Zeng,\n Yihan and Sun,\n Jian and Liang,\n Xiaodan and Li,\n Huibin and Xu,\n Zongben and Xu,\n Songcen and Zhang,\n Wei and Xu,\n Hang\n},\n title = {\n Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation Using only Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15326-15337\n} \n}" }, { "title": "Towards High-Quality Specular Highlight Removal by Leveraging Large-Scale Synthetic Data", @@ -59088,15 +61063,16 @@ "email": "polyu.edu.hk;sysu.edu.cn;ust.hk;whu.edu.cn;polyu.edu.hk", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fu_Towards_High-Quality_Specular_Highlight_Removal_by_Leveraging_Large-Scale_Synthetic_Data_ICCV_2023_paper.html", - "aff_unique_index": "0;1;2+2;3;0", - "aff_unique_norm": "Hong Kong Polytechnic University;Sun Yat-sen University;Hong Kong University of Science and Technology;Wuhan University", - "aff_unique_dep": ";;;School of Computer Science", - "aff_unique_url": 
"https://www.polyu.edu.hk;http://www.sysu.edu.cn/;https://www.ust.hk;http://www.whu.edu.cn", - "aff_unique_abbr": "PolyU;SYSU;HKUST;WHU", + "aff_unique_index": "0;1;2+3;4;0", + "aff_unique_norm": "The Hong Kong Polytechnic University;Sun Yat-sen University;The Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;Wuhan University", + "aff_unique_dep": ";;;;School of Computer Science", + "aff_unique_url": "https://www.polyu.edu.hk;http://www.sysu.edu.cn/;https://www.ust.hk;https://www.ust.hk;http://www.whu.edu.cn", + "aff_unique_abbr": "PolyU;SYSU;HKUST;HKUST;WHU", "aff_campus_unique_index": "0;1;1+0;2;0", "aff_campus_unique": "Hong Kong SAR;Guangzhou;Wuhan", "aff_country_unique_index": "0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Gang and Zhang,\n Qing and Zhu,\n Lei and Xiao,\n Chunxia and Li,\n Ping\n},\n title = {\n Towards High-Quality Specular Highlight Removal by Leveraging Large-Scale Synthetic Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12857-12865\n} \n}" }, { "title": "Towards Improved Input Masking for Convolutional Neural Networks", @@ -59128,7 +61104,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Balasubramanian_2023_ICCV,\n \n author = {\n Balasubramanian,\n Sriram and Feizi,\n Soheil\n},\n title = {\n Towards Improved Input Masking for Convolutional Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1855-1865\n} \n}" }, { "title": "Towards Inadequately Pre-trained Models in Transfer 
Learning", @@ -59160,7 +61137,8 @@ "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Macau SAR;Birmingham", "aff_country_unique_index": "0;1+1;1;0;1;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Deng_2023_ICCV,\n \n author = {\n Deng,\n Andong and Li,\n Xingjian and Hu,\n Di and Wang,\n Tianyang and Xiong,\n Haoyi and Xu,\n Cheng-Zhong\n},\n title = {\n Towards Inadequately Pre-trained Models in Transfer Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19397-19408\n} \n}" }, { "title": "Towards Instance-adaptive Inference for Federated Learning", @@ -59192,7 +61170,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Canberra;Harbin", "aff_country_unique_index": "0;0;1;0;1+2;3", - "aff_country_unique": "Singapore;United Arab Emirates;Australia;China" + "aff_country_unique": "Singapore;United Arab Emirates;Australia;China", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Chun-Mei and Yu,\n Kai and Liu,\n Nian and Xu,\n Xinxing and Khan,\n Salman and Zuo,\n Wangmeng\n},\n title = {\n Towards Instance-adaptive Inference for Federated Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23287-23296\n} \n}" }, { "title": "Towards Memory- and Time-Efficient Backpropagation for Training Spiking Neural Networks", @@ -59217,14 +61196,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Meng_Towards_Memory-_and_Time-Efficient_Backpropagation_for_Training_Spiking_Neural_Networks_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;2;2+2+3;2+2+3;0+1", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen Research Institute of Big Data;Peking University;Pengcheng Laboratory", - 
"aff_unique_dep": ";;School of Intelligence Science and Technology;Peng Cheng Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;Shenzhen Research Institute of Big Data;Peking University;Peng Cheng Laboratory", + "aff_unique_dep": ";;School of Intelligence Science and Technology;", "aff_unique_url": "https://www.cuhk.edu.cn;http://www.sribd.cn;http://www.pku.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "CUHK;;Peking University;PCL", "aff_campus_unique_index": "0;2;;;0", "aff_campus_unique": "Shenzhen;;Beijing", "aff_country_unique_index": "0+0;0;0;0+0+0;0+0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Meng_2023_ICCV,\n \n author = {\n Meng,\n Qingyan and Xiao,\n Mingqing and Yan,\n Shen and Wang,\n Yisen and Lin,\n Zhouchen and Luo,\n Zhi-Quan\n},\n title = {\n Towards Memory- and Time-Efficient Backpropagation for Training Spiking Neural Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6166-6176\n} \n}" }, { "title": "Towards Models that Can See and Read", @@ -59249,14 +61229,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ganz_Towards_Models_that_Can_See_and_Read_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1", - "aff_unique_norm": "Technion - Israel Institute of Technology;Amazon", + "aff_unique_norm": "Technion - Israel Institute of Technology;Amazon Web Services", "aff_unique_dep": ";AWS AI Labs", "aff_unique_url": "https://www.technion.ac.il/en/;https://aws.amazon.com", "aff_unique_abbr": "Technion;AWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", - "aff_country_unique": "Israel;United States" + "aff_country_unique": "Israel;United States", + "bibtex": "@InProceedings{Ganz_2023_ICCV,\n \n author = {\n Ganz,\n Roy and Nuriel,\n Oren and Aberdam,\n Aviad and 
Kittenplon,\n Yair and Mazor,\n Shai and Litman,\n Ron\n},\n title = {\n Towards Models that Can See and Read\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21718-21728\n} \n}" }, { "title": "Towards Multi-Layered 3D Garments Animation", @@ -59288,7 +61269,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Yidi and Loy,\n Chen Change and Dai,\n Bo\n},\n title = {\n Towards Multi-Layered 3D Garments Animation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14361-14370\n} \n}" }, { "title": "Towards Nonlinear-Motion-Aware and Occlusion-Robust Rolling Shutter Correction", @@ -59320,7 +61302,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qu_2023_ICCV,\n \n author = {\n Qu,\n Delin and Lao,\n Yizhen and Wang,\n Zhigang and Wang,\n Dong and Zhao,\n Bin and Li,\n Xuelong\n},\n title = {\n Towards Nonlinear-Motion-Aware and Occlusion-Robust Rolling Shutter Correction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10680-10688\n} \n}" }, { "title": "Towards Open-Set Test-Time Adaptation Utilizing the Wisdom of Crowds in Entropy Minimization", @@ -59352,7 +61335,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1;0", - "aff_country_unique": "United States;South Korea" + "aff_country_unique": "United States;South Korea", + 
"bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jungsoo and Das,\n Debasmit and Choo,\n Jaegul and Choi,\n Sungha\n},\n title = {\n Towards Open-Set Test-Time Adaptation Utilizing the Wisdom of Crowds in Entropy Minimization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16380-16389\n} \n}" }, { "title": "Towards Open-Vocabulary Video Instance Segmentation", @@ -59384,7 +61368,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;1;1;0", - "aff_country_unique": "Netherlands;China" + "aff_country_unique": "Netherlands;China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Haochen and Yan,\n Cilin and Wang,\n Shuai and Jiang,\n Xiaolong and Tang,\n Xu and Hu,\n Yao and Xie,\n Weidi and Gavves,\n Efstratios\n},\n title = {\n Towards Open-Vocabulary Video Instance Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4057-4066\n} \n}" }, { "title": "Towards Real-World Burst Image Super-Resolution: Benchmark and Method", @@ -59416,7 +61401,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Pengxu and Sun,\n Yujing and Guo,\n Xingbei and Liu,\n Chang and Li,\n Guanbin and Chen,\n Jie and Ji,\n Xiangyang and Lin,\n Liang\n},\n title = {\n Towards Real-World Burst Image Super-Resolution: Benchmark and Method\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13233-13242\n} \n}" }, { "title": "Towards Realistic Evaluation of Industrial 
Continual Learning Scenarios with an Emphasis on Energy Consumption and Computational Footprint", @@ -59424,8 +61410,8 @@ "status": "Poster", "track": "main", "pid": "12669", - "author_site": "Vivek Chavan, Paul Koch, Marian Schl\u00fcter, Clemens Briese", - "author": "Vivek Chavan; Paul Koch; Marian Schl\u00fcter; Clemens Briese", + "author_site": "Vivek Chavan, Paul Koch, Marian Schlüter, Clemens Briese", + "author": "Vivek Chavan; Paul Koch; Marian Schlüter; Clemens Briese", "abstract": "Incremental Learning (IL) aims to develop Machine Learning (ML) models that can learn from continuous streams of data and mitigate catastrophic forgetting. We analyse the current state-of-the-art Class-IL implementations and demonstrate why the current body of research tends to be one-dimensional, with an excessive focus on accuracy metrics. A realistic evaluation of Continual Learning methods should also emphasise energy consumption and overall computational load for a comprehensive understanding. This paper addresses research gaps between current IL research and industrial project environments, including varying incremental tasks and the introduction of Joint Training in tandem with IL. We introduce InVar-100 (Industrial Objects in Varied Contexts), a novel dataset meant to simulate the visual environments in industrial setups and perform various experiments for IL. Additionally, we incorporate explainability (using class activations) to interpret the model predictions. Our approach, RECIL (Real-World Scenarios and Energy Efficiency Considerations for Class Incremental Learning) provides meaningful insights about the applicability of IL approaches in practical use cases. The overarching aim is to bring the Incremental Learning and Green AI fields together and encourage the application of CIL methods in real-world scenarios. 
Code and dataset are available.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chavan_Towards_Realistic_Evaluation_of_Industrial_Continual_Learning_Scenarios_with_an_ICCV_2023_paper.pdf", "aff": "Fraunhofer IPK, Berlin, Germany; Fraunhofer IPK, Berlin, Germany; Fraunhofer IPK, Berlin, Germany; Fraunhofer IPK, Berlin, Germany", @@ -59448,7 +61434,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berlin", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Chavan_2023_ICCV,\n \n author = {\n Chavan,\n Vivek and Koch,\n Paul and Schl\\"uter,\n Marian and Briese,\n Clemens\n},\n title = {\n Towards Realistic Evaluation of Industrial Continual Learning Scenarios with an Emphasis on Energy Consumption and Computational Footprint\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11506-11518\n} \n}" }, { "title": "Towards Robust Model Watermark via Reducing Parametric Vulnerability", @@ -59473,14 +61460,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gan_Towards_Robust_Model_Watermark_via_Reducing_Parametric_Vulnerability_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;2;0+3", - "aff_unique_norm": "Tsinghua University;Ant Group;University of Tokyo;Pengcheng Laboratory", + "aff_unique_norm": "Tsinghua University;Ant Group;The University of Tokyo;Peng Cheng Laboratory", "aff_unique_dep": "International Graduate School;;;Research Center of Artificial Intelligence", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.antgroup.com;https://www.u-tokyo.ac.jp;", "aff_unique_abbr": "THU;Ant Group;UTokyo;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0+0;1;0+0", - "aff_country_unique": "China;Japan" + "aff_country_unique": "China;Japan", + "bibtex": 
"@InProceedings{Gan_2023_ICCV,\n \n author = {\n Gan,\n Guanhao and Li,\n Yiming and Wu,\n Dongxian and Xia,\n Shu-Tao\n},\n title = {\n Towards Robust Model Watermark via Reducing Parametric Vulnerability\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4751-4761\n} \n}" }, { "title": "Towards Robust and Smooth 3D Multi-Person Pose Estimation from Monocular Videos in the Wild", @@ -59512,7 +61500,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Sungchan and You,\n Eunyi and Lee,\n Inhoe and Lee,\n Joonseok\n},\n title = {\n Towards Robust and Smooth 3D Multi-Person Pose Estimation from Monocular Videos in the Wild\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14772-14782\n} \n}" }, { "title": "Towards Saner Deep Image Registration", @@ -59544,7 +61533,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chicago", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Duan_2023_ICCV,\n \n author = {\n Duan,\n Bin and Zhong,\n Ming and Yan,\n Yan\n},\n title = {\n Towards Saner Deep Image Registration\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12459-12468\n} \n}" }, { "title": "Towards Semi-supervised Learning with Non-random Missing Labels", @@ -59576,7 +61566,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;0", - "aff_country_unique": "China;Australia" + 
"aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Duan_2023_ICCV,\n \n author = {\n Duan,\n Yue and Zhao,\n Zhen and Qi,\n Lei and Zhou,\n Luping and Wang,\n Lei and Shi,\n Yinghuan\n},\n title = {\n Towards Semi-supervised Learning with Non-random Missing Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16121-16131\n} \n}" }, { "title": "Towards Understanding the Generalization of Deepfake Detectors from a Game-Theoretical View", @@ -59608,7 +61599,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Hangzhou;Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yao_2023_ICCV,\n \n author = {\n Yao,\n Kelu and Wang,\n Jin and Diao,\n Boyu and Li,\n Chao\n},\n title = {\n Towards Understanding the Generalization of Deepfake Detectors from a Game-Theoretical View\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2031-2041\n} \n}" }, { "title": "Towards Unifying Medical Vision-and-Language Pre-Training via Soft Prompts", @@ -59633,14 +61625,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Towards_Unifying_Medical_Vision-and-Language_Pre-Training_via_Soft_Prompts_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;0+1;3;1", - "aff_unique_norm": "Chinese University of Hong Kong;Shenzhen Research Institute of Big Data;Hong Kong University of Science and Technology;Sun Yat-sen University", + "aff_unique_norm": "The Chinese University of Hong Kong;Shenzhen Research Institute of Big Data;Hong Kong University of Science and Technology;Sun Yat-sen University", "aff_unique_dep": ";;;", "aff_unique_url": 
"https://www.cuhk.edu.cn;http://www.sribd.cn;https://www.ust.hk;http://www.sysu.edu.cn/", "aff_unique_abbr": "CUHK;;HKUST;SYSU", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Shenzhen;;Hong Kong SAR", "aff_country_unique_index": "0+0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zhihong and Diao,\n Shizhe and Wang,\n Benyou and Li,\n Guanbin and Wan,\n Xiang\n},\n title = {\n Towards Unifying Medical Vision-and-Language Pre-Training via Soft Prompts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23403-23413\n} \n}" }, { "title": "Towards Universal Image Embeddings: A Large-Scale Dataset and Challenge for Generic Image Representations", @@ -59648,8 +61641,8 @@ "status": "Poster", "track": "main", "pid": "7232", - "author_site": "Nikolaos-Antonios Ypsilantis, Kaifeng Chen, Bingyi Cao, M\u00e1rio Lipovsk\u00fd, Pelin Dogan-Sch\u00f6nberger, Grzegorz Makosa, Boris Bluntschli, Mojtaba Seyedhosseini, Ond?ej Chum, Andr\u00e9 Araujo", - "author": "Nikolaos-Antonios Ypsilantis; Kaifeng Chen; Bingyi Cao; M\u00e1rio Lipovsk\u00fd; Pelin Dogan-Sch\u00f6nberger; Grzegorz Makosa; Boris Bluntschli; Mojtaba Seyedhosseini; Ond\u0159ej Chum; Andr\u00e9 Araujo", + "author_site": "Nikolaos-Antonios Ypsilantis, Kaifeng Chen, Bingyi Cao, Mário Lipovský, Pelin Dogan-Schönberger, Grzegorz Makosa, Boris Bluntschli, Mojtaba Seyedhosseini, Ondřej Chum, André Araujo", + "author": "Nikolaos-Antonios Ypsilantis; Kaifeng Chen; Bingyi Cao; Mário Lipovský; Pelin Dogan-Schönberger; Grzegorz Makosa; Boris Bluntschli; Mojtaba Seyedhosseini; Ondřej Chum; André Araujo", "abstract": "Fine-grained and instance-level recognition methods are commonly trained and evaluated on specific domains, in a model per domain scenario. 
Such an approach, however, is impractical in real large-scale applications. In this work, we address the problem of universal image embedding, where a single universal model is trained and used in multiple domains. First, we leverage existing domain-specific datasets to carefully construct a new large-scale public benchmark for the evaluation of universal image embeddings, with 241k query images, 1.4M index images and 2.8M training images across 8 different domains and 349k classes.\n We define suitable metrics, training and evaluation protocols to foster future research in this area. Second, we provide a comprehensive experimental evaluation on the new dataset, demonstrating that existing approaches and simplistic extensions lead to worse performance than an assembly of models trained for each domain separately. Finally, we conducted a public research competition on this topic, leveraging industrial datasets, which attracted the participation of more than 1k teams worldwide. This exercise generated many interesting research ideas and findings which we present in detail. 
Project webpage: https://cmp.felk.cvut.cz/univ_emb/", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ypsilantis_Towards_Universal_Image_Embeddings_A_Large-Scale_Dataset_and_Challenge_for_ICCV_2023_paper.pdf", "aff": "VRG, FEE, Czech Technical University in Prague; Google; Google; Google; Google; Google; Google; Google; VRG, FEE, Czech Technical University in Prague; Google", @@ -59666,13 +61659,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ypsilantis_Towards_Universal_Image_Embeddings_A_Large-Scale_Dataset_and_Challenge_for_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1;1;1;0;1", "aff_unique_norm": "Czech Technical University in Prague;Google", - "aff_unique_dep": "Faculty of Electrical Engineering;Google", + "aff_unique_dep": "Faculty of Electrical Engineering;", "aff_unique_url": "https://www.fel.cvut.cz;https://www.google.com", "aff_unique_abbr": "CTU;Google", "aff_campus_unique_index": "1;1;1;1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;1;1;1;1;0;1", - "aff_country_unique": "Czech Republic;United States" + "aff_country_unique": "Czech Republic;United States", + "bibtex": "@InProceedings{Ypsilantis_2023_ICCV,\n \n author = {\n Ypsilantis,\n Nikolaos-Antonios and Chen,\n Kaifeng and Cao,\n Bingyi and Lipovsk\\'y,\n M\\'ario and Dogan-Sch\\"onberger,\n Pelin and Makosa,\n Grzegorz and Bluntschli,\n Boris and Seyedhosseini,\n Mojtaba and Chum,\n Ond\\v{r\n}ej and Araujo,\n Andr\\'e\n},\n title = {\n Towards Universal Image Embeddings: A Large-Scale Dataset and Challenge for Generic Image Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11290-11301\n} \n}" }, { "title": "Towards Universal LiDAR-Based 3D Object Detection by Multi-Domain Knowledge Transfer", @@ -59684,7 +61678,7 @@ "author": "Guile Wu; Tongtong Cao; Bingbing Liu; Xingxin Chen; Yuan 
Ren", "abstract": "Contemporary LiDAR-based 3D object detection methods mostly focus on single-domain learning or cross-domain adaptive learning. However, for autonomous driving systems, optimizing a specific LiDAR-based 3D object detector for each domain is costly and lacks of scalability in real-world deployment. It is desirable to train a universal LiDAR-based 3D object detector from multiple domains. In this work, we propose the first attempt to explore multi-domain learning and generalization for LiDAR-based 3D object detection. We show that jointly optimizing a 3D object detector from multiple domains achieves better generalization capability compared to the conventional single-domain learning model. To explore informative knowledge across domains towards a universal 3D object detector, we propose a multi-domain knowledge transfer framework with universal feature transformation. This approach leverages spatial-wise and channel-wise knowledge across domains to learn universal feature representations, so it facilitates to optimize a universal 3D object detector for deployment at different domains. 
Extensive experiments on four benchmark datasets (Waymo, KITTI, NuScenes and ONCE) show the superiority of our approach over the state-of-the-art approaches for multi-domain learning and generalization in LiDAR-based 3D object detection.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wu_Towards_Universal_LiDAR-Based_3D_Object_Detection_by_Multi-Domain_Knowledge_Transfer_ICCV_2023_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "", @@ -59698,13 +61692,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_Towards_Universal_LiDAR-Based_3D_Object_Detection_by_Multi-Domain_Knowledge_Transfer_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Huawei", - "aff_unique_dep": "Noah\u2019s Ark Lab", + "aff_unique_dep": "Noah’s Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Guile and Cao,\n Tongtong and Liu,\n Bingbing and Chen,\n Xingxin and Ren,\n Yuan\n},\n title = {\n Towards Universal LiDAR-Based 3D Object Detection by Multi-Domain Knowledge Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8669-8678\n} \n}" }, { "title": "Towards Unsupervised Domain Generalization for Face Anti-Spoofing", @@ -59736,7 +61731,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0;0;0", - "aff_country_unique": "China;United States" + 
"aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Yuchen and Chen,\n Yabo and Gou,\n Mengran and Huang,\n Chun-Ting and Wang,\n Yaoming and Dai,\n Wenrui and Xiong,\n Hongkai\n},\n title = {\n Towards Unsupervised Domain Generalization for Face Anti-Spoofing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20654-20664\n} \n}" }, { "title": "Towards Viewpoint Robustness in Bird's Eye View Segmentation", @@ -59761,14 +61757,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Klinghoffer_Towards_Viewpoint_Robustness_in_Birds_Eye_View_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0+1;2+1+3;2+1+3;2+1+3;2;2+4;0;2+1+3;2", - "aff_unique_norm": "Massachusetts Institute of Technology;University of Toronto;NVIDIA;Vector Institute;University of California, Los Angeles", - "aff_unique_dep": ";;NVIDIA Corporation;;", + "aff_unique_norm": "Massachusetts Institute of Technology;University of Toronto;NVIDIA Corporation;Vector Institute;University of California, Los Angeles", + "aff_unique_dep": ";;;;", "aff_unique_url": "https://web.mit.edu;https://www.utoronto.ca;https://www.nvidia.com;https://vectorinstitute.ai/;https://www.ucla.edu", "aff_unique_abbr": "MIT;U of T;NVIDIA;Vector Institute;UCLA", "aff_campus_unique_index": ";;;;1;", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0+1;0+1+1;0+1+1;0+1+1;0;0+0;0;0+1+1;0", - "aff_country_unique": "United States;Canada" + "aff_country_unique": "United States;Canada", + "bibtex": "@InProceedings{Klinghoffer_2023_ICCV,\n \n author = {\n Klinghoffer,\n Tzofi and Philion,\n Jonah and Chen,\n Wenzheng and Litany,\n Or and Gojcic,\n Zan and Joo,\n Jungseock and Raskar,\n Ramesh and Fidler,\n Sanja and Alvarez,\n Jose M.\n},\n title = {\n Towards Viewpoint Robustness in Bird's Eye View Segmentation\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8515-8524\n} \n}" }, { "title": "Towards Viewpoint-Invariant Visual Recognition via Adversarial Training", @@ -59793,14 +61790,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ruan_Towards_Viewpoint-Invariant_Visual_Recognition_via_Adversarial_Training_ICCV_2023_paper.html", "aff_unique_index": "0;1+2+3+4;1;5;1;0", - "aff_unique_norm": "Beihang University;Tsinghua University;RealAI;Pengcheng Laboratory;Pazhou Laboratory;OPPO", - "aff_unique_dep": "Institute of Artificial Intelligence;Dept. of Comp. Sci. and Tech.;;Peng Cheng Laboratory;;", + "aff_unique_norm": "Beihang University;Tsinghua University;RealAI;Peng Cheng Laboratory;Pazhou Laboratory;OPPO", + "aff_unique_dep": "Institute of Artificial Intelligence;Dept. of Comp. Sci. and Tech.;;;;", "aff_unique_url": "http://www.buaa.edu.cn;https://www.tsinghua.edu.cn;https://www.realai.co;http://www.pcl.ac.cn;;https://www.oppo.com", - "aff_unique_abbr": "BUAA;THU;RealAI;PCL;;OPPO", + "aff_unique_abbr": "Beihang;THU;RealAI;PCL;;OPPO", "aff_campus_unique_index": "1", "aff_campus_unique": ";Huangpu", "aff_country_unique_index": "0;0+1+0+0;0;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Ruan_2023_ICCV,\n \n author = {\n Ruan,\n Shouwei and Dong,\n Yinpeng and Su,\n Hang and Peng,\n Jianteng and Chen,\n Ning and Wei,\n Xingxing\n},\n title = {\n Towards Viewpoint-Invariant Visual Recognition via Adversarial Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4709-4719\n} \n}" }, { "title": "Towards Zero Domain Gap: A Comprehensive Study of Realistic LiDAR Simulation for Autonomy Testing", @@ -59808,8 +61806,8 @@ "status": 
"Poster", "track": "main", "pid": "10898", - "author_site": "Sivabalan Manivasagam, Ioan Andrei B\u00e2rsan, Jingkang Wang, Ze Yang, Raquel Urtasun", - "author": "Sivabalan Manivasagam; Ioan Andrei B\u00e2rsan; Jingkang Wang; Ze Yang; Raquel Urtasun", + "author_site": "Sivabalan Manivasagam, Ioan Andrei Bârsan, Jingkang Wang, Ze Yang, Raquel Urtasun", + "author": "Sivabalan Manivasagam; Ioan Andrei Bârsan; Jingkang Wang; Ze Yang; Raquel Urtasun", "abstract": "Testing the full autonomy system in simulation is the safest and most scalable way to evaluate autonomous vehicle performance before deployment. This requires simulating sensor inputs such as LiDAR. To be effective, it is essential that the simulation has low domain gap with the real world. That is, the autonomy system in simulation should perform exactly the same way it would in the real world for the same scenario. To date, there has been limited analysis into what aspects of LiDAR phenomena affect autonomy performance. It is also difficult to evaluate the domain gap of existing LiDAR simulators, as they operate on fully synthetic scenes. In this paper, we propose a novel \"paired-scenario\" approach to evaluating the domain gap of a LiDAR simulator by reconstructing digital twins of real world scenarios. We can then simulate LiDAR in the scene and compare it to the real LiDAR. We leverage this setting to analyze what aspects of LiDAR simulation, such as pulse phenomena, scanning effects, and asset quality, affect the domain gap with respect to the autonomy system, including perception, prediction, and motion planning, and analyze how modifications to the simulated LiDAR influence each part. We identify key aspects that are important to model, such as motion blur, material reflectance, and the accurate geometric reconstruction of traffic participants. This helps provide research directions for improving LiDAR simulation and autonomy robustness to these effects. 
For more information, please visit the project website: https://waabi.ai/lidar-dg", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Manivasagam_Towards_Zero_Domain_Gap_A_Comprehensive_Study_of_Realistic_LiDAR_ICCV_2023_paper.pdf", "aff": "Waabi+University of Toronto; Waabi+University of Toronto; Waabi+University of Toronto; Waabi+University of Toronto; Waabi+University of Toronto", @@ -59832,7 +61830,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1;1;1", - "aff_country_unique": ";Canada" + "aff_country_unique": ";Canada", + "bibtex": "@InProceedings{Manivasagam_2023_ICCV,\n \n author = {\n Manivasagam,\n Sivabalan and B\\^arsan,\n Ioan Andrei and Wang,\n Jingkang and Yang,\n Ze and Urtasun,\n Raquel\n},\n title = {\n Towards Zero Domain Gap: A Comprehensive Study of Realistic LiDAR Simulation for Autonomy Testing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8272-8282\n} \n}" }, { "title": "Towards Zero-Shot Scale-Aware Monocular Depth Estimation", @@ -59841,7 +61840,7 @@ "track": "main", "pid": "4232", "author_site": "Vitor Guizilini, Igor Vasiljevic, Dian Chen, Rare? Ambru?, Adrien Gaidon", - "author": "Vitor Guizilini; Igor Vasiljevic; Dian Chen; Rare\u0219 Ambru\u0219; Adrien Gaidon", + "author": "Vitor Guizilini; Igor Vasiljevic; Dian Chen; Rareș Ambruș; Adrien Gaidon", "abstract": "Monocular depth estimation is scale-ambiguous, and thus requires scale supervision to produce metric predictions. Even so, the resulting models will be geometry-specific, with learned scales that cannot be directly transferred across domains. Because of that, recent works focus instead on relative depth, eschewing scale in favor of improved up-to-scale zero-shot transfer. 
In this work we introduce ZeroDepth, a novel monocular depth estimation framework capable of predicting metric scale for arbitrary test images from different domains and camera parameters. This is achieved by (i) the use of input-level geometric embeddings that enable the network to learn a scale prior over objects; and (ii) decoupling the encoder and decoder stages, via a variational latent representation that is conditioned on single frame information. We evaluated ZeroDepth targeting both outdoor (KITTI, DDAD, nuScenes) and indoor (NYUv2) benchmarks, and achieved a new state-of-the-art in both settings using the same pre-trained model, outperforming methods that train on in-domain data and require test-time scaling to produce metric estimates.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Guizilini_Towards_Zero-Shot_Scale-Aware_Monocular_Depth_Estimation_ICCV_2023_paper.pdf", "aff": "Toyota Research Institute (TRI), Los Altos, CA; Toyota Research Institute (TRI), Los Altos, CA; Toyota Research Institute (TRI), Los Altos, CA; Toyota Research Institute (TRI), Los Altos, CA; Toyota Research Institute (TRI), Los Altos, CA", @@ -59859,12 +61858,13 @@ "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Toyota Research Institute", "aff_unique_dep": "", - "aff_unique_url": "https://www.tri.global", + "aff_unique_url": "https://www.tri.toyota.com", "aff_unique_abbr": "TRI", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Los Altos", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Guizilini_2023_ICCV,\n \n author = {\n Guizilini,\n Vitor and Vasiljevic,\n Igor and Chen,\n Dian and Ambruș,\n Rareș and Gaidon,\n Adrien\n},\n title = {\n Towards Zero-Shot Scale-Aware Monocular Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 9233-9243\n} \n}" }, { "title": "Tracing the Origin of Adversarial Attack for Forensic Investigation and Deterrence", @@ -59890,13 +61890,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fang_Tracing_the_Origin_of_Adversarial_Attack_for_Forensic_Investigation_and_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;1;0", "aff_unique_norm": "National University of Singapore;Huawei", - "aff_unique_dep": ";Huawei", + "aff_unique_dep": ";", "aff_unique_url": "https://www.nus.edu.sg;https://www.huawei.com", "aff_unique_abbr": "NUS;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;1;0", - "aff_country_unique": "Singapore;China" + "aff_country_unique": "Singapore;China", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Han and Zhang,\n Jiyi and Qiu,\n Yupeng and Liu,\n Jiayang and Xu,\n Ke and Fang,\n Chengfang and Chang,\n Ee-Chien\n},\n title = {\n Tracing the Origin of Adversarial Attack for Forensic Investigation and Deterrence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4335-4344\n} \n}" }, { "title": "TrackFlow: Multi-Object tracking with Normalizing Flows", @@ -59928,7 +61929,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Mancusi_2023_ICCV,\n \n author = {\n Mancusi,\n Gianluca and Panariello,\n Aniello and Porrello,\n Angelo and Fabbri,\n Matteo and Calderara,\n Simone and Cucchiara,\n Rita\n},\n title = {\n TrackFlow: Multi-Object tracking with Normalizing Flows\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9531-9543\n} \n}" }, { "title": "Tracking Anything with 
Decoupled Video Segmentation", @@ -59953,14 +61955,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Cheng_Tracking_Anything_with_Decoupled_Video_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;1", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Adobe", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Adobe", "aff_unique_dep": ";Adobe Research", "aff_unique_url": "https://illinois.edu;https://research.adobe.com", "aff_unique_abbr": "UIUC;Adobe", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Cheng_2023_ICCV,\n \n author = {\n Cheng,\n Ho Kei and Oh,\n Seoung Wug and Price,\n Brian and Schwing,\n Alexander and Lee,\n Joon-Young\n},\n title = {\n Tracking Anything with Decoupled Video Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1316-1326\n} \n}" }, { "title": "Tracking Everything Everywhere All at Once", @@ -59983,7 +61986,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Tracking_Everything_Everywhere_All_at_Once_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Tracking_Everything_Everywhere_All_at_Once_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Qianqian and Chang,\n Yen-Yu and Cai,\n Ruojin and Li,\n Zhengqi and Hariharan,\n Bharath and Holynski,\n Aleksander and Snavely,\n Noah\n},\n title = {\n Tracking Everything Everywhere All at Once\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
19795-19806\n} \n}" }, { "title": "Tracking by 3D Model Estimation of Unknown Objects in Videos", @@ -59991,8 +61995,8 @@ "status": "Poster", "track": "main", "pid": "4135", - "author_site": "Denys Rozumnyi, Ji?\u00ed Matas, Marc Pollefeys, Vittorio Ferrari, Martin R. Oswald", - "author": "Denys Rozumnyi; Ji\u0159\u00ed Matas; Marc Pollefeys; Vittorio Ferrari; Martin R. Oswald", + "author_site": "Denys Rozumnyi, Ji?í Matas, Marc Pollefeys, Vittorio Ferrari, Martin R. Oswald", + "author": "Denys Rozumnyi; Jiří Matas; Marc Pollefeys; Vittorio Ferrari; Martin R. Oswald", "abstract": "Most model-free visual object tracking methods formulate the tracking task as object location estimation given by a 2D segmentation or a bounding box in each video frame. We argue that this representation is limited and instead propose to guide and improve 2D tracking with an explicit object representation, namely the textured 3D shape and 6DoF pose in each video frame. Our representation tackles a complex long-term dense correspondence problem between all 3D points on the object for all video frames, including frames where some points are invisible. To achieve that, the estimation is driven by re-rendering the input video frames as well as possible through differentiable rendering, which has not been used for tracking before. The proposed optimization minimizes a novel loss function to estimate the best 3D shape, texture, and 6DoF pose. 
We improve the state-of-the-art in 2D segmentation tracking on three different datasets with mostly rigid objects.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Rozumnyi_Tracking_by_3D_Model_Estimation_of_Unknown_Objects_in_Videos_ICCV_2023_paper.pdf", "aff": "Department of Computer Science, ETH Zurich + Czech Technical University in Prague; Czech Technical University in Prague; Department of Computer Science, ETH Zurich + Google Research; Google Research; Department of Computer Science, ETH Zurich + University of Amsterdam", @@ -60015,7 +62019,8 @@ "aff_campus_unique_index": "1;1;2;2;", "aff_campus_unique": ";Prague;Mountain View", "aff_country_unique_index": "0+1;1;0+2;2;0+3", - "aff_country_unique": "Switzerland;Czech Republic;United States;Netherlands" + "aff_country_unique": "Switzerland;Czech Republic;United States;Netherlands", + "bibtex": "@InProceedings{Rozumnyi_2023_ICCV,\n \n author = {\n Rozumnyi,\n Denys and Matas,\n Ji\\v{r\n}{\\'\\i\n} and Pollefeys,\n Marc and Ferrari,\n Vittorio and Oswald,\n Martin R.\n},\n title = {\n Tracking by 3D Model Estimation of Unknown Objects in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14086-14096\n} \n}" }, { "title": "Tracking by Natural Language Specification with Long Short-term Context Decoupling", @@ -60047,7 +62052,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Harbin", "aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Ding and Wu,\n Xiangqian\n},\n title = {\n Tracking by Natural Language Specification with Long Short-term Context Decoupling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14012-14021\n} \n}" }, { 
"title": "Tracking without Label: Unsupervised Multiple Object Tracking via Contrastive Similarity Learning", @@ -60079,7 +62085,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Meng_2023_ICCV,\n \n author = {\n Meng,\n Sha and Shao,\n Dian and Guo,\n Jiacheng and Gao,\n Shan\n},\n title = {\n Tracking without Label: Unsupervised Multiple Object Tracking via Contrastive Similarity Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16264-16273\n} \n}" }, { "title": "Traj-MAE: Masked Autoencoders for Trajectory Prediction", @@ -60091,7 +62098,7 @@ "author": "Hao Chen; Jiaze Wang; Kun Shao; Furui Liu; Jianye Hao; Chenyong Guan; Guangyong Chen; Pheng-Ann Heng", "abstract": "Trajectory prediction has been a crucial task in building a reliable autonomous driving system by anticipating possible dangers. One key issue is to generate consistent trajectory predictions without colliding. To overcome the challenge, we propose an efficient masked autoencoder for trajectory prediction (Traj-MAE) that better represents the complicated behaviors of agents in the driving environment. Specifically, our Traj-MAE employs diverse masking strategies to pre-train the trajectory encoder and map encoder, allowing for the capture of social and temporal information among agents while leveraging the effect of environment from multiple granularities. To address the catastrophic forgetting problem that arises when pre-training the network with multiple masking strategies, we introduce a continual pre-training framework, which can help Traj-MAE learn valuable and diverse information from various strategies efficiently. 
Our experimental results in both multi-agent and single-agent settings demonstrate that Traj-MAE achieves competitive results with state-of-the-art methods and significantly outperforms our baseline model. Project page: https://jiazewang.com/projects/trajmae.html.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_Traj-MAE_Masked_Autoencoders_for_Trajectory_Prediction_ICCV_2023_paper.pdf", - "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong; Department of Computer Science and Engineering, The Chinese University of Hong Kong; Huawei Noah\u2019s Ark Lab; Zhejiang Lab; Huawei Noah\u2019s Ark Lab+Tianjin University; Gudsen Technology Co. Ltd; Zhejiang Lab; Department of Computer Science and Engineering, The Chinese University of Hong Kong+Institute of Medical Intelligence and XR, The Chinese University of Hong Kong", + "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong; Department of Computer Science and Engineering, The Chinese University of Hong Kong; Huawei Noah’s Ark Lab; Zhejiang Lab; Huawei Noah’s Ark Lab+Tianjin University; Gudsen Technology Co. 
Ltd; Zhejiang Lab; Department of Computer Science and Engineering, The Chinese University of Hong Kong+Institute of Medical Intelligence and XR, The Chinese University of Hong Kong", "project": "https://jiazewang.com/projects/trajmae.html", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Chen_Traj-MAE_Masked_Autoencoders_ICCV_2023_supplemental.pdf", @@ -60104,14 +62111,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Traj-MAE_Masked_Autoencoders_for_Trajectory_Prediction_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;1+3;4;2;0+0", - "aff_unique_norm": "Chinese University of Hong Kong;Huawei;Zhejiang Lab;Tianjin University;Gudsen Technology", - "aff_unique_dep": "Department of Computer Science and Engineering;Noah\u2019s Ark Lab;;;Technology", + "aff_unique_norm": "The Chinese University of Hong Kong;Huawei;Zhejiang Lab;Tianjin University;Gudsen Technology", + "aff_unique_dep": "Department of Computer Science and Engineering;Noah’s Ark Lab;;;Technology", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.huawei.com;http://www.zhejianglab.com;http://www.tju.edu.cn;", "aff_unique_abbr": "CUHK;Huawei;;TJU;", "aff_campus_unique_index": "0;0;;0+0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Hao and Wang,\n Jiaze and Shao,\n Kun and Liu,\n Furui and Hao,\n Jianye and Guan,\n Chenyong and Chen,\n Guangyong and Heng,\n Pheng-Ann\n},\n title = {\n Traj-MAE: Masked Autoencoders for Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8351-8362\n} \n}" }, { "title": "TrajPAC: Towards Robustness Verification of Pedestrian Trajectory Prediction Models", @@ -60143,7 +62151,8 @@ 
"aff_campus_unique_index": "0+0;0;0;0+0;1;0+0", "aff_campus_unique": "Beijing;Nanjing", "aff_country_unique_index": "0+0;0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Liang and Xu,\n Nathaniel and Yang,\n Pengfei and Jin,\n Gaojie and Huang,\n Cheng-Chao and Zhang,\n Lijun\n},\n title = {\n TrajPAC: Towards Robustness Verification of Pedestrian Trajectory Prediction Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8327-8339\n} \n}" }, { "title": "Trajectory Unified Transformer for Pedestrian Trajectory Prediction", @@ -60155,7 +62164,7 @@ "author": "Liushuai Shi; Le Wang; Sanping Zhou; Gang Hua", "abstract": "Pedestrian trajectory prediction is an essentially connecting link to understanding human behavior. Recent works achieve state-of-the-art performance gained from the hand-designed post-processing, e.g., clustering. However, this post-processing suffers from expensive inference time and neglects the probability of the predicted trajectory disturbing downstream safety decisions. In this paper, we present Trajectory Unified TRansformer, called TUTR, which unifies the trajectory prediction components, social interaction and multimodal trajectory prediction, into a transformer encoder-decoder architecture to effectively remove the need for post-processing. Specifically, TUTR parses the relationships across various motion modes by an explicit global prediction and an implicit mode-level transformer encoder. Then, TUTR attends to the social interactions with neighbors by a social-level transformer decoder. Finally, a dual prediction forecasts diverse trajectories and corresponding probabilities in parallel without post-processing. 
TUTR achieves state-of-the-art accuracy performance and about 10x - 40x inference speed improvements compared with previous well-tuning state-of-the-art methods using post-processing.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Shi_Trajectory_Unified_Transformer_for_Pedestrian_Trajectory_Prediction_ICCV_2023_paper.pdf", - "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi\u2019an Jiaotong University; Wormpex AI Research", + "aff": "National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, Institute of Artificial Intelligence and Robotics, Xi’an Jiaotong University; Wormpex AI Research", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Shi_Trajectory_Unified_Transformer_ICCV_2023_supplemental.pdf", @@ -60168,14 +62177,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Shi_Trajectory_Unified_Transformer_for_Pedestrian_Trajectory_Prediction_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1", - "aff_unique_norm": "Xi'an Jiao Tong University;Wormpex AI Research", + "aff_unique_norm": "Xi'an Jiaotong University;Wormpex AI Research", "aff_unique_dep": "Institute of Artificial Intelligence and Robotics;AI Research", "aff_unique_url": "http://www.xjtu.edu.cn;", "aff_unique_abbr": "XJTU;Wormpex AI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0;0;0;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Liushuai and Wang,\n Le and Zhou,\n Sanping and Hua,\n Gang\n},\n title = {\n Trajectory Unified Transformer for Pedestrian Trajectory Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9675-9684\n} \n}" }, { "title": "TrajectoryFormer: 3D Object Tracking Transformer with Predictive Trajectory Hypotheses", @@ -60198,7 +62208,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_TrajectoryFormer_3D_Object_Tracking_Transformer_with_Predictive_Trajectory_Hypotheses_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_TrajectoryFormer_3D_Object_Tracking_Transformer_with_Predictive_Trajectory_Hypotheses_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xuesong and Shi,\n Shaoshuai and Zhang,\n Chao and Zhu,\n Benjin and Wang,\n Qiang and Cheung,\n Ka Chun and See,\n Simon and Li,\n Hongsheng\n},\n title = {\n TrajectoryFormer: 3D Object Tracking Transformer with Predictive Trajectory Hypotheses\n},\n booktitle = {\n Proceedings of the IEEE/CVF 
International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18527-18536\n} \n}" }, { "title": "TransFace: Calibrating Transformer Training for Face Recognition from a Data-Centric Perspective", @@ -60230,7 +62241,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Dan_2023_ICCV,\n \n author = {\n Dan,\n Jun and Liu,\n Yang and Xie,\n Haoyu and Deng,\n Jiankang and Xie,\n Haoran and Xie,\n Xuansong and Sun,\n Baigui\n},\n title = {\n TransFace: Calibrating Transformer Training for Face Recognition from a Data-Centric Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20642-20653\n} \n}" }, { "title": "TransHuman: A Transformer-based Human Representation for Generalizable Neural Human Rendering", @@ -60262,7 +62274,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Pan_2023_ICCV,\n \n author = {\n Pan,\n Xiao and Yang,\n Zongxin and Ma,\n Jianxin and Zhou,\n Chang and Yang,\n Yi\n},\n title = {\n TransHuman: A Transformer-based Human Representation for Generalizable Neural Human Rendering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3544-3555\n} \n}" }, { "title": "TransIFF: An Instance-Level Feature Fusion Framework for Vehicle-Infrastructure Cooperative 3D Detection with Transformers", @@ -60287,14 +62300,15 @@ "author_num": 3, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Chen_TransIFF_An_Instance-Level_Feature_Fusion_Framework_for_Vehicle-Infrastructure_Cooperative_3D_ICCV_2023_paper.html", "aff_unique_index": "0;1;1", - "aff_unique_norm": "Beihang University;Baidu", - "aff_unique_dep": ";Baidu Inc.", + "aff_unique_norm": "Beihang University;Baidu Inc.", + "aff_unique_dep": ";", "aff_unique_url": "http://www.buaa.edu.cn/;https://www.baidu.com", "aff_unique_abbr": "BUAA;Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Ziming and Shi,\n Yifeng and Jia,\n Jinrang\n},\n title = {\n TransIFF: An Instance-Level Feature Fusion Framework for Vehicle-Infrastructure Cooperative 3D Detection with Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18205-18214\n} \n}" }, { "title": "TransTIC: Transferring Transformer-based Image Compression from Human Perception to Machine Perception", @@ -60326,7 +62340,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yi-Hsin and Weng,\n Ying-Chieh and Kao,\n Chia-Hao and Chien,\n Cheng and Chiu,\n Wei-Chen and Peng,\n Wen-Hsiao\n},\n title = {\n TransTIC: Transferring Transformer-based Image Compression from Human Perception to Machine Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23297-23307\n} \n}" }, { "title": "Transferable Adversarial Attack for Both Vision Transformers and Convolutional Networks via Momentum 
Integrated Gradients", @@ -60351,14 +62366,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_Transferable_Adversarial_Attack_for_Both_Vision_Transformers_and_Convolutional_Networks_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0", - "aff_unique_norm": "Tsinghua University;Beijing Jiao Tong University;Beijing Big Data Centre", + "aff_unique_norm": "Tsinghua University;Beijing Jiaotong University;Beijing Big Data Centre", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.njtu.edu.cn/en;", "aff_unique_abbr": "THU;BJTU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Wenshuo and Li,\n Yidong and Jia,\n Xiaofeng and Xu,\n Wei\n},\n title = {\n Transferable Adversarial Attack for Both Vision Transformers and Convolutional Networks via Momentum Integrated Gradients\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4630-4639\n} \n}" }, { "title": "Transferable Decoding with Visual Entities for Zero-Shot Image Captioning", @@ -60383,14 +62399,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fei_Transferable_Decoding_with_Visual_Entities_for_Zero-Shot_Image_Captioning_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;2;3+4;0", - "aff_unique_norm": "Southern University of Science and Technology;University of Hong Kong;Harbin Institute of Technology;Tencent;Shanghai Jiao Tong University", - "aff_unique_dep": ";;;Tencent Holdings Limited;", + "aff_unique_norm": "Southern University of Science and Technology;The University of Hong Kong;Harbin Institute of Technology;Tencent Holdings Limited;Shanghai Jiao Tong University", + 
"aff_unique_dep": ";;;;", "aff_unique_url": "https://www.sustech.edu.cn;https://www.hku.hk;http://en.hhit.edu.cn/;https://www.tencent.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "SUSTech;HKU;HIT;Tencent;SJTU", "aff_campus_unique_index": "1;2;", "aff_campus_unique": ";Hong Kong SAR;Shenzhen", "aff_country_unique_index": "0+0;0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fei_2023_ICCV,\n \n author = {\n Fei,\n Junjie and Wang,\n Teng and Zhang,\n Jinrui and He,\n Zhenyu and Wang,\n Chengjie and Zheng,\n Feng\n},\n title = {\n Transferable Decoding with Visual Entities for Zero-Shot Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3136-3146\n} \n}" }, { "title": "Translating Images to Road Network: A Non-Autoregressive Sequence-to-Sequence Approach", @@ -60402,7 +62419,7 @@ "author": "Jiachen Lu; Renyuan Peng; Xinyue Cai; Hang Xu; Hongyang Li; Feng Wen; Wei Zhang; Li Zhang", "abstract": "The extraction of road network is essential for the generation of high-definition maps since it enables the precise localization of road landmarks and their interconnections. However, generating road network poses a significant challenge due to the conflicting underlying combination of Euclidean (e.g., road landmarks location) and non-Euclidean (e.g., road topological connectivity) structures. Existing methods struggle to merge the two types of data domains effectively, but few of them address it properly. Instead, our work establishes a unified representation of both types of data domain by projecting both Euclidean and non-Euclidean data into an integer series called RoadNet Sequence. 
Further than modeling an auto-regressive sequence-to-sequence Transformer model to understand RoadNet Sequence, we decouple the dependency of RoadNet Sequence into a mixture of auto-regressive and non-autoregressive dependency. Building on this, our proposed non-autoregressive sequence-to-sequence approach leverages non-autoregressive dependencies while fixing the gap towards auto-regressive dependencies, resulting in success on both efficiency and accuracy. Extensive experiments on nuScenes dataset demonstrate the superiority of RoadNet Sequence representation and the non-autoregressive approach compared to existing state-of-the-art alternatives.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lu_Translating_Images_to_Road_Network_A_Non-Autoregressive_Sequence-to-Sequence_Approach_ICCV_2023_paper.pdf", - "aff": "Fudan University; Fudan University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Shanghai AI Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Fudan University", + "aff": "Fudan University; Fudan University; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Shanghai AI Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Fudan University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Lu_Translating_Images_to_ICCV_2023_supplemental.pdf", @@ -60416,13 +62433,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lu_Translating_Images_to_Road_Network_A_Non-Autoregressive_Sequence-to-Sequence_Approach_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;1;2;1;1;0", "aff_unique_norm": "Fudan University;Huawei;Shanghai AI Lab", - "aff_unique_dep": ";Noah\u2019s Ark Lab;", + "aff_unique_dep": ";Noah’s Ark Lab;", "aff_unique_url": "https://www.fudan.edu.cn;https://www.huawei.com;https://www.shanghaiailab.com", "aff_unique_abbr": "Fudan;Huawei;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": 
"China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Jiachen and Peng,\n Renyuan and Cai,\n Xinyue and Xu,\n Hang and Li,\n Hongyang and Wen,\n Feng and Zhang,\n Wei and Zhang,\n Li\n},\n title = {\n Translating Images to Road Network: A Non-Autoregressive Sequence-to-Sequence Approach\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23-33\n} \n}" }, { "title": "Transparent Shape from a Single View Polarization Image", @@ -60454,7 +62472,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Mingqi and Xia,\n Chongkun and Yang,\n Zhendong and Huang,\n Junnan and Wang,\n Xueqian\n},\n title = {\n Transparent Shape from a Single View Polarization Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9277-9286\n} \n}" }, { "title": "Treating Pseudo-labels Generation as Image Matting for Weakly Supervised Semantic Segmentation", @@ -60486,7 +62505,8 @@ "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Changwei and Xu,\n Rongtao and Xu,\n Shibiao and Meng,\n Weiliang and Zhang,\n Xiaopeng\n},\n title = {\n Treating Pseudo-labels Generation as Image Matting for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 755-765\n} \n}" }, { "title": 
"Tree-Structured Shading Decomposition", @@ -60518,7 +62538,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Geng_2023_ICCV,\n \n author = {\n Geng,\n Chen and Yu,\n Hong-Xing and Zhang,\n Sharon and Agrawala,\n Maneesh and Wu,\n Jiajun\n},\n title = {\n Tree-Structured Shading Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 488-498\n} \n}" }, { "title": "Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural Radiance Fields", @@ -60545,12 +62566,13 @@ "aff_unique_index": "0;0+1;0;0;2;0;0", "aff_unique_norm": "PICO;Tsinghua University;Chinese Academy of Sciences", "aff_unique_dep": ";;Institute of Computing Technology", - "aff_unique_url": ";https://www.tsinghua.edu.cn;http://www.ict.ac.cn", + "aff_unique_url": ";https://www.tsinghua.edu.cn;http://www.ict.cas.cn", "aff_unique_abbr": ";THU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Wenbo and Wang,\n Yuling and Ma,\n Lin and Yang,\n Bangbang and Gao,\n Lin and Liu,\n Xiao and Ma,\n Yuewen\n},\n title = {\n Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19774-19783\n} \n}" }, { "title": "TripLe: Revisiting Pretrained Model Reuse and Progressive Learning for Efficient Vision Transformer Scaling and Searching", @@ -60562,7 +62584,7 @@ "author": "Cheng Fu; Hanxian Huang; Zixuan Jiang; Yun Ni; Lifeng Nai; 
Gang Wu; Liqun Cheng; Yanqi Zhou; Sheng Li; Andrew Li; Jishen Zhao", "abstract": "One promising way to accelerate transformer training is to reuse small pretrained models to initialize the transformer, as their existing representation power facilitates faster model convergence. Previous works designed expansion operators to scale up pretrained models to the target model before training. Yet, model functionality is difficult to preserve when scaling a transformer in all dimensions at once. Moreover, maintaining the pretrained optimizer states for weights is critical for model scaling, whereas the new weights added during expansion lack these states in pretrained models. To address these issues, we propose TripLe, which partially scales a model before training, while growing the rest of the new parameters during training by copying both the warmed-up weights with the optimizer states from existing weights. As such, the new parameters introduced during training will obtain their training states. Furthermore, through serializing the scaling of model width and depth, the functionality of each expansion can be preserved. We evaluate TripLe in both single-trial model scaling and multi-trial neural architecture search (NAS). Due to the fast training convergence of TripLe, the proxy accuracy from TripLe better reveals the model quality compared to from-scratch training in multi-trial NAS. Experiments show that TripLe outperforms both from-scratch training and knowledge distillation (KD) in both training time and task performance. TripLe can also be combined with KD to achieve an even higher task accuracy. 
For NAS, the model obtained from TripLe outperforms DeiT-B in task accuracy with 69% reduction in parameter size and FLOPs.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Fu_TripLe_Revisiting_Pretrained_Model_Reuse_and_Progressive_Learning_for_Efficient_ICCV_2023_paper.pdf", - "aff": "UC San Diego\u2020; Google\u2021; Google\u2021; Google\u2021; Google\u2021; Google\u2021; Google\u2021; Google\u2021; Google\u2021; Google\u2021; Google\u2021", + "aff": "UC San Diego†; Google‡; Google‡; Google‡; Google‡; Google‡; Google‡; Google‡; Google‡; Google‡; Google‡", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Fu_TripLe_Revisiting_Pretrained_ICCV_2023_supplemental.pdf", @@ -60576,13 +62598,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fu_TripLe_Revisiting_Pretrained_Model_Reuse_and_Progressive_Learning_for_Efficient_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1;1;1;1;1;1;1", "aff_unique_norm": "University of California, San Diego;Google", - "aff_unique_dep": ";Google", + "aff_unique_dep": ";", "aff_unique_url": "https://ucsd.edu;https://www.google.com", "aff_unique_abbr": "UCSD;Google", "aff_campus_unique_index": "0;1;1;1;1;1;1;1;1;1;1", "aff_campus_unique": "San Diego;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Cheng and Huang,\n Hanxian and Jiang,\n Zixuan and Ni,\n Yun and Nai,\n Lifeng and Wu,\n Gang and Cheng,\n Liqun and Zhou,\n Yanqi and Li,\n Sheng and Li,\n Andrew and Zhao,\n Jishen\n},\n title = {\n TripLe: Revisiting Pretrained Model Reuse and Progressive Learning for Efficient Vision Transformer Scaling and Searching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17153-17163\n} 
\n}" }, { "title": "Troubleshooting Ethnic Quality Bias with Curriculum Domain Adaptation for Face Image Quality Assessment", @@ -60614,7 +62637,8 @@ "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Tianjin;Hong Kong", "aff_country_unique_index": "0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ou_2023_ICCV,\n \n author = {\n Ou,\n Fu-Zhao and Chen,\n Baoliang and Li,\n Chongyi and Wang,\n Shiqi and Kwong,\n Sam\n},\n title = {\n Troubleshooting Ethnic Quality Bias with Curriculum Domain Adaptation for Face Image Quality Assessment\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20718-20729\n} \n}" }, { "title": "Tube-Link: A Flexible Cross Tube Framework for Universal Video Segmentation", @@ -60646,7 +62670,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1+2;2;0", - "aff_country_unique": "Singapore;United Kingdom;China" + "aff_country_unique": "Singapore;United Kingdom;China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Xiangtai and Yuan,\n Haobo and Zhang,\n Wenwei and Cheng,\n Guangliang and Pang,\n Jiangmiao and Loy,\n Chen Change\n},\n title = {\n Tube-Link: A Flexible Cross Tube Framework for Universal Video Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13923-13933\n} \n}" }, { "title": "Tubelet-Contrastive Self-Supervision for Video-Efficient Generalization", @@ -60669,7 +62694,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Thoker_Tubelet-Contrastive_Self-Supervision_for_Video-Efficient_Generalization_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Thoker_Tubelet-Contrastive_Self-Supervision_for_Video-Efficient_Generalization_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Thoker_2023_ICCV,\n \n author = {\n Thoker,\n Fida Mohammad and Doughty,\n Hazel and Snoek,\n Cees G. M.\n},\n title = {\n Tubelet-Contrastive Self-Supervision for Video-Efficient Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13812-13823\n} \n}" }, { "title": "Tune-A-Video: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation", @@ -60701,7 +62727,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;1;2", - "aff_country_unique": ";Singapore;China" + "aff_country_unique": ";Singapore;China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Jay Zhangjie and Ge,\n Yixiao and Wang,\n Xintao and Lei,\n Stan Weixian and Gu,\n Yuchao and Shi,\n Yufei and Hsu,\n Wynne and Shan,\n Ying and Qie,\n Xiaohu and Shou,\n Mike Zheng\n},\n title = {\n Tune-A-Video: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7623-7633\n} \n}" }, { "title": "Tuning Pre-trained Model via Moment Probing", @@ -60726,14 +62753,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Gao_Tuning_Pre-trained_Model_via_Moment_Probing_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;0;1", - "aff_unique_norm": "Tianjin University;Baidu", + "aff_unique_norm": "Tianjin University;Baidu Research", "aff_unique_dep": "College of Intelligence and Computing;Business Intelligence Lab", "aff_unique_url": "https://www.tju.edu.cn;https://baidu.com", "aff_unique_abbr": "Tianjin University;Baidu", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Gao_2023_ICCV,\n \n author = {\n Gao,\n Mingze and Wang,\n Qilong and Lin,\n Zhenyi and Zhu,\n Pengfei and Hu,\n Qinghua and Zhou,\n Jingbo\n},\n title = {\n Tuning Pre-trained Model via Moment Probing\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11803-11813\n} \n}" }, { "title": "Two Birds, One Stone: A Unified Framework for Joint Learning of Image and Video Style Transfers", @@ -60760,12 +62788,13 @@ "aff_unique_index": "0+1;2;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;University of North Texas", "aff_unique_dep": "Institute of Software;;Department of Computer Science and Engineering", - "aff_unique_url": "http://english.ios.ac.cn/;http://www.ucas.ac.cn;https://www.unt.edu", + "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;https://www.unt.edu", "aff_unique_abbr": "CAS;UCAS;UNT", "aff_campus_unique_index": "0+0;1;0+0", "aff_campus_unique": "Beijing;Denton", "aff_country_unique_index": "0+0;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Gu_2023_ICCV,\n \n author = {\n Gu,\n Bohai and Fan,\n Heng and Zhang,\n Libo\n},\n title = {\n Two Birds,\n One Stone: A Unified Framework for Joint Learning of Image and Video Style Transfers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23545-23554\n} \n}" }, { "title": "Two-in-One Depth: Bridging the Gap Between Monocular and Binocular Self-Supervised Depth Estimation", @@ -60797,7 +62826,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Zhengming and Dong,\n Qiulei\n},\n title = {\n Two-in-One Depth: Bridging the Gap Between Monocular and Binocular Self-Supervised Depth Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9411-9421\n} \n}" }, { "title": "U-RED: Unsupervised 3D Shape Retrieval and Deformation for Partial Point Clouds", @@ -60822,14 +62852,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Di_U-RED_Unsupervised_3D_Shape_Retrieval_and_Deformation_for_Partial_Point_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;2;3;3;3;1;0+2", - "aff_unique_norm": "Technical University of Munich;Tsinghua University;Google;Deutsches Forschungszentrum f\u00fcr K\u00fcnstliche Intelligenz", - "aff_unique_dep": ";;Google;", + "aff_unique_norm": "Technical University of Munich;Tsinghua University;Google;Deutsches Forschungszentrum für Künstliche Intelligenz", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tum.de;https://www.tsinghua.edu.cn;https://www.google.com;https://www.dfki.de", "aff_unique_abbr": "TUM;THU;Google;DFKI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;2;0;0;0;1;0+2", - "aff_country_unique": "Germany;China;United States" + "aff_country_unique": "Germany;China;United States", + "bibtex": "@InProceedings{Di_2023_ICCV,\n \n author = {\n Di,\n Yan and Zhang,\n Chenyangguang and Zhang,\n Ruida and Manhardt,\n Fabian and Su,\n Yongzhi and Rambach,\n Jason and Stricker,\n Didier and Ji,\n Xiangyang and Tombari,\n Federico\n},\n title = {\n U-RED: Unsupervised 3D Shape Retrieval and Deformation for Partial Point Clouds\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8884-8895\n} \n}" }, { "title": "UATVR: Uncertainty-Adaptive Text-Video Retrieval", @@ -60852,7 +62883,8 @@ "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "author_num": 9, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fang_UATVR_Uncertainty-Adaptive_Text-Video_Retrieval_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fang_UATVR_Uncertainty-Adaptive_Text-Video_Retrieval_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Bo and Wu,\n Wenhao and Liu,\n Chang and Zhou,\n Yu and Song,\n Yuxin and Wang,\n Weiping and Shu,\n Xiangbo and Ji,\n Xiangyang and Wang,\n Jingdong\n},\n title = {\n UATVR: Uncertainty-Adaptive Text-Video Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13723-13733\n} \n}" }, { "title": "UCF: Uncovering Common Features for Generalizable Deepfake Detection", @@ -60877,14 +62909,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yan_UCF_Uncovering_Common_Features_for_Generalizable_Deepfake_Detection_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0", - "aff_unique_norm": "Chinese University of Hong Kong, Shenzhen;Tencent", + "aff_unique_norm": "The Chinese University of Hong Kong, Shenzhen;Tencent", "aff_unique_dep": "School of Data Science;Tencent AI Lab", "aff_unique_url": "https://www.cuhk.edu.cn;https://ai.tencent.com", "aff_unique_abbr": "CUHK-Shenzhen;Tencent AI Lab", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Zhiyuan and Zhang,\n Yong and Fan,\n Yanbo and Wu,\n Baoyuan\n},\n title = {\n UCF: Uncovering Common 
Features for Generalizable Deepfake Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22412-22423\n} \n}" }, { "title": "UGC: Unified GAN Compression for Efficient Image-to-Image Translation", @@ -60916,7 +62949,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ren_2023_ICCV,\n \n author = {\n Ren,\n Yuxi and Wu,\n Jie and Zhang,\n Peng and Zhang,\n Manlin and Xiao,\n Xuefeng and He,\n Qian and Wang,\n Rui and Zheng,\n Min and Pan,\n Xin\n},\n title = {\n UGC: Unified GAN Compression for Efficient Image-to-Image Translation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17281-17291\n} \n}" }, { "title": "UHDNeRF: Ultra-High-Definition Neural Radiance Fields", @@ -60928,7 +62962,7 @@ "author": "Quewei Li; Feichao Li; Jie Guo; Yanwen Guo", "abstract": "We propose UHDNeRF, a new framework for novel view synthesis on the challenging ultra-high-resolution (e.g., 4K) real-world scenes. Previous NeRF methods are not specifically designed for rendering on extremely high resolutions, leading to burry results with notable detail-losing problems even though trained on 4K images. This is mainly due to the mismatch between the high-resolution inputs and the low-dimensional volumetric representation. To address this issue, we introduce an adaptive implicit-explicit scene representation with which an explicit sparse point cloud is used to boost the performance of an implicit volume on modeling subtle details. 
Specifically, we reconstruct the complex real-world scene with a frequency separation strategy that the implicit volume learns to represent the low-frequency properties of the whole scene, and the sparse point cloud is used for reproducing high-frequency details. To better explore the information embedded in the point cloud, we extract a global structure feature and a local point-wise feature from the point cloud for each sample located in the high-frequency regions. Furthermore, a patch-based sampling strategy is introduced to reduce the computational cost. The high-fidelity rendering results demonstrate the superiority of our method for retaining high-frequency details at 4K ultra-high-resolution scenarios against state-of-the-art NeRF-based solutions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Li_UHDNeRF_Ultra-High-Definition_Neural_Radiance_Fields_ICCV_2023_paper.pdf", - "aff": "National Key Lab for Novel Software Technology, Nanjing University, China; National Key Lab for Novel Software Technology, Nanjing University, China; National Key Lab for Novel Software Technology, Nanjing University, China\u2020; National Key Lab for Novel Software Technology, Nanjing University, China\u2020", + "aff": "National Key Lab for Novel Software Technology, Nanjing University, China; National Key Lab for Novel Software Technology, Nanjing University, China; National Key Lab for Novel Software Technology, Nanjing University, China†; National Key Lab for Novel Software Technology, Nanjing University, China†", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Li_UHDNeRF_Ultra-High-Definition_Neural_ICCV_2023_supplemental.zip", @@ -60948,7 +62982,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Quewei and Li,\n Feichao and Guo,\n Jie 
and Guo,\n Yanwen\n},\n title = {\n UHDNeRF: Ultra-High-Definition Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23097-23108\n} \n}" }, { "title": "UMC: A Unified Bandwidth-efficient and Multi-resolution based Collaborative Perception Framework", @@ -60960,7 +62995,7 @@ "author": "Tianhang Wang; Guang Chen; Kai Chen; Zhengfa Liu; Bo Zhang; Alois Knoll; Changjun Jiang", "abstract": "Multi-agent collaborative perception (MCP) has recently attracted much attention. It includes three key processes: communication for sharing, collaboration for integration, and reconstruction for different downstream tasks. Existing methods pursue designing the collaboration process alone, ignoring their intrinsic interactions and resulting in suboptimal performance. In contrast, we aim to propose a Unified Collaborative perception framework named UMC, optimizing the communication, collaboration, and reconstruction processes with the Multi-resolution technique. The communication introduces a novel trainable multi-resolution and selective-region (MRSR) mechanism, achieving higher quality and lower bandwidth. Then, a graph-based collaboration is proposed, conducting on each resolution to adapt the MRSR. Finally, the reconstruction integrates the multi-resolution collaborative features for downstream tasks. Since the general metric can not reflect the performance enhancement brought by MCP systematically, we introduce a brand-new evaluation metric that evaluates the MCP from different perspectives. To verify our algorithm, we conducted experiments on the V2X-Sim and OPV2V datasets. 
Our quantitative and qualitative experiments prove that the proposed UMC outperforms the state-of-the-art collaborative perception approaches.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_UMC_A_Unified_Bandwidth-efficient_and_Multi-resolution_based_Collaborative_Perception_Framework_ICCV_2023_paper.pdf", - "aff": "Tongji University; Tongji University; Tongji University; Tongji University; Shanghai Westwell Technology Co., Ltd; Technische Universit \u00a8at M \u00a8unchen; Tongji University", + "aff": "Tongji University; Tongji University; Tongji University; Tongji University; Shanghai Westwell Technology Co., Ltd; Technische Universit ¨at M ¨unchen; Tongji University", "project": "", "github": "https://github.com/ispc-lab/UMC", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Wang_UMC_A_Unified_ICCV_2023_supplemental.pdf", @@ -60973,14 +63008,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_UMC_A_Unified_Bandwidth-efficient_and_Multi-resolution_based_Collaborative_Perception_Framework_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;2;0", - "aff_unique_norm": "Tongji University;Shanghai Westwell Technology Co., Ltd;Technische Universit\u00e4t M\u00fcnchen", + "aff_unique_norm": "Tongji University;Shanghai Westwell Technology Co., Ltd;Technische Universität München", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tongji.edu.cn;;https://www.tum.de", "aff_unique_abbr": "Tongji;;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;1;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Tianhang and Chen,\n Guang and Chen,\n Kai and Liu,\n Zhengfa and Zhang,\n Bo and Knoll,\n Alois and Jiang,\n Changjun\n},\n title = {\n UMC: A Unified Bandwidth-efficient and Multi-resolution based Collaborative Perception Framework\n},\n 
booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8187-8196\n} \n}" }, { "title": "UMFuse: Unified Multi View Fusion for Human Editing Applications", @@ -61012,7 +63048,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jain_2023_ICCV,\n \n author = {\n Jain,\n Rishabh and Hemani,\n Mayur and Ceylan,\n Duygu and Singh,\n Krishna Kumar and Lu,\n Jingwan and Sarkar,\n Mausoom and Krishnamurthy,\n Balaji\n},\n title = {\n UMFuse: Unified Multi View Fusion for Human Editing Applications\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7182-7191\n} \n}" }, { "title": "UMIFormer: Mining the Correlations between Similar Tokens for Multi-View 3D Reconstruction", @@ -61044,7 +63081,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Macau SAR", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Zhenwei and Yang,\n Liying and Li,\n Ning and Jiang,\n Chaohao and Liang,\n Yanyan\n},\n title = {\n UMIFormer: Mining the Correlations between Similar Tokens for Multi-View 3D Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18226-18235\n} \n}" }, { "title": "USAGE: A Unified Seed Area Generation Paradigm for Weakly Supervised Semantic Segmentation", @@ -61067,7 +63105,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Peng_USAGE_A_Unified_Seed_Area_Generation_Paradigm_for_Weakly_Supervised_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Peng_USAGE_A_Unified_Seed_Area_Generation_Paradigm_for_Weakly_Supervised_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Peng_2023_ICCV,\n \n author = {\n Peng,\n Zelin and Wang,\n Guanchun and Xie,\n Lingxi and Jiang,\n Dongsheng and Shen,\n Wei and Tian,\n Qi\n},\n title = {\n USAGE: A Unified Seed Area Generation Paradigm for Weakly Supervised Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 624-634\n} \n}" }, { "title": "UnLoc: A Unified Framework for Video Localization Tasks", @@ -61092,14 +63131,15 @@ "author_num": 8, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yan_UnLoc_A_Unified_Framework_for_Video_Localization_Tasks_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;1;0;0;0", - "aff_unique_norm": "Google;University of Illinois Urbana-Champaign", + "aff_unique_norm": "Google;University of Illinois at Urbana-Champaign", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://illinois.edu", "aff_unique_abbr": "Google Research;UIUC", "aff_campus_unique_index": "0;0;0;0;1;0;0;0", "aff_campus_unique": "Mountain View;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yan_2023_ICCV,\n \n author = {\n Yan,\n Shen and Xiong,\n Xuehan and Nagrani,\n Arsha and Arnab,\n Anurag and Wang,\n Zhonghao and Ge,\n Weina and Ross,\n David and Schmid,\n Cordelia\n},\n title = {\n UnLoc: A Unified Framework for Video Localization Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 13623-13633\n} \n}" }, { "title": "Unaligned 2D to 3D Translation with Conditional Vector-Quantized Code Diffusion using Transformers", @@ -61122,7 +63162,8 @@ "aff_domain": ";;;;;;", "email": ";;;;;;", "author_num": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Corona-Figueroa_Unaligned_2D_to_3D_Translation_with_Conditional_Vector-Quantized_Code_Diffusion_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Corona-Figueroa_Unaligned_2D_to_3D_Translation_with_Conditional_Vector-Quantized_Code_Diffusion_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Corona-Figueroa_2023_ICCV,\n \n author = {\n Corona-Figueroa,\n Abril and Bond-Taylor,\n Sam and Bhowmik,\n Neelanjan and Gaus,\n Yona Falinie A. and Breckon,\n Toby P. and Shum,\n Hubert P. H. and Willcocks,\n Chris G.\n},\n title = {\n Unaligned 2D to 3D Translation with Conditional Vector-Quantized Code Diffusion using Transformers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14585-14594\n} \n}" }, { "title": "Uncertainty Guided Adaptive Warping for Robust and Efficient Stereo Matching", @@ -61154,7 +63195,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;1;0;0;0;0;0;2;2", - "aff_country_unique": "China;Singapore;Canada" + "aff_country_unique": "China;Singapore;Canada", + "bibtex": "@InProceedings{Jing_2023_ICCV,\n \n author = {\n Jing,\n Junpeng and Li,\n Jiankun and Xiong,\n Pengfei and Liu,\n Jiangyu and Liu,\n Shuaicheng and Guo,\n Yichen and Deng,\n Xin and Xu,\n Mai and Jiang,\n Lai and Sigal,\n Leonid\n},\n title = {\n Uncertainty Guided Adaptive Warping for Robust and Efficient Stereo Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n 
pages = {\n 3318-3327\n} \n}" }, { "title": "Uncertainty-aware State Space Transformer for Egocentric 3D Hand Trajectory Forecasting", @@ -61186,7 +63228,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Bao_2023_ICCV,\n \n author = {\n Bao,\n Wentao and Chen,\n Lele and Zeng,\n Libing and Li,\n Zhong and Xu,\n Yi and Yuan,\n Junsong and Kong,\n Yu\n},\n title = {\n Uncertainty-aware State Space Transformer for Egocentric 3D Hand Trajectory Forecasting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13702-13711\n} \n}" }, { "title": "Uncertainty-aware Unsupervised Multi-Object Tracking", @@ -61218,7 +63261,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Kai and Jin,\n Sheng and Fu,\n Zhihang and Chen,\n Ze and Jiang,\n Rongxin and Ye,\n Jieping\n},\n title = {\n Uncertainty-aware Unsupervised Multi-Object Tracking\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9996-10005\n} \n}" }, { "title": "Uncertainty-guided Learning for Improving Image Manipulation Detection", @@ -61250,7 +63294,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ji_2023_ICCV,\n \n author = {\n Ji,\n Kaixiang and Chen,\n Feng and Guo,\n Xin and Xu,\n Yadong and Wang,\n Jian and Chen,\n Jingdong\n},\n title = {\n Uncertainty-guided Learning for Improving Image 
Manipulation Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22456-22465\n} \n}" }, { "title": "Under-Display Camera Image Restoration with Scattering Effect", @@ -61282,7 +63327,8 @@ "aff_campus_unique_index": "0;0+1;0;0", "aff_campus_unique": "Macau SAR;Shenzhen", "aff_country_unique_index": "0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Binbin and Chen,\n Xiangyu and Xu,\n Shuning and Zhou,\n Jiantao\n},\n title = {\n Under-Display Camera Image Restoration with Scattering Effect\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12580-12589\n} \n}" }, { "title": "Understanding 3D Object Interaction from a Single Image", @@ -61314,7 +63360,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Qian_2023_ICCV,\n \n author = {\n Qian,\n Shengyi and Fouhey,\n David F.\n},\n title = {\n Understanding 3D Object Interaction from a Single Image\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21753-21763\n} \n}" }, { "title": "Understanding Hessian Alignment for Domain Generalization", @@ -61326,7 +63373,7 @@ "author": "Sobhan Hemati; Guojun Zhang; Amir Estiri; Xi Chen", "abstract": "Out-of-distribution (OOD) generalization is a critical ability for deep learning models in many real-world scenarios including healthcare and autonomous vehicles. Recently, different techniques have been proposed to improve OOD generalization. 
Among these methods, gradient-based regularizers have shown promising performance compared with other competitors. Despite this success, our understanding of the role of Hessian and gradient alignment in domain generalization is still limited. To address this shortcoming, we analyze the role of the classifier's head Hessian matrix and gradient in domain generalization using recent OoD theory of transferability. Theoretically, we show that spectral norm between the classifier's head Hessian matrices across domains is an upper bound of the transfer measure, a notion of distance between target and source domains. Furthermore, we analyze all the attributes that get aligned when we encourage similarity between Hessians and gradients. Our analysis explains the success of many regularizers like CORAL, IRM, V-REx, Fish, IGA, and Fishr as they regularize part of the classifier's head Hessian and/or gradient. Finally, we propose two simple yet effective methods to match the classifier's head Hessians and gradients in an efficient way, based on the Hessian Gradient Product (HGP) and Hutchinson's method (Hutchinson), and without directly calculating Hessians. We validate the OOD generalization ability of proposed methods in different scenarios, including transferability, severe correlation shift, label shift and diversity shift. Our results show that Hessian alignment methods achieve promising performance on various OOD benchmarks. 
Our code is available here.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Hemati_Understanding_Hessian_Alignment_for_Domain_Generalization_ICCV_2023_paper.pdf", - "aff": "Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab", + "aff": "Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab; Huawei Noah’s Ark Lab", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Hemati_Understanding_Hessian_Alignment_ICCV_2023_supplemental.pdf", @@ -61340,13 +63387,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Hemati_Understanding_Hessian_Alignment_for_Domain_Generalization_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Huawei", - "aff_unique_dep": "Noah\u2019s Ark Lab", + "aff_unique_dep": "Noah’s Ark Lab", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Hemati_2023_ICCV,\n \n author = {\n Hemati,\n Sobhan and Zhang,\n Guojun and Estiri,\n Amir and Chen,\n Xi\n},\n title = {\n Understanding Hessian Alignment for Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19004-19014\n} \n}" }, { "title": "Understanding Self-attention Mechanism via Dynamical System Perspective", @@ -61378,7 +63426,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zhongzhan and Liang,\n Mingfu and Qin,\n Jinghui and Zhong,\n Shanshan and Lin,\n Liang\n},\n title = {\n 
Understanding Self-attention Mechanism via Dynamical System Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1412-1422\n} \n}" }, { "title": "Understanding the Feature Norm for Out-of-Distribution Detection", @@ -61401,7 +63450,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Park_Understanding_the_Feature_Norm_for_Out-of-Distribution_Detection_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Park_Understanding_the_Feature_Norm_for_Out-of-Distribution_Detection_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Park_2023_ICCV,\n \n author = {\n Park,\n Jaewoo and Chai,\n Jacky Chen Long and Yoon,\n Jaeho and Teoh,\n Andrew Beng Jin\n},\n title = {\n Understanding the Feature Norm for Out-of-Distribution Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1557-1567\n} \n}" }, { "title": "Unfolding Framework with Prior of Convolution-Transformer Mixture and Uncertainty Estimation for Video Snapshot Compressive Imaging", @@ -61433,7 +63483,8 @@ "aff_campus_unique_index": "0+0;1", "aff_campus_unique": "Beijing;Hangzhou", "aff_country_unique_index": "0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zheng_2023_ICCV,\n \n author = {\n Zheng,\n Siming and Yuan,\n Xin\n},\n title = {\n Unfolding Framework with Prior of Convolution-Transformer Mixture and Uncertainty Estimation for Video Snapshot Compressive Imaging\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12738-12749\n} \n}" }, { "title": "Uni-3D: A Universal Model for Panoptic 3D Scene 
Reconstruction", @@ -61465,7 +63516,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xiang and Chen,\n Zeyuan and Wei,\n Fangyin and Tu,\n Zhuowen\n},\n title = {\n Uni-3D: A Universal Model for Panoptic 3D Scene Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9256-9266\n} \n}" }, { "title": "UniDexGrasp++: Improving Dexterous Grasping Policy Learning via Geometry-Aware Curriculum and Iterative Generalist-Specialist Learning", @@ -61497,7 +63549,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wan_2023_ICCV,\n \n author = {\n Wan,\n Weikang and Geng,\n Haoran and Liu,\n Yun and Shan,\n Zikang and Yang,\n Yaodong and Yi,\n Li and Wang,\n He\n},\n title = {\n UniDexGrasp++: Improving Dexterous Grasping Policy Learning via Geometry-Aware Curriculum and Iterative Generalist-Specialist Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3891-3902\n} \n}" }, { "title": "UniFace: Unified Cross-Entropy Loss for Deep Face Recognition", @@ -61529,7 +63582,8 @@ "aff_campus_unique_index": "1;1+2;1;1;2", "aff_campus_unique": ";Shenzhen;Birmingham", "aff_country_unique_index": "0+0+0;0+0+1;0+0;0+0;1+1", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Jiancan and Jia,\n Xi and Li,\n Qiufu and Shen,\n Linlin and Duan,\n Jinming\n},\n title = {\n 
UniFace: Unified Cross-Entropy Loss for Deep Face Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20730-20739\n} \n}" }, { "title": "UniFormerV2: Unlocking the Potential of Image ViTs for Video Understanding", @@ -61554,14 +63608,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_UniFormerV2_Unlocking_the_Potential_of_Image_ViTs_for_Video_Understanding_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0+2;2;3+2;2;4+2;2+0", - "aff_unique_norm": "Shenzhen Institute of Advanced Technology;University of Chinese Academy of Sciences;Shanghai AI Laboratory;University of Hong Kong;Nanjing University", + "aff_unique_norm": "Shenzhen Institute of Advanced Technology;University of Chinese Academy of Sciences;Shanghai AI Laboratory;The University of Hong Kong;Nanjing University", "aff_unique_dep": ";;;;State Key Laboratory for Novel Software Technology", "aff_unique_url": "http://www.siat.cas.cn;http://www.ucas.ac.cn;https://www.shanghai-ai-lab.com;https://www.hku.hk;http://www.nju.edu.cn", "aff_unique_abbr": "SIAT;UCAS;SAIL;HKU;Nanjing University", "aff_campus_unique_index": "0;0;2;;0", "aff_campus_unique": "Shenzhen;;Hong Kong SAR", "aff_country_unique_index": "0+0+0;0+0;0;0+0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Kunchang and Wang,\n Yali and He,\n Yinan and Li,\n Yizhuo and Wang,\n Yi and Wang,\n Limin and Qiao,\n Yu\n},\n title = {\n UniFormerV2: Unlocking the Potential of Image ViTs for Video Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1632-1643\n} \n}" }, { "title": "UniFusion: Unified Multi-View Fusion Transformer for Spatial-Temporal Representation in Bird's-Eye-View", 
@@ -61593,7 +63648,8 @@ "aff_campus_unique_index": "1+2;1+2", "aff_campus_unique": ";Shanghai;Hangzhou", "aff_country_unique_index": "0;0+0+0;0;0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Qin_2023_ICCV,\n \n author = {\n Qin,\n Zequn and Chen,\n Jingyu and Chen,\n Chao and Chen,\n Xiaozhi and Li,\n Xi\n},\n title = {\n UniFusion: Unified Multi-View Fusion Transformer for Spatial-Temporal Representation in Bird's-Eye-View\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8690-8699\n} \n}" }, { "title": "UniKD: Universal Knowledge Distillation for Mimicking Homogeneous or Heterogeneous Object Detectors", @@ -61601,6 +63657,7 @@ "status": "Poster", "track": "main", "pid": "8836", + "author_site": "Shanshan Lao, Guanglu Song, Boxiao Liu, Yu Liu, Yujiu Yang", "author": "Shanshan Lao, Guanglu Song, Boxiao Liu, Yu Liu, Yujiu Yang", "abstract": "Knowledge distillation (KD) has become a standard method to boost the performance of lightweight object detectors. Most previous works are feature-based, where students mimic the features of homogeneous teacher detectors. However, distilling the knowledge from the heterogeneous teacher fails in this manner due to the serious semantic gap, which greatly limits the flexibility of KD in practical applications. Bridging this semantic gap now requires case-by-case algorithm design which is time-consuming and heavily relies on experienced adjustment. To alleviate this problem, we propose Universal Knowledge Distillation (UniKD), introducing additional decoder heads with deformable cross-attention called Adaptive Knowledge Extractor (AKE). In UniKD, AKEs are first pretrained on the teacher's output to infuse the teacher's content and positional knowledge into a fixed-number set of knowledge embeddings. 
The fixed AKEs are then attached to the student's backbone to encourage the student to absorb the teacher's knowledge in these knowledge embeddings. In this query-based distillation paradigm, detection-relevant information can be dynamically aggregated into a knowledge embedding set and transferred between different detectors. When the teacher model is too large for online inference, its output can be stored on disk in advance to save the computation overhead, which is more storage efficient than feature-based methods. Extensive experiments demonstrate that our UniKD can plug and play in any homogeneous or heterogeneous teacher-student pairs and significantly outperforms conventional feature-based KD.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Lao_UniKD_Universal_Knowledge_Distillation_for_Mimicking_Homogeneous_or_Heterogeneous_Object_ICCV_2023_paper.pdf", @@ -61612,7 +63669,8 @@ "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5073768927309810606&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lao_UniKD_Universal_Knowledge_Distillation_for_Mimicking_Homogeneous_or_Heterogeneous_Object_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lao_UniKD_Universal_Knowledge_Distillation_for_Mimicking_Homogeneous_or_Heterogeneous_Object_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Lao_2023_ICCV,\n \n author = {\n Lao,\n Shanshan and Song,\n Guanglu and Liu,\n Boxiao and Liu,\n Yu and Yang,\n Yujiu\n},\n title = {\n UniKD: Universal Knowledge Distillation for Mimicking Homogeneous or Heterogeneous Object Detectors\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6362-6372\n} \n}" }, { "title": "UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the OpenPCSeg Codebase", @@ -61620,6 +63678,7 @@ 
"status": "Poster", "track": "main", "pid": "10592", + "author_site": "Youquan Liu, Runnan Chen, Xin Li, Lingdong Kong, Yuchen Yang, Zhaoyang Xia, Yeqi Bai, Xinge Zhu, Yuexin Ma, Yikang Li, Yu Qiao, Yuenan Hou", "author": "Youquan Liu, Runnan Chen, Xin Li, Lingdong Kong, Yuchen Yang, Zhaoyang Xia, Yeqi Bai, Xinge Zhu, Yuexin Ma, Yikang Li, Yu Qiao, Yuenan Hou", "abstract": "Point-, voxel-, and range-views are three representative forms of point clouds. All of them have accurate 3D measurements but lack color and texture information. RGB images are a natural complement to these point cloud views and fully utilizing the comprehensive information of them benefits more robust perceptions. In this paper, we present a unified multi-modal LiDAR segmentation network, termed UniSeg, which leverages the information of RGB images and three views of the point cloud, and accomplishes semantic segmentation and panoptic segmentation simultaneously. Specifically, we first design the Learnable cross-Modal Association (LMA) module to automatically fuse voxel-view and range-view features with image features, which fully utilize the rich semantic information of images and are robust to calibration errors. Then, the enhanced voxel-view and range-view features are transformed to the point space, where three views of point cloud features are further fused adaptively by the Learnable cross-View Association module (LVA). Notably, UniSeg achieves promising results in three public benchmarks, i.e., SemanticKITTI, nuScenes, and Waymo Open Dataset (WOD); it ranks 1st on two challenges of two benchmarks, including the LiDAR semantic segmentation challenge of nuScenes and panoptic segmentation challenges of SemanticKITTI. Besides, we construct the OpenPCSeg codebase, which is the largest and most comprehensive outdoor LiDAR segmentation codebase. It contains most of the popular outdoor LiDAR segmentation algorithms and provides reproducible implementations. 
The OpenPCSeg codebase will be made publicly available at https://github.com/PJLab-ADG/PCSeg.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_UniSeg_A_Unified_Multi-Modal_LiDAR_Segmentation_Network_and_the_OpenPCSeg_ICCV_2023_paper.pdf", @@ -61631,7 +63690,8 @@ "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3268614218905414956&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_UniSeg_A_Unified_Multi-Modal_LiDAR_Segmentation_Network_and_the_OpenPCSeg_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_UniSeg_A_Unified_Multi-Modal_LiDAR_Segmentation_Network_and_the_OpenPCSeg_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Youquan and Chen,\n Runnan and Li,\n Xin and Kong,\n Lingdong and Yang,\n Yuchen and Xia,\n Zhaoyang and Bai,\n Yeqi and Zhu,\n Xinge and Ma,\n Yuexin and Li,\n Yikang and Qiao,\n Yu and Hou,\n Yuenan\n},\n title = {\n UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the OpenPCSeg Codebase\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21662-21673\n} \n}" }, { "title": "UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding", @@ -61639,8 +63699,8 @@ "status": "Poster", "track": "main", "pid": "9485", - "author_site": "Zhenyu Chen, Ronghang Hu, Xinlei Chen, Matthias Nie\u00dfner, Angel X. Chang", - "author": "Zhenyu Chen; Ronghang Hu; Xinlei Chen; Matthias Nie\u00dfner; Angel X. Chang", + "author_site": "Zhenyu Chen, Ronghang Hu, Xinlei Chen, Matthias Nießner, Angel X. Chang", + "author": "Zhenyu Chen; Ronghang Hu; Xinlei Chen; Matthias Nießner; Angel X. 
Chang", "abstract": "Performing 3D dense captioning and visual grounding requires a common and shared understanding of the underlying multimodal relationships. However, despite some previous attempts on connecting these two related tasks with highly task-specific neural modules, it remains understudied how to explicitly depict their shared nature to learn them simultaneously. In this work, we propose UniT3D, a simple yet effective fully unified transformer-based architecture for jointly solving 3D visual grounding and dense captioning. UniT3D enables learning a strong multimodal representation across the two tasks through a supervised joint pre-training scheme with bidirectional and seq-to-seq objectives. With a generic architecture design, UniT3D allows expanding the pre-training scope to more various training sources such as the synthesized data from 2D prior knowledge to benefit 3D vision-language tasks. Extensive experiments and analysis demonstrate that UniT3D obtains significant gains for 3D dense captioning and visual grounding.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_UniT3D_A_Unified_Transformer_for_3D_Dense_Captioning_and_Visual_ICCV_2023_paper.pdf", "aff": "Technical University of Munich; Meta AI; Meta AI; Technical University of Munich; Simon Fraser University", @@ -61656,14 +63716,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_UniT3D_A_Unified_Transformer_for_3D_Dense_Captioning_and_Visual_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;2", - "aff_unique_norm": "Technical University of Munich;Meta;Simon Fraser University", + "aff_unique_norm": "Technical University of Munich;Meta Platforms, Inc.;Simon Fraser University", "aff_unique_dep": ";Meta AI;", "aff_unique_url": "https://www.tum.de;https://meta.com;https://www.sfu.ca", "aff_unique_abbr": "TUM;Meta;SFU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;2", - "aff_country_unique": 
"Germany;United States;Canada" + "aff_country_unique": "Germany;United States;Canada", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Zhenyu and Hu,\n Ronghang and Chen,\n Xinlei and Nie{\\ss\n}ner,\n Matthias and Chang,\n Angel X.\n},\n title = {\n UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18109-18119\n} \n}" }, { "title": "UniTR: A Unified and Efficient Multi-Modal Transformer for Bird's-Eye-View Representation", @@ -61689,13 +63750,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_UniTR_A_Unified_and_Efficient_Multi-Modal_Transformer_for_Birds-Eye-View_Representation_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;2;3;3;2;0", "aff_unique_norm": "Peking University;Pazhou Laboratory;Max Planck Institute for Informatics;Huawei", - "aff_unique_dep": ";;;Huawei", + "aff_unique_dep": ";;;", "aff_unique_url": "http://www.pku.edu.cn;;https://mpi-inf.mpg.de;https://www.huawei.com", "aff_unique_abbr": "Peking U;;MPII;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;1;0;0;1;0", - "aff_country_unique": "China;Germany" + "aff_country_unique": "China;Germany", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Haiyang and Tang,\n Hao and Shi,\n Shaoshuai and Li,\n Aoxue and Li,\n Zhenguo and Schiele,\n Bernt and Wang,\n Liwei\n},\n title = {\n UniTR: A Unified and Efficient Multi-Modal Transformer for Bird's-Eye-View Representation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6792-6802\n} \n}" }, { "title": "UniVTG: Towards Unified Video-Language Temporal Grounding", @@ -61720,14 +63782,15 @@ "author_num": 8, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Lin_UniVTG_Towards_Unified_Video-Language_Temporal_Grounding_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;2;0;0;0;0", - "aff_unique_norm": "National University of Singapore;Meta;Johns Hopkins University", + "aff_unique_norm": "National University of Singapore;Meta Platforms, Inc.;Johns Hopkins University", "aff_unique_dep": "Show Lab;Meta AI;", "aff_unique_url": "https://www.nus.edu.sg;https://meta.com;https://www.jhu.edu", "aff_unique_abbr": "NUS;Meta;JHU", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;1;0;1;0;0;0;0", - "aff_country_unique": "Singapore;United States" + "aff_country_unique": "Singapore;United States", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Kevin Qinghong and Zhang,\n Pengchuan and Chen,\n Joya and Pramanick,\n Shraman and Gao,\n Difei and Wang,\n Alex Jinpeng and Yan,\n Rui and Shou,\n Mike Zheng\n},\n title = {\n UniVTG: Towards Unified Video-Language Temporal Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2794-2804\n} \n}" }, { "title": "Unified Adversarial Patch for Cross-Modal Attacks in the Physical World", @@ -61735,6 +63798,7 @@ "status": "Poster", "track": "main", "pid": "4896", + "author_site": "Xingxing Wei, Yao Huang, Yitong Sun, Jie Yu", "author": "Xingxing Wei, Yao Huang, Yitong Sun, Jie Yu", "abstract": "Recently, physical adversarial attacks have been presented to evade DNNs-based object detectors. To ensure the security, many scenarios are simultaneously deployed with visible sensors and infrared sensors, leading to the failures of these single-modal physical attacks. 
To show the potential risks under such scenes, we propose a unified adversarial patch to perform cross-modal physical attacks, i.e., fooling visible and infrared object detectors at the same time via a single patch. Considering different imaging mechanisms of visible and infrared sensors, our work focuses on modeling the shapes of adversarial patches, which can be captured in different modalities when they change. To this end, we design a novel boundary-limited shape optimization to achieve the compact and smooth shapes, and thus they can be easily implemented in the physical world. In addition, to balance the fooling degree between visible detector and infrared detector during the optimization process, we propose a score-aware iterative evaluation, which can guide the adversarial patch to iteratively reduce the predicted scores of the multi-modal sensors. We finally test our method against the one-stage detector: YOLOv3 and the two-stage detector: Faster RCNN. Results show that our unified patch achieves an Attack Success Rate (ASR) of 73.33% and 69.17%, respectively. 
More importantly, we verify the effective attacks in the physical world when visible and infrared sensors shoot the objects under various settings like different angles, distances, postures, and scenes.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wei_Unified_Adversarial_Patch_for_Cross-Modal_Attacks_in_the_Physical_World_ICCV_2023_paper.pdf", @@ -61746,7 +63810,8 @@ "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5539951076942933909&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Unified_Adversarial_Patch_for_Cross-Modal_Attacks_in_the_Physical_World_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wei_Unified_Adversarial_Patch_for_Cross-Modal_Attacks_in_the_Physical_World_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wei_2023_ICCV,\n \n author = {\n Wei,\n Xingxing and Huang,\n Yao and Sun,\n Yitong and Yu,\n Jie\n},\n title = {\n Unified Adversarial Patch for Cross-Modal Attacks in the Physical World\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4445-4454\n} \n}" }, { "title": "Unified Coarse-to-Fine Alignment for Video-Text Retrieval", @@ -61778,7 +63843,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Chapel Hill", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Ziyang and Sung,\n Yi-Lin and Cheng,\n Feng and Bertasius,\n Gedas and Bansal,\n Mohit\n},\n title = {\n Unified Coarse-to-Fine Alignment for Video-Text Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2816-2827\n} \n}" }, { "title": 
"Unified Data-Free Compression: Pruning and Quantization without Fine-Tuning", @@ -61810,7 +63876,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Bai_2023_ICCV,\n \n author = {\n Bai,\n Shipeng and Chen,\n Jun and Shen,\n Xintian and Qian,\n Yixuan and Liu,\n Yong\n},\n title = {\n Unified Data-Free Compression: Pruning and Quantization without Fine-Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5876-5885\n} \n}" }, { "title": "Unified Out-Of-Distribution Detection: A Model-Specific Perspective", @@ -61835,14 +63902,15 @@ "author_num": 2, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Averly_Unified_Out-Of-Distribution_Detection_A_Model-Specific_Perspective_ICCV_2023_paper.html", "aff_unique_index": "0;0", - "aff_unique_norm": "Ohio State University", + "aff_unique_norm": "The Ohio State University", "aff_unique_dep": "", "aff_unique_url": "https://www.osu.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Averly_2023_ICCV,\n \n author = {\n Averly,\n Reza and Chao,\n Wei-Lun\n},\n title = {\n Unified Out-Of-Distribution Detection: A Model-Specific Perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1453-1463\n} \n}" }, { "title": "Unified Pre-Training with Pseudo Texts for Text-To-Image Person Re-Identification", @@ -61874,7 +63942,8 @@ "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0+0;0;0", - "aff_country_unique": "China" + 
"aff_country_unique": "China", + "bibtex": "@InProceedings{Shao_2023_ICCV,\n \n author = {\n Shao,\n Zhiyin and Zhang,\n Xinyu and Ding,\n Changxing and Wang,\n Jian and Wang,\n Jingdong\n},\n title = {\n Unified Pre-Training with Pseudo Texts for Text-To-Image Person Re-Identification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11174-11184\n} \n}" }, { "title": "Unified Visual Relationship Detection with Vision and Language Models", @@ -61906,7 +63975,8 @@ "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n Long and Yuan,\n Liangzhe and Gong,\n Boqing and Cui,\n Yin and Schroff,\n Florian and Yang,\n Ming-Hsuan and Adam,\n Hartwig and Liu,\n Ting\n},\n title = {\n Unified Visual Relationship Detection with Vision and Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6962-6973\n} \n}" }, { "title": "Unify, Align and Refine: Multi-Level Semantic Alignment for Radiology Report Generation", @@ -61931,14 +64001,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Unify_Align_and_Refine_Multi-Level_Semantic_Alignment_for_Radiology_Report_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0;0;0;0", - "aff_unique_norm": "Peking University;Pengcheng Laboratory", - "aff_unique_dep": "School of Electronic and Computer Engineering;Peng Cheng Laboratory", + "aff_unique_norm": "Peking University;Peng Cheng Laboratory", + "aff_unique_dep": "School of Electronic and Computer Engineering;", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": 
"PKU;PCL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Yaowei and Yang,\n Bang and Cheng,\n Xuxin and Zhu,\n Zhihong and Li,\n Hongxiang and Zou,\n Yuexian\n},\n title = {\n Unify,\n Align and Refine: Multi-Level Semantic Alignment for Radiology Report Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2863-2874\n} \n}" }, { "title": "Unilaterally Aggregated Contrastive Learning with Hierarchical Augmentation for Anomaly Detection", @@ -61961,7 +64032,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Unilaterally_Aggregated_Contrastive_Learning_with_Hierarchical_Augmentation_for_Anomaly_Detection_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Unilaterally_Aggregated_Contrastive_Learning_with_Hierarchical_Augmentation_for_Anomaly_Detection_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Guodong and Wang,\n Yunhong and Qin,\n Jie and Zhang,\n Dongming and Bao,\n Xiuguo and Huang,\n Di\n},\n title = {\n Unilaterally Aggregated Contrastive Learning with Hierarchical Augmentation for Anomaly Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6888-6897\n} \n}" }, { "title": "UnitedHuman: Harnessing Multi-Source Data for High-Resolution Human Generation", @@ -61986,14 +64058,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fu_UnitedHuman_Harnessing_Multi-Source_Data_for_High-Resolution_Human_Generation_ICCV_2023_paper.html", 
"aff_unique_index": "0;0;1;0+2;0;1", - "aff_unique_norm": "Shanghai AI Laboratory;Nanyang Technological University;Chinese University of Hong Kong", + "aff_unique_norm": "Shanghai AI Laboratory;Nanyang Technological University;The Chinese University of Hong Kong", "aff_unique_dep": ";S-Lab;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.ntu.edu.sg;https://www.cuhk.edu.hk", "aff_unique_abbr": "SAIL;NTU;CUHK", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0+0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Jianglin and Li,\n Shikai and Jiang,\n Yuming and Lin,\n Kwan-Yee and Wu,\n Wayne and Liu,\n Ziwei\n},\n title = {\n UnitedHuman: Harnessing Multi-Source Data for High-Resolution Human Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7301-7311\n} \n}" }, { "title": "UniverSeg: Universal Medical Image Segmentation", @@ -62025,7 +64098,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Butoi_2023_ICCV,\n \n author = {\n Butoi,\n Victor Ion and Ortiz,\n Jose Javier Gonzalez and Ma,\n Tianyu and Sabuncu,\n Mert R. 
and Guttag,\n John and Dalca,\n Adrian V.\n},\n title = {\n UniverSeg: Universal Medical Image Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21438-21451\n} \n}" }, { "title": "Universal Domain Adaptation via Compressive Attention Matching", @@ -62037,7 +64111,7 @@ "author": "Didi Zhu; Yinchuan Li; Junkun Yuan; Zexi Li; Kun Kuang; Chao Wu", "abstract": "Universal domain adaptation (UniDA) aims to transfer knowledge from the source domain to the target domain without any prior knowledge about the label set. The challenge lies in how to determine whether the target samples belong to common categories. The mainstream methods make judgments based on the sample features, which overemphasizes global information while ignoring the most crucial local objects in the image, resulting in limited accuracy. To address this issue, we propose a Universal Attention Matching (UniAM) framework by exploiting the self-attention mechanism in vision transformer to capture the crucial object information. The proposed framework introduces a novel Compressive Attention Matching (CAM) approach to explore the core information by compressively representing attentions. Furthermore, CAM incorporates a residual-based measurement to determine the sample commonness. By utilizing the measurement, UniAM achieves domain-wise and category-wise Common Feature Alignment (CFA) and Target Class Separation (TCS). Notably, UniAM is the first method utilizing the attention in vision transformer directly to perform classification tasks. 
Extensive experiments show that UniAM outperforms the current state-of-the-art methods on various benchmark datasets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhu_Universal_Domain_Adaptation_via_Compressive_Attention_Matching_ICCV_2023_paper.pdf", - "aff": "Zhejiang University; Huawei Noah\u2019s Ark Lab; Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University", + "aff": "Zhejiang University; Huawei Noah’s Ark Lab; Zhejiang University; Zhejiang University; Zhejiang University; Zhejiang University", "project": "", "github": "", "supp": "", @@ -62051,13 +64125,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhu_Universal_Domain_Adaptation_via_Compressive_Attention_Matching_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "Zhejiang University;Huawei", - "aff_unique_dep": ";Noah\u2019s Ark Lab", + "aff_unique_dep": ";Noah’s Ark Lab", "aff_unique_url": "https://www.zju.edu.cn;https://www.huawei.com", "aff_unique_abbr": "ZJU;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Didi and Li,\n Yinchuan and Yuan,\n Junkun and Li,\n Zexi and Kuang,\n Kun and Wu,\n Chao\n},\n title = {\n Universal Domain Adaptation via Compressive Attention Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6974-6985\n} \n}" }, { "title": "Unleashing Text-to-Image Diffusion Models for Visual Perception", @@ -62089,7 +64164,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;2;0;0", - "aff_country_unique": "China;;United States" + "aff_country_unique": "China;;United States", + "bibtex": "@InProceedings{Zhao_2023_ICCV,\n \n author = {\n Zhao,\n 
Wenliang and Rao,\n Yongming and Liu,\n Zuyan and Liu,\n Benlin and Zhou,\n Jie and Lu,\n Jiwen\n},\n title = {\n Unleashing Text-to-Image Diffusion Models for Visual Perception\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5729-5739\n} \n}" }, { "title": "Unleashing Vanilla Vision Transformer with Masked Image Modeling for Object Detection", @@ -62121,7 +64197,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Yuxin and Yang,\n Shusheng and Wang,\n Shijie and Ge,\n Yixiao and Shan,\n Ying and Wang,\n Xinggang\n},\n title = {\n Unleashing Vanilla Vision Transformer with Masked Image Modeling for Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6244-6253\n} \n}" }, { "title": "Unleashing the Potential of Spiking Neural Networks with Dynamic Confidence", @@ -62146,14 +64223,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Unleashing_the_Potential_of_Spiking_Neural_Networks_with_Dynamic_Confidence_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Manchester", + "aff_unique_norm": "The University of Manchester", "aff_unique_dep": "", "aff_unique_url": "https://www.manchester.ac.uk", "aff_unique_abbr": "UoM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Chen and Jones,\n Edward G and Furber,\n Steve\n},\n title = {\n Unleashing the Potential of Spiking Neural 
Networks with Dynamic Confidence\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13350-13360\n} \n}" }, { "title": "Unleashing the Power of Gradient Signal-to-Noise Ratio for Zero-Shot NAS", @@ -62185,7 +64263,8 @@ "aff_campus_unique_index": ";;;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Sun_2023_ICCV,\n \n author = {\n Sun,\n Zihao and Sun,\n Yu and Yang,\n Longxing and Lu,\n Shun and Mei,\n Jilin and Zhao,\n Wenxiao and Hu,\n Yu\n},\n title = {\n Unleashing the Power of Gradient Signal-to-Noise Ratio for Zero-Shot NAS\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5763-5773\n} \n}" }, { "title": "Unmasked Teacher: Towards Training-Efficient Video Foundation Models", @@ -62210,14 +64289,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Unmasked_Teacher_Towards_Training-Efficient_Video_Foundation_Models_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0+2;3+2;2;2;4+2;0+2", - "aff_unique_norm": "Shenzhen Institute of Advanced Technology;University of Chinese Academy of Sciences;Shanghai AI Laboratory;University of Hong Kong;Nanjing University", + "aff_unique_norm": "Shenzhen Institute of Advanced Technology;University of Chinese Academy of Sciences;Shanghai AI Laboratory;The University of Hong Kong;Nanjing University", "aff_unique_dep": ";;;;State Key Laboratory for Novel Software Technology", "aff_unique_url": "http://www.siat.cas.cn;http://www.ucas.ac.cn;https://www.shanghai-ai-lab.com;https://www.hku.hk;http://www.nju.edu.cn", "aff_unique_abbr": "SIAT;UCAS;SAIL;HKU;Nanjing University", "aff_campus_unique_index": "0;0;2;;0", "aff_campus_unique": 
"Shenzhen;;Hong Kong SAR", "aff_country_unique_index": "0+0+0;0+0;0+0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Kunchang and Wang,\n Yali and Li,\n Yizhuo and Wang,\n Yi and He,\n Yinan and Wang,\n Limin and Qiao,\n Yu\n},\n title = {\n Unmasked Teacher: Towards Training-Efficient Video Foundation Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19948-19960\n} \n}" }, { "title": "Unmasking Anomalies in Road-Scene Segmentation", @@ -62249,7 +64329,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Rai_2023_ICCV,\n \n author = {\n Rai,\n Shyam Nandan and Cermelli,\n Fabio and Fontanel,\n Dario and Masone,\n Carlo and Caputo,\n Barbara\n},\n title = {\n Unmasking Anomalies in Road-Scene Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4037-4046\n} \n}" }, { "title": "Unpaired Multi-domain Attribute Translation of 3D Facial Shapes with a Square and Symmetric Geometric Map", @@ -62281,7 +64362,8 @@ "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Zhenfeng and Zhang,\n Zhiheng and Yang,\n Shuang and Zhong,\n Chongyang and Cao,\n Min and Xia,\n Shihong\n},\n title = {\n Unpaired Multi-domain Attribute Translation of 3D Facial Shapes with a Square and Symmetric Geometric Map\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20828-20838\n} \n}" }, { "title": "Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving", @@ -62313,7 +64395,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Najibi_2023_ICCV,\n \n author = {\n Najibi,\n Mahyar and Ji,\n Jingwei and Zhou,\n Yin and Qi,\n Charles R. and Yan,\n Xinchen and Ettinger,\n Scott and Anguelov,\n Dragomir\n},\n title = {\n Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8602-8612\n} \n}" }, { "title": "Unsupervised Accuracy Estimation of Deep Visual Models using Domain-Adaptive Adversarial Perturbation without Source Samples", @@ -62337,15 +64420,16 @@ "email": "samsung.com;samsung.com;samsung.com;samsung.com", "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lee_Unsupervised_Accuracy_Estimation_of_Deep_Visual_Models_using_Domain-Adaptive_Adversarial_ICCV_2023_paper.html", - "aff_unique_index": "0;0;0;0", - "aff_unique_norm": "Samsung", - "aff_unique_dep": "Samsung SDS", - "aff_unique_url": "https://www.samsungsds.com", - "aff_unique_abbr": "Samsung SDS", + "aff_unique_index": "0;1;1;0", + "aff_unique_norm": "Samsung SDS;Samsung SDS America", + "aff_unique_dep": ";", + "aff_unique_url": "https://www.samsungsds.com;https://www.samsungsds.com/en", + "aff_unique_abbr": "Samsung SDS;SSA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": "South Korea;United States", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n JoonHo 
and Woo,\n Jae Oh and Moon,\n Hankyu and Lee,\n Kwonho\n},\n title = {\n Unsupervised Accuracy Estimation of Deep Visual Models using Domain-Adaptive Adversarial Perturbation without Source Samples\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16443-16452\n} \n}" }, { "title": "Unsupervised Compositional Concepts Discovery with Text-to-Image Generative Models", @@ -62370,14 +64454,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_Unsupervised_Compositional_Concepts_Discovery_with_Text-to-Image_Generative_Models_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;1", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Massachusetts Institute of Technology", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www illinois.edu;https://web.mit.edu", "aff_unique_abbr": "UIUC;MIT", "aff_campus_unique_index": "0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Nan and Du,\n Yilun and Li,\n Shuang and Tenenbaum,\n Joshua B. 
and Torralba,\n Antonio\n},\n title = {\n Unsupervised Compositional Concepts Discovery with Text-to-Image Generative Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2085-2095\n} \n}" }, { "title": "Unsupervised Domain Adaptation for Training Event-Based Networks Using Contrastive Learning and Uncorrelated Conditioning", @@ -62409,7 +64494,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Jian_2023_ICCV,\n \n author = {\n Jian,\n Dayuan and Rostami,\n Mohammad\n},\n title = {\n Unsupervised Domain Adaptation for Training Event-Based Networks Using Contrastive Learning and Uncorrelated Conditioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18721-18731\n} \n}" }, { "title": "Unsupervised Domain Adaptive Detection with Network Stability Analysis", @@ -62441,7 +64527,8 @@ "aff_campus_unique_index": "0+0;1;0+0;0+0", "aff_campus_unique": "Beijing;Denton", "aff_country_unique_index": "0+0;1;0+0;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Wenzhang and Fan,\n Heng and Luo,\n Tiejian and Zhang,\n Libo\n},\n title = {\n Unsupervised Domain Adaptive Detection with Network Stability Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6986-6995\n} \n}" }, { "title": "Unsupervised Facial Performance Editing via Vector-Quantized StyleGAN Representations", @@ -62473,7 +64560,8 @@ "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "", - "aff_country_unique": "" + "aff_country_unique": "", + "bibtex": "@InProceedings{Kicanaoglu_2023_ICCV,\n \n author = {\n Kicanaoglu,\n Berkay and Garrido,\n Pablo and Bharaj,\n Gaurav\n},\n title = {\n Unsupervised Facial Performance Editing via Vector-Quantized StyleGAN Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2371-2382\n} \n}" }, { "title": "Unsupervised Feature Representation Learning for Domain-generalized Cross-domain Image Retrieval", @@ -62505,7 +64593,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Conghui and Zhang,\n Can and Lee,\n Gim Hee\n},\n title = {\n Unsupervised Feature Representation Learning for Domain-generalized Cross-domain Image Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11016-11025\n} \n}" }, { "title": "Unsupervised Image Denoising in Real-World Scenarios via Self-Collaboration Parallel Generative Adversarial Branches", @@ -62537,7 +64626,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Xin and Ren,\n Chao and Liu,\n Xiao and Huang,\n Jie and Lei,\n Yinjie\n},\n title = {\n Unsupervised Image Denoising in Real-World Scenarios via Self-Collaboration Parallel Generative Adversarial Branches\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n 
pages = {\n 12642-12652\n} \n}" }, { "title": "Unsupervised Learning of Object-Centric Embeddings for Cell Instance Segmentation in Microscopy Images", @@ -62569,7 +64659,8 @@ "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Janelia", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Wolf_2023_ICCV,\n \n author = {\n Wolf,\n Steffen and Lalit,\n Manan and McDole,\n Katie and Funke,\n Jan\n},\n title = {\n Unsupervised Learning of Object-Centric Embeddings for Cell Instance Segmentation in Microscopy Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21263-21272\n} \n}" }, { "title": "Unsupervised Manifold Linearizing and Clustering", @@ -62592,7 +64683,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ding_Unsupervised_Manifold_Linearizing_and_Clustering_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ding_Unsupervised_Manifold_Linearizing_and_Clustering_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n Tianjiao and Tong,\n Shengbang and Chan,\n Kwan Ho Ryan and Dai,\n Xili and Ma,\n Yi and Haeffele,\n Benjamin D.\n},\n title = {\n Unsupervised Manifold Linearizing and Clustering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5450-5461\n} \n}" }, { "title": "Unsupervised Object Localization with Representer Point Selection", @@ -62624,7 +64716,8 @@ "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Gwangju;Cambridge", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "South Korea;United States" + "aff_country_unique": 
"South Korea;United States", + "bibtex": "@InProceedings{Song_2023_ICCV,\n \n author = {\n Song,\n Yeonghwan and Jang,\n Seokwoo and Katabi,\n Dina and Son,\n Jeany\n},\n title = {\n Unsupervised Object Localization with Representer Point Selection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6534-6544\n} \n}" }, { "title": "Unsupervised Open-Vocabulary Object Localization in Videos", @@ -62649,14 +64742,15 @@ "author_num": 14, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fan_Unsupervised_Open-Vocabulary_Object_Localization_in_Videos_ICCV_2023_paper.html", "aff_unique_index": "0+1;1;1;1;1;1;1;2;1;1;1;1+0;0+0;1", - "aff_unique_norm": "Fudan University;Amazon;National University of Singapore", - "aff_unique_dep": ";Amazon Web Services;", + "aff_unique_norm": "Fudan University;Amazon Web Services;National University of Singapore", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.fudan.edu.cn;https://aws.amazon.com;https://www.nus.edu.sg", "aff_unique_abbr": "Fudan;AWS;NUS", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;1;1;1;2;1;1;1;1+0;0+0;1", - "aff_country_unique": "China;United States;Singapore" + "aff_country_unique": "China;United States;Singapore", + "bibtex": "@InProceedings{Fan_2023_ICCV,\n \n author = {\n Fan,\n Ke and Bai,\n Zechen and Xiao,\n Tianjun and Zietlow,\n Dominik and Horn,\n Max and Zhao,\n Zixu and Simon-Gabriel,\n Carl-Johann and Shou,\n Mike Zheng and Locatello,\n Francesco and Schiele,\n Bernt and Brox,\n Thomas and Zhang,\n Zheng and Fu,\n Yanwei and He,\n Tong\n},\n title = {\n Unsupervised Open-Vocabulary Object Localization in Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13747-13755\n} \n}" }, { "title": "Unsupervised 
Prompt Tuning for Text-Driven Object Detection", @@ -62683,12 +64777,13 @@ "aff_unique_index": "0;0+1+2;1;1;1+2;3;0;0+2", "aff_unique_norm": "Zhejiang University;Hikvision Research Institute;Zhejiang Province Key Laboratory of Peace-building Big Data;Fuzhou University", "aff_unique_dep": ";;Key Laboratory of Peace-building Big Data;", - "aff_unique_url": "https://www.zju.edu.cn;https://www.hikvision.com/cn/;;https://www.fznu.edu.cn", + "aff_unique_url": "https://www.zju.edu.cn;https://www.hikvision.com/cn/;;https://www.fzu.edu.cn", "aff_unique_abbr": "ZJU;Hikvision;;FZU", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0+0;0;0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{He_2023_ICCV,\n \n author = {\n He,\n Weizhen and Chen,\n Weijie and Chen,\n Binbin and Yang,\n Shicai and Xie,\n Di and Lin,\n Luojun and Qi,\n Donglian and Zhuang,\n Yueting\n},\n title = {\n Unsupervised Prompt Tuning for Text-Driven Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2651-2661\n} \n}" }, { "title": "Unsupervised Self-Driving Attention Prediction via Uncertainty Mining and Knowledge Embedding", @@ -62720,7 +64815,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0;0;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhu_2023_ICCV,\n \n author = {\n Zhu,\n Pengfei and Qi,\n Mengshi and Li,\n Xia and Li,\n Weijian and Ma,\n Huadong\n},\n title = {\n Unsupervised Self-Driving Attention Prediction via Uncertainty Mining and Knowledge Embedding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
8558-8568\n} \n}" }, { "title": "Unsupervised Surface Anomaly Detection with Diffusion Probabilistic Model", @@ -62745,14 +64841,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_Unsupervised_Surface_Anomaly_Detection_with_Diffusion_Probabilistic_Model_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;2;3+1;0;0+1", - "aff_unique_norm": "Tsinghua University;Pengcheng Laboratory;Huawei;Shenzhen University", + "aff_unique_norm": "Tsinghua University;Peng Cheng Laboratory;Huawei;Shenzhen University", "aff_unique_dep": "International Graduate School;Research Center of Artificial Intelligence;Manufacturing;", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.pcl.ac.cn;https://www.huawei.com;https://www.szu.edu.cn", "aff_unique_abbr": "THU;;Huawei;SZU", "aff_campus_unique_index": "0;0;;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0+0;0;0+0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xinyi and Li,\n Naiqi and Li,\n Jiawei and Dai,\n Tao and Jiang,\n Yong and Xia,\n Shu-Tao\n},\n title = {\n Unsupervised Surface Anomaly Detection with Diffusion Probabilistic Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 6782-6791\n} \n}" }, { "title": "Unsupervised Video Deraining with An Event Camera", @@ -62784,7 +64881,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jin and Weng,\n Wenming and Zhang,\n Yueyi and Xiong,\n Zhiwei\n},\n title = {\n Unsupervised Video Deraining with An Event Camera\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 10831-10840\n} \n}" }, { "title": "Unsupervised Video Object Segmentation with Online Adversarial Self-Tuning", @@ -62816,7 +64914,8 @@ "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Nanjing;;Sunnyvale", "aff_country_unique_index": "0;0;1;1;0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Su_2023_ICCV,\n \n author = {\n Su,\n Tiankang and Song,\n Huihui and Liu,\n Dong and Liu,\n Bo and Liu,\n Qingshan\n},\n title = {\n Unsupervised Video Object Segmentation with Online Adversarial Self-Tuning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 688-698\n} \n}" }, { "title": "UpCycling: Semi-supervised 3D Object Detection without Sharing Raw-level Unlabeled Scenes", @@ -62848,7 +64947,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Hwang_2023_ICCV,\n \n author = {\n Hwang,\n Sunwook and Kim,\n Youngseok and Kim,\n Seongwon and Bahk,\n Saewoong and Kim,\n Hyung-Sin\n},\n title = {\n UpCycling: Semi-supervised 3D Object Detection without Sharing Raw-level Unlabeled Scenes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23351-23361\n} \n}" }, { "title": "Urban Radiance Field Representation with Deformable Neural Mesh Primitives", @@ -62873,14 +64973,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lu_Urban_Radiance_Field_Representation_with_Deformable_Neural_Mesh_Primitives_ICCV_2023_paper.html", "aff_unique_index": "0;1;0+2+3;1+2;1;0", - "aff_unique_norm": "Tongji University;Chinese University of 
Hong Kong;Shanghai AI Laboratory;Center for Process Innovation and Integration", + "aff_unique_norm": "Tongji University;The Chinese University of Hong Kong;Shanghai AI Laboratory;Center for Process Innovation and Integration", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tongji.edu.cn;https://www.cuhk.edu.hk;https://www.shanghai-ai-lab.com;", "aff_unique_abbr": "Tongji;CUHK;SAIL;CPII", "aff_campus_unique_index": "1;;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0+0;0;0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Lu_2023_ICCV,\n \n author = {\n Lu,\n Fan and Xu,\n Yan and Chen,\n Guang and Li,\n Hongsheng and Lin,\n Kwan-Yee and Jiang,\n Changjun\n},\n title = {\n Urban Radiance Field Representation with Deformable Neural Mesh Primitives\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 465-476\n} \n}" }, { "title": "UrbanGIRAFFE: Representing Urban Scenes as Compositional Generative Neural Feature Fields", @@ -62912,7 +65013,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yuanbo and Yang,\n Yifei and Guo,\n Hanlei and Xiong,\n Rong and Wang,\n Yue and Liao,\n Yiyi\n},\n title = {\n UrbanGIRAFFE: Representing Urban Scenes as Compositional Generative Neural Feature Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9199-9210\n} \n}" }, { "title": "Using a Waffle Iron for Automotive Point Cloud Semantic Segmentation", @@ -62935,7 +65037,8 @@ "aff_domain": ";;", "email": ";;", "author_num": 3, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Puy_Using_a_Waffle_Iron_for_Automotive_Point_Cloud_Semantic_Segmentation_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Puy_Using_a_Waffle_Iron_for_Automotive_Point_Cloud_Semantic_Segmentation_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Puy_2023_ICCV,\n \n author = {\n Puy,\n Gilles and Boulch,\n Alexandre and Marlet,\n Renaud\n},\n title = {\n Using a Waffle Iron for Automotive Point Cloud Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3379-3389\n} \n}" }, { "title": "V-FUSE: Volumetric Depth Map Fusion with Long-Range Constraints", @@ -62967,7 +65070,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Burgdorfer_2023_ICCV,\n \n author = {\n Burgdorfer,\n Nathaniel and Mordohai,\n Philippos\n},\n title = {\n V-FUSE: Volumetric Depth Map Fusion with Long-Range Constraints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3449-3458\n} \n}" }, { "title": "V3Det: Vast Vocabulary Visual Detection Dataset", @@ -62992,14 +65096,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_V3Det_Vast_Vocabulary_Visual_Detection_Dataset_ICCV_2023_paper.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2;1;0;1;0;0;0+1+2", - "aff_unique_norm": "Shanghai AI Laboratory;Chinese University of Hong Kong;Centre of Perceptual and Interactive Intelligence", + "aff_unique_norm": "Shanghai AI Laboratory;The Chinese University of Hong Kong;Centre of Perceptual and Interactive Intelligence", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.shanghai-ai-lab.com;https://www.cuhk.edu.hk;", "aff_unique_abbr": "SAIL;CUHK;", "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Jiaqi and Zhang,\n Pan and Chu,\n Tao and Cao,\n Yuhang and Zhou,\n Yujie and Wu,\n Tong and Wang,\n Bin and He,\n Conghui and Lin,\n Dahua\n},\n title = {\n V3Det: Vast Vocabulary Visual Detection Dataset\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19844-19854\n} \n}" }, { "title": "VAD: Vectorized Scene Representation for Efficient Autonomous Driving", @@ -63031,7 +65136,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Bo and Chen,\n Shaoyu and Xu,\n Qing and Liao,\n Bencheng and Chen,\n Jiajie and Zhou,\n Helong and Zhang,\n Qian and Liu,\n Wenyu and Huang,\n Chang and Wang,\n Xinggang\n},\n title = {\n VAD: Vectorized Scene Representation for Efficient Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8340-8350\n} \n}" }, { "title": "VADER: Video Alignment Differencing and Retrieval", @@ -63063,7 +65169,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;1;1;0+1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Black_2023_ICCV,\n \n author = {\n Black,\n Alexander and Jenni,\n Simon and Bui,\n Tu and 
Tanjim,\n Md. Mehrab and Petrangeli,\n Stefano and Sinha,\n Ritwik and Swaminathan,\n Viswanathan and Collomosse,\n John\n},\n title = {\n VADER: Video Alignment Differencing and Retrieval\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22357-22367\n} \n}" }, { "title": "VAPCNet: Viewpoint-Aware 3D Point Cloud Completion", @@ -63086,7 +65193,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fu_VAPCNet_Viewpoint-Aware_3D_Point_Cloud_Completion_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Fu_VAPCNet_Viewpoint-Aware_3D_Point_Cloud_Completion_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Fu_2023_ICCV,\n \n author = {\n Fu,\n Zhiheng and Wang,\n Longguang and Xu,\n Lian and Wang,\n Zhiyong and Laga,\n Hamid and Guo,\n Yulan and Boussaid,\n Farid and Bennamoun,\n Mohammed\n},\n title = {\n VAPCNet: Viewpoint-Aware 3D Point Cloud Completion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12108-12118\n} \n}" }, { "title": "VI-Net: Boosting Category-level 6D Object Pose Estimation via Learning Decoupled Rotations on the Spherical Representations", @@ -63118,7 +65226,8 @@ "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lin_2023_ICCV,\n \n author = {\n Lin,\n Jiehong and Wei,\n Zewei and Zhang,\n Yabin and Jia,\n Kui\n},\n title = {\n VI-Net: Boosting Category-level 6D Object Pose Estimation via Learning Decoupled Rotations on the Spherical Representations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = 
{\n October\n},\n year = {\n 2023\n},\n pages = {\n 14001-14011\n} \n}" }, { "title": "VL-Match: Enhancing Vision-Language Pretraining with Token-Level and Instance-Level Matching", @@ -63143,14 +65252,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bi_VL-Match_Enhancing_Vision-Language_Pretraining_with_Token-Level_and_Instance-Level_Matching_ICCV_2023_paper.html", "aff_unique_index": "0+1;2;0+1;2;2;0+1;2;2;2;2", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Microsoft", - "aff_unique_dep": "Institute of Computing Technology;;Microsoft Corporation", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Microsoft Corporation", + "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.ict.ac.cn;http://www.ucas.ac.cn;https://www.microsoft.com", "aff_unique_abbr": "CAS;UCAS;Microsoft", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;1;0+0;1;1;0+0;1;1;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Bi_2023_ICCV,\n \n author = {\n Bi,\n Junyu and Cheng,\n Daixuan and Yao,\n Ping and Pang,\n Bochen and Zhan,\n Yuefeng and Yang,\n Chuanguang and Wang,\n Yujing and Sun,\n Hao and Deng,\n Weiwei and Zhang,\n Qi\n},\n title = {\n VL-Match: Enhancing Vision-Language Pretraining with Token-Level and Instance-Level Matching\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2584-2593\n} \n}" }, { "title": "VL-PET: Vision-and-Language Parameter-Efficient Tuning via Granularity Control", @@ -63175,14 +65285,15 @@ "author_num": 4, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Hu_VL-PET_Vision-and-Language_Parameter-Efficient_Tuning_via_Granularity_Control_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0+1+2", - "aff_unique_norm": "Chinese University of Hong Kong;Centre for Perceptual and Interactive Intelligence;Shanghai Artificial Intelligence Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;Centre for Perceptual and Interactive Intelligence;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cuhk.edu.hk;;http://www.shailab.org/", "aff_unique_abbr": "CUHK;;Shanghai AI Lab", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0+0", - "aff_country_unique": "China;" + "aff_country_unique": "China;", + "bibtex": "@InProceedings{Hu_2023_ICCV,\n \n author = {\n Hu,\n Zi-Yuan and Li,\n Yanyang and Lyu,\n Michael R. and Wang,\n Liwei\n},\n title = {\n VL-PET: Vision-and-Language Parameter-Efficient Tuning via Granularity Control\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3010-3020\n} \n}" }, { "title": "VLN-PETL: Parameter-Efficient Transfer Learning for Vision-and-Language Navigation", @@ -63207,14 +65318,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Qiao_VLN-PETL_Parameter-Efficient_Transfer_Learning_for_Vision-and-Language_Navigation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "University of Adelaide", + "aff_unique_norm": "The University of Adelaide", "aff_unique_dep": "Australian Institute for Machine Learning", "aff_unique_url": "https://www.adelaide.edu.au", "aff_unique_abbr": "Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Australia" + "aff_country_unique": "Australia", + "bibtex": 
"@InProceedings{Qiao_2023_ICCV,\n \n author = {\n Qiao,\n Yanyuan and Yu,\n Zheng and Wu,\n Qi\n},\n title = {\n VLN-PETL: Parameter-Efficient Transfer Learning for Vision-and-Language Navigation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15443-15452\n} \n}" }, { "title": "VLSlice: Interactive Vision-and-Language Slice Discovery", @@ -63246,7 +65358,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Slyman_2023_ICCV,\n \n author = {\n Slyman,\n Eric and Kahng,\n Minsuk and Lee,\n Stefan\n},\n title = {\n VLSlice: Interactive Vision-and-Language Slice Discovery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15291-15301\n} \n}" }, { "title": "VQ3D: Learning a 3D-Aware Generative Model on ImageNet", @@ -63278,7 +65391,8 @@ "aff_campus_unique_index": "0;2;2;2;0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Sargent_2023_ICCV,\n \n author = {\n Sargent,\n Kyle and Koh,\n Jing Yu and Zhang,\n Han and Chang,\n Huiwen and Herrmann,\n Charles and Srinivasan,\n Pratul and Wu,\n Jiajun and Sun,\n Deqing\n},\n title = {\n VQ3D: Learning a 3D-Aware Generative Model on ImageNet\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4240-4250\n} \n}" }, { "title": "VQA Therapy: Exploring Answer Differences by Visually Grounding Answers", @@ -63306,11 +65420,12 @@ "aff_unique_norm": "University of Texas at 
Austin;University of Colorado", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.colorado.edu", - "aff_unique_abbr": "UT Austin;CU", + "aff_unique_abbr": "UT Austin;CU Boulder", "aff_campus_unique_index": "0;1;0+1", "aff_campus_unique": "Austin;Boulder", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Chongyan and Anjum,\n Samreen and Gurari,\n Danna\n},\n title = {\n VQA Therapy: Exploring Answer Differences by Visually Grounding Answers\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15315-15325\n} \n}" }, { "title": "VQA-GNN: Reasoning with Multimodal Knowledge via Graph Neural Networks for Visual Question Answering", @@ -63342,7 +65457,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "Japan;United States" + "aff_country_unique": "Japan;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Yanan and Yasunaga,\n Michihiro and Ren,\n Hongyu and Wada,\n Shinya and Leskovec,\n Jure\n},\n title = {\n VQA-GNN: Reasoning with Multimodal Knowledge via Graph Neural Networks for Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21582-21592\n} \n}" }, { "title": "Vanishing Point Estimation in Uncalibrated Images with Prior Gravity Direction", @@ -63350,8 +65466,8 @@ "status": "Poster", "track": "main", "pid": "10761", - "author_site": "R\u00e9mi Pautrat, Shaohui Liu, Petr Hruby, Marc Pollefeys, Daniel Barath", - "author": "R\u00e9mi Pautrat; Shaohui Liu; Petr Hruby; Marc Pollefeys; Daniel Barath", + "author_site": "Rémi Pautrat, 
Shaohui Liu, Petr Hruby, Marc Pollefeys, Daniel Barath", + "author": "Rémi Pautrat; Shaohui Liu; Petr Hruby; Marc Pollefeys; Daniel Barath", "abstract": "We tackle the problem of estimating a Manhattan frame, i.e. three orthogonal vanishing points, and the unknown focal length of the camera, leveraging a prior vertical direction. The direction can come from an Inertial Measurement Unit that is a standard component of recent consumer devices, e.g., smartphones. We provide an exhaustive analysis of minimal line configurations and derive two new 2-line solvers, one of which does not suffer from singularities affecting existing solvers. Additionally, we design a new non-minimal method, running on an arbitrary number of lines, to boost the performance in local optimization. Combining all solvers in a hybrid robust estimator, our method achieves increased accuracy even with a rough prior. Experiments on synthetic and real-world datasets demonstrate the superior accuracy of our method compared to the state of the art, while having comparable runtimes. We further demonstrate the applicability of our solvers for relative rotation estimation. 
The code is available at https://github.com/cvg/VP-Estimation-with-Prior-Gravity.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Pautrat_Vanishing_Point_Estimation_in_Uncalibrated_Images_with_Prior_Gravity_Direction_ICCV_2023_paper.pdf", "aff": "Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich + Microsoft Mixed Reality and AI Zurich lab; Department of Computer Science, ETH Zurich", @@ -63374,7 +65490,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Pautrat_2023_ICCV,\n \n author = {\n Pautrat,\n R\\'emi and Liu,\n Shaohui and Hruby,\n Petr and Pollefeys,\n Marc and Barath,\n Daniel\n},\n title = {\n Vanishing Point Estimation in Uncalibrated Images with Prior Gravity Direction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 14118-14127\n} \n}" }, { "title": "Variational Causal Inference Network for Explanatory Visual Question Answering", @@ -63399,14 +65516,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xue_Variational_Causal_Inference_Network_for_Explanatory_Visual_Question_Answering_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1+2", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Pengcheng Laboratory", - "aff_unique_dep": "Institute of Automation;;Peng Cheng Laboratory", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peng Cheng Laboratory", + "aff_unique_dep": "Institute of Automation;;", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "CAS;UCAS;PCL", 
"aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Xue_2023_ICCV,\n \n author = {\n Xue,\n Dizhan and Qian,\n Shengsheng and Xu,\n Changsheng\n},\n title = {\n Variational Causal Inference Network for Explanatory Visual Question Answering\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2515-2525\n} \n}" }, { "title": "Variational Degeneration to Structural Refinement: A Unified Framework for Superimposed Image Decomposition", @@ -63438,7 +65556,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Tianjin", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Wenyu and Xu,\n Yan and Yang,\n Yang and Ji,\n Haoran and Lang,\n Yue\n},\n title = {\n Variational Degeneration to Structural Refinement: A Unified Framework for Superimposed Image Decomposition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12206-12216\n} \n}" }, { "title": "VeRi3D: Generative Vertex-based Radiance Fields for 3D Controllable Human Image Synthesis", @@ -63470,7 +65589,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Xinya and Huang,\n Jiaxin and Bin,\n Yanrui and Yu,\n Lu and Liao,\n Yiyi\n},\n title = {\n VeRi3D: Generative Vertex-based Radiance Fields for 3D Controllable Human Image Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n 
month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8986-8997\n} \n}" }, { "title": "Verbs in Action: Improving Verb Understanding in Video-Language Models", @@ -63502,7 +65622,8 @@ "aff_campus_unique_index": "0;1;1;0;1", "aff_campus_unique": "Oxford;Mountain View", "aff_country_unique_index": "0;1;1;0;1", - "aff_country_unique": "United Kingdom;United States" + "aff_country_unique": "United Kingdom;United States", + "bibtex": "@InProceedings{Momeni_2023_ICCV,\n \n author = {\n Momeni,\n Liliane and Caron,\n Mathilde and Nagrani,\n Arsha and Zisserman,\n Andrew and Schmid,\n Cordelia\n},\n title = {\n Verbs in Action: Improving Verb Understanding in Video-Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15579-15591\n} \n}" }, { "title": "Versatile Diffusion: Text, Images and Variations All in One Diffusion Model", @@ -63534,7 +65655,8 @@ "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Atlanta;Austin;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Xingqian and Wang,\n Zhangyang and Zhang,\n Gong and Wang,\n Kai and Shi,\n Humphrey\n},\n title = {\n Versatile Diffusion: Text,\n Images and Variations All in One Diffusion Model\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7754-7765\n} \n}" }, { "title": "VertexSerum: Poisoning Graph Neural Networks for Link Inference", @@ -63566,7 +65688,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ding_2023_ICCV,\n \n author = {\n Ding,\n Ruyi and 
Duan,\n Shijin and Xu,\n Xiaolin and Fei,\n Yunsi\n},\n title = {\n VertexSerum: Poisoning Graph Neural Networks for Link Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4532-4541\n} \n}" }, { "title": "ViLLA: Fine-Grained Vision-Language Representation Learning from Real-World Data", @@ -63598,7 +65721,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Varma_2023_ICCV,\n \n author = {\n Varma,\n Maya and Delbrouck,\n Jean-Benoit and Hooper,\n Sarah and Chaudhari,\n Akshay and Langlotz,\n Curtis\n},\n title = {\n ViLLA: Fine-Grained Vision-Language Representation Learning from Real-World Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22225-22235\n} \n}" }, { "title": "ViLTA: Enhancing Vision-Language Pre-training through Textual Augmentation", @@ -63630,7 +65754,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Weihan and Yang,\n Zhen and Xu,\n Bin and Li,\n Juanzi and Sun,\n Yankui\n},\n title = {\n ViLTA: Enhancing Vision-Language Pre-training through Textual Augmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3158-3169\n} \n}" }, { "title": "ViM: Vision Middleware for Unified Downstream Transferring", @@ -63662,7 +65787,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", - 
"aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Feng_2023_ICCV,\n \n author = {\n Feng,\n Yutong and Gong,\n Biao and Jiang,\n Jianwen and Lv,\n Yiliang and Shen,\n Yujun and Zhao,\n Deli and Zhou,\n Jingren\n},\n title = {\n ViM: Vision Middleware for Unified Downstream Transferring\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11696-11707\n} \n}" }, { "title": "VidStyleODE: Disentangled Video Editing via StyleGAN and NeuralODEs", @@ -63694,7 +65820,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2;0;0;0", - "aff_country_unique": "T\u00fcrkiye;United Kingdom;United States" + "aff_country_unique": "Turkey;United Kingdom;United States", + "bibtex": "@InProceedings{Ali_2023_ICCV,\n \n author = {\n Ali,\n Moayed Haji and Bond,\n Andrew and Birdal,\n Tolga and Ceylan,\n Duygu and Karacan,\n Levent and Erdem,\n Erkut and Erdem,\n Aykut\n},\n title = {\n VidStyleODE: Disentangled Video Editing via StyleGAN and NeuralODEs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7523-7534\n} \n}" }, { "title": "Video Action Recognition with Attentive Semantic Units", @@ -63706,7 +65833,7 @@ "author": "Yifei Chen; Dapeng Chen; Ruijin Liu; Hao Li; Wei Peng", "abstract": "Visual-Language Models (VLMs) have significantly advanced video action recognition. Supervised by the semantics of action labels, recent works adapt the visual branch of VLMs to learn video representations. Despite the effectiveness proved by these works, we believe that the potential of VLMs has yet to be fully harnessed. 
In light of this, we exploit the semantic units (SU) hiding behind the action labels and leverage their correlations with fine-grained items in frames for more accurate action recognition. SUs are entities extracted from the language descriptions of the entire action set, including body parts, objects, scenes, and motions. To further enhance the alignments between visual contents and the SUs, we introduce a multi-region module (MRA) to the visual branch of the VLM. The MRA allows the perception of region-aware visual features beyond the original global feature. Our method adaptively attends to and selects relevant SUs with visual features of frames. With a cross-modal decoder, the selected SUs serve to decode spatiotemporal video representations. In summary, the SUs as the medium can boost discriminative ability and transferability. Specifically, in fully-supervised learning, our method achieved 87.8% top-1 accuracy on Kinetics-400. In K=2 few-shot experiments, our method surpassed the previous state-of-the-art by +7.1% and +15.0% on HMDB-51 and UCF-101, respectively.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Chen_Video_Action_Recognition_with_Attentive_Semantic_Units_ICCV_2023_paper.pdf", - "aff": "IIRC, Huawei; IIRC, Huawei; Xi\u2019an Jiaotong University; Xiamen University; IIRC, Huawei", + "aff": "IIRC, Huawei; IIRC, Huawei; Xi’an Jiaotong University; Xiamen University; IIRC, Huawei", "project": "", "github": "", "supp": "", @@ -63719,14 +65846,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Video_Action_Recognition_with_Attentive_Semantic_Units_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "Huawei;Xi'an Jiao Tong University;Xiamen University", - "aff_unique_dep": "Huawei;;", + "aff_unique_norm": "Huawei;Xi'an Jiaotong University;Xiamen University", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.huawei.com;https://www.xjtu.edu.cn;https://www.xmu.edu.cn", 
"aff_unique_abbr": "Huawei;XJTU;XMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yifei and Chen,\n Dapeng and Liu,\n Ruijin and Li,\n Hao and Peng,\n Wei\n},\n title = {\n Video Action Recognition with Attentive Semantic Units\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10170-10180\n} \n}" }, { "title": "Video Action Segmentation via Contextually Refined Temporal Keypoints", @@ -63758,7 +65886,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Jiang_2023_ICCV,\n \n author = {\n Jiang,\n Borui and Jin,\n Yang and Tan,\n Zhentao and Mu,\n Yadong\n},\n title = {\n Video Action Segmentation via Contextually Refined Temporal Keypoints\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13836-13845\n} \n}" }, { "title": "Video Adverse-Weather-Component Suppression Network via Weather Messenger and Adversarial Backpropagation", @@ -63782,15 +65911,16 @@ "email": "ust.hk;eng.cam.ac.uk;ihpc.a-star.edu.sg;tju.edu.cn;hkmu.edu.hk;ust.hk", "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yang_Video_Adverse-Weather-Component_Suppression_Network_via_Weather_Messenger_and_Adversarial_Backpropagation_ICCV_2023_paper.html", - "aff_unique_index": "0+0;1;2;3;4;0+0", - "aff_unique_norm": "Hong Kong University of Science and Technology;University of Cambridge;Agency for Science, Technology and Research;Tianjin University;Hong Kong Metropolitan University", - "aff_unique_dep": ";;Institute of High 
Performance Computing;;", - "aff_unique_url": "https://www.ust.hk;https://www.cam.ac.uk;https://www.a-star.edu.sg;http://www.tju.edu.cn;https://www.hkmu.edu.hk", - "aff_unique_abbr": "HKUST;Cambridge;A*STAR;TJU;HKMU", + "aff_unique_index": "0+1;2;3;4;5;0+1", + "aff_unique_norm": "The Hong Kong University of Science and Technology;Hong Kong University of Science and Technology;University of Cambridge;Agency for Science, Technology and Research;Tianjin University;Hong Kong Metropolitan University", + "aff_unique_dep": ";;;Institute of High Performance Computing;;", + "aff_unique_url": "https://www.ust.hk;https://www.ust.hk;https://www.cam.ac.uk;https://www.a-star.edu.sg;http://www.tju.edu.cn;https://www.hkmu.edu.hk", + "aff_unique_abbr": "HKUST;HKUST;Cambridge;A*STAR;TJU;HKMU", "aff_campus_unique_index": "0+1;2;1;0+1", "aff_campus_unique": "Guangzhou;Hong Kong SAR;Cambridge;", "aff_country_unique_index": "0+0;1;2;0;0;0+0", - "aff_country_unique": "China;United Kingdom;Singapore" + "aff_country_unique": "China;United Kingdom;Singapore", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yijun and Aviles-Rivero,\n Angelica I. 
and Fu,\n Huazhu and Liu,\n Ye and Wang,\n Weiming and Zhu,\n Lei\n},\n title = {\n Video Adverse-Weather-Component Suppression Network via Weather Messenger and Adversarial Backpropagation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13200-13210\n} \n}" }, { "title": "Video Anomaly Detection via Sequentially Learning Multiple Pretext Tasks", @@ -63822,7 +65952,8 @@ "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Chenrui and Sun,\n Che and Wu,\n Yuwei and Jia,\n Yunde\n},\n title = {\n Video Anomaly Detection via Sequentially Learning Multiple Pretext Tasks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10330-10340\n} \n}" }, { "title": "Video Background Music Generation: Dataset, Method and Evaluation", @@ -63854,7 +65985,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Edinburgh", "aff_country_unique_index": "0;0;0;0+1;0;0;0;0;0;0", - "aff_country_unique": "China;United Kingdom" + "aff_country_unique": "China;United Kingdom", + "bibtex": "@InProceedings{Zhuo_2023_ICCV,\n \n author = {\n Zhuo,\n Le and Wang,\n Zhaokai and Wang,\n Baisen and Liao,\n Yue and Bao,\n Chenxi and Peng,\n Stanley and Han,\n Songhao and Zhang,\n Aixi and Fang,\n Fei and Liu,\n Si\n},\n title = {\n Video Background Music Generation: Dataset,\n Method and Evaluation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15637-15647\n} \n}" }, { "title": "Video OWL-ViT: Temporally-consistent Open-world Localization in Video", @@ -63863,7 
+65995,7 @@ "track": "main", "pid": "12571", "author_site": "Georg Heigold, Matthias Minderer, Alexey Gritsenko, Alex Bewley, Daniel Keysers, Mario Lu?i?, Fisher Yu, Thomas Kipf", - "author": "Georg Heigold; Matthias Minderer; Alexey Gritsenko; Alex Bewley; Daniel Keysers; Mario Lu\u010di\u0107; Fisher Yu; Thomas Kipf", + "author": "Georg Heigold; Matthias Minderer; Alexey Gritsenko; Alex Bewley; Daniel Keysers; Mario Lučić; Fisher Yu; Thomas Kipf", "abstract": "We present an architecture and a training recipe that adapts pretrained open-world image models to localization in videos. Understanding the open visual world (without being constrained by fixed label spaces) is crucial for many real-world vision tasks. Contrastive pre-training on large image-text datasets has recently led to significant improvements for image-level tasks. For more structured tasks involving object localization applying pre-trained models is more challenging. This is particularly true for video tasks, where task-specific data is limited. We show successful transfer of open-world models by building on the OWL-ViT open-vocabulary detection model and adapting it to video by adding a transformer decoder. The decoder propagates object representations recurrently through time by using the output tokens for one frame as the object queries for the next. Our model is end-to-end trainable on video data and enjoys improved temporal consistency compared to tracking-by-detection baselines, while retaining the open-world capabilities of the backbone detector. 
We evaluate our model on the challenging TAO-OW benchmark and demonstrate that open-world capabilities, learned from large-scale image-text pretraining, can be transferred successfully to open-world localization across diverse videos.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Heigold_Video_OWL-ViT_Temporally-consistent_Open-world_Localization_in_Video_ICCV_2023_paper.pdf", "aff": "Google DeepMind; Google DeepMind; Google DeepMind; Google DeepMind; Google DeepMind; Google DeepMind; ETH Zurich; Google DeepMind", @@ -63880,13 +66012,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Heigold_Video_OWL-ViT_Temporally-consistent_Open-world_Localization_in_Video_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;0;1;0", "aff_unique_norm": "Google;ETH Zurich", - "aff_unique_dep": "Google DeepMind;", + "aff_unique_dep": "DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.ethz.ch", "aff_unique_abbr": "DeepMind;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1;0", - "aff_country_unique": "United Kingdom;Switzerland" + "aff_country_unique": "United Kingdom;Switzerland", + "bibtex": "@InProceedings{Heigold_2023_ICCV,\n \n author = {\n Heigold,\n Georg and Minderer,\n Matthias and Gritsenko,\n Alexey and Bewley,\n Alex and Keysers,\n Daniel and Lu\\v{c\n}i\\'c,\n Mario and Yu,\n Fisher and Kipf,\n Thomas\n},\n title = {\n Video OWL-ViT: Temporally-consistent Open-world Localization in Video\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13802-13811\n} \n}" }, { "title": "Video Object Segmentation-aware Video Frame Interpolation", @@ -63918,7 +66051,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": 
"@InProceedings{Yoo_2023_ICCV,\n \n author = {\n Yoo,\n Jun-Sang and Lee,\n Hongjae and Jung,\n Seung-Won\n},\n title = {\n Video Object Segmentation-aware Video Frame Interpolation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12322-12333\n} \n}" }, { "title": "Video State-Changing Object Segmentation", @@ -63943,14 +66077,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Yu_Video_State-Changing_Object_Segmentation_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;2;0", - "aff_unique_norm": "University of Illinois Urbana-Champaign;Carnegie Mellon University;Tencent", - "aff_unique_dep": ";;AI Lab", + "aff_unique_norm": "University of Illinois at Urbana-Champaign;Carnegie Mellon University;Tencent", + "aff_unique_dep": ";;Tencent AI Lab", "aff_unique_url": "https://illinois.edu;https://www.cmu.edu;https://ai.tencent.com", "aff_unique_abbr": "UIUC;CMU;Tencent AI Lab", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Urbana-Champaign;;Bellevue", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Jiangwei and Li,\n Xiang and Zhao,\n Xinran and Zhang,\n Hongming and Wang,\n Yu-Xiong\n},\n title = {\n Video State-Changing Object Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20439-20448\n} \n}" }, { "title": "Video Task Decathlon: Unifying Image and Video Tasks in Autonomous Driving", @@ -63973,7 +66108,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Video_Task_Decathlon_Unifying_Image_and_Video_Tasks_in_Autonomous_ICCV_2023_paper.html" + "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Huang_Video_Task_Decathlon_Unifying_Image_and_Video_Tasks_in_Autonomous_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Thomas E. and Liu,\n Yifan and Van Gool,\n Luc and Yu,\n Fisher\n},\n title = {\n Video Task Decathlon: Unifying Image and Video Tasks in Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8647-8657\n} \n}" }, { "title": "Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition", @@ -63985,7 +66121,7 @@ "author": "Syed Talal Wasim; Muhammad Uzair Khattak; Muzammal Naseer; Salman Khan; Mubarak Shah; Fahad Shahbaz Khan", "abstract": "Recent video recognition models utilize Transformer models for long-range spatio-temporal context modeling. Video transformer designs are based on self-attention that can model global context at a high computational cost. In comparison, convolutional designs for videos offer an efficient alternative but lack long-range dependency modeling. Towards achieving the best of both designs, this work proposes Video-FocalNet, an effective and efficient architecture for video recognition that models both local and global contexts. Video-FocalNet is based on a spatio-temporal focal modulation architecture that reverses the interaction and aggregation steps of self-attention for better efficiency. Further, the aggregation step and the interaction step are both implemented using efficient convolution and element-wise multiplication operations that are computationally less expensive than their self-attention counterparts on video representations. We extensively explore the design space of focal modulation-based spatio-temporal context modeling and demonstrate our parallel spatial and temporal encoding design to be the optimal choice. 
Video-FocalNets perform favorably well against the state-of-the-art transformer-based models for video recognition on five large-scale datasets (Kinetics-400, Kinetics-600, SS-v2, Diving-48, and ActivityNet-1.3) at a lower computational cost. Our code/models are released at https://github.com/TalalWasim/Video-FocalNets.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Wasim_Video-FocalNets_Spatio-Temporal_Focal_Modulation_for_Video_Action_Recognition_ICCV_2023_paper.pdf", - "aff": "Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI + Australian National University; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI + Link\u00f6ping University; University of Central Florida; Mohamed bin Zayed University of AI + Link\u00f6ping University", + "aff": "Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI + Australian National University; Mohamed bin Zayed University of AI; Mohamed bin Zayed University of AI + Linköping University; University of Central Florida; Mohamed bin Zayed University of AI + Linköping University", "project": "", "github": "https://github.com/TalalWasim/Video-FocalNets", "supp": "", @@ -63998,14 +66134,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wasim_Video-FocalNets_Spatio-Temporal_Focal_Modulation_for_Video_Action_Recognition_ICCV_2023_paper.html", "aff_unique_index": "0;0+1;0;0+2;3;0+2", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Link\u00f6ping University;University of Central Florida", + "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Australian National University;Linköping University;University of Central Florida", "aff_unique_dep": ";;;", "aff_unique_url": "https://mbzuai.ac.ae;https://www.anu.edu.au;https://www.liu.se;https://www.ucf.edu", "aff_unique_abbr": "MBZUAI;ANU;LiU;UCF", "aff_campus_unique_index": ";;", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0+1;0;0+2;3;0+2", - "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States" + "aff_country_unique": "United Arab Emirates;Australia;Sweden;United States", + "bibtex": "@InProceedings{Wasim_2023_ICCV,\n \n author = {\n Wasim,\n Syed Talal and Khattak,\n Muhammad Uzair and Naseer,\n Muzammal and Khan,\n Salman and Shah,\n Mubarak and Khan,\n Fahad Shahbaz\n},\n title = {\n Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13778-13789\n} \n}" }, { "title": "VideoFlow: Exploiting Temporal Cues for Multi-frame Optical Flow Estimation", @@ -64030,14 +66167,15 @@ "author_num": 10, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Shi_VideoFlow_Exploiting_Temporal_Cues_for_Multi-frame_Optical_Flow_Estimation_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;0;0;1;1;2;3;0+4+5", - "aff_unique_norm": "Chinese University of Hong Kong;NVIDIA;SenseTime;Tsinghua University;Centre for Perceptual and Interactive Intelligence;Shanghai AI Laboratory", + "aff_unique_norm": "The Chinese University of Hong Kong;NVIDIA;SenseTime;Tsinghua University;Centre for Perceptual and Interactive Intelligence;Shanghai AI Laboratory", "aff_unique_dep": "Multimedia Laboratory;NVIDIA AI Technology Center;SenseTime Research;;Centre for Perceptual and Interactive Intelligence;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.nvidia.com/en-us/research/;https://www.sensetime.com;https://www.tsinghua.edu.cn;;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "CUHK;NVIDIA;SenseTime;THU;CPII;SAIL", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;1;1;0;0;0+0", - "aff_country_unique": "China;United States;" + "aff_country_unique": "China;United States;", + "bibtex": 
"@InProceedings{Shi_2023_ICCV,\n \n author = {\n Shi,\n Xiaoyu and Huang,\n Zhaoyang and Bian,\n Weikang and Li,\n Dasong and Zhang,\n Manyuan and Cheung,\n Ka Chun and See,\n Simon and Qin,\n Hongwei and Dai,\n Jifeng and Li,\n Hongsheng\n},\n title = {\n VideoFlow: Exploiting Temporal Cues for Multi-frame Optical Flow Estimation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 12469-12480\n} \n}" }, { "title": "View Consistent Purification for Accurate Cross-View Localization", @@ -64069,7 +66207,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;1;1;0", - "aff_country_unique": "Australia;United States" + "aff_country_unique": "Australia;United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Shan and Zhang,\n Yanhao and Perincherry,\n Akhil and Vora,\n Ankit and Li,\n Hongdong\n},\n title = {\n View Consistent Purification for Accurate Cross-View Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8197-8206\n} \n}" }, { "title": "ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding", @@ -64094,14 +66233,15 @@ "author_num": 7, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Guo_ViewRefer_Grasp_the_Multi-view_Knowledge_for_3D_Visual_Grounding_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;0;0;0+2;0+2", - "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;Chinese University of Hong Kong;Northwestern Polytechnical University", + "aff_unique_norm": "Shanghai Artificial Intelligence Laboratory;The Chinese University of Hong Kong;Northwestern Polytechnical University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.shailab.org/;https://www.cuhk.edu.hk;https://www.nwpu.edu.cn", "aff_unique_abbr": 
"Shanghai AI Lab;CUHK;NWPU", "aff_campus_unique_index": "1;1;1;;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0+0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Zoey and Tang,\n Yiwen and Zhang,\n Ray and Wang,\n Dong and Wang,\n Zhigang and Zhao,\n Bin and Li,\n Xuelong\n},\n title = {\n ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15372-15383\n} \n}" }, { "title": "Viewing Graph Solvability in Practice", @@ -64133,7 +66273,8 @@ "aff_campus_unique_index": "1", "aff_campus_unique": ";Prague", "aff_country_unique_index": "0;1;0", - "aff_country_unique": "Italy;Czechia" + "aff_country_unique": "Italy;Czechia", + "bibtex": "@InProceedings{Arrigoni_2023_ICCV,\n \n author = {\n Arrigoni,\n Federica and Pajdla,\n Tomas and Fusiello,\n Andrea\n},\n title = {\n Viewing Graph Solvability in Practice\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8147-8155\n} \n}" }, { "title": "Viewset Diffusion: (0-)Image-Conditioned 3D Generative Models from 2D Data", @@ -64145,7 +66286,7 @@ "author": "Stanislaw Szymanowicz; Christian Rupprecht; Andrea Vedaldi", "abstract": "We present Viewset Diffusion, a diffusion-based generator that outputs 3D objects while only using multi-view 2D data for supervision. We note that there exists a one-to-one mapping between viewsets, i.e., collections of several 2D views of an object, and 3D models. Hence, we train a diffusion model to generate viewsets, but design the neural network generator to reconstruct internally corresponding 3D models, thus generating those too. 
We fit a diffusion model to a large number of viewsets for a given category of objects. The resulting generator can be conditioned on zero, one or more input views. Conditioned on a single view, it performs 3D reconstruction accounting for the ambiguity of the task and allowing to sample multiple solutions compatible with the input. The model performs reconstruction efficiently, in a feed-forward manner, and is trained using only rendering losses using as few as three views per viewset. Project page: szymanowiczs.github.io/viewset-diffusion", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Szymanowicz_Viewset_Diffusion_0-Image-Conditioned_3D_Generative_Models_from_2D_Data_ICCV_2023_paper.pdf", - "aff": "Visual Geometry Group \u2014 University of Oxford; Visual Geometry Group \u2014 University of Oxford; Visual Geometry Group \u2014 University of Oxford", + "aff": "Visual Geometry Group — University of Oxford; Visual Geometry Group — University of Oxford; Visual Geometry Group — University of Oxford", "project": "szymanowiczs.github.io/viewset-diffusion", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Szymanowicz_Viewset_Diffusion_0-Image-Conditioned_ICCV_2023_supplemental.pdf", @@ -64165,7 +66306,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Szymanowicz_2023_ICCV,\n \n author = {\n Szymanowicz,\n Stanislaw and Rupprecht,\n Christian and Vedaldi,\n Andrea\n},\n title = {\n Viewset Diffusion: (0-)Image-Conditioned 3D Generative Models from 2D Data\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 8863-8873\n} \n}" }, { "title": "ViperGPT: Visual Inference via Python Execution for Reasoning", @@ -64173,8 +66315,8 @@ "status": "Oral", 
"track": "main", "pid": "11390", - "author_site": "D\u00eddac Sur\u00eds, Sachit Menon, Carl Vondrick", - "author": "D\u00eddac Sur\u00eds; Sachit Menon; Carl Vondrick", + "author_site": "Dídac Surís, Sachit Menon, Carl Vondrick", + "author": "Dídac Surís; Sachit Menon; Carl Vondrick", "abstract": "Answering visual queries is a complex task that requires both visual processing and reasoning. End-to-end models, the dominant approach for this task, do not explicitly differentiate between the two, limiting interpretability and generalization. Learning modular programs presents a promising alternative, but has proven challenging due to the difficulty of learning both the programs and modules simultaneously. We introduce ViperGPT, a framework that leverages code-generation models to compose vision-and-language models into subroutines to produce a result for any query. ViperGPT utilizes a provided API to access the available modules, and composes them by generating Python code that is later executed. 
This simple approach requires no further training, and achieves state-of-the-art results across various complex visual tasks.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Suris_ViperGPT_Visual_Inference_via_Python_Execution_for_Reasoning_ICCV_2023_paper.pdf", "aff": "Columbia University; Columbia University; Columbia University", @@ -64197,7 +66339,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Suris_2023_ICCV,\n \n author = {\n Sur{\\'\\i\n}s,\n D{\\'\\i\n}dac and Menon,\n Sachit and Vondrick,\n Carl\n},\n title = {\n ViperGPT: Visual Inference via Python Execution for Reasoning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11888-11898\n} \n}" }, { "title": "Virtual Try-On with Pose-Garment Keypoints Guided Inpainting", @@ -64222,14 +66365,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Li_Virtual_Try-On_with_Pose-Garment_Keypoints_Guided_Inpainting_ICCV_2023_paper.html", "aff_unique_index": "0+1;0;0;0;1", - "aff_unique_norm": "ByteDance;Nanyang Technological University", + "aff_unique_norm": "Bytedance;Nanyang Technological University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bytedance.com;https://www.ntu.edu.sg", "aff_unique_abbr": "Bytedance;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;1", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Zhi and Wei,\n Pengfei and Yin,\n Xiang and Ma,\n Zejun and Kot,\n Alex C.\n},\n title = {\n Virtual Try-On with Pose-Garment Keypoints Guided Inpainting\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on 
Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22788-22797\n} \n}" }, { "title": "Visible-Infrared Person Re-Identification via Semantic Alignment and Affinity Inference", @@ -64261,7 +66405,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Fang_2023_ICCV,\n \n author = {\n Fang,\n Xingye and Yang,\n Yang and Fu,\n Ying\n},\n title = {\n Visible-Infrared Person Re-Identification via Semantic Alignment and Affinity Inference\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11270-11279\n} \n}" }, { "title": "Vision Grid Transformer for Document Layout Analysis", @@ -64293,7 +66438,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Da_2023_ICCV,\n \n author = {\n Da,\n Cheng and Luo,\n Chuwei and Zheng,\n Qi and Yao,\n Cong\n},\n title = {\n Vision Grid Transformer for Document Layout Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19462-19472\n} \n}" }, { "title": "Vision HGNN: An Image is More than a Graph of Nodes", @@ -64318,14 +66464,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Han_Vision_HGNN_An_Image_is_More_than_a_Graph_of_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;0;0", - "aff_unique_norm": "University of Texas at Austin;Intel", + "aff_unique_norm": "University of Texas at Austin;Intel Corporation", "aff_unique_dep": ";Intel Labs", "aff_unique_url": "https://www.utexas.edu;https://www.intel.com", "aff_unique_abbr": "UT Austin;Intel", 
"aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Han_2023_ICCV,\n \n author = {\n Han,\n Yan and Wang,\n Peihao and Kundu,\n Souvik and Ding,\n Ying and Wang,\n Zhangyang\n},\n title = {\n Vision HGNN: An Image is More than a Graph of Nodes\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19878-19888\n} \n}" }, { "title": "Vision Relation Transformer for Unbiased Scene Graph Generation", @@ -64350,14 +66497,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sudhakaran_Vision_Relation_Transformer_for_Unbiased_Scene_Graph_Generation_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+2+1;0+2+1", - "aff_unique_norm": "Technical University of Darmstadt;Hessian Center for AI;Technische Universit\u00e4t Darmstadt", + "aff_unique_norm": "Technical University of Darmstadt;Hessian Center for AI;Technische Universität Darmstadt", "aff_unique_dep": "Department of Computer Science;AI Research;Centre for Cognitive Science", "aff_unique_url": "https://www.tu-darmstadt.de;https://hessian.ai;https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD;hessian.AI;TU Darmstadt", "aff_campus_unique_index": ";;1;1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": "0+0;0+0;0+0+0;0+0+0", - "aff_country_unique": "Germany" + "aff_country_unique": "Germany", + "bibtex": "@InProceedings{Sudhakaran_2023_ICCV,\n \n author = {\n Sudhakaran,\n Gopika and Dhami,\n Devendra Singh and Kersting,\n Kristian and Roth,\n Stefan\n},\n title = {\n Vision Relation Transformer for Unbiased Scene Graph Generation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 
21882-21893\n} \n}" }, { "title": "Vision Transformer Adapters for Generalizable Multitask Learning", @@ -64365,8 +66513,8 @@ "status": "Poster", "track": "main", "pid": "10950", - "author_site": "Deblina Bhattacharjee, Sabine S\u00fcsstrunk, Mathieu Salzmann", - "author": "Deblina Bhattacharjee; Sabine S\u00fcsstrunk; Mathieu Salzmann", + "author_site": "Deblina Bhattacharjee, Sabine Süsstrunk, Mathieu Salzmann", + "author": "Deblina Bhattacharjee; Sabine Süsstrunk; Mathieu Salzmann", "abstract": "We introduce the first multitasking vision transformer adapters that learn generalizable task affinities which can be applied to novel tasks and domains. Integrated into an off-the-shelf vision transformer backbone, our adapters can simultaneously solve multiple dense vision tasks in a parameter-efficient manner, unlike existing multitasking transformers that are parametrically expensive. In contrast to concurrent methods, we do not require retraining or fine-tuning whenever a new task or domain is added. We introduce a task-adapted attention mechanism within our adapter framework that combines gradient-based task similarities with attention-based ones. The learned task affinities generalize to the following settings: zero-shot task transfer, unsupervised domain adaptation, and generalization without fine-tuning to novel domains. We demonstrate that our approach outperforms not only the existing convolutional neural network-based multitasking methods but also the vision transformer-based ones. 
Our project page is at https://ivrl.github.io/VTAGML.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Bhattacharjee_Vision_Transformer_Adapters_for_Generalizable_Multitask_Learning_ICCV_2023_paper.pdf", "aff": "School of Computer and Communication Sciences, EPFL, Switzerland; School of Computer and Communication Sciences, EPFL, Switzerland; School of Computer and Communication Sciences, EPFL, Switzerland", @@ -64382,14 +66530,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bhattacharjee_Vision_Transformer_Adapters_for_Generalizable_Multitask_Learning_ICCV_2023_paper.html", "aff_unique_index": "0;0;0", - "aff_unique_norm": "EPFL", + "aff_unique_norm": "École Polytechnique Fédérale de Lausanne", "aff_unique_dep": "School of Computer and Communication Sciences", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Switzerland" + "aff_country_unique": "Switzerland", + "bibtex": "@InProceedings{Bhattacharjee_2023_ICCV,\n \n author = {\n Bhattacharjee,\n Deblina and S\\"usstrunk,\n Sabine and Salzmann,\n Mathieu\n},\n title = {\n Vision Transformer Adapters for Generalizable Multitask Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19015-19026\n} \n}" }, { "title": "Visual Explanations via Iterated Integrated Attributions", @@ -64398,7 +66547,7 @@ "track": "main", "pid": "12462", "author_site": "Oren Barkan, ?Yehonatan Elisha??, Yuval Asher, Amit Eshel, Noam Koenigstein", - "author": "Oren Barkan; \u202aYehonatan Elisha\u202c\u200f; Yuval Asher; Amit Eshel; Noam Koenigstein", + "author": "Oren Barkan; ‪Yehonatan Elisha‬‏; Yuval Asher; Amit Eshel; Noam Koenigstein", "abstract": "We introduce Iterated Integrated Attributions (IIA) - a generic method for explaining the 
predictions of vision models. IIA employs iterative integration across the input image, the internal representations generated by the model, and their gradients, yielding precise and focused explanation maps. We demonstrate the effectiveness of IIA through comprehensive evaluations across various tasks, datasets, and network architectures. Our results showcase that IIA produces accurate explanation maps, outperforming other state-of-the-art explanation techniques.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Barkan_Visual_Explanations_via_Iterated_Integrated_Attributions_ICCV_2023_paper.pdf", "aff": "The Open University; The Open University; Tel Aviv University; Tel Aviv University; Tel Aviv University", @@ -64414,14 +66563,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Barkan_Visual_Explanations_via_Iterated_Integrated_Attributions_ICCV_2023_paper.html", "aff_unique_index": "0;0;1;1;1", - "aff_unique_norm": "Open University;Tel Aviv University", + "aff_unique_norm": "The Open University;Tel Aviv University", "aff_unique_dep": ";", "aff_unique_url": "https://www.open.ac.uk;https://www.tau.ac.il", "aff_unique_abbr": "OU;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1", - "aff_country_unique": "United Kingdom;Israel" + "aff_country_unique": "United Kingdom;Israel", + "bibtex": "@InProceedings{Barkan_2023_ICCV,\n \n author = {\n Barkan,\n Oren and Elisha‬‏,\n ‪Yehonatan and Asher,\n Yuval and Eshel,\n Amit and Koenigstein,\n Noam\n},\n title = {\n Visual Explanations via Iterated Integrated Attributions\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2073-2084\n} \n}" }, { "title": "Visual Traffic Knowledge Graph Generation from Scene Images", @@ -64446,14 +66596,15 @@ "author_num": 7, "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Guo_Visual_Traffic_Knowledge_Graph_Generation_from_Scene_Images_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+1;2;2;2;0+1", - "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Tencent", + "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Tencent Technology (Beijing) Co., Ltd.", "aff_unique_dep": "Institute of Automation;School of Artificial Intelligence;T Lab, Tencent Map", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;https://www.tencent.com", "aff_unique_abbr": "CAS;UCAS;Tencent", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Guo_2023_ICCV,\n \n author = {\n Guo,\n Yunfei and Yin,\n Fei and Li,\n Xiao-hui and Yan,\n Xudong and Xue,\n Tao and Mei,\n Shuqi and Liu,\n Cheng-Lin\n},\n title = {\n Visual Traffic Knowledge Graph Generation from Scene Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21604-21613\n} \n}" }, { "title": "Visually-Prompted Language Model for Fine-Grained Scene Graph Generation in an Open World", @@ -64485,7 +66636,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", - "aff_country_unique": "China;Singapore" + "aff_country_unique": "China;Singapore", + "bibtex": "@InProceedings{Yu_2023_ICCV,\n \n author = {\n Yu,\n Qifan and Li,\n Juncheng and Wu,\n Yu and Tang,\n Siliang and Ji,\n Wei and Zhuang,\n Yueting\n},\n title = {\n Visually-Prompted Language Model for Fine-Grained Scene Graph Generation in an Open World\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 21560-21571\n} \n}" }, { "title": "VoroMesh: Learning Watertight Surface Meshes with Voronoi Diagrams", @@ -64497,7 +66649,7 @@ "author": "Nissim Maruani; Roman Klokov; Maks Ovsjanikov; Pierre Alliez; Mathieu Desbrun", "abstract": "In stark contrast to the case of images, finding a concise, learnable discrete representation of 3D surfaces remains a challenge. In particular, while polygon meshes are arguably the most common surface representation used in geometry processing, their irregular and combinatorial structure often make them unsuitable for learning-based applications. In this work, we present VoroMesh, a novel and differentiable of watertight 3D shape surfaces. From a set of 3D points (called generators) and their associated occupancy, we define our boundary representation through the Voronoi diagram of the generators as the subset of Voronoi faces whose two associated (equidistant) generators are of opposite occupancy: the resulting polygon mesh forms a watertight approximation of the target shape's boundary. To learn the position of the generators, we propose a novel loss function, dubbed VoroLoss, that minimizes the distance from ground truth surface samples to the closest faces of the Voronoi diagram which does not require an explicit construction of the entire Voronoi diagram. A direct optimization of the Voroloss to obtain generators on the Thingi32 dataset demonstrates the geometric efficiency of our representation compared to axiomatic meshing algorithms and recent learning-based mesh representations. 
We further use VoroMesh in a learning-based mesh prediction task from input SDF grids on the ABC dataset, and show comparable performance to state-of-the-art methods while guaranteeing closed output surfaces free of self-intersections.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Maruani_VoroMesh_Learning_Watertight_Surface_Meshes_with_Voronoi_Diagrams_ICCV_2023_paper.pdf", - "aff": "Inria, Universit \u00b4e C\u02c6ote d\u2019Azur; LIX, \u00b4Ecole Polytechnique, IP Paris; LIX, \u00b4Ecole Polytechnique, IP Paris; Inria, Universit \u00b4e C\u02c6ote d\u2019Azur; Inria Saclay - Ecole Polytechnique", + "aff": "Inria, Universit ´e Cˆote d’Azur; LIX, ´Ecole Polytechnique, IP Paris; LIX, ´Ecole Polytechnique, IP Paris; Inria, Universit ´e Cˆote d’Azur; Inria Saclay - Ecole Polytechnique", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Maruani_VoroMesh_Learning_Watertight_ICCV_2023_supplemental.pdf", @@ -64510,14 +66662,15 @@ "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Maruani_VoroMesh_Learning_Watertight_Surface_Meshes_with_Voronoi_Diagrams_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;0;1", - "aff_unique_norm": "INRIA;Ecole Polytechnique", + "aff_unique_norm": "Inria;Ecole Polytechnique", "aff_unique_dep": ";LIX", "aff_unique_url": "https://www.inria.fr;https://www.ecolepolytechnique.fr", "aff_unique_abbr": "Inria;Polytechnique", "aff_campus_unique_index": "1", "aff_campus_unique": ";Saclay", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Maruani_2023_ICCV,\n \n author = {\n Maruani,\n Nissim and Klokov,\n Roman and Ovsjanikov,\n Maks and Alliez,\n Pierre and Desbrun,\n Mathieu\n},\n title = {\n VoroMesh: Learning Watertight Surface Meshes with Voronoi Diagrams\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n 
October\n},\n year = {\n 2023\n},\n pages = {\n 14565-14574\n} \n}" }, { "title": "Vox-E: Text-Guided Voxel Editing of 3D Objects", @@ -64540,7 +66693,8 @@ "aff_domain": ";;;", "email": ";;;", "author_num": 4, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sella_Vox-E_Text-Guided_Voxel_Editing_of_3D_Objects_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Sella_Vox-E_Text-Guided_Voxel_Editing_of_3D_Objects_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Sella_2023_ICCV,\n \n author = {\n Sella,\n Etai and Fiebelman,\n Gal and Hedman,\n Peter and Averbuch-Elor,\n Hadar\n},\n title = {\n Vox-E: Text-Guided Voxel Editing of 3D Objects\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 430-440\n} \n}" }, { "title": "WALDO: Future Video Synthesis Using Object Layer Decomposition and Parametric Flow Prediction", @@ -64552,7 +66706,7 @@ "author": "Guillaume Le Moing; Jean Ponce; Cordelia Schmid", "abstract": "This paper presents WALDO (WArping Layer-Decomposed Objects), a novel approach to the prediction of future video frames from past ones. Individual images are decomposed into multiple layers combining object masks and a small set of control points. The layer structure is shared across all frames in each video to build dense inter-frame connections. Complex scene motions are modeled by combining parametric geometric transformations associated with individual layers, and video synthesis is broken down into discovering the layers associated with past frames, predicting the corresponding transformations for upcoming ones and warping the associated object regions accordingly, and filling in the remaining image parts. 
Extensive experiments on multiple benchmarks including urban videos (Cityscapes and KITTI) and videos featuring nonrigid motions (UCF-Sports and H3.6M), show that our method consistently outperforms the state of the art by a significant margin in every case. Code, pretrained models, and video samples synthesized by our approach can be found in the project webpage.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Le_Moing_WALDO_Future_Video_Synthesis_Using_Object_Layer_Decomposition_and_Parametric_ICCV_2023_paper.pdf", - "aff": "Inria + D\u00b4epartement d\u2019informatique de l\u2019ENS (CNRS, ENS-PSL, Inria); Inria + Center for Data Science New York University; Inria + D\u00b4epartement d\u2019informatique de l\u2019ENS (CNRS, ENS-PSL, Inria)", + "aff": "Inria + D´epartement d’informatique de l’ENS (CNRS, ENS-PSL, Inria); Inria + Center for Data Science New York University; Inria + D´epartement d’informatique de l’ENS (CNRS, ENS-PSL, Inria)", "project": "https://16lemoing.github.io/waldo", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Le_Moing_WALDO_Future_Video_ICCV_2023_supplemental.pdf", @@ -64565,14 +66719,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Le_Moing_WALDO_Future_Video_Synthesis_Using_Object_Layer_Decomposition_and_Parametric_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+2;0+1", - "aff_unique_norm": "INRIA;\u00c9cole Normale Sup\u00e9rieure;New York University", - "aff_unique_dep": ";D\u00e9partement d\u2019informatique;Center for Data Science", + "aff_unique_norm": "Inria;École Normale Supérieure;New York University", + "aff_unique_dep": ";Département d’informatique;Center for Data Science", "aff_unique_url": "https://www.inria.fr;https://www.ens.fr;https://www.nyu.edu", "aff_unique_abbr": "Inria;ENS;NYU", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";New York", "aff_country_unique_index": "0+0;0+1;0+0", - "aff_country_unique": "France;United 
States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Le_Moing_2023_ICCV,\n \n author = {\n Le Moing,\n Guillaume and Ponce,\n Jean and Schmid,\n Cordelia\n},\n title = {\n WALDO: Future Video Synthesis Using Object Layer Decomposition and Parametric Flow Prediction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 23229-23241\n} \n}" }, { "title": "WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant Analysis", @@ -64604,7 +66759,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Yiye and Lin,\n Yunzhi and Xu,\n Ruinian and Vela,\n Patricio A.\n},\n title = {\n WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant Analysis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5298-5307\n} \n}" }, { "title": "Waffling Around for Performance: Visual Classification with Random Words and Broad Concepts", @@ -64616,7 +66772,7 @@ "author": "Karsten Roth; Jae Myung Kim; A. Sophia Koepke; Oriol Vinyals; Cordelia Schmid; Zeynep Akata", "abstract": "The visual classification performance of vision-language models such as CLIP has been shown to benefit from additional semantic knowledge from large language models (LLMs) such as GPT-3. In particular, averaging over LLM-generated class descriptors, e.g. \"waffle, which has a round shape\", can notably improve generalization performance. 
In this work, we critically study this behavior and propose WaffleCLIP, a framework for zero-shot visual classification which simply replaces LLM-generated descriptors with random character and word descriptors. Without querying external models, we achieve comparable performance gains on a large number of visual classification tasks. This allows WaffleCLIP to both serve as a low-cost \n alternative, as well as a sanity check for any future LLM-based vision-language model extensions. We conduct an extensive experimental study on the impact and shortcomings of additional semantics introduced with LLM-generated descriptors, and showcase how - if available - semantic context is better leveraged by querying LLMs for high-level concepts, which we show can be done to jointly resolve potential class name ambiguities. Code is available here: https://github.com/ExplainableML/WaffleCLIP.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Roth_Waffling_Around_for_Performance_Visual_Classification_with_Random_Words_and_ICCV_2023_paper.pdf", - "aff": "University of T\u00fcbingen, T\u00fcbingen AI Center; University of T\u00fcbingen, T\u00fcbingen AI Center; University of T\u00fcbingen, T\u00fcbingen AI Center; Google DeepMind; Inria, Ecole normale sup\u00e9rieure, CNRS, PSL Research University; MPI for Intelligent Systems", + "aff": "University of Tübingen, Tübingen AI Center; University of Tübingen, Tübingen AI Center; University of Tübingen, Tübingen AI Center; Google DeepMind; Inria, Ecole normale supérieure, CNRS, PSL Research University; MPI for Intelligent Systems", "project": "", "github": "https://github.com/ExplainableML/WaffleCLIP", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Roth_Waffling_Around_for_ICCV_2023_supplemental.pdf", @@ -64629,14 +66785,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Roth_Waffling_Around_for_Performance_Visual_Classification_with_Random_Words_and_ICCV_2023_paper.html", 
"aff_unique_index": "0;0;0;1;2;3", - "aff_unique_norm": "University of T\u00fcbingen;Google;INRIA;Max Planck Institute for Intelligent Systems", - "aff_unique_dep": "T\u00fcbingen AI Center;Google DeepMind;;", + "aff_unique_norm": "University of Tübingen;Google;Inria;Max Planck Institute for Intelligent Systems", + "aff_unique_dep": "Tübingen AI Center;DeepMind;;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://deepmind.com;https://www.inria.fr;https://www.mpi-is.mpg.de", - "aff_unique_abbr": "Uni T\u00fcbingen;DeepMind;Inria;MPI-IS", + "aff_unique_abbr": "Uni Tübingen;DeepMind;Inria;MPI-IS", "aff_campus_unique_index": "0;0;0", - "aff_campus_unique": "T\u00fcbingen;", + "aff_campus_unique": "Tübingen;", "aff_country_unique_index": "0;0;0;1;2;0", - "aff_country_unique": "Germany;United Kingdom;France" + "aff_country_unique": "Germany;United Kingdom;France", + "bibtex": "@InProceedings{Roth_2023_ICCV,\n \n author = {\n Roth,\n Karsten and Kim,\n Jae Myung and Koepke,\n A. Sophia and Vinyals,\n Oriol and Schmid,\n Cordelia and Akata,\n Zeynep\n},\n title = {\n Waffling Around for Performance: Visual Classification with Random Words and Broad Concepts\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15746-15757\n} \n}" }, { "title": "Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR Semantic Segmentation", @@ -64644,8 +66801,8 @@ "status": "Poster", "track": "main", "pid": "1702", - "author_site": "Cristiano Saltori, Aljosa Osep, Elisa Ricci, Laura Leal-Taix\u00e9", - "author": "Cristiano Saltori; Aljosa Osep; Elisa Ricci; Laura Leal-Taix\u00e9", + "author_site": "Cristiano Saltori, Aljosa Osep, Elisa Ricci, Laura Leal-Taixé", + "author": "Cristiano Saltori; Aljosa Osep; Elisa Ricci; Laura Leal-Taixé", "abstract": "The ability to deploy robots that can operate safely in diverse environments is crucial for developing embodied 
intelligent agents. As a community, we have made tremendous progress in within-domain LiDAR semantic segmentation. However, do these methods generalize across domains? \n To answer this question, we design the first experimental setup for studying domain generalization (DG) for LiDAR semantic segmentation (DG-LSS). Our results confirm a significant gap between methods, evaluated in a cross-domain setting: for example, a model trained on the source dataset (SemanticKITTI) obtains 26.53 mIoU on the target data, compared to 48.49 mIoU obtained by the model trained on the target domain (nuScenes). \n To tackle this gap, we propose the first method specifically designed for DG-LSS, which obtains 34.88 mIoU on the target domain, outperforming all baselines. Our method augments a sparse-convolutional encoder-decoder 3D segmentation network with an additional, dense 2D convolutional decoder that learns to classify a birds-eye view of the point cloud. This simple auxiliary task encourages the 3D network to learn features that are robust to sensor placement shifts and resolution, and are transferable across domains. 
With this work, we aim to inspire the community to develop and evaluate future models in such cross-domain conditions.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Saltori_Walking_Your_LiDOG_A_Journey_Through_Multiple_Domains_for_LiDAR_ICCV_2023_paper.pdf", "aff": "University of Trento; TU Munich; Federazione Bruno Kessler; NVIDIA", @@ -64661,14 +66818,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Saltori_Walking_Your_LiDOG_A_Journey_Through_Multiple_Domains_for_LiDAR_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3", - "aff_unique_norm": "University of Trento;Technical University of Munich;Federazione Bruno Kessler;NVIDIA", - "aff_unique_dep": ";;;NVIDIA Corporation", + "aff_unique_norm": "University of Trento;Technical University of Munich;Fondazione Bruno Kessler;NVIDIA Corporation", + "aff_unique_dep": ";;;", "aff_unique_url": "https://www.unitn.it;https://www.tum.de;https://www.fbk.eu;https://www.nvidia.com", "aff_unique_abbr": "UniTN;TUM;FBK;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2", - "aff_country_unique": "Italy;Germany;United States" + "aff_country_unique": "Italy;Germany;United States", + "bibtex": "@InProceedings{Saltori_2023_ICCV,\n \n author = {\n Saltori,\n Cristiano and Osep,\n Aljosa and Ricci,\n Elisa and Leal-Taix\\'e,\n Laura\n},\n title = {\n Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 196-206\n} \n}" }, { "title": "Wasserstein Expansible Variational Autoencoder for Discriminative and Generative Continual Learning", @@ -64700,7 +66858,8 @@ "aff_campus_unique_index": "0;0", "aff_campus_unique": "York", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + 
"bibtex": "@InProceedings{Ye_2023_ICCV,\n \n author = {\n Ye,\n Fei and Bors,\n Adrian G.\n},\n title = {\n Wasserstein Expansible Variational Autoencoder for Discriminative and Generative Continual Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18665-18675\n} \n}" }, { "title": "WaterMask: Instance Segmentation for Underwater Imagery", @@ -64732,7 +66891,8 @@ "aff_campus_unique_index": ";1+1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0+0;0;0;0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Lian_2023_ICCV,\n \n author = {\n Lian,\n Shijie and Li,\n Hua and Cong,\n Runmin and Li,\n Suqi and Zhang,\n Wei and Kwong,\n Sam\n},\n title = {\n WaterMask: Instance Segmentation for Underwater Imagery\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1305-1315\n} \n}" }, { "title": "WaveIPT: Joint Attention and Flow Alignment in the Wavelet domain for Pose Transfer", @@ -64764,7 +66924,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Liyuan and Gao,\n Tingwei and Jiang,\n Haitian and Shen,\n Haibin and Huang,\n Kejie\n},\n title = {\n WaveIPT: Joint Attention and Flow Alignment in the Wavelet domain for Pose Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 7215-7225\n} \n}" }, { "title": "WaveNeRF: Wavelet-based Generalizable Neural Radiance Fields", @@ -64787,7 +66948,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": 
"https://openaccess.thecvf.com/content/ICCV2023/html/Xu_WaveNeRF_Wavelet-based_Generalizable_Neural_Radiance_Fields_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Xu_WaveNeRF_Wavelet-based_Generalizable_Neural_Radiance_Fields_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Xu_2023_ICCV,\n \n author = {\n Xu,\n Muyu and Zhan,\n Fangneng and Zhang,\n Jiahui and Yu,\n Yingchen and Zhang,\n Xiaoqin and Theobalt,\n Christian and Shao,\n Ling and Lu,\n Shijian\n},\n title = {\n WaveNeRF: Wavelet-based Generalizable Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18195-18204\n} \n}" }, { "title": "Weakly Supervised Learning of Semantic Correspondence through Cascaded Online Correspondence Refinement", @@ -64819,7 +66981,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Yiwen and Sun,\n Yixuan and Lai,\n Chenghang and Xu,\n Qing and Wang,\n Xiaomei and Shen,\n Xuli and Ge,\n Weifeng\n},\n title = {\n Weakly Supervised Learning of Semantic Correspondence through Cascaded Online Correspondence Refinement\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16254-16263\n} \n}" }, { "title": "Weakly Supervised Referring Image Segmentation with Intra-Chunk and Inter-Chunk Consistency", @@ -64844,14 +67007,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Lee_Weakly_Supervised_Referring_Image_Segmentation_with_Intra-Chunk_and_Inter-Chunk_Consistency_ICCV_2023_paper.html", "aff_unique_index": "0;0;0;1;0;0", - "aff_unique_norm": "Amazon;NAVER 
Corporation", - "aff_unique_dep": "Amazon.com, Inc.;Search US", + "aff_unique_norm": "Amazon.com, Inc.;NAVER Corporation", + "aff_unique_dep": ";Search US", "aff_unique_url": "https://www.amazon.com;https://www.naver.com", "aff_unique_abbr": "Amazon;NAVER", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lee_2023_ICCV,\n \n author = {\n Lee,\n Jungbeom and Lee,\n Sungjin and Nam,\n Jinseok and Yu,\n Seunghak and Do,\n Jaeyoung and Taghavi,\n Tara\n},\n title = {\n Weakly Supervised Referring Image Segmentation with Intra-Chunk and Inter-Chunk Consistency\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21870-21881\n} \n}" }, { "title": "Weakly-Supervised Action Localization by Hierarchically-Structured Latent Attention Modeling", @@ -64874,7 +67038,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Weakly-Supervised_Action_Localization_by_Hierarchically-Structured_Latent_Attention_Modeling_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Weakly-Supervised_Action_Localization_by_Hierarchically-Structured_Latent_Attention_Modeling_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Guiqin and Zhao,\n Peng and Zhao,\n Cong and Yang,\n Shusen and Cheng,\n Jie and Leng,\n Luziwei and Liao,\n Jianxing and Guo,\n Qinghai\n},\n title = {\n Weakly-Supervised Action Localization by Hierarchically-Structured Latent Attention Modeling\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10203-10213\n} \n}" }, { "title": 
"Weakly-Supervised Action Segmentation and Unseen Error Detection in Anomalous Instructional Videos", @@ -64906,7 +67071,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ghoddoosian_2023_ICCV,\n \n author = {\n Ghoddoosian,\n Reza and Dwivedi,\n Isht and Agarwal,\n Nakul and Dariush,\n Behzad\n},\n title = {\n Weakly-Supervised Action Segmentation and Unseen Error Detection in Anomalous Instructional Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10128-10138\n} \n}" }, { "title": "Weakly-Supervised Text-Driven Contrastive Learning for Facial Behavior Understanding", @@ -64938,7 +67104,8 @@ "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Binghamton;", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Xiang and Wang,\n Taoyue and Li,\n Xiaotian and Yang,\n Huiyuan and Yin,\n Lijun\n},\n title = {\n Weakly-Supervised Text-Driven Contrastive Learning for Facial Behavior Understanding\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20751-20762\n} \n}" }, { "title": "Weakly-supervised 3D Pose Transfer with Keypoints", @@ -64970,7 +67137,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Singapore" + "aff_country_unique": "Singapore", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Jinnan and Li,\n Chen and Lee,\n Gim Hee\n},\n title = {\n Weakly-supervised 3D Pose Transfer with Keypoints\n},\n booktitle = {\n Proceedings of the 
IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15156-15165\n} \n}" }, { "title": "What Can Simple Arithmetic Operations Do for Temporal Modeling?", @@ -64995,14 +67163,15 @@ "author_num": 6, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wu_What_Can_Simple_Arithmetic_Operations_Do_for_Temporal_Modeling_ICCV_2023_paper.html", "aff_unique_index": "0;1;1;1;0;2+0", - "aff_unique_norm": "University of Sydney;Baidu;Shanghai AI Laboratory", - "aff_unique_dep": ";Baidu Inc.;", + "aff_unique_norm": "University of Sydney;Baidu Inc.;Shanghai AI Laboratory", + "aff_unique_dep": ";;", "aff_unique_url": "https://www.sydney.edu.au;https://www.baidu.com;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "USYD;Baidu;SAIL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;0;1+0", - "aff_country_unique": "Australia;China" + "aff_country_unique": "Australia;China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Wenhao and Song,\n Yuxin and Sun,\n Zhun and Wang,\n Jingdong and Xu,\n Chang and Ouyang,\n Wanli\n},\n title = {\n What Can Simple Arithmetic Operations Do for Temporal Modeling?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13712-13722\n} \n}" }, { "title": "What Can a Cook in Italy Teach a Mechanic in India? Action Recognition Generalisation Over Scenarios and Locations", @@ -65034,7 +67203,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", - "aff_country_unique": "Italy;United Kingdom" + "aff_country_unique": "Italy;United Kingdom", + "bibtex": "@InProceedings{Plizzari_2023_ICCV,\n \n author = {\n Plizzari,\n Chiara and Perrett,\n Toby and Caputo,\n Barbara and Damen,\n Dima\n},\n title = {\n What Can a Cook in Italy Teach a Mechanic in India? 
Action Recognition Generalisation Over Scenarios and Locations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13656-13666\n} \n}" }, { "title": "What Does a Platypus Look Like? Generating Customized Prompts for Zero-Shot Image Classification", @@ -65060,13 +67230,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Pratt_What_Does_a_Platypus_Look_Like_Generating_Customized_Prompts_for_ICCV_2023_paper.html", "aff_unique_index": "0;0;1+2;0", "aff_unique_norm": "University of Washington;Google;ML Collective", - "aff_unique_dep": ";Google DeepMind;", + "aff_unique_dep": ";DeepMind;", "aff_unique_url": "https://www.washington.edu;https://deepmind.com;", "aff_unique_abbr": "UW;DeepMind;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", - "aff_country_unique": "United States;United Kingdom;" + "aff_country_unique": "United States;United Kingdom;", + "bibtex": "@InProceedings{Pratt_2023_ICCV,\n \n author = {\n Pratt,\n Sarah and Covert,\n Ian and Liu,\n Rosanne and Farhadi,\n Ali\n},\n title = {\n What Does a Platypus Look Like? Generating Customized Prompts for Zero-Shot Image Classification\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15691-15701\n} \n}" }, { "title": "What can Discriminator do? 
Towards Box-free Ownership Verification of Generative Adversarial Networks", @@ -65093,12 +67264,13 @@ "aff_unique_index": "0;0;0;0;1;2;0;0", "aff_unique_norm": "Wuhan University;Chongqing University;Nanjing University of Aeronautics and Astronautics", "aff_unique_dep": "School of Cyber Science and Engineering;College of Computer Science;College of Computer Science and Technology", - "aff_unique_url": "http://www.whu.edu.cn/;http://www.cqu.edu.cn;http://www.nuaa.edu.cn", + "aff_unique_url": "http://www.whu.edu.cn/;http://en.cqu.edu.cn/;http://www.nuaa.edu.cn", "aff_unique_abbr": "WHU;CQU;NUAA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Nanjing", "aff_country_unique_index": "0;0;0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Ziheng and Li,\n Boheng and Cai,\n Yan and Wang,\n Run and Guo,\n Shangwei and Fang,\n Liming and Chen,\n Jing and Wang,\n Lina\n},\n title = {\n What can Discriminator do? Towards Box-free Ownership Verification of Generative Adversarial Networks\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5009-5019\n} \n}" }, { "title": "What do neural networks learn in image classification? A frequency shortcut perspective", @@ -65130,7 +67302,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "Netherlands" + "aff_country_unique": "Netherlands", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Shunxin and Veldhuis,\n Raymond and Brune,\n Christoph and Strisciuglio,\n Nicola\n},\n title = {\n What do neural networks learn in image classification? 
A frequency shortcut perspective\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1433-1442\n} \n}" }, { "title": "What does CLIP know about a red circle? Visual prompt engineering for VLMs", @@ -65162,7 +67335,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Shtedritski_2023_ICCV,\n \n author = {\n Shtedritski,\n Aleksandar and Rupprecht,\n Christian and Vedaldi,\n Andrea\n},\n title = {\n What does CLIP know about a red circle? Visual prompt engineering for VLMs\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11987-11997\n} \n}" }, { "title": "When Do Curricula Work in Federated Learning?", @@ -65194,7 +67368,8 @@ "aff_campus_unique_index": "0;0;0;0;2;2;0", "aff_campus_unique": "San Diego;;Orlando", "aff_country_unique_index": "0;0;0;0;1;0;0;0", - "aff_country_unique": "United States;Czech Republic" + "aff_country_unique": "United States;Czech Republic", + "bibtex": "@InProceedings{Vahidian_2023_ICCV,\n \n author = {\n Vahidian,\n Saeed and Kadaveru,\n Sreevatsank and Baek,\n Woonjoon and Wang,\n Weijia and Kungurtsev,\n Vyacheslav and Chen,\n Chen and Shah,\n Mubarak and Lin,\n Bill\n},\n title = {\n When Do Curricula Work in Federated Learning?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 5084-5094\n} \n}" }, { "title": "When Epipolar Constraint Meets Non-Local Operators in Multi-View Stereo", @@ -65202,6 +67377,7 @@ "status": "Poster", "track": "main", "pid": "11187", + "author_site": "Tianqi Liu, Xinyi Ye, Weiyue Zhao, Zhiyu Pan, Min Shi, 
Zhiguo Cao", "author": "Tianqi Liu, Xinyi Ye, Weiyue Zhao, Zhiyu Pan, Min Shi, Zhiguo Cao", "abstract": "Learning-based multi-view stereo (MVS) method heavily relies on feature matching, which requires distinctive and descriptive representations. An effective solution is to apply non-local feature aggregation, e.g., Transformer. Albeit useful, these techniques introduce heavy computation overheads for MVS. Each pixel densely attends to the whole image. In contrast, we propose to constrain non-local feature augmentation within a pair of lines: each point only attends the corresponding pair of epipolar lines. Our idea takes inspiration from the classic epipolar geometry, which shows that one point with different depth hypotheses will be projected to the epipolar line on the other view. This constraint reduces the 2D search space into the epipolar line in stereo matching. Similarly, this suggests that the matching of MVS is to distinguish a series of points lying on the same line. Inspired by this point-to-line search, we devise a line-to-point non-local augmentation strategy. We first devise an optimized searching algorithm to split the 2D feature maps into epipolar line pairs. Then, an Epipolar Transformer (ET) performs non-local feature augmentation among epipolar line pairs. We incorporate the ET into a learning-based MVS baseline, named ET-MVSNet. ET-MVSNet achieves state-of-the-art reconstruction performance on both the DTU and Tanks-and-Temples benchmark with high efficiency. 
Code is available at https://github.com/TQTQliu/ET-MVSNet.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Liu_When_Epipolar_Constraint_Meets_Non-Local_Operators_in_Multi-View_Stereo_ICCV_2023_paper.pdf", @@ -65213,7 +67389,8 @@ "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16564244676386604780&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_When_Epipolar_Constraint_Meets_Non-Local_Operators_in_Multi-View_Stereo_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Liu_When_Epipolar_Constraint_Meets_Non-Local_Operators_in_Multi-View_Stereo_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Tianqi and Ye,\n Xinyi and Zhao,\n Weiyue and Pan,\n Zhiyu and Shi,\n Min and Cao,\n Zhiguo\n},\n title = {\n When Epipolar Constraint Meets Non-Local Operators in Multi-View Stereo\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18088-18097\n} \n}" }, { "title": "When Noisy Labels Meet Long Tail Dilemmas: A Representation Calibration Method", @@ -65225,7 +67402,7 @@ "author": "Manyi Zhang; Xuyang Zhao; Jun Yao; Chun Yuan; Weiran Huang", "abstract": "Real-world large-scale datasets are both noisily labeled and class-imbalanced. The issues seriously hurt the generalization of trained models. It is hence significant to address the simultaneous incorrect labeling and class-imbalance, i.e., the problem of learning with noisy labels on long-tailed data. Previous works develop several methods for the problem. However, they always rely on strong assumptions that are invalid or hard to be checked in practice. In this paper, to handle the problem and address the limitations of prior works, we propose a representation calibration method RCAL. 
Specifically, RCAL works with the representations extracted by unsupervised contrastive learning. We assume that without incorrect labeling and class imbalance, the representations of instances in each class conform to a multivariate Gaussian distribution, which is much milder and easier to be checked. Based on the assumption, we recover underlying representation distributions from polluted ones resulting from mislabeled and class-imbalanced data. Additional data points are then sampled from the recovered distributions to help generalization. Moreover, during classifier training, representation learning takes advantage of representation robustness brought by contrastive learning, which further improves the classifier performance. We derive theoretical results to discuss the effectiveness of our representation calibration. Experiments on multiple benchmarks justify our claims and confirm the superiority of the proposed method.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Zhang_When_Noisy_Labels_Meet_Long_Tail_Dilemmas_A_Representation_Calibration_ICCV_2023_paper.pdf", - "aff": "SIGS, Tsinghua University; Peking University; Huawei Noah\u2019s Ark Lab; SIGS, Tsinghua University + Qing Yuan Research Institute, SEIEE, Shanghai Jiao Tong University; Qing Yuan Research Institute, SEIEE, Shanghai Jiao Tong University", + "aff": "SIGS, Tsinghua University; Peking University; Huawei Noah’s Ark Lab; SIGS, Tsinghua University + Qing Yuan Research Institute, SEIEE, Shanghai Jiao Tong University; Qing Yuan Research Institute, SEIEE, Shanghai Jiao Tong University", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Zhang_When_Noisy_Labels_ICCV_2023_supplemental.pdf", @@ -65239,13 +67416,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Zhang_When_Noisy_Labels_Meet_Long_Tail_Dilemmas_A_Representation_Calibration_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;0+3;3", "aff_unique_norm": "Tsinghua 
University;Peking University;Huawei;Shanghai Jiao Tong University", - "aff_unique_dep": "School of Information Management and Engineering;;Noah\u2019s Ark Lab;School of Electronic, Information and Electrical Engineering", + "aff_unique_dep": "School of Information Management and Engineering;;Noah’s Ark Lab;School of Electronic, Information and Electrical Engineering", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.pku.edu.cn;https://www.huawei.com;https://www.sjtu.edu.cn", "aff_unique_abbr": "THU;Peking U;Huawei;SJTU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0;0+0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Zhang_2023_ICCV,\n \n author = {\n Zhang,\n Manyi and Zhao,\n Xuyang and Yao,\n Jun and Yuan,\n Chun and Huang,\n Weiran\n},\n title = {\n When Noisy Labels Meet Long Tail Dilemmas: A Representation Calibration Method\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15890-15900\n} \n}" }, { "title": "When Prompt-based Incremental Learning Does Not Meet Strong Pretraining", @@ -65270,14 +67448,15 @@ "author_num": 3, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Tang_When_Prompt-based_Incremental_Learning_Does_Not_Meet_Strong_Pretraining_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;0+2+1", - "aff_unique_norm": "Sun Yat-sen University;Key Laboratory of Machine Intelligence and Advanced Computing;Pengcheng Laboratory", - "aff_unique_dep": "School of Computer Science and Engineering;Ministry of Education;Peng Cheng Laboratory", + "aff_unique_norm": "Sun Yat-sen University;Key Laboratory of Machine Intelligence and Advanced Computing;Peng Cheng Laboratory", + "aff_unique_dep": "School of Computer Science and Engineering;Ministry of Education;", "aff_unique_url": "http://www.sysu.edu.cn;;", "aff_unique_abbr": 
"SYSU;;", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0+0;0+0+0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Tang_2023_ICCV,\n \n author = {\n Tang,\n Yu-Ming and Peng,\n Yi-Xing and Zheng,\n Wei-Shi\n},\n title = {\n When Prompt-based Incremental Learning Does Not Meet Strong Pretraining\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1706-1716\n} \n}" }, { "title": "When to Learn What: Model-Adaptive Data Augmentation Curriculum", @@ -65309,7 +67488,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Hou_2023_ICCV,\n \n author = {\n Hou,\n Chengkai and Zhang,\n Jieyu and Zhou,\n Tianyi\n},\n title = {\n When to Learn What: Model-Adaptive Data Augmentation Curriculum\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1717-1728\n} \n}" }, { "title": "Who Are You Referring To? 
Coreference Resolution In Image Narrations", @@ -65335,13 +67515,14 @@ "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Goel_Who_Are_You_Referring_To_Coreference_Resolution_In_Image_Narrations_ICCV_2023_paper.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Edinburgh;Agency for Science, Technology and Research", - "aff_unique_dep": "School of Informatics;Computational Fluid Dynamics Laboratory", + "aff_unique_dep": "School of Informatics;Computational Biology Group", "aff_unique_url": "https://www.ed.ac.uk;https://www.a-star.edu.sg", "aff_unique_abbr": "Edinburgh;A*STAR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Edinburgh;", "aff_country_unique_index": "0;1;0;0", - "aff_country_unique": "United Kingdom;Singapore" + "aff_country_unique": "United Kingdom;Singapore", + "bibtex": "@InProceedings{Goel_2023_ICCV,\n \n author = {\n Goel,\n Arushi and Fernando,\n Basura and Keller,\n Frank and Bilen,\n Hakan\n},\n title = {\n Who Are You Referring To? 
Coreference Resolution In Image Narrations\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15247-15258\n} \n}" }, { "title": "Why Is Prompt Tuning for Vision-Language Models Robust to Noisy Labels?", @@ -65373,7 +67554,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;1;1;1;0;0;1", - "aff_country_unique": "United States;China" + "aff_country_unique": "United States;China", + "bibtex": "@InProceedings{Wu_2023_ICCV,\n \n author = {\n Wu,\n Cheng-En and Tian,\n Yu and Yu,\n Haichao and Wang,\n Heng and Morgado,\n Pedro and Hu,\n Yu Hen and Yang,\n Linjie\n},\n title = {\n Why Is Prompt Tuning for Vision-Language Models Robust to Noisy Labels?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15488-15497\n} \n}" }, { "title": "Why do networks have inhibitory/negative connections?", @@ -65405,7 +67587,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Qingyang and Powell,\n Mike A. and Geisa,\n Ali and Bridgeford,\n Eric and Priebe,\n Carey E. 
and Vogelstein,\n Joshua T.\n},\n title = {\n Why do networks have inhibitory/negative connections?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22551-22559\n} \n}" }, { "title": "Will Large-scale Generative Models Corrupt Future Datasets?", @@ -65437,7 +67620,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "Japan" + "aff_country_unique": "Japan", + "bibtex": "@InProceedings{Hataya_2023_ICCV,\n \n author = {\n Hataya,\n Ryuichiro and Bao,\n Han and Arai,\n Hiromi\n},\n title = {\n Will Large-scale Generative Models Corrupt Future Datasets?\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20555-20565\n} \n}" }, { "title": "Window-Based Early-Exit Cascades for Uncertainty Estimation: When Deep Ensembles are More Efficient than Single Models", @@ -65469,7 +67653,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", - "aff_country_unique": "United Kingdom" + "aff_country_unique": "United Kingdom", + "bibtex": "@InProceedings{Xia_2023_ICCV,\n \n author = {\n Xia,\n Guoxuan and Bouganis,\n Christos-Savvas\n},\n title = {\n Window-Based Early-Exit Cascades for Uncertainty Estimation: When Deep Ensembles are More Efficient than Single Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 17368-17380\n} \n}" }, { "title": "With a Little Help from Your Own Past: Prototypical Memory Networks for Image Captioning", @@ -65501,7 +67686,8 @@ "aff_campus_unique_index": "0+1;0+1;0+1;0+1;0+1", "aff_campus_unique": "Modena;Pisa", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", - "aff_country_unique": "Italy" + 
"aff_country_unique": "Italy", + "bibtex": "@InProceedings{Barraco_2023_ICCV,\n \n author = {\n Barraco,\n Manuele and Sarto,\n Sara and Cornia,\n Marcella and Baraldi,\n Lorenzo and Cucchiara,\n Rita\n},\n title = {\n With a Little Help from Your Own Past: Prototypical Memory Networks for Image Captioning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3021-3031\n} \n}" }, { "title": "Workie-Talkie: Accelerating Federated Learning by Overlapping Computing and Communications via Contrastive Regularization", @@ -65524,7 +67710,8 @@ "aff_domain": ";;;;;;;", "email": ";;;;;;;", "author_num": 8, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Workie-Talkie_Accelerating_Federated_Learning_by_Overlapping_Computing_and_Communications_via_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Chen_Workie-Talkie_Accelerating_Federated_Learning_by_Overlapping_Computing_and_Communications_via_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Chen_2023_ICCV,\n \n author = {\n Chen,\n Rui and Wan,\n Qiyu and Prakash,\n Pavana and Zhang,\n Lan and Yuan,\n Xu and Gong,\n Yanmin and Fu,\n Xin and Pan,\n Miao\n},\n title = {\n Workie-Talkie: Accelerating Federated Learning by Overlapping Computing and Communications via Contrastive Regularization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16999-17009\n} \n}" }, { "title": "X-Mesh: Towards Fast and Accurate Text-driven 3D Stylization via Dynamic Textual Guidance", @@ -65532,6 +67719,7 @@ "status": "Poster", "track": "main", "pid": "1489", + "author_site": "Yiwei Ma, Xiaoqing Zhang, Xiaoshuai Sun, Jiayi Ji, Haowei Wang, Guannan Jiang, Weilin Zhuang, Rongrong Ji", "author": "Yiwei Ma, Xiaoqing Zhang, Xiaoshuai Sun, Jiayi Ji, Haowei Wang, Guannan 
Jiang, Weilin Zhuang, Rongrong Ji", "abstract": "Text-driven 3D stylization is a complex and crucial task in the fields of computer vision (CV) and computer graphics (CG), aimed at transforming a bare mesh to fit a target text. Prior methods adopt text-independent multilayer perceptrons (MLPs) to predict the attributes of the target mesh with the supervision of CLIP loss. However, such text-independent architecture lacks textual guidance during predicting attributes, thus leading to unsatisfactory stylization and slow convergence. To address these limitations, we present X-Mesh, an innovative text-driven 3D stylization framework that incorporates a novel Text-guided Dynamic Attention Module (TDAM). The TDAM dynamically integrates the guidance of the target text by utilizing text-relevant spatial and channel-wise attentions during vertex feature extraction, resulting in more accurate attribute prediction and faster convergence speed. Furthermore, existing works lack standard benchmarks and automated metrics for evaluation, often relying on subjective and non-reproducible user studies to assess the quality of stylized 3D assets. To overcome this limitation, we introduce a new standard text-mesh benchmark, namely MIT-30, and two automated metrics, which will enable future research to achieve fair and objective comparisons. 
Our extensive qualitative and quantitative experiments demonstrate that X-Mesh outperforms previous state-of-the-art methods.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Ma_X-Mesh_Towards_Fast_and_Accurate_Text-driven_3D_Stylization_via_Dynamic_ICCV_2023_paper.pdf", @@ -65543,7 +67731,8 @@ "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2955214599809977933&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_X-Mesh_Towards_Fast_and_Accurate_Text-driven_3D_Stylization_via_Dynamic_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Ma_X-Mesh_Towards_Fast_and_Accurate_Text-driven_3D_Stylization_via_Dynamic_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Ma_2023_ICCV,\n \n author = {\n Ma,\n Yiwei and Zhang,\n Xiaoqing and Sun,\n Xiaoshuai and Ji,\n Jiayi and Wang,\n Haowei and Jiang,\n Guannan and Zhuang,\n Weilin and Ji,\n Rongrong\n},\n title = {\n X-Mesh: Towards Fast and Accurate Text-driven 3D Stylization via Dynamic Textual Guidance\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2749-2760\n} \n}" }, { "title": "X-VoE: Measuring eXplanatory Violation of Expectation in Physical Events", @@ -65575,7 +67764,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0+0+0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dai_2023_ICCV,\n \n author = {\n Dai,\n Bo and Wang,\n Linge and Jia,\n Baoxiong and Zhang,\n Zeyu and Zhu,\n Song-Chun and Zhang,\n Chi and Zhu,\n Yixin\n},\n title = {\n X-VoE: Measuring eXplanatory Violation of Expectation in Physical Events\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 
2023\n},\n pages = {\n 3992-4002\n} \n}" }, { "title": "XMem++: Production-level Video Segmentation From Few Annotated Frames", @@ -65600,14 +67790,15 @@ "author_num": 4, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Bekuzarov_XMem_Production-level_Video_Segmentation_From_Few_Annotated_Frames_ICCV_2023_paper.html", "aff_unique_index": "0+1;0+1;2;0+1+2", - "aff_unique_norm": "Mohamed bin Zayed University of Artificial Intelligence;Pinscreen;Adobe", + "aff_unique_norm": "Mohamed Bin Zayed University of Artificial Intelligence;Pinscreen;Adobe", "aff_unique_dep": ";;Adobe Research", "aff_unique_url": "https://www.mbzuai.ac.ae;https://www.pinscreen.com;https://research.adobe.com", "aff_unique_abbr": "MBZUAI;;Adobe", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;2;0+1+2", - "aff_country_unique": "United Arab Emirates;Israel;United States" + "aff_country_unique": "United Arab Emirates;Israel;United States", + "bibtex": "@InProceedings{Bekuzarov_2023_ICCV,\n \n author = {\n Bekuzarov,\n Maksym and Bermudez,\n Ariana and Lee,\n Joon-Young and Li,\n Hao\n},\n title = {\n XMem++: Production-level Video Segmentation From Few Annotated Frames\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 635-644\n} \n}" }, { "title": "XNet: Wavelet-Based Low and High Frequency Fusion Networks for Fully- and Semi-Supervised Semantic Segmentation of Biomedical Images", @@ -65639,7 +67830,8 @@ "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+0;0+0;1;1;0+0", - "aff_country_unique": "China;United States" + "aff_country_unique": "China;United States", + "bibtex": "@InProceedings{Zhou_2023_ICCV,\n \n author = {\n Zhou,\n Yanfeng and Huang,\n Jiaxing and Wang,\n Chenlong and Song,\n Le and Yang,\n Ge\n},\n title = {\n XNet: Wavelet-Based Low and High Frequency Fusion 
Networks for Fully- and Semi-Supervised Semantic Segmentation of Biomedical Images\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21085-21096\n} \n}" }, { "title": "XVO: Generalized Visual Odometry via Cross-Modal Self-Training", @@ -65671,7 +67863,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Lai_2023_ICCV,\n \n author = {\n Lai,\n Lei and Shangguan,\n Zhongkai and Zhang,\n Jimuyang and Ohn-Bar,\n Eshed\n},\n title = {\n XVO: Generalized Visual Odometry via Cross-Modal Self-Training\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 10094-10105\n} \n}" }, { "title": "XiNet: Efficient Neural Networks for tinyML", @@ -65703,7 +67896,8 @@ "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "1;1;1", - "aff_country_unique": ";Italy" + "aff_country_unique": ";Italy", + "bibtex": "@InProceedings{Ancilotto_2023_ICCV,\n \n author = {\n Ancilotto,\n Alberto and Paissan,\n Francesco and Farella,\n Elisabetta\n},\n title = {\n XiNet: Efficient Neural Networks for tinyML\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 16968-16977\n} \n}" }, { "title": "Yes, we CANN: Constrained Approximate Nearest Neighbors for Local Feature-Based Visual Localization", @@ -65735,7 +67929,8 @@ "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Aiger_2023_ICCV,\n \n author = {\n Aiger,\n 
Dror and Araujo,\n Andre and Lynen,\n Simon\n},\n title = {\n Yes,\n we CANN: Constrained Approximate Nearest Neighbors for Local Feature-Based Visual Localization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 13339-13349\n} \n}" }, { "title": "You Never Get a Second Chance To Make a Good First Impression: Seeding Active Learning for 3D Semantic Segmentation", @@ -65743,8 +67938,8 @@ "status": "Poster", "track": "main", "pid": "1856", - "author_site": "Nermin Samet, Oriane Sim\u00e9oni, Gilles Puy, Georgy Ponimatkin, Renaud Marlet, Vincent Lepetit", - "author": "Nermin Samet; Oriane Sim\u00e9oni; Gilles Puy; Georgy Ponimatkin; Renaud Marlet; Vincent Lepetit", + "author_site": "Nermin Samet, Oriane Siméoni, Gilles Puy, Georgy Ponimatkin, Renaud Marlet, Vincent Lepetit", + "author": "Nermin Samet; Oriane Siméoni; Gilles Puy; Georgy Ponimatkin; Renaud Marlet; Vincent Lepetit", "abstract": "We propose SeedAL, a method to seed active learning for efficient annotation of 3D point clouds for semantic segmentation. Active Learning (AL) iteratively selects relevant data fractions to annotate within a given budget, but requires a first fraction of the dataset (a 'seed') to be already annotated to estimate the benefit of annotating other data fractions. We first show that the choice of the seed can significantly affect the performance of many AL methods. We then propose a method for automatically constructing a seed that will ensure good performance for AL. Assuming that images of the point clouds are available, which is common, our method relies on powerful unsupervised image features to measure the diversity of the point clouds. It selects the point clouds for the seed by optimizing the diversity under an annotation budget, which can be done by solving a linear optimization problem. 
Our experiments demonstrate the effectiveness of our approach compared to random seeding and existing methods on both the S3DIS and SemanticKitti datasets. Code is available at https://github.com/nerminsamet/seedal.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Samet_You_Never_Get_a_Second_Chance_To_Make_a_Good_ICCV_2023_paper.pdf", "aff": ";;;;;", @@ -65758,7 +67953,8 @@ "aff_domain": ";;;;;", "email": ";;;;;", "author_num": 6, - "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Samet_You_Never_Get_a_Second_Chance_To_Make_a_Good_ICCV_2023_paper.html" + "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Samet_You_Never_Get_a_Second_Chance_To_Make_a_Good_ICCV_2023_paper.html", + "bibtex": "@InProceedings{Samet_2023_ICCV,\n \n author = {\n Samet,\n Nermin and Sim\\'eoni,\n Oriane and Puy,\n Gilles and Ponimatkin,\n Georgy and Marlet,\n Renaud and Lepetit,\n Vincent\n},\n title = {\n You Never Get a Second Chance To Make a Good First Impression: Seeding Active Learning for 3D Semantic Segmentation\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18445-18457\n} \n}" }, { "title": "Your Diffusion Model is Secretly a Zero-Shot Classifier", @@ -65790,7 +67986,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Li_2023_ICCV,\n \n author = {\n Li,\n Alexander C. 
and Prabhudesai,\n Mihir and Duggal,\n Shivam and Brown,\n Ellis and Pathak,\n Deepak\n},\n title = {\n Your Diffusion Model is Secretly a Zero-Shot Classifier\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2206-2217\n} \n}" }, { "title": "Zenseact Open Dataset: A Large-Scale and Diverse Multimodal Dataset for Autonomous Driving", @@ -65798,8 +67995,8 @@ "status": "Poster", "track": "main", "pid": "8866", - "author_site": "Mina Alibeigi, William Ljungbergh, Adam Tonderski, Georg Hess, Adam Lilja, Carl Lindstr\u00f6m, Daria Motorniuk, Junsheng Fu, Jenny Widahl, Christoffer Petersson", - "author": "Mina Alibeigi; William Ljungbergh; Adam Tonderski; Georg Hess; Adam Lilja; Carl Lindstr\u00f6m; Daria Motorniuk; Junsheng Fu; Jenny Widahl; Christoffer Petersson", + "author_site": "Mina Alibeigi, William Ljungbergh, Adam Tonderski, Georg Hess, Adam Lilja, Carl Lindström, Daria Motorniuk, Junsheng Fu, Jenny Widahl, Christoffer Petersson", + "author": "Mina Alibeigi; William Ljungbergh; Adam Tonderski; Georg Hess; Adam Lilja; Carl Lindström; Daria Motorniuk; Junsheng Fu; Jenny Widahl; Christoffer Petersson", "abstract": "Existing datasets for autonomous driving (AD) often lack diversity and long-range capabilities, focusing instead on 360* perception and temporal reasoning. To address this gap, we introduce ZOD, a large-scale and diverse multimodal dataset collected over two years in various European countries, covering an area 9x that of existing datasets. ZOD boasts the highest range and resolution sensors among comparable datasets, coupled with detailed keyframe annotations for 2D and 3D objects (up to 245m), road instance/semantic segmentation, traffic sign recognition, and road classification. We believe that this unique combination will facilitate breakthroughs in long-range perception and multi-task learning. 
The dataset is composed of Frames, Sequences, and Drives, designed to encompass both data diversity and support for spatio-temporal learning, sensor fusion, localization, and mapping. Frames consist of 100k curated camera images with two seconds of other supporting sensor data, while the 1473 Sequences and 29 Drives include the entire sensor suite for 20 seconds and a few minutes, respectively. ZOD is the only AD dataset released under the permissive CC BY-SA 4.0 license, allowing for both research and commercial use. More information, and an extensive devkit, can be found at zod.zenseact.com.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Alibeigi_Zenseact_Open_Dataset_A_Large-Scale_and_Diverse_Multimodal_Dataset_for_ICCV_2023_paper.pdf", "aff": "Zenseact; Zenseact; Zenseact; Zenseact; Zenseact; Zenseact; Zenseact; Zenseact; Zenseact; Zenseact", @@ -65822,7 +68019,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", - "aff_country_unique": "Sweden" + "aff_country_unique": "Sweden", + "bibtex": "@InProceedings{Alibeigi_2023_ICCV,\n \n author = {\n Alibeigi,\n Mina and Ljungbergh,\n William and Tonderski,\n Adam and Hess,\n Georg and Lilja,\n Adam and Lindstr\\\"om,\n Carl and Motorniuk,\n Daria and Fu,\n Junsheng and Widahl,\n Jenny and Petersson,\n Christoffer\n},\n title = {\n Zenseact Open Dataset: A Large-Scale and Diverse Multimodal Dataset for Autonomous Driving\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 20178-20188\n} \n}" }, { "title": "Zero-1-to-3: Zero-shot One Image to 3D Object", @@ -65854,7 +68052,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Liu_2023_ICCV,\n \n author = {\n Liu,\n Ruoshi and
Wu,\n Rundi and Van Hoorick,\n Basile and Tokmakov,\n Pavel and Zakharov,\n Sergey and Vondrick,\n Carl\n},\n title = {\n Zero-1-to-3: Zero-shot One Image to 3D Object\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 9298-9309\n} \n}" }, { "title": "Zero-Shot Composed Image Retrieval with Textual Inversion", @@ -65886,7 +68085,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", - "aff_country_unique": "Italy" + "aff_country_unique": "Italy", + "bibtex": "@InProceedings{Baldrati_2023_ICCV,\n \n author = {\n Baldrati,\n Alberto and Agnolucci,\n Lorenzo and Bertini,\n Marco and Del Bimbo,\n Alberto\n},\n title = {\n Zero-Shot Composed Image Retrieval with Textual Inversion\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 15338-15347\n} \n}" }, { "title": "Zero-Shot Contrastive Loss for Text-Guided Diffusion Image Style Transfer", @@ -65918,7 +68118,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", - "aff_country_unique": "South Korea" + "aff_country_unique": "South Korea", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Serin and Hwang,\n Hyunmin and Ye,\n Jong Chul\n},\n title = {\n Zero-Shot Contrastive Loss for Text-Guided Diffusion Image Style Transfer\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22873-22882\n} \n}" }, { "title": "Zero-Shot Point Cloud Segmentation by Semantic-Visual Aware Synthesis", @@ -65950,7 +68151,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;2;0", - "aff_country_unique": "China;Australia;Singapore" + "aff_country_unique": 
"China;Australia;Singapore", + "bibtex": "@InProceedings{Yang_2023_ICCV,\n \n author = {\n Yang,\n Yuwei and Hayat,\n Munawar and Jin,\n Zhao and Zhu,\n Hongyuan and Lei,\n Yinjie\n},\n title = {\n Zero-Shot Point Cloud Segmentation by Semantic-Visual Aware Synthesis\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 11586-11596\n} \n}" }, { "title": "Zero-Shot Spatial Layout Conditioning for Text-to-Image Diffusion Models", @@ -65958,11 +68160,11 @@ "status": "Poster", "track": "main", "pid": "4195", - "author_site": "Guillaume Couairon, Marl\u00e8ne Careil, Matthieu Cord, St\u00e9phane Lathuili\u00e8re, Jakob Verbeek", - "author": "Guillaume Couairon; Marl\u00e8ne Careil; Matthieu Cord; St\u00e9phane Lathuili\u00e8re; Jakob Verbeek", + "author_site": "Guillaume Couairon, Marlène Careil, Matthieu Cord, Stéphane Lathuilière, Jakob Verbeek", + "author": "Guillaume Couairon; Marlène Careil; Matthieu Cord; Stéphane Lathuilière; Jakob Verbeek", "abstract": "Large-scale text-to-image diffusion models have significantly improved the state of the art in generative image modeling and allow for an intuitive and powerful user interface to drive the image generation process. Expressing spatial constraints, e.g. to position specific objects in particular locations, is cumbersome using text; and current text-based image generation models are not able to accurately follow such instructions. In this paper we consider image generation from text associated with segments on the image canvas, which combines an intuitive natural language interface with precise spatial control over the generated content. We propose ZestGuide, a \"ZEro-shot\" SegmenTation Guidance approach that can be plugged into pre-trained text-to-image diffusion models, and does not require any additional training. 
It leverages implicit segmentation maps that can be extracted from cross-attention layers, and uses them to align the generation with input masks. Our experimental results combine high image quality with accurate alignment of generated content with input segmentations, and improve over prior work both quantitatively and qualitatively, including methods that require training on images with corresponding segmentations. Compared to Paint with Words, the previous state-of-the art in image generation with zero-shot segmentation conditioning, we improve by 5 to 10 mIoU points on the COCO dataset with similar FID scores.", "pdf": "https://openaccess.thecvf.com/content/ICCV2023/papers/Couairon_Zero-Shot_Spatial_Layout_Conditioning_for_Text-to-Image_Diffusion_Models_ICCV_2023_paper.pdf", - "aff": "Meta AI, Sorbonne Universit\u00e9; Meta AI, LTCI, T\u00e9l\u00e9com Paris, IP Paris; Sorbonne Universit\u00e9, Valeo.ai; LTCI, T\u00e9l\u00e9com Paris, IP Paris; Meta AI", + "aff": "Meta AI, Sorbonne Université; Meta AI, LTCI, Télécom Paris, IP Paris; Sorbonne Université, Valeo.ai; LTCI, Télécom Paris, IP Paris; Meta AI", "project": "", "github": "", "supp": "https://openaccess.thecvf.com/content/ICCV2023/supplemental/Couairon_Zero-Shot_Spatial_Layout_ICCV_2023_supplemental.pdf", @@ -65974,15 +68176,16 @@ "email": "meta.com;meta.com;sorbonne-universite.fr;telecom-paris.fr;meta.com", "author_num": 5, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Couairon_Zero-Shot_Spatial_Layout_Conditioning_for_Text-to-Image_Diffusion_Models_ICCV_2023_paper.html", - "aff_unique_index": "0;1;0;2;1", - "aff_unique_norm": "Sorbonne Universit\u00e9;Meta;T\u00e9l\u00e9com Paris", - "aff_unique_dep": "Meta AI;LTCI;LTCI", - "aff_unique_url": "https://www.sorbonne-universite.fr;https://meta.ai;https://www.telecom-paris.fr", - "aff_unique_abbr": ";Meta;T\u00e9l\u00e9com Paris", + "aff_unique_index": "0;1;0;2;3", + "aff_unique_norm": "Sorbonne Université;Meta AI;Télécom Paris;Meta Platforms, 
Inc.", + "aff_unique_dep": "Meta AI;LTCI;LTCI;Meta AI", + "aff_unique_url": "https://www.sorbonne-universite.fr;https://meta.ai;https://www.telecom-paris.fr;https://meta.com", + "aff_unique_abbr": ";Meta;Télécom Paris;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", - "aff_country_unique": "France;United States" + "aff_country_unique": "France;United States", + "bibtex": "@InProceedings{Couairon_2023_ICCV,\n \n author = {\n Couairon,\n Guillaume and Careil,\n Marl\\`ene and Cord,\n Matthieu and Lathuili\\`ere,\n St\\'ephane and Verbeek,\n Jakob\n},\n title = {\n Zero-Shot Spatial Layout Conditioning for Text-to-Image Diffusion Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 2174-2183\n} \n}" }, { "title": "Zero-guidance Segmentation Using Zero Segment Labels", @@ -66014,7 +68217,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", - "aff_country_unique": "Thailand" + "aff_country_unique": "Thailand", + "bibtex": "@InProceedings{Rewatbowornwong_2023_ICCV,\n \n author = {\n Rewatbowornwong,\n Pitchaporn and Chatthee,\n Nattanat and Chuangsuwanich,\n Ekapol and Suwajanakorn,\n Supasorn\n},\n title = {\n Zero-guidance Segmentation Using Zero Segment Labels\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 1162-1172\n} \n}" }, { "title": "Zip-NeRF: Anti-Aliased Grid-Based Neural Radiance Fields", @@ -66046,7 +68250,8 @@ "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Barron_2023_ICCV,\n \n author = {\n Barron,\n Jonathan T. 
and Mildenhall,\n Ben and Verbin,\n Dor and Srinivasan,\n Pratul P. and Hedman,\n Peter\n},\n title = {\n Zip-NeRF: Anti-Aliased Grid-Based Neural Radiance Fields\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19697-19705\n} \n}" }, { "title": "Zolly: Zoom Focal Length Correctly for Perspective-Distorted Human Mesh Reconstruction", @@ -66071,14 +68276,15 @@ "author_num": 9, "oa": "https://openaccess.thecvf.com/content/ICCV2023/html/Wang_Zolly_Zoom_Focal_Length_Correctly_for_Perspective-Distorted_Human_Mesh_Reconstruction_ICCV_2023_paper.html", "aff_unique_index": "0;1;2;3;4;2;4;3;0", - "aff_unique_norm": "University of Hong Kong;University of Adelaide;SenseTime;Shanghai AI Laboratory;Zhejiang University", + "aff_unique_norm": "The University of Hong Kong;University of Adelaide;SenseTime;Shanghai AI Laboratory;Zhejiang University", "aff_unique_dep": ";;SenseTime Research;;", "aff_unique_url": "https://www.hku.hk;https://www.adelaide.edu.au;https://www.sensetime.com;https://www.shanghai-ai-lab.com;https://www.zju.edu.cn", "aff_unique_abbr": "HKU;Adelaide;SenseTime;SAIL;ZJU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0;0;0;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Wang_2023_ICCV,\n \n author = {\n Wang,\n Wenjia and Ge,\n Yongtao and Mei,\n Haiyi and Cai,\n Zhongang and Sun,\n Qingping and Wang,\n Yanjun and Shen,\n Chunhua and Yang,\n Lei and Komura,\n Taku\n},\n title = {\n Zolly: Zoom Focal Length Correctly for Perspective-Distorted Human Mesh Reconstruction\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 3925-3935\n} \n}" }, { "title": "eP-ALM: Efficient Perceptual Augmentation of 
Language Models", @@ -66110,7 +68316,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", - "aff_country_unique": "France" + "aff_country_unique": "France", + "bibtex": "@InProceedings{Shukor_2023_ICCV,\n \n author = {\n Shukor,\n Mustafa and Dancette,\n Corentin and Cord,\n Matthieu\n},\n title = {\n eP-ALM: Efficient Perceptual Augmentation of Language Models\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22056-22069\n} \n}" }, { "title": "iDAG: Invariant DAG Searching for Domain Generalization", @@ -66142,7 +68349,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Huang_2023_ICCV,\n \n author = {\n Huang,\n Zenan and Wang,\n Haobo and Zhao,\n Junbo and Zheng,\n Nenggan\n},\n title = {\n iDAG: Invariant DAG Searching for Domain Generalization\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 19169-19179\n} \n}" }, { "title": "iVS-Net: Learning Human View Synthesis from Internet Videos", @@ -66174,7 +68382,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "China" + "aff_country_unique": "China", + "bibtex": "@InProceedings{Dong_2023_ICCV,\n \n author = {\n Dong,\n Junting and Fang,\n Qi and Yang,\n Tianshuo and Shuai,\n Qing and Qiao,\n Chengyu and Peng,\n Sida\n},\n title = {\n iVS-Net: Learning Human View Synthesis from Internet Videos\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 22942-22951\n} \n}" }, { "title": "s-Adaptive Decoupled Prototype for 
Few-Shot Object Detection", @@ -66206,7 +68415,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1;0;0;0", - "aff_country_unique": "China;Australia" + "aff_country_unique": "China;Australia", + "bibtex": "@InProceedings{Du_2023_ICCV,\n \n author = {\n Du,\n Jinhao and Zhang,\n Shan and Chen,\n Qiang and Le,\n Haifeng and Sun,\n Yanpeng and Ni,\n Yao and Wang,\n Jian and He,\n Bin and Wang,\n Jingdong\n},\n title = {\n s-Adaptive Decoupled Prototype for Few-Shot Object Detection\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 18950-18960\n} \n}" }, { "title": "uSplit: Image Decomposition for Fluorescence Microscopy", @@ -66238,7 +68448,8 @@ "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", - "aff_country_unique": "Italy;United Kingdom" + "aff_country_unique": "Italy;United Kingdom", + "bibtex": "@InProceedings{Ashesh_2023_ICCV,\n \n author = {\n Ashesh,\n Ashesh and Krull,\n Alexander and Di Sante,\n Moises and Pasqualini,\n Francesco and Jug,\n Florian\n},\n title = {\n uSplit: Image Decomposition for Fluorescence Microscopy\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 21219-21229\n} \n}" }, { "title": "zPROBE: Zero Peek Robustness Checks for Federated Learning", @@ -66270,6 +68481,7 @@ "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0;0", - "aff_country_unique": "United States" + "aff_country_unique": "United States", + "bibtex": "@InProceedings{Ghodsi_2023_ICCV,\n \n author = {\n Ghodsi,\n Zahra and Javaheripi,\n Mojan and Sheybani,\n Nojan and Zhang,\n Xinqiao and Huang,\n Ke and Koushanfar,\n Farinaz\n},\n title = {\n zPROBE: Zero Peek Robustness Checks for Federated 
Learning\n},\n booktitle = {\n Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)\n},\n month = {\n October\n},\n year = {\n 2023\n},\n pages = {\n 4860-4870\n} \n}" } ] \ No newline at end of file